-
Notifications
You must be signed in to change notification settings - Fork 145
Time - semiotic class for Vietnamese TN #302
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d6dc7bc
f07cc8a
f5830a6
72ab3fa
91575f5
08e7674
cfd9797
bf32952
da4db96
0674880
854137a
4f8f38a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,13 @@ | ||
| # Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| GMT GMT | ||
| UTC UTC | ||
| CST CST | ||
| PST PST | ||
| EST EST | ||
| JST JST | ||
| PT PT | ||
| ET ET | ||
| CET CET | ||
| gmt GMT | ||
| utc UTC | ||
| cst CST | ||
| pst PST | ||
| est EST | ||
| jst JST | ||
| pt PT | ||
| et ET | ||
| cet CET |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,141 @@ | ||
| # Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import pynini | ||
| from pynini.lib import pynutil | ||
|
|
||
| from nemo_text_processing.text_normalization.vi.graph_utils import ( | ||
| NEMO_DIGIT, | ||
| NEMO_SPACE, | ||
| GraphFst, | ||
| convert_space, | ||
| insert_space, | ||
| ) | ||
| from nemo_text_processing.text_normalization.vi.utils import get_abs_path | ||
|
|
||
|
|
||
class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time in Vietnamese.

    Supports various formats including:
    - Digital formats: "8:30", "14:45", "5:20:35"
    - Vietnamese formats: "14 giờ 30 phút", "2 giờ 15 phút 10 giây"
    - Abbreviated formats: "9h", "9g", "14h30", "14g30", "3p20s"
    - With time zones: "8:23 gmt", "15h cst"

    Every recognised component is emitted as a ``name: "value"`` field
    (``hours``, ``minutes``, ``seconds``, ``zone``) and consecutive fields are
    always separated by exactly one space.

    Args:
        cardinal: CardinalFst for number conversion
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="time", kind="classify", deterministic=deterministic)

        time_zone = pynini.string_file(get_abs_path("data/time/time_zones.tsv"))
        cardinal_graph = cardinal.graph

        # Accept one or two digits and drop a single leading zero
        # ("05" -> "5", "00" -> "0", "14" -> "14").
        delete_leading_zero = (pynutil.delete("0").ques | (NEMO_DIGIT - "0")) + NEMO_DIGIT

        # Valid numeric ranges: hours 0-24 inclusive, minutes/seconds 0-59.
        hours = pynini.union(*[str(x) for x in range(0, 25)])
        minutes_seconds = pynini.union(*[str(x) for x in range(0, 60)])

        def label(name, graph):
            """Wrap the output of ``graph`` as a ``name: "..."`` field."""
            return pynutil.insert(f'{name}: "') + graph + pynutil.insert('"')

        hour = label('hours', delete_leading_zero @ hours @ cardinal_graph)
        minute = label('minutes', delete_leading_zero @ minutes_seconds @ cardinal_graph)
        second = label('seconds', delete_leading_zero @ minutes_seconds @ cardinal_graph)
        zone = label('zone', convert_space(time_zone))

        # Unit markers are consumed from the input; the verbalizer re-inserts them.
        h_suffix = pynini.union(pynutil.delete("h"), pynutil.delete("g"))
        h_word = pynutil.delete(" giờ")
        m_word = pynutil.delete(" phút")
        s_word = pynutil.delete(" giây")

        # An input space between two components is kept and doubles as the
        # separator between the two output fields.
        keep_space = pynini.accep(NEMO_SPACE)

        # Optional zone written after a space ("8:23 gmt"): the input space is kept.
        opt_zone_spaced = pynini.closure(keep_space + zone, 0, 1)
        # Optional zone glued to the time ("14h30gmt"): a separator is inserted so the
        # zone field does not run into the previous field.
        opt_zone_attached = pynini.closure(insert_space + zone, 0, 1)
        preserve = pynutil.insert(" preserve_order: true")

        # Digital formats
        pattern_hour_minute = hour + pynutil.delete(":") + insert_space + minute + opt_zone_spaced

        pattern_hour_minute_second = (
            hour
            + pynutil.delete(":")
            + insert_space
            + minute
            + pynutil.delete(":")
            + insert_space
            + second
            + opt_zone_spaced
            + preserve
        )

        # Abbreviated formats
        # (hour + suffix followed by " <zone>" is already covered by opt_zone_spaced here)
        pattern_hour_suffix = hour + h_suffix + opt_zone_spaced
        pattern_hour_suffix_zone = hour + h_suffix + insert_space + zone
        pattern_hour_suffix_minute = hour + h_suffix + insert_space + minute + opt_zone_attached
        pattern_minute_p = minute + pynutil.delete("p")
        pattern_second_s = second + pynutil.delete("s")
        pattern_minute_p_second_s = minute + pynutil.delete("p") + insert_space + second + pynutil.delete("s")

        # Vietnamese word formats
        pattern_hour_word = hour + h_word + opt_zone_spaced

        pattern_hour_word_minute = hour + h_word + keep_space + minute + m_word + opt_zone_spaced

        pattern_hour_word_minute_second = (
            hour
            + h_word
            + keep_space
            + minute
            + m_word
            + keep_space
            + second
            + s_word
            + opt_zone_spaced
            + preserve
        )

        pattern_minute_word = minute + m_word
        pattern_minute_word_second = minute + m_word + keep_space + second + s_word
        pattern_second_word = second + s_word

        patterns = [
            pattern_hour_minute,
            pattern_hour_minute_second,
            pattern_hour_suffix,
            pattern_hour_suffix_zone,
            pattern_hour_suffix_minute,
            pattern_minute_p,
            pattern_second_s,
            pattern_minute_p_second_s,
            pattern_hour_word,
            pattern_hour_word_minute,
            pattern_hour_word_minute_second,
            pattern_minute_word,
            pattern_minute_word_second,
            pattern_second_word,
        ]

        final_graph = pynini.union(*patterns).optimize()

        self.fst = self.add_tokens(final_graph).optimize()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,7 @@ | |
| from nemo_text_processing.text_normalization.vi.taggers.ordinal import OrdinalFst | ||
| from nemo_text_processing.text_normalization.vi.taggers.punctuation import PunctuationFst | ||
| from nemo_text_processing.text_normalization.vi.taggers.roman import RomanFst | ||
| from nemo_text_processing.text_normalization.vi.taggers.time import TimeFst | ||
| from nemo_text_processing.text_normalization.vi.taggers.whitelist import WhiteListFst | ||
| from nemo_text_processing.text_normalization.vi.taggers.word import WordFst | ||
| from nemo_text_processing.utils.logging import logger | ||
|
|
@@ -104,6 +105,11 @@ def __init__( | |
| roman_graph = roman.fst | ||
| logger.debug(f"roman: {time.time() - start_time: .2f}s -- {roman_graph.num_states()} nodes") | ||
|
|
||
| start_time = time.time() | ||
| time_fst = TimeFst(cardinal=cardinal, deterministic=deterministic) | ||
| time_graph = time_fst.fst | ||
| logger.debug(f"time: {time.time() - start_time: .2f}s -- {time_graph.num_states()} nodes") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is there a logger here?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm using the English template for this file. If it seems redundant, I'll remove the logger. |
||
|
|
||
| classify = ( | ||
| pynutil.add_weight(whitelist_graph, 1.01) | ||
| | pynutil.add_weight(roman_graph, 1.1) | ||
|
|
@@ -112,6 +118,7 @@ def __init__( | |
| | pynutil.add_weight(ordinal_graph, 1.1) | ||
| | pynutil.add_weight(decimal_graph, 1.1) | ||
| | pynutil.add_weight(fraction_graph, 1.1) | ||
| | pynutil.add_weight(time_graph, 1.1) | ||
| | pynutil.add_weight(word_graph, 100) | ||
| ) | ||
| punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(" }") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,174 @@ | ||
| # Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import pynini | ||
| from pynini.lib import pynutil | ||
|
|
||
| from nemo_text_processing.text_normalization.vi.graph_utils import ( | ||
| NEMO_NOT_QUOTE, | ||
| NEMO_SPACE, | ||
| GraphFst, | ||
| convert_space, | ||
| delete_preserve_order, | ||
| delete_space, | ||
| extract_field, | ||
| ) | ||
| from nemo_text_processing.text_normalization.vi.utils import get_abs_path | ||
|
|
||
|
|
||
class TimeFst(GraphFst):
    """
    Finite state transducer for verbalizing Vietnamese time.

    Converts tagged time entities into spoken form, e.g.:
    - time { hours: "tám" minutes: "ba mươi" } -> tám giờ ba mươi phút
    - time { hours: "mười bốn" minutes: "mười lăm" } -> mười bốn giờ mười lăm phút
    - time { hours: "chín" } -> chín giờ
    - time { minutes: "ba" seconds: "hai mươi" } -> ba phút hai mươi giây
    - time { hours: "tám" minutes: "hai mươi ba" zone: "GMT" } -> tám giờ hai mươi ba phút GMT

    A zero ("không") minutes or seconds field that follows another field is
    dropped from the output, e.g.:
    - time { hours: "tám" minutes: "không" } -> tám giờ
    A zero field that starts the expression is kept, e.g.:
    - time { minutes: "không" } -> không phút

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="time", kind="verbalize", deterministic=deterministic)

        time_zone = convert_space(pynini.string_file(get_abs_path("data/time/time_zones.tsv")))

        # Extract components
        hour_component = extract_field("hours")
        timezone_component = extract_field("zone") @ time_zone

        zero = "không"
        # Any quoted field value.
        any_value = pynini.closure(NEMO_NOT_QUOTE)
        # Any quoted field value except the whole string "không". The difference has
        # to be taken on the full value: subtracting "không" from the single-character
        # acceptor NEMO_NOT_QUOTE removes nothing.
        non_zero_value = pynini.difference(any_value, pynini.accep(zero)).optimize()

        def field(name, value):
            """Consume ``name: "<value>"``, keeping only what ``value`` outputs."""
            return pynutil.delete(f"{name}:") + delete_space + pynutil.delete("\"") + value + pynutil.delete("\"")

        # Zero components are consumed silently.
        zero_minute_component = field("minutes", pynutil.delete(zero))
        zero_second_component = field("seconds", pynutil.delete(zero))

        # Components with units
        hour_with_unit = hour_component + pynutil.insert(" giờ")
        # Used after another field: zero is excluded here and handled by the
        # zero_* components above, so it never reaches the output.
        minute_with_unit = field("minutes", non_zero_value) + pynutil.insert(" phút")
        second_with_unit = field("seconds", non_zero_value) + pynutil.insert(" giây")
        # Used as the first field: any value, including zero, is spoken.
        leading_minute_with_unit = field("minutes", any_value) + pynutil.insert(" phút")
        leading_second_with_unit = field("seconds", any_value) + pynutil.insert(" giây")

        # Optional components
        optional_timezone = pynini.closure(delete_space + pynutil.insert(NEMO_SPACE) + timezone_component, 0, 1)
        optional_preserve_order = pynini.closure(delete_space + delete_preserve_order, 0, 1)
        optional_zero_minute = pynini.closure(delete_space + zero_minute_component, 0, 1)
        optional_zero_second = pynini.closure(delete_space + zero_second_component, 0, 1)

        # Pattern 1: hours + optional zero minutes/seconds + optional timezone
        pattern_hours_only = (
            hour_with_unit
            + optional_zero_minute
            + optional_zero_second
            + optional_timezone
            + optional_preserve_order
        )

        # Pattern 2: hours + minutes + optional zero seconds + optional timezone
        pattern_hours_minutes = (
            hour_with_unit
            + delete_space
            + pynutil.insert(NEMO_SPACE)
            + minute_with_unit
            + optional_zero_second
            + optional_timezone
            + optional_preserve_order
        )

        # Pattern 3: hours + zero minutes + seconds + optional timezone
        pattern_hours_seconds = (
            hour_with_unit
            + delete_space
            + zero_minute_component
            + delete_space
            + pynutil.insert(NEMO_SPACE)
            + second_with_unit
            + optional_timezone
            + optional_preserve_order
        )

        # Pattern 4: hours + minutes + seconds + optional timezone
        pattern_hours_minutes_seconds = (
            hour_with_unit
            + delete_space
            + pynutil.insert(NEMO_SPACE)
            + minute_with_unit
            + delete_space
            + pynutil.insert(NEMO_SPACE)
            + second_with_unit
            + optional_timezone
            + optional_preserve_order
        )

        # Pattern 5: minutes only + optional zero seconds
        pattern_minutes_only = leading_minute_with_unit + optional_zero_second

        # Pattern 6: minutes + seconds
        pattern_minutes_seconds = (
            leading_minute_with_unit + delete_space + pynutil.insert(NEMO_SPACE) + second_with_unit
        )

        # Pattern 7: seconds only
        pattern_seconds_only = leading_second_with_unit

        patterns = [
            pattern_hours_only,
            pattern_hours_minutes,
            pattern_hours_seconds,
            pattern_hours_minutes_seconds,
            pattern_minutes_only,
            pattern_minutes_seconds,
            pattern_seconds_only,
        ]

        final_graph = pynini.union(*patterns)

        if not deterministic:
            # Add special case for half hour ("rưỡi").
            # Vietnamese has no direct equivalent of "quarter past" / "quarter to"; the
            # minutes are simply read out. "rưỡi" ("half") is the common way to say
            # 30 minutes past the hour. The "kém" form ([next hour] + "kém" + [minutes],
            # for 1 to 30 minutes before the hour) is grammatical but less common and is
            # not generated here.
            half_hour = (
                pynutil.delete("minutes:") + delete_space + pynutil.delete("\"ba mươi\"") + pynutil.insert("rưỡi")
            )
            half_hour_pattern = (
                hour_with_unit
                + delete_space
                + pynutil.insert(NEMO_SPACE)
                + half_hour
                + optional_timezone
                + optional_preserve_order
            )
            self.graph = pynini.union(final_graph, half_hour_pattern)
        else:
            self.graph = final_graph

        # Zero minutes/seconds after another field are already consumed inside the
        # patterns above, so no rewrite pass over the verbalized output is needed.
        self.fst = self.delete_tokens(self.graph + optional_preserve_order).optimize()
Uh oh!
There was an error while loading. Please reload this page.