Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions nemo_text_processing/text_normalization/vi/data/time/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
GMT GMT
UTC UTC
CST CST
PST PST
EST EST
JST JST
PT PT
ET ET
CET CET
gmt GMT
utc UTC
cst CST
pst PST
est EST
jst JST
pt PT
et ET
cet CET
10 changes: 10 additions & 0 deletions nemo_text_processing/text_normalization/vi/graph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,16 @@
# Deletes at most one whitespace symbol (closure bounded to 0..1 repetitions).
delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1))
# Inserts a single ASCII space into the output.
insert_space = pynutil.insert(" ")
# Rewrites a run of one or more whitespace symbols to exactly one space.
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
# Deletes any number of trailing ` preserve_order: true` markers and/or
# ` field_order: "..."` wrappers (each alternative starts with a literal space).
# NOTE(review): in the field_order branch NEMO_NOT_QUOTE appears once and is
# not wrapped in pynutil.delete, so whatever it matches is copied to the
# output rather than removed -- confirm this is intended.
delete_preserve_order = pynini.closure(
pynutil.delete(" preserve_order: true")
| (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"'))
)

# Zero or more NEMO_NOT_QUOTE symbols, i.e. the content between two quotes.
quoted_text = pynini.closure(NEMO_NOT_QUOTE)


def extract_field(field_name):
    """
    Build a graph that unwraps one serialized field.

    The graph deletes `<field_name>:`, any following space (via `delete_space`),
    and the surrounding double quotes, and passes the quoted text through.

    Args:
        field_name: name of the field as it appears before the colon

    Returns:
        FST mapping `<field_name>: "<text>"` to `<text>`
    """
    key = pynutil.delete(f"{field_name}:")
    opening_quote = pynutil.delete("\"")
    closing_quote = pynutil.delete("\"")
    return key + delete_space + opening_quote + quoted_text + closing_quote


def convert_space(fst) -> "pynini.FstLike":
Expand Down
141 changes: 141 additions & 0 deletions nemo_text_processing/text_normalization/vi/taggers/time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.vi.graph_utils import (
NEMO_DIGIT,
NEMO_SPACE,
GraphFst,
convert_space,
insert_space,
)
from nemo_text_processing.text_normalization.vi.utils import get_abs_path


class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time in Vietnamese.

    Supports various formats including:
    - Digital formats: "8:30", "14:45", "5:20:35"
    - Vietnamese formats: "14 giờ 30 phút", "2 giờ 15 phút 10 giây"
    - Abbreviated formats: "9h", "9g", "14h30", "14g30", "3p20s"
    - With time zones: "8:23 gmt", "15h cst"

    The graph emits the fields `hours`, `minutes`, `seconds` and `zone`
    (each as `name: "value"`), always separated by a single space.
    Patterns carrying hours, minutes and seconds additionally emit
    ` preserve_order: true`.

    Args:
        cardinal: CardinalFst for number conversion
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        super().__init__(name="time", kind="classify", deterministic=deterministic)

        time_zone = pynini.string_file(get_abs_path("data/time/time_zones.tsv"))
        cardinal_graph = cardinal.graph

        # One or two digits; a single leading "0" is dropped ("08" -> "8", "00" -> "0").
        delete_leading_zero = (pynutil.delete("0").ques | (NEMO_DIGIT - "0")) + NEMO_DIGIT

        # Accepted numeric ranges: hours 0..24, minutes/seconds 0..59.
        hours = pynini.union(*[str(x) for x in range(0, 25)])
        minutes_seconds = pynini.union(*[str(x) for x in range(0, 60)])

        def label(name, graph):
            """Wrap the output of `graph` as `name: "<output>"`."""
            return pynutil.insert(f'{name}: "') + graph + pynutil.insert('"')

        hour = label('hours', delete_leading_zero @ hours @ cardinal_graph)
        minute = label('minutes', delete_leading_zero @ minutes_seconds @ cardinal_graph)
        second = label('seconds', delete_leading_zero @ minutes_seconds @ cardinal_graph)
        zone = label('zone', convert_space(time_zone))

        # Unit markers are consumed; the unit is implied by the field name.
        h_suffix = pynini.union(pynutil.delete("h"), pynutil.delete("g"))
        h_word = pynutil.delete(" giờ")
        m_word = pynutil.delete(" phút")
        s_word = pynutil.delete(" giây")

        # Field separators. When the input already has a space between two
        # components it is kept; otherwise one is inserted. Either way the
        # serialized fields end up separated by exactly one space.
        space = pynini.accep(NEMO_SPACE)
        opt_zone_space = pynini.closure(space + zone, 0, 1)  # "8:23 gmt"
        opt_zone = pynini.closure(insert_space + zone, 0, 1)  # "14h30gmt"
        preserve = pynutil.insert(" preserve_order: true")

        # Digital formats
        pattern_hour_minute = hour + pynutil.delete(":") + insert_space + minute + opt_zone_space

        pattern_hour_minute_second = (
            hour
            + pynutil.delete(":")
            + insert_space
            + minute
            + pynutil.delete(":")
            + insert_space
            + second
            + opt_zone_space
            + preserve
        )

        # Abbreviated formats
        # `pattern_hour_suffix` covers both "15h" and "15h cst".
        pattern_hour_suffix = hour + h_suffix + opt_zone_space
        # Zone glued directly to the suffix, e.g. "15hcst".
        pattern_hour_suffix_zone = hour + h_suffix + insert_space + zone
        pattern_hour_suffix_minute = hour + h_suffix + insert_space + minute + opt_zone
        pattern_minute_p = minute + pynutil.delete("p")
        pattern_second_s = second + pynutil.delete("s")
        pattern_minute_p_second_s = minute + pynutil.delete("p") + insert_space + second + pynutil.delete("s")

        # Vietnamese word formats
        pattern_hour_word = hour + h_word + opt_zone_space

        pattern_hour_word_minute = hour + h_word + space + minute + m_word + opt_zone_space

        pattern_hour_word_minute_second = (
            hour
            + h_word
            + space
            + minute
            + m_word
            + space
            + second
            + s_word
            + opt_zone_space
            + preserve
        )

        pattern_minute_word = minute + m_word
        pattern_minute_word_second = minute + m_word + space + second + s_word
        pattern_second_word = second + s_word

        patterns = [
            pattern_hour_minute,
            pattern_hour_minute_second,
            pattern_hour_suffix,
            pattern_hour_suffix_zone,
            pattern_hour_suffix_minute,
            pattern_minute_p,
            pattern_second_s,
            pattern_minute_p_second_s,
            pattern_hour_word,
            pattern_hour_word_minute,
            pattern_hour_word_minute_second,
            pattern_minute_word,
            pattern_minute_word_second,
            pattern_second_word,
        ]

        final_graph = pynini.union(*patterns).optimize()

        self.fst = self.add_tokens(final_graph).optimize()
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from nemo_text_processing.text_normalization.vi.taggers.ordinal import OrdinalFst
from nemo_text_processing.text_normalization.vi.taggers.punctuation import PunctuationFst
from nemo_text_processing.text_normalization.vi.taggers.roman import RomanFst
from nemo_text_processing.text_normalization.vi.taggers.time import TimeFst
from nemo_text_processing.text_normalization.vi.taggers.whitelist import WhiteListFst
from nemo_text_processing.text_normalization.vi.taggers.word import WordFst
from nemo_text_processing.utils.logging import logger
Expand Down Expand Up @@ -104,6 +105,11 @@ def __init__(
roman_graph = roman.fst
logger.debug(f"roman: {time.time() - start_time: .2f}s -- {roman_graph.num_states()} nodes")

start_time = time.time()
time_fst = TimeFst(cardinal=cardinal, deterministic=deterministic)
time_graph = time_fst.fst
logger.debug(f"time: {time.time() - start_time: .2f}s -- {time_graph.num_states()} nodes")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is there a logger here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm using the English template for this file. If it seems redundant, I'll remove the logger.


classify = (
pynutil.add_weight(whitelist_graph, 1.01)
| pynutil.add_weight(roman_graph, 1.1)
Expand All @@ -112,6 +118,7 @@ def __init__(
| pynutil.add_weight(ordinal_graph, 1.1)
| pynutil.add_weight(decimal_graph, 1.1)
| pynutil.add_weight(fraction_graph, 1.1)
| pynutil.add_weight(time_graph, 1.1)
| pynutil.add_weight(word_graph, 100)
)
punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(" }")
Expand Down
174 changes: 174 additions & 0 deletions nemo_text_processing/text_normalization/vi/verbalizers/time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.vi.graph_utils import (
NEMO_NOT_QUOTE,
NEMO_SPACE,
GraphFst,
convert_space,
delete_preserve_order,
delete_space,
extract_field,
)
from nemo_text_processing.text_normalization.vi.utils import get_abs_path


class TimeFst(GraphFst):
    """
    Finite state transducer for verbalizing Vietnamese time.

    Converts tagged time entities into spoken form, e.g.:
    - time { hours: "tám" minutes: "ba mươi" } -> tám giờ ba mươi phút
    - time { hours: "mười bốn" minutes: "mười lăm" } -> mười bốn giờ mười lăm phút
    - time { hours: "chín" } -> chín giờ
    - time { minutes: "ba" seconds: "hai mươi" } -> ba phút hai mươi giây
    - time { hours: "tám" minutes: "hai mươi ba" zone: "GMT" } -> tám giờ hai mươi ba phút GMT

    The `zone` value must be one of the keys of data/time/time_zones.tsv.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="time", kind="verbalize", deterministic=deterministic)

        time_zone = convert_space(pynini.string_file(get_abs_path("data/time/time_zones.tsv")))

        def zero_field(name):
            """Delete a whole `name: "không"` field (value zero, nothing to say)."""
            return pynutil.delete(f"{name}:") + delete_space + pynutil.delete("\"không\"")

        def value_field(name):
            """Delete the `name: "` ... `"` wrapper and keep the quoted value."""
            # NOTE(review): the difference below is taken per symbol, before the
            # closure. If NEMO_NOT_QUOTE is a single-symbol acceptor, subtracting
            # the multi-symbol string "không" removes nothing, so a zero value is
            # still accepted here -- TODO confirm. Any resulting " không phút" /
            # " không giây" is stripped by the rewrites at the end of __init__.
            return (
                pynutil.delete(f"{name}:")
                + delete_space
                + pynutil.delete("\"")
                + pynini.closure(NEMO_NOT_QUOTE - pynini.accep("không"))
                + pynutil.delete("\"")
            )

        # Field extractors
        hour_component = extract_field("hours")
        timezone_component = extract_field("zone") @ time_zone
        zero_minute_component = zero_field("minutes")
        zero_second_component = zero_field("seconds")

        # Values followed by their spoken unit
        hour_with_unit = hour_component + pynutil.insert(" giờ")
        minute_with_unit = value_field("minutes") + pynutil.insert(" phút")
        second_with_unit = value_field("seconds") + pynutil.insert(" giây")

        # Replaces whatever spacing separates two fields with one output space.
        separator = delete_space + pynutil.insert(NEMO_SPACE)

        # Optional pieces
        optional_zero_minute = pynini.closure(delete_space + zero_minute_component, 0, 1)
        optional_zero_second = pynini.closure(delete_space + zero_second_component, 0, 1)
        optional_timezone = pynini.closure(separator + timezone_component, 0, 1)
        optional_preserve_order = pynini.closure(delete_space + delete_preserve_order, 0, 1)
        optional_tail = optional_timezone + optional_preserve_order

        # Pattern 1: hours + optional zero minutes/seconds + optional timezone
        pattern_hours_only = hour_with_unit + optional_zero_minute + optional_zero_second + optional_tail

        # Pattern 2: hours + minutes + optional zero seconds + optional timezone
        pattern_hours_minutes = hour_with_unit + separator + minute_with_unit + optional_zero_second + optional_tail

        # Pattern 3: hours + zero minutes + seconds + optional timezone
        pattern_hours_seconds = (
            hour_with_unit + delete_space + zero_minute_component + separator + second_with_unit + optional_tail
        )

        # Pattern 4: hours + minutes + seconds + optional timezone
        pattern_hours_minutes_seconds = (
            hour_with_unit + separator + minute_with_unit + separator + second_with_unit + optional_tail
        )

        # Pattern 5: minutes only + optional zero seconds
        pattern_minutes_only = minute_with_unit + optional_zero_second

        # Pattern 6: minutes + seconds
        pattern_minutes_seconds = minute_with_unit + separator + second_with_unit

        # Pattern 7: seconds only
        pattern_seconds_only = second_with_unit

        graph = pynini.union(
            pattern_hours_only,
            pattern_hours_minutes,
            pattern_hours_seconds,
            pattern_hours_minutes_seconds,
            pattern_minutes_only,
            pattern_minutes_seconds,
            pattern_seconds_only,
        )

        if not deterministic:
            # Add special case for half hour ("rưỡi").
            # Vietnamese has no direct "quarter past"/"quarter to" idiom; "rưỡi"
            # (half) is the common way to say 30 minutes past the hour. The
            # "[next hour] kém [minutes]" form (minutes before the hour) is
            # grammatical but less common and is not generated here.
            half_hour = (
                pynutil.delete("minutes:") + delete_space + pynutil.delete("\"ba mươi\"") + pynutil.insert("rưỡi")
            )
            half_hour_pattern = hour_with_unit + separator + half_hour + optional_tail
            graph = pynini.union(graph, half_hour_pattern)

        self.graph = graph

        # Remove zero minutes and seconds from output
        sigma_star = pynini.closure(NEMO_NOT_QUOTE)
        remove_zero_minutes = pynini.cdrewrite(pynutil.delete(" không phút"), "", "", sigma_star)
        remove_zero_seconds = pynini.cdrewrite(pynutil.delete(" không giây"), "", "", sigma_star)

        self.fst = (
            self.delete_tokens(self.graph + optional_preserve_order).optimize()
            @ remove_zero_minutes
            @ remove_zero_seconds
        ).optimize()
Loading