From db367eeb1b30c8ee99541514442256e391ba9c47 Mon Sep 17 00:00:00 2001 From: Shreyas Pawar Date: Tue, 21 Apr 2026 04:24:48 +0000 Subject: [PATCH 1/2] date class accuracy improvement Signed-off-by: Shreyas Pawar --- .../text_normalization/hi/data/date/days.tsv | 24 +- .../hi/data/date/months.tsv | 35 ++- .../hi/data/date/prefixes.tsv | 7 +- .../hi/data/date/unambiguous_days.tsv | 38 +++ .../text_normalization/hi/taggers/date.py | 284 +++++++++++++++--- .../hi/taggers/tokenize_and_classify.py | 4 +- .../test_cases_date.txt | 2 +- 7 files changed, 345 insertions(+), 49 deletions(-) create mode 100644 nemo_text_processing/text_normalization/hi/data/date/unambiguous_days.tsv diff --git a/nemo_text_processing/text_normalization/hi/data/date/days.tsv b/nemo_text_processing/text_normalization/hi/data/date/days.tsv index 633e2aec0..7d2dc7fbb 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/days.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/days.tsv @@ -3,7 +3,7 @@ ०३ तीन ०४ चार ०५ पाँच -०६ छः +०६ छह ०७ सात ०८ आठ ०९ नौ @@ -34,7 +34,7 @@ 03 तीन 04 चार 05 पाँच -06 छः +06 छह 07 सात 08 आठ 09 नौ @@ -59,4 +59,22 @@ 28 अट्ठाईस 29 उनतीस 30 तीस -31 इकतीस \ No newline at end of file +31 इकतीस +१ एक +२ दो +३ तीन +४ चार +५ पाँच +६ छह +७ सात +८ आठ +९ नौ +1 एक +2 दो +3 तीन +4 चार +5 पाँच +6 छह +7 सात +8 आठ +9 नौ \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/months.tsv b/nemo_text_processing/text_normalization/hi/data/date/months.tsv index af770dafc..5eaafb648 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/months.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/months.tsv @@ -21,4 +21,37 @@ 09 सितंबर 10 अक्टूबर 11 नवंबर -12 दिसंबर \ No newline at end of file +12 दिसंबर +जनवरी जनवरी +फ़रवरी फ़रवरी +फरवरी फरवरी +मार्च मार्च +अप्रैल अप्रैल +अप्रील अप्रील +मई मई +जून जून +जुलाई जुलाई +अगस्त अगस्त +सितंबर सितंबर +अक्टूबर अक्टूबर +अक्तूबर अक्तूबर +नवंबर नवंबर +दिसंबर दिसंबर +१ जनवरी +२ फ़रवरी +३ मार्च +४ अप्रैल +५ मई +६ जून +७ जुलाई +८ अगस्त +९ सितंबर +1 जनवरी +2 फ़रवरी +3 मार्च +4 अप्रैल +5 मई +6 जून +7 जुलाई +8 अगस्त +9 सितंबर \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv b/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv index d4c1ca0b1..6166ec327 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv @@ -1,3 +1,4 @@ -सन् -सन -साल \ No newline at end of file +सन् +सन +साल +दशक \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/unambiguous_days.tsv b/nemo_text_processing/text_normalization/hi/data/date/unambiguous_days.tsv new file mode 100644 index 000000000..7fb5f5380 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/unambiguous_days.tsv @@ -0,0 +1,38 @@ +१३ तेरह +१४ चौदह +१५ पंद्रह +१६ सोलह +१७ सत्रह +१८ अठारह +१९ उन्नीस +२० बीस +२१ इक्कीस +२२ बाईस +२३ तेईस +२४ चौबीस +२५ पच्चीस +२६ छब्बीस +२७ सत्ताईस +२८ अट्ठाईस +२९ उनतीस +३० तीस +३१ इकतीस +13 तेरह +14 चौदह +15 पंद्रह +16 सोलह +17 सत्रह +18 अठारह +19 उन्नीस +20 बीस +21 इक्कीस +22 बाईस +23 तेईस +24 चौबीस +25 पच्चीस +26 छब्बीस +27 सत्ताईस +28 अट्ठाईस +29 उनतीस +30 तीस +31 इकतीस \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index da917f3de..1dc1c86ba 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -25,6 +25,7 @@ from nemo_text_processing.text_normalization.hi.utils import get_abs_path days = pynini.string_file(get_abs_path("data/date/days.tsv")) +unambiguous_days = pynini.string_file(get_abs_path("data/date/unambiguous_days.tsv")) months = pynini.string_file(get_abs_path("data/date/months.tsv")) year_suffix = pynini.string_file(get_abs_path("data/date/year_suffix.tsv")) digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) @@ -33,23 +34,38 @@ teens_ties = pynini.union(teens_ties_hi, teens_ties_en) teens_and_ties = pynutil.add_weight(teens_ties, -0.1) -# Read suffixes from file into a list +digit_as_day = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + with open(get_abs_path("data/date/suffixes.tsv"), "r", encoding="utf-8") as f: - suffixes_list = f.read().splitlines() + suffixes_list = [line.rstrip("\n") for line in f if line.strip()] with open(get_abs_path("data/date/prefixes.tsv"), "r", encoding="utf-8") as f: - prefixes_list = f.read().splitlines() + prefixes_list = [line.rstrip("\n") for line in f if line.strip()] -# Create union of suffixes and prefixes suffix_union = pynini.union(*suffixes_list) prefix_union = pynini.union(*prefixes_list) +verbalized_hundreds = teens_ties_hi.project("output") +verbalized_unit = pynini.union( + teens_ties_hi.project("output"), + digit.project("output") +) + +verbalized_year_sou = ( + verbalized_hundreds + + pynini.accep(" सौ") + + pynini.closure(pynini.accep(" ") + verbalized_unit, 0, 1) +) + class DateFst(GraphFst): """ Finite state transducer for classifying date, e.g. - "०१-०४-२०२४" -> date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" } - "०४-०१-२०२४" -> date { month: "अप्रैल" day: "एक" year: "दो हज़ार चौबीस" } - + "०१-०४-२०२४" -> date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" } + "६ मार्च, २०१०" -> date { day: "छह" month: "मार्च" year: "दो हज़ार दस" } + "३१ मई, १९९० ई." -> date { day: "इकतीस" month: "मई" year: "उन्नीस सौ नब्बे" era: "ईसवी" } + "उन्नीस सौ बीस में" -> date { era: "उन्नीस सौ बीस में" } + "०३-२०१०" -> date { month: "मार्च" year: "दो हज़ार दस" } + "11-2024" -> date { month: "नवंबर" year: "दो हज़ार चौबीस" } Args: cardinal: cardinal GraphFst @@ -60,60 +76,230 @@ class DateFst(GraphFst): def __init__(self, cardinal: GraphFst): super().__init__(name="date", kind="classify") + # ── Year number graphs ──────────────────────────────────────────────── graph_year_thousands = pynini.compose( - (NEMO_ALL_DIGIT + NEMO_ALL_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), cardinal.graph_thousands + (NEMO_ALL_DIGIT + NEMO_ALL_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), + cardinal.graph_thousands ) graph_year_hundreds_as_thousands = pynini.compose( - (NEMO_ALL_DIGIT + NEMO_ALL_NON_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), cardinal.graph_hundreds_as_thousand + (NEMO_ALL_DIGIT + NEMO_ALL_NON_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), + cardinal.graph_hundreds_as_thousand ) cardinal_graph = pynini.union( - digit, teens_and_ties, cardinal.graph_hundreds, graph_year_thousands, graph_year_hundreds_as_thousands + digit, + teens_and_ties, + cardinal.graph_hundreds, + graph_year_thousands, + graph_year_hundreds_as_thousands, ) graph_year = pynini.union(graph_year_thousands, graph_year_hundreds_as_thousands) - delete_dash = pynutil.delete("-") - delete_slash = pynutil.delete("/") + graph_year_era = pynini.union( + graph_year_thousands, + graph_year_hundreds_as_thousands, + cardinal.graph_hundreds, + ) - days_graph = pynutil.insert("day: \"") + days + pynutil.insert("\"") + insert_space + # ── Separators ─────────────────────────────────────────────────────── + delete_dash = pynutil.delete("-") + delete_slash = pynutil.delete("/") + delete_comma = pynutil.delete(",") + delete_space = pynutil.delete(" ") + delete_optional_space = pynini.closure(pynutil.delete(" "), 0, 1) + delete_comma_sep = delete_comma + delete_optional_space + delete_numeric_sep = pynini.union(delete_dash, delete_slash) + + # ── Day graphs ─────────────────────────────────────────────────────── + # Full day graph — all days 1-31 (used in DD-MM graphs) + day_num = pynini.union( + days, + digit_as_day, + teens_and_ties, + ) - months_graph = pynutil.insert("month: \"") + months + pynutil.insert("\"") + insert_space + days_graph = ( + pynutil.insert("day: \"") + day_num + pynutil.insert("\"") + insert_space + ) - years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space + # Unambiguous day graph — only days 13-31 + # Used in MM-DD graphs so they only fire when day cannot be a month number + unambiguous_day_num = pynini.union( + unambiguous_days, + ) - graph_dd_mm = days_graph + delete_dash + months_graph + unambiguous_days_graph = ( + pynutil.insert("day: \"") + unambiguous_day_num + pynutil.insert("\"") + insert_space + ) - graph_mm_dd = months_graph + delete_dash + days_graph + # ── Month graph ────────────────────────────────────────────────────── + months_graph = ( + pynutil.insert("month: \"") + months + pynutil.insert("\"") + insert_space + ) - graph_mm_dd += pynutil.insert(" preserve_order: true ") + # ── Year graph ─────────────────────────────────────────────────────── + years_graph = ( + pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space + ) - # Graph for era - era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space + # ── Era graph ──────────────────────────────────────────────────────── + era_graph = ( + pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space + ) + # ── Range graph (e.g. २९७-२७२ ई. पू.) ────────────────────────────── range_graph = pynini.cross("-", "से") - # Graph for year - century_number = pynini.compose(pynini.closure(NEMO_ALL_DIGIT, 1), cardinal_graph) + pynini.accep("वीं") - century_text = pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space + # ── Century ordinal (e.g. २०वीं, १८वीं) ──────────────────────────── + century_number = ( + pynini.compose(pynini.closure(NEMO_ALL_DIGIT, 1), cardinal_graph) + + pynini.accep("वीं") + ) + century_text = ( + pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space + ) - # Updated logic to use suffix_union + # ── Year + suffix (e.g. २०२० में, १९९० का) ────────────────────────── year_number = graph_year + suffix_union - year_text = pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space + year_text = ( + pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space + ) + + # ── Year + prefix (e.g. सन् २०२४, साल २०२०) ──────────────────────── + year_prefix = ( + pynutil.insert("era: \"") + + prefix_union + + pynini.accep(" ") + + graph_year + + pynutil.insert("\"") + ) + + # ── Year + prefix + suffix (e.g. सन २००८ में) ─────────────────────── + year_prefix_suffix = ( + pynutil.insert("era: \"") + + prefix_union + + pynini.accep(" ") + + graph_year + + suffix_union + + pynutil.insert("\"") + ) + + # ── Verbalized year passthrough graphs ─────────────────────────────── + graph_verbalized_year_suffix = ( + pynutil.insert("era: \"") + + verbalized_year_sou + + suffix_union + + pynutil.insert("\"") + + insert_space + ) + + graph_verbalized_year_bare = ( + pynutil.insert("era: \"") + + verbalized_year_sou + + pynutil.insert("\"") + + insert_space + ) - # Updated logic to use prefix_union - year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"") + graph_verbalized_year_prefix = ( + pynutil.insert("era: \"") + + prefix_union + + pynini.accep(" ") + + verbalized_year_sou + + pynutil.insert("\"") + ) - delete_separator = pynini.union(delete_dash, delete_slash) - graph_dd_mm_yyyy = days_graph + delete_separator + months_graph + delete_separator + years_graph + graph_verbalized_year_prefix_suffix = ( + pynutil.insert("era: \"") + + prefix_union + + pynini.accep(" ") + + verbalized_year_sou + + suffix_union + + pynutil.insert("\"") + ) + + # ── Numeric separator date graphs ──────────────────────────────────── + # DD-MM: uses full day range (all 1-31) + graph_dd_mm = days_graph + delete_numeric_sep + months_graph + + # MM-DD: only fires when day is unambiguously > 12 + # This prevents 01-10 being read as MM-DD (January 10) + graph_mm_dd = months_graph + delete_numeric_sep + unambiguous_days_graph + graph_mm_dd += pynutil.insert(" preserve_order: true ") - graph_mm_dd_yyyy = months_graph + delete_separator + days_graph + delete_separator + years_graph + # DD-MM-YYYY: uses full day range + graph_dd_mm_yyyy = ( + days_graph + + delete_numeric_sep + + months_graph + + delete_numeric_sep + + years_graph + ) + # MM-DD-YYYY: only fires when day is unambiguously > 12 + graph_mm_dd_yyyy = ( + months_graph + + delete_numeric_sep + + unambiguous_days_graph + + delete_numeric_sep + + years_graph + ) graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") - graph_mm_yyyy = months_graph + delete_dash + insert_space + years_graph + # ── Space-separated date graphs ────────────────────────────────────── + graph_dd_month = ( + days_graph + + delete_space + + months_graph + ) - graph_year_suffix = era_graph + graph_dd_month_comma_yyyy = ( + days_graph + + delete_space + + months_graph + + delete_comma_sep + + years_graph + ) + + graph_dd_month_comma_yyyy_era = ( + days_graph + + delete_space + + months_graph + + delete_comma_sep + + years_graph + + era_graph + ) + + graph_month_comma_yyyy = ( + months_graph + + delete_comma_sep + + years_graph + ) + + graph_month_comma_yyyy_era = ( + months_graph + + delete_comma_sep + + years_graph + + era_graph + ) + + # MM-YYYY: supports both space and dash separator + # e.g. "मार्च २००३", "०३-२०१०", "11-2024" + graph_mm_yyyy = ( + months_graph + + pynini.union(delete_space, delete_dash) + + years_graph + ) + + # ── Era-only graphs ────────────────────────────────────────────────── + graph_year_era_only = ( + pynutil.insert("era: \"") + + graph_year_era + + insert_space + + year_suffix + + pynutil.insert("\"") + + insert_space + ) graph_range = ( pynutil.insert("era: \"") @@ -126,21 +312,41 @@ def __init__(self, cardinal: GraphFst): + pynutil.insert(" preserve_order: true ") ) - # default assume dd_mm_yyyy + graph_year_suffix = era_graph + # ── Final graph ─────────────────────────────────────────────────────── final_graph = ( - pynutil.add_weight(graph_dd_mm, -0.001) - | graph_mm_dd + # Full date with era — most specific first + pynutil.add_weight(graph_dd_month_comma_yyyy_era, -0.003) + | pynutil.add_weight(graph_month_comma_yyyy_era, -0.003) + # Full numeric dates | pynutil.add_weight(graph_dd_mm_yyyy, -0.001) | graph_mm_dd_yyyy + # Full space/comma dates + | pynutil.add_weight(graph_dd_month_comma_yyyy, -0.001) + # Day + month only + | pynutil.add_weight(graph_dd_mm, -0.001) + | pynutil.add_weight(graph_dd_month, -0.001) + | graph_mm_dd + # Month + year — space or dash | pynutil.add_weight(graph_mm_yyyy, -0.2) - | pynutil.add_weight(graph_year_suffix, -0.001) + | pynutil.add_weight(graph_month_comma_yyyy, -0.2) + # Era graphs + | pynutil.add_weight(graph_year_era_only, -0.005) | pynutil.add_weight(graph_range, -0.005) + | pynutil.add_weight(graph_year_suffix, -0.001) + # Century ordinal | pynutil.add_weight(century_text, -0.001) - | pynutil.add_weight(year_text, -0.001) + # Verbalized year passthrough — more specific first + | pynutil.add_weight(graph_verbalized_year_prefix_suffix, -0.012) + | pynutil.add_weight(graph_verbalized_year_prefix, -0.011) + | pynutil.add_weight(graph_verbalized_year_suffix, -0.010) + | pynutil.add_weight(graph_verbalized_year_bare, -0.009) + # Numeric year with suffix/prefix + | pynutil.add_weight(year_prefix_suffix, -0.010) | pynutil.add_weight(year_prefix, -0.009) + | pynutil.add_weight(year_text, -0.001) ) self.final_graph = final_graph.optimize() - - self.fst = self.add_tokens(self.final_graph) + self.fst = self.add_tokens(self.final_graph) \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index cb03ebce6..df523d7e0 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -117,7 +117,7 @@ def __init__( pynutil.add_weight(whitelist_graph, 1.01) | pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(decimal_graph, 1.1) - | pynutil.add_weight(fraction_graph, 1.1) + | pynutil.add_weight(fraction_graph, 1.05) | pynutil.add_weight(date_graph, 1.1) | pynutil.add_weight(time_graph, 1.1) | pynutil.add_weight(measure_graph, 1.1) @@ -160,4 +160,4 @@ def __init__( if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") + logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index 86f1f6678..2df448456 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -1,4 +1,4 @@ -06-05~छः मई +06-05~छह मई ३१-०६~इकतीस जून 02-01~दो जनवरी ०४-०१~चार जनवरी From 79f8c530627e51ac4e92d8882bde36e49c865fa5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Apr 2026 10:53:54 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/hi/taggers/date.py | 141 ++++-------------- .../hi/taggers/tokenize_and_classify.py | 2 +- 2 files changed, 32 insertions(+), 111 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 1dc1c86ba..42d266547 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -45,15 +45,10 @@ prefix_union = pynini.union(*prefixes_list) verbalized_hundreds = teens_ties_hi.project("output") -verbalized_unit = pynini.union( - teens_ties_hi.project("output"), - digit.project("output") -) +verbalized_unit = pynini.union(teens_ties_hi.project("output"), digit.project("output")) verbalized_year_sou = ( - verbalized_hundreds - + pynini.accep(" सौ") - + pynini.closure(pynini.accep(" ") + verbalized_unit, 0, 1) + verbalized_hundreds + pynini.accep(" सौ") + pynini.closure(pynini.accep(" ") + verbalized_unit, 0, 1) ) @@ -78,12 +73,10 @@ def __init__(self, cardinal: GraphFst): # ── Year number graphs ──────────────────────────────────────────────── graph_year_thousands = pynini.compose( - (NEMO_ALL_DIGIT + NEMO_ALL_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), - cardinal.graph_thousands + (NEMO_ALL_DIGIT + NEMO_ALL_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), cardinal.graph_thousands ) graph_year_hundreds_as_thousands = pynini.compose( - (NEMO_ALL_DIGIT + NEMO_ALL_NON_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), - cardinal.graph_hundreds_as_thousand + (NEMO_ALL_DIGIT + NEMO_ALL_NON_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), cardinal.graph_hundreds_as_thousand ) cardinal_graph = pynini.union( @@ -103,13 +96,13 @@ def __init__(self, cardinal: GraphFst): ) # ── Separators ─────────────────────────────────────────────────────── - delete_dash = pynutil.delete("-") - delete_slash = pynutil.delete("/") - delete_comma = pynutil.delete(",") - delete_space = pynutil.delete(" ") + delete_dash = pynutil.delete("-") + delete_slash = pynutil.delete("/") + delete_comma = pynutil.delete(",") + delete_space = pynutil.delete(" ") delete_optional_space = pynini.closure(pynutil.delete(" "), 0, 1) - delete_comma_sep = delete_comma + delete_optional_space - delete_numeric_sep = pynini.union(delete_dash, delete_slash) + delete_comma_sep = delete_comma + delete_optional_space + delete_numeric_sep = pynini.union(delete_dash, delete_slash) # ── Day graphs ─────────────────────────────────────────────────────── # Full day graph — all days 1-31 (used in DD-MM graphs) @@ -119,9 +112,7 @@ def __init__(self, cardinal: GraphFst): teens_and_ties, ) - days_graph = ( - pynutil.insert("day: \"") + day_num + pynutil.insert("\"") + insert_space - ) + days_graph = pynutil.insert("day: \"") + day_num + pynutil.insert("\"") + insert_space # Unambiguous day graph — only days 13-31 # Used in MM-DD graphs so they only fire when day cannot be a month number @@ -129,51 +120,30 @@ def __init__(self, cardinal: GraphFst): unambiguous_days, ) - unambiguous_days_graph = ( - pynutil.insert("day: \"") + unambiguous_day_num + pynutil.insert("\"") + insert_space - ) + unambiguous_days_graph = pynutil.insert("day: \"") + unambiguous_day_num + pynutil.insert("\"") + insert_space # ── Month graph ────────────────────────────────────────────────────── - months_graph = ( - pynutil.insert("month: \"") + months + pynutil.insert("\"") + insert_space - ) + months_graph = pynutil.insert("month: \"") + months + pynutil.insert("\"") + insert_space # ── Year graph ─────────────────────────────────────────────────────── - years_graph = ( - pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space - ) + years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space # ── Era graph ──────────────────────────────────────────────────────── - era_graph = ( - pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space - ) + era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space # ── Range graph (e.g. २९७-२७२ ई. पू.) ────────────────────────────── range_graph = pynini.cross("-", "से") # ── Century ordinal (e.g. २०वीं, १८वीं) ──────────────────────────── - century_number = ( - pynini.compose(pynini.closure(NEMO_ALL_DIGIT, 1), cardinal_graph) - + pynini.accep("वीं") - ) - century_text = ( - pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space - ) + century_number = pynini.compose(pynini.closure(NEMO_ALL_DIGIT, 1), cardinal_graph) + pynini.accep("वीं") + century_text = pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space # ── Year + suffix (e.g. २०२० में, १९९० का) ────────────────────────── year_number = graph_year + suffix_union - year_text = ( - pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space - ) + year_text = pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space # ── Year + prefix (e.g. सन् २०२४, साल २०२०) ──────────────────────── - year_prefix = ( - pynutil.insert("era: \"") - + prefix_union - + pynini.accep(" ") - + graph_year - + pynutil.insert("\"") - ) + year_prefix = pynutil.insert("era: \"") + prefix_union + pynini.accep(" ") + graph_year + pynutil.insert("\"") # ── Year + prefix + suffix (e.g. सन २००८ में) ─────────────────────── year_prefix_suffix = ( @@ -187,26 +157,15 @@ def __init__(self, cardinal: GraphFst): # ── Verbalized year passthrough graphs ─────────────────────────────── graph_verbalized_year_suffix = ( - pynutil.insert("era: \"") - + verbalized_year_sou - + suffix_union - + pynutil.insert("\"") - + insert_space + pynutil.insert("era: \"") + verbalized_year_sou + suffix_union + pynutil.insert("\"") + insert_space ) graph_verbalized_year_bare = ( - pynutil.insert("era: \"") - + verbalized_year_sou - + pynutil.insert("\"") - + insert_space + pynutil.insert("era: \"") + verbalized_year_sou + pynutil.insert("\"") + insert_space ) graph_verbalized_year_prefix = ( - pynutil.insert("era: \"") - + prefix_union - + pynini.accep(" ") - + verbalized_year_sou - + pynutil.insert("\"") + pynutil.insert("era: \"") + prefix_union + pynini.accep(" ") + verbalized_year_sou + pynutil.insert("\"") ) graph_verbalized_year_prefix_suffix = ( @@ -228,68 +187,30 @@ def __init__(self, cardinal: GraphFst): graph_mm_dd += pynutil.insert(" preserve_order: true ") # DD-MM-YYYY: uses full day range - graph_dd_mm_yyyy = ( - days_graph - + delete_numeric_sep - + months_graph - + delete_numeric_sep - + years_graph - ) + graph_dd_mm_yyyy = days_graph + delete_numeric_sep + months_graph + delete_numeric_sep + years_graph # MM-DD-YYYY: only fires when day is unambiguously > 12 graph_mm_dd_yyyy = ( - months_graph - + delete_numeric_sep - + unambiguous_days_graph - + delete_numeric_sep - + years_graph + months_graph + delete_numeric_sep + unambiguous_days_graph + delete_numeric_sep + years_graph ) graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") # ── Space-separated date graphs ────────────────────────────────────── - graph_dd_month = ( - days_graph - + delete_space - + months_graph - ) + graph_dd_month = days_graph + delete_space + months_graph - graph_dd_month_comma_yyyy = ( - days_graph - + delete_space - + months_graph - + delete_comma_sep - + years_graph - ) + graph_dd_month_comma_yyyy = days_graph + delete_space + months_graph + delete_comma_sep + years_graph graph_dd_month_comma_yyyy_era = ( - days_graph - + delete_space - + months_graph - + delete_comma_sep - + years_graph - + era_graph + days_graph + delete_space + months_graph + delete_comma_sep + years_graph + era_graph ) - graph_month_comma_yyyy = ( - months_graph - + delete_comma_sep - + years_graph - ) + graph_month_comma_yyyy = months_graph + delete_comma_sep + years_graph - graph_month_comma_yyyy_era = ( - months_graph - + delete_comma_sep - + years_graph - + era_graph - ) + graph_month_comma_yyyy_era = months_graph + delete_comma_sep + years_graph + era_graph # MM-YYYY: supports both space and dash separator # e.g. "मार्च २००३", "०३-२०१०", "11-2024" - graph_mm_yyyy = ( - months_graph - + pynini.union(delete_space, delete_dash) - + years_graph - ) + graph_mm_yyyy = months_graph + pynini.union(delete_space, delete_dash) + years_graph # ── Era-only graphs ────────────────────────────────────────────────── graph_year_era_only = ( @@ -349,4 +270,4 @@ def __init__(self, cardinal: GraphFst): ) self.final_graph = final_graph.optimize() - self.fst = self.add_tokens(self.final_graph) \ No newline at end of file + self.fst = self.add_tokens(self.final_graph) diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index df523d7e0..5c3ee661a 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -160,4 +160,4 @@ def __init__( if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file + logging.info(f"ClassifyFst grammars are saved to {far_file}.")