From 5e42e2b1c05e78ab97990b0329f6ad6cac548302 Mon Sep 17 00:00:00 2001 From: RajanPutty Date: Fri, 17 Apr 2026 19:47:48 +0530 Subject: [PATCH 1/2] Add hi_en Code Switched Signed-off-by: RajanPutty --- .../hi_en/__init__.py | 17 + .../hi_en/data/__init__.py | 13 + .../hi_en/data/en_whitelist.tsv | 416 ++++++++++++++++++ .../hi_en/data/hi_whitelist.tsv | 7 + .../hi_en/graph_utils.py | 13 + .../hi_en/taggers/__init__.py | 13 + .../hi_en/taggers/tokenize_and_classify.py | 175 ++++++++ .../inverse_text_normalization/hi_en/utils.py | 27 ++ .../hi_en/verbalizers/__init__.py | 13 + .../hi_en/verbalizers/verbalize.py | 102 +++++ .../hi_en/verbalizers/verbalize_final.py | 44 ++ .../inverse_normalize.py | 34 +- .../run_evaluate.py | 2 +- tests/nemo_text_processing/hi_en/__init__.py | 13 + .../test_cases_address.txt | 30 ++ .../test_cases_cardinal.txt | 58 +++ .../test_cases_date.txt | 32 ++ .../test_cases_date_cased.txt | 70 +++ .../test_cases_decimal.txt | 25 ++ .../test_cases_electronic.txt | 17 + .../test_cases_fraction.txt | 39 ++ .../test_cases_measure.txt | 25 ++ .../test_cases_money.txt | 36 ++ .../test_cases_ordinal.txt | 26 ++ .../test_cases_telephone.txt | 35 ++ .../test_cases_time.txt | 29 ++ .../test_cases_whitelist.txt | 17 + .../test_cases_word.txt | 19 + .../hi_en/test_address.py | 31 ++ .../hi_en/test_cardinal.py | 41 ++ tests/nemo_text_processing/hi_en/test_date.py | 41 ++ .../hi_en/test_decimal.py | 41 ++ .../hi_en/test_electronic.py | 41 ++ .../hi_en/test_fraction.py | 31 ++ .../hi_en/test_measure.py | 41 ++ .../nemo_text_processing/hi_en/test_money.py | 41 ++ .../hi_en/test_ordinal.py | 41 ++ ..._sparrowhawk_inverse_text_normalization.sh | 102 +++++ .../hi_en/test_telephone.py | 41 ++ tests/nemo_text_processing/hi_en/test_time.py | 41 ++ .../hi_en/test_whitelist.py | 31 ++ tests/nemo_text_processing/hi_en/test_word.py | 41 ++ .../pynini_export.py | 22 +- 43 files changed, 1933 insertions(+), 41 deletions(-) create mode 100644 
nemo_text_processing/inverse_text_normalization/hi_en/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/data/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/data/en_whitelist.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/data/hi_whitelist.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/graph_utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/taggers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/taggers/tokenize_and_classify.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/verbalize.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/verbalize_final.py create mode 100644 tests/nemo_text_processing/hi_en/__init__.py create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_address.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_cardinal.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_date.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_date_cased.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_decimal.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_electronic.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_fraction.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_measure.txt create mode 100644 
tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_money.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_ordinal.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_telephone.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_time.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_whitelist.txt create mode 100644 tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_word.txt create mode 100644 tests/nemo_text_processing/hi_en/test_address.py create mode 100644 tests/nemo_text_processing/hi_en/test_cardinal.py create mode 100644 tests/nemo_text_processing/hi_en/test_date.py create mode 100644 tests/nemo_text_processing/hi_en/test_decimal.py create mode 100644 tests/nemo_text_processing/hi_en/test_electronic.py create mode 100644 tests/nemo_text_processing/hi_en/test_fraction.py create mode 100644 tests/nemo_text_processing/hi_en/test_measure.py create mode 100644 tests/nemo_text_processing/hi_en/test_money.py create mode 100644 tests/nemo_text_processing/hi_en/test_ordinal.py create mode 100644 tests/nemo_text_processing/hi_en/test_sparrowhawk_inverse_text_normalization.sh create mode 100644 tests/nemo_text_processing/hi_en/test_telephone.py create mode 100644 tests/nemo_text_processing/hi_en/test_time.py create mode 100644 tests/nemo_text_processing/hi_en/test_whitelist.py create mode 100644 tests/nemo_text_processing/hi_en/test_word.py diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/__init__.py b/nemo_text_processing/inverse_text_normalization/hi_en/__init__.py new file mode 100644 index 000000000..cfe932251 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi_en/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.hi_en.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.hi_en.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.hi_en.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/data/__init__.py b/nemo_text_processing/inverse_text_normalization/hi_en/data/__init__.py new file mode 100644 index 000000000..4fc25d0d3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi_en/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/data/en_whitelist.tsv b/nemo_text_processing/inverse_text_normalization/hi_en/data/en_whitelist.tsv new file mode 100644 index 000000000..78424367a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi_en/data/en_whitelist.tsv @@ -0,0 +1,416 @@ +10K ten k +1D one d +1G one g +1K one k +2.5G two point five g +2D two d +2G two g +2K two k +3D three d +3G three g +4D four d +4G four g +4K four k +5D five d +5G five g +6G six g +7-eleven seven eleven +7G seven g +8K eight k +AAA triple a +AC a c +ACL a c l +ADAS a das +AEB a e b +AES a e s +AGX a g x +AI a i +AI ai +AI ais +ALU a l u +AMD a m d +API a p i +API api +ARP a r p +ASAP a s a p +ASAP a sap +ASMC a s m c +ASR a s r +ASUS a sus +ASUS asus +AT&T a t and t +ATM a t m +AV1 a v one +AV1 av one +AVAS a v a s +AVAS a vas +AVRCP a v r c p +AWS a w s +AWS aws +AlphaFold alpha fold +AstraZeneca astra zeneca +AstraZeneca astrazeneca +Audio2Face audio to face +Audio2Face audio too face +BBC b b c +BCP b c p +BEV b e v +BGA b g a +BGP b g p +BIOS Bios +BIOS bios +BIS b i s +BSP b s p +BasePOD base pod +BasePOD basepod +Billion-X billion X +C# c sharp +C++ c plus plus +CAT5 cat five +CAT5e cat five e +CAT6 cat six +CAT6A cat six a +CAT7 cat seven +CAT8 cat eight +CBC c b c +CCS c c s +CCTV c c t v +CCTV c c tv +CDN c d n +CEO c e o +CES c e s +CFO c f o +CHAdeMO cha demo +CHAdeMO chai demo +CLI c l i +CMO c m o +CMS c m s +CNN c n n +COVID covid +COVID-19 covid nineteen +CPU c p u +CSO c s o +CSP c s p +CSS c s s +CV c v +CVT c v t +CalTech cal tech +CalTech caltech +CapEx cap ex +CapEx capex +ChatGPT chat g p t +CloudXR cloud x r +ConnectX connect x +Core 2 core two +DALI d a l i +DB d b +DC d c +DCDB d c d b +DDR d d r +DDoS d d o s +DDoS d dos +DGL d g l +DGX d g x -1.1 +DGX A100 d g x a hundred -1.1 +DGX A100 d g x a one hundred -1.1 +DGX Station d g x station -1.1 +DHCP d h c p +DLSS d l s s +DLSS 2 d l s s two +DLSS 3 d l s s three 
+DNA d n a +DNN d n n +DNS d n s +DP d p +DPDT d p d t +DPU d p u +DPX d p x +DeepMind deep mind +DeepMind deepmind +DeepStream deep stream +DeepStream deepstream +DevOps dev ops +Direct3D direct three d +DisplayPort display port +EBC e b c +EBS e b s +EC2 e c two +ECC e c c +ECS e c s +EFS e f s +EGX e g x +EKS e k s +ELB e l b +ELCB e l c b +EMR e m r +EOL e o l +EV e v +Earth-2 earth two +FIFO fee foe +FIFO fifo +FOMO foe moe +FOMO fomo +FPS f p s +FSA f s a +FST f s t +FTP f t p +FTPS f t p s +FX f x +G-SYNC g sync +GCC g c c +GCP g c p +GCS g c s +GFN G f n +GPU g p u +GRPC g r p c +GRU g r u +GSK g s k +GTC g t c +GTX g t x -1.1 +GUI g u i +GUI gui +GeForce geforce +GeForce NOW geforce now +H.264 H two six four +H.264 H two sixty four +H.265 H two six five +H.265 H two sixty five +H100 H one hundred +HDMI h d m i +HEVC h e v c +HGX h g x +HMD Hmd +HPC h p c +HQ h q +HSM h s m +HT h t +HTML h t m l +HTTP h t t p +HTTPS h t t p s +HuggingFace hugging face +I/O i o +ICMP i c m p +IEC i e c +IEEE i triple e +IGMP i g m p +IGX i g x +IISc i i s c +IIT i i t +INT8 int eight +IP i p +IPSec i p sec +ISC i s c +ISI i s i +InfiniBand inifiband +IoT i o t +Isaac SIM isaac sim +JSON j son +JavaScript java script +JavaScript javascript +KVM k v m +LFP lee fei po +LFP life po +LGA l g a -1.1 +LLC l l c +LLM l l m +LLMs l l ms +LLP l l p +LSTM l s t m +LT l t +LTT l t t +LinkedIn linked in +LinkedIn linkedin +MB m b +MCB m c b +MCCB m c c b +MCP m c p +MDM m d m +MFA m f a +MIT m i t +ML m l +MLPerf m l perf +MONAI monai +MWC m w c +MXNet m x net +MacBook mac book +MacBook macbook +MacBook Pro mac book pro +Max-Q max q +Million-X million x +MoCap mo cap +NC n c +NGC n g c +NLP n l p +NLU n l u +NMC n m c +NO n o +NTP n t p +NVDec n v dec +NVDec n v deck +NVEnc n v enc +NVEnc n v inc +NVEnc n v ink +NVMe n v m e +Ni-CD knee cad +NiMH n i m h +NoSQL no s q l +NoSQL no sql +Nvidia A100 nvidia a hundred +Nvidia A100 nvidia a one hundred +Nvidia A30 nvidia a thirty +Nvidia A40 
nvidia a forty +OCI o c i +OEM o e m +OEMs o e ms +OLTC o l t c +OLTP o l t p +ONNX o n n x +OVX o v x +OpEx op ex +OpEx opex +OpenCL open c l +OpenGL open g l +PCI p c i +PCIe p c i e -1.1 +PDF p d f +PHEV p h e v +POP3 pop three +PSO p s o +Photoshop photo shop +PhysX fizz ex +PhysX fizz x +PhysX phys ex +PhysX phys x +PlayStation play station +Premiere Pro premiere pro +ProVis pro vis +ProVis provis +PyG pi g +PyG py g +PyTorch pi torch +QODA q o d a +QOS q o s +RCBO r c b o +RCCB r c c b +RDS r d s +RFP r f p +RISC risc +RL r l +RNN r n n +ROI r o i +RSS r s s +RT r t +RTP r t p +RTX r t x -1.1 +RTX ON r t x on +RTX Off r t x off +RacerX racer x +Ryzen 2 rye zen two +Ryzen 3 rye zen three +Ryzen 5 rye zen five +Ryzen 7 rye zen seven +Ryzen 9 rye zen nine +S&P s and p +S3 s three +SAE s a e +SDK s d k +SEO s e o +SFU s f u +SHIELD TV shield tv +SIGGRAPH sig graph +SIGGRAPH siggraph +SMPS s m p s +SMTP s m t p +SNMP s n m p +SNS s n s +SOL s o l +SPDT s p d t +SPN s p n +SPST s p s t +SQL s q l +SQS s q s +SSD s s d +SSH s s h +SSL s s l +SSO s s o +SSR s s r +SUV s u v +SWG s w g +SaaS saas +ServiceNow service now +SuperCloud super cloud +SuperPOD super pod +TCP t c p +TCP/IP t c p i p +TFTP t f t p +TJ t j +TOS t o s +TPM t p m +TPMS t p m s +TPN t p n +TSL t s l +TSMC t s m c +TTS t t s +Telnet tell net +TensorRT tensor r t +Tick-Tock tick tock +TikTok app tick tock app +TikTok is tick tock is +UDP u d p +UEFI u e f i +UFI u f i +UPS u p s +USB u s b +VAAPI v a a p i +VAAPI v a api +VDI v d i +VDPAU v d p a u +VDPAU v d paw +VDPAU v d pow +VFD v f d +VLAN v lan +VP v p +VPN v p n +Vcc v c c +Vdd v d d +VoIP v o i p +WFSA w f s a +WFST w f s t +Wi-Fi wi fi +Wi-Fi wifi +XFX x f x +XFX x fx +XML x m l +Zen 2 zen two +Zen 3 zen three +Zen 4 zen four +c# c sharp +c++ c plus plus +cuBLAS cue blah +cuBLAS cue blahs +cuBLAS q blahs +cuDNN c u d n n +cuDNN cue d n n +cuDNN q d n n +cuOpt cue opt +cuOpt q opt +cuQuantum cue quantum +cuQuantum q quantum +dr. doctor +e.g. 
for example +es3 e s three +ext4 e x t four +i3 i three +i5 i five +i7 i seven +i9 i nine +iOS ios +iTPMS i t p m s +mr. mister +mrs. misses +on TikTok on tick tock +sFTP s f t p +st. saint +x1 x one +x2 x two +x8 x eight diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/data/hi_whitelist.tsv b/nemo_text_processing/inverse_text_normalization/hi_en/data/hi_whitelist.tsv new file mode 100644 index 000000000..8cfd0e19f --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi_en/data/hi_whitelist.tsv @@ -0,0 +1,7 @@ +१/४ पाव +कु. कुमारी +स्मि. श्रीमती +श्री. श्री +श्री. श्रीमान +मा. मास्टर +डॉ. डॉक्टर diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/graph_utils.py b/nemo_text_processing/inverse_text_normalization/hi_en/graph_utils.py new file mode 100644 index 000000000..4fc25d0d3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi_en/graph_utils.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/hi_en/taggers/__init__.py new file mode 100644 index 000000000..4fc25d0d3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi_en/taggers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi_en/taggers/tokenize_and_classify.py
new file mode 100644
index 000000000..199a27817
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/hi_en/taggers/tokenize_and_classify.py
@@ -0,0 +1,180 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.en.taggers.cardinal import CardinalFst as EnCardinalFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.date import DateFst as EnDateFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.decimal import DecimalFst as EnDecimalFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.electronic import ElectronicFst as EnElectronicFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.measure import MeasureFst as EnMeasureFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.money import MoneyFst as EnMoneyFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.ordinal import OrdinalFst as EnOrdinalFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.punctuation import PunctuationFst as EnPunctuationFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.telephone import TelephoneFst as EnTelephoneFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.time import TimeFst as EnTimeFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.whitelist import WhiteListFst as EnWhiteListFst
+from nemo_text_processing.inverse_text_normalization.en.taggers.word import WordFst as EnWordFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.date import DateFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.decimal import DecimalFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.fraction import FractionFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.measure import MeasureFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.money import MoneyFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst
+from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst
+from nemo_text_processing.inverse_text_normalization.hi_en.utils import get_abs_path
+from nemo_text_processing.text_normalization.en.graph_utils import (
+    INPUT_LOWER_CASED,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
+from nemo_text_processing.utils.logging import logger
+
+
+class ClassifyFst(GraphFst):
+    """
+    Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased.
+    Hindi and English grammars are unioned so that code-switched Hindi-English sentences can be tagged.
+    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
+    More details to deployment at NeMo/tools/text_processing_deployment.
+
+    Args:
+        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+        whitelist: path to a file with Hindi whitelist replacements (defaults to data/hi_whitelist.tsv)
+        en_whitelist: path to a file with English whitelist replacements (defaults to data/en_whitelist.tsv)
+        input_case: accepting either "lower_cased" or "cased" input (passed to the English grammars only).
+    """
+
+    def __init__(
+        self,
+        cache_dir: str = None,
+        overwrite_cache: bool = False,
+        whitelist: str = None,
+        en_whitelist: str = None,
+        input_case: str = INPUT_LOWER_CASED,
+    ):
+        super().__init__(name="tokenize_and_classify", kind="classify")
+
+        far_file = None
+        if whitelist is None:
+            whitelist = get_abs_path("data/hi_whitelist.tsv")
+        if en_whitelist is None:
+            en_whitelist = get_abs_path("data/en_whitelist.tsv")
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, f"hi_en_itn_{input_case}.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
+            logger.info(f"ClassifyFst.fst was restored from {far_file}.")
+        else:
+            logger.info("Creating ClassifyFst grammars.")
+
+            cardinal = CardinalFst()
+            cardinal_graph = cardinal.fst
+
+            ordinal = OrdinalFst(cardinal)
+            ordinal_graph = ordinal.fst
+
+            decimal = DecimalFst(cardinal)
+            decimal_graph = decimal.fst
+
+            fraction = FractionFst(cardinal)
+            fraction_graph = fraction.fst
+
+            measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
+            date_graph = DateFst(cardinal, ordinal).fst
+            word_graph = WordFst().fst
+            time_graph = TimeFst(cardinal).fst
+            money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
+            whitelist_graph = WhiteListFst(input_file=whitelist).fst
+            punct_graph = PunctuationFst().fst
+            telephone_graph = TelephoneFst(cardinal).fst
+
+            en_cardinal = EnCardinalFst(input_case=input_case)
+            en_cardinal_graph = en_cardinal.fst
+
+            en_ordinal = EnOrdinalFst(cardinal=en_cardinal, input_case=input_case)
+            en_ordinal_graph = en_ordinal.fst
+
+            en_decimal = EnDecimalFst(cardinal=en_cardinal, input_case=input_case)
+            en_decimal_graph = en_decimal.fst
+
+            en_measure_graph = EnMeasureFst(cardinal=en_cardinal, decimal=en_decimal, input_case=input_case).fst
+            en_date_graph = EnDateFst(ordinal=en_ordinal, input_case=input_case).fst
+            en_word_graph = EnWordFst().fst
+            en_time_graph = EnTimeFst(input_case=input_case).fst
+            en_money_graph = EnMoneyFst(cardinal=en_cardinal, decimal=en_decimal, input_case=input_case).fst
+            en_whitelist_graph = EnWhiteListFst(input_file=en_whitelist, input_case=input_case).fst
+            en_punct_graph = EnPunctuationFst().fst
+            en_electronic_graph = EnElectronicFst(input_case=input_case).fst
+            en_telephone_graph = EnTelephoneFst(cardinal=en_cardinal, input_case=input_case).fst
+
+            classify = (
+                pynutil.add_weight(whitelist_graph, 1.01)
+                | pynutil.add_weight(en_whitelist_graph, 1.01)
+                | pynutil.add_weight(time_graph, 1.1)
+                | pynutil.add_weight(en_time_graph, 1.1)
+                | pynutil.add_weight(date_graph, 1.09)
+                | pynutil.add_weight(en_date_graph, 1.09)
+                | pynutil.add_weight(decimal_graph, 1.09)
+                | pynutil.add_weight(en_decimal_graph, 1.09)
+                | pynutil.add_weight(fraction_graph, 1.09)
+                | pynutil.add_weight(measure_graph, 1.6)
+                | pynutil.add_weight(en_measure_graph, 1.1)
+                | pynutil.add_weight(cardinal_graph, 1.6)
+                | pynutil.add_weight(en_cardinal_graph, 1.1)
+                | pynutil.add_weight(ordinal_graph, 1.6)
+                | pynutil.add_weight(en_ordinal_graph, 1.09)
+                | pynutil.add_weight(money_graph, 1.6)
+                | pynutil.add_weight(en_money_graph, 1.1)
+                | pynutil.add_weight(telephone_graph, 1.6)
+                | pynutil.add_weight(en_telephone_graph, 1.1)
+                | pynutil.add_weight(en_electronic_graph, 1.1)
+                | pynutil.add_weight(word_graph, 100)
+                | pynutil.add_weight(en_word_graph, 120)
+            )
+
+            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
+            en_punct = (
+                pynutil.insert("tokens { ") + pynutil.add_weight(en_punct_graph, weight=1.3) + pynutil.insert(" }")
+            )
+            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
+            # "+" binds tighter than "|": group the two punctuation grammars explicitly so that both the
+            # Hindi and the English punctuation tokens get the separating space on either side of a token.
+            any_punct = punct | en_punct
+            token_plus_punct = (
+                pynini.closure(any_punct + pynutil.insert(" "))
+                + token
+                + pynini.closure(pynutil.insert(" ") + any_punct)
+            )
+
+            graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
+            graph = delete_space + graph + delete_space
+
+            self.fst = graph.optimize()
+
+            if far_file:
+                generator_main(far_file, {"tokenize_and_classify": self.fst})
+                logger.info(f"ClassifyFst grammars are saved to {far_file}.")
diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/utils.py b/nemo_text_processing/inverse_text_normalization/hi_en/utils.py
new file mode 100644
index 000000000..2bcba780d
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/hi_en/utils.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+
+def get_abs_path(rel_path):
+    """
+    Get absolute path
+
+    Args:
+        rel_path: path relative to the directory that contains this file
+
+    Returns absolute path
+    """
+    return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path
diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/__init__.py
new file mode 100644
index 000000000..4fc25d0d3
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/verbalize.py new file mode 100644 index 000000000..81cc937a1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/verbalize.py @@ -0,0 +1,102 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.en.verbalizers.cardinal import CardinalFst as EnCardinalFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.date import DateFst as EnDateFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.decimal import DecimalFst as EnDecimalFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.electronic import ElectronicFst as EnElectronicFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.measure import MeasureFst as EnMeasureFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.money import MoneyFst as EnMoneyFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.ordinal import OrdinalFst as EnOrdinalFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.telephone import TelephoneFst as EnTelephoneFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.time import TimeFst as EnTimeFst +from nemo_text_processing.inverse_text_normalization.en.verbalizers.whitelist import WhiteListFst as EnWhiteListFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.date import DateFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.fraction import FractionFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.measure import MeasureFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.money import MoneyFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.telephone import TelephoneFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst +from 
nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst
+from nemo_text_processing.text_normalization.en.graph_utils import GraphFst
+
+
+class VerbalizeFst(GraphFst):
+    """
+    Composes the un-prefixed verbalizer grammars and their En* counterparts into a single union.
+    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
+    More details on deployment at NeMo/tools/text_processing_deployment.
+    """
+
+    def __init__(self):
+        super().__init__(name="verbalize", kind="verbalize")
+        cardinal = CardinalFst()  # instance kept: also passed to date, measure, money and telephone below
+        cardinal_graph = cardinal.fst
+
+        ordinal = OrdinalFst()  # instance kept: also passed to date below
+        ordinal_graph = ordinal.fst
+
+        decimal = DecimalFst()  # instance kept: also passed to measure and money below
+        decimal_graph = decimal.fst
+
+        fraction_graph = FractionFst().fst
+
+        date_graph = DateFst(cardinal, ordinal).fst
+        time_graph = TimeFst().fst
+        measure_graph = MeasureFst(cardinal, decimal).fst
+        money_graph = MoneyFst(cardinal, decimal).fst
+        telephone_graph = TelephoneFst(cardinal).fst
+        whitelist_graph = WhiteListFst().fst
+
+        en_cardinal = EnCardinalFst()  # instance kept: also passed to the En* measure graph
+        en_cardinal_graph = en_cardinal.fst
+        en_ordinal_graph = EnOrdinalFst().fst
+        en_decimal = EnDecimalFst()  # instance kept: also passed to the En* measure and money graphs
+        en_decimal_graph = en_decimal.fst
+        en_measure_graph = EnMeasureFst(decimal=en_decimal, cardinal=en_cardinal).fst
+        en_money_graph = EnMoneyFst(decimal=en_decimal).fst
+        en_date_graph = EnDateFst().fst
+        en_whitelist_graph = EnWhiteListFst().fst
+        en_telephone_graph = EnTelephoneFst().fst
+        en_time_graph = EnTimeFst().fst
+        en_electronic_graph = EnElectronicFst().fst
+
+        graph = (
+            en_time_graph
+            | pynutil.add_weight(time_graph, 1.1)  # only pair where the un-prefixed graph is the weighted one
+            | date_graph
+            | pynutil.add_weight(en_date_graph, 1.1)  # every other pair weights the En* graph
+            | money_graph
+            | pynutil.add_weight(en_money_graph, 1.1)
+            | fraction_graph  # no En* counterpart in this union
+            | measure_graph
+            | pynutil.add_weight(en_measure_graph, 1.1)
+            | ordinal_graph
+            | pynutil.add_weight(en_ordinal_graph, 1.1)
+            | decimal_graph
+            | pynutil.add_weight(en_decimal_graph, 1.1)
+            | cardinal_graph
+            | pynutil.add_weight(en_cardinal_graph, 1.1)
+            | whitelist_graph
+            | pynutil.add_weight(en_whitelist_graph, 1.1)
+            | telephone_graph
+            | pynutil.add_weight(en_telephone_graph, 1.1)
+            | en_electronic_graph  # no un-prefixed counterpart in this union
+        )
+        self.fst = graph
diff --git a/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/verbalize_final.py
new file mode 100644
index 000000000..05386f09d
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/hi_en/verbalizers/verbalize_final.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst
+from nemo_text_processing.inverse_text_normalization.hi_en.verbalizers.verbalize import VerbalizeFst
+from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space
+
+
+class VerbalizeFinalFst(GraphFst):
+    """
+    Finite state transducer that verbalizes an entire sentence of "tokens { ... }" entries, e.g.
+    tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now
+    """
+
+    def __init__(self):
+        super().__init__(name="verbalize_final", kind="verbalize")
+        verbalize = VerbalizeFst().fst
+        word = WordFst().fst
+        types = verbalize | word  # a token body is either a VerbalizeFst class or a WordFst word
+        graph = (  # one token: the "tokens { ... }" wrapper is deleted around the verbalized body
+            pynutil.delete("tokens")
+            + delete_space
+            + pynutil.delete("{")
+            + delete_space
+            + types
+            + delete_space
+            + pynutil.delete("}")
+        )
+        graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space
+        self.fst = graph  # sentence: zero or more (token + delete_extra_space), then a final token
diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py
index 1ab727660..df5247260 100644
--- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py
+++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py
@@ -101,7 +101,7 @@ def __init__(
             from nemo_text_processing.inverse_text_normalization.ar.verbalizers.verbalize_final import (
                 VerbalizeFinalFst,
             )
-        elif lang == 'es_en':  # Arabic
+        elif lang == 'es_en':  # Spanish-English code-switch
             from nemo_text_processing.inverse_text_normalization.es_en.taggers.tokenize_and_classify import ClassifyFst
             from nemo_text_processing.inverse_text_normalization.es_en.verbalizers.verbalize_final import (
                 VerbalizeFinalFst,
@@ -121,6 +121,11 @@ def __init__(
             from nemo_text_processing.inverse_text_normalization.hi.verbalizers.verbalize_final import (
                 VerbalizeFinalFst,
             )
+        elif lang == 'hi_en':  # Hindi-English code-switch
+            from nemo_text_processing.inverse_text_normalization.hi_en.taggers.tokenize_and_classify import ClassifyFst
+            from nemo_text_processing.inverse_text_normalization.hi_en.verbalizers.verbalize_final import (
+                VerbalizeFinalFst,
+            )
         elif lang == 'hy':
             from nemo_text_processing.inverse_text_normalization.hy.taggers.tokenize_and_classify import ClassifyFst
             from nemo_text_processing.inverse_text_normalization.hy.verbalizers.verbalize_final import (
@@ 
-198,6 +203,7 @@ def parse_args():
             'vi',
             'ar',
             'es_en',
+            'hi_en',
             'zh',
             'he',
             'hi',
diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py
index c93d8df64..b6d0a5e80 100644
--- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py
+++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py
@@ -35,7 +35,25 @@ def parse_args():
     parser.add_argument(
         "--lang",
         help="language",
-        choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "ko", "mr", "pt", "ru", "sv", "vi", "zh", 'ja'],
+        choices=[
+            "ar",
+            "de",
+            "en",
+            "es",
+            "es_en",
+            "fr",
+            "hi",
+            "hi_en",
+            "hy",
+            "ko",
+            "mr",
+            "pt",
+            "ru",
+            "sv",
+            "vi",
+            "zh",
+            'ja',
+        ],
         default="en",
         type=str,
     )
diff --git a/tests/nemo_text_processing/hi_en/__init__.py b/tests/nemo_text_processing/hi_en/__init__.py
new file mode 100644
index 000000000..4fc25d0d3
--- /dev/null
+++ b/tests/nemo_text_processing/hi_en/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_address.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_address.txt new file mode 100644 index 000000000..810c9cbbd --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_address.txt @@ -0,0 +1,30 @@ +दिल्ली एक एक शून्य शून्य शून्य एक~दिल्ली ११०००१ +मुंबई चार शून्य शून्य शून्य शून्य एक~मुंबई ४००००१ +चेन्नई छह शून्य शून्य शून्य शून्य एक~चेन्नई ६००००१ +कोलकाता सात शून्य शून्य शून्य शून्य एक~कोलकाता ७००००१ +बेंगलुरु पाँच छह शून्य शून्य शून्य एक~बेंगलुरु ५६०००१ +सात शून्य शून्य ओक स्ट्रीट~७०० ओक स्ट्रीट +एक एक जंगल रोड~११ जंगल रोड +तीन शून्य एक पार्क एवेन्यू~३०१ पार्क एवेन्यू +गली नंबर एक सात जीएकगढ़~गली नंबर १७ जीएकगढ़ +अदनान अपार्टमेंट फ्लैट नंबर पाँच पाँच~अदनान अपार्टमेंट फ्लैट नंबर ५५ +प्लॉट नंबर आठ बालाजी मार्केट~प्लॉट नंबर ८ बालाजी मार्केट +बूथ सात शून्य, सेक्टर आठ, चंडीगढ़~बूथ ७०, सेक्टर ८, चंडीगढ़ +दो दो दो एक सदर्न स्ट्रीट~२२२१ सदर्न स्ट्रीट +छह दो पाँच स्कूल स्ट्रीट~६२५ स्कूल स्ट्रीट +पाँच शून्य छह स्टेट रोड~५०६ स्टेट रोड +छह छह हाइफ़न चार, पार्कहर्स्ट रोड~६६-४, पार्कहर्स्ट रोड +एक चार बटा तीन, मथुरा रोड~१४/३, मथुरा रोड +अमरावती छह पाँच पाँच नौ तीन शून्य~अमरावती ६५५९३० +अमरावती चार छह आठ दो पाँच दो~अमरावती ४६८२५२ +शिमला, हिमाचल प्रदेश पाँच नौ तीन नौ आठ आठ~शिमला, हिमाचल प्रदेश ५९३९८८ +रांची, झारखंड सात तीन छह पाँच पाँच सात~रांची, झारखंड ७३६५५७ +कोहिमा, 
नागालैंड चार चार आठ तीन सात सात~कोहिमा, नागालैंड ४४८३७७ +मुंबई, महाराष्ट्र आठ तीन नौ चार आठ आठ~मुंबई, महाराष्ट्र ८३९४८८ +मुंबई, महाराष्ट्र दो नौ शून्य नौ तीन सात~मुंबई, महाराष्ट्र २९०९३७ +गांधीनगर, गुजरात आठ शून्य आठ तीन सात चार~गांधीनगर, गुजरात ८०८३७४ +रायपुर, छत्तीसगढ़ एक एक शून्य छह तीन पाँच~रायपुर, छत्तीसगढ़ ११०६३५ +भोपाल, मध्य प्रदेश सात पाँच एक दो दो पाँच~भोपाल, मध्य प्रदेश ७५१२२५ +अगरतला, त्रिपुरा नौ एक पाँच तीन शून्य पाँच~अगरतला, त्रिपुरा ९१५३०५ +लखनऊ, उत्तर प्रदेश आठ शून्य दो चार आठ एक~लखनऊ, उत्तर प्रदेश ८०२४८१ +श्रीनगर, जम्मू और कश्मीर नौ छह चार पाँच दो तीन~श्रीनगर, जम्मू और कश्मीर ९६४५२३ diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..9dadbe1cd --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,58 @@ +शून्य~० +एक~१ +दो~२ +तीन~३ +दस~१० +ग्यारह~११ +बारह~१२ +तेरह~१३ +चौदह~१४ +पन्द्रह~१५ +सोलह~१६ +बीस~२० +तेईस~२३ +पचास~५० +सत्तर~७० +नब्बे~९० +सौ~१०० +दो सौ~२०० +एक सौ दस~११० +तीन सौ पचास~३५० +हज़ार~१००० +एक हज़ार~१००० +दो हज़ार~२००० +दस हज़ार~१०००० +एक लाख~१००००० +दो लाख~२००००० +एक करोड़~१००००००० +पाँच करोड़~५००००००० +साढ़े तीन सौ~३५० +सवा दो सौ~२२५ +डेढ़ सौ~१५० +ढाई सौ~२५० +साढ़े तीन हज़ार~३५०० +सवा दो हज़ार~२२५० +ढाई हज़ार~२५०० +पौने चार सौ~३७५ +पौने दो सौ~१७५ +zero~zero +sixty~60 +nineteen~19 +two hundred and fifty four~254 +one hundred forty seven thousand four hundred fifty one~147451 +one million one hundred fifty six thousand one hundred seventy three~1156173 +one billion five hundred ninety three million seventy two thousand nine hundred sixty one~1593072961 +minus twenty five thousand thirty seven~-25037 +minus sixty~-60 +forty six thousand six hundred sixty four~46664 +two million three~2000003 +one thousand thirteen~1013 +one thousand one~1001 +one thousand one hundred~1100 +one thousand twenty 
six~1026 +one thousand one hundred twenty six~1126 +eleven hundred~1100 +twenty one hundred~2100 +twenty one hundred and eleven~2111 +eleven hundred twenty one~1121 +twenty one crore ninety eight lakh thirty six thousand five hundred and ninety three~219836593 diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..bdad780d5 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_date.txt @@ -0,0 +1,32 @@ +पाँच जनवरी~५ जनवरी +दस फ़रवरी~१० फ़रवरी +पन्द्रह मार्च~१५ मार्च +बीस अप्रैल~२० अप्रैल +तीस जून~३० जून +पाँच जनवरी दो हज़ार बारह~५ जनवरी, २०१२ +दस फ़रवरी उन्नीस सौ नब्बे~१० फ़रवरी, १९९० +दो हज़ार बारह~२०१२ +उन्नीस सौ सत्तर~१९७० +दो हज़ार~२००० +उन्नीस सौ~१९०० +दो हज़ार बारह से दो हज़ार पन्द्रह~२०१२-२०१५ +पहली सदी~पहली सदी +बीसवीं सदी~२०वीं सदी +दो सौ तीन ईसा पूर्व~२०३ ई.पू. +चार सौ बीस ईसवी~४२० ई. +पन्द्रह सौ ईसवी~१५०० ई. +दो हज़ार बीस ईसवी~२०२० ई. 
+पन्द्रह अगस्त उन्नीस सौ सैंतालीस~१५ अगस्त, १९४७ +छब्बीस जनवरी उन्नीस सौ पचास~२६ जनवरी, १९५० +january first~january 1 +july twenty second two thousand eight~july 22, 2008 +june thirty~june 30 +july twenty fifth twenty twelve~july 25, 2012 +nineteen seventeen~1917 +twenty twelve~2012 +nineteen ninety four~1994 +two thousand three~2003 +the twenty fifth of july twenty twelve~25 july, 2012 +the fifteenth of january~15 january +february twenty fifth twenty sixteen~february 25, 2016 +november twenty fourth twenty fourteen~november 24, 2014 diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_date_cased.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_date_cased.txt new file mode 100644 index 000000000..96bbc7d32 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_date_cased.txt @@ -0,0 +1,70 @@ +july twenty fifth two thousand twelve~july 25, 2012 +nineteen eighties~1980s +two thousand and twenty~2020 +two thousand and nine~2009 +the twenty fifth of july twenty twelve~25 july, 2012 +the twenty fifth of july two thousand twelve~25 july, 2012 +the twenty second of july twenty twelve~22 july, 2012 +the fifteenth of january~15 january +the seventeenth of may twenty ten~17 may, 2010 +january first~january 1 +july twenty second two thousand eight~july 22, 2008 +june thirty~june 30 +july twenty fifth twenty twelve~july 25, 2012 +nineteen seventeen~1917 +twenty twelve~2012 +march sixteen sixty five~march 1665 +sixteen sixty five~1665 +july two thousand twelve~july 2012 +october nineteen oh five~october 1905 +july fifteen o six~july 1506 +the twenty fifth of july twenty twelve~25 july, 2012 +july twenty fifth twenty twelve~july 25, 2012 +july twenty fifth two thousand twelve~july 25, 2012 +july one thousand eight hundred seventy six~july 1876 +february twenty fifth twenty sixteen~february 25, 2016 +november twenty fourth twenty fourteen~november 24, 2014 +nineteen 
ninety four~1994 +two thousand three~2003 +one thousand eight~1008 +nineteen seventy six~1976 +june twentieth twenty fourteen~june 20, 2014 +nineteen seventy three~1973 +nineteen seventy five~1975 +eleven fifty five~1155 +July twenty fifth two thousand twelve~July 25, 2012 +Nineteen eighties~1980s +Two thousand and twenty~2020 +Two thousand and nine~2009 +The twenty fifth of july twenty twelve~25 july, 2012 +The twenty fifth of july two thousand twelve~25 july, 2012 +The twenty second of july twenty twelve~22 july, 2012 +The fifteenth of january~15 january +The fifteenth of January~15 January +The seventeenth of may twenty ten~17 may, 2010 +January first~January 1 +July twenty second two thousand eight~July 22, 2008 +June thirty~June 30 +July twenty fifth twenty twelve~July 25, 2012 +Nineteen seventeen~1917 +Twenty twelve~2012 +March sixteen sixty five~March 1665 +Sixteen sixty five~1665 +July two thousand twelve~July 2012 +October nineteen oh five~October 1905 +July fifteen o six~July 1506 +The twenty fifth of july twenty twelve~25 july, 2012 +The twenty fifth of July twenty twelve~25 July, 2012 +July twenty fifth twenty twelve~July 25, 2012 +July twenty fifth two thousand twelve~July 25, 2012 +July one thousand eight hundred seventy six~July 1876 +February twenty fifth twenty sixteen~February 25, 2016 +November twenty fourth twenty fourteen~November 24, 2014 +Nineteen ninety four~1994 +Two thousand three~2003 +One thousand eight~1008 +Nineteen seventy six~1976 +June twentieth twenty fourteen~June 20, 2014 +Nineteen seventy three~1973 +Nineteen seventy five~1975 +Eleven fifty five~1155 diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_decimal.txt new file mode 100644 index 000000000..075aebab7 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_decimal.txt @@ -0,0 +1,25 @@ +एक दशमलव दो छह~१.२६ 
+शून्य दशमलव पाँच~०.५ +ऋण एक दशमलव दो छह~-१.२६ +दो दशमलव तीन चार~२.३४ +पाँच दशमलव शून्य एक~५.०१ +five point two million~5.2 million +one hundred sixty four point five eight thousand~164.58 thousand +four hundred million~400 million +fifty billion~50 billion +one point two five billion~1.25 billion +thirteen billion~13 billion +thirty billion~30 billion +two thousand eight hundred five point eight seven three billion~2805.873 billion +eighteen~18 +eighteen point eight five~18.85 +eighteen point five o~18.50 +eighteen point five six~18.56 +eighteen point nine~18.9 +eighteen point o five~18.05 +eighteen point one two~18.12 +eighteen point o one~18.01 +zero point two six~0.26 +point zero two~.02 +sixty point two~60.2 +minus sixty point two four zero zero~-60.2400 diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_electronic.txt new file mode 100644 index 000000000..de609c263 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_electronic.txt @@ -0,0 +1,17 @@ +a dot b c at g mail dot com~a.bc@gmail.com +a at gmail dot com~a@gmail.com +c d f at a b c dot e d u~cdf@abc.edu +a b c at g mail dot a b c~abc@gmail.abc +a b c at a b c dot com~abc@abc.com +a s d f one two three at a b c dot com~asdf123@abc.com +a one b two at a b c dot com~a1b2@abc.com +a b three dot s d d dot three at g mail dot com~ab3.sdd.3@gmail.com +h t t p colon slash slash w w w dot o u r d a i l y n e w s dot com dot s m~http://www.ourdailynews.com.sm +w w w dot c o m d a i l y n e w s dot a b slash s m~www.comdailynews.ab/sm +n vidia dot com~nvidia.com +abc at gmail dot com~abc@gmail.com +athreed at gmail dot com~athreed@gmail.com +kore dot ai~kore.ai +a at nvidia dot com~a@nvidia.com +a dot b c at nvidia dot com~a.bc@nvidia.com +a b three hyphen s d d dash three at g mail dot com~ab3-sdd-3@gmail.com diff --git 
a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_fraction.txt new file mode 100644 index 000000000..ce534bc21 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_fraction.txt @@ -0,0 +1,39 @@ +आठ बटा तीन~८/३ +दो बटा पाँच~२/५ +एक बटा चार~१/४ +दो सही दो बटा तीन~२ २/३ +तीन सही एक बटा चार~३ १/४ +पाँच सही तीन बटा सात~५ ३/७ +सात बटे ग्यारह~७/११ +छह बटा तेरह~६/१३ +डेढ़~१ १/२ +ढाई~२ १/२ +पाव~१/४ +एक सौ नौ बटा एक सौ चौबीस~१०९/१२४ +एक सौ एक बटा दो~१०१/२ +दो सौ एक बटा दो~२०१/२ +एक सौ एक बटा चार~१०१/४ +दो सौ बटा पाँच सौ~२००/५०० +दो सौ बटा बारह~२००/१२ +एक सौ तेईस बटा एक सौ पच्चीस~१२३/१२५ +छह सौ बासठ बटा एक~६६२/१ +एक सौ पाँच बटा सात~१०५/७ +छह सौ चौवन बटा तीन~६५४/३ +एक सौ तैंतीस सही एक बटा दो~१३३ १/२ +एक सौ तैंतीस सही दो बटा तीन~१३३ २/३ +एक सही छह बटा छह~१ ६/६ +दो सही एक बटा छह~२ १/६ +तीन सही तीन बटा चार~३ ३/४ +एक सौ बीस सही तीन बटा चार~१२० ३/४ +एक सौ बीस सही पिछत्तर बटा नब्बे~१२० ७५/९० +सवा चौरासी~८४ १/४ +आधा~१/२ +साढ़े~१/२ +सवा~१/४ +पौन~३/४ +पौना~३/४ +सवा पैंतीस~३५ १/४ +साढ़े चार सौ बटा दस~४५०/१० +तीन चौथाई~३/४ +दो तिहाई~२/३ +एक चौथाई~१/४ diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_measure.txt new file mode 100644 index 000000000..d232013ae --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_measure.txt @@ -0,0 +1,25 @@ +दो सौ मीटर~२०० m +तीन किलोग्राम~३ kg +साढ़े तीन किलोग्राम~३.५ kg +ऋण बारह किलोग्राम~-१२ kg +पचास किलोमीटर~५० km +तीन सौ ग्राम~३०० g +सवा दो किलोग्राम~२.२५ kg +दो सौ किलोमीटर प्रति घंटा~२०० km/h +बीस डिग्री सेल्सियस~२० °C +एक सौ मीटर~१०० m +two hundred meters~200 m +three hours~3 h +two hundred kilometers per hour~200 km/h +minus sixty six kilograms~-66 kg +eight point five megawatts~8.5 mW +eight point five meters~8.5 m 
+eight point five two percent~8.52 % +eight point four four percent~8.44 % +five degrees celsius~5 °C +seventy two degrees fahrenheit~72 °F +two hundred seventy three kelvin~273 K +eighteen feet~18 ft +eighteen point five kilometers~18.5 km +eight hundred fifty meters~850 m +eight hundred kilograms~800 kg diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_money.txt new file mode 100644 index 000000000..120868b6f --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_money.txt @@ -0,0 +1,36 @@ +दस रुपये~₹१० +सौ रुपये~₹१०० +हज़ार रुपये~₹१००० +दो सौ रुपये~₹२०० +पाँच सौ रुपये~₹५०० +दो हज़ार रुपये~₹२००० +एक लाख रुपये~₹१००००० +दस रुपये और पचास पैसे~₹१०.५० +बीस डॉलर~$२० +पचास यूरो~€५० +बहत्तर बिटकॉइन~₿७२ +ढाई सौ रुपये~₹२५० +साढ़े तीन सौ रुपये~₹३५० +सवा दो सौ रुपये~₹२२५ +पौने चार सौ रुपये~₹३७५ +ढाई करोड़ रुपये~₹२५०००००० +साढ़े तीन लाख रुपये~₹३५०००० +सवा दो लाख रुपये~₹२२५००० +पचास हज़ार रुपये~₹५०००० +two dollars~$2 +one cent~$0.01 +four united states dollars and sixty nine cents~$4.69 +seventy five dollars sixty three~$75.63 +twenty nine dollars fifty cents~$29.50 +eleven dollars and fifty one cents~$11.51 +nine hundred ninety three dollars and ninety two cents~$993.92 +four hundred sixty billion won~₩460 billion +thirty billion yen~¥30 billion +two point five billion dollars~$2.5 billion +forty five billion dollars~$45 billion +fifty million dollars~$50 million +one dollar~$1 +fifteen thousand dollars~$15000 +twenty dollar~$20 +eighteen dollars~$18 +fifteen hundred dollars~$1500 diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_ordinal.txt new file mode 100644 index 000000000..94910d49c --- /dev/null +++ 
b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_ordinal.txt @@ -0,0 +1,26 @@ +पहला~पहला +दूसरा~दूसरा +तीसरा~तीसरा +चौथा~चौथा +दसवां~१०वां +ग्यारहवां~११वां +बीसवां~२०वां +तेईसवां~२३वां +पचासवां~५०वां +नब्बेवाँ~९०वाँ +दसवीं~१०वीं +बीसवीं~२०वीं +first~1st +second~2nd +third~3rd +fourth~4th +eleventh~11th +twelfth~12th +thirteenth~13th +twenty first~21st +twenty third~23rd +one hundredth~100th +one hundred eleventh~111th +one thousandth~1000th +forty second~42nd +seventy first~71st diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_telephone.txt new file mode 100644 index 000000000..748668a64 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_telephone.txt @@ -0,0 +1,35 @@ +एक एक एक एक एक एक~११११११ +पाँच शून्य शून्य शून्य एक दो~५०००१२ +एक दो तीन चार पाँच छह~१२३४५६ +चार शून्य शून्य शून्य एक शून्य~४०००१० +सात पाँच शून्य शून्य शून्य दो~७५०००२ +आठ आठ शून्य नौ नौ शून्य~८८०९९० +नौ आठ सात छह पाँच चार तीन दो एक शून्य~९८७६५४३२१० +सात शून्य एक दो तीन चार पाँच छह सात आठ~७०१२३४५६७८ +आठ आठ आठ सात सात सात छह छह छह छह~८८८७७७६६६६ +छह दो नौ शून्य एक पाँच सात तीन चार आठ~६२९०१५७३४८ +नौ नौ आठ आठ सात सात छह छह पाँच पाँच~९९८८७७६६५५ +प्लस नौ एक नौ आठ सात छह पाँच चार तीन दो एक शून्य~+९१ ९८७६५४३२१० +प्लस नौ एक सात शून्य एक दो तीन चार पाँच छह सात आठ~+९१ ७०१२३४५६७८ +प्लस नौ एक आठ आठ आठ सात सात सात छह छह छह छह~+९१ ८८८७७७६६६६ +प्लस नौ एक एक एक एक एक एक एक एक एक एक एक~+९१ ११११११११११ +शून्य दो शून्य दो चार तीन सात एक पाँच चार दो~०२०२४३७१५४२ +शून्य एक एक दो छह एक दो तीन चार पाँच छह~०११२६१२३४५६ +चार चार दो दो आठ आठ छह छह चार चार~४४२२८८६६४४ +शून्य आठ शून्य चार एक दो तीन चार पाँच छह सात~०८०४१२३४५६७ +दो दो छह छह पांच चार तीन दो एक शून्य~२२६६५४३२१० +पाँच शून्य शून्य नौ~५००९ +एक शून्य दो शून्य~१०२० +one two three one two three five six seven eight~123-123-5678 +plus nine one one two three one two three 
five six seven eight~+91 123-123-5678 +plus forty four one two three one two three five six seven eight~+44 123-123-5678 +o two three one two three five six seven eight~023-123-5678 +oh two three one two three five six seven eight~023-123-5678 +double oh three one two three five six seven eight~003-123-5678 +two two five dot double five dot o dot four o~225.55.0.40 +two two five dot double five dot o dot forty five~225.55.0.45 +ssn is seven double nine one two three double one three~ssn is 799-12-3113 +seven nine nine~799 +a b nine~ab9 +a b c~a b c +five w k r a three one~5wkra31 diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..134b699b4 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_time.txt @@ -0,0 +1,29 @@ +एक बजे~१:०० +दो बजे~२:०० +दस बजे~१०:०० +बारह बजे~१२:०० +एक बजके सात मिनट~१:०७ +चार बजे चवालीस मिनट~४:४४ +दस बजके तीस मिनट~१०:३० +तीन बजके पन्द्रह मिनट~३:१५ +एक बजके दस मिनट दो सेकंड~१:१०:०२ +साढ़े तीन बजे~३:३० +सवा चार बजे~४:१५ +पौने पाँच बजे~४:४५ +डेढ़ बजे~१:३० +ढाई बजे~२:३० +eight oclock g m t~08:00 gmt +seven a m e s t~07:00 a.m. est +two p m~02:00 p.m. +two thirty~02:30 +three o'clock~03:00 +quarter past one~01:15 +half past three~03:30 +eight fifty one~08:51 +eight forty~08:40 +eleven fifty five p m~11:55 p.m. +eleven forty five a m~11:45 a.m. +eleven forty six a m~11:46 a.m. +quarter to twelve~11:45 +set alarm at ten to eleven pm~set alarm at 10:50 p.m. +one min to one am~12:59 a.m. 
diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..2b22a8a78 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,17 @@ +डॉक्टर~डॉ. +श्रीमती~स्मि. +श्री~श्री. +श्रीमान~श्री. +कुमारी~कु. +मास्टर~मा. +पाव~१/४ +doctor dao~dr. dao +misses smith~mrs. smith +mister dao~mr. dao +saint george~st. george +i like for example ice cream~i like e.g. ice cream +s and p five hundred~S&P 500 +seven eleven stores~7-eleven stores +r t x~RTX +nvidia a one hundred~Nvidia A100 +c u d n n~cuDNN diff --git a/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_word.txt new file mode 100644 index 000000000..14977faec --- /dev/null +++ b/tests/nemo_text_processing/hi_en/data_inverse_text_normalization/test_cases_word.txt @@ -0,0 +1,19 @@ +~ +yahoo!~yahoo! +twenty!~20 ! +x ~x +—~— +aaa~aaa +aabach~aabach +aabenraa~aabenraa +aachen's~aachen's +aadri~aadri +aahar~aahar +aahh~aahh +~ +, one~, one +, one , two , three , four~, one , two , three , four +e s three~es3 +नमस्ते~नमस्ते +भारत~भारत +दुनिया~दुनिया diff --git a/tests/nemo_text_processing/hi_en/test_address.py b/tests/nemo_text_processing/hi_en/test_address.py new file mode 100644 index 000000000..71964eec4 --- /dev/null +++ b/tests/nemo_text_processing/hi_en/test_address.py @@ -0,0 +1,31 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestAddress:  # the normalizer is a class attribute, built once and shared by every parametrized case
+    inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_address.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm_hi(self, test_input, expected):  # one call per row of the address data file
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred.strip() == expected.strip()  # leading/trailing whitespace is ignored on both sides
diff --git a/tests/nemo_text_processing/hi_en/test_cardinal.py b/tests/nemo_text_processing/hi_en/test_cardinal.py
new file mode 100644
index 000000000..d82dceed2
--- /dev/null
+++ b/tests/nemo_text_processing/hi_en/test_cardinal.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestCardinal:  # both normalizers are class attributes, built once and shared by every parametrized case
+    inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False)
+    inverse_normalizer_cased = InverseNormalizer(
+        lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased"
+    )
+
+    @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_cardinal.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm_hi(self, test_input, expected):  # default normalizer, hi_en data file
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # exact match, no whitespace stripping
+
+    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_cardinal_cased.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):  # cased normalizer; data file comes from the en/ suite
+        pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False)
+        assert pred == expected
diff --git a/tests/nemo_text_processing/hi_en/test_date.py b/tests/nemo_text_processing/hi_en/test_date.py
new file mode 100644
index 000000000..374555afe
--- /dev/null
+++ b/tests/nemo_text_processing/hi_en/test_date.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestDate:  # both normalizers are class attributes, built once and shared by every parametrized case
+    inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False)
+    inverse_normalizer_cased = InverseNormalizer(
+        lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased"
+    )
+
+    @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_date.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm_hi(self, test_input, expected):  # default normalizer, hi_en data file
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # exact match, no whitespace stripping
+
+    @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_date_cased.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):  # cased normalizer; unlike sibling suites, data is under hi_en/
+        pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False)
+        assert pred == expected
diff --git a/tests/nemo_text_processing/hi_en/test_decimal.py b/tests/nemo_text_processing/hi_en/test_decimal.py
new file mode 100644
index 000000000..1128fdfa9
--- /dev/null
+++ b/tests/nemo_text_processing/hi_en/test_decimal.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestDecimal:  # both normalizers are class attributes, built once and shared by every parametrized case
+    inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False)
+    inverse_normalizer_cased = InverseNormalizer(
+        lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased"
+    )
+
+    @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_decimal.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm_hi(self, test_input, expected):  # default normalizer, hi_en data file
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # exact match, no whitespace stripping
+
+    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_decimal_cased.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):  # cased normalizer; data file comes from the en/ suite
+        pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False)
+        assert pred == expected
diff --git a/tests/nemo_text_processing/hi_en/test_electronic.py b/tests/nemo_text_processing/hi_en/test_electronic.py
new file mode 100644
index 000000000..e57c846de
--- /dev/null
+++ b/tests/nemo_text_processing/hi_en/test_electronic.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestElectronic:  # hi_en ITN checks for the "electronic" fixtures; normalizers are shared class attributes
+    inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False)
+    inverse_normalizer_cased = InverseNormalizer(
+        lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased"
+    )  # same hi_en grammar, built with input_case="cased"
+
+    @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_electronic.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm_hi(self, test_input, expected):  # one generated test per entry of test_cases_electronic.txt
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # output must equal the expected string exactly
+
+    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_electronic_cased.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):  # cased instance is run on the English fixture under en/
+        pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # output must equal the expected string exactly
diff --git a/tests/nemo_text_processing/hi_en/test_fraction.py b/tests/nemo_text_processing/hi_en/test_fraction.py
new file mode 100644
index 000000000..984d7ecf9
--- /dev/null
+++ b/tests/nemo_text_processing/hi_en/test_fraction.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestFraction:  # hi_en ITN checks for fractions; only the default-case normalizer is exercised here
+    inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_fraction.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm_hi(self, test_input, expected):  # one generated test per entry of test_cases_fraction.txt
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # output must equal the expected string exactly
diff --git a/tests/nemo_text_processing/hi_en/test_measure.py b/tests/nemo_text_processing/hi_en/test_measure.py
new file mode 100644
index 000000000..a06cd3f15
--- /dev/null
+++ b/tests/nemo_text_processing/hi_en/test_measure.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestMeasure:  # hi_en ITN checks for measures; the normalizers are class attributes shared by every case
+    inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False)
+    inverse_normalizer_cased = InverseNormalizer(
+        lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased"
+    )  # same hi_en grammar, built with input_case="cased"
+
+    @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_measure.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm_hi(self, test_input, expected):  # one generated test per entry of test_cases_measure.txt
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # output must equal the expected string exactly
+
+    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_measure_cased.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):  # cased instance is run on the English fixture under en/
+        pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # output must equal the expected string exactly
diff --git a/tests/nemo_text_processing/hi_en/test_money.py b/tests/nemo_text_processing/hi_en/test_money.py
new file mode 100644
index 000000000..66ebe143c
--- /dev/null
+++ b/tests/nemo_text_processing/hi_en/test_money.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestMoney:  # hi_en ITN checks for money amounts; the normalizers are class attributes shared by every case
+    inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False)
+    inverse_normalizer_cased = InverseNormalizer(
+        lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased"
+    )  # same hi_en grammar, built with input_case="cased"
+
+    @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_money.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm_hi(self, test_input, expected):  # one generated test per entry of test_cases_money.txt
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # output must equal the expected string exactly
+
+    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_money_cased.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):  # cased instance is run on the English fixture under en/
+        pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # output must equal the expected string exactly
diff --git a/tests/nemo_text_processing/hi_en/test_ordinal.py b/tests/nemo_text_processing/hi_en/test_ordinal.py
new file mode 100644
index 000000000..1dd063efc
--- /dev/null
+++ b/tests/nemo_text_processing/hi_en/test_ordinal.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestOrdinal:  # hi_en ITN checks for ordinals; the normalizers are class attributes shared by every case
+    inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False)
+    inverse_normalizer_cased = InverseNormalizer(
+        lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased"
+    )  # same hi_en grammar, built with input_case="cased"
+
+    @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_ordinal.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm_hi(self, test_input, expected):  # one generated test per entry of test_cases_ordinal.txt
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # output must equal the expected string exactly
+
+    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_ordinal_cased.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):  # cased instance is run on the English fixture under en/
+        pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # output must equal the expected string exactly
diff --git a/tests/nemo_text_processing/hi_en/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/hi_en/test_sparrowhawk_inverse_text_normalization.sh
new file mode 100644
index 000000000..0a805bbed
--- /dev/null
+++ b/tests/nemo_text_processing/hi_en/test_sparrowhawk_inverse_text_normalization.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"}
+PROJECT_DIR=${2:-"/workspace/tests"}
+
+runtest () {
+  input=$1
+  echo "INPUT is $input"
+  cd "${GRAMMARS_DIR}" || { fail "cannot cd to ${GRAMMARS_DIR}"; return; }
+
+  # read test file: one "spoken~written" case per line; the || clause keeps a final line that lacks a newline
+  while read -r testcase || [ -n "$testcase" ]; do
+    IFS='~' read -r spoken written <<< "$testcase"
+    denorm_pred=$(printf '%s\n' "$spoken" | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g')
+
+    # trim white space (printf, not echo -e, so backslashes and leading dashes in the data stay literal)
+    written="$(printf '%s\n' "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
+    denorm_pred="$(printf '%s\n' "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
+
+    # input expected actual
+    assertEquals "$spoken" "$written" "$denorm_pred"
+  done < "$input"
+}
+
+testITNCardinal() {
+  input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_cardinal.txt
+  runtest "$input"
+}
+
+testITNDate() {
+  input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_date.txt
+  runtest "$input"
+}
+
+testITNDecimal() {
+  input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_decimal.txt
+  runtest "$input"
+}
+
+testITNOrdinal() {
+  input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_ordinal.txt
+  runtest "$input"
+}
+
+testITNTime() {
+  input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_time.txt
+  runtest "$input"
+}
+
+testITNMeasure() {
+  input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_measure.txt
+  runtest "$input"
+}
+
+testITNMoney() {
+  input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_money.txt
+  runtest "$input"
+}
+
+testITNWhitelist() {
+  input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_whitelist.txt
+  runtest "$input"
+}
+
+testITNTelephone() {
+  input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_telephone.txt
+  runtest "$input"
+}
+
+testITNAddress() {
+  input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_address.txt
+  runtest "$input"
+}
+
+testITNFraction() {
+  input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_fraction.txt
+  runtest "$input"
+}
+
+testITNElectronic() {
+  input=$PROJECT_DIR/hi_en/data_inverse_text_normalization/test_cases_electronic.txt
+  runtest "$input"
+}
+
+# Remove all command-line arguments
+shift $#
+
+# Load shUnit2
+. /workspace/shunit2/shunit2
diff --git a/tests/nemo_text_processing/hi_en/test_telephone.py b/tests/nemo_text_processing/hi_en/test_telephone.py
new file mode 100644
index 000000000..4c3d23640
--- /dev/null
+++ b/tests/nemo_text_processing/hi_en/test_telephone.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestTelephone:  # hi_en ITN checks for telephone numbers; normalizers are shared class attributes
+    inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False)
+    inverse_normalizer_cased = InverseNormalizer(
+        lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased"
+    )  # same hi_en grammar, built with input_case="cased"
+
+    @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_telephone.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm_hi(self, test_input, expected):  # one generated test per entry of test_cases_telephone.txt
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # output must equal the expected string exactly
+
+    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_telephone_cased.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):  # cased instance is run on the English fixture under en/
+        pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # output must equal the expected string exactly
diff --git a/tests/nemo_text_processing/hi_en/test_time.py b/tests/nemo_text_processing/hi_en/test_time.py
new file mode 100644
index 000000000..2ac327a65
--- /dev/null
+++ b/tests/nemo_text_processing/hi_en/test_time.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestTime:  # hi_en ITN checks for time expressions; the normalizers are class attributes shared by every case
+    inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False)
+    inverse_normalizer_cased = InverseNormalizer(
+        lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased"
+    )  # same hi_en grammar, built with input_case="cased"
+
+    @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_time.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm_hi(self, test_input, expected):  # one generated test per entry of test_cases_time.txt
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # output must equal the expected string exactly
+
+    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_time_cased.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):  # cased instance is run on the English fixture under en/
+        pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # output must equal the expected string exactly
diff --git a/tests/nemo_text_processing/hi_en/test_whitelist.py b/tests/nemo_text_processing/hi_en/test_whitelist.py
new file mode 100644
index 000000000..d2989f5a1
--- /dev/null
+++ b/tests/nemo_text_processing/hi_en/test_whitelist.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestWhitelist:  # hi_en ITN checks for whitelist entries; only the default-case normalizer is exercised here
+    inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False)
+
+    @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_whitelist.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm_hi(self, test_input, expected):  # one generated test per entry of test_cases_whitelist.txt
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # output must equal the expected string exactly
diff --git a/tests/nemo_text_processing/hi_en/test_word.py b/tests/nemo_text_processing/hi_en/test_word.py
new file mode 100644
index 000000000..132ee43c5
--- /dev/null
+++ b/tests/nemo_text_processing/hi_en/test_word.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestWord:  # hi_en ITN checks for plain words; the normalizers are class attributes shared by every case
+    inverse_normalizer = InverseNormalizer(lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False)
+    inverse_normalizer_cased = InverseNormalizer(
+        lang='hi_en', cache_dir=CACHE_DIR, overwrite_cache=False, input_case="cased"
+    )  # same hi_en grammar, built with input_case="cased"
+
+    @parameterized.expand(parse_test_case_file('hi_en/data_inverse_text_normalization/test_cases_word.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm_hi(self, test_input, expected):  # one generated test per entry of test_cases_word.txt
+        pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # output must equal the expected string exactly
+
+    @parameterized.expand(parse_test_case_file('en/data_inverse_text_normalization/test_cases_word_cased.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_denorm(self, test_input, expected):  # cased instance is run on the English fixture under en/
+        pred = self.inverse_normalizer_cased.inverse_normalize(test_input, verbose=False)
+        assert pred == expected  # output must equal the expected string exactly
diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py
index 3e80b56ff..1f6b5f1e8 100644
--- a/tools/text_processing_deployment/pynini_export.py
+++ b/tools/text_processing_deployment/pynini_export.py
@@ -103,6 +103,7 @@ def parse_args():
             'es_en',
             'he',
             'hi',
+            'hi_en',
             'hy',
             'mr',
             'ja',
@@ -139,7 +140,7 @@ def parse_args():
 if __name__ == '__main__':
     args = parse_args()
 
-    if args.language in ['pt', 'ru', 'es_en', 'mr'] and args.grammars == 'tn_grammars':
+    if args.language in ['pt', 'ru', 'es_en', 'hi_en', 'mr'] and args.grammars == 'tn_grammars':
         raise ValueError('Only ITN grammars could be deployed in Sparrowhawk for the selected languages.')
     TNPostProcessingFst = None
     ITNPostProcessingFst = None
@@ -285,6 +286,13 @@ def parse_args():
             from nemo_text_processing.inverse_text_normalization.es_en.verbalizers.verbalize import (
                 VerbalizeFst as ITNVerbalizeFst,
             )
+        elif args.language == 'hi_en':
+            from nemo_text_processing.inverse_text_normalization.hi_en.taggers.tokenize_and_classify import (
+                ClassifyFst as ITNClassifyFst,
+            )
+            from nemo_text_processing.inverse_text_normalization.hi_en.verbalizers.verbalize import (
+                VerbalizeFst as ITNVerbalizeFst,
+            )
         elif args.language == 'mr':
             from nemo_text_processing.inverse_text_normalization.mr.taggers.tokenize_and_classify import (
                 ClassifyFst as ITNClassifyFst,

From f735184b6cc94e553401014d3c18cb9a224e8e5c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 17 Apr 2026 14:22:55 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../inverse_normalize.py | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py
index df5247260..35df8abd9 100644
--- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py
+++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py
@@ -185,7 +185,25 @@ def parse_args():
     parser.add_argument(
         "--language",
         help="language",
-        choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'hi_en', 'zh', 'he', 'hi', 'hy', 'mr', 'ja'],
+        choices=[
+            'en',
+            'de',
+            'es',
+            'pt',
+            'ru',
+            'fr',
+            'sv',
+            'vi',
+            'ar',
+            'es_en',
+            'hi_en',
+            'zh',
+            'he',
+            'hi',
+            'hy',
+            'mr',
+            'ja',
+        ],
         default="en",
         type=str,
     )