Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
.com ponto com
.com.br ponto com ponto br
.gov.br ponto gov ponto br
.org ponto org
.net ponto net
.edu ponto edu
.br ponto br
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
gmail
nvidia
outlook
hotmail
yahoo
live
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
. ponto
- traço
_ underscore
! exclamação
# cerquilha
$ dólar
% por cento
& e comercial
' apóstrofo
* asterisco
+ mais
/ barra
= igual
? interrogação
^ acento circunflexo
` crase
{ chave esquerda
| barra vertical
} chave direita
~ til
, vírgula
: dois pontos
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
h horas
min minutos
s segundos
ms milissegundos
ns nanossegundos
μs microssegundos
t toneladas
kg quilos
kg quilogramas
g gramas
mg miligramas
μm micrômetros
nm nanômetros
mm milímetros
cm centímetros
cm² centímetros quadrados
cm³ centímetros cúbicos
m metros
m² metros quadrados
m³ metros cúbicos
km quilômetros
km² quilômetros quadrados
ha hectares
kph quilômetros por hora
mph milhas por hora
m/s metros por segundo
l litros
ml mililitros
kgf quilogramas forças
kgf quilogramas força
% por cento
°F fahrenheit
°F graus fahrenheit
°C graus celsius
Hz hertz
kHz quilo hertz
MHz mega hertz
GHz giga hertz
W watts
kW quilowatts
MW megawatts
GW gigawatts
Wh watts hora
kWh quilowatts hora
MWh megawatts hora
GWh gigawatts hora
kV quilovolts
V volts
mV milivolts
A amperes
mA miliamperes
rpm rotações por minuto
db decibéis
cal calorias
kcal quilocalorias
G gramas
KG quilos
KG quilogramas
KM quilômetros
M metros
L litros
ML mililitros
M2 metros quadrados
M^2 metros quadrados
C graus celsius
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
h hora
min minuto
s segundo
ms milissegundo
ns nanossegundo
μs microssegundo
t tonelada
kg quilo
kg quilograma
g grama
mg miligrama
μm micrômetro
nm nanômetro
mm milímetro
cm centímetro
cm² centímetro quadrado
cm³ centímetro cúbico
m metro
m² metro quadrado
m³ metro cúbico
km quilômetro
km² quilômetro quadrado
ha hectare
kph quilômetro por hora
mph milha por hora
m/s metro por segundo
l litro
ml mililitro
kgf quilograma força
% por cento
°F fahrenheit
°C celsius
°F grau fahrenheit
°C grau celsius
Hz hertz
kHz quilo hertz
MHz mega hertz
GHz giga hertz
W watt
kW quilowatt
MW megawatt
GW gigawatt
Wh watt hora
kWh quilowatt hora
MWh megawatt hora
GWh gigawatt hora
kV quilovolt
V volt
mV milivolt
A ampere
mA miliampere
rpm rotação por minuto
db decibel
cal caloria
kcal quilocaloria
G grama
KG quilo
KG quilograma
KM quilômetro
M metro
L litro
ML mililitro
M2 metro quadrado
M^2 metro quadrado
C celsius
13 changes: 13 additions & 0 deletions nemo_text_processing/text_normalization/pt/data/money/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
US$ dólar americano
R$ real
€ euro
£ libra esterlina
$ dólar
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
dólar americano dólares americanos
real reais
euro euros
libra esterlina libras esterlinas
dólar dólares
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
US$ centavo
R$ centavo
€ centavo
£ centavo
$ centavo
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
centavo centavos
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ramal
extensão
ext.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ip ip
endereço de ip endereço de i p
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ligue para
telefone
celular
meu número é
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,12 @@ def __init__(self, deterministic: bool = True):
self.tens = graph_tens.optimize()
self.two_digit_non_zero = pynini.union(digit, graph_tens, (pynini.cross("0", NEMO_SPACE) + digit)).optimize()

# After "X00" hundreds (oitocentos, …), suffix "01"-"09" needs leading zero stripped
# (graph_tens has no path for "09"; connector+digit only consumes one digit).
graph_hundreds = hundreds + pynini.union(
pynutil.delete("00"),
(connector_e + graph_tens),
(connector_e + pynutil.delete("0") + digit),
(connector_e + digit),
)
# "100" -> cem only (cross("1", cento)+delete("00") would also match "100" but
Expand Down
82 changes: 82 additions & 0 deletions nemo_text_processing/text_normalization/pt/taggers/electronic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.pt.graph_utils import NEMO_ALPHA, NEMO_DIGIT, NEMO_SPACE, GraphFst
from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels


class ElectronicFst(GraphFst):
    """
    Finite state transducer for classifying electronic strings (e-mail addresses, URLs,
    bare dotted domains and "@" handles) in pt-BR, e.g.
        abc@hotmail.com -> electronic { username: "abc" domain: "hotmail.com" preserve_order: true }
        https://www.abc.com -> electronic { protocol: "https://www." domain: "abc.com" preserve_order: true }

    Args:
        deterministic: when False, the "www." prefix may additionally be rewritten as
            "dáblio dáblio dáblio." inside the protocol field.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="electronic", kind="classify", deterministic=deterministic)

        def quoted_field(field_name, content):
            # Wrap `content` as: field_name: "<content>"
            return pynutil.insert(field_name + ': "') + content + pynutil.insert('"')

        dot = pynini.accep(".")

        # Symbol inventory: first column of every row in the symbols table.
        symbol_labels = [row[0] for row in load_labels(get_abs_path("data/electronic/symbols.tsv"))]
        any_symbol = pynini.union(*symbol_labels)
        alnum = NEMO_ALPHA | NEMO_DIGIT

        # One or more characters; the first variant excludes ".", the second allows it.
        chars_without_dot = pynini.closure(alnum | pynini.difference(any_symbol, dot), 1)
        chars_with_dot = pynini.closure(alnum | any_symbol, 1)

        # A host is a dot-free label followed by at least one ".label" component.
        host = chars_without_dot + pynini.closure(dot + chars_without_dot, 1)
        domain_graph = quoted_field("domain", host)

        # E-mail: username (dots allowed), "@" replaced by a space, then the domain field.
        email = quoted_field("username", chars_with_dot) + pynini.cross("@", NEMO_SPACE) + domain_graph

        # "@handle": the leading "@" is dropped and the remainder is tagged as username.
        social_tag = pynini.cross("@", "") + quoted_field("username", chars_without_dot | host)

        # Protocol: scheme alone, "www." alone, or scheme followed by "www.".
        scheme = pynini.accep("https" + "://") | pynini.accep("http" + "://")
        www_prefix = pynini.accep("www" + ".")
        if not deterministic:
            www_prefix = www_prefix | pynini.cross("www" + ".", "dáblio dáblio dáblio.")
        protocol_body = scheme | www_prefix | (scheme + www_prefix)
        url = quoted_field("protocol", protocol_body) + pynutil.insert(NEMO_SPACE) + domain_graph

        self.graph = url | domain_graph | email | social_tag

        # preserve_order keeps the fields in the order they were emitted above.
        tagged = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true"))
        self.fst = tagged.optimize()
Loading