From d2df1aaf59b759176fa72d6f1ab54a3453ac5826 Mon Sep 17 00:00:00 2001 From: Mai Anh Date: Mon, 18 May 2026 10:41:24 +0700 Subject: [PATCH 1/3] spanish tn code switch Signed-off-by: Mai Anh --- .../en/data/address/address_word.tsv | 2 + .../es/data/address/__init__.py | 13 ++ .../es/data/address/apt_designator.tsv | 3 + .../es/data/address/direction.tsv | 4 + .../es/data/address/po_box.tsv | 1 + .../es/data/address/suite_designator.tsv | 2 + .../es/data/address/unit_designator.tsv | 1 + .../es/data/address/zip_digit.tsv | 10 ++ .../text_normalization/es/graph_utils.py | 14 ++ .../text_normalization/es/taggers/address.py | 166 ++++++++++++++++++ .../text_normalization/es/taggers/cardinal.py | 4 + .../text_normalization/es/taggers/measure.py | 9 + .../es/verbalizers/measure.py | 42 ++++- .../test_cases_measure.txt | 12 +- tests/nemo_text_processing/es/test_measure.py | 2 +- 15 files changed, 280 insertions(+), 5 deletions(-) create mode 100644 nemo_text_processing/text_normalization/es/data/address/__init__.py create mode 100644 nemo_text_processing/text_normalization/es/data/address/apt_designator.tsv create mode 100644 nemo_text_processing/text_normalization/es/data/address/direction.tsv create mode 100644 nemo_text_processing/text_normalization/es/data/address/po_box.tsv create mode 100644 nemo_text_processing/text_normalization/es/data/address/suite_designator.tsv create mode 100644 nemo_text_processing/text_normalization/es/data/address/unit_designator.tsv create mode 100644 nemo_text_processing/text_normalization/es/data/address/zip_digit.tsv create mode 100644 nemo_text_processing/text_normalization/es/taggers/address.py diff --git a/nemo_text_processing/text_normalization/en/data/address/address_word.tsv b/nemo_text_processing/text_normalization/en/data/address/address_word.tsv index 2e9e71615..609b23a9f 100644 --- a/nemo_text_processing/text_normalization/en/data/address/address_word.tsv +++ b/nemo_text_processing/text_normalization/en/data/address/address_word.tsv @@ -4,6 +4,8 @@ expy Expressway fwy Freeway hwy Highway dr Drive +rd Road +road Road ct Court ave Avenue av Avenue diff --git a/nemo_text_processing/text_normalization/es/data/address/__init__.py b/nemo_text_processing/text_normalization/es/data/address/__init__.py new file mode 100644 index 000000000..9e3fb699d --- /dev/null +++ b/nemo_text_processing/text_normalization/es/data/address/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/es/data/address/apt_designator.tsv b/nemo_text_processing/text_normalization/es/data/address/apt_designator.tsv new file mode 100644 index 000000000..13d31e807 --- /dev/null +++ b/nemo_text_processing/text_normalization/es/data/address/apt_designator.tsv @@ -0,0 +1,3 @@ +Apt. Apartamento +Apt. Apartamento +Apt Apartamento diff --git a/nemo_text_processing/text_normalization/es/data/address/direction.tsv b/nemo_text_processing/text_normalization/es/data/address/direction.tsv new file mode 100644 index 000000000..a30781406 --- /dev/null +++ b/nemo_text_processing/text_normalization/es/data/address/direction.tsv @@ -0,0 +1,4 @@ +E East +S South +W West +N North diff --git a/nemo_text_processing/text_normalization/es/data/address/po_box.tsv b/nemo_text_processing/text_normalization/es/data/address/po_box.tsv new file mode 100644 index 000000000..84299001b --- /dev/null +++ b/nemo_text_processing/text_normalization/es/data/address/po_box.tsv @@ -0,0 +1 @@ +P.O. Box P.O. Box diff --git a/nemo_text_processing/text_normalization/es/data/address/suite_designator.tsv b/nemo_text_processing/text_normalization/es/data/address/suite_designator.tsv new file mode 100644 index 000000000..9c18d6dd9 --- /dev/null +++ b/nemo_text_processing/text_normalization/es/data/address/suite_designator.tsv @@ -0,0 +1,2 @@ +Ste. Suite +Ste Suite diff --git a/nemo_text_processing/text_normalization/es/data/address/unit_designator.tsv b/nemo_text_processing/text_normalization/es/data/address/unit_designator.tsv new file mode 100644 index 000000000..041a49d05 --- /dev/null +++ b/nemo_text_processing/text_normalization/es/data/address/unit_designator.tsv @@ -0,0 +1 @@ +Unit Unit diff --git a/nemo_text_processing/text_normalization/es/data/address/zip_digit.tsv b/nemo_text_processing/text_normalization/es/data/address/zip_digit.tsv new file mode 100644 index 000000000..a33a4ba5b --- /dev/null +++ b/nemo_text_processing/text_normalization/es/data/address/zip_digit.tsv @@ -0,0 +1,10 @@ +0 cero +1 uno +2 dos +3 tres +4 cuatro +5 cinco +6 seis +7 siete +8 ocho +9 nueve diff --git a/nemo_text_processing/text_normalization/es/graph_utils.py b/nemo_text_processing/text_normalization/es/graph_utils.py index 946f4234e..89f6a9cfb 100644 --- a/nemo_text_processing/text_normalization/es/graph_utils.py +++ b/nemo_text_processing/text_normalization/es/graph_utils.py @@ -133,6 +133,20 @@ def strip_cardinal_apocope(fst: "pynini.FstLike") -> "pynini.FstLike": return fst @ strip +def normalize_spanish_cardinal_for_us_address_street(fst: "pynini.FstLike") -> "pynini.FstLike": + """ + Spanish cardinals often apocopate before a following vowel (e.g. ``veintiún``). US street names + are ASCII and usually start with a consonant, but the cardinal FST does not see that context when + materializing digits alone. Normalize common ``…ún`` spoken forms to ``…uno`` / ``… y uno`` for + address surfaces (same intent as ``strip_cardinal_apocope`` but not restricted to string end). + """ + out = fst + out = out @ pynini.cdrewrite(pynini.cross("veintiún", "veintiuno"), "", "", NEMO_SIGMA) + out = out @ pynini.cdrewrite(pynini.cross("treintún", "treinta y uno"), "", "", NEMO_SIGMA) + out = out @ pynini.cdrewrite(pynini.cross(" y ún", " y uno"), "", "", NEMO_SIGMA) + return strip_cardinal_apocope(out) + + def add_cardinal_apocope_fem(fst: "pynini.FstLike") -> "pynini.FstLike": """ Adds apocope on cardinal strings in line with stressing rules. e.g. "una" -> "un". This only occurs when "una" precedes a stressed "a" sound in formal speech. This is not predictable diff --git a/nemo_text_processing/text_normalization/es/taggers/address.py b/nemo_text_processing/text_normalization/es/taggers/address.py new file mode 100644 index 000000000..82c161c7a --- /dev/null +++ b/nemo_text_processing/text_normalization/es/taggers/address.py @@ -0,0 +1,166 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +US-style postal address surface for Spanish TN (embedded in ``MeasureFst`` as +``units: "address_us_es"``). + +Street numbers and ZIP are Spanish; street types, states, and ordinals (e.g. ``42nd``) +use English expansions from shared ``en/data/address/`` lexicons. +""" + +import pynini +from pynini.examples import plurals +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_ALPHA, + NEMO_DIGIT, + NEMO_SIGMA, + NEMO_SPACE, + NEMO_UPPER, + GraphFst, + insert_space, +) +from nemo_text_processing.text_normalization.en.taggers.cardinal import CardinalFst as EnCardinalFst +from nemo_text_processing.text_normalization.en.taggers.ordinal import OrdinalFst as OrdinalTagger +from nemo_text_processing.text_normalization.en.taggers.whitelist import get_formats +from nemo_text_processing.text_normalization.en.utils import get_abs_path as en_get_abs_path +from nemo_text_processing.text_normalization.en.utils import load_labels +from nemo_text_processing.text_normalization.en.verbalizers.ordinal import OrdinalFst as OrdinalVerbalizer +from nemo_text_processing.text_normalization.es.graph_utils import normalize_spanish_cardinal_for_us_address_street +from nemo_text_processing.text_normalization.es.utils import get_abs_path + + +class AddressUSSurfaceFst(GraphFst): + """ + Surface FST for US addresses inside Spanish sentences. + + Output is the spoken string stored in ``measure { units: "address_us_es" cardinal { integer: "..." } }``. + Not registered in ``tokenize_and_classify``; consumed by :class:`~nemo_text_processing.text_normalization.es.taggers.measure.MeasureFst`. + + Args: + cardinal: Spanish :class:`~nemo_text_processing.text_normalization.es.taggers.cardinal.CardinalFst` + deterministic: passed to English ordinal/cardinal helpers + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="address_us_es_surface", kind="classify", deterministic=deterministic) + + graph_direction = pynini.string_file(get_abs_path("data/address/direction.tsv")) + graph_zip_digit = pynini.string_file(get_abs_path("data/address/zip_digit.tsv")) + graph_suite_designator = pynini.string_file(get_abs_path("data/address/suite_designator.tsv")) + graph_apt_designator = pynini.string_file(get_abs_path("data/address/apt_designator.tsv")) + graph_unit_designator = pynini.string_file(get_abs_path("data/address/unit_designator.tsv")) + graph_po_box = pynini.string_file(get_abs_path("data/address/po_box.tsv")) + + en_cardinal = EnCardinalFst(deterministic=deterministic) + g = cardinal.graph + + ordinal_en = pynini.compose( + pynutil.insert('integer: "') + OrdinalTagger(cardinal=en_cardinal).graph + pynutil.insert('"'), + OrdinalVerbalizer().graph, + ) + + address_num = NEMO_DIGIT ** (1, 2) @ cardinal.graph_hundreds_component_at_least_one_none_zero_digit + address_num += insert_space + NEMO_DIGIT**2 @ ( + pynini.closure(pynini.cross("0", "cero "), 0, 1) + + cardinal.graph_hundreds_component_at_least_one_none_zero_digit + ) + address_num = pynini.compose(NEMO_DIGIT ** (3, 4), address_num) + address_num = normalize_spanish_cardinal_for_us_address_street( + plurals._priority_union(address_num, g, NEMO_SIGMA).optimize() + ) + + direction = pynini.closure( + pynini.accep(NEMO_SPACE) + graph_direction + pynini.closure(pynutil.delete("."), 0, 1), + 0, + 1, + ) + + address_words = get_formats(en_get_abs_path("data/address/address_word.tsv")) + street = ( + pynini.accep(NEMO_SPACE) + + (pynini.closure(ordinal_en, 0, 1) | NEMO_UPPER + pynini.closure(NEMO_ALPHA, 1)) + + NEMO_SPACE + + pynini.closure(NEMO_UPPER + pynini.closure(NEMO_ALPHA) + NEMO_SPACE) + + address_words + ) + + zip_five = ( + graph_zip_digit + + insert_space + + graph_zip_digit + + insert_space + + graph_zip_digit + + insert_space + + graph_zip_digit + + insert_space + + graph_zip_digit + ).optimize() + + city = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1) + city = pynini.closure(pynini.accep(",") + pynini.accep(NEMO_SPACE) + city, 0, 1) + + states = load_labels(en_get_abs_path("data/address/state.tsv")) + states_extra = [(x, f"{y[0]}.{y[1:]}") for x, y in states] + states.extend(states_extra) + state = pynini.closure( + pynini.accep(",") + pynini.accep(NEMO_SPACE) + pynini.invert(pynini.string_map(states)), 0, 1 + ) + + zip_code = pynini.compose(NEMO_DIGIT**5, zip_five) + zip_code = pynini.closure( + pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) + zip_code, + 0, + 1, + ) + tail = pynini.closure(city + state + zip_code, 0, 1).optimize() + + suite_num = normalize_spanish_cardinal_for_us_address_street( + (pynini.closure(NEMO_DIGIT, 1, 4) @ g).optimize() + ) + unit_num = normalize_spanish_cardinal_for_us_address_street( + (pynini.closure(NEMO_DIGIT, 1, 3) @ g).optimize() + ) + + comma_sp = pynini.accep(",") + pynini.accep(NEMO_SPACE) + suite = graph_suite_designator + pynini.closure(NEMO_SPACE, 0, 1) + suite_num + apt = graph_apt_designator + pynini.closure(NEMO_DIGIT | NEMO_UPPER, 1, 4) + unit = graph_unit_designator + unit_num + middle = pynini.closure(comma_sp + (suite | apt | unit), 0, 3).optimize() + + po_box = ( + graph_po_box + + normalize_spanish_cardinal_for_us_address_street(pynini.closure(NEMO_DIGIT, 1, 4) @ g) + + tail + ).optimize() + + standard = address_num + direction + street + middle + tail + hyphen = pynini.accep("-") + alpha_chars = NEMO_ALPHA | hyphen + standard_eos = ( + address_num + + direction + + street + + middle + + pynini.accep(".") + + pynini.closure(NEMO_SPACE, 1, 2) + + NEMO_UPPER + + pynini.closure(alpha_chars) + ) + standard |= pynutil.add_weight(standard_eos, -0.001) + standard |= address_num + direction + street + middle + pynini.closure(pynini.cross(".", ""), 0, 1) + + self.graph = (po_box | standard.optimize()).optimize() diff --git a/nemo_text_processing/text_normalization/es/taggers/cardinal.py b/nemo_text_processing/text_normalization/es/taggers/cardinal.py index 85402089f..371a48d2e 100644 --- a/nemo_text_processing/text_normalization/es/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/es/taggers/cardinal.py @@ -169,6 +169,10 @@ def __init__(self, deterministic: bool = True): self.graph = filter_punctuation(self.graph).optimize() + self.graph_hundreds_component_at_least_one_none_zero_digit = ( + graph_hundreds_component_at_least_one_none_zero_digit.optimize() + ) + optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") diff --git a/nemo_text_processing/text_normalization/es/taggers/measure.py b/nemo_text_processing/text_normalization/es/taggers/measure.py index a63677c47..4b20cb966 100644 --- a/nemo_text_processing/text_normalization/es/taggers/measure.py +++ b/nemo_text_processing/text_normalization/es/taggers/measure.py @@ -14,6 +14,7 @@ import pynini from pynini.lib import pynutil +from nemo_text_processing.text_normalization.es.taggers.address import AddressUSSurfaceFst from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_NON_BREAKING_SPACE, @@ -199,6 +200,13 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de + pynutil.insert("\" } preserve_order: true") ) + address_us_es_inner = AddressUSSurfaceFst(cardinal, deterministic=deterministic).graph + address_us_es = ( + pynutil.insert('units: "address_us_es" cardinal { integer: "') + + address_us_es_inner + + pynutil.insert('" } preserve_order: true') + ) + final_graph = ( subgraph_decimal | subgraph_cardinal @@ -210,6 +218,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, de | cardinal_times | alpha_dash_decimal | math + | address_us_es ) final_graph = self.add_tokens(final_graph) diff --git a/nemo_text_processing/text_normalization/es/verbalizers/measure.py b/nemo_text_processing/text_normalization/es/verbalizers/measure.py index de877446d..511868a42 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/measure.py @@ -23,6 +23,7 @@ delete_extra_space, delete_preserve_order, delete_space, + insert_space, ) from nemo_text_processing.text_normalization.es.graph_utils import ones from nemo_text_processing.text_normalization.es.utils import get_abs_path @@ -65,12 +66,26 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, de NEMO_WHITE_SPACE + "por" + pynini.closure(NEMO_NOT_QUOTE, 1), 0, 1 ) unit_masc |= "por" + pynini.closure(NEMO_NOT_QUOTE, 1) - unit_masc = pynutil.delete("units: \"") + (pynini.closure(NEMO_NOT_QUOTE) @ unit_masc) + pynutil.delete("\"") + unit_masc = ( + pynutil.delete("units: \"") + + ( + pynini.difference(pynini.closure(NEMO_NOT_QUOTE, 1), pynini.union("math", "address_us_es")) + @ unit_masc + ) + + pynutil.delete("\"") + ) unit_fem = (unit_plural_fem | unit_singular_fem) + pynini.closure( NEMO_WHITE_SPACE + "por" + pynini.closure(NEMO_NOT_QUOTE, 1), 0, 1 ) - unit_fem = pynutil.delete("units: \"") + (pynini.closure(NEMO_NOT_QUOTE) @ unit_fem) + pynutil.delete("\"") + unit_fem = ( + pynutil.delete("units: \"") + + ( + pynini.difference(pynini.closure(NEMO_NOT_QUOTE, 1), pynini.union("math", "address_us_es")) + @ unit_fem + ) + + pynutil.delete("\"") + ) graph_masc = (graph_cardinal_masc | graph_decimal_masc) + NEMO_WHITE_SPACE + unit_masc graph_masc |= graph_fraction_masc + NEMO_WHITE_SPACE + pynutil.insert("de ") + unit_masc @@ -96,7 +111,11 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, de graph @= pynini.cdrewrite(pynini.cross(ones, "uno"), "", NEMO_WHITE_SPACE + "por", NEMO_SIGMA) # To manage alphanumeric combonations ("a-8, 5x"), we let them use a weighted default path. - alpha_num_unit = pynutil.delete("units: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + alpha_num_unit = ( + pynutil.delete("units: \"") + + pynini.difference(pynini.closure(NEMO_NOT_QUOTE), pynini.union("math", "address_us_es")) + + pynutil.delete("\"") + ) graph_alpha_num = pynini.union( (graph_cardinal_masc | graph_decimal_masc) + NEMO_SPACE + alpha_num_unit, alpha_num_unit + delete_extra_space + (graph_cardinal_masc | graph_decimal_masc), @@ -106,8 +125,25 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, de pynutil.delete("units: \"math\"") + delete_space + graph_cardinal_masc + delete_space, -1 ) + preserve_order_tail = ( + pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space + ) + address_us_es = ( + pynutil.delete('units: "address_us_es" ') + + delete_space + + pynutil.delete("cardinal { integer: \"") + + delete_space + + pynini.closure(NEMO_NOT_QUOTE) + + pynutil.delete("\"") + + delete_space + + pynutil.delete("}") + + delete_space + + pynini.closure(preserve_order_tail) + ) + graph |= pynutil.add_weight(graph_alpha_num, 0.01) graph |= math + graph |= address_us_es graph += delete_preserve_order diff --git a/tests/nemo_text_processing/es/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/es/data_text_normalization/test_cases_measure.txt index 092dcbc33..42be1bf7d 100644 --- a/tests/nemo_text_processing/es/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/es/data_text_normalization/test_cases_measure.txt @@ -29,4 +29,14 @@ a-5~a cinco -8° c~menos ocho grados centígrados 40 ° k~cuarenta grados kelvin 180 psi~ciento ochenta p s i -2 + 2 - 1 = 3~dos más dos menos uno es igual a tres \ No newline at end of file +2 + 2 - 1 = 3~dos más dos menos uno es igual a tres +Mi dirección es 1234 Maple St., Springfield, IL 62704~Mi dirección es mil doscientos treinta y cuatro Maple Street, Springfield, Illinois seis dos siete cero cuatro +La oficina está ubicada en 567 Main St., Ste. 200, Dallas, TX 75201~La oficina está ubicada en quinientos sesenta y siete Main Street, Suite doscientos, Dallas, Texas siete cinco dos cero uno +Por favor envía el paquete a 890 Oak Ave., Apt. 5B, Brooklyn, NY 11201~Por favor envía el paquete a ochocientos noventa Oak Avenue, Apartamento 5B, Brooklyn, New York uno uno dos cero uno +Vivo en 4321 Sunset Blvd., Los Angeles, CA 90028, cerca del centro~Vivo en cuatro mil trescientos veintiuno Sunset Boulevard, Los Angeles, California nueve cero cero dos ocho, cerca del centro +La nueva tienda abrirá en 100 Market Rd., San Francisco, CA 94105~La nueva tienda abrirá en cien Market Road, San Francisco, California nueve cuatro uno cero cinco +Su casa queda en 25 W 42nd St., New York, NY 10036~Su casa queda en veinticinco West forty second Street, New York, New York uno cero cero tres seis +El hospital se encuentra en 7890 Lincoln Dr., Miami, FL 33133~El hospital se encuentra en siete mil ochocientos noventa Lincoln Drive, Miami, Florida tres tres uno tres tres +Mándame la carta a P.O. Box 456, Austin, TX 78701, por favor~Mándame la carta a P.O. Box cuatrocientos cincuenta y seis, Austin, Texas siete ocho siete cero uno, por favor +La escuela de mis hijos está en 321 Elm St., Boston, MA 02108~La escuela de mis hijos está en trescientos veintiuno Elm Street, Boston, Massachusetts cero dos uno cero ocho +Nos mudamos a 9876 Pine Rd., Unit 12, Seattle, WA 98101 el mes pasado~Nos mudamos a nueve mil ochocientos setenta y seis Pine Road, Unit doce, Seattle, Washington nueve ocho uno cero uno el mes pasado \ No newline at end of file diff --git a/tests/nemo_text_processing/es/test_measure.py b/tests/nemo_text_processing/es/test_measure.py index 572c88d03..6bb048bce 100644 --- a/tests/nemo_text_processing/es/test_measure.py +++ b/tests/nemo_text_processing/es/test_measure.py @@ -58,7 +58,7 @@ def test_denorm(self, test_input, expected): @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_norm(self, test_input, expected): - pred = self.normalizer.normalize(test_input, verbose=False) + pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True) assert pred in expected if self.normalizer_with_audio: From 2c5bbb3c28ea1dcfcbbd7ea3f6ff419465cc021b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 May 2026 03:44:49 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/es/taggers/address.py | 10 +++------- .../text_normalization/es/taggers/measure.py | 2 +- .../text_normalization/es/verbalizers/measure.py | 14 +++----------- 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/nemo_text_processing/text_normalization/es/taggers/address.py b/nemo_text_processing/text_normalization/es/taggers/address.py index 82c161c7a..9cda2bb09 100644 --- a/nemo_text_processing/text_normalization/es/taggers/address.py +++ b/nemo_text_processing/text_normalization/es/taggers/address.py @@ -64,7 +64,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): graph_apt_designator = pynini.string_file(get_abs_path("data/address/apt_designator.tsv")) graph_unit_designator = pynini.string_file(get_abs_path("data/address/unit_designator.tsv")) graph_po_box = pynini.string_file(get_abs_path("data/address/po_box.tsv")) - + en_cardinal = EnCardinalFst(deterministic=deterministic) g = cardinal.graph @@ -128,12 +128,8 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ) tail = pynini.closure(city + state + zip_code, 0, 1).optimize() - suite_num = normalize_spanish_cardinal_for_us_address_street( - (pynini.closure(NEMO_DIGIT, 1, 4) @ g).optimize() - ) - unit_num = normalize_spanish_cardinal_for_us_address_street( - (pynini.closure(NEMO_DIGIT, 1, 3) @ g).optimize() - ) + suite_num = normalize_spanish_cardinal_for_us_address_street((pynini.closure(NEMO_DIGIT, 1, 4) @ g).optimize()) + unit_num = normalize_spanish_cardinal_for_us_address_street((pynini.closure(NEMO_DIGIT, 1, 3) @ g).optimize()) comma_sp = pynini.accep(",") + pynini.accep(NEMO_SPACE) suite = graph_suite_designator + pynini.closure(NEMO_SPACE, 0, 1) + suite_num diff --git a/nemo_text_processing/text_normalization/es/taggers/measure.py b/nemo_text_processing/text_normalization/es/taggers/measure.py index 4b20cb966..bc1e0c0c8 100644 --- a/nemo_text_processing/text_normalization/es/taggers/measure.py +++ b/nemo_text_processing/text_normalization/es/taggers/measure.py @@ -14,7 +14,6 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.es.taggers.address import AddressUSSurfaceFst from nemo_text_processing.text_normalization.en.graph_utils import ( NEMO_ALPHA, NEMO_NON_BREAKING_SPACE, @@ -26,6 +25,7 @@ insert_space, ) from nemo_text_processing.text_normalization.es.graph_utils import strip_cardinal_apocope +from nemo_text_processing.text_normalization.es.taggers.address import AddressUSSurfaceFst from nemo_text_processing.text_normalization.es.utils import get_abs_path unit = pynini.string_file(get_abs_path("data/measures/measurements.tsv")) diff --git a/nemo_text_processing/text_normalization/es/verbalizers/measure.py b/nemo_text_processing/text_normalization/es/verbalizers/measure.py index 511868a42..cb15e7a12 100644 --- a/nemo_text_processing/text_normalization/es/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/es/verbalizers/measure.py @@ -68,10 +68,7 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, de unit_masc |= "por" + pynini.closure(NEMO_NOT_QUOTE, 1) unit_masc = ( pynutil.delete("units: \"") - + ( - pynini.difference(pynini.closure(NEMO_NOT_QUOTE, 1), pynini.union("math", "address_us_es")) - @ unit_masc - ) + + (pynini.difference(pynini.closure(NEMO_NOT_QUOTE, 1), pynini.union("math", "address_us_es")) @ unit_masc) + pynutil.delete("\"") ) @@ -80,10 +77,7 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, de ) unit_fem = ( pynutil.delete("units: \"") - + ( - pynini.difference(pynini.closure(NEMO_NOT_QUOTE, 1), pynini.union("math", "address_us_es")) - @ unit_fem - ) + + (pynini.difference(pynini.closure(NEMO_NOT_QUOTE, 1), pynini.union("math", "address_us_es")) @ unit_fem) + pynutil.delete("\"") ) @@ -125,9 +119,7 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, de pynutil.delete("units: \"math\"") + delete_space + graph_cardinal_masc + delete_space, -1 ) - preserve_order_tail = ( - pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space - ) + preserve_order_tail = pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space address_us_es = ( pynutil.delete('units: "address_us_es" ') + delete_space From 6b25f509cd55740d0b5915345d34540f8b69a517 Mon Sep 17 00:00:00 2001 From: Mai Anh Date: Wed, 20 May 2026 09:34:10 +0700 Subject: [PATCH 3/3] modify with review Signed-off-by: Mai Anh --- .../es/data/address/apt_designator.tsv | 9 ++-- .../es/data/address/zip_digit.tsv | 10 ---- .../text_normalization/es/graph_utils.py | 1 + .../text_normalization/es/taggers/address.py | 25 ++++------ .../test_cases_address.txt | 10 ++++ .../test_cases_measure.txt | 10 ---- tests/nemo_text_processing/es/test_address.py | 47 +++++++++++++++++++ .../es/test_sparrowhawk_normalization.sh | 5 ++ 8 files changed, 79 insertions(+), 38 deletions(-) delete mode 100644 nemo_text_processing/text_normalization/es/data/address/zip_digit.tsv create mode 100644 tests/nemo_text_processing/es/data_text_normalization/test_cases_address.txt create mode 100644 tests/nemo_text_processing/es/test_address.py diff --git a/nemo_text_processing/text_normalization/es/data/address/apt_designator.tsv b/nemo_text_processing/text_normalization/es/data/address/apt_designator.tsv index 13d31e807..8275f42d3 100644 --- a/nemo_text_processing/text_normalization/es/data/address/apt_designator.tsv +++ b/nemo_text_processing/text_normalization/es/data/address/apt_designator.tsv @@ -1,3 +1,6 @@ -Apt. Apartamento -Apt. Apartamento -Apt Apartamento +Apt. Apartamento +Apt. Apartamento +Dept. Departamento +Dept Departamento +Depto. Departamento +Depto Departamento diff --git a/nemo_text_processing/text_normalization/es/data/address/zip_digit.tsv b/nemo_text_processing/text_normalization/es/data/address/zip_digit.tsv deleted file mode 100644 index a33a4ba5b..000000000 --- a/nemo_text_processing/text_normalization/es/data/address/zip_digit.tsv +++ /dev/null @@ -1,10 +0,0 @@ -0 cero -1 uno -2 dos -3 tres -4 cuatro -5 cinco -6 seis -7 siete -8 ocho -9 nueve diff --git a/nemo_text_processing/text_normalization/es/graph_utils.py b/nemo_text_processing/text_normalization/es/graph_utils.py index 89f6a9cfb..2c539fe09 100644 --- a/nemo_text_processing/text_normalization/es/graph_utils.py +++ b/nemo_text_processing/text_normalization/es/graph_utils.py @@ -144,6 +144,7 @@ def normalize_spanish_cardinal_for_us_address_street(fst: "pynini.FstLike") -> " out = out @ pynini.cdrewrite(pynini.cross("veintiún", "veintiuno"), "", "", NEMO_SIGMA) out = out @ pynini.cdrewrite(pynini.cross("treintún", "treinta y uno"), "", "", NEMO_SIGMA) out = out @ pynini.cdrewrite(pynini.cross(" y ún", " y uno"), "", "", NEMO_SIGMA) + out = out @ pynini.cdrewrite(pynini.cross(" y un", " y uno"), "", "", NEMO_SIGMA) return strip_cardinal_apocope(out) diff --git a/nemo_text_processing/text_normalization/es/taggers/address.py b/nemo_text_processing/text_normalization/es/taggers/address.py index 9cda2bb09..ab793b855 100644 --- a/nemo_text_processing/text_normalization/es/taggers/address.py +++ b/nemo_text_processing/text_normalization/es/taggers/address.py @@ -59,7 +59,11 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): super().__init__(name="address_us_es_surface", kind="classify", deterministic=deterministic) graph_direction = pynini.string_file(get_abs_path("data/address/direction.tsv")) - graph_zip_digit = pynini.string_file(get_abs_path("data/address/zip_digit.tsv")) + graph_zip_digit = pynini.invert( + pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + | pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + ).optimize() + graph_zip_digit @= pynini.cdrewrite(pynini.cross("un", "uno"), "", "", NEMO_SIGMA) graph_suite_designator = pynini.string_file(get_abs_path("data/address/suite_designator.tsv")) graph_apt_designator = pynini.string_file(get_abs_path("data/address/apt_designator.tsv")) graph_unit_designator = pynini.string_file(get_abs_path("data/address/unit_designator.tsv")) @@ -98,17 +102,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): + address_words ) - zip_five = ( - graph_zip_digit - + insert_space - + graph_zip_digit - + insert_space - + graph_zip_digit - + insert_space - + graph_zip_digit - + insert_space - + graph_zip_digit - ).optimize() + zip_five = (pynini.closure(graph_zip_digit + insert_space, 4) + graph_zip_digit).optimize() city = pynini.closure(NEMO_ALPHA | pynini.accep(NEMO_SPACE), 1) city = pynini.closure(pynini.accep(",") + pynini.accep(NEMO_SPACE) + city, 0, 1) @@ -120,9 +114,8 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): pynini.accep(",") + pynini.accep(NEMO_SPACE) + pynini.invert(pynini.string_map(states)), 0, 1 ) - zip_code = pynini.compose(NEMO_DIGIT**5, zip_five) zip_code = pynini.closure( - pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) + zip_code, + pynini.closure(pynini.accep(","), 0, 1) + pynini.accep(NEMO_SPACE) + zip_five, 0, 1, ) @@ -130,10 +123,12 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): suite_num = normalize_spanish_cardinal_for_us_address_street((pynini.closure(NEMO_DIGIT, 1, 4) @ g).optimize()) unit_num = normalize_spanish_cardinal_for_us_address_street((pynini.closure(NEMO_DIGIT, 1, 3) @ g).optimize()) + apt_char = graph_zip_digit | NEMO_UPPER + apt_num = (apt_char + pynini.closure(insert_space + apt_char, 0, 3)).optimize() comma_sp = pynini.accep(",") + pynini.accep(NEMO_SPACE) suite = graph_suite_designator + pynini.closure(NEMO_SPACE, 0, 1) + suite_num - apt = graph_apt_designator + pynini.closure(NEMO_DIGIT | NEMO_UPPER, 1, 4) + apt = graph_apt_designator + pynini.closure(NEMO_SPACE, 0, 1) + apt_num unit = graph_unit_designator + unit_num middle = pynini.closure(comma_sp + (suite | apt | unit), 0, 3).optimize() diff --git a/tests/nemo_text_processing/es/data_text_normalization/test_cases_address.txt b/tests/nemo_text_processing/es/data_text_normalization/test_cases_address.txt new file mode 100644 index 000000000..c0badfb20 --- /dev/null +++ b/tests/nemo_text_processing/es/data_text_normalization/test_cases_address.txt @@ -0,0 +1,10 @@ +Mi dirección es 1234 Maple St., Springfield, IL 62704~Mi dirección es mil doscientos treinta y cuatro Maple Street, Springfield, Illinois seis dos siete cero cuatro +La oficina está ubicada en 567 Main St., Ste. 200, Dallas, TX 75201~La oficina está ubicada en quinientos sesenta y siete Main Street, Suite doscientos, Dallas, Texas siete cinco dos cero uno +Por favor envía el paquete a 890 Oak Ave., Apt. 5B, Brooklyn, NY 11201~Por favor envía el paquete a ochocientos noventa Oak Avenue, Apartamento cinco B, Brooklyn, New York uno uno dos cero uno +Vivo en 4321 Sunset Blvd., Los Angeles, CA 90028, cerca del centro~Vivo en cuatro mil trescientos veintiuno Sunset Boulevard, Los Angeles, California nueve cero cero dos ocho, cerca del centro +La nueva tienda abrirá en 100 Market Rd., San Francisco, CA 94105~La nueva tienda abrirá en cien Market Road, San Francisco, California nueve cuatro uno cero cinco +Su casa queda en 25 W 42nd St., New York, NY 10036~Su casa queda en veinticinco West forty second Street, New York, New York uno cero cero tres seis +El hospital se encuentra en 7890 Lincoln Dr., Miami, FL 33133~El hospital se encuentra en siete mil ochocientos noventa Lincoln Drive, Miami, Florida tres tres uno tres tres +Mándame la carta a P.O. Box 456, Austin, TX 78701, por favor~Mándame la carta a P.O. Box cuatrocientos cincuenta y seis, Austin, Texas siete ocho siete cero uno, por favor +La escuela de mis hijos está en 321 Elm St., Boston, MA 02108~La escuela de mis hijos está en trescientos veintiuno Elm Street, Boston, Massachusetts cero dos uno cero ocho +Nos mudamos a 9876 Pine Rd., Unit 12, Seattle, WA 98101 el mes pasado~Nos mudamos a nueve mil ochocientos setenta y seis Pine Road, Unit doce, Seattle, Washington nueve ocho uno cero uno el mes pasado diff --git a/tests/nemo_text_processing/es/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/es/data_text_normalization/test_cases_measure.txt index 42be1bf7d..2f8d33fb4 100644 --- a/tests/nemo_text_processing/es/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/es/data_text_normalization/test_cases_measure.txt @@ -30,13 +30,3 @@ a-5~a cinco 40 ° k~cuarenta grados kelvin 180 psi~ciento ochenta p s i 2 + 2 - 1 = 3~dos más dos menos uno es igual a tres -Mi dirección es 1234 Maple St., Springfield, IL 62704~Mi dirección es mil doscientos treinta y cuatro Maple Street, Springfield, Illinois seis dos siete cero cuatro -La oficina está ubicada en 567 Main St., Ste. 200, Dallas, TX 75201~La oficina está ubicada en quinientos sesenta y siete Main Street, Suite doscientos, Dallas, Texas siete cinco dos cero uno -Por favor envía el paquete a 890 Oak Ave., Apt. 5B, Brooklyn, NY 11201~Por favor envía el paquete a ochocientos noventa Oak Avenue, Apartamento 5B, Brooklyn, New York uno uno dos cero uno -Vivo en 4321 Sunset Blvd., Los Angeles, CA 90028, cerca del centro~Vivo en cuatro mil trescientos veintiuno Sunset Boulevard, Los Angeles, California nueve cero cero dos ocho, cerca del centro -La nueva tienda abrirá en 100 Market Rd., San Francisco, CA 94105~La nueva tienda abrirá en cien Market Road, San Francisco, California nueve cuatro uno cero cinco -Su casa queda en 25 W 42nd St., New York, NY 10036~Su casa queda en veinticinco West forty second Street, New York, New York uno cero cero tres seis -El hospital se encuentra en 7890 Lincoln Dr., Miami, FL 33133~El hospital se encuentra en siete mil ochocientos noventa Lincoln Drive, Miami, Florida tres tres uno tres tres -Mándame la carta a P.O. Box 456, Austin, TX 78701, por favor~Mándame la carta a P.O. Box cuatrocientos cincuenta y seis, Austin, Texas siete ocho siete cero uno, por favor -La escuela de mis hijos está en 321 Elm St., Boston, MA 02108~La escuela de mis hijos está en trescientos veintiuno Elm Street, Boston, Massachusetts cero dos uno cero ocho -Nos mudamos a 9876 Pine Rd., Unit 12, Seattle, WA 98101 el mes pasado~Nos mudamos a nueve mil ochocientos setenta y seis Pine Road, Unit doce, Seattle, Washington nueve ocho uno cero uno el mes pasado \ No newline at end of file diff --git a/tests/nemo_text_processing/es/test_address.py b/tests/nemo_text_processing/es/test_address.py new file mode 100644 index 000000000..71d3d5097 --- /dev/null +++ b/tests/nemo_text_processing/es/test_address.py @@ -0,0 +1,47 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio + +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file + + +class TestAddress: + normalizer_es = Normalizer(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + + normalizer_with_audio_es = ( + NormalizerWithAudio(input_case='cased', lang='es', cache_dir=CACHE_DIR, overwrite_cache=False) + if RUN_AUDIO_BASED_TESTS + else None + ) + + # Spanish US-address code-switching is tagged by the measure class. + @parameterized.expand(parse_test_case_file('es/data_text_normalization/test_cases_address.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer_es.normalize(test_input, verbose=False, punct_post_process=True) + assert pred == expected + + if self.normalizer_with_audio_es: + pred_non_deterministic = self.normalizer_with_audio_es.normalize( + test_input, + n_tagged=500, + punct_post_process=False, + ) + assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/es/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/es/test_sparrowhawk_normalization.sh index 102d7e04f..66549515c 100644 --- a/tests/nemo_text_processing/es/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/es/test_sparrowhawk_normalization.sh @@ -77,6 +77,11 @@ testTNMeasure() { runtest $input } +testTNAddress() { + input=$PROJECT_DIR/es/data_text_normalization/test_cases_address.txt + runtest $input +} + testTNWhitelist() { input=$PROJECT_DIR/es/data_text_normalization/test_cases_whitelist.txt runtest $input