From 4df1f4c2b807d21515f4f19d3524a53cf504112c Mon Sep 17 00:00:00 2001 From: rocky Date: Tue, 17 Feb 2026 15:42:58 -0500 Subject: [PATCH 01/26] Start boxing character tables --- mathics_scanner/data/.gitignore | 1 + mathics_scanner/data/boxing-characters.yml | 95 ++++++++++++++++ mathics_scanner/generate/build_box_tables.py | 107 +++++++++++++++++++ mathics_scanner/generate/build_tables.py | 2 + 4 files changed, 205 insertions(+) create mode 100644 mathics_scanner/data/boxing-characters.yml create mode 100644 mathics_scanner/generate/build_box_tables.py diff --git a/mathics_scanner/data/.gitignore b/mathics_scanner/data/.gitignore index 183700b..c433af0 100644 --- a/mathics_scanner/data/.gitignore +++ b/mathics_scanner/data/.gitignore @@ -1 +1,2 @@ /.python-version +/box-character-tables.json diff --git a/mathics_scanner/data/boxing-characters.yml b/mathics_scanner/data/boxing-characters.yml new file mode 100644 index 0000000..1a19cf5 --- /dev/null +++ b/mathics_scanner/data/boxing-characters.yml @@ -0,0 +1,95 @@ +# Information about Wolfram Language boxing characters used in the +# string representation of Boxing Expressions. +# +# +# All of the key names *except* \! and \* are associated with +# some box operator. +# +# +# Fields +# ====== +# +# +# ASCII (string) +# -------------- +# +# The character representation in ASCII. +# +# Operators +# ----------- +# +# When the string is part of a Boxing operator, the Boxing +# operator name(s) are given. +# +# Unicode +# ------- +# +# The representation in Unicode. All Unicode characters fall into the +# Private Use Area of Unicode that Wolfram uses for its own internal +# system markers, specifically the range 0xf7c0 to 0xf7cd. +# + +# Coding note: below we use single quotes, not double quotes so that we do +# not have to escape backslash characters. 
+ +LinearSyntaxAmp: + ASCII: '\&' + Operators: [] + Unicode: '\uf7c7' + +LinearSyntaxAt: + ASCII: '\@' + Operators: [RadicalBox, SqrtBox] + Unicode: '\uf7c1' + +LinearSyntaxBacktick: + ASCII: '\`' + Operators: [FormBox] + Unicode: '\uf7cd' + +LinearSyntaxBang: + ASCII: '\!' + Operators: + Unicode: '\uf7c1' + +LinearSyntaxCaret: + ASCII: '\^' + Operators: SuperscriptBox + Unicode: '\uf7c6' + +# Note: this name does not appear in CodeParse +LinearSyntaxCloseParen: + ASCII: '\)' + Operators: [RowBox] + Unicode: '\uf7c0' + +# Note: this name does not appear in CodeParse +LinearSyntaxOpenParen: + ASCII: '\(' + Operators: [RowBox] + Unicode: '\uf7cd' + +LinearSyntaxPercent: + ASCII: '\%' + Operators: [RadicalBox, SuperscriptBox, UnderOverscriptBox] + Unicode: '\uf7c5' + +LinearSyntaxPlus: + ASCII: '\+' + Operators: [UnderscriptBox, UnderOverscriptBox] + Unicode: '\uf7cb' + +LinearSyntaxStar: + ASCII: '\*' + Operators: [] + Unicode: '\uf7c8' + +LinearSyntaxSlash: + ASCII: '\/' + Operators: [FractionBox] + Unicode: '\uf7cc' + +LinearSyntaxUnder: + ASCII: '\_' + Operators: [SubscriptBox, SubsuperscriptBox] + Unicode: '\uf7ca' diff --git a/mathics_scanner/generate/build_box_tables.py b/mathics_scanner/generate/build_box_tables.py new file mode 100644 index 0000000..f8533a7 --- /dev/null +++ b/mathics_scanner/generate/build_box_tables.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python +# This scripts reads the data from named-characters and converts it to the +# format used by the library internally + +import json +import os.path as osp +import sys +from pathlib import Path + +import click +import yaml + +try: + from mathics_scanner.version import __version__ +except ImportError: + # When using build isolation + __version__ = "unknown" + + +def get_srcdir() -> str: + filename = osp.normcase(osp.dirname(osp.abspath(__file__))) + return osp.realpath(filename) + + +def read(*rnames) -> str: + return open(osp.join(get_srcdir(), *rnames)).read() + + +def compile_tables(data: dict) -> 
dict: + """ + Compiles the general table into the tables used internally by the library. + This facilitates fast access of this information by clients needing this + information. + """ + + # Multiple entries in the YAML table are redundant in the following sense: + # when a character has a plain-text equivalent but the plain-text + # equivalent is equal to it's WL unicode representation (i.e. the + # "wl-unicode" field is the same as the "unicode-equivalent" field) then it + # is considered rendundant for us, since no conversion is needed. + # + # As an optimization, we explicit remove any redundant characters from all + # JSON tables. This makes the tables smaller (therefore easier to load), as + # well as the correspond regex patterns. This implies that not all + # characters that have a unicode equivalent are included in `wl_to_ascii` + # or `wl_to_unicode_dict`. Furthermore, this implies that not all + # characters that have a unicode inverse are included in + # `unicode_to_wl_dict` + + # WL to AMS LaTeX (math mode) characters + ascii_to_unicode = {v["ASCII"]: v["Unicode"] for v in data.values()} + + unicode_to_ascii = {v["Unicode"]: v["ASCII"] for v in data.values()} + + return { + "ascii-to-unicode": ascii_to_unicode, + "unicode-to-ascii": unicode_to_ascii, + } + + +DEFAULT_DATA_DIR = Path(osp.normpath(osp.dirname(__file__)), "..", "data") + +ALL_FIELDS = [ + "unicode-to-ascii", + "ascii-to-unicode", +] + + +@click.command() +@click.version_option(version=__version__) # NOQA +@click.option( + "--field", + "-f", + multiple=True, + required=False, + help="Select which fields to include in JSON.", + show_default=True, + type=click.Choice(ALL_FIELDS), + default=ALL_FIELDS, +) +@click.option( + "--output", + "-o", + show_default=True, + type=click.Path(writable=True), + default=DEFAULT_DATA_DIR / "box-character-tables.json", +) +@click.argument( + "data_dir", type=click.Path(readable=True), default=DEFAULT_DATA_DIR, required=False +) +def main(field, output, 
data_dir): + with ( + open(data_dir / "boxing-characters.yml", "r", encoding="utf8") as i, + open(output, "w") as o, + ): + # Load the YAML data. + data = yaml.load(i, Loader=yaml.FullLoader) + + # Precompile the tables. + data = compile_tables(data) + + # Dump the preprocessed dictionaries to disk as JSON. + json.dump(data, o) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/mathics_scanner/generate/build_tables.py b/mathics_scanner/generate/build_tables.py index 6d1da59..1b2ce10 100755 --- a/mathics_scanner/generate/build_tables.py +++ b/mathics_scanner/generate/build_tables.py @@ -249,6 +249,7 @@ def compile_tables(data: dict) -> dict: "ascii-operator-to-character-symbol": ascii_operator_to_character_symbol, "ascii-operator-to-unicode": ascii_operator_to_unicode, "ascii-operator-to-wl-unicode": ascii_operator_to_wl_unicode, + "box-characters": box_characters, "builtin-constants": builtin_constants, "latex-named-characters": latex_named_characters, "letterlikes": letterlikes, @@ -281,6 +282,7 @@ def compile_tables(data: dict) -> dict: "ascii-operator-to-symbol", "ascii-operator-to-unicode", "ascii-operator-to-wl-unicode", + "box-characters", # "builtin-constants", # not used yet "latex-named-characters", "letterlikes", From f15581ddb55119cc9aac5872326efc7e548f06fc Mon Sep 17 00:00:00 2001 From: rocky Date: Tue, 17 Feb 2026 18:16:44 -0500 Subject: [PATCH 02/26] Lots of build code cleanup * Rename/regularize JSON and YAML naming * DRY tests * remove egg stuff from pyproject.toml --- Makefile | 21 ++++++------ admin-tools/make-JSON-tables.sh | 8 ++--- mathics_scanner/characters.py | 2 +- ...ild_box_tables.py => boxing_characters.py} | 2 +- .../{build_tables.py => named_characters.py} | 4 +-- ...{build_operator_tables.py => operators.py} | 0 mathics_scanner/load.py | 8 ++--- pyproject.toml | 9 +++-- setup.py | 33 ++++++++----------- test/helper.py | 7 ++++ test/test_ascii.py | 8 +---- test/test_character_table_consistency.py | 9 ++--- 
test/test_general_yaml_sanity.py | 5 +-- test/test_has_unicode_inverse_sanity.py | 8 ++--- test/test_letterlikes_sanity.py | 8 +---- test/test_unicode.py | 8 +---- test/test_urls.py | 7 ++-- test/test_wl_to_ascii.py | 5 ++- 18 files changed, 63 insertions(+), 89 deletions(-) rename mathics_scanner/generate/{build_box_tables.py => boxing_characters.py} (98%) rename mathics_scanner/generate/{build_tables.py => named_characters.py} (99%) rename mathics_scanner/generate/{build_operator_tables.py => operators.py} (100%) create mode 100644 test/helper.py diff --git a/Makefile b/Makefile index b374254..57e6fa8 100644 --- a/Makefile +++ b/Makefile @@ -22,20 +22,21 @@ PIP_INSTALL_OPTS ?= #: Default target - same as "develop" all: develop -mathics_scanner/data/character-tables.json: mathics_scanner/data/named-characters.yml - $(PIP) install -r requirements-dev.txt - $(PYTHON) mathics_scanner/generate/build_tables.py +mathics_scanner/data/boxing-characters.json: mathics_scanner/data/boxing-characters.yml + $(PYTHON) mathics_scanner/generate/boxing_characters.py + +mathics_scanner/data/named-characters.json: mathics_scanner/data/named-characters.yml + $(PYTHON) mathics_scanner/generate/named_characters.py mathics_scanner/data/operators.json: mathics_scanner/data/operators.yml - $(PIP) install -r requirements-dev.txt - $(PYTHON) mathics_scanner/generate/build_operator_tables.py + $(PYTHON) mathics_scanner/generate/operators.py #: build everything needed to install -build: mathics_scanner/data/characters.json mathics_scanner/data/operators.json +build: mathics_scanner/data/characters.json mathics_scanner/data/named_characters.json mathics_scanner/data/operators.json $(PYTHON) ./setup.py build #: Set up to run from the source tree -develop: mathics_scanner/data/character-tables.json mathics_scanner/data/operators.json +develop: mathics_scanner/data/boxing-characters.json mathics_scanner/data/named-characters.json mathics_scanner/data/operators.json $(PIP) install -e 
.$(PIP_INSTALL_OPTS) #: Build distribution @@ -56,16 +57,16 @@ check: pytest test: check #: Build Sphinx HTML documentation -doc: mathics_scanner/data/character-tables.json +doc: mathics_scanner/data/named-characters.json make -C docs html #: Remove derived files clean: @find . -name *.pyc -type f -delete; \ - $(RM) -f mathics_scanner/data/character-tables.json mathics_scanner/data/operators.json || true + $(RM) -f mathics_scanner/data/*.json || true #: Run py.test tests. Use environment variable "o" for pytest options -pytest: mathics_scanner/data/character-tables.json +pytest: mathics_scanner/data/named-characters.json $(PYTHON) -m pytest test $o #: Print to stdout a GNU Readline inputrc without Unicode diff --git a/admin-tools/make-JSON-tables.sh b/admin-tools/make-JSON-tables.sh index b020579..9307397 100755 --- a/admin-tools/make-JSON-tables.sh +++ b/admin-tools/make-JSON-tables.sh @@ -1,10 +1,10 @@ #!/bin/bash -# Create a complete set of tables. -# This just runs build_tables.py in this distribution +# Create a complete set of JSON tables. 
bs=${BASH_SOURCE[0]} mydir=$(dirname $bs) PYTHON=${PYTHON:-python} cd $mydir/../mathics_scanner/data -$PYTHON ../generate/build_tables.py -o character-tables.json -$PYTHON ../generate/build_operator_tables.py -o operators.json +$PYTHON ../generate/boxing_characters.py -o boxing-characters.json +$PYTHON ../generate/named_characters.py -o named-characters.json +$PYTHON ../generate/operators.py -o operators.json diff --git a/mathics_scanner/characters.py b/mathics_scanner/characters.py index 057dc9c..64a5556 100644 --- a/mathics_scanner/characters.py +++ b/mathics_scanner/characters.py @@ -23,7 +23,7 @@ def get_srcdir() -> str: ROOT_DIR = get_srcdir() # Load the conversion tables from disk -characters_path = osp.join(ROOT_DIR, "data", "character-tables.json") +characters_path = osp.join(ROOT_DIR, "data", "named-characters.json") if osp.exists(characters_path): with open(characters_path, "r") as f: _data = ujson.load(f) diff --git a/mathics_scanner/generate/build_box_tables.py b/mathics_scanner/generate/boxing_characters.py similarity index 98% rename from mathics_scanner/generate/build_box_tables.py rename to mathics_scanner/generate/boxing_characters.py index f8533a7..935fdf8 100644 --- a/mathics_scanner/generate/build_box_tables.py +++ b/mathics_scanner/generate/boxing_characters.py @@ -83,7 +83,7 @@ def compile_tables(data: dict) -> dict: "-o", show_default=True, type=click.Path(writable=True), - default=DEFAULT_DATA_DIR / "box-character-tables.json", + default=DEFAULT_DATA_DIR / "boxing-characters.json", ) @click.argument( "data_dir", type=click.Path(readable=True), default=DEFAULT_DATA_DIR, required=False diff --git a/mathics_scanner/generate/build_tables.py b/mathics_scanner/generate/named_characters.py similarity index 99% rename from mathics_scanner/generate/build_tables.py rename to mathics_scanner/generate/named_characters.py index 1b2ce10..65e4f5e 100755 --- a/mathics_scanner/generate/build_tables.py +++ b/mathics_scanner/generate/named_characters.py @@ 
-249,7 +249,6 @@ def compile_tables(data: dict) -> dict: "ascii-operator-to-character-symbol": ascii_operator_to_character_symbol, "ascii-operator-to-unicode": ascii_operator_to_unicode, "ascii-operator-to-wl-unicode": ascii_operator_to_wl_unicode, - "box-characters": box_characters, "builtin-constants": builtin_constants, "latex-named-characters": latex_named_characters, "letterlikes": letterlikes, @@ -282,7 +281,6 @@ def compile_tables(data: dict) -> dict: "ascii-operator-to-symbol", "ascii-operator-to-unicode", "ascii-operator-to-wl-unicode", - "box-characters", # "builtin-constants", # not used yet "latex-named-characters", "letterlikes", @@ -323,7 +321,7 @@ def compile_tables(data: dict) -> dict: "-o", show_default=True, type=click.Path(writable=True), - default=DEFAULT_DATA_DIR / "character-tables.json", + default=DEFAULT_DATA_DIR / "named-characters.json", ) @click.argument( "data_dir", type=click.Path(readable=True), default=DEFAULT_DATA_DIR, required=False diff --git a/mathics_scanner/generate/build_operator_tables.py b/mathics_scanner/generate/operators.py similarity index 100% rename from mathics_scanner/generate/build_operator_tables.py rename to mathics_scanner/generate/operators.py diff --git a/mathics_scanner/load.py b/mathics_scanner/load.py index 66ef912..1826e79 100644 --- a/mathics_scanner/load.py +++ b/mathics_scanner/load.py @@ -4,16 +4,16 @@ import yaml -from mathics_scanner.generate.build_tables import DEFAULT_DATA_DIR +from mathics_scanner.generate.named_characters import DEFAULT_DATA_DIR -def load_mathics_character_yaml(): +def load_mathics3_named_characters_yaml(): with open(DEFAULT_DATA_DIR / "named-characters.yml", "r") as yaml_file: yaml_data = yaml.load(yaml_file, Loader=yaml.FullLoader) return yaml_data -def load_mathics_character_json(): - with open(DEFAULT_DATA_DIR / "character-tables.json", "r") as json_file: +def load_mathics3_named_characters_json(): + with open(DEFAULT_DATA_DIR / "named-characters.json", "r") as json_file: 
json_data = json.load(json_file) return json_data diff --git a/pyproject.toml b/pyproject.toml index f78e461..35b58e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,8 +51,9 @@ full = [ ] [project.scripts] -mathics3-generate-json-table = "mathics_scanner.generate.build_tables:main" -mathics3-generate-operator-json-table = "mathics_scanner.generate.build_operator_tables:main" +mathics3-make-boxing-character-json = "mathics_scanner.generate.box_characters:main" +mathics3-make-named-character-json = "mathics_scanner.generate.named_characters:main" +mathics3-make-operator-json = "mathics_scanner.generate.operators:main" mathics3-tokens = "mathics_scanner.mathics3_tokens:main" [tool.setuptools] @@ -64,11 +65,13 @@ packages = [ [tool.setuptools.package-data] "mathics_scanner" = [ + "data/boxing-characters.json", + "data/boxing-characters.yml", + "data/named-characters.json", "data/named-characters.yml", "data/operators.yml", "data/operators.json", "data/*.csv", - "data/character-tables.json", # List this explicitly since it is needed "data/*.json", "data/ExampleData/*", ] diff --git a/setup.py b/setup.py index 3b48ace..82c59b7 100644 --- a/setup.py +++ b/setup.py @@ -25,39 +25,34 @@ mathics-users@googlegroups.com and ask for help. 
""" +import os import os.path as osp -import subprocess -import sys from setuptools import setup -from setuptools.command.egg_info import egg_info +from setuptools.command.build_py import build_py as setuptools_build_py def get_srcdir(): - """Return the directory of the location if this code""" + """return the directory of the location if this code""" filename = osp.normcase(osp.dirname(osp.abspath(__file__))) return osp.realpath(filename) -class table_building_egg_info(egg_info): - """This runs as part of building an sdist""" +class build_py(setuptools_build_py): + def run(self): + for table_type in ("boxing-character", "named-character", "operator"): + json_data_file = osp.join("data", f"{table_type}.json") + json_path = osp.join("mathics-scanner", json_data_file) + if not osp.exists(json_path): + os.system(f"mathics3-make-{table_type}-json" " -o {json-path}") + self.distribution.package_data["Mathics-Scanner"].append(json_data_file) + setuptools_build_py.run(self) - def finalize_options(self): - """Run program to create JSON tables""" - build_tables_program = osp.join( - get_srcdir(), "mathics_scanner", "generate", "build_tables.py" - ) - print(f"Building JSON tables via {build_tables_program}") - result = subprocess.run([sys.executable, build_tables_program], check=False) - if result.returncode: - raise RuntimeError( - f"Running {build_tables_program} exited with code {result.returncode}" - ) - super().finalize_options() +CMDCLASS = {"build_py": build_py} setup( - cmdclass={"egg_info": table_building_egg_info}, + cmdclass=CMDCLASS, # don't pack Mathics in egg because of media files, etc. 
zip_safe=False, ) diff --git a/test/helper.py b/test/helper.py new file mode 100644 index 0000000..307ab5f --- /dev/null +++ b/test/helper.py @@ -0,0 +1,7 @@ +from mathics_scanner.load import ( + load_mathics3_named_characters_json, + load_mathics3_named_characters_yaml, +) + +yaml_data = load_mathics3_named_characters_yaml() +json_data = load_mathics3_named_characters_json() diff --git a/test/test_ascii.py b/test/test_ascii.py index 0eb9c5a..99a4023 100644 --- a/test/test_ascii.py +++ b/test/test_ascii.py @@ -1,12 +1,6 @@ # -*- coding: utf-8 -*- -from mathics_scanner.load import ( - load_mathics_character_json, - load_mathics_character_yaml, -) - -yaml_data = load_mathics_character_yaml() -json_data = load_mathics_character_json() +from test.helper import json_data def test_ascii(): diff --git a/test/test_character_table_consistency.py b/test/test_character_table_consistency.py index 35b8b40..c6c652a 100644 --- a/test/test_character_table_consistency.py +++ b/test/test_character_table_consistency.py @@ -1,14 +1,9 @@ # -*- coding: utf-8 -*- +from test.helper import json_data, yaml_data + from mathics_scanner.characters import replace_unicode_with_wl as unicode_to_wl from mathics_scanner.characters import replace_wl_with_plain_text as wl_to_unicode -from mathics_scanner.load import ( - load_mathics_character_json, - load_mathics_character_yaml, -) - -yaml_data = load_mathics_character_yaml() -json_data = load_mathics_character_json() def test_ascii_fields_in_json(): diff --git a/test/test_general_yaml_sanity.py b/test/test_general_yaml_sanity.py index e419877..e7e8940 100644 --- a/test/test_general_yaml_sanity.py +++ b/test/test_general_yaml_sanity.py @@ -2,10 +2,7 @@ import re import unicodedata - -from mathics_scanner.load import load_mathics_character_yaml - -yaml_data = load_mathics_character_yaml() +from test.helper import yaml_data def check_attr_is_invertible(attr: str): diff --git a/test/test_has_unicode_inverse_sanity.py 
b/test/test_has_unicode_inverse_sanity.py index f71a7e2..7949788 100644 --- a/test/test_has_unicode_inverse_sanity.py +++ b/test/test_has_unicode_inverse_sanity.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- from mathics_scanner.load import ( - load_mathics_character_yaml, - load_mathics_character_json, + load_mathics3_named_characters_json, + load_mathics3_named_characters_yaml, ) -yaml_data = load_mathics_character_yaml() -json_data = load_mathics_character_json() +yaml_data = load_mathics3_named_characters_yaml() +json_data = load_mathics3_named_characters_json() def test_has_unicode_inverse_sanity(): diff --git a/test/test_letterlikes_sanity.py b/test/test_letterlikes_sanity.py index b2dc381..01533a2 100644 --- a/test/test_letterlikes_sanity.py +++ b/test/test_letterlikes_sanity.py @@ -1,12 +1,6 @@ # -*- coding: utf-8 -*- -from mathics_scanner.load import ( - load_mathics_character_yaml, - load_mathics_character_json, -) - -yaml_data = load_mathics_character_yaml() -json_data = load_mathics_character_json() +# from test.helper import json_data, yaml_data def test_letterlikes_sanity(): diff --git a/test/test_unicode.py b/test/test_unicode.py index 43fb296..f4b27a7 100644 --- a/test/test_unicode.py +++ b/test/test_unicode.py @@ -1,12 +1,6 @@ # -*- coding: utf-8 -*- -from mathics_scanner.load import ( - load_mathics_character_json, - load_mathics_character_yaml, -) - -yaml_data = load_mathics_character_yaml() -json_data = load_mathics_character_json() +from test.helper import yaml_data def test_has_unicode(): diff --git a/test/test_urls.py b/test/test_urls.py index 6c4e1a1..a592501 100644 --- a/test/test_urls.py +++ b/test/test_urls.py @@ -1,19 +1,16 @@ # -*- coding: utf-8 -*- import os +from test.helper import yaml_data # from urllib.error import HTTPError, URLError from urllib.request import urlopen import pytest -from mathics_scanner.load import load_mathics_character_yaml - -yaml_data = load_mathics_character_yaml() - # This test is slow, so do only on request! 
@pytest.mark.skipif( - not os.environ.get("MATHICS_LINT"), reason="Lint checking done only when specified" + not os.environ.get("MATHICS3_LINT"), reason="Lint checking done only when specified" ) def test_yaml_urls(): for k, v in yaml_data.items(): diff --git a/test/test_wl_to_ascii.py b/test/test_wl_to_ascii.py index b6444d4..f8bf92b 100644 --- a/test/test_wl_to_ascii.py +++ b/test/test_wl_to_ascii.py @@ -1,9 +1,8 @@ # -*- coding: utf-8 -*- -from mathics_scanner.characters import replace_wl_with_plain_text -from mathics_scanner.load import load_mathics_character_yaml +from test.helper import yaml_data -yaml_data = load_mathics_character_yaml() +from mathics_scanner.characters import replace_wl_with_plain_text def wl_to_ascii(wl_input: str) -> str: From 1a5efa38b60a0c2a53125ce10422829c84842078 Mon Sep 17 00:00:00 2001 From: rocky Date: Tue, 17 Feb 2026 19:55:06 -0500 Subject: [PATCH 03/26] Go over CI. Remove unused code. --- .github/workflows/osx.yml | 5 ++++- .github/workflows/ubuntu.yml | 5 ++++- .github/workflows/windows.yml | 7 ++++--- mathics_scanner/generate/boxing_characters.py | 15 +-------------- mathics_scanner/generate/named_characters.py | 15 +-------------- mathics_scanner/generate/operators.py | 18 ++---------------- 6 files changed, 16 insertions(+), 49 deletions(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index c740b23..d2c29f8 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -30,5 +30,8 @@ jobs: run: | pip install -r requirements-dev.txt pip install -r requirements-full.txt - python -m mathics_scanner.generate.build_tables + # I don't think I need this anymore: + # python -m mathics_scanner.generate.boxing_characters + # python -m mathics_scanner.generate.named_characters + # python -m mathics_scanner.generate.operators make check diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 05d8de2..34e873c 100644 --- a/.github/workflows/ubuntu.yml +++ 
b/.github/workflows/ubuntu.yml @@ -29,5 +29,8 @@ jobs: run: | pip install -r requirements-dev.txt pip install -r requirements-full.txt - python -m mathics_scanner.generate.build_tables + # Don't think I need this anymore + # python -m mathics_scanner.generate.boxing_characters + # python -m mathics_scanner.generate.named_characters + # python -m mathics_scanner.generate.operators make check diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 768802b..8f88834 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -28,8 +28,9 @@ jobs: pip install -e . - name: Test Mathics3 run: | - # Ideally we should not have to do this. - python mathics_scanner/generate/build_tables.py - python mathics_scanner/generate/build_operator_tables.py + # I don't think I need this anymore: + # python -m mathics_scanner.generate.boxing_characters + # python -m mathics_scanner.generate.named_characters + # python -m mathics_scanner.generate.operators pip install -e .[dev,full] py.test test diff --git a/mathics_scanner/generate/boxing_characters.py b/mathics_scanner/generate/boxing_characters.py index 935fdf8..bdf7660 100644 --- a/mathics_scanner/generate/boxing_characters.py +++ b/mathics_scanner/generate/boxing_characters.py @@ -10,20 +10,7 @@ import click import yaml -try: - from mathics_scanner.version import __version__ -except ImportError: - # When using build isolation - __version__ = "unknown" - - -def get_srcdir() -> str: - filename = osp.normcase(osp.dirname(osp.abspath(__file__))) - return osp.realpath(filename) - - -def read(*rnames) -> str: - return open(osp.join(get_srcdir(), *rnames)).read() +from mathics_scanner.version import __version__ def compile_tables(data: dict) -> dict: diff --git a/mathics_scanner/generate/named_characters.py b/mathics_scanner/generate/named_characters.py index 65e4f5e..d787f5f 100755 --- a/mathics_scanner/generate/named_characters.py +++ b/mathics_scanner/generate/named_characters.py @@ -11,20 
+11,7 @@ import click import yaml -try: - from mathics_scanner.version import __version__ -except ImportError: - # When using build isolation - __version__ = "unknown" - - -def get_srcdir() -> str: - filename = osp.normcase(osp.dirname(osp.abspath(__file__))) - return osp.realpath(filename) - - -def read(*rnames) -> str: - return open(osp.join(get_srcdir(), *rnames)).read() +from mathics_scanner import __version__ def re_from_keys(d: dict) -> str: diff --git a/mathics_scanner/generate/operators.py b/mathics_scanner/generate/operators.py index ce5fb96..a204875 100755 --- a/mathics_scanner/generate/operators.py +++ b/mathics_scanner/generate/operators.py @@ -12,6 +12,8 @@ import click import yaml +from mathics_scanner.version import __version__ + OPERATOR_FIELDS = [ "actual-precedence", "Precedence", @@ -25,22 +27,6 @@ ] -try: - from mathics_scanner.version import __version__ -except ImportError: - # When using build isolation - __version__ = "unknown" - - -def get_srcdir() -> str: - filename = osp.normcase(osp.dirname(osp.abspath(__file__))) - return osp.realpath(filename) - - -def read(*rnames) -> str: - return open(osp.join(get_srcdir(), *rnames)).read() - - def compile_tables( operator_data: Dict[str, dict], character_data: Dict[str, dict] ) -> Dict[str, dict]: From 3f6acb53de768ee8a5fd23996b0858a919938d48 Mon Sep 17 00:00:00 2001 From: rocky Date: Tue, 17 Feb 2026 20:26:00 -0500 Subject: [PATCH 04/26] mathics3-tokens - add In/Out number history --- mathics_scanner/mathics3_tokens.py | 36 +++++++++++++----------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/mathics_scanner/mathics3_tokens.py b/mathics_scanner/mathics3_tokens.py index d37335d..aea94d4 100644 --- a/mathics_scanner/mathics3_tokens.py +++ b/mathics_scanner/mathics3_tokens.py @@ -99,29 +99,21 @@ def get_last_line_number(self): def get_in_prompt(self): next_line_number = self.get_last_line_number() + 1 - self.lineno = next_line_number - return "{1}{0}[{2}{3}]:= 
{4}".format(self.in_prefix, *self.incolors) + return "{2}{0}[{3}{1}{4}]:= {5}".format( + self.in_prefix, next_line_number, *self.incolors + ) - def get_out_prompt(self, form=None): + def get_out_prompt(self): line_number = self.get_last_line_number() - if form: - return "{2}{0}[{3}{4}]//{1}= {5}".format( - self.out_prefix, line_number, form, *self.outcolors - ) - return "{1}{0}[{2}{3}]= {4}".format( + return "{2}{0}[{3}{1}{4}]= {5}".format( self.out_prefix, line_number, *self.outcolors ) - def to_output(self, text, form=None): + def to_output(self, text): line_number = self.get_last_line_number() newline = "\n" + " " * len("Out[{0}]= ".format(line_number)) - if form: - newline += (len(form) + 2) * " " return newline.join(text.splitlines()) - def out_callback(self, out, fmt=None): - print(self.to_output(str(out), fmt)) - def read_line(self, prompt): if self.using_readline: return self.rl_read_line(prompt) @@ -163,7 +155,7 @@ def interactive_eval_loop(shell: TerminalShell, code_tokenize_format: bool): while True: try: source_text = shell.feed() - tokens(source_text, code_tokenize_format) + tokens(shell, source_text, code_tokenize_format) except NamedCharacterSyntaxError: shell.errmsg( "Syntax", @@ -197,11 +189,11 @@ def interactive_eval_loop(shell: TerminalShell, code_tokenize_format: bool): print("\n\nGoodbye!\n") # raise to pass the error code on, e.g. 
Quit[1] raise - finally: - shell.reset_lineno() + # finally: + # shell.reset_lineno() -def tokens(source_text: str, code_tokenize_format: bool): +def tokens(shell: TerminalShell, source_text: str, code_tokenize_format: bool): tokeniser = Tokeniser( SingleLineFeeder(source_text, "", ContainerKind.STRING) ) @@ -217,9 +209,11 @@ def tokens(source_text: str, code_tokenize_format: bool): if token.tag == "END": break elif code_tokenize_format: - print(token.code_tokenize_format) + shell.to_output(token.code_tokenize_format) + # print(token.code_tokenize_format) else: - print(token) + mess = shell.get_out_prompt() + print(mess + str(token) + "\n") def main(): @@ -297,7 +291,7 @@ def main(): if args.FILE is not None: feeder = FileLineFeeder(args.FILE) - tokenizer_loop(feeder, args.CodeTokenize) + tokenizer_loop(feeder, shell, args.CodeTokenize) else: interactive_eval_loop(shell, args.CodeTokenize) From 8430dc8d919653cda9977e53974eedd3d169e1f4 Mon Sep 17 00:00:00 2001 From: rocky Date: Tue, 17 Feb 2026 20:26:58 -0500 Subject: [PATCH 05/26] More ignore --- mathics_scanner/data/.gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mathics_scanner/data/.gitignore b/mathics_scanner/data/.gitignore index c433af0..2442410 100644 --- a/mathics_scanner/data/.gitignore +++ b/mathics_scanner/data/.gitignore @@ -1,2 +1,4 @@ /.python-version /box-character-tables.json +/boxing-characters.json +/named-characters.json From 0cf50763b65de675efc20b4c5b3b1bcbd910307d Mon Sep 17 00:00:00 2001 From: rocky Date: Wed, 18 Feb 2026 04:55:32 -0500 Subject: [PATCH 06/26] Start translating frum Unicode to ASCII on output --- mathics_scanner/characters.py | 28 ++++++++++++++++++--- mathics_scanner/data/boxing-characters.yml | 29 +++++++++++----------- mathics_scanner/escape_sequences.py | 14 ++++++++--- mathics_scanner/mathics3_tokens.py | 8 ++++-- 4 files changed, 56 insertions(+), 23 deletions(-) diff --git a/mathics_scanner/characters.py b/mathics_scanner/characters.py index 
64a5556..7418a3c 100644 --- a/mathics_scanner/characters.py +++ b/mathics_scanner/characters.py @@ -23,13 +23,35 @@ def get_srcdir() -> str: ROOT_DIR = get_srcdir() # Load the conversion tables from disk -characters_path = osp.join(ROOT_DIR, "data", "named-characters.json") -if osp.exists(characters_path): - with open(characters_path, "r") as f: +named_characters_path = osp.join(ROOT_DIR, "data", "named-characters.json") +if osp.exists(named_characters_path): + with open(named_characters_path, "r") as f: _data = ujson.load(f) else: _data = {} +boxing_characters_path = osp.join(ROOT_DIR, "data", "boxing-characters.json") +if osp.exists(boxing_characters_path): + with open(boxing_characters_path, "r") as f: + boxing_character_data = ujson.load(f) +else: + boxing_characters_data = {} + +boxing_unicode_to_ascii = boxing_character_data.get("unicode-to-ascii", {}) +boxing_ascii_to_unicode = boxing_character_data.get("ascii-to-unicode", {}) + +replace_to_ascii_re = re.compile( + "|".join( + re.escape(unicode_character) + for unicode_character in boxing_unicode_to_ascii.keys() + ) +) + + +def replace_box_unicode_with_ascii(s: str) -> str: + return replace_to_ascii_re.sub(lambda m: s[m.group(0)], s) + + # Character ranges of letters _letters = "a-zA-Z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u0103\u0106\u0107\ \u010c-\u010f\u0112-\u0115\u011a-\u012d\u0131\u0141\u0142\u0147\u0148\ diff --git a/mathics_scanner/data/boxing-characters.yml b/mathics_scanner/data/boxing-characters.yml index 1a19cf5..eb45ea8 100644 --- a/mathics_scanner/data/boxing-characters.yml +++ b/mathics_scanner/data/boxing-characters.yml @@ -29,67 +29,68 @@ # system markers, specifically the range 0xf7c0 to 0xf7cd. # -# Coding note: below we use single quotes, not double quotes so that we do -# not have to escape backslash characters. +# Coding note: make note of quotes. Single quotes for unescaped backslashes, e.g. +# in the ASCII field and double quotes when we do not escaped backlashes in the +# Unicode field. 
LinearSyntaxAmp: ASCII: '\&' Operators: [] - Unicode: '\uf7c7' + Unicode: "\uf7c7" LinearSyntaxAt: ASCII: '\@' Operators: [RadicalBox, SqrtBox] - Unicode: '\uf7c1' + Unicode: "\uf7c1" LinearSyntaxBacktick: ASCII: '\`' Operators: [FormBox] - Unicode: '\uf7cd' + Unicode: "\uf7cd" LinearSyntaxBang: ASCII: '\!' Operators: - Unicode: '\uf7c1' + Unicode: "\uf7c1" LinearSyntaxCaret: ASCII: '\^' Operators: SuperscriptBox - Unicode: '\uf7c6' + Unicode: "\uf7c6" # Note: this name does not appear in CodeParse LinearSyntaxCloseParen: ASCII: '\)' Operators: [RowBox] - Unicode: '\uf7c0' + Unicode: "\uf7c0" # Note: this name does not appear in CodeParse LinearSyntaxOpenParen: ASCII: '\(' Operators: [RowBox] - Unicode: '\uf7cd' + Unicode: "\uf7cd" LinearSyntaxPercent: ASCII: '\%' Operators: [RadicalBox, SuperscriptBox, UnderOverscriptBox] - Unicode: '\uf7c5' + Unicode: "\uf7c5" LinearSyntaxPlus: ASCII: '\+' Operators: [UnderscriptBox, UnderOverscriptBox] - Unicode: '\uf7cb' + Unicode: "\uf7cb" LinearSyntaxStar: ASCII: '\*' Operators: [] - Unicode: '\uf7c8' + Unicode: "\uf7c8" LinearSyntaxSlash: ASCII: '\/' Operators: [FractionBox] - Unicode: '\uf7cc' + Unicode: "\uf7cc" LinearSyntaxUnder: ASCII: '\_' Operators: [SubscriptBox, SubsuperscriptBox] - Unicode: '\uf7ca' + Unicode: "\uf7ca" diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index 27d8ad2..ae6cb43 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -2,15 +2,17 @@ Helper Module for tokenizing character escape sequences. 
""" -from typing import Optional, Tuple +from typing import Final, Optional, Tuple -from mathics_scanner.characters import named_characters +from mathics_scanner.characters import boxing_ascii_to_unicode, named_characters from mathics_scanner.errors import ( EscapeSyntaxError, NamedCharacterSyntaxError, SyntaxError, ) +BOX_OPERATOR: Final[str] = "&@`!^)(%*/_" + def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> str: r""" @@ -140,8 +142,12 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: assert c == "r" result += "\r" pos += 1 - elif c in '!"': - result += c + elif c in BOX_OPERATOR: + if (boxed_character := boxing_ascii_to_unicode.get("\\" + c)) is not None: + # Replace \ in result with Unicode representing the two ASCII characters. + result = result[:-1] + boxed_character + else: + raise EscapeSyntaxError("stresc", rf"\{c}") pos += 1 else: raise EscapeSyntaxError("stresc", rf"\{c}") diff --git a/mathics_scanner/mathics3_tokens.py b/mathics_scanner/mathics3_tokens.py index aea94d4..aef026d 100644 --- a/mathics_scanner/mathics3_tokens.py +++ b/mathics_scanner/mathics3_tokens.py @@ -8,6 +8,7 @@ import re import sys +from mathics_scanner.characters import replace_box_unicode_with_ascii from mathics_scanner.errors import ( EscapeSyntaxError, NamedCharacterSyntaxError, @@ -209,10 +210,13 @@ def tokens(shell: TerminalShell, source_text: str, code_tokenize_format: bool): if token.tag == "END": break elif code_tokenize_format: - shell.to_output(token.code_tokenize_format) - # print(token.code_tokenize_format) + mess = shell.get_out_prompt() + print( + mess + replace_box_unicode_with_ascii(token.code_tokenize_format) + "\n" + ) else: mess = shell.get_out_prompt() + token.text = replace_box_unicode_with_ascii(token.text) print(mess + str(token) + "\n") From 964e87f2185f523bd74669c738d06d47a14a54d3 Mon Sep 17 00:00:00 2001 From: rocky Date: Wed, 18 Feb 2026 05:44:53 -0500 Subject: [PATCH 07/26] Code cleanup. 
--- mathics_scanner/__init__.py | 4 +- mathics_scanner/characters.py | 92 +++++++++++++++++----------- mathics_scanner/escape_sequences.py | 21 +++++-- mathics_scanner/tokeniser.py | 76 +++++++++++------------ test/test_string_tokens.py | 3 +- test/test_translation_regressions.py | 4 +- 6 files changed, 115 insertions(+), 85 deletions(-) diff --git a/mathics_scanner/__init__.py b/mathics_scanner/__init__.py index ecfec27..bbd49f8 100644 --- a/mathics_scanner/__init__.py +++ b/mathics_scanner/__init__.py @@ -7,8 +7,8 @@ """ from mathics_scanner.characters import ( + NAMED_CHARACTERS, aliased_characters, - named_characters, replace_unicode_with_wl, replace_wl_with_plain_text, ) @@ -34,6 +34,7 @@ "InvalidSyntaxError", "LineFeeder", "MultiLineFeeder", + "NAMED_CHARACTERS", "SyntaxError", "SingleLineFeeder", # "Token", @@ -41,7 +42,6 @@ "__version__", "aliased_characters", # "is_symbol_name", - "named_characters", "replace_unicode_with_wl", "replace_wl_with_plain_text", ] diff --git a/mathics_scanner/characters.py b/mathics_scanner/characters.py index 7418a3c..64f2803 100644 --- a/mathics_scanner/characters.py +++ b/mathics_scanner/characters.py @@ -1,13 +1,16 @@ # -*- coding: utf-8 -*- -""" -The ``mathics_scanner.characters`` module consists mostly of translation tables -between Wolfram's internal representation of `named characters +"""This module consists mostly of translation tables between Wolfram's +internal representation of `named characters `_ and Unicode/ASCII. + +It also contains Unicode translation tables for the syntax used in +Boxing operators and Boxing expressions. 
""" import os.path as osp import re +from typing import Dict, Final try: import ujson @@ -16,44 +19,56 @@ def get_srcdir() -> str: - filename = osp.normcase(osp.dirname(osp.abspath(__file__))) - return osp.realpath(filename) + """Return the OS normalized real directory path for where this + code currently resides on disk.""" + directory_path = osp.normcase(osp.dirname(osp.abspath(__file__))) + return osp.realpath(directory_path) + +ROOT_DIR: Final[str] = get_srcdir() -ROOT_DIR = get_srcdir() # Load the conversion tables from disk -named_characters_path = osp.join(ROOT_DIR, "data", "named-characters.json") -if osp.exists(named_characters_path): - with open(named_characters_path, "r") as f: - _data = ujson.load(f) + +NAMED_CHARACTERS_PATH: Final[str] = osp.join(ROOT_DIR, "data", "named-characters.json") +if osp.exists(NAMED_CHARACTERS_PATH): + with open(NAMED_CHARACTERS_PATH, "r") as f: + NAMED_CHARACTERS_COLLECTION = ujson.load(f) else: - _data = {} + NAMED_CHARACTERS_COLLECTION = {} + +BOXING_CHARACTERS_PATH: Final[str] = osp.join( + ROOT_DIR, "data", "boxing-characters.json" +) -boxing_characters_path = osp.join(ROOT_DIR, "data", "boxing-characters.json") -if osp.exists(boxing_characters_path): - with open(boxing_characters_path, "r") as f: +if osp.exists(BOXING_CHARACTERS_PATH): + with open(BOXING_CHARACTERS_PATH, "r") as f: boxing_character_data = ujson.load(f) else: boxing_characters_data = {} -boxing_unicode_to_ascii = boxing_character_data.get("unicode-to-ascii", {}) -boxing_ascii_to_unicode = boxing_character_data.get("ascii-to-unicode", {}) +BOXING_UNICODE_TO_ASCII: Final[Dict[str, str]] = boxing_character_data.get( + "unicode-to-ascii", {} +) +BOXING_ASCII_TO_UNICODE: Final[Dict[str, str]] = boxing_character_data.get( + "ascii-to-unicode", {} +) replace_to_ascii_re = re.compile( "|".join( re.escape(unicode_character) - for unicode_character in boxing_unicode_to_ascii.keys() + for unicode_character in BOXING_UNICODE_TO_ASCII.keys() ) ) -def 
replace_box_unicode_with_ascii(s: str) -> str: - return replace_to_ascii_re.sub(lambda m: s[m.group(0)], s) +def replace_box_unicode_with_ascii(input_string): + return "".join(BOXING_UNICODE_TO_ASCII.get(char, char) for char in input_string) # Character ranges of letters -_letters = "a-zA-Z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u0103\u0106\u0107\ +_letters: Final[str] = ( + "a-zA-Z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u0103\u0106\u0107\ \u010c-\u010f\u0112-\u0115\u011a-\u012d\u0131\u0141\u0142\u0147\u0148\ \u0150-\u0153\u0158-\u0161\u0164\u0165\u016e-\u0171\u017d\u017e\ \u0391-\u03a1\u03a3-\u03a9\u03b1-\u03c9\u03d1\u03d2\u03d5\u03d6\ @@ -62,37 +77,44 @@ def replace_box_unicode_with_ascii(s: str) -> str: \uf6ba-\uf6bc\uf6be\uf6bf\uf6c1-\uf700\uf730\uf731\uf770\uf772\uf773\ \uf776\uf779\uf77a\uf77d-\uf780\uf782-\uf78b\uf78d-\uf78f\uf790\ \uf793-\uf79a\uf79c-\uf7a2\uf7a4-\uf7bd\uf800-\uf833\ufb01\ufb02" +) # Character ranges of letterlikes -_letterlikes = _data.get("letterlikes", {}) +_letterlikes: Final[Dict[str, str]] = NAMED_CHARACTERS_COLLECTION.get("letterlikes", {}) # Conversion from WL to the fully qualified names -_wl_to_ascii = _data.get("wl-to-ascii-dict", {}) -_wl_to_ascii_re = re.compile(_data.get("wl-to-ascii-re", "")) +_wl_to_ascii: Final[Dict[str, str]] = NAMED_CHARACTERS_COLLECTION.get( + "wl-to-ascii-dict", {} +) +_wl_to_ascii_re = re.compile(NAMED_CHARACTERS_COLLECTION.get("wl-to-ascii-re", "")) # AMS LaTeX replacements -_wl_to_amstex = _data.get("wl-to-amstex", None) +_wl_to_amstex = NAMED_CHARACTERS_COLLECTION.get("wl-to-amstex", None) # Conversion from WL to unicode -_wl_to_unicode = _data.get("wl-to-unicode-dict", _data.get("wl_to_ascii")) -_wl_to_unicode_re = re.compile(_data.get("wl-to-unicode-re", "")) +_wl_to_unicode = NAMED_CHARACTERS_COLLECTION.get( + "wl-to-unicode-dict", NAMED_CHARACTERS_COLLECTION.get("wl_to_ascii") +) +_wl_to_unicode_re = re.compile(NAMED_CHARACTERS_COLLECTION.get("wl-to-unicode-re", "")) # Conversion from unicode to WL 
-_unicode_to_wl = _data.get("unicode-to-wl-dict", {}) -_unicode_to_wl_re = re.compile(_data.get("unicode-to-wl-re", "")) +_unicode_to_wl = NAMED_CHARACTERS_COLLECTION.get("unicode-to-wl-dict", {}) +_unicode_to_wl_re = re.compile(NAMED_CHARACTERS_COLLECTION.get("unicode-to-wl-re", "")) # All supported named characters -named_characters = _data.get("named-characters", {}) +NAMED_CHARACTERS: Final[Dict[str, str]] = NAMED_CHARACTERS_COLLECTION.get( + "named-characters", {} +) # ESC sequence aliases -aliased_characters = _data.get("aliased-characters", {}) +aliased_characters = NAMED_CHARACTERS_COLLECTION.get("aliased-characters", {}) # Deprecated def replace_wl_with_plain_text(wl_input: str, use_unicode=True) -> str: """ The Wolfram Language uses specific Unicode characters to represent Wolfram - Language named characters. This functions replaces all occurrences of such + Language named characters. This function replaces all occurrences of such characters with their corresponding Unicode/ASCII equivalents. :param wl_input: The string whose characters will be replaced. @@ -101,7 +123,7 @@ def replace_wl_with_plain_text(wl_input: str, use_unicode=True) -> str: Note that the occurrences of named characters in ``wl_input`` are expect to be represented by Wolfram's internal scheme. For more information Wolfram's - representation scheme and on our own conversion scheme please see `Listing + representation scheme and on our own conversion scheme, please see `Listing of Named Characters `_ and ``implementation.rst`` respectively. @@ -109,7 +131,7 @@ def replace_wl_with_plain_text(wl_input: str, use_unicode=True) -> str: r = _wl_to_unicode_re if use_unicode else _wl_to_ascii_re d = _wl_to_unicode if use_unicode else _wl_to_ascii - # The below on when use_unicode is False will sometime test on "ascii" twice. + # The below, when use_unicode is False, will sometimes test on "ascii" twice. # But this routine should be deprecated. 
return r.sub(lambda m: d.get(m.group(0), _wl_to_ascii.get(m.group(0))), wl_input) @@ -118,7 +140,7 @@ def replace_wl_with_plain_text(wl_input: str, use_unicode=True) -> str: def replace_unicode_with_wl(unicode_input: str) -> str: """ The Wolfram Language uses specific Unicode characters to represent Wolfram - Language named characters. This functions replaces all occurrences of the + Language named characters. This function replaces all occurrences of the corresponding Unicode equivalents of such characters with the characters themselves. @@ -127,7 +149,7 @@ def replace_unicode_with_wl(unicode_input: str) -> str: Note that the occurrences of named characters in the output of ``replace_unicode_with_wl`` are represented using Wolfram's internal scheme. For more information Wolfram's representation scheme and on our own - conversion scheme please see `Listing of Named Characters + conversion scheme, please see `Listing of Named Characters `_ and ``implementation.rst`` respectively. """ diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index ae6cb43..1bb4638 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -4,15 +4,24 @@ from typing import Final, Optional, Tuple -from mathics_scanner.characters import boxing_ascii_to_unicode, named_characters +from mathics_scanner.characters import BOXING_ASCII_TO_UNICODE, NAMED_CHARACTERS from mathics_scanner.errors import ( EscapeSyntaxError, NamedCharacterSyntaxError, SyntaxError, ) +# The second character, or character after backslash ("\") using +# Boxing expression syntax. BOX_OPERATOR: Final[str] = "&@`!^)(%*/_" +# The second character, or character after backslash ("\") that +# are valid in a Mathics3 escaped character. 
+ESCAPE_CODES: Final[str] = "ntbfr $\n" + +# Valid digits in an Octal string +OCTAL_DIGITS: Final[str] = "01234567" + def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> str: r""" @@ -57,13 +66,13 @@ def parse_named_character(source_text: str, start: int, finish: int) -> Optional Match this string with the known named characters, e.g. "Theta". If we can match this, then we return the unicode equivalent from the - `named_characters` map (which is read in from JSON but stored in a YAML file). + `NAMED_CHARACTERS` map (which is read in from JSON but stored in a YAML file). If we can't find the named character, raise NamedCharacterSyntaxError. """ named_character = source_text[start:finish] if named_character.isalpha(): - char = named_characters.get(named_character) + char = NAMED_CHARACTERS.get(named_character) if char is None: raise NamedCharacterSyntaxError("sntufn", named_character, source_text) else: @@ -114,7 +123,7 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: result += named_character pos = i + 1 - elif c in "01234567": + elif c in OCTAL_DIGITS: # See if we have a 3-digit octal number. # For example \065 = "5" result += parse_base(source_text, pos, pos + 3, 8) @@ -124,7 +133,7 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: # Note that these are a similer to Python, but are different. # In particular, Python defines "\a" to be ^G (control G), # but in WMA, this is invalid. - elif c in "ntbfr $\n": + elif c in ESCAPE_CODES: if c in "n\n": result += "\n" elif c == " ": @@ -143,7 +152,7 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: result += "\r" pos += 1 elif c in BOX_OPERATOR: - if (boxed_character := boxing_ascii_to_unicode.get("\\" + c)) is not None: + if (boxed_character := BOXING_ASCII_TO_UNICODE.get("\\" + c)) is not None: # Replace \ in result with Unicode representing the two ASCII characters. 
result = result[:-1] + boxed_character else: diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 8034575..c3e3714 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -9,9 +9,9 @@ import os.path as osp import re import string -from typing import Dict, List, Optional, Set, Tuple +from typing import Dict, Final, List, Optional, Set, Tuple -from mathics_scanner.characters import _letterlikes, _letters, named_characters +from mathics_scanner.characters import NAMED_CHARACTERS, _letterlikes, _letters from mathics_scanner.errors import ( EscapeSyntaxError, IncompleteSyntaxError, @@ -30,10 +30,10 @@ ROOT_DIR = osp.dirname(__file__) OPERATORS_TABLE_PATH = osp.join(ROOT_DIR, "data", "operators.json") -############################################## +################################################ # The below get initialized in by init_module() # from operator data -############################################## +################################################ OPERATOR_DATA = {} NO_MEANING_OPERATORS = {} @@ -41,7 +41,7 @@ # This is used in t_String for escape-sequence handling. 
# The below is roughly correct, but we overwrite this # from operators.json data in init_module() -BOXING_CONSTRUCT_SUFFIXES: Set[str] = { +BOXING_CONSTRUCT_SUFFIXES: Final[Set[str]] = { "%", "/", "@", @@ -120,7 +120,7 @@ # FIXME incorportate the below table in to Function/Operators YAML # Table of correspondneces between a Mathics3 token name (or "tag") # and WMA CodeTokenize name -MATHICS3_TAG_TO_CODETOKENIZE: Dict[str, str] = { +MATHICS3_TAG_TO_CODETOKENIZE: Final[Dict[str, str]] = { "AddTo": "PlusEqual", "Alternatives": "Bar", "And": "AmpAmp", @@ -263,59 +263,59 @@ def init_module(): # ("AddTo", r" \+\= "), ("Alternatives", r" \| "), - ("And", rf" (\&\&) | {named_characters['And']} "), + ("And", rf" (\&\&) | {NAMED_CHARACTERS['And']} "), ("Apply", r" \@\@ "), ("ApplyList", r" \@\@\@ "), ("Composition", r" \@\* "), ("Condition", r" \/\; "), - ("Conjugate", rf" {named_characters['Conjugate']} "), + ("Conjugate", rf" {NAMED_CHARACTERS['Conjugate']} "), ("ConjugateTranspose", r" \uf3c9 "), - ("Cross", rf" \uf4a0 | {named_characters['Cross']} "), + ("Cross", rf" \uf4a0 | {NAMED_CHARACTERS['Cross']} "), ("Decrement", r" \-\- "), - ("Del", rf" {named_characters['Del']} "), + ("Del", rf" {NAMED_CHARACTERS['Del']} "), ("Derivative", r" \' "), # ('DifferenceDelta', r' \u2206 '), # https://reference.wolfram.com/language/ref/character/DirectedEdge.html - ("DirectedEdge", rf" -> | \uf3d5 | {named_characters['DirectedEdge']} "), + ("DirectedEdge", rf" -> | \uf3d5 | {NAMED_CHARACTERS['DirectedEdge']} "), # ('DiscreteRatio', r' \uf4a4 '), # ('DiscreteShift', r' \uf4a3 '), - ("Conjugate", rf" {named_characters['Conjugate']} "), + ("Conjugate", rf" {NAMED_CHARACTERS['Conjugate']} "), ("ConjugateTranspose", r" \uf3c9 "), - ("DifferentialD", rf" \uf74c | {named_characters['DifferentialD']} "), - ("Divide", rf" \/| {named_characters['Divide']} "), + ("DifferentialD", rf" \uf74c | {NAMED_CHARACTERS['DifferentialD']} "), + ("Divide", rf" \/| {NAMED_CHARACTERS['Divide']} "), 
("DivideBy", r" \/\= "), ("Dot", r" \. "), - ("Element", r" {named_characters['Element']} "), - ("Equal", rf" (\=\=) | \uf431 | {named_characters['Equal']} | \uf7d9 "), - ("Equivalent", r" {named_characters['Equivalent']} "), - ("Exists", r" {named_characters['Exists']} "), + ("Element", r" {NAMED_CHARACTERS['Element']} "), + ("Equal", rf" (\=\=) | \uf431 | {NAMED_CHARACTERS['Equal']} | \uf7d9 "), + ("Equivalent", r" {NAMED_CHARACTERS['Equivalent']} "), + ("Exists", r" {NAMED_CHARACTERS['Exists']} "), ("Factorial", r" \! "), ("Factorial2", r" \!\! "), - ("ForAll", r" {named_characters['ForAll']} "), - ("Function", rf" \& | \uF4A1 | {named_characters['Function']} | \|-> "), + ("ForAll", r" {NAMED_CHARACTERS['ForAll']} "), + ("Function", rf" \& | \uF4A1 | {NAMED_CHARACTERS['Function']} | \|-> "), ("Greater", r" \> "), - ("GreaterEqual", rf" (\>\=) | {named_characters['GreaterEqual']} "), + ("GreaterEqual", rf" (\>\=) | {NAMED_CHARACTERS['GreaterEqual']} "), ("HermitianConjugate", r" \uf3ce "), ("Implies", r" \uF523 "), ("Increment", r" \+\+ "), ("Infix", r" \~ "), ("Information", r"\?\?"), ("Integral", r" \u222b "), - ("Intersection", rf" {named_characters['Intersection']} "), + ("Intersection", rf" {NAMED_CHARACTERS['Intersection']} "), ("Less", r" \< "), - ("LessEqual", rf" (\<\=) | {named_characters['LessEqual']} "), + ("LessEqual", rf" (\<\=) | {NAMED_CHARACTERS['LessEqual']} "), ("Map", r" \/\@ "), ("MapAll", r" \/\/\@ "), - # FIXME: can't use named_characters in Minus because the ASCII minus + # FIXME: can't use NAMED_CHARACTERS in Minus because the ASCII minus # causes the unicode not to appear in tables. 
("Minus", r" \-| \u2122 "), - ("Nand", rf" {named_characters['Nand']} "), + ("Nand", rf" {NAMED_CHARACTERS['Nand']} "), ("NonCommutativeMultiply", r" \*\* "), - ("Nor", rf" {named_characters['Nor']} "), - ("Not", r" {named_characters['Not']} "), - ("NotElement", r" {named_characters['NotElement']} "), - ("NotExists", r" {named_characters['NotExists']} "), - ("Or", rf" (\|\|) | {named_characters['Or']} "), + ("Nor", rf" {NAMED_CHARACTERS['Nor']} "), + ("Not", r" {NAMED_CHARACTERS['Not']} "), + ("NotElement", r" {NAMED_CHARACTERS['NotElement']} "), + ("NotExists", r" {NAMED_CHARACTERS['NotExists']} "), + ("Or", rf" (\|\|) | {NAMED_CHARACTERS['Or']} "), # ('PartialD', r' \u2202 '), ("PatternTest", r" \? "), ("Plus", r" \+ "), @@ -330,31 +330,31 @@ def init_module(): ("ReplaceAll", r" \/\. "), ("ReplaceRepeated", r" \/\/\. "), ("RightComposition", r" \/\* "), - ("Rule", r" (\-\>)| \uF522 | {named_characters['Rule']} "), + ("Rule", r" (\-\>)| \uF522 | {NAMED_CHARACTERS['Rule']} "), ("RuleDelayed", r" (\:\>)|\uF51F "), ("SameQ", r" \=\=\= "), ("Semicolon", r" \; "), ("Set", r" \= "), ("SetDelayed", r" \:\= "), - ("Square", rf" \uf520 | {named_characters['Square']}"), + ("Square", rf" \uf520 | {NAMED_CHARACTERS['Square']}"), ("StringExpression", r" \~\~ "), ("StringJoin", r" \<\> "), ("SubtractFrom", r" \-\= "), # ('Sum', r' \u2211 '), ("TagSet", r" \/\: "), - ("Times", rf" \*|{named_characters['Times']} "), + ("Times", rf" \*|{NAMED_CHARACTERS['Times']} "), ("TimesBy", r" \*\= "), - ("Transpose", rf" \uf3c7 | {named_characters['Transpose']} "), - ("Unequal", rf" (\!\= ) | {named_characters['NotEqual']} "), - ("Union", rf" {named_characters['Union']} "), + ("Transpose", rf" \uf3c7 | {NAMED_CHARACTERS['Transpose']} "), + ("Unequal", rf" (\!\= ) | {NAMED_CHARACTERS['NotEqual']} "), + ("Union", rf" {NAMED_CHARACTERS['Union']} "), ("UnsameQ", r" \=\!\= "), ("Xnor", r" \uF4A2 "), - ("Xor", rf" {named_characters['Xor']} "), + ("Xor", rf" {NAMED_CHARACTERS['Xor']} "), # 
https://reference.wolfram.com/language/ref/character/UndirectedEdge.html # The official Unicode value is \u2194 ( "UndirectedEdge", - rf" (\<\-\>)|\u29DF | {named_characters['UndirectedEdge']} ", + rf" (\<\-\>)|\u29DF | {NAMED_CHARACTERS['UndirectedEdge']} ", ), # allow whitespace but avoid e.g. x=.01 ("Unset", r" \=\s*\.(?!\d|\.) "), diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index 61cdc76..4fa36d9 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -96,12 +96,11 @@ def test_string(): check_string( r'"\(a \+\)"', - r'"\(a \+\)"', + r'"a \+"', "Do not interpret, but preserve boxing inside a string", ) incomplete_error(r'"abc', "String does not have terminating quote") - incomplete_error(r'"\"', "Unterminated escape sequence") escape_scan_error(r'"a\g"', "Unknown string escape \\g") escape_scan_error(r'"a\X"', '"X" is not a valid escape character') diff --git a/test/test_translation_regressions.py b/test/test_translation_regressions.py index d0d4c13..72069ee 100644 --- a/test/test_translation_regressions.py +++ b/test/test_translation_regressions.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- -from mathics_scanner.characters import replace_wl_with_plain_text, named_characters +from mathics_scanner.characters import NAMED_CHARACTERS, replace_wl_with_plain_text def check_translation_regression(c: str, expected_translation: str): - translation = replace_wl_with_plain_text(named_characters[c]) + translation = replace_wl_with_plain_text(NAMED_CHARACTERS[c]) assert ( translation == expected_translation ), f"REGRESSION {c} is translated to {translation} but it should translate to {expected_translation}" From 28f05ba6187f52b09fc04f8d3adcba3db07298be Mon Sep 17 00:00:00 2001 From: "R. 
Bernstein" Date: Wed, 18 Feb 2026 05:46:54 -0500 Subject: [PATCH 08/26] Fix comments for consistency in characters.py --- mathics_scanner/characters.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mathics_scanner/characters.py b/mathics_scanner/characters.py index 64f2803..eef79f6 100644 --- a/mathics_scanner/characters.py +++ b/mathics_scanner/characters.py @@ -91,13 +91,13 @@ def replace_box_unicode_with_ascii(input_string): # AMS LaTeX replacements _wl_to_amstex = NAMED_CHARACTERS_COLLECTION.get("wl-to-amstex", None) -# Conversion from WL to unicode +# Conversion from WL to Unicode _wl_to_unicode = NAMED_CHARACTERS_COLLECTION.get( "wl-to-unicode-dict", NAMED_CHARACTERS_COLLECTION.get("wl_to_ascii") ) _wl_to_unicode_re = re.compile(NAMED_CHARACTERS_COLLECTION.get("wl-to-unicode-re", "")) -# Conversion from unicode to WL +# Conversion from Unicode to WL _unicode_to_wl = NAMED_CHARACTERS_COLLECTION.get("unicode-to-wl-dict", {}) _unicode_to_wl_re = re.compile(NAMED_CHARACTERS_COLLECTION.get("unicode-to-wl-re", "")) @@ -122,7 +122,7 @@ def replace_wl_with_plain_text(wl_input: str, use_unicode=True) -> str: for the conversion. Note that the occurrences of named characters in ``wl_input`` are expect to - be represented by Wolfram's internal scheme. For more information Wolfram's + be represented by Wolfram's internal scheme. For more information on Wolfram's representation scheme and on our own conversion scheme, please see `Listing of Named Characters `_ @@ -148,7 +148,7 @@ def replace_unicode_with_wl(unicode_input: str) -> str: Note that the occurrences of named characters in the output of ``replace_unicode_with_wl`` are represented using Wolfram's internal - scheme. For more information Wolfram's representation scheme and on our own + scheme. For more information on Wolfram's representation scheme and on our own conversion scheme, please see `Listing of Named Characters `_ and ``implementation.rst`` respectively. 
From 9ba677532a462748daeea8d5ce0ee3168f7cc3ae Mon Sep 17 00:00:00 2001 From: "R. Bernstein" Date: Wed, 18 Feb 2026 05:48:02 -0500 Subject: [PATCH 09/26] Fix spelling errors in escape_sequences.py Corrected spelling errors in comments and docstrings. --- mathics_scanner/escape_sequences.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index 1bb4638..95499de 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -28,7 +28,7 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> See if characters start_shift .. end shift can be converted to an integer in base ``base``. - If so, chr(integer value converted from base) is returnd. + If so, chr(integer value converted from base) is returned. However, if the conversion fails, SyntaxError is raised. """ @@ -58,14 +58,14 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> def parse_named_character(source_text: str, start: int, finish: int) -> Optional[str]: r""" - Find the unicode-equivalent symbol for a string named character. + Find the Unicode equivalent symbol for a string named character. - Before calling we have matched the text between "\[" and "]" of the input. + Before calling, we have matched the text between "\[" and "]" of the input. The name character is thus in source_text[start:finish]. Match this string with the known named characters, - e.g. "Theta". If we can match this, then we return the unicode equivalent from the + e.g., "Theta". If we can match this, then we return the Unicode equivalent from the `NAMED_CHARACTERS` map (which is read in from JSON but stored in a YAML file). If we can't find the named character, raise NamedCharacterSyntaxError. @@ -130,7 +130,7 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: pos += 3 # WMA escape characters \n, \t, \b, \r. 
- # Note that these are a similer to Python, but are different. + # Note that these are similar to Python, but are different. # In particular, Python defines "\a" to be ^G (control G), # but in WMA, this is invalid. elif c in ESCAPE_CODES: From 74d2513184e99d4b92f8f672f81f7a6b9464c831 Mon Sep 17 00:00:00 2001 From: rocky Date: Wed, 18 Feb 2026 06:01:33 -0500 Subject: [PATCH 10/26] Correct a module name --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 35b58e7..5be33b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ full = [ ] [project.scripts] -mathics3-make-boxing-character-json = "mathics_scanner.generate.box_characters:main" +mathics3-make-boxing-character-json = "mathics_scanner.generate.boxing_characters:main" mathics3-make-named-character-json = "mathics_scanner.generate.named_characters:main" mathics3-make-operator-json = "mathics_scanner.generate.operators:main" mathics3-tokens = "mathics_scanner.mathics3_tokens:main" From 01d63d9ea78dec2ea87152cc726ecf268594c518 Mon Sep 17 00:00:00 2001 From: rocky Date: Wed, 18 Feb 2026 06:24:18 -0500 Subject: [PATCH 11/26] Reinstate \" escape. It accidentally got deleted in creating ESCAPE_CODES --- mathics_scanner/escape_sequences.py | 2 +- test/test_string_tokens.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index 95499de..67c08bf 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -17,7 +17,7 @@ # The second character, or character after backslash ("\") that # are valid in a Mathics3 escaped character. 
-ESCAPE_CODES: Final[str] = "ntbfr $\n" +ESCAPE_CODES: Final[str] = 'ntbfr" $\n' # Valid digits in an Octal string OCTAL_DIGITS: Final[str] = "01234567" diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index 4fa36d9..d7fb2ed 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -101,6 +101,7 @@ def test_string(): ) incomplete_error(r'"abc', "String does not have terminating quote") + incomplete_error(r'"\"', "Unterminated escape sequence") escape_scan_error(r'"a\g"', "Unknown string escape \\g") escape_scan_error(r'"a\X"', '"X" is not a valid escape character') From 40a54b302307ac03dc0ac305ee589770f045492a Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 19 Feb 2026 10:40:51 -0500 Subject: [PATCH 12/26] Do Unicode translation only if inside a Box expr --- mathics_scanner/escape_sequences.py | 11 +++++++++-- mathics_scanner/tokeniser.py | 14 +++++++++----- test/test_escape_sequences.py | 9 ++++++--- test/test_string_tokens.py | 2 +- 4 files changed, 25 insertions(+), 11 deletions(-) diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index 67c08bf..f448e71 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -79,7 +79,9 @@ def parse_named_character(source_text: str, start: int, finish: int) -> Optional return char -def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: +def parse_escape_sequence( + source_text: str, pos: int, is_inside_box: bool +) -> Tuple[str, int]: """Given some source text in `source_text` starting at offset `pos`, return the escape-sequence value for this text and the follow-on offset position. 
@@ -136,6 +138,8 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: elif c in ESCAPE_CODES: if c in "n\n": result += "\n" + elif c == '"': + result += '"' elif c == " ": result += " " elif c == "t": @@ -151,13 +155,16 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: assert c == "r" result += "\r" pos += 1 - elif c in BOX_OPERATOR: + elif is_inside_box and c in BOX_OPERATOR: if (boxed_character := BOXING_ASCII_TO_UNICODE.get("\\" + c)) is not None: # Replace \ in result with Unicode representing the two ASCII characters. result = result[:-1] + boxed_character else: raise EscapeSyntaxError("stresc", rf"\{c}") pos += 1 + elif c in '!"': + result += c + pos += 1 else: raise EscapeSyntaxError("stresc", rf"\{c}") return result, pos diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index c3e3714..04a8fee 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -566,7 +566,7 @@ def __init__(self, feeder): # Set to True when inside box parsing. # This has an effect on which escape operators are allowed. 
- self._is_inside_box: bool = False + self.is_inside_box: bool = False self._change_token_scanning_mode("expr") @@ -702,7 +702,7 @@ def next(self) -> Token: try: escape_str, next_pos = parse_escape_sequence( - self.source_text, self.pos + 1 + self.source_text, self.pos + 1, self.is_inside_box ) except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: if self.is_inside_box: @@ -809,7 +809,9 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: source_text += self.source_text try: - escape_str, self.pos = parse_escape_sequence(source_text, start_pos) + escape_str, self.pos = parse_escape_sequence( + source_text, start_pos, self.is_inside_box + ) if source_text[start_pos] == "[" and source_text[self.pos - 1] == "]": named_character = source_text[start_pos + 1 : self.pos - 1] except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: @@ -877,7 +879,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: try: escape_str, next_pos = parse_escape_sequence( - self.source_text, self.pos + 1 + self.source_text, self.pos + 1, self.is_inside_box ) except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: if self.is_inside_box: @@ -940,7 +942,9 @@ def t_String(self, _: Optional[re.Match]) -> Token: self.get_more_input() self.pos += 1 try: - escape_str, self.pos = parse_escape_sequence(source_text, self.pos) + escape_str, self.pos = parse_escape_sequence( + source_text, self.pos, self.is_inside_box + ) except NamedCharacterSyntaxError as escape_error: self.feeder.message( escape_error.name, escape_error.tag, *escape_error.args diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py index 30af4c3..5143a3a 100644 --- a/test/test_escape_sequences.py +++ b/test/test_escape_sequences.py @@ -45,13 +45,16 @@ def test_escape_sequences(): (r"z \[Conjugate]", 3, 14, "\uf3c8", "Named character; at end"), ("[Integral]", 0, 10, "\u222b", "Another full-string named-character"), ): - assert 
parse_escape_sequence(text, pos) == (expect_str, expect_pos), fail_msg + assert parse_escape_sequence(text, pos, is_inside_box=False) == ( + expect_str, + expect_pos, + ), fail_msg def test_invalid_named_character_sequences(): for text in (r"\[", r"\[Theta", r"\[Fake]", r"\[abc]"): with pytest.raises(NamedCharacterSyntaxError): - parse_escape_sequence(text, 1) + parse_escape_sequence(text, 1, is_inside_box=False) def test_invalid_number_encoding(): @@ -75,4 +78,4 @@ def test_invalid_number_encoding(): ":01-2", ): with pytest.raises(SyntaxError): - parse_escape_sequence(text, 0) + parse_escape_sequence(text, 0, is_inside_box=False) diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index d7fb2ed..61cdc76 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -96,7 +96,7 @@ def test_string(): check_string( r'"\(a \+\)"', - r'"a \+"', + r'"\(a \+\)"', "Do not interpret, but preserve boxing inside a string", ) From a985ec6bc8da9e6ad0298bd4e208e370d5f3713a Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 19 Feb 2026 11:48:49 -0500 Subject: [PATCH 13/26] Test mathics corresponding Mathics3 branch --- .github/workflows/mathics.yml | 2 +- mathics_scanner/characters.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mathics.yml b/.github/workflows/mathics.yml index c431390..ee2ecab 100644 --- a/.github/workflows/mathics.yml +++ b/.github/workflows/mathics.yml @@ -33,7 +33,7 @@ jobs: git clone --depth 1 https://github.com/Mathics3/mathics-scanner.git (cd mathics-scanner && pip install -e .) # Until next Mathics3/mathics-core release is out... 
- git clone --depth 1 https://github.com/Mathics3/mathics-core.git + git clone -b add-unicode-box-characters --depth 1 https://github.com/Mathics3/mathics-core.git cd mathics-core/ make PIP_INSTALL_OPTS='[full]' # pip install Mathics3[full] diff --git a/mathics_scanner/characters.py b/mathics_scanner/characters.py index eef79f6..e62b500 100644 --- a/mathics_scanner/characters.py +++ b/mathics_scanner/characters.py @@ -45,7 +45,7 @@ def get_srcdir() -> str: with open(BOXING_CHARACTERS_PATH, "r") as f: boxing_character_data = ujson.load(f) else: - boxing_characters_data = {} + boxing_character_data = {} BOXING_UNICODE_TO_ASCII: Final[Dict[str, str]] = boxing_character_data.get( "unicode-to-ascii", {} From 5a3da9d93a7f3a39a84e2cc92cc88c1fcd87d519 Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 19 Feb 2026 12:13:43 -0500 Subject: [PATCH 14/26] See if this improves Windows CI --- .../{mathics.yml => mathics3-doctest.yml} | 4 +-- mathics_scanner/generate/named_characters.py | 2 +- setup.py | 34 +++++++++++-------- 3 files changed, 23 insertions(+), 17 deletions(-) rename .github/workflows/{mathics.yml => mathics3-doctest.yml} (93%) diff --git a/.github/workflows/mathics.yml b/.github/workflows/mathics3-doctest.yml similarity index 93% rename from .github/workflows/mathics.yml rename to .github/workflows/mathics3-doctest.yml index ee2ecab..9cf3d5c 100644 --- a/.github/workflows/mathics.yml +++ b/.github/workflows/mathics3-doctest.yml @@ -1,4 +1,4 @@ -name: Mathics_Script (Mathics doctest) +name: Mathics3_Doctest (Mathics3 doctest) on: push: @@ -28,7 +28,7 @@ jobs: - name: Install Mathics_Scanner run: | make - - name: Test Mathics3 + - name: Run Mathics3 Doctests run: | git clone --depth 1 https://github.com/Mathics3/mathics-scanner.git (cd mathics-scanner && pip install -e .) 
diff --git a/mathics_scanner/generate/named_characters.py b/mathics_scanner/generate/named_characters.py index d787f5f..8cd9a22 100755 --- a/mathics_scanner/generate/named_characters.py +++ b/mathics_scanner/generate/named_characters.py @@ -258,7 +258,7 @@ def compile_tables(data: dict) -> dict: } -DEFAULT_DATA_DIR = Path(osp.normpath(osp.dirname(__file__)), "..", "data") +DEFAULT_DATA_DIR = Path(__file__).parent.parent / "data" ALL_FIELDS = [ "aliased-characters", diff --git a/setup.py b/setup.py index 82c59b7..e94970b 100644 --- a/setup.py +++ b/setup.py @@ -25,34 +25,40 @@ mathics-users@googlegroups.com and ask for help. """ -import os import os.path as osp +import subprocess +import sys from setuptools import setup -from setuptools.command.build_py import build_py as setuptools_build_py +from setuptools.command.egg_info import egg_info def get_srcdir(): - """return the directory of the location if this code""" + """Return the directory of the location if this code""" filename = osp.normcase(osp.dirname(osp.abspath(__file__))) return osp.realpath(filename) -class build_py(setuptools_build_py): - def run(self): - for table_type in ("boxing-character", "named-character", "operator"): - json_data_file = osp.join("data", f"{table_type}.json") - json_path = osp.join("mathics-scanner", json_data_file) - if not osp.exists(json_path): - os.system(f"mathics3-make-{table_type}-json" " -o {json-path}") - self.distribution.package_data["Mathics-Scanner"].append(json_data_file) - setuptools_build_py.run(self) +class table_building_egg_info(egg_info): + """This runs as part of building an sdist""" + def finalize_options(self): + """Run program to create JSON tables""" + for table_program in ("boxing-character", "named-character", "operator"): + build_tables_program = osp.join( + get_srcdir(), "mathics_scanner", "generate", f"{table_program}.py" + ) + print(f"Building JSON tables via {build_tables_program}") + result = subprocess.run([sys.executable, build_tables_program], 
check=False) + if result.returncode: + raise RuntimeError( + f"Running {build_tables_program} exited with code {result.returncode}" + ) + super().finalize_options() -CMDCLASS = {"build_py": build_py} setup( - cmdclass=CMDCLASS, + cmdclass={"egg_info": table_building_egg_info}, # don't pack Mathics in egg because of media files, etc. zip_safe=False, ) From 985922802d27d83678c2e7e1ded3a2a3e06dd93f Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 19 Feb 2026 12:39:18 -0500 Subject: [PATCH 15/26] Go back to older form of setup on MSWindows. --- .github/workflows/windows.yml | 8 ++++---- setup.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 8f88834..38d8f5a 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -28,9 +28,9 @@ jobs: pip install -e . - name: Test Mathics3 run: | - # I don't think I need this anymore: - # python -m mathics_scanner.generate.boxing_characters - # python -m mathics_scanner.generate.named_characters - # python -m mathics_scanner.generate.operators + # This seems to be needed on Windows. 
+ python -m mathics_scanner.generate.boxing_characters + python -m mathics_scanner.generate.named_characters + python -m mathics_scanner.generate.operators pip install -e .[dev,full] py.test test diff --git a/setup.py b/setup.py index e94970b..5ea8694 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ def finalize_options(self): raise RuntimeError( f"Running {build_tables_program} exited with code {result.returncode}" ) - super().finalize_options() + super().finalize_options() setup( From 59879a4c2a29fa0fc7a683597f8a34d61e32b3f2 Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 19 Feb 2026 12:49:50 -0500 Subject: [PATCH 16/26] Another attempt to get setup working --- Makefile | 2 +- setup.py | 37 ++++++++++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 57e6fa8..3de0759 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ build: mathics_scanner/data/characters.json mathics_scanner/data/named_character #: Set up to run from the source tree develop: mathics_scanner/data/boxing-characters.json mathics_scanner/data/named-characters.json mathics_scanner/data/operators.json - $(PIP) install -e .$(PIP_INSTALL_OPTS) + $(PIP) install --no-build-isolation -e . 
$(PIP_INSTALL_OPTS) #: Build distribution dist: admin-tools/make-dist.sh diff --git a/setup.py b/setup.py index 5ea8694..99801f7 100644 --- a/setup.py +++ b/setup.py @@ -44,16 +44,35 @@ class table_building_egg_info(egg_info): def finalize_options(self): """Run program to create JSON tables""" - for table_program in ("boxing-character", "named-character", "operator"): - build_tables_program = osp.join( - get_srcdir(), "mathics_scanner", "generate", f"{table_program}.py" + build_tables_program = osp.join( + get_srcdir(), "mathics_scanner", "generate", "named_characters.py" + ) + print(f"Building JSON tables via {build_tables_program}") + result = subprocess.run([sys.executable, build_tables_program], check=False) + if result.returncode: + raise RuntimeError( + f"Running {build_tables_program} exited with code {result.returncode}" + ) + super().finalize_options() + build_tables_program = osp.join( + get_srcdir(), "mathics_scanner", "generate", "operators.py" + ) + print(f"Building JSON tables via {build_tables_program}") + result = subprocess.run([sys.executable, build_tables_program], check=False) + if result.returncode: + raise RuntimeError( + f"Running {build_tables_program} exited with code {result.returncode}" + ) + super().finalize_options() + build_tables_program = osp.join( + get_srcdir(), "mathics_scanner", "generate", "boxing_characters.py" + ) + print(f"Building JSON tables via {build_tables_program}") + result = subprocess.run([sys.executable, build_tables_program], check=False) + if result.returncode: + raise RuntimeError( + f"Running {build_tables_program} exited with code {result.returncode}" ) - print(f"Building JSON tables via {build_tables_program}") - result = subprocess.run([sys.executable, build_tables_program], check=False) - if result.returncode: - raise RuntimeError( - f"Running {build_tables_program} exited with code {result.returncode}" - ) super().finalize_options() From 7772a58796075d4daeddfb4ad1e6810d8073074b Mon Sep 17 00:00:00 2001 
From: rocky Date: Thu, 19 Feb 2026 12:51:43 -0500 Subject: [PATCH 17/26] Comment out fancy setup - it's not working. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 99801f7..043f1b3 100644 --- a/setup.py +++ b/setup.py @@ -77,7 +77,7 @@ def finalize_options(self): setup( - cmdclass={"egg_info": table_building_egg_info}, + # cmdclass={"egg_info": table_building_egg_info}, # don't pack Mathics in egg because of media files, etc. zip_safe=False, ) From 270d2e58e72f84c626e75c8289d604c26a7f0579 Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 19 Feb 2026 12:53:59 -0500 Subject: [PATCH 18/26] Another CI attempt --- setup.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/setup.py b/setup.py index 043f1b3..9a0a56d 100644 --- a/setup.py +++ b/setup.py @@ -54,30 +54,10 @@ def finalize_options(self): f"Running {build_tables_program} exited with code {result.returncode}" ) super().finalize_options() - build_tables_program = osp.join( - get_srcdir(), "mathics_scanner", "generate", "operators.py" - ) - print(f"Building JSON tables via {build_tables_program}") - result = subprocess.run([sys.executable, build_tables_program], check=False) - if result.returncode: - raise RuntimeError( - f"Running {build_tables_program} exited with code {result.returncode}" - ) - super().finalize_options() - build_tables_program = osp.join( - get_srcdir(), "mathics_scanner", "generate", "boxing_characters.py" - ) - print(f"Building JSON tables via {build_tables_program}") - result = subprocess.run([sys.executable, build_tables_program], check=False) - if result.returncode: - raise RuntimeError( - f"Running {build_tables_program} exited with code {result.returncode}" - ) - super().finalize_options() setup( - # cmdclass={"egg_info": table_building_egg_info}, + cmdclass={"egg_info": table_building_egg_info}, # don't pack Mathics in egg because of media files, etc. 
zip_safe=False, ) From aa29f8f2c27af5304e527caf033fa9b5c14d5498 Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 19 Feb 2026 13:06:31 -0500 Subject: [PATCH 19/26] Yet another CI try --- .github/workflows/ubuntu.yml | 11 ++++------- setup.py | 31 +++++++++++++------------------ 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 34e873c..431787e 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -20,17 +20,14 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pip install -e . - - name: Install Mathics_Scanner + - name: Install JSON dependencies run: | - make + python -m mathics_scanner.generate.boxing_characters + python -m mathics_scanner.generate.named_characters + python -m mathics_scanner.generate.operators - name: Test Mathics3 Scanner run: | pip install -r requirements-dev.txt pip install -r requirements-full.txt - # Don't think I need this anymore - # python -m mathics_scanner.generate.boxing_characters - # python -m mathics_scanner.generate.named_characters - # python -m mathics_scanner.generate.operators make check diff --git a/setup.py b/setup.py index 9a0a56d..fa3dfee 100644 --- a/setup.py +++ b/setup.py @@ -26,11 +26,9 @@ """ import os.path as osp -import subprocess -import sys from setuptools import setup -from setuptools.command.egg_info import egg_info +from setuptools.command.build_py import build_py as setuptools_build_py def get_srcdir(): @@ -39,25 +37,22 @@ def get_srcdir(): return osp.realpath(filename) -class table_building_egg_info(egg_info): - """This runs as part of building an sdist""" +class build_py(setuptools_build_py): + def run(self): + for table_type in ("boxing-character", "named-character", "operator"): + json_data_file = osp.join("data", f"{table_type}.json") + json_path = osp.join("mathics-scanner", json_data_file) + if not osp.exists(json_path): + 
os.system(f"mathics3-make-{table_type}-json" " -o {json-path}") + self.distribution.package_data["Mathics-Scanner"].append(json_data_file) + setuptools_build_py.run(self) - def finalize_options(self): - """Run program to create JSON tables""" - build_tables_program = osp.join( - get_srcdir(), "mathics_scanner", "generate", "named_characters.py" - ) - print(f"Building JSON tables via {build_tables_program}") - result = subprocess.run([sys.executable, build_tables_program], check=False) - if result.returncode: - raise RuntimeError( - f"Running {build_tables_program} exited with code {result.returncode}" - ) - super().finalize_options() + +CMDCLASS = {"build_py": build_py} setup( - cmdclass={"egg_info": table_building_egg_info}, + cmdclass=CMDCLASS, # don't pack Mathics in egg because of media files, etc. zip_safe=False, ) From 5c212f9bde8823ec7c3a38d6fe8f7ed7bcc2a9cf Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 19 Feb 2026 13:11:10 -0500 Subject: [PATCH 20/26] Another CI try --- .github/workflows/mathics3-doctest.yml | 11 ++++++++--- .github/workflows/ubuntu.yml | 7 ++++--- setup.py | 1 + 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.github/workflows/mathics3-doctest.yml b/.github/workflows/mathics3-doctest.yml index 9cf3d5c..8f40010 100644 --- a/.github/workflows/mathics3-doctest.yml +++ b/.github/workflows/mathics3-doctest.yml @@ -1,4 +1,4 @@ -name: Mathics3_Doctest (Mathics3 doctest) +name: Mathics3 Doctest (Mathics3 doctest) on: push: @@ -21,11 +21,16 @@ jobs: - name: Install OS dependencies run: | sudo apt-get update -qq && sudo apt-get install -qq liblapack-dev llvm-dev tesseract-ocr - - name: Install dependencies + - name: Install Mathics3 scanner without JSON run: | python -m pip install --upgrade pip pip install -e . 
- - name: Install Mathics_Scanner + - name: Install JSON files + run: | + python -m mathics_scanner.generate.boxing_characters + python -m mathics_scanner.generate.named_characters + python -m mathics_scanner.generate.operators + - name: Install Mathics Scanner run: | make - name: Run Mathics3 Doctests diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 431787e..205a4f2 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -1,4 +1,4 @@ -name: Mathics_Script (ubuntu) +name: Mathics3 Scanner (ubuntu) on: push: @@ -18,10 +18,11 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies + - name: Install Mathics3 scanner without JSON fiels run: | + python -m pip install --upgrade pip pip install -e . - - name: Install JSON dependencies + - name: Install JSON files run: | python -m mathics_scanner.generate.boxing_characters python -m mathics_scanner.generate.named_characters diff --git a/setup.py b/setup.py index fa3dfee..46561ad 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,7 @@ mathics-users@googlegroups.com and ask for help. """ +import os import os.path as osp from setuptools import setup From ae5cdf9d0d9a9f1aa3e3ea9e1efa195dce150041 Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 19 Feb 2026 13:17:02 -0500 Subject: [PATCH 21/26] Another CI try --- .github/workflows/osx.yml | 11 ++++------- .github/workflows/ubuntu.yml | 2 +- .github/workflows/windows.yml | 15 +++++++++------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index d2c29f8..1d666dd 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -1,4 +1,4 @@ -name: Mathics_Script (OSX) +name: Mathics3 Scanner (OSX) on: push: @@ -23,15 +23,12 @@ jobs: run: | python -m pip install --upgrade pip pip install -e . 
- - name: Install Mathics Scanner + - name: Install Mathics3 scanner without JSON files run: | - make + python -m pip install --upgrade pip + pip install -e . - name: Test Mathics3 Scanner run: | pip install -r requirements-dev.txt pip install -r requirements-full.txt - # I don't think I need this anymore: - # python -m mathics_scanner.generate.boxing_characters - # python -m mathics_scanner.generate.named_characters - # python -m mathics_scanner.generate.operators make check diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 205a4f2..f9470e1 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -18,7 +18,7 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install Mathics3 scanner without JSON fiels + - name: Install Mathics3 scanner without JSON files run: | python -m pip install --upgrade pip pip install -e . diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 38d8f5a..00706c7 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -1,4 +1,4 @@ -name: Mathics (Windows) +name: Mathics3 Scanner (Windows) on: push: @@ -23,14 +23,17 @@ jobs: run: | python -m pip install --upgrade pip pip install -e . - - name: Install Mathics Scanner + - name: Install Mathics3 scanner without JSON files run: | + python -m pip install --upgrade pip pip install -e . - - name: Test Mathics3 + - name: Install JSON files run: | - # This seems to be needed on Windows. 
python -m mathics_scanner.generate.boxing_characters python -m mathics_scanner.generate.named_characters python -m mathics_scanner.generate.operators - pip install -e .[dev,full] - py.test test + - name: Test Mathics3 Scanner + run: | + pip install -r requirements-dev.txt + pip install -r requirements-full.txt + make check From 0d43e4609894b543826f6190629481c9f08a79b5 Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 19 Feb 2026 13:22:16 -0500 Subject: [PATCH 22/26] Another CI attempt --- .github/workflows/mathics3-doctest.yml | 5 ++--- .github/workflows/osx.yml | 5 +++++ .github/workflows/pyodide.yml | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/mathics3-doctest.yml b/.github/workflows/mathics3-doctest.yml index 8f40010..44092c4 100644 --- a/.github/workflows/mathics3-doctest.yml +++ b/.github/workflows/mathics3-doctest.yml @@ -32,11 +32,10 @@ jobs: python -m mathics_scanner.generate.operators - name: Install Mathics Scanner run: | - make + pip install -r requirements-dev.txt + pip install -r requirements-full.txt - name: Run Mathics3 Doctests run: | - git clone --depth 1 https://github.com/Mathics3/mathics-scanner.git - (cd mathics-scanner && pip install -e .) # Until next Mathics3/mathics-core release is out... git clone -b add-unicode-box-characters --depth 1 https://github.com/Mathics3/mathics-core.git cd mathics-core/ diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 1d666dd..c51369a 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -27,6 +27,11 @@ jobs: run: | python -m pip install --upgrade pip pip install -e . 
+ - name: Install JSON files + run: | + python -m mathics_scanner.generate.boxing_characters + python -m mathics_scanner.generate.named_characters + python -m mathics_scanner.generate.operators - name: Test Mathics3 Scanner run: | pip install -r requirements-dev.txt diff --git a/.github/workflows/pyodide.yml b/.github/workflows/pyodide.yml index f1adb0a..0a5f6e4 100644 --- a/.github/workflows/pyodide.yml +++ b/.github/workflows/pyodide.yml @@ -43,7 +43,7 @@ jobs: with: node-version: ${{ env.NODE_VERSION }} - - name: Set up Pyodide virtual environment and run tests + - name: Set up Pyodide virtual environment run: | # Set up Pyodide virtual environment pyodide xbuildenv install ${{ env.PYODIDE_VERSION }} From ed2e7a822ea115d1f9102f9804101c4247b88dbd Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 19 Feb 2026 13:36:02 -0500 Subject: [PATCH 23/26] One more CI attempt --- .github/workflows/mathics3-doctest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mathics3-doctest.yml b/.github/workflows/mathics3-doctest.yml index 44092c4..9798376 100644 --- a/.github/workflows/mathics3-doctest.yml +++ b/.github/workflows/mathics3-doctest.yml @@ -40,6 +40,6 @@ jobs: git clone -b add-unicode-box-characters --depth 1 https://github.com/Mathics3/mathics-core.git cd mathics-core/ make PIP_INSTALL_OPTS='[full]' - # pip install Mathics3[full] + bash ./admin-tools/make-JSON-tables.sh cd .. 
MATHICS_CHARACTER_ENCODING="ASCII" make check-mathics From 733fc1bb6d2bf3c74d8206d73c8c03d73fc8a7d9 Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 19 Feb 2026 17:43:55 -0500 Subject: [PATCH 24/26] Try to get doctest working --- .github/workflows/mathics3-doctest.yml | 27 ++++++++++++++------------ Makefile | 2 +- mathics_scanner/tokeniser.py | 2 +- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/.github/workflows/mathics3-doctest.yml b/.github/workflows/mathics3-doctest.yml index 9798376..654aadd 100644 --- a/.github/workflows/mathics3-doctest.yml +++ b/.github/workflows/mathics3-doctest.yml @@ -20,26 +20,29 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install OS dependencies run: | - sudo apt-get update -qq && sudo apt-get install -qq liblapack-dev llvm-dev tesseract-ocr + sudo apt-get update -qq && sudo apt-get install -qq liblapack-dev llvm-dev tesseract-ocr remake - name: Install Mathics3 scanner without JSON run: | python -m pip install --upgrade pip pip install -e . - name: Install JSON files run: | - python -m mathics_scanner.generate.boxing_characters - python -m mathics_scanner.generate.named_characters - python -m mathics_scanner.generate.operators - - name: Install Mathics Scanner - run: | - pip install -r requirements-dev.txt - pip install -r requirements-full.txt - - name: Run Mathics3 Doctests + python -m mathics_scanner.generate.boxing_characters -o mathics_scanner/data/boxing-characters.json + ls -l mathics_scanner/data/boxing-characters.json + python -m mathics_scanner.generate.named_characters -o mathics_scanner/data/named-characters.json + ls -l mathics_scanner/data/named-characters.json + python -m mathics_scanner.generate.operators -o mathics_scanner/data/operators.json + ls -l mathics_scanner/data/operators.json + - name: Build Mathics3 run: | # Until next Mathics3/mathics-core release is out...
git clone -b add-unicode-box-characters --depth 1 https://github.com/Mathics3/mathics-core.git cd mathics-core/ - make PIP_INSTALL_OPTS='[full]' - bash ./admin-tools/make-JSON-tables.sh + python -m pip install -e .[dev] + cp -v ../mathics_scanner/data/boxing-characters.json mathics/data/boxing-characters.json + cp -v ../mathics_scanner/data/named-characters.json mathics/data/named-characters.json + cp -v ../mathics_scanner/data/operators.json mathics/data/operators.json cd .. - MATHICS_CHARACTER_ENCODING="ASCII" make check-mathics + - name: Run Mathics3 tests + run: | + remake -x check-mathics diff --git a/Makefile b/Makefile index 3de0759..3af4999 100644 --- a/Makefile +++ b/Makefile @@ -78,7 +78,7 @@ inputrc-unicode: $(PYTHON) -m mathics_scanner.generate.rl_inputrc inputrc-unicode #: Run Mathics core checks -check-mathics:: +check-mathics: MATHICS_CHARACTER_ENCODING="ASCII" $(PYTHON) -m mathics.docpipeline $o pytest test/test_mathics_precedence.py diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 04a8fee..ec2a871 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -959,7 +959,7 @@ def t_String(self, _: Optional[re.Match]) -> Token: # If there is boxing construct matched, we # preserve what was given, but do not tokenize # the construct. "\(" remains "\(" and is not - # turned into IntepretBox". + # turned into InterpretBox". result += "\\" + escaped_char self.pos += 1 else: From f2430d8b96a84278454015126b1d5c87fc523a25 Mon Sep 17 00:00:00 2001 From: rocky Date: Fri, 20 Feb 2026 08:22:01 -0500 Subject: [PATCH 25/26] Correct YAML encoding for \` and ... we need to track whether we are in a *String*, not whether we are in a box.
--- mathics_scanner/data/boxing-characters.yml | 2 +- mathics_scanner/escape_sequences.py | 4 ++-- mathics_scanner/tokeniser.py | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mathics_scanner/data/boxing-characters.yml b/mathics_scanner/data/boxing-characters.yml index eb45ea8..5e38d35 100644 --- a/mathics_scanner/data/boxing-characters.yml +++ b/mathics_scanner/data/boxing-characters.yml @@ -68,7 +68,7 @@ LinearSyntaxCloseParen: LinearSyntaxOpenParen: ASCII: '\(' Operators: [RowBox] - Unicode: "\uf7cd" + Unicode: "\uf7c9" LinearSyntaxPercent: ASCII: '\%' diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index f448e71..41e198b 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -80,7 +80,7 @@ def parse_named_character(source_text: str, start: int, finish: int) -> Optional def parse_escape_sequence( - source_text: str, pos: int, is_inside_box: bool + source_text: str, pos: int, is_in_string: bool ) -> Tuple[str, int]: """Given some source text in `source_text` starting at offset `pos`, return the escape-sequence value for this text and the @@ -155,7 +155,7 @@ def parse_escape_sequence( assert c == "r" result += "\r" pos += 1 - elif is_inside_box and c in BOX_OPERATOR: + elif is_in_string and c in BOX_OPERATOR: if (boxed_character := BOXING_ASCII_TO_UNICODE.get("\\" + c)) is not None: # Replace \ in result with Unicode representing the two ASCII characters. 
result = result[:-1] + boxed_character diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index ec2a871..a435da4 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -702,7 +702,7 @@ def next(self) -> Token: try: escape_str, next_pos = parse_escape_sequence( - self.source_text, self.pos + 1, self.is_inside_box + self.source_text, self.pos + 1, is_in_string=False ) except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: if self.is_inside_box: @@ -810,7 +810,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: try: escape_str, self.pos = parse_escape_sequence( - source_text, start_pos, self.is_inside_box + source_text, start_pos, is_in_string=False ) if source_text[start_pos] == "[" and source_text[self.pos - 1] == "]": named_character = source_text[start_pos + 1 : self.pos - 1] @@ -879,7 +879,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: try: escape_str, next_pos = parse_escape_sequence( - self.source_text, self.pos + 1, self.is_inside_box + self.source_text, self.pos + 1, is_in_string=False ) except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: if self.is_inside_box: @@ -943,7 +943,7 @@ def t_String(self, _: Optional[re.Match]) -> Token: self.pos += 1 try: escape_str, self.pos = parse_escape_sequence( - source_text, self.pos, self.is_inside_box + source_text, self.pos, is_in_string=True ) except NamedCharacterSyntaxError as escape_error: self.feeder.message( From 99b5ab61839364e08959890474b57e92fc18e300 Mon Sep 17 00:00:00 2001 From: rocky Date: Fri, 20 Feb 2026 09:07:23 -0500 Subject: [PATCH 26/26] is_inside_box -> is_in_string --- test/test_escape_sequences.py | 6 +++--- test/test_string_tokens.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py index 5143a3a..f963108 100644 --- a/test/test_escape_sequences.py +++ b/test/test_escape_sequences.py @@ 
-45,7 +45,7 @@ def test_escape_sequences(): (r"z \[Conjugate]", 3, 14, "\uf3c8", "Named character; at end"), ("[Integral]", 0, 10, "\u222b", "Another full-string named-character"), ): - assert parse_escape_sequence(text, pos, is_inside_box=False) == ( + assert parse_escape_sequence(text, pos, is_in_string=False) == ( expect_str, expect_pos, ), fail_msg @@ -54,7 +54,7 @@ def test_escape_sequences(): def test_invalid_named_character_sequences(): for text in (r"\[", r"\[Theta", r"\[Fake]", r"\[abc]"): with pytest.raises(NamedCharacterSyntaxError): - parse_escape_sequence(text, 1, is_inside_box=False) + parse_escape_sequence(text, 1, is_in_string=False) def test_invalid_number_encoding(): @@ -78,4 +78,4 @@ def test_invalid_number_encoding(): ":01-2", ): with pytest.raises(SyntaxError): - parse_escape_sequence(text, 0, is_inside_box=False) + parse_escape_sequence(text, 0, is_in_string=False) diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index 61cdc76..180c38b 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -96,7 +96,7 @@ def test_string(): check_string( r'"\(a \+\)"', - r'"\(a \+\)"', + r'"a \+"', "Do not interpret, but preserve boxing inside a string", )