diff --git a/.github/workflows/mathics.yml b/.github/workflows/mathics.yml deleted file mode 100644 index c431390..0000000 --- a/.github/workflows/mathics.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: Mathics_Script (Mathics doctest) - -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] - -jobs: - build: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ['3.13'] - steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Install OS dependencies - run: | - sudo apt-get update -qq && sudo apt-get install -qq liblapack-dev llvm-dev tesseract-ocr - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -e . - - name: Install Mathics_Scanner - run: | - make - - name: Test Mathics3 - run: | - git clone --depth 1 https://github.com/Mathics3/mathics-scanner.git - (cd mathics-scanner && pip install -e .) - # Until next Mathics3/mathics-core release is out... - git clone --depth 1 https://github.com/Mathics3/mathics-core.git - cd mathics-core/ - make PIP_INSTALL_OPTS='[full]' - # pip install Mathics3[full] - cd .. - MATHICS_CHARACTER_ENCODING="ASCII" make check-mathics diff --git a/.github/workflows/mathics3-doctest.yml b/.github/workflows/mathics3-doctest.yml new file mode 100644 index 0000000..654aadd --- /dev/null +++ b/.github/workflows/mathics3-doctest.yml @@ -0,0 +1,48 @@ +name: Mathics3 Doctest (Mathics3 doctest) + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.13'] + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install OS dependencies + run: | + sudo apt-get update -qq && sudo apt-get install -qq liblapack-dev llvm-dev tesseract-ocr remake + - name: Install Mathics3 scanner without JSON + run: | + python -m pip install --upgrade pip + pip install -e . + - name: Install JSON files + run: | + python -m mathics_scanner.generate.boxing_characters -o mathics_scanner/data/boxing-characters.json + ls -l mathics_scanner/data/boxing-characters.json + python -m mathics_scanner.generate.named_characters -o mathics_scanner/data/named-characters.json + ls -l mathics_scanner/data/named-characters.json + python -m mathics_scanner.generate.operators -o mathics_scanner/data/operators.json + ls -l mathics_scanner/data/operators.json + - name: Build Mathics3 + run: | + # Until next Mathics3/mathics-core release is out... + git clone -b add-unicode-box-characters --depth 1 https://github.com/Mathics3/mathics-core.git + cd mathics-core/ + python -m pip install -e .[dev] + cp -v ../mathics_scanner/data/boxing-characters.json mathics/data/boxing-characters.json + cp -v ../mathics_scanner/data/named-characters.json mathics/data/named-characters.json + cp -v ../mathics_scanner/data/operators.json mathics/data/operators.json + cd .. + - name: Run Mathics3 tests + run: | + remake -x check-mathics diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index c740b23..c51369a 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -1,4 +1,4 @@ -name: Mathics_Script (OSX) +name: Mathics3 Scanner (OSX) on: push: @@ -23,12 +23,17 @@ jobs: run: | python -m pip install --upgrade pip pip install -e . 
- - name: Install Mathics Scanner + - name: Install Mathics3 scanner without JSON files run: | - make + python -m pip install --upgrade pip + pip install -e . + - name: Install JSON files + run: | + python -m mathics_scanner.generate.boxing_characters + python -m mathics_scanner.generate.named_characters + python -m mathics_scanner.generate.operators - name: Test Mathics3 Scanner run: | pip install -r requirements-dev.txt pip install -r requirements-full.txt - python -m mathics_scanner.generate.build_tables make check diff --git a/.github/workflows/pyodide.yml b/.github/workflows/pyodide.yml index f1adb0a..0a5f6e4 100644 --- a/.github/workflows/pyodide.yml +++ b/.github/workflows/pyodide.yml @@ -43,7 +43,7 @@ jobs: with: node-version: ${{ env.NODE_VERSION }} - - name: Set up Pyodide virtual environment and run tests + - name: Set up Pyodide virtual environment run: | # Set up Pyodide virtual environment pyodide xbuildenv install ${{ env.PYODIDE_VERSION }} diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 05d8de2..f9470e1 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -1,4 +1,4 @@ -name: Mathics_Script (ubuntu) +name: Mathics3 Scanner (ubuntu) on: push: @@ -18,16 +18,17 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies + - name: Install Mathics3 scanner without JSON files run: | python -m pip install --upgrade pip pip install -e . - - name: Install Mathics_Scanner + - name: Install JSON files run: | - make + python -m mathics_scanner.generate.boxing_characters + python -m mathics_scanner.generate.named_characters + python -m mathics_scanner.generate.operators - name: Test Mathics3 Scanner run: | pip install -r requirements-dev.txt pip install -r requirements-full.txt - python -m mathics_scanner.generate.build_tables make check diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 768802b..00706c7 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -1,4 +1,4 @@ -name: Mathics (Windows) +name: Mathics3 Scanner (Windows) on: push: @@ -23,13 +23,17 @@ jobs: run: | python -m pip install --upgrade pip pip install -e . - - name: Install Mathics Scanner + - name: Install Mathics3 scanner without JSON files run: | + python -m pip install --upgrade pip pip install -e . - - name: Test Mathics3 + - name: Install JSON files + run: | + python -m mathics_scanner.generate.boxing_characters + python -m mathics_scanner.generate.named_characters + python -m mathics_scanner.generate.operators + - name: Test Mathics3 Scanner run: | - # Ideally we should not have to do this. 
- python mathics_scanner/generate/build_tables.py - python mathics_scanner/generate/build_operator_tables.py - pip install -e .[dev,full] - py.test test + pip install -r requirements-dev.txt + pip install -r requirements-full.txt + make check diff --git a/Makefile b/Makefile index b374254..3af4999 100644 --- a/Makefile +++ b/Makefile @@ -22,21 +22,22 @@ PIP_INSTALL_OPTS ?= #: Default target - same as "develop" all: develop -mathics_scanner/data/character-tables.json: mathics_scanner/data/named-characters.yml - $(PIP) install -r requirements-dev.txt - $(PYTHON) mathics_scanner/generate/build_tables.py +mathics_scanner/data/boxing-characters.json: mathics_scanner/data/boxing-characters.yml + $(PYTHON) mathics_scanner/generate/boxing_characters.py + +mathics_scanner/data/named-characters.json: mathics_scanner/data/named-characters.yml + $(PYTHON) mathics_scanner/generate/named_characters.py mathics_scanner/data/operators.json: mathics_scanner/data/operators.yml - $(PIP) install -r requirements-dev.txt - $(PYTHON) mathics_scanner/generate/build_operator_tables.py + $(PYTHON) mathics_scanner/generate/operators.py #: build everything needed to install -build: mathics_scanner/data/characters.json mathics_scanner/data/operators.json +build: mathics_scanner/data/boxing-characters.json mathics_scanner/data/named-characters.json mathics_scanner/data/operators.json $(PYTHON) ./setup.py build #: Set up to run from the source tree -develop: mathics_scanner/data/character-tables.json mathics_scanner/data/operators.json - $(PIP) install -e .$(PIP_INSTALL_OPTS) +develop: mathics_scanner/data/boxing-characters.json mathics_scanner/data/named-characters.json mathics_scanner/data/operators.json + $(PIP) install --no-build-isolation -e . $(PIP_INSTALL_OPTS) #: Build distribution dist: admin-tools/make-dist.sh @@ -56,16 +57,16 @@ check: pytest test: check #: Build Sphinx HTML documentation -doc: mathics_scanner/data/character-tables.json +doc: mathics_scanner/data/named-characters.json make -C docs html #: Remove derived files clean: @find . -name *.pyc -type f -delete; \ - $(RM) -f mathics_scanner/data/character-tables.json mathics_scanner/data/operators.json || true + $(RM) -f mathics_scanner/data/*.json || true #: Run py.test tests. Use environment variable "o" for pytest options -pytest: mathics_scanner/data/character-tables.json +pytest: mathics_scanner/data/named-characters.json $(PYTHON) -m pytest test $o #: Print to stdout a GNU Readline inputrc without Unicode @@ -77,7 +78,7 @@ inputrc-unicode: $(PYTHON) -m mathics_scanner.generate.rl_inputrc inputrc-unicode #: Run Mathics core checks -check-mathics:: +check-mathics: MATHICS_CHARACTER_ENCODING="ASCII" $(PYTHON) -m mathics.docpipeline $o pytest test/test_mathics_precedence.py diff --git a/admin-tools/make-JSON-tables.sh b/admin-tools/make-JSON-tables.sh index b020579..9307397 100755 --- a/admin-tools/make-JSON-tables.sh +++ b/admin-tools/make-JSON-tables.sh @@ -1,10 +1,10 @@ #!/bin/bash -# Create a complete set of tables. -# This just runs build_tables.py in this distribution +# Create a complete set of JSON tables. 
bs=${BASH_SOURCE[0]} mydir=$(dirname $bs) PYTHON=${PYTHON:-python} cd $mydir/../mathics_scanner/data -$PYTHON ../generate/build_tables.py -o character-tables.json -$PYTHON ../generate/build_operator_tables.py -o operators.json +$PYTHON ../generate/boxing_characters.py -o boxing-characters.json +$PYTHON ../generate/named_characters.py -o named-characters.json +$PYTHON ../generate/operators.py -o operators.json diff --git a/mathics_scanner/__init__.py b/mathics_scanner/__init__.py index ecfec27..bbd49f8 100644 --- a/mathics_scanner/__init__.py +++ b/mathics_scanner/__init__.py @@ -7,8 +7,8 @@ """ from mathics_scanner.characters import ( + NAMED_CHARACTERS, aliased_characters, - named_characters, replace_unicode_with_wl, replace_wl_with_plain_text, ) @@ -34,6 +34,7 @@ "InvalidSyntaxError", "LineFeeder", "MultiLineFeeder", + "NAMED_CHARACTERS", "SyntaxError", "SingleLineFeeder", # "Token", @@ -41,7 +42,6 @@ "__version__", "aliased_characters", # "is_symbol_name", - "named_characters", "replace_unicode_with_wl", "replace_wl_with_plain_text", ] diff --git a/mathics_scanner/characters.py b/mathics_scanner/characters.py index 057dc9c..e62b500 100644 --- a/mathics_scanner/characters.py +++ b/mathics_scanner/characters.py @@ -1,13 +1,16 @@ # -*- coding: utf-8 -*- -""" -The ``mathics_scanner.characters`` module consists mostly of translation tables -between Wolfram's internal representation of `named characters +"""This module consists mostly of translation tables between Wolfram's +internal representation of `named characters `_ and Unicode/ASCII. + +It also contains Unicode translation tables for the syntax used in +Boxing operators and Boxing expressions. """ import os.path as osp import re +from typing import Dict, Final try: import ujson @@ -16,22 +19,56 @@ def get_srcdir() -> str: - filename = osp.normcase(osp.dirname(osp.abspath(__file__))) - return osp.realpath(filename) + """Return the OS normalized real directory path for where this + code currently resides on disk.""" + directory_path = osp.normcase(osp.dirname(osp.abspath(__file__))) + return osp.realpath(directory_path) + +ROOT_DIR: Final[str] = get_srcdir() -ROOT_DIR = get_srcdir() # Load the conversion tables from disk -characters_path = osp.join(ROOT_DIR, "data", "character-tables.json") -if osp.exists(characters_path): - with open(characters_path, "r") as f: - _data = ujson.load(f) + +NAMED_CHARACTERS_PATH: Final[str] = osp.join(ROOT_DIR, "data", "named-characters.json") +if osp.exists(NAMED_CHARACTERS_PATH): + with open(NAMED_CHARACTERS_PATH, "r") as f: + NAMED_CHARACTERS_COLLECTION = ujson.load(f) +else: + NAMED_CHARACTERS_COLLECTION = {} + +BOXING_CHARACTERS_PATH: Final[str] = osp.join( + ROOT_DIR, "data", "boxing-characters.json" +) + +if osp.exists(BOXING_CHARACTERS_PATH): + with open(BOXING_CHARACTERS_PATH, "r") as f: + boxing_character_data = ujson.load(f) else: - _data = {} + boxing_character_data = {} + +BOXING_UNICODE_TO_ASCII: Final[Dict[str, str]] = boxing_character_data.get( + "unicode-to-ascii", {} +) +BOXING_ASCII_TO_UNICODE: Final[Dict[str, str]] = boxing_character_data.get( + "ascii-to-unicode", {} +) + +replace_to_ascii_re = re.compile( + "|".join( + re.escape(unicode_character) + for unicode_character in BOXING_UNICODE_TO_ASCII.keys() + ) +) + + +def replace_box_unicode_with_ascii(input_string): + return "".join(BOXING_UNICODE_TO_ASCII.get(char, char) for char in input_string) + # Character ranges of letters -_letters = "a-zA-Z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u0103\u0106\u0107\ +_letters: Final[str] = ( + 
"a-zA-Z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u0103\u0106\u0107\ \u010c-\u010f\u0112-\u0115\u011a-\u012d\u0131\u0141\u0142\u0147\u0148\ \u0150-\u0153\u0158-\u0161\u0164\u0165\u016e-\u0171\u017d\u017e\ \u0391-\u03a1\u03a3-\u03a9\u03b1-\u03c9\u03d1\u03d2\u03d5\u03d6\ @@ -40,37 +77,44 @@ def get_srcdir() -> str: \uf6ba-\uf6bc\uf6be\uf6bf\uf6c1-\uf700\uf730\uf731\uf770\uf772\uf773\ \uf776\uf779\uf77a\uf77d-\uf780\uf782-\uf78b\uf78d-\uf78f\uf790\ \uf793-\uf79a\uf79c-\uf7a2\uf7a4-\uf7bd\uf800-\uf833\ufb01\ufb02" +) # Character ranges of letterlikes -_letterlikes = _data.get("letterlikes", {}) +_letterlikes: Final[Dict[str, str]] = NAMED_CHARACTERS_COLLECTION.get("letterlikes", {}) # Conversion from WL to the fully qualified names -_wl_to_ascii = _data.get("wl-to-ascii-dict", {}) -_wl_to_ascii_re = re.compile(_data.get("wl-to-ascii-re", "")) +_wl_to_ascii: Final[Dict[str, str]] = NAMED_CHARACTERS_COLLECTION.get( + "wl-to-ascii-dict", {} +) +_wl_to_ascii_re = re.compile(NAMED_CHARACTERS_COLLECTION.get("wl-to-ascii-re", "")) # AMS LaTeX replacements -_wl_to_amstex = _data.get("wl-to-amstex", None) +_wl_to_amstex = NAMED_CHARACTERS_COLLECTION.get("wl-to-amstex", None) -# Conversion from WL to unicode -_wl_to_unicode = _data.get("wl-to-unicode-dict", _data.get("wl_to_ascii")) -_wl_to_unicode_re = re.compile(_data.get("wl-to-unicode-re", "")) +# Conversion from WL to Unicode +_wl_to_unicode = NAMED_CHARACTERS_COLLECTION.get( + "wl-to-unicode-dict", NAMED_CHARACTERS_COLLECTION.get("wl_to_ascii") +) +_wl_to_unicode_re = re.compile(NAMED_CHARACTERS_COLLECTION.get("wl-to-unicode-re", "")) -# Conversion from unicode to WL -_unicode_to_wl = _data.get("unicode-to-wl-dict", {}) -_unicode_to_wl_re = re.compile(_data.get("unicode-to-wl-re", "")) +# Conversion from Unicode to WL +_unicode_to_wl = NAMED_CHARACTERS_COLLECTION.get("unicode-to-wl-dict", {}) +_unicode_to_wl_re = re.compile(NAMED_CHARACTERS_COLLECTION.get("unicode-to-wl-re", "")) # All supported named characters -named_characters = _data.get("named-characters", {}) +NAMED_CHARACTERS: Final[Dict[str, str]] = NAMED_CHARACTERS_COLLECTION.get( + "named-characters", {} +) # ESC sequence aliases -aliased_characters = _data.get("aliased-characters", {}) +aliased_characters = NAMED_CHARACTERS_COLLECTION.get("aliased-characters", {}) # Deprecated def replace_wl_with_plain_text(wl_input: str, use_unicode=True) -> str: """ The Wolfram Language uses specific Unicode characters to represent Wolfram - Language named characters. This functions replaces all occurrences of such + Language named characters. This function replaces all occurrences of such characters with their corresponding Unicode/ASCII equivalents. :param wl_input: The string whose characters will be replaced. @@ -78,8 +122,8 @@ def replace_wl_with_plain_text(wl_input: str, use_unicode=True) -> str: for the conversion. Note that the occurrences of named characters in ``wl_input`` are expect to - be represented by Wolfram's internal scheme. For more information Wolfram's - representation scheme and on our own conversion scheme please see `Listing + be represented by Wolfram's internal scheme. For more information on Wolfram's + representation scheme and on our own conversion scheme, please see `Listing of Named Characters `_ and ``implementation.rst`` respectively. 
@@ -87,7 +131,7 @@ def replace_wl_with_plain_text(wl_input: str, use_unicode=True) -> str: r = _wl_to_unicode_re if use_unicode else _wl_to_ascii_re d = _wl_to_unicode if use_unicode else _wl_to_ascii - # The below on when use_unicode is False will sometime test on "ascii" twice. + # The below, when use_unicode is False, will sometimes test on "ascii" twice. # But this routine should be deprecated. return r.sub(lambda m: d.get(m.group(0), _wl_to_ascii.get(m.group(0))), wl_input) @@ -96,7 +140,7 @@ def replace_wl_with_plain_text(wl_input: str, use_unicode=True) -> str: def replace_unicode_with_wl(unicode_input: str) -> str: """ The Wolfram Language uses specific Unicode characters to represent Wolfram - Language named characters. This functions replaces all occurrences of the + Language named characters. This function replaces all occurrences of the corresponding Unicode equivalents of such characters with the characters themselves. @@ -104,8 +148,8 @@ def replace_unicode_with_wl(unicode_input: str) -> str: Note that the occurrences of named characters in the output of ``replace_unicode_with_wl`` are represented using Wolfram's internal - scheme. For more information Wolfram's representation scheme and on our own - conversion scheme please see `Listing of Named Characters + scheme. For more information on Wolfram's representation scheme and on our own + conversion scheme, please see `Listing of Named Characters `_ and ``implementation.rst`` respectively. """ diff --git a/mathics_scanner/data/.gitignore b/mathics_scanner/data/.gitignore index 183700b..2442410 100644 --- a/mathics_scanner/data/.gitignore +++ b/mathics_scanner/data/.gitignore @@ -1 +1,4 @@ /.python-version +/box-character-tables.json +/boxing-characters.json +/named-characters.json diff --git a/mathics_scanner/data/boxing-characters.yml b/mathics_scanner/data/boxing-characters.yml new file mode 100644 index 0000000..5e38d35 --- /dev/null +++ b/mathics_scanner/data/boxing-characters.yml @@ -0,0 +1,96 @@ +# Information about Wolfram Language boxing characters used in the +# string representation of Boxing Expressions. +# +# +# All of the key names *except* \! and \* are associated with +# some box operator. +# +# +# Fields +# ====== +# +# +# ASCII (string) +# -------------- +# +# The character representation in ASCII. +# +# Operators +# ----------- +# +# When the string is part of a Boxing operator, the Boxing +# operator name(s) are given. +# +# Unicode +# ------- +# +# The representation in Unicode. All Unicode characters fall into the +# Private Use Area of Unicode that Wolfram uses for its own internal +# system markers, specifically the range 0xf7c0 to 0xf7cd. +# + +# Coding note: make note of quotes. Single quotes for unescaped backslashes, e.g. +# in the ASCII field and double quotes when we do not escaped backlashes in the +# Unicode field. + +LinearSyntaxAmp: + ASCII: '\&' + Operators: [] + Unicode: "\uf7c7" + +LinearSyntaxAt: + ASCII: '\@' + Operators: [RadicalBox, SqrtBox] + Unicode: "\uf7c1" + +LinearSyntaxBacktick: + ASCII: '\`' + Operators: [FormBox] + Unicode: "\uf7cd" + +LinearSyntaxBang: + ASCII: '\!' 
+ Operators: + Unicode: "\uf7c1" + +LinearSyntaxCaret: + ASCII: '\^' + Operators: SuperscriptBox + Unicode: "\uf7c6" + +# Note: this name does not appear in CodeParse +LinearSyntaxCloseParen: + ASCII: '\)' + Operators: [RowBox] + Unicode: "\uf7c0" + +# Note: this name does not appear in CodeParse +LinearSyntaxOpenParen: + ASCII: '\(' + Operators: [RowBox] + Unicode: "\uf7c9" + +LinearSyntaxPercent: + ASCII: '\%' + Operators: [RadicalBox, SuperscriptBox, UnderOverscriptBox] + Unicode: "\uf7c5" + +LinearSyntaxPlus: + ASCII: '\+' + Operators: [UnderscriptBox, UnderOverscriptBox] + Unicode: "\uf7cb" + +LinearSyntaxStar: + ASCII: '\*' + Operators: [] + Unicode: "\uf7c8" + +LinearSyntaxSlash: + ASCII: '\/' + Operators: [FractionBox] + Unicode: "\uf7cc" + +LinearSyntaxUnder: + ASCII: '\_' + Operators: [SubscriptBox, SubsuperscriptBox] + Unicode: "\uf7ca" diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index 27d8ad2..41e198b 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -2,22 +2,33 @@ Helper Module for tokenizing character escape sequences. """ -from typing import Optional, Tuple +from typing import Final, Optional, Tuple -from mathics_scanner.characters import named_characters +from mathics_scanner.characters import BOXING_ASCII_TO_UNICODE, NAMED_CHARACTERS from mathics_scanner.errors import ( EscapeSyntaxError, NamedCharacterSyntaxError, SyntaxError, ) +# The second character, or character after backslash ("\") using +# Boxing expression syntax. +BOX_OPERATOR: Final[str] = "&@`!^)(%*/_" + +# The second character, or character after backslash ("\") that +# are valid in a Mathics3 escaped character. +ESCAPE_CODES: Final[str] = 'ntbfr" $\n' + +# Valid digits in an Octal string +OCTAL_DIGITS: Final[str] = "01234567" + def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> str: r""" See if characters start_shift .. end shift can be converted to an integer in base ``base``. - If so, chr(integer value converted from base) is returnd. + If so, chr(integer value converted from base) is returned. However, if the conversion fails, SyntaxError is raised. """ @@ -47,28 +58,30 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> def parse_named_character(source_text: str, start: int, finish: int) -> Optional[str]: r""" - Find the unicode-equivalent symbol for a string named character. + Find the Unicode equivalent symbol for a string named character. - Before calling we have matched the text between "\[" and "]" of the input. + Before calling, we have matched the text between "\[" and "]" of the input. The name character is thus in source_text[start:finish]. Match this string with the known named characters, - e.g. "Theta". If we can match this, then we return the unicode equivalent from the - `named_characters` map (which is read in from JSON but stored in a YAML file). + e.g., "Theta". If we can match this, then we return the Unicode equivalent from the + `NAMED_CHARACTERS` map (which is read in from JSON but stored in a YAML file). If we can't find the named character, raise NamedCharacterSyntaxError. 
""" named_character = source_text[start:finish] if named_character.isalpha(): - char = named_characters.get(named_character) + char = NAMED_CHARACTERS.get(named_character) if char is None: raise NamedCharacterSyntaxError("sntufn", named_character, source_text) else: return char -def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: +def parse_escape_sequence( + source_text: str, pos: int, is_in_string: bool +) -> Tuple[str, int]: """Given some source text in `source_text` starting at offset `pos`, return the escape-sequence value for this text and the follow-on offset position. @@ -112,19 +125,21 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: result += named_character pos = i + 1 - elif c in "01234567": + elif c in OCTAL_DIGITS: # See if we have a 3-digit octal number. # For example \065 = "5" result += parse_base(source_text, pos, pos + 3, 8) pos += 3 # WMA escape characters \n, \t, \b, \r. - # Note that these are a similer to Python, but are different. + # Note that these are similar to Python, but are different. # In particular, Python defines "\a" to be ^G (control G), # but in WMA, this is invalid. - elif c in "ntbfr $\n": + elif c in ESCAPE_CODES: if c in "n\n": result += "\n" + elif c == '"': + result += '"' elif c == " ": result += " " elif c == "t": @@ -140,6 +155,13 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: assert c == "r" result += "\r" pos += 1 + elif is_in_string and c in BOX_OPERATOR: + if (boxed_character := BOXING_ASCII_TO_UNICODE.get("\\" + c)) is not None: + # Replace \ in result with Unicode representing the two ASCII characters. + result = result[:-1] + boxed_character + else: + raise EscapeSyntaxError("stresc", rf"\{c}") + pos += 1 elif c in '!"': result += c pos += 1 diff --git a/mathics_scanner/generate/boxing_characters.py b/mathics_scanner/generate/boxing_characters.py new file mode 100644 index 0000000..bdf7660 --- /dev/null +++ b/mathics_scanner/generate/boxing_characters.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python +# This scripts reads the data from named-characters and converts it to the +# format used by the library internally + +import json +import os.path as osp +import sys +from pathlib import Path + +import click +import yaml + +from mathics_scanner.version import __version__ + + +def compile_tables(data: dict) -> dict: + """ + Compiles the general table into the tables used internally by the library. + This facilitates fast access of this information by clients needing this + information. + """ + + # Multiple entries in the YAML table are redundant in the following sense: + # when a character has a plain-text equivalent but the plain-text + # equivalent is equal to it's WL unicode representation (i.e. the + # "wl-unicode" field is the same as the "unicode-equivalent" field) then it + # is considered rendundant for us, since no conversion is needed. + # + # As an optimization, we explicit remove any redundant characters from all + # JSON tables. This makes the tables smaller (therefore easier to load), as + # well as the correspond regex patterns. This implies that not all + # characters that have a unicode equivalent are included in `wl_to_ascii` + # or `wl_to_unicode_dict`. 
Furthermore, this implies that not all + # characters that have a unicode inverse are included in + # `unicode_to_wl_dict` + + # WL to AMS LaTeX (math mode) characters + ascii_to_unicode = {v["ASCII"]: v["Unicode"] for v in data.values()} + + unicode_to_ascii = {v["Unicode"]: v["ASCII"] for v in data.values()} + + return { + "ascii-to-unicode": ascii_to_unicode, + "unicode-to-ascii": unicode_to_ascii, + } + + +DEFAULT_DATA_DIR = Path(osp.normpath(osp.dirname(__file__)), "..", "data") + +ALL_FIELDS = [ + "unicode-to-ascii", + "ascii-to-unicode", +] + + +@click.command() +@click.version_option(version=__version__) # NOQA +@click.option( + "--field", + "-f", + multiple=True, + required=False, + help="Select which fields to include in JSON.", + show_default=True, + type=click.Choice(ALL_FIELDS), + default=ALL_FIELDS, +) +@click.option( + "--output", + "-o", + show_default=True, + type=click.Path(writable=True), + default=DEFAULT_DATA_DIR / "boxing-characters.json", +) +@click.argument( + "data_dir", type=click.Path(readable=True), default=DEFAULT_DATA_DIR, required=False +) +def main(field, output, data_dir): + with ( + open(data_dir / "boxing-characters.yml", "r", encoding="utf8") as i, + open(output, "w") as o, + ): + # Load the YAML data. + data = yaml.load(i, Loader=yaml.FullLoader) + + # Precompile the tables. + data = compile_tables(data) + + # Dump the preprocessed dictionaries to disk as JSON. + json.dump(data, o) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/mathics_scanner/generate/build_tables.py b/mathics_scanner/generate/named_characters.py similarity index 96% rename from mathics_scanner/generate/build_tables.py rename to mathics_scanner/generate/named_characters.py index 6d1da59..8cd9a22 100755 --- a/mathics_scanner/generate/build_tables.py +++ b/mathics_scanner/generate/named_characters.py @@ -11,20 +11,7 @@ import click import yaml -try: - from mathics_scanner.version import __version__ -except ImportError: - # When using build isolation - __version__ = "unknown" - - -def get_srcdir() -> str: - filename = osp.normcase(osp.dirname(osp.abspath(__file__))) - return osp.realpath(filename) - - -def read(*rnames) -> str: - return open(osp.join(get_srcdir(), *rnames)).read() +from mathics_scanner import __version__ def re_from_keys(d: dict) -> str: @@ -271,7 +258,7 @@ def compile_tables(data: dict) -> dict: } -DEFAULT_DATA_DIR = Path(osp.normpath(osp.dirname(__file__)), "..", "data") +DEFAULT_DATA_DIR = Path(__file__).parent.parent / "data" ALL_FIELDS = [ "aliased-characters", @@ -321,7 +308,7 @@ def compile_tables(data: dict) -> dict: "-o", show_default=True, type=click.Path(writable=True), - default=DEFAULT_DATA_DIR / "character-tables.json", + default=DEFAULT_DATA_DIR / "named-characters.json", ) @click.argument( "data_dir", type=click.Path(readable=True), default=DEFAULT_DATA_DIR, required=False diff --git a/mathics_scanner/generate/build_operator_tables.py b/mathics_scanner/generate/operators.py similarity index 94% rename from mathics_scanner/generate/build_operator_tables.py rename to mathics_scanner/generate/operators.py index ce5fb96..a204875 100755 --- a/mathics_scanner/generate/build_operator_tables.py +++ b/mathics_scanner/generate/operators.py @@ -12,6 +12,8 @@ import click import yaml +from mathics_scanner.version import __version__ + OPERATOR_FIELDS = [ "actual-precedence", "Precedence", @@ -25,22 +27,6 @@ ] -try: - from mathics_scanner.version import __version__ -except ImportError: - # When using build isolation - __version__ = "unknown" - - -def 
get_srcdir() -> str: - filename = osp.normcase(osp.dirname(osp.abspath(__file__))) - return osp.realpath(filename) - - -def read(*rnames) -> str: - return open(osp.join(get_srcdir(), *rnames)).read() - - def compile_tables( operator_data: Dict[str, dict], character_data: Dict[str, dict] ) -> Dict[str, dict]: diff --git a/mathics_scanner/load.py b/mathics_scanner/load.py index 66ef912..1826e79 100644 --- a/mathics_scanner/load.py +++ b/mathics_scanner/load.py @@ -4,16 +4,16 @@ import yaml -from mathics_scanner.generate.build_tables import DEFAULT_DATA_DIR +from mathics_scanner.generate.named_characters import DEFAULT_DATA_DIR -def load_mathics_character_yaml(): +def load_mathics3_named_characters_yaml(): with open(DEFAULT_DATA_DIR / "named-characters.yml", "r") as yaml_file: yaml_data = yaml.load(yaml_file, Loader=yaml.FullLoader) return yaml_data -def load_mathics_character_json(): - with open(DEFAULT_DATA_DIR / "character-tables.json", "r") as json_file: +def load_mathics3_named_characters_json(): + with open(DEFAULT_DATA_DIR / "named-characters.json", "r") as json_file: json_data = json.load(json_file) return json_data diff --git a/mathics_scanner/mathics3_tokens.py b/mathics_scanner/mathics3_tokens.py index d37335d..aef026d 100644 --- a/mathics_scanner/mathics3_tokens.py +++ b/mathics_scanner/mathics3_tokens.py @@ -8,6 +8,7 @@ import re import sys +from mathics_scanner.characters import replace_box_unicode_with_ascii from mathics_scanner.errors import ( EscapeSyntaxError, NamedCharacterSyntaxError, @@ -99,29 +100,21 @@ def get_last_line_number(self): def get_in_prompt(self): next_line_number = self.get_last_line_number() + 1 - self.lineno = next_line_number - return "{1}{0}[{2}{3}]:= {4}".format(self.in_prefix, *self.incolors) + return "{2}{0}[{3}{1}{4}]:= {5}".format( + self.in_prefix, next_line_number, *self.incolors + ) - def get_out_prompt(self, form=None): + def get_out_prompt(self): line_number = self.get_last_line_number() - if form: - return "{2}{0}[{3}{4}]//{1}= {5}".format( - self.out_prefix, line_number, form, *self.outcolors - ) - return "{1}{0}[{2}{3}]= {4}".format( + return "{2}{0}[{3}{1}{4}]= {5}".format( self.out_prefix, line_number, *self.outcolors ) - def to_output(self, text, form=None): + def to_output(self, text): line_number = self.get_last_line_number() newline = "\n" + " " * len("Out[{0}]= ".format(line_number)) - if form: - newline += (len(form) + 2) * " " return newline.join(text.splitlines()) - def out_callback(self, out, fmt=None): - print(self.to_output(str(out), fmt)) - def read_line(self, prompt): if self.using_readline: return self.rl_read_line(prompt) @@ -163,7 +156,7 @@ def interactive_eval_loop(shell: TerminalShell, code_tokenize_format: bool): while True: try: source_text = shell.feed() - tokens(source_text, code_tokenize_format) + tokens(shell, source_text, code_tokenize_format) except NamedCharacterSyntaxError: shell.errmsg( "Syntax", @@ -197,11 +190,11 @@ def interactive_eval_loop(shell: TerminalShell, code_tokenize_format: bool): print("\n\nGoodbye!\n") # raise to pass the error code on, e.g. 
Quit[1] raise - finally: - shell.reset_lineno() + # finally: + # shell.reset_lineno() -def tokens(source_text: str, code_tokenize_format: bool): +def tokens(shell: TerminalShell, source_text: str, code_tokenize_format: bool): tokeniser = Tokeniser( SingleLineFeeder(source_text, "", ContainerKind.STRING) ) @@ -217,9 +210,14 @@ def tokens(source_text: str, code_tokenize_format: bool): if token.tag == "END": break elif code_tokenize_format: - print(token.code_tokenize_format) + mess = shell.get_out_prompt() + print( + mess + replace_box_unicode_with_ascii(token.code_tokenize_format) + "\n" + ) else: - print(token) + mess = shell.get_out_prompt() + token.text = replace_box_unicode_with_ascii(token.text) + print(mess + str(token) + "\n") def main(): @@ -297,7 +295,7 @@ def main(): if args.FILE is not None: feeder = FileLineFeeder(args.FILE) - tokenizer_loop(feeder, args.CodeTokenize) + tokenizer_loop(feeder, shell, args.CodeTokenize) else: interactive_eval_loop(shell, args.CodeTokenize) diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 8034575..a435da4 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -9,9 +9,9 @@ import os.path as osp import re import string -from typing import Dict, List, Optional, Set, Tuple +from typing import Dict, Final, List, Optional, Set, Tuple -from mathics_scanner.characters import _letterlikes, _letters, named_characters +from mathics_scanner.characters import NAMED_CHARACTERS, _letterlikes, _letters from mathics_scanner.errors import ( EscapeSyntaxError, IncompleteSyntaxError, @@ -30,10 +30,10 @@ ROOT_DIR = osp.dirname(__file__) OPERATORS_TABLE_PATH = osp.join(ROOT_DIR, "data", "operators.json") -############################################## +################################################ # The below get initialized in by init_module() # from operator data -############################################## +################################################ OPERATOR_DATA = {} NO_MEANING_OPERATORS = {} @@ -41,7 +41,7 @@ # This is used in t_String for escape-sequence handling. 
# The below is roughly correct, but we overwrite this # from operators.json data in init_module() -BOXING_CONSTRUCT_SUFFIXES: Set[str] = { +BOXING_CONSTRUCT_SUFFIXES: Final[Set[str]] = { "%", "/", "@", @@ -120,7 +120,7 @@ # FIXME incorportate the below table in to Function/Operators YAML # Table of correspondneces between a Mathics3 token name (or "tag") # and WMA CodeTokenize name -MATHICS3_TAG_TO_CODETOKENIZE: Dict[str, str] = { +MATHICS3_TAG_TO_CODETOKENIZE: Final[Dict[str, str]] = { "AddTo": "PlusEqual", "Alternatives": "Bar", "And": "AmpAmp", @@ -263,59 +263,59 @@ def init_module(): # ("AddTo", r" \+\= "), ("Alternatives", r" \| "), - ("And", rf" (\&\&) | {named_characters['And']} "), + ("And", rf" (\&\&) | {NAMED_CHARACTERS['And']} "), ("Apply", r" \@\@ "), ("ApplyList", r" \@\@\@ "), ("Composition", r" \@\* "), ("Condition", r" \/\; "), - ("Conjugate", rf" {named_characters['Conjugate']} "), + ("Conjugate", rf" {NAMED_CHARACTERS['Conjugate']} "), ("ConjugateTranspose", r" \uf3c9 "), - ("Cross", rf" \uf4a0 | {named_characters['Cross']} "), + ("Cross", rf" \uf4a0 | {NAMED_CHARACTERS['Cross']} "), ("Decrement", r" \-\- "), - ("Del", rf" {named_characters['Del']} "), + ("Del", rf" {NAMED_CHARACTERS['Del']} "), ("Derivative", r" \' "), # ('DifferenceDelta', r' \u2206 '), # https://reference.wolfram.com/language/ref/character/DirectedEdge.html - ("DirectedEdge", rf" -> | \uf3d5 | {named_characters['DirectedEdge']} "), + ("DirectedEdge", rf" -> | \uf3d5 | {NAMED_CHARACTERS['DirectedEdge']} "), # ('DiscreteRatio', r' \uf4a4 '), # ('DiscreteShift', r' \uf4a3 '), - ("Conjugate", rf" {named_characters['Conjugate']} "), + ("Conjugate", rf" {NAMED_CHARACTERS['Conjugate']} "), ("ConjugateTranspose", r" \uf3c9 "), - ("DifferentialD", rf" \uf74c | {named_characters['DifferentialD']} "), - ("Divide", rf" \/| {named_characters['Divide']} "), + ("DifferentialD", rf" \uf74c | {NAMED_CHARACTERS['DifferentialD']} "), + ("Divide", rf" \/| {NAMED_CHARACTERS['Divide']} "), ("DivideBy", r" \/\= "), ("Dot", r" \. "), - ("Element", r" {named_characters['Element']} "), - ("Equal", rf" (\=\=) | \uf431 | {named_characters['Equal']} | \uf7d9 "), - ("Equivalent", r" {named_characters['Equivalent']} "), - ("Exists", r" {named_characters['Exists']} "), + ("Element", r" {NAMED_CHARACTERS['Element']} "), + ("Equal", rf" (\=\=) | \uf431 | {NAMED_CHARACTERS['Equal']} | \uf7d9 "), + ("Equivalent", r" {NAMED_CHARACTERS['Equivalent']} "), + ("Exists", r" {NAMED_CHARACTERS['Exists']} "), ("Factorial", r" \! "), ("Factorial2", r" \!\! 
"), - ("ForAll", r" {named_characters['ForAll']} "), - ("Function", rf" \& | \uF4A1 | {named_characters['Function']} | \|-> "), + ("ForAll", r" {NAMED_CHARACTERS['ForAll']} "), + ("Function", rf" \& | \uF4A1 | {NAMED_CHARACTERS['Function']} | \|-> "), ("Greater", r" \> "), - ("GreaterEqual", rf" (\>\=) | {named_characters['GreaterEqual']} "), + ("GreaterEqual", rf" (\>\=) | {NAMED_CHARACTERS['GreaterEqual']} "), ("HermitianConjugate", r" \uf3ce "), ("Implies", r" \uF523 "), ("Increment", r" \+\+ "), ("Infix", r" \~ "), ("Information", r"\?\?"), ("Integral", r" \u222b "), - ("Intersection", rf" {named_characters['Intersection']} "), + ("Intersection", rf" {NAMED_CHARACTERS['Intersection']} "), ("Less", r" \< "), - ("LessEqual", rf" (\<\=) | {named_characters['LessEqual']} "), + ("LessEqual", rf" (\<\=) | {NAMED_CHARACTERS['LessEqual']} "), ("Map", r" \/\@ "), ("MapAll", r" \/\/\@ "), - # FIXME: can't use named_characters in Minus because the ASCII minus + # FIXME: can't use NAMED_CHARACTERS in Minus because the ASCII minus # causes the unicode not to appear in tables. ("Minus", r" \-| \u2122 "), - ("Nand", rf" {named_characters['Nand']} "), + ("Nand", rf" {NAMED_CHARACTERS['Nand']} "), ("NonCommutativeMultiply", r" \*\* "), - ("Nor", rf" {named_characters['Nor']} "), - ("Not", r" {named_characters['Not']} "), - ("NotElement", r" {named_characters['NotElement']} "), - ("NotExists", r" {named_characters['NotExists']} "), - ("Or", rf" (\|\|) | {named_characters['Or']} "), + ("Nor", rf" {NAMED_CHARACTERS['Nor']} "), + ("Not", r" {NAMED_CHARACTERS['Not']} "), + ("NotElement", r" {NAMED_CHARACTERS['NotElement']} "), + ("NotExists", r" {NAMED_CHARACTERS['NotExists']} "), + ("Or", rf" (\|\|) | {NAMED_CHARACTERS['Or']} "), # ('PartialD', r' \u2202 '), ("PatternTest", r" \? "), ("Plus", r" \+ "), @@ -330,31 +330,31 @@ def init_module(): ("ReplaceAll", r" \/\. "), ("ReplaceRepeated", r" \/\/\. "), ("RightComposition", r" \/\* "), - ("Rule", r" (\-\>)| \uF522 | {named_characters['Rule']} "), + ("Rule", r" (\-\>)| \uF522 | {NAMED_CHARACTERS['Rule']} "), ("RuleDelayed", r" (\:\>)|\uF51F "), ("SameQ", r" \=\=\= "), ("Semicolon", r" \; "), ("Set", r" \= "), ("SetDelayed", r" \:\= "), - ("Square", rf" \uf520 | {named_characters['Square']}"), + ("Square", rf" \uf520 | {NAMED_CHARACTERS['Square']}"), ("StringExpression", r" \~\~ "), ("StringJoin", r" \<\> "), ("SubtractFrom", r" \-\= "), # ('Sum', r' \u2211 '), ("TagSet", r" \/\: "), - ("Times", rf" \*|{named_characters['Times']} "), + ("Times", rf" \*|{NAMED_CHARACTERS['Times']} "), ("TimesBy", r" \*\= "), - ("Transpose", rf" \uf3c7 | {named_characters['Transpose']} "), - ("Unequal", rf" (\!\= ) | {named_characters['NotEqual']} "), - ("Union", rf" {named_characters['Union']} "), + ("Transpose", rf" \uf3c7 | {NAMED_CHARACTERS['Transpose']} "), + ("Unequal", rf" (\!\= ) | {NAMED_CHARACTERS['NotEqual']} "), + ("Union", rf" {NAMED_CHARACTERS['Union']} "), ("UnsameQ", r" \=\!\= "), ("Xnor", r" \uF4A2 "), - ("Xor", rf" {named_characters['Xor']} "), + ("Xor", rf" {NAMED_CHARACTERS['Xor']} "), # https://reference.wolfram.com/language/ref/character/UndirectedEdge.html # The official Unicode value is \u2194 ( "UndirectedEdge", - rf" (\<\-\>)|\u29DF | {named_characters['UndirectedEdge']} ", + rf" (\<\-\>)|\u29DF | {NAMED_CHARACTERS['UndirectedEdge']} ", ), # allow whitespace but avoid e.g. x=.01 ("Unset", r" \=\s*\.(?!\d|\.) "), @@ -566,7 +566,7 @@ def __init__(self, feeder): # Set to True when inside box parsing. 
# This has an effect on which escape operators are allowed. - self._is_inside_box: bool = False + self.is_inside_box: bool = False self._change_token_scanning_mode("expr") @@ -702,7 +702,7 @@ def next(self) -> Token: try: escape_str, next_pos = parse_escape_sequence( - self.source_text, self.pos + 1 + self.source_text, self.pos + 1, is_in_string=False ) except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: if self.is_inside_box: @@ -809,7 +809,9 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: source_text += self.source_text try: - escape_str, self.pos = parse_escape_sequence(source_text, start_pos) + escape_str, self.pos = parse_escape_sequence( + source_text, start_pos, is_in_string=False + ) if source_text[start_pos] == "[" and source_text[self.pos - 1] == "]": named_character = source_text[start_pos + 1 : self.pos - 1] except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: @@ -877,7 +879,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: try: escape_str, next_pos = parse_escape_sequence( - self.source_text, self.pos + 1 + self.source_text, self.pos + 1, is_in_string=False ) except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: if self.is_inside_box: @@ -940,7 +942,9 @@ def t_String(self, _: Optional[re.Match]) -> Token: self.get_more_input() self.pos += 1 try: - escape_str, self.pos = parse_escape_sequence(source_text, self.pos) + escape_str, self.pos = parse_escape_sequence( + source_text, self.pos, is_in_string=True + ) except NamedCharacterSyntaxError as escape_error: self.feeder.message( escape_error.name, escape_error.tag, *escape_error.args @@ -955,7 +959,7 @@ def t_String(self, _: Optional[re.Match]) -> Token: # If there is boxing construct matched, we # preserve what was given, but do not tokenize # the construct. "\(" remains "\(" and is not - # turned into IntepretBox". + # turned into InterpretBox". result += "\\" + escaped_char self.pos += 1 else: diff --git a/pyproject.toml b/pyproject.toml index f78e461..5be33b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,8 +51,9 @@ full = [ ] [project.scripts] -mathics3-generate-json-table = "mathics_scanner.generate.build_tables:main" -mathics3-generate-operator-json-table = "mathics_scanner.generate.build_operator_tables:main" +mathics3-make-boxing-character-json = "mathics_scanner.generate.boxing_characters:main" +mathics3-make-named-character-json = "mathics_scanner.generate.named_characters:main" +mathics3-make-operator-json = "mathics_scanner.generate.operators:main" mathics3-tokens = "mathics_scanner.mathics3_tokens:main" [tool.setuptools] @@ -64,11 +65,13 @@ packages = [ [tool.setuptools.package-data] "mathics_scanner" = [ + "data/boxing-characters.json", + "data/boxing-characters.yml", + "data/named-characters.json", "data/named-characters.yml", "data/operators.yml", "data/operators.json", "data/*.csv", - "data/character-tables.json", # List this explicitly since it is needed "data/*.json", "data/ExampleData/*", ] diff --git a/setup.py b/setup.py index 3b48ace..46561ad 100644 --- a/setup.py +++ b/setup.py @@ -25,12 +25,11 @@ mathics-users@googlegroups.com and ask for help. 
""" +import os import os.path as osp -import subprocess -import sys from setuptools import setup -from setuptools.command.egg_info import egg_info +from setuptools.command.build_py import build_py as setuptools_build_py def get_srcdir(): @@ -39,25 +38,22 @@ def get_srcdir(): return osp.realpath(filename) -class table_building_egg_info(egg_info): - """This runs as part of building an sdist""" +class build_py(setuptools_build_py): + def run(self): + for table_type in ("boxing-character", "named-character", "operator"): + json_data_file = osp.join("data", f"{table_type}.json") + json_path = osp.join("mathics-scanner", json_data_file) + if not osp.exists(json_path): + os.system(f"mathics3-make-{table_type}-json" " -o {json-path}") + self.distribution.package_data["Mathics-Scanner"].append(json_data_file) + setuptools_build_py.run(self) - def finalize_options(self): - """Run program to create JSON tables""" - build_tables_program = osp.join( - get_srcdir(), "mathics_scanner", "generate", "build_tables.py" - ) - print(f"Building JSON tables via {build_tables_program}") - result = subprocess.run([sys.executable, build_tables_program], check=False) - if result.returncode: - raise RuntimeError( - f"Running {build_tables_program} exited with code {result.returncode}" - ) - super().finalize_options() + +CMDCLASS = {"build_py": build_py} setup( - cmdclass={"egg_info": table_building_egg_info}, + cmdclass=CMDCLASS, # don't pack Mathics in egg because of media files, etc. zip_safe=False, ) diff --git a/test/helper.py b/test/helper.py new file mode 100644 index 0000000..307ab5f --- /dev/null +++ b/test/helper.py @@ -0,0 +1,7 @@ +from mathics_scanner.load import ( + load_mathics3_named_characters_json, + load_mathics3_named_characters_yaml, +) + +yaml_data = load_mathics3_named_characters_yaml() +json_data = load_mathics3_named_characters_json() diff --git a/test/test_ascii.py b/test/test_ascii.py index 0eb9c5a..99a4023 100644 --- a/test/test_ascii.py +++ b/test/test_ascii.py @@ -1,12 +1,6 @@ # -*- coding: utf-8 -*- -from mathics_scanner.load import ( - load_mathics_character_json, - load_mathics_character_yaml, -) - -yaml_data = load_mathics_character_yaml() -json_data = load_mathics_character_json() +from test.helper import json_data def test_ascii(): diff --git a/test/test_character_table_consistency.py b/test/test_character_table_consistency.py index 35b8b40..c6c652a 100644 --- a/test/test_character_table_consistency.py +++ b/test/test_character_table_consistency.py @@ -1,14 +1,9 @@ # -*- coding: utf-8 -*- +from test.helper import json_data, yaml_data + from mathics_scanner.characters import replace_unicode_with_wl as unicode_to_wl from mathics_scanner.characters import replace_wl_with_plain_text as wl_to_unicode -from mathics_scanner.load import ( - load_mathics_character_json, - load_mathics_character_yaml, -) - -yaml_data = load_mathics_character_yaml() -json_data = load_mathics_character_json() def test_ascii_fields_in_json(): diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py index 30af4c3..f963108 100644 --- a/test/test_escape_sequences.py +++ b/test/test_escape_sequences.py @@ -45,13 +45,16 @@ def test_escape_sequences(): (r"z \[Conjugate]", 3, 14, "\uf3c8", "Named character; at end"), ("[Integral]", 0, 10, "\u222b", "Another full-string named-character"), ): - assert parse_escape_sequence(text, pos) == (expect_str, expect_pos), fail_msg + assert parse_escape_sequence(text, pos, is_in_string=False) == ( + expect_str, + expect_pos, + ), fail_msg def 
test_invalid_named_character_sequences(): for text in (r"\[", r"\[Theta", r"\[Fake]", r"\[abc]"): with pytest.raises(NamedCharacterSyntaxError): - parse_escape_sequence(text, 1) + parse_escape_sequence(text, 1, is_in_string=False) def test_invalid_number_encoding(): @@ -75,4 +78,4 @@ def test_invalid_number_encoding(): ":01-2", ): with pytest.raises(SyntaxError): - parse_escape_sequence(text, 0) + parse_escape_sequence(text, 0, is_in_string=False) diff --git a/test/test_general_yaml_sanity.py b/test/test_general_yaml_sanity.py index e419877..e7e8940 100644 --- a/test/test_general_yaml_sanity.py +++ b/test/test_general_yaml_sanity.py @@ -2,10 +2,7 @@ import re import unicodedata - -from mathics_scanner.load import load_mathics_character_yaml - -yaml_data = load_mathics_character_yaml() +from test.helper import yaml_data def check_attr_is_invertible(attr: str): diff --git a/test/test_has_unicode_inverse_sanity.py b/test/test_has_unicode_inverse_sanity.py index f71a7e2..7949788 100644 --- a/test/test_has_unicode_inverse_sanity.py +++ b/test/test_has_unicode_inverse_sanity.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- from mathics_scanner.load import ( - load_mathics_character_yaml, - load_mathics_character_json, + load_mathics3_named_characters_json, + load_mathics3_named_characters_yaml, ) -yaml_data = load_mathics_character_yaml() -json_data = load_mathics_character_json() +yaml_data = load_mathics3_named_characters_yaml() +json_data = load_mathics3_named_characters_json() def test_has_unicode_inverse_sanity(): diff --git a/test/test_letterlikes_sanity.py b/test/test_letterlikes_sanity.py index b2dc381..01533a2 100644 --- a/test/test_letterlikes_sanity.py +++ b/test/test_letterlikes_sanity.py @@ -1,12 +1,6 @@ # -*- coding: utf-8 -*- -from mathics_scanner.load import ( - load_mathics_character_yaml, - load_mathics_character_json, -) - -yaml_data = load_mathics_character_yaml() -json_data = load_mathics_character_json() +# from test.helper import json_data, yaml_data def test_letterlikes_sanity(): diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index 61cdc76..180c38b 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -96,7 +96,7 @@ def test_string(): check_string( r'"\(a \+\)"', - r'"\(a \+\)"', + r'"a \+"', "Do not interpret, but preserve boxing inside a string", ) diff --git a/test/test_translation_regressions.py b/test/test_translation_regressions.py index d0d4c13..72069ee 100644 --- a/test/test_translation_regressions.py +++ b/test/test_translation_regressions.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- -from mathics_scanner.characters import replace_wl_with_plain_text, named_characters +from mathics_scanner.characters import NAMED_CHARACTERS, replace_wl_with_plain_text def check_translation_regression(c: str, expected_translation: str): - translation = replace_wl_with_plain_text(named_characters[c]) + translation = replace_wl_with_plain_text(NAMED_CHARACTERS[c]) assert ( translation == expected_translation ), f"REGRESSION {c} is translated to {translation} but it should translate to {expected_translation}" diff --git a/test/test_unicode.py b/test/test_unicode.py index 43fb296..f4b27a7 100644 --- a/test/test_unicode.py +++ b/test/test_unicode.py @@ -1,12 +1,6 @@ # -*- coding: utf-8 -*- -from mathics_scanner.load import ( - load_mathics_character_json, - load_mathics_character_yaml, -) - -yaml_data = load_mathics_character_yaml() -json_data = load_mathics_character_json() +from test.helper import yaml_data def test_has_unicode(): diff --git 
a/test/test_urls.py b/test/test_urls.py index 6c4e1a1..a592501 100644 --- a/test/test_urls.py +++ b/test/test_urls.py @@ -1,19 +1,16 @@ # -*- coding: utf-8 -*- import os +from test.helper import yaml_data # from urllib.error import HTTPError, URLError from urllib.request import urlopen import pytest -from mathics_scanner.load import load_mathics_character_yaml - -yaml_data = load_mathics_character_yaml() - # This test is slow, so do only on request! @pytest.mark.skipif( - not os.environ.get("MATHICS_LINT"), reason="Lint checking done only when specified" + not os.environ.get("MATHICS3_LINT"), reason="Lint checking done only when specified" ) def test_yaml_urls(): for k, v in yaml_data.items(): diff --git a/test/test_wl_to_ascii.py b/test/test_wl_to_ascii.py index b6444d4..f8bf92b 100644 --- a/test/test_wl_to_ascii.py +++ b/test/test_wl_to_ascii.py @@ -1,9 +1,8 @@ # -*- coding: utf-8 -*- -from mathics_scanner.characters import replace_wl_with_plain_text -from mathics_scanner.load import load_mathics_character_yaml +from test.helper import yaml_data -yaml_data = load_mathics_character_yaml() +from mathics_scanner.characters import replace_wl_with_plain_text def wl_to_ascii(wl_input: str) -> str:
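As a quick illustration of the revised escape-sequence API, here is a hedged sketch; the expected values come directly from test/test_escape_sequences.py and boxing-characters.yml in this patch, and the helper names (`parse_escape_sequence`, `BOXING_ASCII_TO_UNICODE`, `replace_box_unicode_with_ascii`) are the ones introduced above.

```python
# Sketch of the new is_in_string flag and the boxing-character helpers.
from mathics_scanner.characters import (
    BOXING_ASCII_TO_UNICODE,
    replace_box_unicode_with_ascii,
)
from mathics_scanner.escape_sequences import parse_escape_sequence

# Named characters resolve as before; is_in_string controls whether boxing
# escapes such as \( are recognized while scanning a string literal.
text, next_pos = parse_escape_sequence(r"z \[Conjugate]", 3, is_in_string=False)
assert (text, next_pos) == ("\uf3c8", 14)

# Boxing escapes map to Wolfram's private-use code points (U+F7C0 .. U+F7CD)
# and can be folded back to their two-character ASCII spellings.
boxed = BOXING_ASCII_TO_UNICODE[r"\("] + "x" + BOXING_ASCII_TO_UNICODE[r"\)"]
assert replace_box_unicode_with_ascii(boxed) == r"\(x\)"
```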
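Since the single build_tables.py step is now split into three generators, here is a short sketch of regenerating all of the JSON tables from Python, mirroring the new CI steps; the module names and the `-o` option are the ones used in the workflows above, and the output paths assume a source checkout.

```python
# Regenerate the three JSON tables from their YAML sources (sketch).
import subprocess
import sys

for module, json_name in (
    ("mathics_scanner.generate.boxing_characters", "boxing-characters.json"),
    ("mathics_scanner.generate.named_characters", "named-characters.json"),
    ("mathics_scanner.generate.operators", "operators.json"),
):
    subprocess.run(
        [sys.executable, "-m", module, "-o", f"mathics_scanner/data/{json_name}"],
        check=True,
    )
```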