diff --git a/.github/workflows/mathics3-doctest.yml b/.github/workflows/mathics3-doctest.yml index 72c991df..1af5d23b 100644 --- a/.github/workflows/mathics3-doctest.yml +++ b/.github/workflows/mathics3-doctest.yml @@ -36,12 +36,9 @@ jobs: - name: Build Mathics3 run: | # Until next Mathics3/mathics-core release is out... - git clone --depth 1 https://github.com/Mathics3/mathics-core.git + git clone --depth 1 -b more-scanner-changes https://github.com/Mathics3/mathics-core.git cd mathics-core/ python -m pip install -e .[dev] - cp -v ../mathics_scanner/data/boxing-characters.json mathics/data/boxing-characters.json - cp -v ../mathics_scanner/data/named-characters.json mathics/data/named-characters.json - cp -v ../mathics_scanner/data/operators.json mathics/data/operators.json cd .. - name: Run Mathics3 tests run: | diff --git a/mathics_scanner/data/named-characters.yml b/mathics_scanner/data/named-characters.yml index 0f926205..2c633e94 100644 --- a/mathics_scanner/data/named-characters.yml +++ b/mathics_scanner/data/named-characters.yml @@ -5970,6 +5970,12 @@ Implies: wl-reference: https://reference.wolfram.com/language/ref/character/Implies.html wl-unicode: "\uF523" +Increment: + ascii: "++" + has-unicode-inverse: false + is-letter-like: false + operator-name: Increment + IndentingNewLine: esc-alias: nl has-unicode-inverse: false @@ -5999,12 +6005,6 @@ Infix: has-unicode-inverse: false is-letter-like: false -Information: - ascii: "??" - has-unicode-inverse: false - is-letter-like: false - operator-name: Information - Integral: amslatex: '\int' esc-alias: int @@ -7794,6 +7794,9 @@ OptionKey: wl-reference: https://reference.wolfram.com/language/ref/character/OptionKey.html wl-unicode: "\uF7D2" +# Optional and Pattern are both operators. When ":" represents one or +# another depends on context, and determining which is what is +# mysterious right now. 
Optional: ascii: ":" has-unicode-inverse: false @@ -7868,6 +7871,15 @@ PartialD: wl-unicode: "\u2202" wl-unicode-name: PARTIAL DIFFERENTIAL +# Optional and Pattern are both operators. When ":" represents one or +# another depends on context, and determining which is what is +# mysterious right now. +Pattern: + ascii: ":" + has-unicode-inverse: false + is-letter-like: false + operator-name: Pattern + # See also RawQuestion PatternTest: ascii: "?" @@ -8134,6 +8146,12 @@ QuarterNote: wl-unicode: "\u2669" wl-unicode-name: QUARTER NOTE +QuestionQuestion: + ascii: "??" + has-unicode-inverse: false + is-letter-like: false + operator-name: QuestionQuestion + RHacek: esc-alias: rv has-unicode-inverse: false @@ -9738,6 +9756,18 @@ SkeletonIndicator: wl-unicode: "\u2043" wl-unicode-name: HYPHEN BULLET +Slot: + ascii: "#" + has-unicode-inverse: false + is-letter-like: false + operator-name: Slot + +SlotSequence: + ascii: "##" + has-unicode-inverse: false + is-letter-like: false + operator-name: SlotSequence + SmallCircle: amslatex: '\circ' esc-alias: sc diff --git a/mathics_scanner/data/operators.yml b/mathics_scanner/data/operators.yml index b0dd7cf4..c0285d00 100644 --- a/mathics_scanner/data/operators.yml +++ b/mathics_scanner/data/operators.yml @@ -4457,6 +4457,9 @@ NumberPrecisionPostfix: meaningful: true # comments: Specifies the precision of "number" to be $MachinePrecision. Any magnitude must come after `." +# How one determines when the ":" operator is a "Optional" or "Pattern" is a mystery. +# But apparently they have difference precedences, and cannot be combined or thought +# of as one operator. 
Optional: precedence: 140 WolframLanguageData: 64 @@ -4466,10 +4469,17 @@ Optional: # N-tokens: {} # L-tokens: {":"} # O-tokens: {} - # usage: "patt : expr" - FullForm: Optional[patt, expr] + # usage: "symb : expr" + FullForm: Pattern[symb, expr] arity: Binary affix: Infix + # Should be right associative, + # but something in implicit multiplication + # is messing things up? In: + # ConditionalExpression[expr1_, cond_] expr2_ + # ? + # Possibly this is due to a confusion between + # whether the ":" represents Optional or Pattern associativity: "unknown" meaningful: true # comments: @@ -4718,6 +4728,9 @@ PartialUnderscriptBox: meaningful: false # comments: This operator is an invisible Unicode character and is used in the layout of displayed expressions. +# How one determines whether the ":" operator is a Pattern or an Optional is a mystery. +# But apparently they have different precedences, and cannot be combined or thought +# of as one operator. Pattern: precedence: 150 WolframLanguageData: 64 @@ -5134,6 +5147,25 @@ RawBackquote: meaningful: true # comments: Specifies the accuracy (in number of digits to the right of decimal) of "number", where s is a positive decimal expressed without using ^^ or `. A magnitude must come after ``. +# QuestionQuestion precedence needs to be less than Definition "?" which is +# 5000 so that we don't treat ?? as ? ?. +QuestionQuestion: + precedence: 5001 # Seems a bit extreme. 
This is the old Mathics data + WolframLanguageData: None + WolframLanguageData-corrected: None + UnicodeCharacters.tr: None + UnicodeCharacters-corrected.tr: None + # N-tokens: None + # L-tokens: None + # O-tokens: None + usage: "??Times" + FullForm: None + arity: Unary + affix: Prefix + associativity: null + meaningful: true + # comments: None + Repeated: precedence: 170 WolframLanguageData: 62 diff --git a/mathics_scanner/generate/operators.py b/mathics_scanner/generate/operators.py index d2257f30..549247b0 100755 --- a/mathics_scanner/generate/operators.py +++ b/mathics_scanner/generate/operators.py @@ -39,10 +39,15 @@ def compile_tables( information. """ operator_precedences = {} + operator_name_to_character_name = {} for k, v in operator_data.items(): operator_precedences[k] = v["precedence"] + for character_name, character_info in character_data.items(): + if (operator_name := character_info.get("operator-name")) is not None: + operator_name_to_character_name[operator_name] = character_name + box_operators = {} flat_binary_operators: Dict[str, int] = {} left_binary_operators: Dict[str, int] = {} @@ -101,7 +106,11 @@ def compile_tables( character_info = character_data.get(operator_name) if character_info is None: - continue + if ( + character_name := operator_name_to_character_name.get(operator_name) + ) is None: + continue + character_info = character_data.get(character_name) unicode_char = character_info.get("unicode-equivalent", "no-unicode") ascii_chars = character_info.get("ascii", "no-ascii") diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 9d8d7e13..86d6c84e 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -157,7 +157,6 @@ "Alternatives": "Bar", "And": "AmpAmp", "Apply": "AtAt", - "ApplyList": "AtAtAt", "Cap": "LongName`Cap", "Cup": "LongName`Cup", "Decrement": "MinusMinus", @@ -171,6 +170,7 @@ "Increment": "PlusPlus", "Infix": "Tilde", "InterpretedBox": "LinearSequence`Bang", + "MapApply": 
"AtAtAt", "MessageName": "ColonColon", "Or": "BarBar", "Pattern": "Under", @@ -194,6 +194,147 @@ "Unequal": "BangEqual", } +# The format below maps a string character to a tuple of possible +# Token tag names. + +# For the tag name, we try to use CodeTokenize names. However in +# some situations this is not feasibile, given how our scanner and +# parser interact. In particular, the parser needs precedence +# information for binary operators. To get this, it is convenient +# to work off the operator name indicated by token value. So a +# token tag of "PatternTest" (for binary operators) is more +# convenient than "?" and a lookup of the binary operator name. + +# Note that the tuple below is in reverse string length order. In particular, +# tokens associated with a single character tokens like Factorial +# (!), has to come after both Unequal (!=), and Factorial2 (!!) to +# ensure that all of the token-name candidates are considered. + +# This is not Final since init_module adds to it. +LITERAL_TOKENS: Dict[str, Tuple[str]] = { + "!": ( + # Note that "Factorial" has to come last. + "Unequal", + "Factorial2", + "Factorial", + ), + '"': ("String",), + "#": ( + # Note that "Slot" has to come last. + "SlotSequence", + "Slot", + ), + "%": ("Out",), + "&": ("And", "Function"), + "'": ("Derivative",), + "(": ("OpenParen",), + ")": ("CloseParen",), + "*": ("NonCommutativeMultiply", "TimesBy", "Times"), + "+": ("Increment", "AddTo", "Plus"), + ",": ("RawComma",), + "-": ( + # Note that "Minus" has to come last. + "Decrement", + "SubtractFrom", + "Rule", + "Minus", + ), + ".": ( + # Note that "Dot" has to come last. + "Number", + "RepeatedNull", + "Repeated", + "Dot", + ), + "/": ( + # Note that "Divide" has to come last. 
+ "MapAll", + "ReplaceRepeated", + "Map", + "DivideBy", + "ReplaceAll", + "RightComposition", + "Postfix", + "TagSet", + "Condition", + "Divide", + ), + ":": ("MessageName", "RuleDelayed", "SetDelayed", "RawColon"), + ";": ( + # Note that "Semicolon" has to come last. + "Span", + "Semicolon", + ), + "<": ( + # Note that "Less" has to come last. + "LessBar", + "UndirectedEdge", + "Get", + "StringJoin", + "LessEqual", + "Less", + ), + "=": ( + # Note that "Set" has to come last. + "SameQ", + "UnsameQ", + "Equal", + "Unset", + "Set", + ), + ">": ( + # Note that "Greater" has to come last. + "PutAppend", + "Put", + "GreaterEqual", + "Greater", + ), + "?": ( + # Note that "PatternTest" has to come last. + "QuestionQuestion", # Long-form Information + "PatternTest", + ), + "@": ("MapApply", "Apply", "Composition", "Prefix"), + "[": ("OpenSquare",), + "\\": ( + # Note that "RawBackSlash" has to come last. + "LinearSyntaxStar", + "LeftRowBox", + "RightRowBox", + "InterpretedBox", + "SuperscriptBox", + "SubscriptBox", + "OverscriptBox", + "UnderscriptBox", + "OtherscriptBox", + "FractionBox", + "SqrtBox", + "RadicalBox", + "FormBox", + "RawBackslash", + ), + "]": ("CloseSquare",), + "^": ( + # Note that "Power" has to come last. + "UpSetDelayed", + "UpSet", + "Power", + ), + "_": ("Pattern",), + "`": ( + "Pattern", + "Symbol", + ), + "|": ("BarGreater", "Or", "Alternatives", "Function"), + "{": ("OpenCurly",), + "}": ("CloseCurly",), + "~": ( + # Note that "Infix" has to come last. + "StringExpression", + "Infix", + ), +} + def compile_pattern(pattern: str) -> re.Pattern: """Compile a pattern from a regular expression in verbose mode""" @@ -227,7 +368,7 @@ def init_module(): ) tokens: List[Tuple[str, ...]] = [ - ("BoxInputEscape", r" \\[*]"), + ("LinearSyntaxStar", r" \\[*]"), ("Definition", r"\? "), ("Get", r"\<\<"), ("QuestionQuestion", r"\?\? 
"), @@ -276,7 +417,6 @@ def init_module(): ("Alternatives", r" \| "), ("And", rf" (\&\&) | {NAMED_CHARACTERS['And']} "), ("Apply", r" \@\@ "), - ("ApplyList", r" \@\@\@ "), ("Composition", r" \@\* "), ("Condition", r" \/\; "), ("Conjugate", f" {NAMED_CHARACTERS['Conjugate']} "), @@ -329,6 +469,7 @@ def init_module(): ("LessEqual", rf" (\<\=) | {NAMED_CHARACTERS['LessEqual']} "), ("Map", r" \/\@ "), ("MapAll", r" \/\/\@ "), + ("MapApply", r" \@\@\@ "), ("Minus", r" \-| {NAME_TO_WL_UNICODE['Minus']} "), ("Nand", rf" {NAMED_CHARACTERS['Nand']} "), ("NonCommutativeMultiply", r" \*\* "), @@ -403,151 +544,11 @@ def init_module(): unicode = unicode[0] tokens.append((operator_name, rf" {unicode} ")) - # The format below maps a string character to a tuple of possible - # Token tag names. - - # For the tag name, we try to use CodeTokenize names. However in - # some situations this is not feasibile, given how our scanner and - # parser interact. In particular, the parser needs precedence - # information for binary operators. To get this, it is convenient - # to work off the operator name indicated by token value. So a - # token tag of "PatternTest" (for binary operators) is more - # convenient than "?" and a lookup of the binary operator name. - - # Note that the tuple below is in priority order. In particular, - # tokens associated with a single character tokens like Factorial - # (!), has to come after both Unequal (!=), and Factorial2 (!!) to - # ensure all the candidates be considered. - - literal_tokens: Dict[str, Tuple[str]] = { - "!": ( - # Note that "Factorial" has to come last. - "Unequal", - "Factorial2", - "Factorial", - ), - '"': ("String",), - "#": ( - # Note that "Slot" has to come last. 
- "SlotSequence", - "Slot", - ), - "%": ("Out",), - "&": ("And", "Function"), - "'": ("Derivative",), - "(": ("OpenParen",), - ")": ("CloseParen",), - "*": ("NonCommutativeMultiply", "TimesBy", "Times"), - "+": ("Increment", "AddTo", "Plus"), - ",": ("RawComma",), - "-": ( - # Note that "Minus" has to come last. - "Decrement", - "SubtractFrom", - "Rule", - "Minus", - ), - ".": ( - # Note that "Dot" has to come last. - "Number", - "RepeatedNull", - "Repeated", - "Dot", - ), - "/": ( - # Note that "Divide" has to come last. - "MapAll", - "Map", - "DivideBy", - "ReplaceRepeated", - "ReplaceAll", - "RightComposition", - "Postfix", - "TagSet", - "Condition", - "Divide", - ), - ":": ("MessageName", "RuleDelayed", "SetDelayed", "RawColon"), - ";": ( - # Note that "Semicolon" has to come last. - "Span", - "Semicolon", - ), - "<": ( - # Note that "Less" has to come last. - "LessBar", - "UndirectedEdge", - "Get", - "StringJoin", - "LessEqual", - "Less", - ), - "=": ( - # Note that "Set" has to come last. - "SameQ", - "UnsameQ", - "Equal", - "Unset", - "Set", - ), - ">": ( - # Note that "Greater" has to come last. - "PutAppend", - "Put", - "GreaterEqual", - "Greater", - ), - "?": ( - # Note that "PatternTest" has to come last. - "QuestionQuestion", - "PatternTest", - ), - "@": ("ApplyList", "Apply", "Composition", "Prefix"), - "[": ("OpenSquare",), - "\\": ( - # Note that "RawBackSlash" has to come last. - "BoxInputEscape", - "LeftRowBox", - "RightRowBox", - "InterpretedBox", - "SuperscriptBox", - "SubscriptBox", - "OverscriptBox", - "UnderscriptBox", - "OtherscriptBox", - "FractionBox", - "SqrtBox", - "RadicalBox", - "FormBox", - "RawBackslash", - ), - "]": ("CloseSquare",), - "^": ( - # Note that "Power" has to come last. 
- "UpSetDelayed", - "UpSet", - "Power", - ), - "_": ("Pattern",), - "`": ( - "Pattern", - "Symbol", - ), - "|": ("BarGreater", "Or", "Alternatives", "Function"), - "{": ("OpenCurly",), - "}": ("CloseCurly",), - "~": ( - # Note that "Infix" has to come last. - "StringExpression", - "Infix", - ), - } - for c in string.ascii_letters: - literal_tokens[c] = ("Pattern", "Symbol") + LITERAL_TOKENS[c] = ("Pattern", "Symbol") for c in string.digits: - literal_tokens[c] = ("Number",) + LITERAL_TOKENS[c] = ("Number",) # The token and its matching pattern in filename mode. filename_tokens = [("Filename", FILENAME_PATTERN)] @@ -562,7 +563,7 @@ def init_module(): FILENAME_TOKENS.clear() TOKENS.extend(compile_tokens(tokens)) - TOKEN_INDICES.update(find_indices(literal_tokens)) + TOKEN_INDICES.update(find_indices(LITERAL_TOKENS)) FILENAME_TOKENS.extend(compile_tokens(filename_tokens)) NAME_PATTERN_TOKENS.extend(compile_tokens(name_pattern_tokens)) diff --git a/test/test_tokeniser.py b/test/test_tokeniser.py index 357fe26c..fb0c292d 100644 --- a/test/test_tokeniser.py +++ b/test/test_tokeniser.py @@ -5,10 +5,11 @@ import random import sys -from typing import List +from typing import Dict, Final, List, Tuple import pytest +from mathics_scanner.characters import OPERATOR_DATA from mathics_scanner.errors import ( EscapeSyntaxError, IncompleteSyntaxError, @@ -17,7 +18,11 @@ ) from mathics_scanner.feed import MultiLineFeeder, SingleLineFeeder from mathics_scanner.location import ContainerKind -from mathics_scanner.tokeniser import Token, Tokeniser, is_symbol_name +from mathics_scanner.tokeniser import LITERAL_TOKENS, Token, Tokeniser, is_symbol_name + +OPERATOR_TO_STRING: Final[Dict[str, Tuple[str, ...]]] = OPERATOR_DATA[ + "operator-to-string" +] def check_number(source_code: str): @@ -86,6 +91,63 @@ def tokens(source_code) -> List[Token]: return tokens +def test_LITERAL_TOKENS_dict(): + for start_character, token_names in LITERAL_TOKENS.items(): + if len(token_names) < 2: + continue + 
last_token_name = token_names[0] + + if last_token_name in ( + "BarGreater", + "LessBar", + "LinearSyntaxStar", + "Number", + "Pattern", + "Symbol", + "RawColon", + "Unequal", # FIXME reinstate this + ): + continue + # In the case of Function, there are several strings representations + # We need to use the one that has start charcter in it. + OPERATOR_TO_STRING[last_token_name] + for last_operator in reversed(OPERATOR_TO_STRING[last_token_name]): + if start_character == last_operator[0]: + break + else: + assert False, f"I did not find an operator for {start_character}" + + last_length = len(last_operator) + for token_name in token_names[1:]: + # Function should not be in the list below. + # But right now it is has different symbols & and |-> + # with the same name "Function" + if token_name in ( + "Number", + "RawColon", + "Semicolon", + "Greater", # FIXME reinstate this + "Function", # FIXME reinstate this + ): + continue + for operator in reversed(OPERATOR_TO_STRING[token_name]): + if start_character == operator[0]: + break + else: + assert ( + False + ), f"Did not find an in {start_character} an operator named {token_name}" + operator = OPERATOR_TO_STRING[token_name][-1] + n = len(operator) + assert last_length >= n, ( + f"Out of order tuple in {start_character}: " + f"{last_operator} ({last_token_name}) is shorter than {operator} ({token_name})" + ) + last_length = n + last_token_name = token_name + last_operator = operator + + def test_accuracy(): scanner_error("1.5``") check_number("1.0``20") @@ -135,7 +197,7 @@ def test_boxes(): ] assert tokens("\\(\\*RowBox[a]\\)") == [ Token("LeftRowBox", "\\(", 0), - Token("BoxInputEscape", "\\*", 2), + Token("LinearSyntaxStar", "\\*", 2), Token("Symbol", "RowBox", 4), Token("OpenSquare", "[", 10), Token("Symbol", "a", 11),