|
| 1 | +import builtins |
| 2 | +import keyword |
1 | 3 | import os |
2 | 4 | import sys |
| 5 | +import token as T |
| 6 | +import tokenize |
3 | 7 |
|
| 8 | +from collections import deque |
4 | 9 | from collections.abc import Callable, Iterator, Mapping |
5 | 10 | from dataclasses import dataclass, field, Field |
| 11 | +from io import StringIO |
| 12 | +from tokenize import TokenInfo as TI |
| 13 | +from typing import Iterable, Match, NamedTuple |
6 | 14 |
|
# Module-wide switch for colored output; presumably flipped off when the
# output stream is not a capable terminal — TODO confirm against callers.
COLORIZE = True
8 | 16 |
|
@@ -373,3 +381,238 @@ def set_theme(t: Theme) -> None: |
373 | 381 |
|
374 | 382 |
|
# Install the default theme at import time so coloring works out of the box.
set_theme(default_theme)
| 384 | + |
| 385 | + |
| 386 | +# --------------------------- Syntax colorizer ------------------------------- # |
| 387 | + |
| 388 | +IDENTIFIERS_AFTER = {"def", "class"} |
| 389 | +KEYWORD_CONSTANTS = {"True", "False", "None"} |
| 390 | +BUILTINS = {str(name) for name in dir(builtins) if not name.startswith('_')} |
| 391 | +_keyword_first_sets_match = {"False", "None", "True", "await", "lambda", "not"} |
| 392 | +_keyword_first_sets_case = {"False", "None", "True"} |
| 393 | + |
| 394 | + |
| 395 | +class _Span(NamedTuple): |
| 396 | + """Span indexing that's inclusive on both ends.""" |
| 397 | + |
| 398 | + start: int |
| 399 | + end: int |
| 400 | + |
| 401 | + @classmethod |
| 402 | + def from_re(cls, m: Match[str], group: int | str): |
| 403 | + re_span = m.span(group) |
| 404 | + return cls(re_span[0], re_span[1] - 1) |
| 405 | + |
| 406 | + @classmethod |
| 407 | + def from_token(cls, token: TI, line_len: list[int]): |
| 408 | + end_offset = -1 |
| 409 | + if (token.type in {T.FSTRING_MIDDLE, T.TSTRING_MIDDLE} |
| 410 | + and token.string.endswith(("{", "}"))): |
| 411 | + # gh-134158: a visible trailing brace comes from a double brace in input |
| 412 | + end_offset += 1 |
| 413 | + |
| 414 | + return cls( |
| 415 | + line_len[token.start[0] - 1] + token.start[1], |
| 416 | + line_len[token.end[0] - 1] + token.end[1] + end_offset, |
| 417 | + ) |
| 418 | + |
| 419 | + |
class _ColorSpan(NamedTuple):
    """A character span paired with the theme tag used to color it."""

    # Fix: the annotation must reference the local `_Span` class; `Span`
    # is not defined in this module and raises NameError on any eager
    # evaluation of the annotation (e.g. NamedTuple class creation on
    # Pythons without lazy annotations, or typing introspection).
    span: _Span
    tag: str
| 423 | + |
| 424 | + |
def _prev_next_window[T](
    iterable: Iterable[T]
) -> Iterator[tuple[T | None, ...]]:
    """Generates three-tuples of (previous, current, next) items.

    On the first iteration previous is None. On the last iteration next
    is None. In case of exception next is None and the exception is re-raised
    on a subsequent next() call.

    Inspired by `sliding_window` from `itertools` recipes.
    """

    iterator = iter(iterable)
    # Seed the window with (None, first_item).  maxlen=3 makes later appends
    # evict the oldest entry automatically.  NOTE(review): an empty iterable
    # raises here (StopIteration inside a generator -> RuntimeError, PEP 479);
    # token streams always contain at least one token, so this seems intended.
    window = deque((None, next(iterator)), maxlen=3)
    try:
        for x in iterator:
            window.append(x)
            # Only yield once the window holds a full (prev, cur, next) triple.
            yield tuple(window)
    except Exception:
        # Re-raise unchanged; the `finally` below still runs first, so the
        # consumer receives one last window before the exception propagates
        # (on the next resumption of this generator, per the docstring).
        raise
    finally:
        # Flush the final (prev, last_item, None) triple — on normal
        # exhaustion and on error alike.
        window.append(None)
        yield tuple(window)
| 448 | + |
| 449 | + |
def _is_soft_keyword_used(*tokens: TI | None) -> bool:
    """Returns True if the current token is a keyword in this context.

    For the `*tokens` to match anything, they have to be a three-tuple of
    (previous, current, next).
    """
    match tokens:
        # `match` used as a statement: it follows a line break, an indent,
        # or a `:`  and is followed by a token that can start a subject
        # expression (a literal, an f/t-string, or one of these operators).
        case (
            None | TI(T.NEWLINE) | TI(T.INDENT) | TI(string=":"),
            TI(string="match"),
            TI(T.NUMBER | T.STRING | T.FSTRING_START | T.TSTRING_START)
            | TI(T.OP, string="(" | "*" | "[" | "{" | "~" | "...")
        ):
            return True
        # `match` followed by a NAME: any plain identifier works as a
        # subject; a hard keyword only if it can begin an expression.
        case (
            None | TI(T.NEWLINE) | TI(T.INDENT) | TI(string=":"),
            TI(string="match"),
            TI(T.NAME, string=s)
        ):
            if keyword.iskeyword(s):
                return s in _keyword_first_sets_match
            return True
        # `case` heading a pattern: DEDENT is also a valid predecessor here
        # (a later case at lower indentation), unlike for `match` above.
        case (
            None | TI(T.NEWLINE) | TI(T.INDENT) | TI(T.DEDENT) | TI(string=":"),
            TI(string="case"),
            TI(T.NUMBER | T.STRING | T.FSTRING_START | T.TSTRING_START)
            | TI(T.OP, string="(" | "*" | "-" | "[" | "{")
        ):
            return True
        # `case` followed by a NAME: capture patterns allow any identifier;
        # hard keywords only if they can begin a pattern (True/False/None).
        case (
            None | TI(T.NEWLINE) | TI(T.INDENT) | TI(T.DEDENT) | TI(string=":"),
            TI(string="case"),
            TI(T.NAME, string=s)
        ):
            if keyword.iskeyword(s):
                return s in _keyword_first_sets_case
            return True
        # The wildcard pattern `case _:` — `_` itself is the soft keyword.
        case (TI(string="case"), TI(string="_"), TI(string=":")):
            return True
        # `type X = ...` alias statement: `type` is a keyword only when the
        # following NAME is not itself a hard keyword.
        case (
            None | TI(T.NEWLINE) | TI(T.INDENT) | TI(T.DEDENT) | TI(string=":"),
            TI(string="type"),
            TI(T.NAME, string=s)
        ):
            return not keyword.iskeyword(s)
        # Any other context: the soft keyword is an ordinary identifier.
        case _:
            return False
| 497 | + |
| 498 | + |
def _gen_colors_from_token_stream(
    token_generator: Iterator[TI],
    line_lengths: list[int],
) -> Iterator[_ColorSpan]:
    """Translate a token stream into color spans, one token at a time.

    `line_lengths` holds cumulative line lengths so token coordinates can
    be flattened into absolute buffer offsets.
    """
    # All token types that should be painted with the "string" tag.
    string_kinds = frozenset({
        T.STRING,
        T.FSTRING_START, T.FSTRING_MIDDLE, T.FSTRING_END,
        T.TSTRING_START, T.TSTRING_MIDDLE, T.TSTRING_END,
    })

    expecting_definition_name = False
    open_brackets = 0
    for before, current, after in _prev_next_window(token_generator):
        assert current is not None
        # Zero-width tokens produce no visible span.
        if current.start == current.end:
            continue

        kind = current.type
        if kind in string_kinds:
            yield _ColorSpan(_Span.from_token(current, line_lengths), "string")
        elif kind == T.COMMENT:
            yield _ColorSpan(_Span.from_token(current, line_lengths), "comment")
        elif kind == T.NUMBER:
            yield _ColorSpan(_Span.from_token(current, line_lengths), "number")
        elif kind == T.OP:
            # Track bracket nesting: soft keywords are only recognized at
            # the statement level (nesting depth zero).
            if current.string in "([{":
                open_brackets += 1
            elif current.string in ")]}":
                open_brackets -= 1
            yield _ColorSpan(_Span.from_token(current, line_lengths), "op")
        elif kind == T.NAME:
            if expecting_definition_name:
                # The name right after `def`/`class` is the defined identifier.
                expecting_definition_name = False
                yield _ColorSpan(
                    _Span.from_token(current, line_lengths), "definition"
                )
            elif keyword.iskeyword(current.string):
                tag = (
                    "keyword_constant"
                    if current.string in KEYWORD_CONSTANTS
                    else "keyword"
                )
                yield _ColorSpan(_Span.from_token(current, line_lengths), tag)
                if current.string in IDENTIFIERS_AFTER:
                    expecting_definition_name = True
            elif (
                keyword.issoftkeyword(current.string)
                and open_brackets == 0
                and _is_soft_keyword_used(before, current, after)
            ):
                yield _ColorSpan(
                    _Span.from_token(current, line_lengths), "soft_keyword"
                )
            elif (
                current.string in BUILTINS
                # `obj.len` is an attribute access, not the builtin.
                and not (before and before.exact_type == T.DOT)
            ):
                yield _ColorSpan(
                    _Span.from_token(current, line_lengths), "builtin"
                )
| 559 | + |
| 560 | + |
def _recover_unterminated_string(
    exc: tokenize.TokenError,
    line_lengths: list[int],
    last_emitted: _ColorSpan | None,
    buffer: str,
) -> Iterator[_ColorSpan]:
    """Yield a single "string" span for an unterminated string literal.

    When tokenization stops at an unterminated (f/t-)string, color the
    remainder of the buffer — from where the literal starts to the end —
    with the "string" tag.  Yields nothing for other tokenizer errors.
    """
    message, location = exc.args
    if location is None:
        return

    error_line, error_column = location

    recoverable_prefixes = (
        "unterminated string literal",
        "unterminated f-string literal",
        "unterminated t-string literal",
        "EOF in multi-line string",
        "unterminated triple-quoted f-string literal",
        "unterminated triple-quoted t-string literal",
    )
    if not message.startswith(recoverable_prefixes):
        return

    first = line_lengths[error_line - 1] + error_column - 1
    last = line_lengths[-1] - 1

    # If an FSTRING_START span was already emitted for this literal, start
    # coloring right after it instead of overlapping it.
    if last_emitted and first <= last_emitted.span.start:
        first = last_emitted.span.end + 1

    yield _ColorSpan(_Span(first, last), "string")
| 592 | + |
| 593 | + |
def _gen_colors(buffer: str) -> Iterator[_ColorSpan]:
    """Yield index spans to color, each tagged with its color name.

    The input `buffer` should be a valid start of a Python code block, i.e.
    it cannot be a block starting in the middle of a multiline string.
    """
    # Cumulative character offsets: cumulative[i] is the absolute index of
    # the first character of line i+1 (token rows are 1-based).
    cumulative = [0]
    running_total = 0
    for line in StringIO(buffer):
        running_total += len(line)
        cumulative.append(running_total)

    reader = StringIO(buffer)
    tokens = tokenize.generate_tokens(reader.readline)
    most_recent: _ColorSpan | None = None
    try:
        for colored in _gen_colors_from_token_stream(tokens, cumulative):
            yield colored
            most_recent = colored
    except SyntaxError:
        # The tokenizer gave up entirely; nothing more can be colored.
        return
    except tokenize.TokenError as error:
        # Best effort: still color an unterminated string up to buffer end.
        yield from _recover_unterminated_string(
            error, cumulative, most_recent, buffer
        )
0 commit comments