diff --git a/.github/workflows/downstream.yml b/.github/workflows/downstream.yml
new file mode 100644
index 00000000..59f121f0
--- /dev/null
+++ b/.github/workflows/downstream.yml
@@ -0,0 +1,76 @@
+name: downstream
+
+concurrency:
+  group: "${{github.workflow}}-${{github.ref}}"
+  cancel-in-progress: true
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - master
+  pull_request:
+    types: [opened, synchronize]
+    branches:
+      - '*'
+
+jobs:
+  skeleton:
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo hello world
+
+  parse5:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          repository: inikulin/parse5
+          submodules: recursive
+      - run: rm -rf test/data/html5lib-tests/
+      - uses: actions/checkout@v2
+        with:
+          path: test/data/html5lib-tests/
+      - uses: actions/setup-node@v3
+        with:
+          node-version: lts/*
+          cache: npm
+      - run: npm ci
+      - run: npm run build --if-present
+      - run: npm run unit-tests
+
+  html5gum:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          repository: untitaker/html5gum
+      - run: rm -rf tests/html5lib-tests/
+      - uses: actions/checkout@v2
+        with:
+          path: tests/html5lib-tests/
+      - uses: actions-rs/toolchain@v1
+        with:
+          profile: minimal
+          toolchain: stable
+          override: true
+      - run: cargo test
+
+  nokogiri:
+    runs-on: ubuntu-latest
+    container:
+      image: ghcr.io/sparklemotion/nokogiri-test:mri-3.2
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          repository: sparklemotion/nokogiri
+          path: nokogiri
+      - uses: actions/checkout@v3
+        with:
+          path: nokogiri/test/html5lib-tests
+      - working-directory: nokogiri
+        name: "Run the Nokogiri test suite"
+        run: |
+          bundle install
+          bundle exec rake compile -- --enable-system-libraries
+          bundle exec rake test
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 00000000..99f67c50
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,25 @@
+name: lint
+
+concurrency:
+  group: "${{github.workflow}}-${{github.ref}}"
+  cancel-in-progress: true
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - master
+  pull_request:
+    types: [opened, synchronize]
+    branches:
+      - '*'
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+      - run: ./lint
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..f8b56708
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,79 @@
+# Copyright (c) 2014 GitHub, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+doc/_build/
+
+# PyBuilder
+target/
diff --git a/encoding/scripted/tests1.dat b/encoding/scripted/tests1.dat
new file mode 100644
index 00000000..04d18bb9
--- /dev/null
+++ b/encoding/scripted/tests1.dat
@@ -0,0 +1,5 @@
+#data
+
+
+#encoding
+iso-8859-2
diff --git a/encoding/tests1.dat b/encoding/tests1.dat
index 77b0e41d..7aa9586d 100644
--- a/encoding/tests1.dat
+++ b/encoding/tests1.dat
@@ -356,12 +356,6 @@ iso-8859-2
 #encoding
 iso-8859-2
 
-#data
-
-
-#encoding
-iso-8859-2
-
 #data
 
 
diff --git a/lint b/lint
new file mode 100755
index 00000000..19b7f50c
--- /dev/null
+++ b/lint
@@ -0,0 +1,6 @@
+#!/usr/bin/env python3
+import sys
+
+import lint_lib.lint as lint
+
+sys.exit(lint.main())
diff --git a/lint_lib/__init__.py b/lint_lib/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/lint_lib/_vendor-patches/funcparserlib.patch b/lint_lib/_vendor-patches/funcparserlib.patch
new file mode 100644
index 00000000..fc294880
--- /dev/null
+++ b/lint_lib/_vendor-patches/funcparserlib.patch
@@ -0,0 +1,24 @@
+diff --git a/lint_lib/_vendor/funcparserlib/parser.py b/lint_lib/_vendor/funcparserlib/parser.py
+index eb2f53f..0f86e6c 100644
+--- a/lint_lib/_vendor/funcparserlib/parser.py
++++ b/lint_lib/_vendor/funcparserlib/parser.py
+@@ -137,19 +137,6 @@ class Parser(object):
+         "('x', 'y')"
+
+         ```
+-
+-        !!! Note
+-
+-            You can enable the parsing log this way:
+-
+-            ```python
+-            import logging
+-            logging.basicConfig(level=logging.DEBUG)
+-            import funcparserlib.parser
+-            funcparserlib.parser.debug = True
+-            ```
+-
+-        The way to enable the parsing log may be changed in future versions.
+         """
+         self.name = name
+         return self
diff --git a/lint_lib/_vendor/__init__.py b/lint_lib/_vendor/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/lint_lib/_vendor/funcparserlib/LICENSE b/lint_lib/_vendor/funcparserlib/LICENSE
new file mode 100644
index 00000000..31d3a95b
--- /dev/null
+++ b/lint_lib/_vendor/funcparserlib/LICENSE
@@ -0,0 +1,18 @@
+Copyright © 2009/2021 Andrey Vlasovskikh
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this
+software and associated documentation files (the "Software"), to deal in the Software
+without restriction, including without limitation the rights to use, copy, modify,
+merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or
+substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+PURPOSE AND NON-INFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT +OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. diff --git a/lint_lib/_vendor/funcparserlib/__init__.py b/lint_lib/_vendor/funcparserlib/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/lint_lib/_vendor/funcparserlib/lexer.py b/lint_lib/_vendor/funcparserlib/lexer.py new file mode 100644 index 00000000..0a5b5e9e --- /dev/null +++ b/lint_lib/_vendor/funcparserlib/lexer.py @@ -0,0 +1,211 @@ +# -*- coding: utf-8 -*- + +# Copyright © 2009/2021 Andrey Vlasovskikh +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies +# or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +# PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import unicode_literals + +__all__ = ["make_tokenizer", "TokenSpec", "Token", "LexerError"] + +import re + + +class LexerError(Exception): + def __init__(self, place, msg): + self.place = place + self.msg = msg + + def __str__(self): + s = "cannot tokenize data" + line, pos = self.place + return '%s: %d,%d: "%s"' % (s, line, pos, self.msg) + + +class TokenSpec(object): + """A token specification for generating a lexer via `make_tokenizer()`.""" + + def __init__(self, type, pattern, flags=0): + """Initialize a `TokenSpec` object. + + Parameters: + type (str): User-defined type of the token (e.g. `"name"`, `"number"`, + `"operator"`) + pattern (str): Regexp for matching this token type + flags (int, optional): Regexp flags, the second argument of `re.compile()` + """ + self.type = type + self.pattern = pattern + self.flags = flags + + def __repr__(self): + return "TokenSpec(%r, %r, %r)" % (self.type, self.pattern, self.flags) + + +class Token(object): + """A token object that represents a substring of certain type in your text. + + You can compare tokens for equality using the `==` operator. Tokens also define + custom `repr()` and `str()`. + + Attributes: + type (str): User-defined type of the token (e.g. 
`"name"`, `"number"`, + `"operator"`) + value (str): Text value of the token + start (Optional[Tuple[int, int]]): Start position (_line_, _column_) + end (Optional[Tuple[int, int]]): End position (_line_, _column_) + """ + + def __init__(self, type, value, start=None, end=None): + """Initialize a `Token` object.""" + self.type = type + self.value = value + self.start = start + self.end = end + + def __repr__(self): + return "Token(%r, %r)" % (self.type, self.value) + + def __eq__(self, other): + # FIXME: Case sensitivity is assumed here + if other is None: + return False + else: + return self.type == other.type and self.value == other.value + + def _pos_str(self): + if self.start is None or self.end is None: + return "" + else: + sl, sp = self.start + el, ep = self.end + return "%d,%d-%d,%d:" % (sl, sp, el, ep) + + def __str__(self): + s = "%s %s '%s'" % (self._pos_str(), self.type, self.value) + return s.strip() + + @property + def name(self): + return self.value + + def pformat(self): + return "%s %s '%s'" % ( + self._pos_str().ljust(20), # noqa + self.type.ljust(14), + self.value, + ) + + +def make_tokenizer(specs): + # noinspection GrazieInspection + """Make a function that tokenizes text based on the regexp specs. + + Type: `(Sequence[TokenSpec | Tuple]) -> Callable[[str], Iterable[Token]]` + + A token spec is `TokenSpec` instance. + + !!! Note + + For legacy reasons, a token spec may also be a tuple of (_type_, _args_), where + _type_ sets the value of `Token.type` for the token, and _args_ are the + positional arguments for `re.compile()`: either just (_pattern_,) or + (_pattern_, _flags_). + + It returns a tokenizer function that takes a string and returns an iterable of + `Token` objects, or raises `LexerError` if it cannot tokenize the string according + to its token specs. + + Examples: + + ```pycon + >>> tokenize = make_tokenizer([ + ... TokenSpec("space", r"\\s+"), + ... TokenSpec("id", r"\\w+"), + ... TokenSpec("op", r"[,!]"), + ... ]) + >>> text = "Hello, World!" + >>> [t for t in tokenize(text) if t.type != "space"] # noqa + [Token('id', 'Hello'), Token('op', ','), Token('id', 'World'), Token('op', '!')] + >>> text = "Bye?" + >>> list(tokenize(text)) + Traceback (most recent call last): + ... + lexer.LexerError: cannot tokenize data: 1,4: "Bye?" + + ``` + """ + compiled = [] + for spec in specs: + if isinstance(spec, TokenSpec): + c = spec.type, re.compile(spec.pattern, spec.flags) + else: + name, args = spec + c = name, re.compile(*args) + compiled.append(c) + + def match_specs(s, i, position): + line, pos = position + for type, regexp in compiled: + m = regexp.match(s, i) + if m is not None: + value = m.group() + nls = value.count("\n") + n_line = line + nls + if nls == 0: + n_pos = pos + len(value) + else: + n_pos = len(value) - value.rfind("\n") - 1 + return Token(type, value, (line, pos + 1), (n_line, n_pos)) + else: + err_line = s.splitlines()[line - 1] + raise LexerError((line, pos + 1), err_line) + + def f(s): + length = len(s) + line, pos = 1, 0 + i = 0 + while i < length: + t = match_specs(s, i, (line, pos)) + yield t + line, pos = t.end + i += len(t.value) + + return f + + +# This is an example of token specs. See also [this article][1] for a +# discussion of searching for multiline comments using regexps (including `*?`). 
+# +# [1]: http://ostermiller.org/findcomment.html +_example_token_specs = [ + TokenSpec("COMMENT", r"\(\*(.|[\r\n])*?\*\)", re.MULTILINE), + TokenSpec("COMMENT", r"\{(.|[\r\n])*?\}", re.MULTILINE), + TokenSpec("COMMENT", r"//.*"), + TokenSpec("NL", r"[\r\n]+"), + TokenSpec("SPACE", r"[ \t\r\n]+"), + TokenSpec("NAME", r"[A-Za-z_][A-Za-z_0-9]*"), + TokenSpec("REAL", r"[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*"), + TokenSpec("INT", r"[0-9]+"), + TokenSpec("INT", r"\$[0-9A-Fa-f]+"), + TokenSpec("OP", r"(\.\.)|(<>)|(<=)|(>=)|(:=)|[;,=\(\):\[\]\.+\-<>\*/@\^]"), + TokenSpec("STRING", r"'([^']|(''))*'"), + TokenSpec("CHAR", r"#[0-9]+"), + TokenSpec("CHAR", r"#\$[0-9A-Fa-f]+"), +] +# tokenize = make_tokenizer(_example_token_specs) diff --git a/lint_lib/_vendor/funcparserlib/lexer.pyi b/lint_lib/_vendor/funcparserlib/lexer.pyi new file mode 100644 index 00000000..b1e88fe7 --- /dev/null +++ b/lint_lib/_vendor/funcparserlib/lexer.pyi @@ -0,0 +1,34 @@ +from typing import Tuple, Optional, Callable, Iterable, Text, Sequence + +_Place = Tuple[int, int] +_Spec = Tuple[Text, Tuple] + +class Token: + type: Text + value: Text + start: Optional[_Place] + end: Optional[_Place] + name: Text + def __init__( + self, + type: Text, + value: Text, + start: Optional[_Place] = ..., + end: Optional[_Place] = ..., + ) -> None: ... + def pformat(self) -> Text: ... + +class TokenSpec: + name: Text + pattern: Text + flags: int + def __init__(self, name: Text, pattern: Text, flags: int = ...) -> None: ... + +def make_tokenizer( + specs: Sequence[TokenSpec | _Spec], +) -> Callable[[Text], Iterable[Token]]: ... + +class LexerError(Exception): + place: Tuple[int, int] + msg: Text + def __init__(self, place: _Place, msg: Text) -> None: ... diff --git a/lint_lib/_vendor/funcparserlib/parser.py b/lint_lib/_vendor/funcparserlib/parser.py new file mode 100644 index 00000000..0bbac7f5 --- /dev/null +++ b/lint_lib/_vendor/funcparserlib/parser.py @@ -0,0 +1,872 @@ +# -*- coding: utf-8 -*- + +# Copyright © 2009/2021 Andrey Vlasovskikh +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies +# or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +# PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +"""Functional parsing combinators. + +Parsing combinators define an internal domain-specific language (DSL) for describing +the parsing rules of a grammar. The DSL allows you to start with a few primitive +parsers, then combine your parsers to get more complex ones, and finally cover +the whole grammar you want to parse. 
+ +The structure of the language: + +* Class `Parser` + * All the primitives and combinators of the language return `Parser` objects + * It defines the main `Parser.parse(tokens)` method +* Primitive parsers + * `tok(type, value)`, `a(value)`, `some(pred)`, `forward_decl()`, `finished` +* Parser combinators + * `p1 + p2`, `p1 | p2`, `p >> f`, `-p`, `maybe(p)`, `many(p)`, `oneplus(p)`, + `skip(p)` +* Abstraction + * Use regular Python variables `p = ... # Expression of type Parser` to define new + rules (non-terminals) of your grammar + +Every time you apply one of the combinators, you get a new `Parser` object. In other +words, the set of `Parser` objects is closed under the means of combination. + +!!! Note + + We took the parsing combinators language from the book [Introduction to Functional + Programming][1] and translated it from ML into Python. + + [1]: https://www.cl.cam.ac.uk/teaching/Lectures/funprog-jrh-1996/ +""" + +from __future__ import unicode_literals + +__all__ = [ + "some", + "a", + "tok", + "many", + "pure", + "finished", + "maybe", + "skip", + "oneplus", + "forward_decl", + "NoParseError", + "Parser", +] + +import sys +import logging +import warnings + +from lint_lib._vendor.funcparserlib.lexer import Token + +log = logging.getLogger("funcparserlib") + +debug = False +if sys.version_info < (3,): + string_types = (str, unicode) # noqa +else: + string_types = str + + +class Parser(object): + """A parser object that can parse a sequence of tokens or can be combined with + other parsers using `+`, `|`, `>>`, `many()`, and other parsing combinators. + + Type: `Parser[A, B]` + + The generic variables in the type are: `A` — the type of the tokens in the + sequence to parse,`B` — the type of the parsed value. + + In order to define a parser for your grammar: + + 1. You start with primitive parsers by calling `a(value)`, `some(pred)`, + `forward_decl()`, `finished` + 2. You use parsing combinators `p1 + p2`, `p1 | p2`, `p >> f`, `many(p)`, and + others to combine parsers into a more complex parser + 3. You can assign complex parsers to variables to define names that correspond to + the rules of your grammar + + !!! Note + + The constructor `Parser.__init__()` is considered **internal** and may be + changed in future versions. Use primitive parsers and parsing combinators to + construct new parsers. + """ + + def __init__(self, p): + """Wrap the parser function `p` into a `Parser` object.""" + self.name = "" + self.define(p) + + def named(self, name): + # noinspection GrazieInspection + """Specify the name of the parser for easier debugging. + + Type: `(str) -> Parser[A, B]` + + This name is used in the debug-level parsing log. You can also get it via the + `Parser.name` attribute. + + Examples: + + ```pycon + >>> expr = (a("x") + a("y")).named("expr") + >>> expr.name + 'expr' + + ``` + + ```pycon + >>> expr = a("x") + a("y") + >>> expr.name + "('x', 'y')" + + ``` + """ + self.name = name + return self + + def define(self, p): + """Define the parser created earlier as a forward declaration. + + Type: `(Parser[A, B]) -> None` + + Use `p = forward_decl()` in combination with `p.define(...)` to define + recursive parsers. + + See the examples in the docs for `forward_decl()`. + """ + f = getattr(p, "run", p) + if debug: + setattr(self, "_run", f) + else: + setattr(self, "run", f) + self.named(getattr(p, "name", p.__doc__)) + + def run(self, tokens, s): + """Run the parser against the tokens with the specified parsing state. 
+ + Type: `(Sequence[A], State) -> Tuple[B, State]` + + The parsing state includes the current position in the sequence being parsed, + and the position of the rightmost token that has been consumed while parsing for + better error messages. + + If the parser fails to parse the tokens, it raises `NoParseError`. + + !!! Warning + + This is method is **internal** and may be changed in future versions. Use + `Parser.parse(tokens)` instead and let the parser object take care of + updating the parsing state. + """ + if debug: + log.debug("trying %s" % self.name) + return self._run(tokens, s) # noqa + + def _run(self, tokens, s): + raise NotImplementedError("you must define() a parser") + + def parse(self, tokens): + """Parse the sequence of tokens and return the parsed value. + + Type: `(Sequence[A]) -> B` + + It takes a sequence of tokens of arbitrary type `A` and returns the parsed value + of arbitrary type `B`. + + If the parser fails to parse the tokens, it raises `NoParseError`. + + !!! Note + + Although `Parser.parse()` can parse sequences of any objects (including + `str` which is a sequence of `str` chars), **the recommended way** is + parsing sequences of `Token` objects. + + You **should** use a regexp-based tokenizer `make_tokenizer()` defined in + `funcparserlib.lexer` to convert your text into a sequence of `Token` + objects before parsing it. You will get more readable parsing error messages + (as `Token` objects contain their position in the source file) and good + separation of the lexical and syntactic levels of the grammar. + """ + try: + (tree, _) = self.run(tokens, State(0, 0, None)) + return tree + except NoParseError as e: + max = e.state.max + if len(tokens) > max: + t = tokens[max] + if isinstance(t, Token): + if t.start is None or t.end is None: + loc = "" + else: + s_line, s_pos = t.start + e_line, e_pos = t.end + loc = "%d,%d-%d,%d: " % (s_line, s_pos, e_line, e_pos) + msg = "%s%s: %r" % (loc, e.msg, t.value) + elif isinstance(t, string_types): + msg = "%s: %r" % (e.msg, t) + else: + msg = "%s: %s" % (e.msg, t) + else: + msg = "got unexpected end of input" + if e.state.parser is not None: + msg = "%s, expected: %s" % (msg, e.state.parser.name) + e.msg = msg + raise + + def __add__(self, other): + """Sequential combination of parsers. It runs this parser, then the other + parser. + + The return value of the resulting parser is a tuple of each parsed value in + the sum of parsers. We merge all parsing results of `p1 + p2 + ... + pN` into a + single tuple. It means that the parsing result may be a 2-tuple, a 3-tuple, + a 4-tuple, etc. of parsed values. You avoid this by transforming the parsed + pair into a new value using the `>>` combinator. + + You can also skip some parsing results in the resulting parsers by using `-p` + or `skip(p)` for some parsers in your sum of parsers. It means that the parsing + result might be a single value, not a tuple of parsed values. See the docs + for `Parser.__neg__()` for more examples. 
+ + Overloaded types (lots of them to provide stricter checking for the quite + dynamic return type of this method): + + * `(self: Parser[A, B], _IgnoredParser[A]) -> Parser[A, B]` + * `(self: Parser[A, B], Parser[A, C]) -> _TupleParser[A, Tuple[B, C]]` + * `(self: _TupleParser[A, B], _IgnoredParser[A]) -> _TupleParser[A, B]` + * `(self: _TupleParser[A, B], Parser[A, Any]) -> Parser[A, Any]` + * `(self: _IgnoredParser[A], _IgnoredParser[A]) -> _IgnoredParser[A]` + * `(self: _IgnoredParser[A], Parser[A, C]) -> Parser[A, C]` + + Examples: + + ```pycon + >>> expr = a("x") + a("y") + >>> expr.parse("xy") + ('x', 'y') + + ``` + + ```pycon + >>> expr = a("x") + a("y") + a("z") + >>> expr.parse("xyz") + ('x', 'y', 'z') + + ``` + + ```pycon + >>> expr = a("x") + a("y") + >>> expr.parse("xz") + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: 'z', expected: 'y' + + ``` + """ + + def magic(v1, v2): + if isinstance(v1, _Tuple): + return _Tuple(v1 + (v2,)) + else: + return _Tuple((v1, v2)) + + @_TupleParser + def _add(tokens, s): + (v1, s2) = self.run(tokens, s) + (v2, s3) = other.run(tokens, s2) + return magic(v1, v2), s3 + + @Parser + def ignored_right(tokens, s): + v, s2 = self.run(tokens, s) + _, s3 = other.run(tokens, s2) + return v, s3 + + name = "(%s, %s)" % (self.name, other.name) + if isinstance(other, _IgnoredParser): + return ignored_right.named(name) + else: + return _add.named(name) + + def __or__(self, other): + """Choice combination of parsers. + + It runs this parser and returns its result. If the parser fails, it runs the + other parser. + + Examples: + + ```pycon + >>> expr = a("x") | a("y") + >>> expr.parse("x") + 'x' + >>> expr.parse("y") + 'y' + >>> expr.parse("z") + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: 'z', expected: 'x' or 'y' + + ``` + """ + + @Parser + def _or(tokens, s): + try: + return self.run(tokens, s) + except NoParseError as e: + state = e.state + try: + return other.run(tokens, State(s.pos, state.max, state.parser)) + except NoParseError as e: + if s.pos == e.state.max: + e.state = State(e.state.pos, e.state.max, _or) + raise + + _or.name = "%s or %s" % (self.name, other.name) + return _or + + def __rshift__(self, f): + """Transform the parsing result by applying the specified function. + + Type: `(Callable[[B], C]) -> Parser[A, C]` + + You can use it for transforming the parsed value into another value before + including it into the parse tree (the AST). + + Examples: + + ```pycon + >>> def make_canonical_name(s): + ... return s.lower() + >>> expr = (a("D") | a("d")) >> make_canonical_name + >>> expr.parse("D") + 'd' + >>> expr.parse("d") + 'd' + + ``` + """ + + @Parser + def _shift(tokens, s): + (v, s2) = self.run(tokens, s) + return f(v), s2 + + return _shift.named(self.name) + + def bind(self, f): + """Bind the parser to a monadic function that returns a new parser. + + Type: `(Callable[[B], Parser[A, C]]) -> Parser[A, C]` + + Also known as `>>=` in Haskell. + + !!! Note + + You can parse any context-free grammar without resorting to `bind`. Due + to its poor performance please use it only when you really need it. + """ + + @Parser + def _bind(tokens, s): + (v, s2) = self.run(tokens, s) + return f(v).run(tokens, s2) + + _bind.name = "(%s >>=)" % (self.name,) + return _bind + + def __neg__(self): + """Return a parser that parses the same tokens, but its parsing result is + ignored by the sequential `+` combinator. 
+ + Type: `(Parser[A, B]) -> _IgnoredParser[A]` + + You can use it for throwing away elements of concrete syntax (e.g. `","`, + `";"`). + + Examples: + + ```pycon + >>> expr = -a("x") + a("y") + >>> expr.parse("xy") + 'y' + + ``` + + ```pycon + >>> expr = a("x") + -a("y") + >>> expr.parse("xy") + 'x' + + ``` + + ```pycon + >>> expr = a("x") + -a("y") + a("z") + >>> expr.parse("xyz") + ('x', 'z') + + ``` + + ```pycon + >>> expr = -a("x") + a("y") + -a("z") + >>> expr.parse("xyz") + 'y' + + ``` + + ```pycon + >>> expr = -a("x") + a("y") + >>> expr.parse("yz") + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: 'y', expected: 'x' + + ``` + + ```pycon + >>> expr = a("x") + -a("y") + >>> expr.parse("xz") + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: 'z', expected: 'y' + + ``` + + !!! Note + + You **should not** pass the resulting parser to any combinators other than + `+`. You **should** have at least one non-skipped value in your + `p1 + p2 + ... + pN`. The parsed value of `-p` is an **internal** `_Ignored` + object, not intended for actual use. + """ + return _IgnoredParser(self) + + def __class_getitem__(cls, key): + return cls + + +class State(object): + """Parsing state that is maintained basically for error reporting. + + It consists of the current position `pos` in the sequence being parsed, and the + position `max` of the rightmost token that has been consumed while parsing. + """ + + def __init__(self, pos, max, parser=None): + self.pos = pos + self.max = max + self.parser = parser + + def __str__(self): + return str((self.pos, self.max)) + + def __repr__(self): + return "State(%r, %r)" % (self.pos, self.max) + + +class NoParseError(Exception): + def __init__(self, msg, state): + self.msg = msg + self.state = state + + def __str__(self): + return self.msg + + +class _Tuple(tuple): + pass + + +class _TupleParser(Parser): + pass + + +class _Ignored(object): + def __init__(self, value): + self.value = value + + def __repr__(self): + return "_Ignored(%s)" % repr(self.value) + + def __eq__(self, other): + return isinstance(other, _Ignored) and self.value == other.value + + +@Parser +def finished(tokens, s): + """A parser that throws an exception if there are any unparsed tokens left in the + sequence.""" + if s.pos >= len(tokens): + return None, s + else: + s2 = State(s.pos, s.max, finished if s.pos == s.max else s.parser) + raise NoParseError("got unexpected token", s2) + + +finished.name = "end of input" + + +def many(p): + """Return a parser that applies the parser `p` as many times as it succeeds at + parsing the tokens. + + Return a parser that infinitely applies the parser `p` to the input sequence + of tokens as long as it successfully parses them. The parsed value is a list of + the sequentially parsed values. + + Examples: + + ```pycon + >>> expr = many(a("x")) + >>> expr.parse("x") + ['x'] + >>> expr.parse("xx") + ['x', 'x'] + >>> expr.parse("xxxy") # noqa + ['x', 'x', 'x'] + >>> expr.parse("y") + [] + + ``` + """ + + @Parser + def _many(tokens, s): + res = [] + try: + while True: + (v, s) = p.run(tokens, s) + res.append(v) + except NoParseError as e: + s2 = State(s.pos, e.state.max, e.state.parser) + if debug: + log.debug( + "*matched* %d instances of %s, new state = %s" + % (len(res), _many.name, s2) + ) + return res, s2 + + _many.name = "{ %s }" % p.name + return _many + + +def some(pred): + """Return a parser that parses a token if it satisfies the predicate `pred`. 
+ + Type: `(Callable[[A], bool]) -> Parser[A, A]` + + Examples: + + ```pycon + >>> expr = some(lambda s: s.isalpha()).named('alpha') + >>> expr.parse("x") + 'x' + >>> expr.parse("y") + 'y' + >>> expr.parse("1") + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: '1', expected: alpha + + ``` + + !!! Warning + + The `some()` combinator is quite slow and may be changed or removed in future + versions. If you need a parser for a token by its type (e.g. any identifier) + and maybe its value, use `tok(type[, value])` instead. You should use + `make_tokenizer()` from `funcparserlib.lexer` to tokenize your text first. + """ + + @Parser + def _some(tokens, s): + if s.pos >= len(tokens): + s2 = State(s.pos, s.max, _some if s.pos == s.max else s.parser) + raise NoParseError("got unexpected end of input", s2) + else: + t = tokens[s.pos] + if pred(t): + pos = s.pos + 1 + s2 = State(pos, max(pos, s.max), s.parser) + if debug: + log.debug("*matched* %r, new state = %s" % (t, s2)) + return t, s2 + else: + s2 = State(s.pos, s.max, _some if s.pos == s.max else s.parser) + if debug: + log.debug( + "failed %r, state = %s, expected = %s" % (t, s2, s2.parser.name) + ) + raise NoParseError("got unexpected token", s2) + + _some.name = "some(...)" + return _some + + +def a(value): + """Return a parser that parses a token if it's equal to `value`. + + Type: `(A) -> Parser[A, A]` + + Examples: + + ```pycon + >>> expr = a("x") + >>> expr.parse("x") + 'x' + >>> expr.parse("y") + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: 'y', expected: 'x' + + ``` + + !!! Note + + Although `Parser.parse()` can parse sequences of any objects (including + `str` which is a sequence of `str` chars), **the recommended way** is + parsing sequences of `Token` objects. + + You **should** use a regexp-based tokenizer `make_tokenizer()` defined in + `funcparserlib.lexer` to convert your text into a sequence of `Token` objects + before parsing it. You will get more readable parsing error messages (as `Token` + objects contain their position in the source file) and good separation of the + lexical and syntactic levels of the grammar. + """ + name = getattr(value, "name", value) + return some(lambda t: t == value).named(repr(name)) + + +def tok(type, value=None): + """Return a parser that parses a `Token` and returns the string value of the token. + + Type: `(str, Optional[str]) -> Parser[Token, str]` + + You can match any token of the specified `type` or you can match a specific token by + its `type` and `value`. + + Examples: + + ```pycon + >>> expr = tok("expr") + >>> expr.parse([Token("expr", "foo")]) + 'foo' + >>> expr.parse([Token("expr", "bar")]) + 'bar' + >>> expr.parse([Token("op", "=")]) + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: '=', expected: expr + + ``` + + ```pycon + >>> expr = tok("op", "=") + >>> expr.parse([Token("op", "=")]) + '=' + >>> expr.parse([Token("op", "+")]) + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: '+', expected: '=' + + ``` + + !!! Note + + In order to convert your text to parse into a sequence of `Token` objects, + use a regexp-based tokenizer `make_tokenizer()` defined in + `funcparserlib.lexer`. You will get more readable parsing error messages (as + `Token` objects contain their position in the source file) and good separation + of the lexical and syntactic levels of the grammar. 
+ """ + if value is not None: + p = a(Token(type, value)) + else: + p = some(lambda t: t.type == type).named(type) + return (p >> (lambda t: t.value)).named(p.name) + + +def pure(x): + """Wrap any object into a parser. + + Type: `(A) -> Parser[A, A]` + + A pure parser doesn't touch the tokens sequence, it just returns its pure `x` + value. + + Also known as `return` in Haskell. + """ + + @Parser + def _pure(_, s): + return x, s + + _pure.name = "(pure %r)" % (x,) + return _pure + + +def maybe(p): + """Return a parser that returns `None` if the parser `p` fails. + + Examples: + + ```pycon + >>> expr = maybe(a("x")) + >>> expr.parse("x") + 'x' + >>> expr.parse("y") is None + True + + ``` + """ + return (p | pure(None)).named("[ %s ]" % (p.name,)) + + +def skip(p): + """An alias for `-p`. + + See also the docs for `Parser.__neg__()`. + """ + return -p + + +class _IgnoredParser(Parser): + def __init__(self, p): + super(_IgnoredParser, self).__init__(p) + run = self._run if debug else self.run + + def ignored(tokens, s): + v, s2 = run(tokens, s) + return v if isinstance(v, _Ignored) else _Ignored(v), s2 + + self.define(ignored) + self.name = getattr(p, "name", p.__doc__) + + def __add__(self, other): + def ignored_left(tokens, s): + _, s2 = self.run(tokens, s) + v, s3 = other.run(tokens, s2) + return v, s3 + + if isinstance(other, _IgnoredParser): + return _IgnoredParser(ignored_left).named( + "(%s, %s)" % (self.name, other.name) + ) + else: + return Parser(ignored_left).named("(%s, %s)" % (self.name, other.name)) + + +def oneplus(p): + """Return a parser that applies the parser `p` one or more times. + + A similar parser combinator `many(p)` means apply `p` zero or more times, whereas + `oneplus(p)` means apply `p` one or more times. + + Examples: + + ```pycon + >>> expr = oneplus(a("x")) + >>> expr.parse("x") + ['x'] + >>> expr.parse("xx") + ['x', 'x'] + >>> expr.parse("y") + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: 'y', expected: 'x' + + ``` + """ + + @Parser + def _oneplus(tokens, s): + (v1, s2) = p.run(tokens, s) + (v2, s3) = many(p).run(tokens, s2) + return [v1] + v2, s3 + + _oneplus.name = "(%s, { %s })" % (p.name, p.name) + return _oneplus + + +def with_forward_decls(suspension): + warnings.warn( + "Use forward_decl() instead:\n" + "\n" + " p = forward_decl()\n" + " ...\n" + " p.define(parser_value)\n", + DeprecationWarning, + ) + + @Parser + def f(tokens, s): + return suspension().run(tokens, s) + + return f + + +def forward_decl(): + """Return an undefined parser that can be used as a forward declaration. + + Type: `Parser[Any, Any]` + + Use `p = forward_decl()` in combination with `p.define(...)` to define recursive + parsers. + + + Examples: + + ```pycon + >>> expr = forward_decl() + >>> expr.define(a("x") + maybe(expr) + a("y")) + >>> expr.parse("xxyy") # noqa + ('x', ('x', None, 'y'), 'y') + >>> expr.parse("xxy") + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected end of input, expected: 'y' + + ``` + + !!! 
Note + + If you care about static types, you should add a type hint for your forward + declaration, so that your type checker can check types in `p.define(...)` later: + + ```python + p: Parser[str, int] = forward_decl() + p.define(a("x")) # Type checker error + p.define(a("1") >> int) # OK + ``` + """ + + @Parser + def f(_tokens, _s): + raise NotImplementedError("you must define() a forward_decl somewhere") + + f.name = "forward_decl()" + return f + + +if __name__ == "__main__": + import doctest + + doctest.testmod() diff --git a/lint_lib/_vendor/funcparserlib/parser.pyi b/lint_lib/_vendor/funcparserlib/parser.pyi new file mode 100644 index 00000000..e21ded5a --- /dev/null +++ b/lint_lib/_vendor/funcparserlib/parser.pyi @@ -0,0 +1,83 @@ +from typing import ( + Optional, + Generic, + TypeVar, + Union, + Callable, + Tuple, + Sequence, + Any, + List, + Text, + overload, +) +from funcparserlib.lexer import Token + +_A = TypeVar("_A") +_B = TypeVar("_B") +_C = TypeVar("_C") +_D = TypeVar("_D") + +class State: + pos: int + max: int + parser: Union[Parser, _ParserCallable, None] + def __init__( + self, + pos: int, + max: int, + parser: Union[Parser, _ParserCallable, None] = ..., + ) -> None: ... + +_ParserCallable = Callable[[_A, State], Tuple[_B, State]] + +class Parser(Generic[_A, _B]): + name: Text + def __init__(self, p: Union[Parser[_A, _B], _ParserCallable]) -> None: ... + def named(self, name: Text) -> Parser[_A, _B]: ... + def define(self, p: Union[Parser[_A, _B], _ParserCallable]) -> None: ... + def run(self, tokens: Sequence[_A], s: State) -> Tuple[_B, State]: ... + def parse(self, tokens: Sequence[_A]) -> _B: ... + @overload + def __add__( # type: ignore[misc] + self, other: _IgnoredParser[_A] + ) -> Parser[_A, _B]: ... + @overload + def __add__(self, other: Parser[_A, _C]) -> _TupleParser[_A, Tuple[_B, _C]]: ... + def __or__(self, other: Parser[_A, _C]) -> Parser[_A, Union[_B, _C]]: ... + def __rshift__(self, f: Callable[[_B], _C]) -> Parser[_A, _C]: ... + def bind(self, f: Callable[[_B], Parser[_A, _C]]) -> Parser[_A, _C]: ... + def __neg__(self) -> _IgnoredParser[_A]: ... + +class _Ignored: + value: Any + def __init__(self, value: Any) -> None: ... + +class _IgnoredParser(Parser[_A, _Ignored]): + @overload # type: ignore[override] + def __add__(self, other: _IgnoredParser[_A]) -> _IgnoredParser[_A]: ... + @overload # type: ignore[override] + def __add__(self, other: Parser[_A, _C]) -> Parser[_A, _C]: ... + +class _TupleParser(Parser[_A, _B]): + @overload # type: ignore[override] + def __add__(self, other: _IgnoredParser[_A]) -> _TupleParser[_A, _B]: ... + @overload + def __add__(self, other: Parser[_A, Any]) -> Parser[_A, Any]: ... + +finished: Parser[Any, None] + +def many(p: Parser[_A, _B]) -> Parser[_A, List[_B]]: ... +def some(pred: Callable[[_A], bool]) -> Parser[_A, _A]: ... +def a(value: _A) -> Parser[_A, _A]: ... +def tok(type: Text, value: Optional[Text] = ...) -> Parser[Token, Text]: ... +def pure(x: _A) -> Parser[_A, _A]: ... +def maybe(p: Parser[_A, _B]) -> Parser[_A, Optional[_B]]: ... +def skip(p: Parser[_A, Any]) -> _IgnoredParser[_A]: ... +def oneplus(p: Parser[_A, _B]) -> Parser[_A, List[_B]]: ... +def forward_decl() -> Parser[Any, Any]: ... + +class NoParseError(Exception): + msg: Text + state: State + def __init__(self, msg: Text, state: State) -> None: ... 
diff --git a/lint_lib/_vendor/funcparserlib/py.typed b/lint_lib/_vendor/funcparserlib/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/lint_lib/_vendor/funcparserlib/util.py b/lint_lib/_vendor/funcparserlib/util.py new file mode 100644 index 00000000..5c9ea51e --- /dev/null +++ b/lint_lib/_vendor/funcparserlib/util.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- + +# Copyright © 2009/2021 Andrey Vlasovskikh +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies +# or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +# PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import unicode_literals + + +def pretty_tree(x, kids, show): + """Return a pseudo-graphic tree representation of the object `x` similar to the + `tree` command in Unix. + + Type: `(T, Callable[[T], List[T]], Callable[[T], str]) -> str` + + It applies the parameter `show` (which is a function of type `(T) -> str`) to get a + textual representation of the objects to show. + + It applies the parameter `kids` (which is a function of type `(T) -> List[T]`) to + list the children of the object to show. + + Examples: + + ```pycon + >>> print(pretty_tree( + ... ["foo", ["bar", "baz"], "quux"], + ... lambda obj: obj if isinstance(obj, list) else [], + ... lambda obj: "[]" if isinstance(obj, list) else str(obj), + ... )) + [] + |-- foo + |-- [] + | |-- bar + | `-- baz + `-- quux + + ``` + """ + (MID, END, CONT, LAST, ROOT) = ("|-- ", "`-- ", "| ", " ", "") + + def rec(obj, indent, sym): + line = indent + sym + show(obj) + obj_kids = kids(obj) + if len(obj_kids) == 0: + return line + else: + if sym == MID: + next_indent = indent + CONT + elif sym == ROOT: + next_indent = indent + ROOT + else: + next_indent = indent + LAST + chars = [MID] * (len(obj_kids) - 1) + [END] + lines = [rec(kid, next_indent, sym) for kid, sym in zip(obj_kids, chars)] + return "\n".join([line] + lines) + + return rec(x, "", ROOT) diff --git a/lint_lib/_vendor/funcparserlib/util.pyi b/lint_lib/_vendor/funcparserlib/util.pyi new file mode 100644 index 00000000..cf6a3d48 --- /dev/null +++ b/lint_lib/_vendor/funcparserlib/util.pyi @@ -0,0 +1,7 @@ +from typing import TypeVar, Callable, List, Text + +_A = TypeVar("_A") + +def pretty_tree( + x: _A, kids: Callable[[_A], List[_A]], show: Callable[[_A], Text] +) -> Text: ... 
diff --git a/lint_lib/_vendor/vendor.txt b/lint_lib/_vendor/vendor.txt new file mode 100644 index 00000000..8af787f1 --- /dev/null +++ b/lint_lib/_vendor/vendor.txt @@ -0,0 +1 @@ +funcparserlib==1.0.1 diff --git a/lint_lib/lint.py b/lint_lib/lint.py new file mode 100644 index 00000000..de4ccd09 --- /dev/null +++ b/lint_lib/lint.py @@ -0,0 +1,280 @@ +import codecs +import contextlib +import io +import json +import os +import re +import sys +from collections import Counter +from os.path import dirname, join, pardir, relpath +from typing import Any, Dict, List, Optional, Set, TypeVar + +from . import parser +from ._vendor.funcparserlib.parser import NoParseError + +text_type = str +binary_type = bytes + +StringLike = TypeVar("StringLike", str, bytes) + +base = join(dirname(__file__), pardir) + +_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?") + + +def clean_path(path: str) -> str: + return relpath(path, base) + + +def is_subsequence(l1: List[StringLike], l2: List[StringLike]) -> bool: + """checks if l1 is a subsequence of l2""" + i = 0 + for x in l2: + if l1[i] == x: + i += 1 + if i == len(l1): + return True + return False + + +def unescape_json(obj: Any) -> Any: + def decode_str(inp): + """Decode \\uXXXX escapes + + This decodes \\uXXXX escapes, possibly into non-BMP characters when + two surrogate character escapes are adjacent to each other. + """ + + # This cannot be implemented using the unicode_escape codec + # because that requires its input be ISO-8859-1, and we need + # arbitrary unicode as input. + def repl(m): + if m.group(2) is not None: + high = int(m.group(1), 16) + low = int(m.group(2), 16) + if ( + 0xD800 <= high <= 0xDBFF + and 0xDC00 <= low <= 0xDFFF + and sys.maxunicode == 0x10FFFF + ): + cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000 + return chr(cp) + else: + return chr(high) + chr(low) + else: + return chr(int(m.group(1), 16)) + + return _surrogateRe.sub(repl, inp) + + if isinstance(obj, dict): + return {decode_str(k): unescape_json(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [unescape_json(x) for x in obj] + elif isinstance(obj, text_type): + return decode_str(obj) + else: + return obj + + +def lint_dat_format( + path: str, + encoding: Optional[str], + first_header: StringLike, + expected_headers: Optional[List[StringLike]] = None, + input_headers: Optional[Set[StringLike]] = None, +) -> List[Dict[StringLike, StringLike]]: + if expected_headers is not None and first_header not in expected_headers: + raise ValueError("First header must be an expected header. (lint config error)") + + if ( + input_headers is not None + and expected_headers is not None + and not (set(input_headers) < set(expected_headers)) + ): + raise ValueError( + "Input header must be a subset of expected headers. (lint config error)" + ) + + if expected_headers is not None and len(set(expected_headers)) < len( + expected_headers + ): + raise ValueError( + "Can't expect a single header multiple times. (lint config error)" + ) + + if input_headers is None: + input_headers = set(expected_headers) + + try: + if encoding is not None: + with codecs.open(path, "r", encoding=encoding) as fp: + dat = fp.read() + parsed = parser.parse(dat, first_header) + else: + with open(path, "rb") as fp: + dat = fp.read() + parsed = parser.parse(dat, first_header) + except NoParseError as e: + print("Parse error in {}, {}".format(path, e)) + return + + seen_items = {} + + for item in parsed: + # Check we don't have duplicate headers within one item. 
+ headers = Counter(x[0] for x in item.data) + headers.subtract(set(headers.elements())) # remove one instance of each + for header in set(headers.elements()): + c = headers[header] + print( + f"Duplicate header {header!r} occurs {c+1} times in one item in {path} at line {item.lineno}" + ) + + item_dict = dict(item.data) + + # Check we only have expected headers. + if expected_headers is not None: + if not is_subsequence( + list(item_dict.keys()), + expected_headers, + ): + unexpected = item_dict.keys() + print( + f"Unexpected item headings in {list(unexpected)!r} in {path} at line {item.lineno}" + ) + + # Check for duplicated items. + if input_headers is not None: + found_input = set() + for input_header in input_headers: + found_input.add((input_header, item_dict.get(input_header))) + else: + found_input = set(item_dict.items()) + + first_line = seen_items.setdefault(frozenset(found_input), item.lineno) + if first_line is not None and first_line != item.lineno: + print( + f"Duplicate item in {path} at line {item.lineno} previously seen on line {first_line}" + ) + + return [dict(x.data) for x in parsed] + + +def lint_encoding_test(path: str) -> None: + parsed = lint_dat_format( + path, + None, + b"data", + expected_headers=[b"data", b"encoding"], + input_headers={b"data"}, + ) + if not parsed: + # We'll already have output if there's a parse error. + return + + # We'd put extra linting here, if we ever have anything specific to the + # encoding tests here. + + +def lint_encoding_tests(path: str) -> None: + for root, dirs, files in os.walk(path): + for file in sorted(files): + if not file.endswith(".dat"): + continue + lint_encoding_test(clean_path(join(root, file))) + + +def lint_tokenizer_test(path: str) -> None: + all_keys = { + "description", + "input", + "output", + "initialStates", + "lastStartTag", + "ignoreErrorOrder", + "doubleEscaped", + "errors", + } + required = {"input", "output"} + with codecs.open(path, "r", "utf-8") as fp: + parsed = json.load(fp) + if not parsed: + return + if not isinstance(parsed, dict): + print("Top-level must be an object in %s" % path) + return + for test_group in parsed.values(): + if not isinstance(test_group, list): + print("Test groups must be a lists in %s" % path) + continue + for test in test_group: + if "doubleEscaped" in test and test["doubleEscaped"] is True: + test = unescape_json(test) + keys = set(test.keys()) + if not (required <= keys): + print( + "missing test properties {!r} in {}".format(required - keys, path) + ) + if not (keys <= all_keys): + print( + "unknown test properties {!r} in {}".format(keys - all_keys, path) + ) + + +def lint_tokenizer_tests(path: str) -> None: + for root, dirs, files in os.walk(path): + for file in sorted(files): + if not file.endswith(".test"): + continue + lint_tokenizer_test(clean_path(join(root, file))) + + +def lint_tree_construction_test(path: str) -> None: + parsed = lint_dat_format( + path, + "utf-8", + "data", + expected_headers=[ + "data", + "errors", + "new-errors", + "document-fragment", + "script-off", + "script-on", + "document", + ], + input_headers={ + "data", + "document-fragment", + "script-on", + "script-off", + }, + ) + if not parsed: + # We'll already have output if there's a parse error. + return + + # We'd put extra linting here, if we ever have anything specific to the + # tree construction tests here. 
+ + +def lint_tree_construction_tests(path: str) -> None: + for root, dirs, files in os.walk(path): + for file in sorted(files): + if not file.endswith(".dat"): + continue + lint_tree_construction_test(clean_path(join(root, file))) + + +def main() -> int: + with contextlib.redirect_stdout(io.StringIO()) as f: + lint_encoding_tests(join(base, "encoding")) + lint_tokenizer_tests(join(base, "tokenizer")) + lint_tree_construction_tests(join(base, "tree-construction")) + + print(f.getvalue(), end="") + return 0 if f.getvalue() == "" else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/lint_lib/parser.py b/lint_lib/parser.py new file mode 100644 index 00000000..d18605a6 --- /dev/null +++ b/lint_lib/parser.py @@ -0,0 +1,177 @@ +import re +from typing import Callable, List, Optional, Tuple, Type, TypeVar, Union + +from ._vendor.funcparserlib.lexer import LexerError, Token +from ._vendor.funcparserlib.parser import ( + NoParseError, + Parser, + _Tuple, + finished, + many, + pure, + skip, + some, + tok, +) + +StringLike = TypeVar("StringLike", str, bytes) + + +class Test: + def __init__( + self, data: List[Tuple[StringLike, StringLike]], lineno: Optional[int] = None + ) -> None: + self.data = data + self.lineno = lineno + + +def _make_tokenizer(specs: List[Tuple[str, Tuple[StringLike]]]) -> Callable: + # Forked from upstream funcparserlib.lexer to fix #46 + def compile_spec(spec): + name, args = spec + return name, re.compile(*args) + + compiled = [compile_spec(s) for s in specs] + + def match_specs(specs, s, i, position): + if isinstance(s, str): + lf = "\n" + else: + lf = b"\n" + line, pos = position + for type, regexp in specs: + m = regexp.match(s, i) + if m is not None: + value = m.group() + nls = value.count(lf) + n_line = line + nls + if nls == 0: + n_pos = pos + len(value) + else: + n_pos = len(value) - value.rfind(lf) - 1 + return Token(type, value, (line, pos + 1), (n_line, n_pos)) + else: + errline = s.splitlines()[line - 1] + raise LexerError((line, pos + 1), errline) + + def f(s): + length = len(s) + line, pos = 1, 0 + i = 0 + while i < length: + t = match_specs(compiled, s, i, (line, pos)) + yield t + line, pos = t.end + i += len(t.value) + + return f + + +_token_specs_u = [ + ("HEADER", (r"[ \t]*#[^\n]*",)), + ("BODY", (r"[^#\n][^\n]*",)), + ("EOL", (r"\n",)), +] + +_token_specs_b = [ + (name, (regexp.encode("ascii"),)) for (name, (regexp,)) in _token_specs_u +] + +_tokenizer_u = _make_tokenizer(_token_specs_u) +_tokenizer_b = _make_tokenizer(_token_specs_b) + + +def _many_merge(toks: _Tuple) -> List[Test]: + x, xs = toks + return [x] + xs + + +def _notFollowedBy(p: Parser) -> Parser: + @Parser + def __notFollowedBy(tokens, s): + try: + p.run(tokens, s) + except NoParseError: + return skip(pure(None)).run(tokens, s) + else: + raise NoParseError("is followed by", s) + + __notFollowedBy.name = "(notFollowedBy {})".format(p) + return __notFollowedBy + + +def _trim_prefix(s: StringLike, prefix: StringLike) -> StringLike: + if s.startswith(prefix): + return s[len(prefix) :] + else: + return s + + +def _make_test(result: _Tuple) -> Test: + first, rest = result + (first_header, first_lineno), first_body = first + return Test([(first_header, first_body)] + rest, lineno=first_lineno) + + +def _parser( + tokens: List[Token], + new_test_header: StringLike, + tok_type: Union[Type[str], Type[bytes]], +) -> List[Test]: + if tok_type is str: + header_prefix = "#" + elif tok_type is bytes: + header_prefix = b"#" + else: + assert False, "unreachable" + + first_header = ( + some( + 
lambda tok: tok.type == "HEADER" + and tok.value == header_prefix + new_test_header + ) + >> ( + lambda x: ( + _trim_prefix(x.value, header_prefix), + x.start[0] if x.start is not None else None, + ) + ) + ) + skip(tok("EOL")) + + header = ( + some( + lambda tok: tok.type == "HEADER" + and tok.value != header_prefix + new_test_header + ) + >> (lambda x: _trim_prefix(x.value, header_prefix)) + ) + skip(tok("EOL")) + + body = tok("BODY") + tok("EOL") >> (lambda x: x[0] + x[1]) + empty = tok("EOL") + + actual_body = many(body | (empty + skip(_notFollowedBy(first_header)))) >> ( + lambda xs: tok_type().join(xs)[:-1] + ) + + first_segment = first_header + actual_body >> tuple + rest_segment = header + actual_body >> tuple + + test = first_segment + many(rest_segment) >> _make_test + + tests = (test + many(skip(empty) + test)) >> _many_merge + + toplevel = tests + skip(finished) + + return toplevel.parse(tokens) + + +def parse(s: StringLike, new_test_header: StringLike) -> List[Test]: + if type(s) != type(new_test_header): + raise TypeError("s and new_test_header must have same type") + + if isinstance(s, str): + return _parser(list(_tokenizer_u(s)), new_test_header, str) + elif isinstance(s, bytes): + return _parser(list(_tokenizer_b(s)), new_test_header, bytes) + else: + raise TypeError("s must be unicode or bytes object") diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..a68f7874 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,7 @@ +[tool.vendoring] +destination = "lint_lib/_vendor/" +requirements = "lint_lib/_vendor/vendor.txt" +namespace = "lint_lib._vendor" + +protected-files = ["__init__.py", "vendor.txt"] +patches-dir = "lint_lib/_vendor-patches" diff --git a/serializer/core.test b/serializer/core.test index c0b4222d..a6fa0754 100644 --- a/serializer/core.test +++ b/serializer/core.test @@ -112,12 +112,12 @@ "expected": [""] }, -{"description": "HTML 4.01 DOCTYPE without system identifer", +{"description": "HTML 4.01 DOCTYPE without system identifier", "input": [["Doctype", "HTML", "-//W3C//DTD HTML 4.01//EN"]], "expected": [""] }, -{"description": "IBM DOCTYPE without public identifer", +{"description": "IBM DOCTYPE without public identifier", "input": [["Doctype", "html", "", "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"]], "expected": [""] } diff --git a/tokenizer/contentModelFlags.test b/tokenizer/contentModelFlags.test index 5197b68e..9cf7c8bd 100644 --- a/tokenizer/contentModelFlags.test +++ b/tokenizer/contentModelFlags.test @@ -6,6 +6,12 @@ "input":"
&body;", "output":[["Character", "&body;"]]}, +{"description":"PLAINTEXT with seeming close tag", +"initialStates":["PLAINTEXT state"], +"lastStartTag":"plaintext", +"input":"&body;", +"output":[["Character", "&body;"]]}, + {"description":"End tag closing RCDATA or RAWTEXT", "initialStates":["RCDATA state", "RAWTEXT state"], "lastStartTag":"xmp", diff --git a/tokenizer/domjs.test b/tokenizer/domjs.test index b17a5df5..1a0824d7 100644 --- a/tokenizer/domjs.test +++ b/tokenizer/domjs.test @@ -25,7 +25,7 @@ ] }, { - "description":"NUL in RCDATA, RAWTEXT, PLAINTEXT and Script data", + "description":"Raw NUL replacement", "doubleEscaped":true, "initialStates":["RCDATA state", "RAWTEXT state", "PLAINTEXT state", "Script data state"], "input":"\\u0000", @@ -34,6 +34,13 @@ { "code": "unexpected-null-character", "line": 1, "col": 1 } ] }, + { + "description":"NUL in CDATA section", + "doubleEscaped":true, + "initialStates":["CDATA section state"], + "input":"\\u0000]]>", + "output":[["Character", "\\u0000"]] + }, { "description":"NUL in script HTML comment", "doubleEscaped":true, @@ -112,20 +119,95 @@ { "code": "eof-in-script-html-comment-like-text", "line": 1, "col": 13 } ] }, + { + "description":"Dash in script HTML comment", + "initialStates":["Script data state"], + "input":"", + "output":[["Character", ""]] + }, + { + "description":"Dash less-than in script HTML comment", + "initialStates":["Script data state"], + "input":"", + "output":[["Character", ""]] + }, + { + "description":"Dash at end of script HTML comment", + "initialStates":["Script data state"], + "input":"", + "output":[["Character", ""]] + }, + { + "description":" in script HTML comment", + "initialStates":["Script data state"], + "lastStartTag":"script", + "input":"", + "output":[["Character", ""], ["EndTag", "script"]] + }, + { + "description":" in script HTML comment - double escaped", + "initialStates":["Script data state"], + "lastStartTag":"script", + "input":"", + "output":[["Character", ""], ["EndTag", "script"]] + }, + { + "description":" in script HTML comment - double escaped with nested -->", + "output":[["Character", ""], ["EndTag", "script"]] + }, + { + "description":" in script HTML comment - double escaped with abrupt end", + "initialStates":["Script data state"], + "lastStartTag":"script", + "input":" -->", + "output":[["Character", ""], ["EndTag", "script"], ["Character", " -->"], ["EndTag", "script"]] + }, + { + "description":"Incomplete start tag in script HTML comment double escaped", + "initialStates":["Script data state"], + "lastStartTag":"script", + "input":"", + "output":[["Character", ""]] + }, + { + "description":"Unclosed start tag in script HTML comment double escaped", + "initialStates":["Script data state"], + "lastStartTag":"script", + "input":"", + "output":[["Character", ""]] + }, + { + "description":"Incomplete end tag in script HTML comment double escaped", + "initialStates":["Script data state"], + "lastStartTag":"script", + "input":"", + "output":[["Character", ""]] + }, + { + "description":"Unclosed end tag in script HTML comment double escaped", + "initialStates":["Script data state"], + "lastStartTag":"script", + "input":"", + "output":[["Character", ""]] + }, { "description":"leading U+FEFF must pass through", + "initialStates":["Data state", "RCDATA state", "RAWTEXT state", "Script data state"], "doubleEscaped":true, "input":"\\uFEFFfoo\\uFEFFbar", "output":[["Character", "\\uFEFFfoo\\uFEFFbar"]] }, { - "description":"Non BMP-charref in in RCDATA", + "description":"Non BMP-charref in 
RCDATA", "initialStates":["RCDATA state"], "input":"≂̸", "output":[["Character", "\u2242\u0338"]] }, { - "description":"Bad charref in in RCDATA", + "description":"Bad charref in RCDATA", "initialStates":["RCDATA state"], "input":"&NotEqualTild;", "output":[["Character", "&NotEqualTild;"]], @@ -134,36 +216,36 @@ ] }, { - "description":"lowercase endtags in RCDATA and RAWTEXT", - "initialStates":["RCDATA state", "RAWTEXT state"], + "description":"lowercase endtags", + "initialStates":["RCDATA state", "RAWTEXT state", "Script data state"], "lastStartTag":"xmp", "input":"", "output":[["EndTag","xmp"]] }, { - "description":"bad endtag in RCDATA and RAWTEXT", - "initialStates":["RCDATA state", "RAWTEXT state"], + "description":"bad endtag (space before name)", + "initialStates":["RCDATA state", "RAWTEXT state", "Script data state"], "lastStartTag":"xmp", "input":" XMP>", "output":[["Character"," XMP>"]] }, { - "description":"bad endtag in RCDATA and RAWTEXT", - "initialStates":["RCDATA state", "RAWTEXT state"], + "description":"bad endtag (not matching last start tag)", + "initialStates":["RCDATA state", "RAWTEXT state", "Script data state"], "lastStartTag":"xmp", "input":"", "output":[["Character",""]] }, { - "description":"bad endtag in RCDATA and RAWTEXT", - "initialStates":["RCDATA state", "RAWTEXT state"], + "description":"bad endtag (without close bracket)", + "initialStates":["RCDATA state", "RAWTEXT state", "Script data state"], "lastStartTag":"xmp", "input":"", + "initialStates":["CDATA section state"], + "output":[["Character", "foo "]] + }, + { + "description":"CDATA followed by HTML content", + "input":"foo ]]> ", + "initialStates":["CDATA section state"], + "output":[["Character", "foo "]] + }, + { + "description":"CDATA with extra bracket", + "input":"foo]]]>", + "initialStates":["CDATA section state"], + "output":[["Character", "foo]"]] + }, + { + "description":"CDATA without end marker", + "input":"foo", "initialStates":["CDATA section state"], - "output":[["Character", "foo&bar"]], + "output":[["Character", "foo"]], "errors":[ - { "code": "eof-in-cdata", "line": 1, "col": 8 } + { "code": "eof-in-cdata", "line": 1, "col": 4 } ] + }, + { + "description":"CDATA with single bracket ending", + "input":"foo]", + "initialStates":["CDATA section state"], + "output":[["Character", "foo]"]], + "errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 5 } + ] + }, + { + "description":"CDATA with two brackets ending", + "input":"foo]]", + "initialStates":["CDATA section state"], + "output":[["Character", "foo]]"]], + "errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 6 } + ] + }, + { + "description": "HTML tag in script data", + "input": "hello world", + "initialStates": ["Script data state"], + "output": [["Character", "hello world"]] } - ] } diff --git a/tokenizer/entities.test b/tokenizer/entities.test index 7c514563..a6469cd0 100644 --- a/tokenizer/entities.test +++ b/tokenizer/entities.test @@ -1,13 +1,47 @@ {"tests": [ -{"description": "Undefined named entity in attribute value ending in semicolon and whose name starts with a known entity name.", +{"description": "Undefined named entity in a double-quoted attribute value ending in semicolon and whose name starts with a known entity name.", +"input":"