diff --git a/.github/workflows/downstream.yml b/.github/workflows/downstream.yml new file mode 100644 index 00000000..59f121f0 --- /dev/null +++ b/.github/workflows/downstream.yml @@ -0,0 +1,76 @@ +name: downstream + +concurrency: + group: "${{github.workflow}}-${{github.ref}}" + cancel-in-progress: true + +on: + workflow_dispatch: + push: + branches: + - master + pull_request: + types: [opened, synchronize] + branches: + - '*' + +jobs: + skeleton: + runs-on: ubuntu-latest + steps: + - run: echo hello world + + parse5: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + repository: inikulin/parse5 + submodules: recursive + - run: rm -rf test/data/html5lib-tests/ + - uses: actions/checkout@v2 + with: + path: test/data/html5lib-tests/ + - uses: actions/setup-node@v3 + with: + node-version: lts/* + cache: npm + - run: npm ci + - run: npm run build --if-present + - run: npm run unit-tests + + html5gum: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + repository: untitaker/html5gum + - run: rm -rf tests/html5lib-tests/ + - uses: actions/checkout@v2 + with: + path: tests/html5lib-tests/ + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + - run: cargo test + + nokogiri: + runs-on: ubuntu-latest + container: + image: ghcr.io/sparklemotion/nokogiri-test:mri-3.2 + steps: + - uses: actions/checkout@v3 + with: + repository: sparklemotion/nokogiri + path: nokogiri + - uses: actions/checkout@v3 + with: + path: nokogiri/test/html5lib-tests + - working-directory: nokogiri + name: "Run the Nokogiri test suite" + run: | + bundle install + bundle exec rake compile -- --enable-system-libraries + bundle exec rake test diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000..99f67c50 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,25 @@ +name: lint + +concurrency: + group: "${{github.workflow}}-${{github.ref}}" + cancel-in-progress: true + +on: + workflow_dispatch: + push: + branches: + - master + pull_request: + types: [opened, synchronize] + branches: + - '*' + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + - run: ./lint diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..f8b56708 --- /dev/null +++ b/.gitignore @@ -0,0 +1,79 @@ +# Copyright (c) 2014 GitHub, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover + +# Translations +*.mo +*.pot + +# Django stuff: +*.log + +# Sphinx documentation +doc/_build/ + +# PyBuilder +target/ diff --git a/encoding/scripted/tests1.dat b/encoding/scripted/tests1.dat new file mode 100644 index 00000000..04d18bb9 --- /dev/null +++ b/encoding/scripted/tests1.dat @@ -0,0 +1,5 @@ +#data + + +#encoding +iso-8859-2 diff --git a/encoding/tests1.dat b/encoding/tests1.dat index 77b0e41d..7aa9586d 100644 --- a/encoding/tests1.dat +++ b/encoding/tests1.dat @@ -356,12 +356,6 @@ iso-8859-2 #encoding iso-8859-2 -#data - - -#encoding -iso-8859-2 - #data diff --git a/lint b/lint new file mode 100755 index 00000000..19b7f50c --- /dev/null +++ b/lint @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 +import sys + +import lint_lib.lint as lint + +sys.exit(lint.main()) diff --git a/lint_lib/__init__.py b/lint_lib/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/lint_lib/_vendor-patches/funcparserlib.patch b/lint_lib/_vendor-patches/funcparserlib.patch new file mode 100644 index 00000000..fc294880 --- /dev/null +++ b/lint_lib/_vendor-patches/funcparserlib.patch @@ -0,0 +1,24 @@ +diff --git a/lint_lib/_vendor/funcparserlib/parser.py b/lint_lib/_vendor/funcparserlib/parser.py +index eb2f53f..0f86e6c 100644 +--- a/lint_lib/_vendor/funcparserlib/parser.py ++++ b/lint_lib/_vendor/funcparserlib/parser.py +@@ -137,19 +137,6 @@ class Parser(object): + "('x', 'y')" + + ``` +- +- !!! Note +- +- You can enable the parsing log this way: +- +- ```python +- import logging +- logging.basicConfig(level=logging.DEBUG) +- import funcparserlib.parser +- funcparserlib.parser.debug = True +- ``` +- +- The way to enable the parsing log may be changed in future versions. + """ + self.name = name + return self diff --git a/lint_lib/_vendor/__init__.py b/lint_lib/_vendor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/lint_lib/_vendor/funcparserlib/LICENSE b/lint_lib/_vendor/funcparserlib/LICENSE new file mode 100644 index 00000000..31d3a95b --- /dev/null +++ b/lint_lib/_vendor/funcparserlib/LICENSE @@ -0,0 +1,18 @@ +Copyright © 2009/2021 Andrey Vlasovskikh + +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, copy, modify, +merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be included in all copies or +substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT +OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. diff --git a/lint_lib/_vendor/funcparserlib/__init__.py b/lint_lib/_vendor/funcparserlib/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/lint_lib/_vendor/funcparserlib/lexer.py b/lint_lib/_vendor/funcparserlib/lexer.py new file mode 100644 index 00000000..0a5b5e9e --- /dev/null +++ b/lint_lib/_vendor/funcparserlib/lexer.py @@ -0,0 +1,211 @@ +# -*- coding: utf-8 -*- + +# Copyright © 2009/2021 Andrey Vlasovskikh +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies +# or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +# PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import unicode_literals + +__all__ = ["make_tokenizer", "TokenSpec", "Token", "LexerError"] + +import re + + +class LexerError(Exception): + def __init__(self, place, msg): + self.place = place + self.msg = msg + + def __str__(self): + s = "cannot tokenize data" + line, pos = self.place + return '%s: %d,%d: "%s"' % (s, line, pos, self.msg) + + +class TokenSpec(object): + """A token specification for generating a lexer via `make_tokenizer()`.""" + + def __init__(self, type, pattern, flags=0): + """Initialize a `TokenSpec` object. + + Parameters: + type (str): User-defined type of the token (e.g. `"name"`, `"number"`, + `"operator"`) + pattern (str): Regexp for matching this token type + flags (int, optional): Regexp flags, the second argument of `re.compile()` + """ + self.type = type + self.pattern = pattern + self.flags = flags + + def __repr__(self): + return "TokenSpec(%r, %r, %r)" % (self.type, self.pattern, self.flags) + + +class Token(object): + """A token object that represents a substring of certain type in your text. + + You can compare tokens for equality using the `==` operator. Tokens also define + custom `repr()` and `str()`. + + Attributes: + type (str): User-defined type of the token (e.g. `"name"`, `"number"`, + `"operator"`) + value (str): Text value of the token + start (Optional[Tuple[int, int]]): Start position (_line_, _column_) + end (Optional[Tuple[int, int]]): End position (_line_, _column_) + """ + + def __init__(self, type, value, start=None, end=None): + """Initialize a `Token` object.""" + self.type = type + self.value = value + self.start = start + self.end = end + + def __repr__(self): + return "Token(%r, %r)" % (self.type, self.value) + + def __eq__(self, other): + # FIXME: Case sensitivity is assumed here + if other is None: + return False + else: + return self.type == other.type and self.value == other.value + + def _pos_str(self): + if self.start is None or self.end is None: + return "" + else: + sl, sp = self.start + el, ep = self.end + return "%d,%d-%d,%d:" % (sl, sp, el, ep) + + def __str__(self): + s = "%s %s '%s'" % (self._pos_str(), self.type, self.value) + return s.strip() + + @property + def name(self): + return self.value + + def pformat(self): + return "%s %s '%s'" % ( + self._pos_str().ljust(20), # noqa + self.type.ljust(14), + self.value, + ) + + +def make_tokenizer(specs): + # noinspection GrazieInspection + """Make a function that tokenizes text based on the regexp specs. + + Type: `(Sequence[TokenSpec | Tuple]) -> Callable[[str], Iterable[Token]]` + + A token spec is `TokenSpec` instance. + + !!! Note + + For legacy reasons, a token spec may also be a tuple of (_type_, _args_), where + _type_ sets the value of `Token.type` for the token, and _args_ are the + positional arguments for `re.compile()`: either just (_pattern_,) or + (_pattern_, _flags_). + + It returns a tokenizer function that takes a string and returns an iterable of + `Token` objects, or raises `LexerError` if it cannot tokenize the string according + to its token specs. + + Examples: + + ```pycon + >>> tokenize = make_tokenizer([ + ... TokenSpec("space", r"\\s+"), + ... TokenSpec("id", r"\\w+"), + ... TokenSpec("op", r"[,!]"), + ... ]) + >>> text = "Hello, World!" + >>> [t for t in tokenize(text) if t.type != "space"] # noqa + [Token('id', 'Hello'), Token('op', ','), Token('id', 'World'), Token('op', '!')] + >>> text = "Bye?" + >>> list(tokenize(text)) + Traceback (most recent call last): + ... + lexer.LexerError: cannot tokenize data: 1,4: "Bye?" + + ``` + """ + compiled = [] + for spec in specs: + if isinstance(spec, TokenSpec): + c = spec.type, re.compile(spec.pattern, spec.flags) + else: + name, args = spec + c = name, re.compile(*args) + compiled.append(c) + + def match_specs(s, i, position): + line, pos = position + for type, regexp in compiled: + m = regexp.match(s, i) + if m is not None: + value = m.group() + nls = value.count("\n") + n_line = line + nls + if nls == 0: + n_pos = pos + len(value) + else: + n_pos = len(value) - value.rfind("\n") - 1 + return Token(type, value, (line, pos + 1), (n_line, n_pos)) + else: + err_line = s.splitlines()[line - 1] + raise LexerError((line, pos + 1), err_line) + + def f(s): + length = len(s) + line, pos = 1, 0 + i = 0 + while i < length: + t = match_specs(s, i, (line, pos)) + yield t + line, pos = t.end + i += len(t.value) + + return f + + +# This is an example of token specs. See also [this article][1] for a +# discussion of searching for multiline comments using regexps (including `*?`). +# +# [1]: http://ostermiller.org/findcomment.html +_example_token_specs = [ + TokenSpec("COMMENT", r"\(\*(.|[\r\n])*?\*\)", re.MULTILINE), + TokenSpec("COMMENT", r"\{(.|[\r\n])*?\}", re.MULTILINE), + TokenSpec("COMMENT", r"//.*"), + TokenSpec("NL", r"[\r\n]+"), + TokenSpec("SPACE", r"[ \t\r\n]+"), + TokenSpec("NAME", r"[A-Za-z_][A-Za-z_0-9]*"), + TokenSpec("REAL", r"[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*"), + TokenSpec("INT", r"[0-9]+"), + TokenSpec("INT", r"\$[0-9A-Fa-f]+"), + TokenSpec("OP", r"(\.\.)|(<>)|(<=)|(>=)|(:=)|[;,=\(\):\[\]\.+\-<>\*/@\^]"), + TokenSpec("STRING", r"'([^']|(''))*'"), + TokenSpec("CHAR", r"#[0-9]+"), + TokenSpec("CHAR", r"#\$[0-9A-Fa-f]+"), +] +# tokenize = make_tokenizer(_example_token_specs) diff --git a/lint_lib/_vendor/funcparserlib/lexer.pyi b/lint_lib/_vendor/funcparserlib/lexer.pyi new file mode 100644 index 00000000..b1e88fe7 --- /dev/null +++ b/lint_lib/_vendor/funcparserlib/lexer.pyi @@ -0,0 +1,34 @@ +from typing import Tuple, Optional, Callable, Iterable, Text, Sequence + +_Place = Tuple[int, int] +_Spec = Tuple[Text, Tuple] + +class Token: + type: Text + value: Text + start: Optional[_Place] + end: Optional[_Place] + name: Text + def __init__( + self, + type: Text, + value: Text, + start: Optional[_Place] = ..., + end: Optional[_Place] = ..., + ) -> None: ... + def pformat(self) -> Text: ... + +class TokenSpec: + name: Text + pattern: Text + flags: int + def __init__(self, name: Text, pattern: Text, flags: int = ...) -> None: ... + +def make_tokenizer( + specs: Sequence[TokenSpec | _Spec], +) -> Callable[[Text], Iterable[Token]]: ... + +class LexerError(Exception): + place: Tuple[int, int] + msg: Text + def __init__(self, place: _Place, msg: Text) -> None: ... diff --git a/lint_lib/_vendor/funcparserlib/parser.py b/lint_lib/_vendor/funcparserlib/parser.py new file mode 100644 index 00000000..0bbac7f5 --- /dev/null +++ b/lint_lib/_vendor/funcparserlib/parser.py @@ -0,0 +1,872 @@ +# -*- coding: utf-8 -*- + +# Copyright © 2009/2021 Andrey Vlasovskikh +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies +# or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +# PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +"""Functional parsing combinators. + +Parsing combinators define an internal domain-specific language (DSL) for describing +the parsing rules of a grammar. The DSL allows you to start with a few primitive +parsers, then combine your parsers to get more complex ones, and finally cover +the whole grammar you want to parse. + +The structure of the language: + +* Class `Parser` + * All the primitives and combinators of the language return `Parser` objects + * It defines the main `Parser.parse(tokens)` method +* Primitive parsers + * `tok(type, value)`, `a(value)`, `some(pred)`, `forward_decl()`, `finished` +* Parser combinators + * `p1 + p2`, `p1 | p2`, `p >> f`, `-p`, `maybe(p)`, `many(p)`, `oneplus(p)`, + `skip(p)` +* Abstraction + * Use regular Python variables `p = ... # Expression of type Parser` to define new + rules (non-terminals) of your grammar + +Every time you apply one of the combinators, you get a new `Parser` object. In other +words, the set of `Parser` objects is closed under the means of combination. + +!!! Note + + We took the parsing combinators language from the book [Introduction to Functional + Programming][1] and translated it from ML into Python. + + [1]: https://www.cl.cam.ac.uk/teaching/Lectures/funprog-jrh-1996/ +""" + +from __future__ import unicode_literals + +__all__ = [ + "some", + "a", + "tok", + "many", + "pure", + "finished", + "maybe", + "skip", + "oneplus", + "forward_decl", + "NoParseError", + "Parser", +] + +import sys +import logging +import warnings + +from lint_lib._vendor.funcparserlib.lexer import Token + +log = logging.getLogger("funcparserlib") + +debug = False +if sys.version_info < (3,): + string_types = (str, unicode) # noqa +else: + string_types = str + + +class Parser(object): + """A parser object that can parse a sequence of tokens or can be combined with + other parsers using `+`, `|`, `>>`, `many()`, and other parsing combinators. + + Type: `Parser[A, B]` + + The generic variables in the type are: `A` — the type of the tokens in the + sequence to parse,`B` — the type of the parsed value. + + In order to define a parser for your grammar: + + 1. You start with primitive parsers by calling `a(value)`, `some(pred)`, + `forward_decl()`, `finished` + 2. You use parsing combinators `p1 + p2`, `p1 | p2`, `p >> f`, `many(p)`, and + others to combine parsers into a more complex parser + 3. You can assign complex parsers to variables to define names that correspond to + the rules of your grammar + + !!! Note + + The constructor `Parser.__init__()` is considered **internal** and may be + changed in future versions. Use primitive parsers and parsing combinators to + construct new parsers. + """ + + def __init__(self, p): + """Wrap the parser function `p` into a `Parser` object.""" + self.name = "" + self.define(p) + + def named(self, name): + # noinspection GrazieInspection + """Specify the name of the parser for easier debugging. + + Type: `(str) -> Parser[A, B]` + + This name is used in the debug-level parsing log. You can also get it via the + `Parser.name` attribute. + + Examples: + + ```pycon + >>> expr = (a("x") + a("y")).named("expr") + >>> expr.name + 'expr' + + ``` + + ```pycon + >>> expr = a("x") + a("y") + >>> expr.name + "('x', 'y')" + + ``` + """ + self.name = name + return self + + def define(self, p): + """Define the parser created earlier as a forward declaration. + + Type: `(Parser[A, B]) -> None` + + Use `p = forward_decl()` in combination with `p.define(...)` to define + recursive parsers. + + See the examples in the docs for `forward_decl()`. + """ + f = getattr(p, "run", p) + if debug: + setattr(self, "_run", f) + else: + setattr(self, "run", f) + self.named(getattr(p, "name", p.__doc__)) + + def run(self, tokens, s): + """Run the parser against the tokens with the specified parsing state. + + Type: `(Sequence[A], State) -> Tuple[B, State]` + + The parsing state includes the current position in the sequence being parsed, + and the position of the rightmost token that has been consumed while parsing for + better error messages. + + If the parser fails to parse the tokens, it raises `NoParseError`. + + !!! Warning + + This is method is **internal** and may be changed in future versions. Use + `Parser.parse(tokens)` instead and let the parser object take care of + updating the parsing state. + """ + if debug: + log.debug("trying %s" % self.name) + return self._run(tokens, s) # noqa + + def _run(self, tokens, s): + raise NotImplementedError("you must define() a parser") + + def parse(self, tokens): + """Parse the sequence of tokens and return the parsed value. + + Type: `(Sequence[A]) -> B` + + It takes a sequence of tokens of arbitrary type `A` and returns the parsed value + of arbitrary type `B`. + + If the parser fails to parse the tokens, it raises `NoParseError`. + + !!! Note + + Although `Parser.parse()` can parse sequences of any objects (including + `str` which is a sequence of `str` chars), **the recommended way** is + parsing sequences of `Token` objects. + + You **should** use a regexp-based tokenizer `make_tokenizer()` defined in + `funcparserlib.lexer` to convert your text into a sequence of `Token` + objects before parsing it. You will get more readable parsing error messages + (as `Token` objects contain their position in the source file) and good + separation of the lexical and syntactic levels of the grammar. + """ + try: + (tree, _) = self.run(tokens, State(0, 0, None)) + return tree + except NoParseError as e: + max = e.state.max + if len(tokens) > max: + t = tokens[max] + if isinstance(t, Token): + if t.start is None or t.end is None: + loc = "" + else: + s_line, s_pos = t.start + e_line, e_pos = t.end + loc = "%d,%d-%d,%d: " % (s_line, s_pos, e_line, e_pos) + msg = "%s%s: %r" % (loc, e.msg, t.value) + elif isinstance(t, string_types): + msg = "%s: %r" % (e.msg, t) + else: + msg = "%s: %s" % (e.msg, t) + else: + msg = "got unexpected end of input" + if e.state.parser is not None: + msg = "%s, expected: %s" % (msg, e.state.parser.name) + e.msg = msg + raise + + def __add__(self, other): + """Sequential combination of parsers. It runs this parser, then the other + parser. + + The return value of the resulting parser is a tuple of each parsed value in + the sum of parsers. We merge all parsing results of `p1 + p2 + ... + pN` into a + single tuple. It means that the parsing result may be a 2-tuple, a 3-tuple, + a 4-tuple, etc. of parsed values. You avoid this by transforming the parsed + pair into a new value using the `>>` combinator. + + You can also skip some parsing results in the resulting parsers by using `-p` + or `skip(p)` for some parsers in your sum of parsers. It means that the parsing + result might be a single value, not a tuple of parsed values. See the docs + for `Parser.__neg__()` for more examples. + + Overloaded types (lots of them to provide stricter checking for the quite + dynamic return type of this method): + + * `(self: Parser[A, B], _IgnoredParser[A]) -> Parser[A, B]` + * `(self: Parser[A, B], Parser[A, C]) -> _TupleParser[A, Tuple[B, C]]` + * `(self: _TupleParser[A, B], _IgnoredParser[A]) -> _TupleParser[A, B]` + * `(self: _TupleParser[A, B], Parser[A, Any]) -> Parser[A, Any]` + * `(self: _IgnoredParser[A], _IgnoredParser[A]) -> _IgnoredParser[A]` + * `(self: _IgnoredParser[A], Parser[A, C]) -> Parser[A, C]` + + Examples: + + ```pycon + >>> expr = a("x") + a("y") + >>> expr.parse("xy") + ('x', 'y') + + ``` + + ```pycon + >>> expr = a("x") + a("y") + a("z") + >>> expr.parse("xyz") + ('x', 'y', 'z') + + ``` + + ```pycon + >>> expr = a("x") + a("y") + >>> expr.parse("xz") + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: 'z', expected: 'y' + + ``` + """ + + def magic(v1, v2): + if isinstance(v1, _Tuple): + return _Tuple(v1 + (v2,)) + else: + return _Tuple((v1, v2)) + + @_TupleParser + def _add(tokens, s): + (v1, s2) = self.run(tokens, s) + (v2, s3) = other.run(tokens, s2) + return magic(v1, v2), s3 + + @Parser + def ignored_right(tokens, s): + v, s2 = self.run(tokens, s) + _, s3 = other.run(tokens, s2) + return v, s3 + + name = "(%s, %s)" % (self.name, other.name) + if isinstance(other, _IgnoredParser): + return ignored_right.named(name) + else: + return _add.named(name) + + def __or__(self, other): + """Choice combination of parsers. + + It runs this parser and returns its result. If the parser fails, it runs the + other parser. + + Examples: + + ```pycon + >>> expr = a("x") | a("y") + >>> expr.parse("x") + 'x' + >>> expr.parse("y") + 'y' + >>> expr.parse("z") + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: 'z', expected: 'x' or 'y' + + ``` + """ + + @Parser + def _or(tokens, s): + try: + return self.run(tokens, s) + except NoParseError as e: + state = e.state + try: + return other.run(tokens, State(s.pos, state.max, state.parser)) + except NoParseError as e: + if s.pos == e.state.max: + e.state = State(e.state.pos, e.state.max, _or) + raise + + _or.name = "%s or %s" % (self.name, other.name) + return _or + + def __rshift__(self, f): + """Transform the parsing result by applying the specified function. + + Type: `(Callable[[B], C]) -> Parser[A, C]` + + You can use it for transforming the parsed value into another value before + including it into the parse tree (the AST). + + Examples: + + ```pycon + >>> def make_canonical_name(s): + ... return s.lower() + >>> expr = (a("D") | a("d")) >> make_canonical_name + >>> expr.parse("D") + 'd' + >>> expr.parse("d") + 'd' + + ``` + """ + + @Parser + def _shift(tokens, s): + (v, s2) = self.run(tokens, s) + return f(v), s2 + + return _shift.named(self.name) + + def bind(self, f): + """Bind the parser to a monadic function that returns a new parser. + + Type: `(Callable[[B], Parser[A, C]]) -> Parser[A, C]` + + Also known as `>>=` in Haskell. + + !!! Note + + You can parse any context-free grammar without resorting to `bind`. Due + to its poor performance please use it only when you really need it. + """ + + @Parser + def _bind(tokens, s): + (v, s2) = self.run(tokens, s) + return f(v).run(tokens, s2) + + _bind.name = "(%s >>=)" % (self.name,) + return _bind + + def __neg__(self): + """Return a parser that parses the same tokens, but its parsing result is + ignored by the sequential `+` combinator. + + Type: `(Parser[A, B]) -> _IgnoredParser[A]` + + You can use it for throwing away elements of concrete syntax (e.g. `","`, + `";"`). + + Examples: + + ```pycon + >>> expr = -a("x") + a("y") + >>> expr.parse("xy") + 'y' + + ``` + + ```pycon + >>> expr = a("x") + -a("y") + >>> expr.parse("xy") + 'x' + + ``` + + ```pycon + >>> expr = a("x") + -a("y") + a("z") + >>> expr.parse("xyz") + ('x', 'z') + + ``` + + ```pycon + >>> expr = -a("x") + a("y") + -a("z") + >>> expr.parse("xyz") + 'y' + + ``` + + ```pycon + >>> expr = -a("x") + a("y") + >>> expr.parse("yz") + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: 'y', expected: 'x' + + ``` + + ```pycon + >>> expr = a("x") + -a("y") + >>> expr.parse("xz") + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: 'z', expected: 'y' + + ``` + + !!! Note + + You **should not** pass the resulting parser to any combinators other than + `+`. You **should** have at least one non-skipped value in your + `p1 + p2 + ... + pN`. The parsed value of `-p` is an **internal** `_Ignored` + object, not intended for actual use. + """ + return _IgnoredParser(self) + + def __class_getitem__(cls, key): + return cls + + +class State(object): + """Parsing state that is maintained basically for error reporting. + + It consists of the current position `pos` in the sequence being parsed, and the + position `max` of the rightmost token that has been consumed while parsing. + """ + + def __init__(self, pos, max, parser=None): + self.pos = pos + self.max = max + self.parser = parser + + def __str__(self): + return str((self.pos, self.max)) + + def __repr__(self): + return "State(%r, %r)" % (self.pos, self.max) + + +class NoParseError(Exception): + def __init__(self, msg, state): + self.msg = msg + self.state = state + + def __str__(self): + return self.msg + + +class _Tuple(tuple): + pass + + +class _TupleParser(Parser): + pass + + +class _Ignored(object): + def __init__(self, value): + self.value = value + + def __repr__(self): + return "_Ignored(%s)" % repr(self.value) + + def __eq__(self, other): + return isinstance(other, _Ignored) and self.value == other.value + + +@Parser +def finished(tokens, s): + """A parser that throws an exception if there are any unparsed tokens left in the + sequence.""" + if s.pos >= len(tokens): + return None, s + else: + s2 = State(s.pos, s.max, finished if s.pos == s.max else s.parser) + raise NoParseError("got unexpected token", s2) + + +finished.name = "end of input" + + +def many(p): + """Return a parser that applies the parser `p` as many times as it succeeds at + parsing the tokens. + + Return a parser that infinitely applies the parser `p` to the input sequence + of tokens as long as it successfully parses them. The parsed value is a list of + the sequentially parsed values. + + Examples: + + ```pycon + >>> expr = many(a("x")) + >>> expr.parse("x") + ['x'] + >>> expr.parse("xx") + ['x', 'x'] + >>> expr.parse("xxxy") # noqa + ['x', 'x', 'x'] + >>> expr.parse("y") + [] + + ``` + """ + + @Parser + def _many(tokens, s): + res = [] + try: + while True: + (v, s) = p.run(tokens, s) + res.append(v) + except NoParseError as e: + s2 = State(s.pos, e.state.max, e.state.parser) + if debug: + log.debug( + "*matched* %d instances of %s, new state = %s" + % (len(res), _many.name, s2) + ) + return res, s2 + + _many.name = "{ %s }" % p.name + return _many + + +def some(pred): + """Return a parser that parses a token if it satisfies the predicate `pred`. + + Type: `(Callable[[A], bool]) -> Parser[A, A]` + + Examples: + + ```pycon + >>> expr = some(lambda s: s.isalpha()).named('alpha') + >>> expr.parse("x") + 'x' + >>> expr.parse("y") + 'y' + >>> expr.parse("1") + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: '1', expected: alpha + + ``` + + !!! Warning + + The `some()` combinator is quite slow and may be changed or removed in future + versions. If you need a parser for a token by its type (e.g. any identifier) + and maybe its value, use `tok(type[, value])` instead. You should use + `make_tokenizer()` from `funcparserlib.lexer` to tokenize your text first. + """ + + @Parser + def _some(tokens, s): + if s.pos >= len(tokens): + s2 = State(s.pos, s.max, _some if s.pos == s.max else s.parser) + raise NoParseError("got unexpected end of input", s2) + else: + t = tokens[s.pos] + if pred(t): + pos = s.pos + 1 + s2 = State(pos, max(pos, s.max), s.parser) + if debug: + log.debug("*matched* %r, new state = %s" % (t, s2)) + return t, s2 + else: + s2 = State(s.pos, s.max, _some if s.pos == s.max else s.parser) + if debug: + log.debug( + "failed %r, state = %s, expected = %s" % (t, s2, s2.parser.name) + ) + raise NoParseError("got unexpected token", s2) + + _some.name = "some(...)" + return _some + + +def a(value): + """Return a parser that parses a token if it's equal to `value`. + + Type: `(A) -> Parser[A, A]` + + Examples: + + ```pycon + >>> expr = a("x") + >>> expr.parse("x") + 'x' + >>> expr.parse("y") + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: 'y', expected: 'x' + + ``` + + !!! Note + + Although `Parser.parse()` can parse sequences of any objects (including + `str` which is a sequence of `str` chars), **the recommended way** is + parsing sequences of `Token` objects. + + You **should** use a regexp-based tokenizer `make_tokenizer()` defined in + `funcparserlib.lexer` to convert your text into a sequence of `Token` objects + before parsing it. You will get more readable parsing error messages (as `Token` + objects contain their position in the source file) and good separation of the + lexical and syntactic levels of the grammar. + """ + name = getattr(value, "name", value) + return some(lambda t: t == value).named(repr(name)) + + +def tok(type, value=None): + """Return a parser that parses a `Token` and returns the string value of the token. + + Type: `(str, Optional[str]) -> Parser[Token, str]` + + You can match any token of the specified `type` or you can match a specific token by + its `type` and `value`. + + Examples: + + ```pycon + >>> expr = tok("expr") + >>> expr.parse([Token("expr", "foo")]) + 'foo' + >>> expr.parse([Token("expr", "bar")]) + 'bar' + >>> expr.parse([Token("op", "=")]) + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: '=', expected: expr + + ``` + + ```pycon + >>> expr = tok("op", "=") + >>> expr.parse([Token("op", "=")]) + '=' + >>> expr.parse([Token("op", "+")]) + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: '+', expected: '=' + + ``` + + !!! Note + + In order to convert your text to parse into a sequence of `Token` objects, + use a regexp-based tokenizer `make_tokenizer()` defined in + `funcparserlib.lexer`. You will get more readable parsing error messages (as + `Token` objects contain their position in the source file) and good separation + of the lexical and syntactic levels of the grammar. + """ + if value is not None: + p = a(Token(type, value)) + else: + p = some(lambda t: t.type == type).named(type) + return (p >> (lambda t: t.value)).named(p.name) + + +def pure(x): + """Wrap any object into a parser. + + Type: `(A) -> Parser[A, A]` + + A pure parser doesn't touch the tokens sequence, it just returns its pure `x` + value. + + Also known as `return` in Haskell. + """ + + @Parser + def _pure(_, s): + return x, s + + _pure.name = "(pure %r)" % (x,) + return _pure + + +def maybe(p): + """Return a parser that returns `None` if the parser `p` fails. + + Examples: + + ```pycon + >>> expr = maybe(a("x")) + >>> expr.parse("x") + 'x' + >>> expr.parse("y") is None + True + + ``` + """ + return (p | pure(None)).named("[ %s ]" % (p.name,)) + + +def skip(p): + """An alias for `-p`. + + See also the docs for `Parser.__neg__()`. + """ + return -p + + +class _IgnoredParser(Parser): + def __init__(self, p): + super(_IgnoredParser, self).__init__(p) + run = self._run if debug else self.run + + def ignored(tokens, s): + v, s2 = run(tokens, s) + return v if isinstance(v, _Ignored) else _Ignored(v), s2 + + self.define(ignored) + self.name = getattr(p, "name", p.__doc__) + + def __add__(self, other): + def ignored_left(tokens, s): + _, s2 = self.run(tokens, s) + v, s3 = other.run(tokens, s2) + return v, s3 + + if isinstance(other, _IgnoredParser): + return _IgnoredParser(ignored_left).named( + "(%s, %s)" % (self.name, other.name) + ) + else: + return Parser(ignored_left).named("(%s, %s)" % (self.name, other.name)) + + +def oneplus(p): + """Return a parser that applies the parser `p` one or more times. + + A similar parser combinator `many(p)` means apply `p` zero or more times, whereas + `oneplus(p)` means apply `p` one or more times. + + Examples: + + ```pycon + >>> expr = oneplus(a("x")) + >>> expr.parse("x") + ['x'] + >>> expr.parse("xx") + ['x', 'x'] + >>> expr.parse("y") + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected token: 'y', expected: 'x' + + ``` + """ + + @Parser + def _oneplus(tokens, s): + (v1, s2) = p.run(tokens, s) + (v2, s3) = many(p).run(tokens, s2) + return [v1] + v2, s3 + + _oneplus.name = "(%s, { %s })" % (p.name, p.name) + return _oneplus + + +def with_forward_decls(suspension): + warnings.warn( + "Use forward_decl() instead:\n" + "\n" + " p = forward_decl()\n" + " ...\n" + " p.define(parser_value)\n", + DeprecationWarning, + ) + + @Parser + def f(tokens, s): + return suspension().run(tokens, s) + + return f + + +def forward_decl(): + """Return an undefined parser that can be used as a forward declaration. + + Type: `Parser[Any, Any]` + + Use `p = forward_decl()` in combination with `p.define(...)` to define recursive + parsers. + + + Examples: + + ```pycon + >>> expr = forward_decl() + >>> expr.define(a("x") + maybe(expr) + a("y")) + >>> expr.parse("xxyy") # noqa + ('x', ('x', None, 'y'), 'y') + >>> expr.parse("xxy") + Traceback (most recent call last): + ... + parser.NoParseError: got unexpected end of input, expected: 'y' + + ``` + + !!! Note + + If you care about static types, you should add a type hint for your forward + declaration, so that your type checker can check types in `p.define(...)` later: + + ```python + p: Parser[str, int] = forward_decl() + p.define(a("x")) # Type checker error + p.define(a("1") >> int) # OK + ``` + """ + + @Parser + def f(_tokens, _s): + raise NotImplementedError("you must define() a forward_decl somewhere") + + f.name = "forward_decl()" + return f + + +if __name__ == "__main__": + import doctest + + doctest.testmod() diff --git a/lint_lib/_vendor/funcparserlib/parser.pyi b/lint_lib/_vendor/funcparserlib/parser.pyi new file mode 100644 index 00000000..e21ded5a --- /dev/null +++ b/lint_lib/_vendor/funcparserlib/parser.pyi @@ -0,0 +1,83 @@ +from typing import ( + Optional, + Generic, + TypeVar, + Union, + Callable, + Tuple, + Sequence, + Any, + List, + Text, + overload, +) +from funcparserlib.lexer import Token + +_A = TypeVar("_A") +_B = TypeVar("_B") +_C = TypeVar("_C") +_D = TypeVar("_D") + +class State: + pos: int + max: int + parser: Union[Parser, _ParserCallable, None] + def __init__( + self, + pos: int, + max: int, + parser: Union[Parser, _ParserCallable, None] = ..., + ) -> None: ... + +_ParserCallable = Callable[[_A, State], Tuple[_B, State]] + +class Parser(Generic[_A, _B]): + name: Text + def __init__(self, p: Union[Parser[_A, _B], _ParserCallable]) -> None: ... + def named(self, name: Text) -> Parser[_A, _B]: ... + def define(self, p: Union[Parser[_A, _B], _ParserCallable]) -> None: ... + def run(self, tokens: Sequence[_A], s: State) -> Tuple[_B, State]: ... + def parse(self, tokens: Sequence[_A]) -> _B: ... + @overload + def __add__( # type: ignore[misc] + self, other: _IgnoredParser[_A] + ) -> Parser[_A, _B]: ... + @overload + def __add__(self, other: Parser[_A, _C]) -> _TupleParser[_A, Tuple[_B, _C]]: ... + def __or__(self, other: Parser[_A, _C]) -> Parser[_A, Union[_B, _C]]: ... + def __rshift__(self, f: Callable[[_B], _C]) -> Parser[_A, _C]: ... + def bind(self, f: Callable[[_B], Parser[_A, _C]]) -> Parser[_A, _C]: ... + def __neg__(self) -> _IgnoredParser[_A]: ... + +class _Ignored: + value: Any + def __init__(self, value: Any) -> None: ... + +class _IgnoredParser(Parser[_A, _Ignored]): + @overload # type: ignore[override] + def __add__(self, other: _IgnoredParser[_A]) -> _IgnoredParser[_A]: ... + @overload # type: ignore[override] + def __add__(self, other: Parser[_A, _C]) -> Parser[_A, _C]: ... + +class _TupleParser(Parser[_A, _B]): + @overload # type: ignore[override] + def __add__(self, other: _IgnoredParser[_A]) -> _TupleParser[_A, _B]: ... + @overload + def __add__(self, other: Parser[_A, Any]) -> Parser[_A, Any]: ... + +finished: Parser[Any, None] + +def many(p: Parser[_A, _B]) -> Parser[_A, List[_B]]: ... +def some(pred: Callable[[_A], bool]) -> Parser[_A, _A]: ... +def a(value: _A) -> Parser[_A, _A]: ... +def tok(type: Text, value: Optional[Text] = ...) -> Parser[Token, Text]: ... +def pure(x: _A) -> Parser[_A, _A]: ... +def maybe(p: Parser[_A, _B]) -> Parser[_A, Optional[_B]]: ... +def skip(p: Parser[_A, Any]) -> _IgnoredParser[_A]: ... +def oneplus(p: Parser[_A, _B]) -> Parser[_A, List[_B]]: ... +def forward_decl() -> Parser[Any, Any]: ... + +class NoParseError(Exception): + msg: Text + state: State + def __init__(self, msg: Text, state: State) -> None: ... diff --git a/lint_lib/_vendor/funcparserlib/py.typed b/lint_lib/_vendor/funcparserlib/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/lint_lib/_vendor/funcparserlib/util.py b/lint_lib/_vendor/funcparserlib/util.py new file mode 100644 index 00000000..5c9ea51e --- /dev/null +++ b/lint_lib/_vendor/funcparserlib/util.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- + +# Copyright © 2009/2021 Andrey Vlasovskikh +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be included in all copies +# or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +# PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import unicode_literals + + +def pretty_tree(x, kids, show): + """Return a pseudo-graphic tree representation of the object `x` similar to the + `tree` command in Unix. + + Type: `(T, Callable[[T], List[T]], Callable[[T], str]) -> str` + + It applies the parameter `show` (which is a function of type `(T) -> str`) to get a + textual representation of the objects to show. + + It applies the parameter `kids` (which is a function of type `(T) -> List[T]`) to + list the children of the object to show. + + Examples: + + ```pycon + >>> print(pretty_tree( + ... ["foo", ["bar", "baz"], "quux"], + ... lambda obj: obj if isinstance(obj, list) else [], + ... lambda obj: "[]" if isinstance(obj, list) else str(obj), + ... )) + [] + |-- foo + |-- [] + | |-- bar + | `-- baz + `-- quux + + ``` + """ + (MID, END, CONT, LAST, ROOT) = ("|-- ", "`-- ", "| ", " ", "") + + def rec(obj, indent, sym): + line = indent + sym + show(obj) + obj_kids = kids(obj) + if len(obj_kids) == 0: + return line + else: + if sym == MID: + next_indent = indent + CONT + elif sym == ROOT: + next_indent = indent + ROOT + else: + next_indent = indent + LAST + chars = [MID] * (len(obj_kids) - 1) + [END] + lines = [rec(kid, next_indent, sym) for kid, sym in zip(obj_kids, chars)] + return "\n".join([line] + lines) + + return rec(x, "", ROOT) diff --git a/lint_lib/_vendor/funcparserlib/util.pyi b/lint_lib/_vendor/funcparserlib/util.pyi new file mode 100644 index 00000000..cf6a3d48 --- /dev/null +++ b/lint_lib/_vendor/funcparserlib/util.pyi @@ -0,0 +1,7 @@ +from typing import TypeVar, Callable, List, Text + +_A = TypeVar("_A") + +def pretty_tree( + x: _A, kids: Callable[[_A], List[_A]], show: Callable[[_A], Text] +) -> Text: ... diff --git a/lint_lib/_vendor/vendor.txt b/lint_lib/_vendor/vendor.txt new file mode 100644 index 00000000..8af787f1 --- /dev/null +++ b/lint_lib/_vendor/vendor.txt @@ -0,0 +1 @@ +funcparserlib==1.0.1 diff --git a/lint_lib/lint.py b/lint_lib/lint.py new file mode 100644 index 00000000..de4ccd09 --- /dev/null +++ b/lint_lib/lint.py @@ -0,0 +1,280 @@ +import codecs +import contextlib +import io +import json +import os +import re +import sys +from collections import Counter +from os.path import dirname, join, pardir, relpath +from typing import Any, Dict, List, Optional, Set, TypeVar + +from . import parser +from ._vendor.funcparserlib.parser import NoParseError + +text_type = str +binary_type = bytes + +StringLike = TypeVar("StringLike", str, bytes) + +base = join(dirname(__file__), pardir) + +_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?") + + +def clean_path(path: str) -> str: + return relpath(path, base) + + +def is_subsequence(l1: List[StringLike], l2: List[StringLike]) -> bool: + """checks if l1 is a subsequence of l2""" + i = 0 + for x in l2: + if l1[i] == x: + i += 1 + if i == len(l1): + return True + return False + + +def unescape_json(obj: Any) -> Any: + def decode_str(inp): + """Decode \\uXXXX escapes + + This decodes \\uXXXX escapes, possibly into non-BMP characters when + two surrogate character escapes are adjacent to each other. + """ + + # This cannot be implemented using the unicode_escape codec + # because that requires its input be ISO-8859-1, and we need + # arbitrary unicode as input. + def repl(m): + if m.group(2) is not None: + high = int(m.group(1), 16) + low = int(m.group(2), 16) + if ( + 0xD800 <= high <= 0xDBFF + and 0xDC00 <= low <= 0xDFFF + and sys.maxunicode == 0x10FFFF + ): + cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000 + return chr(cp) + else: + return chr(high) + chr(low) + else: + return chr(int(m.group(1), 16)) + + return _surrogateRe.sub(repl, inp) + + if isinstance(obj, dict): + return {decode_str(k): unescape_json(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [unescape_json(x) for x in obj] + elif isinstance(obj, text_type): + return decode_str(obj) + else: + return obj + + +def lint_dat_format( + path: str, + encoding: Optional[str], + first_header: StringLike, + expected_headers: Optional[List[StringLike]] = None, + input_headers: Optional[Set[StringLike]] = None, +) -> List[Dict[StringLike, StringLike]]: + if expected_headers is not None and first_header not in expected_headers: + raise ValueError("First header must be an expected header. (lint config error)") + + if ( + input_headers is not None + and expected_headers is not None + and not (set(input_headers) < set(expected_headers)) + ): + raise ValueError( + "Input header must be a subset of expected headers. (lint config error)" + ) + + if expected_headers is not None and len(set(expected_headers)) < len( + expected_headers + ): + raise ValueError( + "Can't expect a single header multiple times. (lint config error)" + ) + + if input_headers is None: + input_headers = set(expected_headers) + + try: + if encoding is not None: + with codecs.open(path, "r", encoding=encoding) as fp: + dat = fp.read() + parsed = parser.parse(dat, first_header) + else: + with open(path, "rb") as fp: + dat = fp.read() + parsed = parser.parse(dat, first_header) + except NoParseError as e: + print("Parse error in {}, {}".format(path, e)) + return + + seen_items = {} + + for item in parsed: + # Check we don't have duplicate headers within one item. + headers = Counter(x[0] for x in item.data) + headers.subtract(set(headers.elements())) # remove one instance of each + for header in set(headers.elements()): + c = headers[header] + print( + f"Duplicate header {header!r} occurs {c+1} times in one item in {path} at line {item.lineno}" + ) + + item_dict = dict(item.data) + + # Check we only have expected headers. + if expected_headers is not None: + if not is_subsequence( + list(item_dict.keys()), + expected_headers, + ): + unexpected = item_dict.keys() + print( + f"Unexpected item headings in {list(unexpected)!r} in {path} at line {item.lineno}" + ) + + # Check for duplicated items. + if input_headers is not None: + found_input = set() + for input_header in input_headers: + found_input.add((input_header, item_dict.get(input_header))) + else: + found_input = set(item_dict.items()) + + first_line = seen_items.setdefault(frozenset(found_input), item.lineno) + if first_line is not None and first_line != item.lineno: + print( + f"Duplicate item in {path} at line {item.lineno} previously seen on line {first_line}" + ) + + return [dict(x.data) for x in parsed] + + +def lint_encoding_test(path: str) -> None: + parsed = lint_dat_format( + path, + None, + b"data", + expected_headers=[b"data", b"encoding"], + input_headers={b"data"}, + ) + if not parsed: + # We'll already have output if there's a parse error. + return + + # We'd put extra linting here, if we ever have anything specific to the + # encoding tests here. + + +def lint_encoding_tests(path: str) -> None: + for root, dirs, files in os.walk(path): + for file in sorted(files): + if not file.endswith(".dat"): + continue + lint_encoding_test(clean_path(join(root, file))) + + +def lint_tokenizer_test(path: str) -> None: + all_keys = { + "description", + "input", + "output", + "initialStates", + "lastStartTag", + "ignoreErrorOrder", + "doubleEscaped", + "errors", + } + required = {"input", "output"} + with codecs.open(path, "r", "utf-8") as fp: + parsed = json.load(fp) + if not parsed: + return + if not isinstance(parsed, dict): + print("Top-level must be an object in %s" % path) + return + for test_group in parsed.values(): + if not isinstance(test_group, list): + print("Test groups must be a lists in %s" % path) + continue + for test in test_group: + if "doubleEscaped" in test and test["doubleEscaped"] is True: + test = unescape_json(test) + keys = set(test.keys()) + if not (required <= keys): + print( + "missing test properties {!r} in {}".format(required - keys, path) + ) + if not (keys <= all_keys): + print( + "unknown test properties {!r} in {}".format(keys - all_keys, path) + ) + + +def lint_tokenizer_tests(path: str) -> None: + for root, dirs, files in os.walk(path): + for file in sorted(files): + if not file.endswith(".test"): + continue + lint_tokenizer_test(clean_path(join(root, file))) + + +def lint_tree_construction_test(path: str) -> None: + parsed = lint_dat_format( + path, + "utf-8", + "data", + expected_headers=[ + "data", + "errors", + "new-errors", + "document-fragment", + "script-off", + "script-on", + "document", + ], + input_headers={ + "data", + "document-fragment", + "script-on", + "script-off", + }, + ) + if not parsed: + # We'll already have output if there's a parse error. + return + + # We'd put extra linting here, if we ever have anything specific to the + # tree construction tests here. + + +def lint_tree_construction_tests(path: str) -> None: + for root, dirs, files in os.walk(path): + for file in sorted(files): + if not file.endswith(".dat"): + continue + lint_tree_construction_test(clean_path(join(root, file))) + + +def main() -> int: + with contextlib.redirect_stdout(io.StringIO()) as f: + lint_encoding_tests(join(base, "encoding")) + lint_tokenizer_tests(join(base, "tokenizer")) + lint_tree_construction_tests(join(base, "tree-construction")) + + print(f.getvalue(), end="") + return 0 if f.getvalue() == "" else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/lint_lib/parser.py b/lint_lib/parser.py new file mode 100644 index 00000000..d18605a6 --- /dev/null +++ b/lint_lib/parser.py @@ -0,0 +1,177 @@ +import re +from typing import Callable, List, Optional, Tuple, Type, TypeVar, Union + +from ._vendor.funcparserlib.lexer import LexerError, Token +from ._vendor.funcparserlib.parser import ( + NoParseError, + Parser, + _Tuple, + finished, + many, + pure, + skip, + some, + tok, +) + +StringLike = TypeVar("StringLike", str, bytes) + + +class Test: + def __init__( + self, data: List[Tuple[StringLike, StringLike]], lineno: Optional[int] = None + ) -> None: + self.data = data + self.lineno = lineno + + +def _make_tokenizer(specs: List[Tuple[str, Tuple[StringLike]]]) -> Callable: + # Forked from upstream funcparserlib.lexer to fix #46 + def compile_spec(spec): + name, args = spec + return name, re.compile(*args) + + compiled = [compile_spec(s) for s in specs] + + def match_specs(specs, s, i, position): + if isinstance(s, str): + lf = "\n" + else: + lf = b"\n" + line, pos = position + for type, regexp in specs: + m = regexp.match(s, i) + if m is not None: + value = m.group() + nls = value.count(lf) + n_line = line + nls + if nls == 0: + n_pos = pos + len(value) + else: + n_pos = len(value) - value.rfind(lf) - 1 + return Token(type, value, (line, pos + 1), (n_line, n_pos)) + else: + errline = s.splitlines()[line - 1] + raise LexerError((line, pos + 1), errline) + + def f(s): + length = len(s) + line, pos = 1, 0 + i = 0 + while i < length: + t = match_specs(compiled, s, i, (line, pos)) + yield t + line, pos = t.end + i += len(t.value) + + return f + + +_token_specs_u = [ + ("HEADER", (r"[ \t]*#[^\n]*",)), + ("BODY", (r"[^#\n][^\n]*",)), + ("EOL", (r"\n",)), +] + +_token_specs_b = [ + (name, (regexp.encode("ascii"),)) for (name, (regexp,)) in _token_specs_u +] + +_tokenizer_u = _make_tokenizer(_token_specs_u) +_tokenizer_b = _make_tokenizer(_token_specs_b) + + +def _many_merge(toks: _Tuple) -> List[Test]: + x, xs = toks + return [x] + xs + + +def _notFollowedBy(p: Parser) -> Parser: + @Parser + def __notFollowedBy(tokens, s): + try: + p.run(tokens, s) + except NoParseError: + return skip(pure(None)).run(tokens, s) + else: + raise NoParseError("is followed by", s) + + __notFollowedBy.name = "(notFollowedBy {})".format(p) + return __notFollowedBy + + +def _trim_prefix(s: StringLike, prefix: StringLike) -> StringLike: + if s.startswith(prefix): + return s[len(prefix) :] + else: + return s + + +def _make_test(result: _Tuple) -> Test: + first, rest = result + (first_header, first_lineno), first_body = first + return Test([(first_header, first_body)] + rest, lineno=first_lineno) + + +def _parser( + tokens: List[Token], + new_test_header: StringLike, + tok_type: Union[Type[str], Type[bytes]], +) -> List[Test]: + if tok_type is str: + header_prefix = "#" + elif tok_type is bytes: + header_prefix = b"#" + else: + assert False, "unreachable" + + first_header = ( + some( + lambda tok: tok.type == "HEADER" + and tok.value == header_prefix + new_test_header + ) + >> ( + lambda x: ( + _trim_prefix(x.value, header_prefix), + x.start[0] if x.start is not None else None, + ) + ) + ) + skip(tok("EOL")) + + header = ( + some( + lambda tok: tok.type == "HEADER" + and tok.value != header_prefix + new_test_header + ) + >> (lambda x: _trim_prefix(x.value, header_prefix)) + ) + skip(tok("EOL")) + + body = tok("BODY") + tok("EOL") >> (lambda x: x[0] + x[1]) + empty = tok("EOL") + + actual_body = many(body | (empty + skip(_notFollowedBy(first_header)))) >> ( + lambda xs: tok_type().join(xs)[:-1] + ) + + first_segment = first_header + actual_body >> tuple + rest_segment = header + actual_body >> tuple + + test = first_segment + many(rest_segment) >> _make_test + + tests = (test + many(skip(empty) + test)) >> _many_merge + + toplevel = tests + skip(finished) + + return toplevel.parse(tokens) + + +def parse(s: StringLike, new_test_header: StringLike) -> List[Test]: + if type(s) != type(new_test_header): + raise TypeError("s and new_test_header must have same type") + + if isinstance(s, str): + return _parser(list(_tokenizer_u(s)), new_test_header, str) + elif isinstance(s, bytes): + return _parser(list(_tokenizer_b(s)), new_test_header, bytes) + else: + raise TypeError("s must be unicode or bytes object") diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..a68f7874 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,7 @@ +[tool.vendoring] +destination = "lint_lib/_vendor/" +requirements = "lint_lib/_vendor/vendor.txt" +namespace = "lint_lib._vendor" + +protected-files = ["__init__.py", "vendor.txt"] +patches-dir = "lint_lib/_vendor-patches" diff --git a/serializer/core.test b/serializer/core.test index c0b4222d..a6fa0754 100644 --- a/serializer/core.test +++ b/serializer/core.test @@ -112,12 +112,12 @@ "expected": [""] }, -{"description": "HTML 4.01 DOCTYPE without system identifer", +{"description": "HTML 4.01 DOCTYPE without system identifier", "input": [["Doctype", "HTML", "-//W3C//DTD HTML 4.01//EN"]], "expected": [""] }, -{"description": "IBM DOCTYPE without public identifer", +{"description": "IBM DOCTYPE without public identifier", "input": [["Doctype", "html", "", "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"]], "expected": [""] } diff --git a/tokenizer/contentModelFlags.test b/tokenizer/contentModelFlags.test index 5197b68e..9cf7c8bd 100644 --- a/tokenizer/contentModelFlags.test +++ b/tokenizer/contentModelFlags.test @@ -6,6 +6,12 @@ "input":"&body;", "output":[["Character", "&body;"]]}, +{"description":"PLAINTEXT with seeming close tag", +"initialStates":["PLAINTEXT state"], +"lastStartTag":"plaintext", +"input":"&body;", +"output":[["Character", "&body;"]]}, + {"description":"End tag closing RCDATA or RAWTEXT", "initialStates":["RCDATA state", "RAWTEXT state"], "lastStartTag":"xmp", diff --git a/tokenizer/domjs.test b/tokenizer/domjs.test index b17a5df5..1a0824d7 100644 --- a/tokenizer/domjs.test +++ b/tokenizer/domjs.test @@ -25,7 +25,7 @@ ] }, { - "description":"NUL in RCDATA, RAWTEXT, PLAINTEXT and Script data", + "description":"Raw NUL replacement", "doubleEscaped":true, "initialStates":["RCDATA state", "RAWTEXT state", "PLAINTEXT state", "Script data state"], "input":"\\u0000", @@ -34,6 +34,13 @@ { "code": "unexpected-null-character", "line": 1, "col": 1 } ] }, + { + "description":"NUL in CDATA section", + "doubleEscaped":true, + "initialStates":["CDATA section state"], + "input":"\\u0000]]>", + "output":[["Character", "\\u0000"]] + }, { "description":"NUL in script HTML comment", "doubleEscaped":true, @@ -112,20 +119,95 @@ { "code": "eof-in-script-html-comment-like-text", "line": 1, "col": 13 } ] }, + { + "description":"Dash in script HTML comment", + "initialStates":["Script data state"], + "input":"", + "output":[["Character", ""]] + }, + { + "description":"Dash less-than in script HTML comment", + "initialStates":["Script data state"], + "input":"", + "output":[["Character", ""]] + }, + { + "description":"Dash at end of script HTML comment", + "initialStates":["Script data state"], + "input":"", + "output":[["Character", ""]] + }, + { + "description":" in script HTML comment", + "initialStates":["Script data state"], + "lastStartTag":"script", + "input":"", + "output":[["Character", ""], ["EndTag", "script"]] + }, + { + "description":" in script HTML comment - double escaped", + "initialStates":["Script data state"], + "lastStartTag":"script", + "input":"", + "output":[["Character", ""], ["EndTag", "script"]] + }, + { + "description":" in script HTML comment - double escaped with nested -->", + "output":[["Character", ""], ["EndTag", "script"]] + }, + { + "description":" in script HTML comment - double escaped with abrupt end", + "initialStates":["Script data state"], + "lastStartTag":"script", + "input":" -->", + "output":[["Character", ""], ["EndTag", "script"], ["Character", " -->"], ["EndTag", "script"]] + }, + { + "description":"Incomplete start tag in script HTML comment double escaped", + "initialStates":["Script data state"], + "lastStartTag":"script", + "input":"", + "output":[["Character", ""]] + }, + { + "description":"Unclosed start tag in script HTML comment double escaped", + "initialStates":["Script data state"], + "lastStartTag":"script", + "input":"", + "output":[["Character", ""]] + }, + { + "description":"Incomplete end tag in script HTML comment double escaped", + "initialStates":["Script data state"], + "lastStartTag":"script", + "input":"", + "output":[["Character", ""]] + }, + { + "description":"Unclosed end tag in script HTML comment double escaped", + "initialStates":["Script data state"], + "lastStartTag":"script", + "input":"", + "output":[["Character", ""]] + }, { "description":"leading U+FEFF must pass through", + "initialStates":["Data state", "RCDATA state", "RAWTEXT state", "Script data state"], "doubleEscaped":true, "input":"\\uFEFFfoo\\uFEFFbar", "output":[["Character", "\\uFEFFfoo\\uFEFFbar"]] }, { - "description":"Non BMP-charref in in RCDATA", + "description":"Non BMP-charref in RCDATA", "initialStates":["RCDATA state"], "input":"≂̸", "output":[["Character", "\u2242\u0338"]] }, { - "description":"Bad charref in in RCDATA", + "description":"Bad charref in RCDATA", "initialStates":["RCDATA state"], "input":"&NotEqualTild;", "output":[["Character", "&NotEqualTild;"]], @@ -134,36 +216,36 @@ ] }, { - "description":"lowercase endtags in RCDATA and RAWTEXT", - "initialStates":["RCDATA state", "RAWTEXT state"], + "description":"lowercase endtags", + "initialStates":["RCDATA state", "RAWTEXT state", "Script data state"], "lastStartTag":"xmp", "input":"", "output":[["EndTag","xmp"]] }, { - "description":"bad endtag in RCDATA and RAWTEXT", - "initialStates":["RCDATA state", "RAWTEXT state"], + "description":"bad endtag (space before name)", + "initialStates":["RCDATA state", "RAWTEXT state", "Script data state"], "lastStartTag":"xmp", "input":"", "output":[["Character",""]] }, { - "description":"bad endtag in RCDATA and RAWTEXT", - "initialStates":["RCDATA state", "RAWTEXT state"], + "description":"bad endtag (not matching last start tag)", + "initialStates":["RCDATA state", "RAWTEXT state", "Script data state"], "lastStartTag":"xmp", "input":"", "output":[["Character",""]] }, { - "description":"bad endtag in RCDATA and RAWTEXT", - "initialStates":["RCDATA state", "RAWTEXT state"], + "description":"bad endtag (without close bracket)", + "initialStates":["RCDATA state", "RAWTEXT state", "Script data state"], "lastStartTag":"xmp", "input":"", + "initialStates":["CDATA section state"], + "output":[["Character", "foo "]] + }, + { + "description":"CDATA followed by HTML content", + "input":"foo ]]> ", + "initialStates":["CDATA section state"], + "output":[["Character", "foo "]] + }, + { + "description":"CDATA with extra bracket", + "input":"foo]]]>", + "initialStates":["CDATA section state"], + "output":[["Character", "foo]"]] + }, + { + "description":"CDATA without end marker", + "input":"foo", "initialStates":["CDATA section state"], - "output":[["Character", "foo&bar"]], + "output":[["Character", "foo"]], "errors":[ - { "code": "eof-in-cdata", "line": 1, "col": 8 } + { "code": "eof-in-cdata", "line": 1, "col": 4 } ] + }, + { + "description":"CDATA with single bracket ending", + "input":"foo]", + "initialStates":["CDATA section state"], + "output":[["Character", "foo]"]], + "errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 5 } + ] + }, + { + "description":"CDATA with two brackets ending", + "input":"foo]]", + "initialStates":["CDATA section state"], + "output":[["Character", "foo]]"]], + "errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 6 } + ] + }, + { + "description": "HTML tag in script data", + "input": "hello world", + "initialStates": ["Script data state"], + "output": [["Character", "hello world"]] } - ] } diff --git a/tokenizer/entities.test b/tokenizer/entities.test index 7c514563..a6469cd0 100644 --- a/tokenizer/entities.test +++ b/tokenizer/entities.test @@ -1,13 +1,47 @@ {"tests": [ -{"description": "Undefined named entity in attribute value ending in semicolon and whose name starts with a known entity name.", +{"description": "Undefined named entity in a double-quoted attribute value ending in semicolon and whose name starts with a known entity name.", +"input":"", +"output": [["StartTag", "h", {"a": "¬i;"}]]}, + +{"description": "Entity name requiring semicolon instead followed by the equals sign in a double-quoted attribute value.", +"input":"", +"output": [["StartTag", "h", {"a": "&lang="}]]}, + +{"description": "Valid entity name followed by the equals sign in a double-quoted attribute value.", +"input":"", +"output": [["StartTag", "h", {"a": "¬="}]]}, + +{"description": "Undefined named entity in a single-quoted attribute value ending in semicolon and whose name starts with a known entity name.", "input":"", "output": [["StartTag", "h", {"a": "¬i;"}]]}, -{"description": "Entity name followed by the equals sign in an attribute value.", +{"description": "Entity name requiring semicolon instead followed by the equals sign in a single-quoted attribute value.", "input":"", "output": [["StartTag", "h", {"a": "&lang="}]]}, +{"description": "Valid entity name followed by the equals sign in a single-quoted attribute value.", +"input":"", +"output": [["StartTag", "h", {"a": "¬="}]]}, + +{"description": "Undefined named entity in an unquoted attribute value ending in semicolon and whose name starts with a known entity name.", +"input":"", +"output": [["StartTag", "h", {"a": "¬i;"}]]}, + +{"description": "Entity name requiring semicolon instead followed by the equals sign in an unquoted attribute value.", +"input":"", +"output": [["StartTag", "h", {"a": "&lang="}]], +"errors":[ + { "code": "unexpected-character-in-unquoted-attribute-value", "line": 1, "col": 11 } +]}, + +{"description": "Valid entity name followed by the equals sign in an unquoted attribute value.", +"input":"", +"output": [["StartTag", "h", {"a": "¬="}]], +"errors":[ + { "code": "unexpected-character-in-unquoted-attribute-value", "line": 1, "col": 10 } +]}, + {"description": "Ambiguous ampersand.", "input":"&rrrraannddom;", "output": [["Character", "&rrrraannddom;"]], diff --git a/tokenizer/test1.test b/tokenizer/test1.test index 8b85050f..5323fbbe 100644 --- a/tokenizer/test1.test +++ b/tokenizer/test1.test @@ -102,6 +102,10 @@ "input":"", "output":[["Comment", " --comment "]]}, +{"description":"Comment, central less-than bang", +"input":"", +"output":[["Comment", "", "output":[["Comment", ""]], @@ -135,6 +145,22 @@ "input":"", "output":[["Comment", ""]]}, +{"description":"< in comment", +"input":"", +"output":[["Comment", " ", +"output":[["Comment", "<<"]]}, + +{"description":"", +"output":[["Comment", " ", +"output":[["Comment", " ", "output":[["Comment", " ", +"output":[["Comment", " <", +"output":[["Character", ""]]}, + +{"description":"", +"output":[["Character", ""]]}, + +{"description":"", +"output":[["Character", ""]]}, + +{"description":"Escaped script data", +"initialStates":["Script data state"], +"input":"", +"output":[["Character", ""]]}, + +{"description":"< in script HTML comment", +"initialStates":["Script data state"], +"input":"", +"output":[["Character", ""]]}, + +{"description":"", +"output":[["Character", ""]]}, + +{"description":"Start tag in script HTML comment", +"initialStates":["Script data state"], +"input":"", +"output":[["Character", ""]]}, + +{"description":"End tag in script HTML comment", +"initialStates":["Script data state"], +"input":"", +"output":[["Character", ""]]}, + +{"description":"- in script HTML comment double escaped", +"initialStates":["Script data state"], +"input":"", +"output":[["Character", ""]]}, + +{"description":"-- in script HTML comment double escaped", +"initialStates":["Script data state"], +"input":"", +"output":[["Character", ""]]}, + +{"description":"--- in script HTML comment double escaped", +"initialStates":["Script data state"], +"input":"", +"output":[["Character", ""]]}, + +{"description":"- spaced in script HTML comment double escaped", +"initialStates":["Script data state"], +"input":"", +"output":[["Character", ""]]}, + +{"description":"-- spaced in script HTML comment double escaped", +"initialStates":["Script data state"], +"input":"", +"output":[["Character", ""]]}, + {"description":"Ampersand EOF", "input":"&", "output":[["Character", "&"]]}, diff --git a/tokenizer/test2.test b/tokenizer/test2.test index 521694ca..c29e4c31 100644 --- a/tokenizer/test2.test +++ b/tokenizer/test2.test @@ -50,6 +50,10 @@ "input":"", "output":[["DOCTYPE", "html", null, "-//W3C//DTD HTML Transitional 4.01//EN", true]]}, +{"description":"DOCTYPE with single-quoted systemId", +"input":"", +"output":[["DOCTYPE", "html", null, "-//W3C//DTD HTML Transitional 4.01//EN", true]]}, + {"description":"DOCTYPE with publicId and systemId", "input":"", "output":[["DOCTYPE", "html", "-//W3C//DTD HTML Transitional 4.01//EN", "-//W3C//DTD HTML Transitional 4.01//EN", true]]}, @@ -186,7 +190,7 @@ { "code": "unexpected-question-mark-instead-of-tag-name", "line": 1, "col": 2 } ]}, -{"description":"A bogus comment stops at >, even if preceeded by two dashes", +{"description":"A bogus comment stops at >, even if preceded by two dashes", "input":"", "output":[["Comment", "?foo--"]], "errors":[ diff --git a/tokenizer/test3.test b/tokenizer/test3.test index 721f21de..901a581e 100644 --- a/tokenizer/test3.test +++ b/tokenizer/test3.test @@ -1,84 +1,451 @@ {"tests": [ -{"description":"", +{"description":"[empty]", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":"", "output":[]}, +{"description":"[empty]", +"initialStates":["CDATA section state"], +"input":"", +"output":[], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 1 } +]}, + {"description":"\\u0009", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":"\u0009", "output":[["Character", "\u0009"]]}, +{"description":"\\u0009", +"initialStates":["CDATA section state"], +"input":"\u0009", +"output":[["Character", "\u0009"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + {"description":"\\u000A", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":"\u000A", "output":[["Character", "\u000A"]]}, +{"description":"\\u000A", +"initialStates":["CDATA section state"], +"input":"\u000A", +"output":[["Character", "\u000A"]], +"errors":[ + { "code": "eof-in-cdata", "line": 2, "col": 1 } +]}, + {"description":"\\u000B", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":"\u000B", "output":[["Character", "\u000B"]], "errors":[ { "code": "control-character-in-input-stream", "line": 1, "col": 1 } ]}, +{"description":"\\u000B", +"initialStates":["CDATA section state"], +"input":"\u000B", +"output":[["Character", "\u000B"]], +"errors":[ + { "code": "control-character-in-input-stream", "line": 1, "col": 1 }, + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + {"description":"\\u000C", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":"\u000C", "output":[["Character", "\u000C"]]}, +{"description":"\\u000C", +"initialStates":["CDATA section state"], +"input":"\u000C", +"output":[["Character", "\u000C"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + {"description":" ", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":" ", "output":[["Character", " "]]}, +{"description":" ", +"initialStates":["CDATA section state"], +"input":" ", +"output":[["Character", " "]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + {"description":"!", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":"!", "output":[["Character", "!"]]}, +{"description":"!", +"initialStates":["CDATA section state"], +"input":"!", +"output":[["Character", "!"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + {"description":"\"", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":"\"", "output":[["Character", "\""]]}, +{"description":"\"", +"initialStates":["CDATA section state"], +"input":"\"", +"output":[["Character", "\""]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + {"description":"%", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":"%", "output":[["Character", "%"]]}, +{"description":"%", +"initialStates":["CDATA section state"], +"input":"%", +"output":[["Character", "%"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + {"description":"&", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":"&", "output":[["Character", "&"]]}, +{"description":"&", +"initialStates":["CDATA section state"], +"input":"&", +"output":[["Character", "&"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + {"description":"'", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":"'", "output":[["Character", "'"]]}, +{"description":"'", +"initialStates":["CDATA section state"], +"input":"'", +"output":[["Character", "'"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + {"description":",", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":",", "output":[["Character", ","]]}, +{"description":",", +"initialStates":["CDATA section state"], +"input":",", +"output":[["Character", ","]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + {"description":"-", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":"-", "output":[["Character", "-"]]}, +{"description":"-", +"initialStates":["CDATA section state"], +"input":"-", +"output":[["Character", "-"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + {"description":".", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":".", "output":[["Character", "."]]}, +{"description":".", +"initialStates":["CDATA section state"], +"input":".", +"output":[["Character", "."]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + {"description":"/", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":"/", "output":[["Character", "/"]]}, +{"description":"/", +"initialStates":["CDATA section state"], +"input":"/", +"output":[["Character", "/"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + {"description":"0", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":"0", "output":[["Character", "0"]]}, +{"description":"0", +"initialStates":["CDATA section state"], +"input":"0", +"output":[["Character", "0"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + {"description":"1", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":"1", "output":[["Character", "1"]]}, +{"description":"1", +"initialStates":["CDATA section state"], +"input":"1", +"output":[["Character", "1"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + {"description":"9", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":"9", "output":[["Character", "9"]]}, +{"description":"9", +"initialStates":["CDATA section state"], +"input":"9", +"output":[["Character", "9"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + {"description":";", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], "input":";", "output":[["Character", ";"]]}, +{"description":";", +"initialStates":["CDATA section state"], +"input":";", +"output":[["Character", ";"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 2 } +]}, + +{"description":";=", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], +"input":";=", +"output":[["Character", ";="]]}, + +{"description":";=", +"initialStates":["CDATA section state"], +"input":";=", +"output":[["Character", ";="]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 3 } +]}, + +{"description":";>", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], +"input":";>", +"output":[["Character", ";>"]]}, + +{"description":";>", +"initialStates":["CDATA section state"], +"input":";>", +"output":[["Character", ";>"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 3 } +]}, + +{"description":";?", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], +"input":";?", +"output":[["Character", ";?"]]}, + +{"description":";?", +"initialStates":["CDATA section state"], +"input":";?", +"output":[["Character", ";?"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 3 } +]}, + +{"description":";@", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], +"input":";@", +"output":[["Character", ";@"]]}, + +{"description":";@", +"initialStates":["CDATA section state"], +"input":";@", +"output":[["Character", ";@"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 3 } +]}, + +{"description":";A", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], +"input":";A", +"output":[["Character", ";A"]]}, + +{"description":";A", +"initialStates":["CDATA section state"], +"input":";A", +"output":[["Character", ";A"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 3 } +]}, + +{"description":";B", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], +"input":";B", +"output":[["Character", ";B"]]}, + +{"description":";B", +"initialStates":["CDATA section state"], +"input":";B", +"output":[["Character", ";B"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 3 } +]}, + +{"description":";Y", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], +"input":";Y", +"output":[["Character", ";Y"]]}, + +{"description":";Y", +"initialStates":["CDATA section state"], +"input":";Y", +"output":[["Character", ";Y"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 3 } +]}, + +{"description":";Z", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], +"input":";Z", +"output":[["Character", ";Z"]]}, + +{"description":";Z", +"initialStates":["CDATA section state"], +"input":";Z", +"output":[["Character", ";Z"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 3 } +]}, + +{"description":";`", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], +"input":";`", +"output":[["Character", ";`"]]}, + +{"description":";`", +"initialStates":["CDATA section state"], +"input":";`", +"output":[["Character", ";`"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 3 } +]}, + +{"description":";a", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], +"input":";a", +"output":[["Character", ";a"]]}, + +{"description":";a", +"initialStates":["CDATA section state"], +"input":";a", +"output":[["Character", ";a"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 3 } +]}, + +{"description":";b", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], +"input":";b", +"output":[["Character", ";b"]]}, + +{"description":";b", +"initialStates":["CDATA section state"], +"input":";b", +"output":[["Character", ";b"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 3 } +]}, + +{"description":";y", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], +"input":";y", +"output":[["Character", ";y"]]}, + +{"description":";y", +"initialStates":["CDATA section state"], +"input":";y", +"output":[["Character", ";y"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 3 } +]}, + +{"description":";z", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], +"input":";z", +"output":[["Character", ";z"]]}, + +{"description":";z", +"initialStates":["CDATA section state"], +"input":";z", +"output":[["Character", ";z"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 3 } +]}, + +{"description":";{", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], +"input":";{", +"output":[["Character", ";{"]]}, + +{"description":";{", +"initialStates":["CDATA section state"], +"input":";{", +"output":[["Character", ";{"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 3 } +]}, + +{"description":";\\uDBC0\\uDC00", +"initialStates":["Data state", "PLAINTEXT state", "RCDATA state", "RAWTEXT state", "Script data state"], +"input":";\uDBC0\uDC00", +"output":[["Character", ";\uDBC0\uDC00"]]}, + +{"description":";\\uDBC0\\uDC00", +"initialStates":["CDATA section state"], +"input":";\uDBC0\uDC00", +"output":[["Character", ";\uDBC0\uDC00"]], +"errors":[ + { "code": "eof-in-cdata", "line": 1, "col": 4 } +]}, + {"description":"<", "input":"<", "output":[["Character", "<"]], @@ -958,28 +1325,28 @@ "input":"BAZ FOOBAZ #errors (1,3): expected-doctype-but-got-chars -(1,15): unexpected-char-in-comment -(1,24): unexpected-char-in-comment #document | | @@ -86,8 +85,6 @@ FOOBAZ FOOBAZ #errors (1,3): expected-doctype-but-got-chars -(1,15): unexpected-char-in-comment -(1,24): unexpected-char-in-comment (1,31): unexpected-bang-after-double-dash-in-comment #new-errors (1:32) incorrectly-closed-comment @@ -103,9 +100,6 @@ FOOBAZ FOOBAZ FOOBAZ #errors (1,3): expected-doctype-but-got-chars -(1,10): unexpected-dash-after-double-dash-in-comment #document | | diff --git a/tree-construction/doctype01.dat b/tree-construction/doctype01.dat index c845becf..9efdaf70 100644 --- a/tree-construction/doctype01.dat +++ b/tree-construction/doctype01.dat @@ -34,7 +34,6 @@ #data Hello #errors -(1,9): need-space-after-doctype (1,10): expected-doctype-name-but-got-right-bracket (1,10): unknown-doctype #new-errors @@ -337,6 +336,7 @@ Hello #errors +(2,43): unknown-doctype #document | | @@ -421,6 +421,7 @@ #errors (1,50): unexpected-char-in-doctype +(1,89): unknown-doctype #new-errors (1:50) missing-whitespace-between-doctype-public-and-system-identifiers #document @@ -433,6 +434,7 @@ #errors (1,50): unexpected-char-in-doctype +(1,89): unknown-doctype #new-errors (1:50) missing-whitespace-between-doctype-public-and-system-identifiers #document @@ -446,6 +448,7 @@ #errors (1,21): unexpected-char-in-doctype (1,49): unexpected-char-in-doctype +(1,88): unknown-doctype #new-errors (1:22) missing-whitespace-after-doctype-public-keyword (1:49) missing-whitespace-between-doctype-public-and-system-identifiers @@ -460,6 +463,7 @@ #errors (1,21): unexpected-char-in-doctype (1,49): unexpected-char-in-doctype +(1,88): unknown-doctype #new-errors (1:22) missing-whitespace-after-doctype-public-keyword (1:49) missing-whitespace-between-doctype-public-and-system-identifiers diff --git a/tree-construction/entities02.dat b/tree-construction/entities02.dat index 0c6e898c..74965a35 100644 --- a/tree-construction/entities02.dat +++ b/tree-construction/entities02.dat @@ -45,7 +45,6 @@ #data
#errors -(1,15): named-entity-without-semicolon (1,20): expected-doctype-but-got-start-tag #document | @@ -204,7 +203,6 @@ #data
#errors -(1,18): named-entity-without-semicolon (1,23): expected-doctype-but-got-start-tag #document | @@ -299,6 +297,8 @@ #data
ZZÆ=
#errors +(1,5): expected-doctype-but-got-start-tag +(1:14) missing-semicolon-after-character-reference #new-errors (1:14) missing-semicolon-after-character-reference #document diff --git a/tree-construction/foreign-fragment.dat b/tree-construction/foreign-fragment.dat index c81ae817..e562c6b8 100644 --- a/tree-construction/foreign-fragment.dat +++ b/tree-construction/foreign-fragment.dat @@ -3,11 +3,10 @@ #errors 6: HTML start tag “nobr” in a foreign namespace context. 7: End of file seen and there were open elements. -6: Unclosed element “nobr”. #document-fragment svg path #document -| +| | "X" #data @@ -17,7 +16,7 @@ svg path #document-fragment svg path #document -| +| | color="" | "X" @@ -35,7 +34,6 @@ svg path #errors 10: End tag “path” did not match the name of the current open element (“g”). 11: End of file seen and there were open elements. -3: Unclosed element “g”. #document-fragment svg path #document @@ -173,7 +171,6 @@ math ms #errors 51: Self-closing syntax (“/>”) used on a non-void HTML element. Ignoring the slash and treating as a start tag. 52: End of file seen and there were open elements. -51: Unclosed element “ms”. #new-errors (1:44-1:49) non-void-html-element-start-tag-with-trailing-solidus #document-fragment @@ -216,7 +213,6 @@ math ms #errors 51: Self-closing syntax (“/>”) used on a non-void HTML element. Ignoring the slash and treating as a start tag. 52: End of file seen and there were open elements. -51: Unclosed element “mn”. #new-errors (1:44-1:49) non-void-html-element-start-tag-with-trailing-solidus #document-fragment @@ -259,7 +255,6 @@ math mn #errors 51: Self-closing syntax (“/>”) used on a non-void HTML element. Ignoring the slash and treating as a start tag. 52: End of file seen and there were open elements. -51: Unclosed element “mo”. #new-errors (1:44-1:49) non-void-html-element-start-tag-with-trailing-solidus #document-fragment @@ -302,7 +297,6 @@ math mo #errors 51: Self-closing syntax (“/>”) used on a non-void HTML element. Ignoring the slash and treating as a start tag. 52: End of file seen and there were open elements. -51: Unclosed element “mi”. #new-errors (1:44-1:49) non-void-html-element-start-tag-with-trailing-solidus #document-fragment @@ -345,7 +339,6 @@ math mi #errors 51: Self-closing syntax (“/>”) used on a non-void HTML element. Ignoring the slash and treating as a start tag. 52: End of file seen and there were open elements. -51: Unclosed element “mtext”. #new-errors (1:44-1:52) non-void-html-element-start-tag-with-trailing-solidus #document-fragment @@ -390,7 +383,7 @@ math mtext #document-fragment math annotation-xml #document -| +|
#data
@@ -407,7 +400,7 @@ math annotation-xml #document-fragment math math #document -| +|
#data
@@ -461,12 +454,11 @@ svg desc

X

#errors 5: HTML start tag “div” in a foreign namespace context. -9: HTML start tag “h1” in a foreign namespace context. #document-fragment svg svg #document -| -| +|
+|

| "X" #data @@ -476,7 +468,7 @@ svg svg #document-fragment svg svg #document -| +|
#data
@@ -486,14 +478,6 @@ svg desc #document |
-#data -
-#errors -#document-fragment -svg desc -#document -|
- #data <foo> #errors @@ -557,3 +541,105 @@ svg desc svg desc #document | "X" + +#data +<svg><p> +#errors +8: HTML start tag “p” in a foreign namespace context. +#document-fragment +div +#document +| <svg svg> +| <p> + +#data +<p> +#errors +3: HTML start tag “p” in a foreign namespace context. +#document-fragment +svg svg +#document +| <p> + +#data +<svg></p><foo> +#errors +9: HTML end tag “p” in a foreign namespace context. +(1:6) Unexpected </p> from in body insertion mode +(1:15) Unexpected EOF +#document-fragment +div +#document +| <svg svg> +| <p> +| <foo> + +#data +<svg></br><foo> +#errors +10: HTML end tag “br” in a foreign namespace context. +(1:6) Unexpected </br> from in body insertion mode +(1:16) Unexpected EOF +#document-fragment +div +#document +| <svg svg> +| <br> +| <foo> + +#data +</p><foo> +#errors +4: HTML end tag “p” in a foreign namespace context. +(1:1) Unexpected </p> from in body insertion mode +(1:10) Unexpected EOF +#document-fragment +svg svg +#document +| <p> +| <svg foo> + +#data +</br><foo> +#errors +5: HTML end tag “br” in a foreign namespace context. +(1:1) Unexpected </br> from in body insertion mode +(1:11) Unexpected EOF +#document-fragment +svg svg +#document +| <br> +| <svg foo> + +#data +<body><foo> +#errors +6: HTML start tag “body” in a foreign namespace context. +(1:1) Unexpected <body> from in body insertion mode +(1:12) Unexpected EOF +#document-fragment +svg svg +#document +| <svg foo> + +#data +<p><foo> +#errors +3: HTML start tag “p” in a foreign namespace context. +(1:9) Unexpected EOF +#document-fragment +svg svg +#document +| <p> +| <foo> + +#data +<p></p><foo> +#errors +3: HTML start tag “p” in a foreign namespace context. +(1:13) Unexpected EOF +#document-fragment +svg svg +#document +| <p> +| <svg foo> diff --git a/tree-construction/html5test-com.dat b/tree-construction/html5test-com.dat index f7380101..48d0bf95 100644 --- a/tree-construction/html5test-com.dat +++ b/tree-construction/html5test-com.dat @@ -142,7 +142,6 @@ #data <!--foo--bar--> #errors -(1,10): unexpected-char-in-comment (1,15): expected-doctype-but-got-eof #document | <!-- foo--bar --> diff --git a/tree-construction/math.dat b/tree-construction/math.dat index ae9cd7c6..d6a8ae56 100644 --- a/tree-construction/math.dat +++ b/tree-construction/math.dat @@ -1,6 +1,8 @@ #data <math><tr><td><mo><tr> #errors +(1,22): unexpected-start-tag +(1,23): expected-closing-tag-but-got-eof #document-fragment td #document @@ -12,6 +14,9 @@ td #data <math><tr><td><mo><tr> #errors +(1,6): foster-parenting-start-tag +(1,22): expected-tr-in-table-scope +(1,23): expected-closing-tag-but-got-eof #document-fragment tr #document @@ -23,6 +28,9 @@ tr #data <math><thead><mo><tbody> #errors +(1,6): foster-parenting-start-tag +(1,24): expected-table-part-in-table-scope +(1,25): expected-closing-tag-but-got-eof #document-fragment thead #document @@ -33,6 +41,9 @@ thead #data <math><tfoot><mo><tbody> #errors +(1,6): foster-parenting-start-tag +(1,24): expected-table-part-in-table-scope +(1,25): expected-closing-tag-but-got-eof #document-fragment tfoot #document @@ -43,6 +54,9 @@ tfoot #data <math><tbody><mo><tfoot> #errors +(1,6): foster-parenting-start-tag +(1,24): expected-table-part-in-table-scope +(1,25): expected-closing-tag-but-got-eof #document-fragment tbody #document @@ -53,6 +67,9 @@ tbody #data <math><tbody><mo></table> #errors +(1,6): foster-parenting-start-tag +(1,25): unexpected-end-tag-in-math +(1,26): expected-closing-tag-but-got-eof #document-fragment tbody #document @@ -63,6 +80,9 @@ tbody #data <math><thead><mo></table> #errors +(1,6): foster-parenting-start-tag +(1,25): unexpected-end-tag-in-math +(1,26): expected-closing-tag-but-got-eof #document-fragment tbody #document @@ -73,6 +93,9 @@ tbody #data <math><tfoot><mo></table> #errors +(1,6): foster-parenting-start-tag +(1,25): unexpected-end-tag-in-math +(1,26): expected-closing-tag-but-got-eof #document-fragment tbody #document diff --git a/tree-construction/menuitem-element.dat b/tree-construction/menuitem-element.dat index 43aa0c67..fb13c3c3 100644 --- a/tree-construction/menuitem-element.dat +++ b/tree-construction/menuitem-element.dat @@ -3,7 +3,6 @@ #errors 10: Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”. 10: End of file seen and there were open elements. -10: Unclosed element “menuitem”. #document | <html> | <head> @@ -24,7 +23,6 @@ <!DOCTYPE html><body><menuitem>A #errors 32: End of file seen and there were open elements. -31: Unclosed element “menuitem”. #document | <!DOCTYPE html> | <html> @@ -37,8 +35,6 @@ <!DOCTYPE html><body><menuitem>A<menuitem>B #errors 43: End of file seen and there were open elements. -42: Unclosed element “menuitem”. -31: Unclosed element “menuitem”. #document | <!DOCTYPE html> | <html> @@ -53,7 +49,6 @@ <!DOCTYPE html><body><menuitem>A<menu>B</menu> #errors 46: End of file seen and there were open elements. -31: Unclosed element “menuitem”. #document | <!DOCTYPE html> | <html> @@ -68,7 +63,6 @@ <!DOCTYPE html><body><menuitem>A<hr>B #errors 37: End of file seen and there were open elements. -31: Unclosed element “menuitem”. #document | <!DOCTYPE html> | <html> @@ -83,7 +77,6 @@ <!DOCTYPE html><li><menuitem><li> #errors 33: End tag “li” implied, but there were open elements. -29: Unclosed element “menuitem”. #document | <!DOCTYPE html> | <html> @@ -98,7 +91,6 @@ #errors 39: Stray end tag “menuitem”. 40: End of file seen and there were open elements. -25: Unclosed element “menuitem”. #document | <!DOCTYPE html> | <html> @@ -112,9 +104,7 @@ <!DOCTYPE html><p><b></p><menuitem> #errors 25: End tag “p” seen, but there were open elements. -21: Unclosed element “b”. 35: End of file seen and there were open elements. -35: Unclosed element “menuitem”. #document | <!DOCTYPE html> | <html> @@ -129,7 +119,6 @@ <!DOCTYPE html><menuitem><asdf></menuitem>x #errors 42: End tag “menuitem” seen, but there were open elements. -31: Unclosed element “asdf”. #document | <!DOCTYPE html> | <html> @@ -184,7 +173,6 @@ <!DOCTYPE html><option><menuitem> #errors 33: End of file seen and there were open elements. -33: Unclosed element “menuitem”. #document | <!DOCTYPE html> | <html> @@ -197,7 +185,6 @@ <!DOCTYPE html><menuitem><option> #errors 33: End of file seen and there were open elements. -25: Unclosed element “menuitem”. #document | <!DOCTYPE html> | <html> @@ -210,7 +197,6 @@ <!DOCTYPE html><menuitem></body> #errors 32: End tag for “body” seen, but there were unclosed elements. -25: Unclosed element “menuitem”. #document | <!DOCTYPE html> | <html> @@ -222,7 +208,6 @@ <!DOCTYPE html><menuitem></html> #errors 32: End tag for “html” seen, but there were unclosed elements. -25: Unclosed element “menuitem”. #document | <!DOCTYPE html> | <html> @@ -234,7 +219,6 @@ <!DOCTYPE html><menuitem><p> #errors 28: End of file seen and there were open elements. -25: Unclosed element “menuitem”. #document | <!DOCTYPE html> | <html> @@ -247,7 +231,6 @@ <!DOCTYPE html><menuitem><li> #errors 29: End of file seen and there were open elements. -25: Unclosed element “menuitem”. #document | <!DOCTYPE html> | <html> diff --git a/tree-construction/namespace-sensitivity.dat b/tree-construction/namespace-sensitivity.dat index ca35c0e7..050dca75 100644 --- a/tree-construction/namespace-sensitivity.dat +++ b/tree-construction/namespace-sensitivity.dat @@ -1,6 +1,12 @@ #data <body><table><tr><td><svg><td><foreignObject><span></td>Foo #errors +(1,6): expected-doctype-but-got-start-tag +(1,56): unexpected-end-tag +(1,60): foster-parenting-character +(1,60): foster-parenting-character +(1,60): foster-parenting-character +(1,60): expected-closing-tag-but-got-eof #document | <html> | <head> diff --git a/tree-construction/plain-text-unsafe.dat b/tree-construction/plain-text-unsafe.dat index dfb5cb63..e904eff0 100644 Binary files a/tree-construction/plain-text-unsafe.dat and b/tree-construction/plain-text-unsafe.dat differ diff --git a/tree-construction/quirks01.dat b/tree-construction/quirks01.dat new file mode 100644 index 00000000..bc58de5c --- /dev/null +++ b/tree-construction/quirks01.dat @@ -0,0 +1,53 @@ +#data +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN" +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"><p><table> +#errors +(2,54): unknown-doctype +(2,64): eof-in-table +#document +| <!DOCTYPE html "-//W3C//DTD XHTML 1.0 Frameset//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"> +| <html> +| <head> +| <body> +| <p> +| <table> + +#data +<!DOCTYPE html SYSTEM "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"><p><table> +#errors +(1,83): unknown-doctype +(1,93): eof-in-table +#document +| <!DOCTYPE html "" "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"> +| <html> +| <head> +| <body> +| <p> +| <table> + +#data +<!DOCTYPE html PUBLIC "html"><p><table> +#errors +(1,30): unknown-doctype +(1,39): eof-in-table +#document +| <!DOCTYPE html "html" ""> +| <html> +| <head> +| <body> +| <p> +| <table> + +#data +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN" + "http://www.w3.org/TR/html4/strict.dtd"><p><table> +#errors +(2,43): unknown-doctype +(2,53): eof-in-table +#document +| <!DOCTYPE html "-//W3C//DTD HTML 3.2//EN" "http://www.w3.org/TR/html4/strict.dtd"> +| <html> +| <head> +| <body> +| <p> +| <table> diff --git a/tree-construction/ruby.dat b/tree-construction/ruby.dat index 696782f0..f4e5e4e4 100644 --- a/tree-construction/ruby.dat +++ b/tree-construction/ruby.dat @@ -203,6 +203,7 @@ <html><ruby>a<rtc>b<span></ruby></html> #errors (1,6): expected-doctype-but-got-start-tag +(1,32): unexpected-end-tag #document | <html> | <head> diff --git a/tree-construction/scriptdata01.dat b/tree-construction/scriptdata01.dat index e5708589..6abcb657 100644 --- a/tree-construction/scriptdata01.dat +++ b/tree-construction/scriptdata01.dat @@ -172,19 +172,6 @@ FOO<script>'<!-->'</script>BAR | "'<!-->'" | "BAR" -#data -FOO<script>'<!-->'</script>BAR -#errors -(1,3): expected-doctype-but-got-chars -#document -| <html> -| <head> -| <body> -| "FOO" -| <script> -| "'<!-->'" -| "BAR" - #data FOO<script>'<!-- potato'</script>BAR #errors diff --git a/tree-construction/search-element.dat b/tree-construction/search-element.dat new file mode 100644 index 00000000..2866d7ec --- /dev/null +++ b/tree-construction/search-element.dat @@ -0,0 +1,46 @@ +#data +<!doctype html><p>foo<search>bar<p>baz +#errors +(1,38): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| "foo" +| <search> +| "bar" +| <p> +| "baz" + +#data +<!doctype html><search><p>foo</search>bar +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <search> +| <p> +| "foo" +| "bar" + +#data +<!DOCTYPE html>xxx<svg><x><g><a><search><b> +#errors + * (1,44) unexpected HTML-like start tag token in foreign content + * (1,44) unexpected end of file +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| "xxx" +| <svg svg> +| <svg x> +| <svg g> +| <svg a> +| <svg search> +| <b> diff --git a/tree-construction/svg.dat b/tree-construction/svg.dat new file mode 100644 index 00000000..a452e7af --- /dev/null +++ b/tree-construction/svg.dat @@ -0,0 +1,104 @@ +#data +<svg><tr><td><title><tr> +#errors +(1:21) Unexpected <tr> tag +(1:25) Unexpected EOF +#document-fragment +td +#document +| <svg svg> +| <svg tr> +| <svg td> +| <svg title> + +#data +<svg><tr><td><title><tr> +#errors +(1:1) Unexpected <svg> tag +(1:21) Unexpected <tr> tag +(1:25) Unexpected EOF +#document-fragment +tr +#document +| <svg svg> +| <svg tr> +| <svg td> +| <svg title> + +#data +<svg><thead><title><tbody> +#errors +(1:1) Unexpected <svg> tag +(1:20) Unexpected <tbody> tag +(1:27) Unexpected EOF +#document-fragment +thead +#document +| <svg svg> +| <svg thead> +| <svg title> + +#data +<svg><tfoot><title><tbody> +#errors +(1:1) Unexpected <svg> tag +(1:20) Unexpected <tbody> tag +(1:27) Unexpected EOF +#document-fragment +tfoot +#document +| <svg svg> +| <svg tfoot> +| <svg title> + +#data +<svg><tbody><title><tfoot> +#errors +(1:1) Unexpected <svg> tag +(1:20) Unexpected <tfoot> tag +(1:27) Unexpected EOF +#document-fragment +tbody +#document +| <svg svg> +| <svg tbody> +| <svg title> + +#data +<svg><tbody><title></table> +#errors +(1:1) Unexpected <svg> tag +(1:20) Unexpected </table> tag +(1:28) Unexpected EOF +#document-fragment +tbody +#document +| <svg svg> +| <svg tbody> +| <svg title> + +#data +<svg><thead><title></table> +#errors +(1:1) Unexpected <svg> tag +(1:20) Unexpected </table> tag +(1:28) Unexpected EOF +#document-fragment +tbody +#document +| <svg svg> +| <svg thead> +| <svg title> + +#data +<svg><tfoot><title></table> +#errors +(1:1) Unexpected <svg> tag +(1:20) Unexpected </table> tag +(1:28) Unexpected EOF +#document-fragment +tbody +#document +| <svg svg> +| <svg tfoot> +| <svg title> diff --git a/tree-construction/tables01.dat b/tree-construction/tables01.dat index f0caaa3c..aa7915eb 100644 --- a/tree-construction/tables01.dat +++ b/tree-construction/tables01.dat @@ -284,3 +284,39 @@ | <svg svg> | <svg desc> | <td> + +#data +<div><table><svg><foreignObject><select><table><s> +#errors +1:1: Expected a doctype token +1:13: 'svg' tag isn't allowed here. Currently open tags: html, body, div, table. +1:33: 'select' tag isn't allowed here. Currently open tags: html, body, div, table, svg, foreignobject. +1:41: 'table' tag isn't allowed here. Currently open tags: html, body, div, table, svg, foreignobject, select. +1:41: 'table' tag isn't allowed here. Currently open tags: html, body, div, table, svg, foreignobject. +1:48: 's' tag isn't allowed here. Currently open tags: html, body, div, table. +1:51: Premature end of file. Currently open tags: html, body, div, table, s. +#document +| <html> +| <head> +| <body> +| <div> +| <svg svg> +| <svg foreignObject> +| <select> +| <table> +| <s> +| <table> + +#data +<table>a<!doctype html> +#errors +(1,1): expected-doctype-but-got-start-tag +(1,8): illegal-character-token +(1,9): illegal-doctype +(1,24): expected-closing-tag-but-got-eof +#document +| <html> +| <head> +| <body> +| "a" +| <table> diff --git a/tree-construction/template.dat b/tree-construction/template.dat index b38d4f58..45fb507c 100644 --- a/tree-construction/template.dat +++ b/tree-construction/template.dat @@ -867,21 +867,6 @@ no doctype | <link> | <td> -#data -<body><template><template><tr></tr></template><td></td></template> -#errors -no doctype -#document -| <html> -| <head> -| <body> -| <template> -| content -| <template> -| content -| <tr> -| <td> - #data <body><table><colgroup><template><col></col></template></colgroup></table></body> #errors @@ -1089,7 +1074,11 @@ eof in template <body><template><col>Hello #errors no doctype -unexpected text +(1,27): foster-parenting-character +(1,27): foster-parenting-character +(1,27): foster-parenting-character +(1,27): foster-parenting-character +(1,27): foster-parenting-character eof in template #document | <html> @@ -1103,7 +1092,7 @@ eof in template <body><template><i><menu>Foo</i> #errors no doctype -mising /menu +missing /menu eof in template #document | <html> @@ -1568,6 +1557,19 @@ no doctype | "Foo" | <body> +#data +<html><head></head><template></template><head> +#errors +no doctype +template-after-head +head-after-head +#document +| <html> +| <head> +| <template> +| content +| <body> + #data <!DOCTYPE HTML><dummy><table><template><table><template><table><script> #errors @@ -1593,6 +1595,11 @@ eof table #data <template><a><table><a> #errors +(1,10): expected-doctype-but-got-start-tag +(1,23): foster-parenting-start-tag +(1,23): unexpected-start-tag +(1,23): formatting-element-not-in-scope +(1,24): eof-in-template #document | <html> | <head> @@ -1602,3 +1609,65 @@ eof table | <a> | <table> | <body> + +#data +<template><form><input name="q"></form><div>second</div></template> +#errors +#document-fragment +template +#document +| <template> +| content +| <form> +| <input> +| name="q" +| <div> +| "second" + +#data +<!DOCTYPE HTML><template><tr><td>cell</td></tr></template> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <template> +| content +| <tr> +| <td> +| "cell" +| <body> + +#data +<!DOCTYPE HTML><template> <tr> <td>cell</td> </tr> </template> +#errors +#document +| <!DOCTYPE html> +| <html> +| <head> +| <template> +| content +| " " +| <tr> +| " " +| <td> +| "cell" +| " " +| " " +| <body> + +#data +<!DOCTYPE HTML><template><tr><td>cell</td></tr>a</template> +#errors +(1,59): foster-parenting-character +#document +| <!DOCTYPE html> +| <html> +| <head> +| <template> +| content +| <tr> +| <td> +| "cell" +| "a" +| <body> diff --git a/tree-construction/tests1.dat b/tree-construction/tests1.dat index 1c36c1b8..e80e6401 100644 --- a/tree-construction/tests1.dat +++ b/tree-construction/tests1.dat @@ -425,7 +425,6 @@ Line1<br>Line2<br>Line3<br>Line4 #data <!-----><font><div>hello<table>excite!<b>me!<th><i>please!</tr><!--X--> #errors -(1,7): unexpected-dash-after-double-dash-in-comment (1,14): expected-doctype-but-got-start-tag (1,41): unexpected-start-tag-implies-table-voodoo (1,48): foster-parenting-character-in-table @@ -1434,24 +1433,6 @@ Line1<br>Line2<br>Line3<br>Line4 | <meta> | <p> -#data -<b><table><td><i></table> -#errors -(1,3): expected-doctype-but-got-start-tag -(1,14): unexpected-cell-in-table-body -(1,25): unexpected-cell-end-tag -(1,25): expected-closing-tag-but-got-eof -#document -| <html> -| <head> -| <body> -| <b> -| <table> -| <tbody> -| <tr> -| <td> -| <i> - #data <b><table><td></b><i></table> #errors @@ -1548,19 +1529,6 @@ Line1<br>Line2<br>Line3<br>Line4 | <p> | <p> -#data -<p><hr></p> -#errors -(1,3): expected-doctype-but-got-start-tag -(1,11): unexpected-end-tag -#document -| <html> -| <head> -| <body> -| <p> -| <hr> -| <p> - #data <select><b><option><select><option></b></select> #errors diff --git a/tree-construction/tests16.dat b/tree-construction/tests16.dat index cea7340a..05f34c13 100644 --- a/tree-construction/tests16.dat +++ b/tree-construction/tests16.dat @@ -221,7 +221,6 @@ <!doctype html><script><! #errors (1,25): expected-script-data-but-got-eof -(1,25): expected-named-closing-tag-but-got-eof #document | <!DOCTYPE html> | <html> @@ -1525,7 +1524,6 @@ #errors (1,8): expected-doctype-but-got-start-tag (1,10): expected-script-data-but-got-eof -(1,10): expected-named-closing-tag-but-got-eof #document | <html> | <head> diff --git a/tree-construction/tests18.dat b/tree-construction/tests18.dat index 05363b39..0b6d5dc4 100644 --- a/tree-construction/tests18.dat +++ b/tree-construction/tests18.dat @@ -3,7 +3,6 @@ #errors 11: Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”. 23: End of file seen and there were open elements. -11: Unclosed element “plaintext”. #document | <html> | <head> @@ -27,7 +26,6 @@ <!doctype html><html><plaintext></plaintext> #errors 44: End of file seen and there were open elements. -32: Unclosed element “plaintext”. #document | <!DOCTYPE html> | <html> @@ -40,7 +38,6 @@ <!doctype html><head><plaintext></plaintext> #errors 44: End of file seen and there were open elements. -32: Unclosed element “plaintext”. #document | <!DOCTYPE html> | <html> @@ -54,7 +51,6 @@ #errors 42: Bad start tag in “plaintext” in “head”. 54: End of file seen and there were open elements. -42: Unclosed element “plaintext”. #script-off #document | <!DOCTYPE html> @@ -69,7 +65,6 @@ <!doctype html></head><plaintext></plaintext> #errors 45: End of file seen and there were open elements. -33: Unclosed element “plaintext”. #document | <!DOCTYPE html> | <html> @@ -82,7 +77,6 @@ <!doctype html><body><plaintext></plaintext> #errors 44: End of file seen and there were open elements. -32: Unclosed element “plaintext”. #document | <!DOCTYPE html> | <html> @@ -95,8 +89,19 @@ <!doctype html><table><plaintext></plaintext> #errors (1,33): foster-parenting-start-tag -(1,45): foster-parenting-character -(1,45): eof-in-table +(1,46): foster-parenting-character +(1,46): foster-parenting-character +(1,46): foster-parenting-character +(1,46): foster-parenting-character +(1,46): foster-parenting-character +(1,46): foster-parenting-character +(1,46): foster-parenting-character +(1,46): foster-parenting-character +(1,46): foster-parenting-character +(1,46): foster-parenting-character +(1,46): foster-parenting-character +(1,46): foster-parenting-character +(1,46): eof-in-table #document | <!DOCTYPE html> | <html> @@ -110,8 +115,19 @@ <!doctype html><table><tbody><plaintext></plaintext> #errors (1,40): foster-parenting-start-tag -(1,41): foster-parenting-character -(1,52): eof-in-table +(1,53): foster-parenting-character +(1,53): foster-parenting-character +(1,53): foster-parenting-character +(1,53): foster-parenting-character +(1,53): foster-parenting-character +(1,53): foster-parenting-character +(1,53): foster-parenting-character +(1,53): foster-parenting-character +(1,53): foster-parenting-character +(1,53): foster-parenting-character +(1,53): foster-parenting-character +(1,53): foster-parenting-character +(1,53): eof-in-table #document | <!DOCTYPE html> | <html> @@ -126,8 +142,19 @@ <!doctype html><table><tbody><tr><plaintext></plaintext> #errors (1,44): foster-parenting-start-tag -(1,56): foster-parenting-character -(1,56): eof-in-table +(1,57): foster-parenting-character +(1,57): foster-parenting-character +(1,57): foster-parenting-character +(1,57): foster-parenting-character +(1,57): foster-parenting-character +(1,57): foster-parenting-character +(1,57): foster-parenting-character +(1,57): foster-parenting-character +(1,57): foster-parenting-character +(1,57): foster-parenting-character +(1,57): foster-parenting-character +(1,57): foster-parenting-character +(1,57): eof-in-table #document | <!DOCTYPE html> | <html> @@ -173,11 +200,20 @@ #data <!doctype html><table><colgroup><plaintext></plaintext> #errors -43: Start tag “plaintext” seen in “table”. -55: Misplaced non-space characters inside a table. +(1,43): foster-parenting-start-tag +(1,56): foster-parenting-character +(1,56): foster-parenting-character +(1,56): foster-parenting-character +(1,56): foster-parenting-character +(1,56): foster-parenting-character +(1,56): foster-parenting-character +(1,56): foster-parenting-character +(1,56): foster-parenting-character +(1,56): foster-parenting-character +(1,56): foster-parenting-character +(1,56): foster-parenting-character +(1,56): foster-parenting-character 55: End of file seen and there were open elements. -43: Unclosed element “plaintext”. -22: Unclosed element “table”. #document | <!DOCTYPE html> | <html> @@ -194,7 +230,6 @@ 34: Stray start tag “plaintext”. 46: Stray end tag “plaintext”. 47: End of file seen and there were open elements. -23: Unclosed element “select”. #document | <!DOCTYPE html> | <html> @@ -210,8 +245,6 @@ 41: Stray start tag “plaintext”. 51: “caption” start tag with “select” open. 52: End of file seen and there were open elements. -51: Unclosed element “caption”. -22: Unclosed element “table”. #document | <!DOCTYPE html> | <html> @@ -227,8 +260,6 @@ <!doctype html><template><plaintext>a</template>b #errors 49: End of file seen and there were open elements. -36: Unclosed element “plaintext”. -25: Unclosed element “template”. #document | <!DOCTYPE html> | <html> @@ -244,7 +275,6 @@ #errors 39: Stray start tag “plaintext”. 51: End of file seen and there were open elements. -39: Unclosed element “plaintext”. #document | <!DOCTYPE html> | <html> @@ -259,7 +289,6 @@ 36: Stray start tag “plaintext”. 48: Stray end tag “plaintext”. 48: End of file seen and there were open elements. -25: Unclosed element “frameset”. #document | <!DOCTYPE html> | <html> @@ -282,7 +311,6 @@ #errors 46: Stray start tag “plaintext”. 58: End of file seen and there were open elements. -46: Unclosed element “plaintext”. #document | <!DOCTYPE html> | <html> @@ -306,7 +334,6 @@ <!doctype html><svg><plaintext>a</plaintext>b #errors 45: End of file seen and there were open elements. -20: Unclosed element “svg”. #document | <!DOCTYPE html> | <html> @@ -321,9 +348,6 @@ <!doctype html><svg><title><plaintext>a</plaintext>b #errors 52: End of file seen and there were open elements. -38: Unclosed element “plaintext”. -27: Unclosed element “title”. -20: Unclosed element “svg”. #document | <!DOCTYPE html> | <html> diff --git a/tree-construction/tests19.dat b/tree-construction/tests19.dat index a1897774..20cdeabc 100644 --- a/tree-construction/tests19.dat +++ b/tree-construction/tests19.dat @@ -387,19 +387,6 @@ | <select> | <option> -#data -<!doctype html><select><option></optgroup> -#errors -(1,42): unexpected-end-tag-in-select -(1,42): eof-in-select -#document -| <!DOCTYPE html> -| <html> -| <head> -| <body> -| <select> -| <option> - #data <!doctype html><dd><optgroup><dd> #errors @@ -1015,7 +1002,6 @@ <!doctype html><p><math></p>a #errors (1,28): unexpected-end-tag -(1,28): unexpected-end-tag #document | <!DOCTYPE html> | <html> @@ -1236,48 +1222,6 @@ | "c" | <table> -#data -<!doctype html><table><i>a<b>b<div>c<a>d</i>e</b>f -#errors -(1,25): foster-parenting-start-tag -(1,26): foster-parenting-character -(1,29): foster-parenting-start-tag -(1,30): foster-parenting-character -(1,35): foster-parenting-start-tag -(1,36): foster-parenting-character -(1,39): foster-parenting-start-tag -(1,40): foster-parenting-character -(1,44): foster-parenting-end-tag -(1,44): adoption-agency-1.3 -(1,44): adoption-agency-1.3 -(1,45): foster-parenting-character -(1,49): foster-parenting-end-tag -(1,44): adoption-agency-1.3 -(1,44): adoption-agency-1.3 -(1,50): foster-parenting-character -(1,50): eof-in-table -#document -| <!DOCTYPE html> -| <html> -| <head> -| <body> -| <i> -| "a" -| <b> -| "b" -| <b> -| <div> -| <b> -| <i> -| "c" -| <a> -| "d" -| <a> -| "e" -| <a> -| "f" -| <table> - #data <!doctype html><table><i>a<div>b<tr>c<b>d</i>e #errors diff --git a/tree-construction/tests2.dat b/tree-construction/tests2.dat index b44fec4d..11ef9b16 100644 --- a/tree-construction/tests2.dat +++ b/tree-construction/tests2.dat @@ -584,6 +584,16 @@ | <head> | <body> +#data +<!DOCTYPE html> <!DOCTYPE html> +#errors +Line: 1 Col: 31 Unexpected DOCTYPE. Ignored. +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> + #data test test diff --git a/tree-construction/tests20.dat b/tree-construction/tests20.dat index afdae743..80c57d1a 100644 --- a/tree-construction/tests20.dat +++ b/tree-construction/tests20.dat @@ -25,6 +25,32 @@ | <button> | <address> +#data +<!doctype html><p><button><article> +#errors +(1,36): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <article> + +#data +<!doctype html><p><button><aside> +#errors +(1,34): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <aside> + #data <!doctype html><p><button><blockquote> #errors @@ -38,6 +64,175 @@ | <button> | <blockquote> +#data +<!doctype html><p><button><center> +#errors +(1,35): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <center> + +#data +<!doctype html><p><button><details> +#errors +(1,36): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <details> + +#data +<!doctype html><p><button><dialog> +#errors +(1,35): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <dialog> + +#data +<!doctype html><p><button><dir> +#errors +(1,32): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <dir> + +#data +<!doctype html><p><button><div> +#errors +(1,32): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <div> + +#data +<!doctype html><p><button><dl> +#errors +(1,31): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <dl> + +#data +<!doctype html><p><button><fieldset> +#errors +(1,37): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <fieldset> + +#data +<!doctype html><p><button><figcaption> +#errors +(1,39): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <figcaption> + +#data +<!doctype html><p><button><figure> +#errors +(1,35): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <figure> + +#data +<!doctype html><p><button><footer> +#errors +(1,35): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <footer> + +#data +<!doctype html><p><button><header> +#errors +(1,35): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <header> + +#data +<!doctype html><p><button><hgroup> +#errors +(1,35): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <hgroup> + +#data +<!doctype html><p><button><main> +#errors +(1,33): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <main> + #data <!doctype html><p><button><menu> #errors @@ -51,6 +246,32 @@ | <button> | <menu> +#data +<!doctype html><p><button><nav> +#errors +(1,32): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <nav> + +#data +<!doctype html><p><button><ol> +#errors +(1,31): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <ol> + #data <!doctype html><p><button><p> #errors @@ -64,6 +285,45 @@ | <button> | <p> +#data +<!doctype html><p><button><search> +#errors +(1,35): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <search> + +#data +<!doctype html><p><button><section> +#errors +(1,36): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <section> + +#data +<!doctype html><p><button><summary> +#errors +(1,36): expected-closing-tag-but-got-eof +#document +| <!DOCTYPE html> +| <html> +| <head> +| <body> +| <p> +| <button> +| <summary> + #data <!doctype html><p><button><ul> #errors @@ -249,17 +509,16 @@ | <p> #data -<!doctype html><address><button></address>a +<!doctype html><button><p></button>x #errors -(1,42): end-tag-too-early #document | <!DOCTYPE html> | <html> | <head> | <body> -| <address> -| <button> -| "a" +| <button> +| <p> +| "x" #data <!doctype html><address><button></address>a @@ -557,6 +816,7 @@ <math><annotation-xml></svg>x #errors (1,6): expected-doctype-but-got-start-tag +(1,28): unexpected-end-tag-in-math (1,28): unexpected-end-tag (1,29): expected-closing-tag-but-got-eof #document diff --git a/tree-construction/tests21.dat b/tree-construction/tests21.dat index 1e2af7c1..a926b138 100644 --- a/tree-construction/tests21.dat +++ b/tree-construction/tests21.dat @@ -41,20 +41,7 @@ <svg><![CDATA[foo #errors (1,5): expected-doctype-but-got-start-tag -(1,17): expected-closing-tag-but-got-eof -#new-errors (1:18) eof-in-cdata -#document -| <html> -| <head> -| <body> -| <svg svg> -| "foo" - -#data -<svg><![CDATA[foo -#errors -(1,5): expected-doctype-but-got-start-tag (1,17): expected-closing-tag-but-got-eof #new-errors (1:18) eof-in-cdata @@ -69,6 +56,7 @@ <svg><![CDATA[ #errors (1,5): expected-doctype-but-got-start-tag +(1:15) eof-in-cdata (1,14): expected-closing-tag-but-got-eof #new-errors (1:15) eof-in-cdata @@ -101,22 +89,11 @@ | <svg svg> | "]] >" -#data -<svg><![CDATA[]] >]]> -#errors -(1,5): expected-doctype-but-got-start-tag -(1,21): expected-closing-tag-but-got-eof -#document -| <html> -| <head> -| <body> -| <svg svg> -| "]] >" - #data <svg><![CDATA[]] #errors (1,5): expected-doctype-but-got-start-tag +(1:17) eof-in-cdata (1,16): expected-closing-tag-but-got-eof #new-errors (1:17) eof-in-cdata @@ -131,6 +108,7 @@ <svg><![CDATA[] #errors (1,5): expected-doctype-but-got-start-tag +(1:16) eof-in-cdata (1,15): expected-closing-tag-but-got-eof #new-errors (1:16) eof-in-cdata @@ -145,6 +123,7 @@ <svg><![CDATA[]>a #errors (1,5): expected-doctype-but-got-start-tag +(1:16) eof-in-cdata (1,17): expected-closing-tag-but-got-eof #new-errors (1:18) eof-in-cdata @@ -236,6 +215,7 @@ <svg><![CDATA[<svg>a #errors (1,5): expected-doctype-but-got-start-tag +(1:21) eof-in-cdata (1,20): expected-closing-tag-but-got-eof #new-errors (1:21) eof-in-cdata @@ -250,6 +230,7 @@ <svg><![CDATA[</svg>a #errors (1,5): expected-doctype-but-got-start-tag +(1:22) eof-in-cdata (1,21): expected-closing-tag-but-got-eof #new-errors (1:22) eof-in-cdata diff --git a/tree-construction/tests26.dat b/tree-construction/tests26.dat index de453b9c..1ba2be2d 100644 --- a/tree-construction/tests26.dat +++ b/tree-construction/tests26.dat @@ -391,3 +391,63 @@ Line 1 Col 19 Expected closing tag. Unexpected end of file. | <button> | <p> | <button> + +#data +<svg></p><foo> +#errors +(1:1) Missing doctype +9: HTML end tag “p” in a foreign namespace context. +(1:6) Unexpected </p> from in body insertion mode +(1:16) Unexpected EOF +#document +| <html> +| <head> +| <body> +| <svg svg> +| <p> +| <foo> + +#data +<svg></br><foo> +#errors +(1:1) Missing doctype +10: HTML end tag “br” in a foreign namespace context. +(1:6) Unexpected </br> from in body insertion mode +(1:16) Unexpected EOF +#document +| <html> +| <head> +| <body> +| <svg svg> +| <br> +| <foo> + +#data +<math></p><foo> +#errors +(1:1) Missing doctype +10: HTML end tag “p” in a foreign namespace context. +(1:7) Unexpected </p> from in body insertion mode +(1:16) Unexpected EOF +#document +| <html> +| <head> +| <body> +| <math math> +| <p> +| <foo> + +#data +<math></br><foo> +#errors +(1:1) Missing doctype +11: HTML end tag “br” in a foreign namespace context. +(1:7) Unexpected </br> from in body insertion mode +(1:17) Unexpected EOF +#document +| <html> +| <head> +| <body> +| <math math> +| <br> +| <foo> diff --git a/tree-construction/tests4.dat b/tree-construction/tests4.dat index 0a6174c3..4f0cf70e 100644 --- a/tree-construction/tests4.dat +++ b/tree-construction/tests4.dat @@ -56,3 +56,19 @@ head #document | <title> | "setting head's innerHTML" + +#data +direct <title> content +#errors +#document-fragment +title +#document +| "direct <title> content" + +#data +<!-- inside </script> --> +#errors +#document-fragment +script +#document +| "<!-- inside </script> -->" diff --git a/tree-construction/tests6.dat b/tree-construction/tests6.dat index f3991232..8c36dd3d 100644 --- a/tree-construction/tests6.dat +++ b/tree-construction/tests6.dat @@ -48,7 +48,6 @@ #data <!doctype> #errors -(1,9): need-space-after-doctype (1,10): expected-doctype-name-but-got-right-bracket (1,10): unknown-doctype #new-errors @@ -604,6 +603,7 @@ html #data <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html></html> #errors +(1,50): doctype-has-public-identifier #document | <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" ""> | <html> diff --git a/tree-construction/tests7.dat b/tree-construction/tests7.dat index 395dc72b..b2db4de1 100644 --- a/tree-construction/tests7.dat +++ b/tree-construction/tests7.dat @@ -46,6 +46,42 @@ | "X" | <body> +#data +<!doctype html></head><base>X +#errors +(1,28): unexpected-start-tag-out-of-my-head +#document +| <!DOCTYPE html> +| <html> +| <head> +| <base> +| <body> +| "X" + +#data +<!doctype html></head><basefont>X +#errors +(1,32): unexpected-start-tag-out-of-my-head +#document +| <!DOCTYPE html> +| <html> +| <head> +| <basefont> +| <body> +| "X" + +#data +<!doctype html></head><bgsound>X +#errors +(1,31): unexpected-start-tag-out-of-my-head +#document +| <!DOCTYPE html> +| <html> +| <head> +| <bgsound> +| <body> +| "X" + #data <!doctype html><table><meta></table> #errors @@ -391,7 +427,6 @@ A<table><tr> B</tr> </em>C</table> (1,1): expected-doctype-but-got-chars (1,13): foster-parenting-character (1,14): foster-parenting-character -(1,20): foster-parenting-character (1,25): unexpected-end-tag (1,25): unexpected-end-tag-in-special-element (1,26): foster-parenting-character diff --git a/tree-construction/tests8.dat b/tree-construction/tests8.dat index ba2e63dd..d532801e 100644 --- a/tree-construction/tests8.dat +++ b/tree-construction/tests8.dat @@ -90,6 +90,9 @@ x" #data <table><li><li></table> #errors +(1,7): expected-doctype-but-got-start-tag +(1,11): foster-parenting-start-tag +(1,15): foster-parenting-start-tag #document | <html> | <head> diff --git a/tree-construction/tests_innerHTML_1.dat b/tree-construction/tests_innerHTML_1.dat index 54f43684..1a37ee52 100644 --- a/tree-construction/tests_innerHTML_1.dat +++ b/tree-construction/tests_innerHTML_1.dat @@ -110,16 +110,6 @@ table #document | <a> -#data -<a> -#errors -(1,3): unexpected-start-tag-implies-table-voodoo -(1,3): eof-in-table -#document-fragment -table -#document -| <a> - #data <a><caption>a #errors @@ -502,30 +492,6 @@ tbody | <tr> | <td> -#data -<a><td> -#errors -(1,3): unexpected-start-tag-implies-table-voodoo -(1,7): unexpected-cell-in-table-body -#document-fragment -tbody -#document -| <a> -| <tr> -| <td> - -#data -<a><td> -#errors -(1,3): unexpected-start-tag-implies-table-voodoo -(1,7): unexpected-cell-in-table-body -#document-fragment -tbody -#document -| <a> -| <tr> -| <td> - #data <td><table><tbody><a><tr> #errors @@ -648,16 +614,6 @@ tr | <table> | <td> -#data -<td><table></table><td> -#errors -#document-fragment -tr -#document -| <td> -| <table> -| <td> - #data <caption><a> #errors diff --git a/tree-construction/webkit01.dat b/tree-construction/webkit01.dat index b5fafdc7..d30e12e5 100644 --- a/tree-construction/webkit01.dat +++ b/tree-construction/webkit01.dat @@ -307,6 +307,20 @@ console.log("FOO<span>BAR</span>BAZ"); | <body> | <!-- Hi there --> +#data +<html><body></body></html><!-- Comment A --><!-- Comment B --><!-- Comment C --><!-- Comment D --><!-- Comment E --> +#errors +(1,6): expected-doctype-but-got-start-tag +#document +| <html> +| <head> +| <body> +| <!-- Comment A --> +| <!-- Comment B --> +| <!-- Comment C --> +| <!-- Comment D --> +| <!-- Comment E --> + #data <html><body></body></html>x<!-- Hi there --> #errors @@ -345,6 +359,32 @@ console.log("FOO<span>BAR</span>BAZ"); | <!-- Hi there --> | <!-- Again --> +#data +<html><body></body> + <!-- Hi there --></html> +#errors +no-doctype +#document +| <html> +| <head> +| <body> +| " + " +| <!-- Hi there --> + +#data +<html><body></body></html> + <!-- Hi there --> +#errors +no-doctype +#document +| <html> +| <head> +| <body> +| " + " +| <!-- Hi there --> + #data <html><body><ruby><div><rp>xx</rp></div></ruby></body></html> #errors @@ -673,6 +713,10 @@ console.log("FOO<span>BAR</span>BAZ"); #data <table><tr><td><svg><desc><td></desc><circle> #errors +(1,7): expected-doctype-but-got-start-tag +(1,30): unexpected-start-tag +(1,37): unexpected-end-tag +(1,22): expected-closing-tag-but-got-eof #document | <html> | <head> diff --git a/tree-construction/webkit02.dat b/tree-construction/webkit02.dat index 791991d2..7d817ec6 100644 --- a/tree-construction/webkit02.dat +++ b/tree-construction/webkit02.dat @@ -138,6 +138,7 @@ #data <legend>test</legend> #errors +(1,7): expected-doctype-but-got-start-tag #document | <html> | <head> @@ -148,6 +149,9 @@ #data <table><input> #errors +(1,7): expected-doctype-but-got-start-tag +(1,14): foster-parenting-start-tag +(1,15): expected-closing-tag-but-got-eof #document | <html> | <head> @@ -155,9 +159,36 @@ | <input> | <table> +#data +<b><em><dcell><postfield><postfield><postfield><postfield><missing_glyph><missing_glyph><missing_glyph><missing_glyph><hkern><aside></b></em> +#errors +unexpected-b-end-tag +unexpected-em-end-tag +eof-in-aside +#document-fragment +div +#document +| <b> +| <em> +| <dcell> +| <postfield> +| <postfield> +| <postfield> +| <postfield> +| <missing_glyph> +| <missing_glyph> +| <missing_glyph> +| <missing_glyph> +| <hkern> +| <aside> +| <b> + #data <b><em><foo><foo><aside></b> #errors +(1,3): expected-doctype-but-got-start-tag +(1,28): adoption-agency-9 +(1,29): expected-closing-tag-but-got-eof #document | <html> | <head> @@ -173,6 +204,10 @@ #data <b><em><foo><foo><aside></b></em> #errors +(1,3): expected-doctype-but-got-start-tag +(1,28): adoption-agency-9 +(1,33): adoption-agency-9 +(1,34): expected-closing-tag-but-got-eof #document | <html> | <head> @@ -189,6 +224,9 @@ #data <b><em><foo><foo><foo><aside></b> #errors +(1,3): expected-doctype-but-got-start-tag +(1,33): adoption-agency-9 +(1,34): expected-closing-tag-but-got-eof #document | <html> | <head> @@ -204,6 +242,10 @@ #data <b><em><foo><foo><foo><aside></b></em> #errors +(1,3): expected-doctype-but-got-start-tag +(1,33): adoption-agency-9 +(1,38): adoption-agency-9 +(1,39): expected-closing-tag-but-got-eof #document | <html> | <head> @@ -219,6 +261,9 @@ #data <b><em><foo><foo><foo><foo><foo><foo><foo><foo><foo><foo><aside></b></em> #errors +(1,68): adoption-agency-9 +(1,73): adoption-agency-9 +(1,74): expected-closing-tag-but-got-eof #document-fragment div #document @@ -240,6 +285,9 @@ div #data <b><em><foo><foob><foob><foob><foob><fooc><fooc><fooc><fooc><food><aside></b></em> #errors +(1,77): adoption-agency-9 +(1,82): adoption-agency-9 +(1,83): expected-closing-tag-but-got-eof #document-fragment div #document @@ -261,6 +309,8 @@ div #data <option><XH<optgroup></optgroup> #errors +(1,21): unexpected-start-tag-in-select +(1,32): unexpected-end-tag-in-select #document-fragment select #document @@ -269,6 +319,8 @@ select #data <svg><foreignObject><div>foo</div><plaintext></foreignObject></svg><div>bar</div> #errors +(1,5): expected-doctype-but-got-start-tag +(1,82): expected-closing-tag-but-got-eof #document | <html> | <head> @@ -283,6 +335,8 @@ select #data <svg><foreignObject></foreignObject><title></svg>foo #errors +(1,5): expected-doctype-but-got-start-tag +(1,49): expected-one-end-tag-but-got-another #document | <html> | <head> @@ -295,9 +349,206 @@ select #data </foreignObject><plaintext><div>foo</div> #errors +(1,16): expected-doctype-but-got-end-tag +(1,16): unexpected-end-tag-before-html +(1,42): expected-closing-tag-but-got-eof #document | <html> | <head> | <body> | <plaintext> | "<div>foo</div>" + +#data +<svg xml:base xml:lang xml:space xml:baaah definitionurl> +#errors +no-doctype +eof-in-svg +#document +| <html> +| <head> +| <body> +| <svg svg> +| definitionurl="" +| xml lang="" +| xml space="" +| xml:baaah="" +| xml:base="" + +#data +<math definitionurl xlink:title xlink:show> +#errors +no-doctype +eof-in-math +#document +| <html> +| <head> +| <body> +| <math math> +| definitionURL="" +| xlink show="" +| xlink title="" + +#data +<math DEFINITIONURL> +#errors +no-doctype +eof-in-math +#document +| <html> +| <head> +| <body> +| <math math> +| definitionURL="" + +#data +<select><hr> +#errors +1:1: ERROR: Expected a doctype token +1:13: ERROR: Premature end of file. Currently open tags: html, body, select. +#document +| <html> +| <head> +| <body> +| <select> +| <hr> + +#data +<select><option><hr> +#errors +1:1: ERROR: Expected a doctype token +1:21: ERROR: Premature end of file. Currently open tags: html, body, select. +#document +| <html> +| <head> +| <body> +| <select> +| <option> +| <hr> + +#data +<select><optgroup><option><hr> +#errors +1:1: ERROR: Expected a doctype token +1:31: ERROR: Premature end of file. Currently open tags: html, body, select. +#document +| <html> +| <head> +| <body> +| <select> +| <optgroup> +| <option> +| <hr> + +#data +<select><optgroup><hr> +#errors +1:1: ERROR: Expected a doctype token +1:23: ERROR: Premature end of file. Currently open tags: html, body, select. +#document +| <html> +| <head> +| <body> +| <select> +| <optgroup> +| <hr> + +#data +<select><option><optgroup><hr> +#errors +1:1: ERROR: Expected a doctype token +1:31: ERROR: Premature end of file. Currently open tags: html, body, select. +#document +| <html> +| <head> +| <body> +| <select> +| <option> +| <optgroup> +| <hr> + +#data +<table><tr><td><select><hr> +#errors +1:1: ERROR: Expected a doctype token +1:28: ERROR: Premature end of file. Currently open tags: html, body, table, tbody, tr, td, select. +#document +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| <td> +| <select> +| <hr> + +#data +<table><tr><td><select><option><hr> +#errors +1:1: ERROR: Expected a doctype token +1:36: ERROR: Premature end of file. Currently open tags: html, body, table, tbody, tr, td, select. +#document +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| <td> +| <select> +| <option> +| <hr> + +#data +<table><tr><td><select><optgroup><option><hr> +#errors +1:1: ERROR: Expected a doctype token +1:46: ERROR: Premature end of file. Currently open tags: html, body, table, tbody, tr, td, select. +#document +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| <td> +| <select> +| <optgroup> +| <option> +| <hr> + +#data +<table><tr><td><select><optgroup><hr> +#errors +1:1: ERROR: Expected a doctype token +1:38: ERROR: Premature end of file. Currently open tags: html, body, table, tbody, tr, td, select. +#document +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| <td> +| <select> +| <optgroup> +| <hr> + +#data +<table><tr><td><select><option><optgroup><hr> +#errors +1:1: ERROR: Expected a doctype token +1:46: ERROR: Premature end of file. Currently open tags: html, body, table, tbody, tr, td, select. +#document +| <html> +| <head> +| <body> +| <table> +| <tbody> +| <tr> +| <td> +| <select> +| <option> +| <optgroup> +| <hr>