8000 #20411 fix Yaml parsing for very long quoted strings by RichardBradley · Pull Request #21523 · symfony/symfony · GitHub
[go: up one dir, main page]

Skip to content

#20411 fix Yaml parsing for very long quoted strings #21523

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions src/Symfony/Component/Yaml/Inline.php
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,8 @@ public static function dump($value, $exceptionOnInvalidType = false, $objectSupp
case Escaper::requiresDoubleQuoting($value):
return Escaper::escapeWithDoubleQuotes($value);
case Escaper::requiresSingleQuoting($value):
case preg_match(self::getHexRegex(), $value):
case preg_match(self::getTimestampRegex(), $value):
case Parser::preg_match(self::getHexRegex(), $value):
case Parser::preg_match(self::getTimestampRegex(), $value):
return Escaper::escapeWithSingleQuotes($value);
default:
return $value;
Expand Down Expand Up @@ -242,10 +242,10 @@ public static function parseScalar($scalar, $delimiters = null, $stringDelimiter
$i += strlen($output);

// remove comments
if (preg_match('/[ \t]+#/', $output, $match, PREG_OFFSET_CAPTURE)) {
if (Parser::preg_match('/[ \t]+#/', $output, $match, PREG_OFFSET_CAPTURE)) {
$output = substr($output, 0, $match[0][1]);
}
} elseif (preg_match('/^(.+?)('.implode('|', $delimiters).')/', substr($scalar, $i), $match)) {
} elseif (Parser::preg_match('/^(.+?)('.implode('|', $delimiters).')/', substr($scalar, $i), $match)) {
$output = $match[1];
$i += strlen($output);
} else {
Expand All @@ -272,7 +272,7 @@ public static function parseScalar($scalar, $delimiters = null, $stringDelimiter
*/
private static function parseQuotedScalar($scalar, &$i)
{
if (!preg_match('/'.self::REGEX_QUOTED_STRING.'/Au', substr($scalar, $i), $match)) {
if (!Parser::preg_match('/'.self::REGEX_QUOTED_STRING.'/Au', substr($scalar, $i), $match)) {
throw new ParseException(sprintf('Malformed inline YAML string: %s.', substr($scalar, $i)));
}

Expand Down Expand Up @@ -520,16 +520,16 @@ private static function evaluateScalar($scalar, $references = array())

return '0' == $scalar[1] ? octdec($scalar) : (((string) $raw === (string) $cast) ? $cast : $raw);
case is_numeric($scalar):
case preg_match(self::getHexRegex(), $scalar):
case Parser::preg_match(self::getHexRegex(), $scalar):
return '0x' === $scalar[0].$scalar[1] ? hexdec($scalar) : (float) $scalar;
case '.inf' === $scalarLower:
case '.nan' === $scalarLower:
return -log(0);
case '-.inf' === $scalarLower:
return log(0);
case preg_match('/^(-|\+)?[0-9,]+(\.[0-9]+)?$/', $scalar):
case Parser::preg_match('/^(-|\+)?[0-9,]+(\.[0-9]+)?$/', $scalar):
return (float) str_replace(',', '', $scalar);
case preg_match(self::getTimestampRegex(), $scalar):
case Parser::preg_match(self::getTimestampRegex(), $scalar):
$timeZone = date_default_timezone_get();
date_default_timezone_set('UTC');
$time = strtotime($scalar);
Expand Down
84 changes: 53 additions & 31 deletions src/Symfony/Component/Yaml/Parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ public function __construct($offset = 0, $totalNumberOfLines = null, array $skip
*/
public function parse($value, $exceptionOnInvalidType = false, $objectSupport = false, $objectForMap = false)
{
if (!preg_match('//u', $value)) {
if (!self::preg_match('//u', $value)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would not replace this, as it would throw an exception saying Malformed UTF-8 data. instead of the expected one

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. I'll add a test to cover this

throw new ParseException('The YAML value does not appear to be valid UTF-8.');
}
$this->currentLineNb = -1;
Expand Down Expand Up @@ -92,13 +92,13 @@ public function parse($value, $exceptionOnInvalidType = false, $objectSupport =
}

$isRef = $mergeNode = false;
if (preg_match('#^\-((?P<leadspaces>\s+)(?P<value>.+?))?\s*$#u', $this->currentLine, $values)) {
if (self::preg_match('#^\-((?P<leadspaces>\s+)(?P<value>.+))?$#u', rtrim($this->currentLine), $values)) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See comment on line 127 below

if ($context && 'mapping' == $context) {
throw new ParseException('You cannot define a sequence item when in a mapping', $this->getRealCurrentLineNb() + 1, $this->currentLine);
}
$context = 'sequence';

if (isset($values['value']) && preg_match('#^&(?P<ref>[^ ]+) *(?P<value>.*)#u', $values['value'], $matches)) {
if (isset($values['value']) && self::preg_match('#^&(?P<ref>[^ ]+) *(?P<value>.*)#u', $values[' 8000 ;value'], $matches)) {
$isRef = $matches['ref'];
$values['value'] = $matches['value'];
}
Expand All @@ -108,7 +108,7 @@ public function parse($value, $exceptionOnInvalidType = false, $objectSupport =
$data[] = $this->parseBlock($this->getRealCurrentLineNb() + 1, $this->getNextEmbedBlock(null, true), $exceptionOnInvalidType, $objectSupport, $objectForMap);
} else {
if (isset($values['leadspaces'])
&& preg_match('#^(?P<key>'.Inline::REGEX_QUOTED_STRING.'|[^ \'"\{\[].*?) *\:(\s+(?P<value>.+?))?\s*$#u', $values['value'], $matches)
&& self::preg_match('#^(?P<key>'.Inline::REGEX_QUOTED_STRING.'|[^ \'"\{\[].*?) *\:(\s+(?P<value>.+))?$#u', rtrim($values['value']), $matches)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see comment on line 127

) {
// this is a compact notation element, add to next block and parse
$block = $values['value'];
Expand All @@ -124,7 +124,10 @@ public function parse($value, $exceptionOnInvalidType = false, $objectSupport =
if ($isRef) {
$this->refs[$isRef] = end($data);
}
} elseif (preg_match('#^(?P<key>'.Inline::REGEX_QUOTED_STRING.'|[^ \'"\[\{].*?) *\:(\s+(?P<value>.+?))?\s*$#u', $this->currentLine, $values) && (false === strpos($values['key'], ' #') || in_array($values['key'][0], array('"', "'")))) {
} elseif (self::preg_match('#^(?P<key>'.Inline::REGEX_QUOTED_STRING.'|[^ \'"\[\{].*?) *\:(\s+(?P<value>.+))?$#u', rtrim($this->currentLine), $values)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here, as well as wrapping "preg_match", I have fixed the regex to avoid large numbers of pcre backtracks by moving the trailing whitespace trimming behaviour out of the regex pattern and into a "rtrim" in the argument list.

This may potentially be less performant in some cases (but I expect more performant in most cases, actually), but I have not measured this.

It demonstrably fixes a bug, as can be seen by the unit test I have added to this commit, which fails without this change and passes with it.

Copy link
Contributor Author
@RichardBradley RichardBradley Feb 3, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The specific problem with the regex here was the ".+?" followed by a "\s*$", which leads to a great deal of backtracking behaviour in long strings. I could not find a simpler fix than the one I propose here (the possessive quantifier fix used in #21279 would not work here)

&& (false === strpos($values['key'], ' #')
|| in_array($values['key'][0], array('"', "'")))) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The way the if condition is wrapped now IMO doesn't make it more readable. Maybe reformat it to something like this:

} elseif (
    self::preg_match('#^(?P<key>'.Inline::REGEX_QUOTED_STRING.'|[^ \'"\[\{].*?) *\:(\s+(?P<value>.+))?$#u', rtrim($this->currentLine), $values)
    && (false === strpos($values['key'], ' #') || in_array($values['key'][0], array('"', "'")))
) {

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have pushed a new version with your preferred indentation


if ($context && 'sequence' == $context) {
throw new ParseException('You cannot define a mapping item when in a sequence', $this->currentLineNb + 1, $this->currentLine);
}
Expand Down Expand Up @@ -203,7 +206,7 @@ public function parse($value, $exceptionOnInvalidType = false, $objectSupport =
}
}
}
} elseif (isset($values['value']) && preg_match('#^&(?P<ref>[^ ]+) *(?P<value>.*)#u', $values['value'], $matches)) {
} elseif (isset($values['value']) && self::preg_match('#^&(?P<ref>[^ ]+) *(?P<value>.*)#u', $values['value'], $matches)) {
$isRef = $matches['ref'];
$values['value'] = $matches['value'];
}
Expand Down Expand Up @@ -266,27 +269,7 @@ public function parse($value, $exceptionOnInvalidType = false, $objectSupport =
return $value;
}

switch (preg_last_error()) {
case PREG_INTERNAL_ERROR:
$error = 'Internal PCRE error.';
break;< 8000 /td>
case PREG_BACKTRACK_LIMIT_ERROR:
$error = 'pcre.backtrack_limit reached.';
break;
case PREG_RECURSION_LIMIT_ERROR:
$error = 'pcre.recursion_limit reached.';
break;
case PREG_BAD_UTF8_ERROR:
$error = 'Malformed UTF-8 data.';
break;
case PREG_BAD_UTF8_OFFSET_ERROR:
$error = 'Offset doesn\'t correspond to the begin of a valid UTF-8 code point.';
break;
default:
$error = 'Unable to parse.';
}

throw new ParseException($error, $this->getRealCurrentLineNb() + 1, $this->currentLine);
throw new ParseException("Unable to parse", $this->getRealCurrentLineNb() + 1, $this->currentLine);
}
}

Expand Down Expand Up @@ -520,7 +503,7 @@ private function parseValue($value, $exceptionOnInvalidType, $objectSupport, $ob
return $this->refs[$value];
}

if (preg_match('/^'.self::BLOCK_SCALAR_HEADER_PATTERN.'$/', $value, $matches)) {
if (self::preg_match('/^'.self::BLOCK_SCALAR_HEADER_PATTERN.'$/', $value, $matches)) {
$modifiers = isset($matches['modifiers']) ? $matches['modifiers'] : '';

return $this->parseBlockScalar($matches['separator'], preg_replace('#\d+#', '', $modifiers), (int) abs($modifiers));
Expand Down Expand Up @@ -566,7 +549,7 @@ private function parseBlockScalar($style, $chomping = '', $indentation = 0)

// determine indentation if not specified
if (0 === $indentation) {
if (preg_match('/^ +/', $this->currentLine, $matches)) {
if (self::preg_match('/^ +/', $this->currentLine, $matches)) {
$indentation = strlen($matches[0]);
}
}
Expand All @@ -577,7 +560,7 @@ private function parseBlockScalar($style, $chomping = '', $indentation = 0)
while (
$notEOF && (
$isCurrentLineBlank ||
preg_match($pattern, $this->currentLine, $matches)
self::preg_match($pattern, $this->currentLine, $matches)
)
) {
if ($isCurrentLineBlank && strlen($this->currentLine) > $indentation) {
Expand Down Expand Up @@ -800,6 +783,45 @@ private function isStringUnIndentedCollectionItem()
*/
private function isBlockScalarHeader()
{
return (bool) preg_match('~'.self::BLOCK_SCALAR_HEADER_PATTERN.'$~', $this->currentLine);
return (bool) self::preg_match('~'.self::BLOCK_SCALAR_HEADER_PATTERN.'$~', $this->currentLine);
}

/**
* A local wrapper for `preg_match` which will throw a ParseException if there
* is an internal error in the PCRE engine.
*
* This avoids us needing to check for "false" every time PCRE is used
* in the YAML engine
*
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

must be @internal as we clearly don't want to support BC on it

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, will do

* @throws ParseException on a PCRE internal error
* @see preg_last_error()
*/
static function preg_match($pattern, $subject, &$matches = null, $flags = 0, $offset = 0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should probably be called pregMatch if you want to call it like this. Perhaps match would be a better name

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I deliberately named it "preg_match" so that it was a drop-in replacement for the builtin preg_match. Is that disallowed?

{
$ret = preg_match($pattern, $subject, $matches, $flags, $offset);
if ($ret === false) {
switch (preg_last_error()) {
case PREG_INTERNAL_ERROR:
$error = 'Internal PCRE error.';
break;
case PREG_BACKTRACK_LIMIT_ERROR:
$error = 'pcre.backtrack_limit reached.';
break;
case PREG_RECURSION_LIMIT_ERROR:
$error = 'pcre.recursion_limit reached.';
break;
case PREG_BAD_UTF8_ERROR:
$error = 'Malformed UTF-8 data.';
break;
case PREG_BAD_UTF8_OFFSET_ERROR:
$error = 'Offset doesn\'t correspond to the begin of a valid UTF-8 code point.';
break;
default:
$error = 'Error.';
}

throw new ParseException($error);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should not be reachable, but should hopefully mean that any further undetected "backtrack_limit" bugs, or any similar bugs added in the future, will result in an exception rather than an incorrect result.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this misses the location of the error in the ParseException

Copy link
Contributor Author
@RichardBradley RichardBradley Feb 3, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The location is not always available, unless I make two wrappers -- one static, for Inline and other callers, and one non-static, for use during the parse. Do you think that's worthwhile?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could pass the needed context to the new method. Might not look nice, but will do what it should.

}
return $ret;
}
}
12 changes: 12 additions & 0 deletions src/Symfony/Component/Yaml/Tests/ParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

class ParserTest extends \PHPUnit_Framework_TestCase
{
/** @var Parser */
protected $parser;

protected function setUp()
Expand Down Expand Up @@ -1143,6 +1144,17 @@ public function parserThrowsExceptionWithCorrectLineNumberProvider()
),
);
}

public function testCanParseVeryLongValue()
{
$longStringWithSpaces = str_repeat("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx ", 20000);
$trickyVal = array("x" => $longStringWithSpaces);

$yamlString = Yaml::dump($trickyVal);
$arrayFromYaml = $this->parser->parse($yamlString);

$this->assertEquals($trickyVal, $arrayFromYaml);
}
}

class B
Expand Down
0