[JsonPath] Better handling of Unicode chars in expressions · symfony/symfony@251f3b9 · GitHub
[go: up one dir, main page]

Skip to content

Commit 251f3b9

Browse files
[JsonPath] Better handling of Unicode chars in expressions
1 parent 0795d65 commit 251f3b9

File tree

4 files changed

+327
-2
lines changed

4 files changed

+327
-2
lines changed

src/Symfony/Component/JsonPath/JsonCrawler.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ private function evaluateBracket(string $expr, mixed $value): array
230230

231231
// quoted strings for object keys
232232
if (preg_match('/^([\'"])(.*)\1$/', $expr, $matches)) {
233-
$key = stripslashes($matches[2]);
233+
$key = JsonPathUtils::unescapeString($matches[2], $matches[1]);
234234

235235
return \array_key_exists($key, $value) ? [$value[$key]] : [];
236236
}
@@ -335,7 +335,7 @@ private function evaluateScalar(string $expr, array $context): mixed
335335

336336
// string literals
337337
if (preg_match('/^([\'"])(.*)\1$/', $expr, $matches)) {
338-
return $matches[2];
338+
return JsonPathUtils::unescapeString($matches[2], $matches[1]);
339339
}
340340

341341
// current node references

src/Symfony/Component/JsonPath/JsonPathUtils.php

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,4 +85,82 @@ public static function findSmallestDeserializableStringAndPath(array $tokens, mi
8585
'tokens' => $remainingTokens,
8686
];
8787
}
88+
89+
/**
 * Resolves the backslash escape sequences found in the content of a quoted string literal.
 *
 * Recognized escapes are \", \', \\, \/, \b, \f, \n, \r, \t and \uXXXX (including
 * surrogate pairs). Anything else following a backslash is left untouched, backslash included.
 *
 * @param string $str       the content of the literal, without its surrounding quotes
 * @param string $quoteChar the quote character that delimited the literal (' or ")
 *
 * @return string the unescaped string
 */
public static function unescapeString(string $str, string $quoteChar): string
{
    if ('"' === $quoteChar) {
        // a double-quoted literal is most of the time a valid JSON string: let the JSON decoder do the work
        $decoded = json_decode('"'.$str.'"', true);

        if (\is_string($decoded)) {
            return $decoded;
        }
    }

    $result = '';
    $length = \strlen($str);

    for ($i = 0; $i < $length; ++$i) {
        // regular character, or a trailing backslash with nothing left to escape
        if ('\\' !== $str[$i] || $i + 1 >= $length) {
            $result .= $str[$i];

            continue;
        }

        $nextChar = $str[$i + 1];

        if ('u' === $nextChar) {
            // the helper moves $i onto the last character it consumed; when it gives the
            // backslash back without moving $i, the next iterations copy the rest verbatim
            $result .= self::unescapeUnicodeSequence($str, $length, $i);

            continue;
        }

        $unescaped = match ($nextChar) {
            '"', "'", '\\', '/' => $nextChar,
            'b' => "\x08", // PHP has no "\b" escape sequence, the backspace must be spelled out
            'f' => "\f",
            'n' => "\n",
            'r' => "\r",
            't' => "\t",
            default => null,
        };

        if (null === $unescaped) {
            // unknown escape: keep the backslash and let the next iteration copy the character after it
            $result .= '\\';

            continue;
        }

        $result .= $unescaped;
        ++$i; // skip the escaped character
    }

    return $result;
}
132+
133+
/**
 * Decodes the "\uXXXX" escape (or the "\uXXXX\uXXXX" surrogate pair) starting at offset $i.
 *
 * On success, $i is moved onto the last character of the decoded escape and the UTF-8
 * encoded character is returned. When the sequence is truncated, is not hexadecimal or is
 * a surrogate without a valid counterpart, $i is left untouched and the character at $i
 * (the backslash) is returned, so that the caller copies the sequence as literal text.
 *
 * @param string $str    the string being unescaped
 * @param int    $length the byte length of $str
 * @param int    $i      offset of the backslash that opens the escape, updated by reference
 */
private static function unescapeUnicodeSequence(string $str, int $length, int &$i): string
{
    if ($i + 5 >= $length) {
        // not enough characters for Unicode escape, treat as literal
        return $str[$i];
    }

    $hex = substr($str, $i + 2, 4);
    if (!ctype_xdigit($hex)) {
        // invalid hex, treat as literal
        return $str[$i];
    }

    $codepoint = hexdec($hex);

    if ($codepoint < 0xD800 || 0xDFFF < $codepoint) {
        // plain BMP character
        $i += 5;

        return mb_chr($codepoint, 'UTF-8');
    }

    // a high surrogate is only meaningful when immediately followed by an escaped low surrogate
    if ($codepoint <= 0xDBFF && $i + 11 < $length && '\\' === $str[$i + 6] && 'u' === $str[$i + 7]) {
        $lowHex = substr($str, $i + 8, 4);

        if (ctype_xdigit($lowHex)) {
            $lowSurrogate = hexdec($lowHex);

            if (0xDC00 <= $lowSurrogate && $lowSurrogate <= 0xDFFF) {
                $i += 11; // skip surrogate pair

                return mb_chr(0x10000 + (($codepoint & 0x3FF) << 10) + ($lowSurrogate & 0x3FF), 'UTF-8');
            }
        }
    }

    // lone surrogate: mb_chr() cannot encode it and would return false, treat as literal
    return $str[$i];
}
88166
}

src/Symfony/Component/JsonPath/Tests/JsonCrawlerTest.php

Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,228 @@ public function testAcceptsJsonPath()
404404
$this->assertSame('red', $result[0]['color']);
405405
}
406406

407+
/**
 * Looks up double-quoted keys written with escape sequences in the Unicode fixture document.
 *
 * @dataProvider provideUnicodeEscapeSequencesProvider
 */
public function testUnicodeEscapeSequences(string $jsonPath, array $expected)
{
    $crawler = self::getUnicodeDocumentCrawler();
    $actual = $crawler->find($jsonPath);

    $this->assertSame($expected, $actual);
}
414+
415+
/**
 * Datasets for testUnicodeEscapeSequences(): a JSONPath expression and the values it must return.
 *
 * The expressions are single-quoted PHP strings, so every backslash sequence below reaches
 * the JSONPath parser unresolved.
 */
public static function provideUnicodeEscapeSequencesProvider(): array
{
    $expectedByPath = [
        // \uXXXX escapes of BMP characters
        '$["caf\u00e9"]' => ['coffee'],
        '$["\u65e5\u672c"]' => ['Japan'],
        '$["M\u00fcller"]' => [],

        // surrogate pair
        '$["emoji\ud83d\ude00"]' => ['smiley'],

        // single-character escapes
        '$["tab\there"]' => ['with tab'],
        '$["new\nline"]' => ['with newline'],
        '$["quote\"here"]' => ['with quote'],
        '$["backslash\\\\here"]' => ['with backslash'],
        '$["apostrophe\'here"]' => ['with apostrophe'],

        // control character
        '$["control\u0001char"]' => ['with control char'],

        // escaped ASCII letter mixed with plain characters
        '$["\u0063af\u00e9"]' => ['coffee'],
    ];

    $datasets = [];
    foreach ($expectedByPath as $jsonPath => $expected) {
        $datasets[] = [$jsonPath, $expected];
    }

    return $datasets;
}
468+
469+
/**
 * Looks up single-quoted keys in the Unicode fixture document.
 *
 * @dataProvider provideSingleQuotedStringProvider
 */
public function testSingleQuotedStrings(string $jsonPath, array $expected)
{
    $this->assertSame($expected, self::getUnicodeDocumentCrawler()->find($jsonPath));
}
476+
477+
/**
 * Datasets for testSingleQuotedStrings(): a JSONPath expression using single-quoted keys
 * and the values it must return.
 */
public static function provideSingleQuotedStringProvider(): array
{
    $expectedByPath = [
        '$[\'caf\u00e9\']' => ['coffee'],
        '$[\'\u65e5\u672c\']' => ['Japan'],
        // a double quote needs no escaping inside single quotes
        '$[\'quote"here\']' => ['with quote'],
        // an apostrophe does
        '$[\'apostrophe\\\'here\']' => ['with apostrophe'],
    ];

    $datasets = [];
    foreach ($expectedByPath as $jsonPath => $expected) {
        $datasets[] = [$jsonPath, $expected];
    }

    return $datasets;
}
498+
499+
/**
 * Runs a filter expression comparing against an escaped string literal and checks the
 * number of matches and, when there is one, the country of the first match.
 *
 * @dataProvider provideFilterWithUnicodeProvider
 */
public function testFilterWithUnicodeStrings(string $jsonPath, int $expectedCount, string $expectedCountry)
{
    $matches = self::getUnicodeDocumentCrawler()->find($jsonPath);

    $this->assertCount($expectedCount, $matches);

    if ($expectedCount <= 0) {
        // nothing is expected to match, there is no country to compare
        return;
    }

    $this->assertSame($expectedCountry, $matches[0]['country']);
}
512+
513+
/**
 * Datasets for testFilterWithUnicodeStrings(): filter expression, expected number of
 * matches and expected country of the first match.
 */
public static function provideFilterWithUnicodeProvider(): array
{
    $datasets = [];

    $datasets[] = ['$.users[?(@.name == "caf\u00e9")]', 1, 'France'];
    $datasets[] = ['$.users[?(@.name == "\u65e5\u672c\u592a\u90ce")]', 1, 'Japan'];
    $datasets[] = ['$.users[?(@.name == "Jos\u00e9")]', 1, 'Spain'];
    // plain ASCII literal, no escape involved
    $datasets[] = ['$.users[?(@.name == "John")]', 1, 'USA'];
    // no match expected, the country is not checked
    $datasets[] = ['$.users[?(@.name == "NonExistent\u0020Name")]', 0, ''];

    return $datasets;
}
543+
544+
/**
 * Malformed \u escapes must not make find() throw: it still has to return an array.
 *
 * @dataProvider provideInvalidUnicodeSequenceProvider
 */
public function testInvalidUnicodeSequencesAreProcessedAsLiterals(string $jsonPath)
{
    $result = self::getUnicodeDocumentCrawler()->find($jsonPath);

    $this->assertIsArray($result, 'invalid unicode sequence should be treated as literal and not throw');
}
551+
552+
/**
 * Datasets for testInvalidUnicodeSequencesAreProcessedAsLiterals(): JSONPath expressions
 * whose \u escape is malformed.
 */
public static function provideInvalidUnicodeSequenceProvider(): array
{
    return [
        // not hexadecimal
        [
            '$["test\uZZZZ"]',
        ],
        // only three hex digits
        [
            '$["test\u123"]',
        ],
        // no hex digit at all
        [
            '$["test\u"]',
        ],
    ];
}
566+
567+
/**
 * Walks several levels of a document whose keys are all non-ASCII, addressing every key
 * through escaped JSONPath segments.
 *
 * @dataProvider provideComplexUnicodePath
 */
public function testComplexUnicodePaths(string $jsonPath, array $expected)
{
    $users = [
        ['名前' => 'テスト', 'ID' => 1],
        ['名前' => 'サンプル', 'ID' => 2],
    ];

    $document = [
        'データ' => ['ユーザー' => $users],
        'special🔑' => ['value💎' => 'treasure'],
    ];

    $actual = (new JsonCrawler(json_encode($document)))->find($jsonPath);

    $this->assertSame($expected, $actual);
}
588+
589+
/**
 * Datasets for testComplexUnicodePaths(): a multi-segment JSONPath expression and the
 * values it must return.
 */
public static function provideComplexUnicodePath(): array
{
    $expectedByPath = [
        // index access below escaped keys
        '$["\u30c7\u30fc\u30bf"]["\u30e6\u30fc\u30b6\u30fc"][0]["\u540d\u524d"]' => ['テスト'],
        // keys ending with a character escaped as a surrogate pair
        '$["special\ud83d\udd11"]["value\ud83d\udc8e"]' => ['treasure'],
        // wildcard below escaped keys
        '$["\u30c7\u30fc\u30bf"]["\u30e6\u30fc\u30b6\u30fc"][*]["\u540d\u524d"]' => ['テスト', 'サンプル'],
    ];

    $datasets = [];
    foreach ($expectedByPath as $jsonPath => $expected) {
        $datasets[] = [$jsonPath, $expected];
    }

    return $datasets;
}
606+
607+
/**
 * A key made only of characters outside the BMP must be reachable when each character is
 * written as an escaped surrogate pair.
 */
public function testSurrogatePairHandling()
{
    $document = json_encode(['𝒽𝑒𝓁𝓁𝑜' => 'mathematical script hello']);

    // mathematical script "hello" requires surrogate pairs for each character
    $jsonPath = '$["\ud835\udcbd\ud835\udc52\ud835\udcc1\ud835\udcc1\ud835\udc5c"]';

    $this->assertSame(['mathematical script hello'], (new JsonCrawler($document))->find($jsonPath));
}
616+
617+
/**
 * A quote character of one kind can appear unescaped inside a key delimited by the other kind.
 */
public function testMixedQuoteTypes()
{
    $document = json_encode([
        'key"with"quotes' => 'value1',
        "key'with'apostrophes" => 'value2',
    ]);
    $crawler = new JsonCrawler($document);

    // double quotes inside a single-quoted key
    $this->assertSame(['value1'], $crawler->find('$[\'key"with"quotes\']'));

    // apostrophes inside a double-quoted key
    $this->assertSame(['value2'], $crawler->find('$["key\'with\'apostrophes"]'));
}
628+
407629
private static function getBookstoreCrawler(): JsonCrawler
408630
{
409631
return new JsonCrawler(<<<JSON
@@ -453,4 +675,28 @@ private static function getSimpleCollectionCrawler(): JsonCrawler
453675
{"a": [3, 5, 1, 2, 4, 6]}
454676
JSON);
455677
}
678+
679+
/**
 * Builds a crawler over a document whose keys and values contain accented, CJK and emoji
 * characters as well as whitespace, quote, backslash and control characters.
 */
private static function getUnicodeDocumentCrawler(): JsonCrawler
{
    $json = [
        'café' => 'coffee',
        '日本' => 'Japan',
        'emoji😀' => 'smiley',
        // whitespace and control characters are written as escapes so that they stay
        // visible and cannot be altered by editors or whitespace normalization
        "tab\there" => 'with tab',
        "new\nline" => 'with newline',
        'quote"here' => 'with quote',
        'backslash\\here' => 'with backslash',
        'apostrophe\'here' => 'with apostrophe',
        "control\x01char" => 'with control char',
        'users' => [
            ['name' => 'café', 'country' => 'France'],
            ['name' => '日本太郎', 'country' => 'Japan'],
            ['name' => 'John', 'country' => 'USA'],
            ['name' => 'Müller', 'country' => 'Germany'],
            ['name' => 'José', 'country' => 'Spain'],
        ],
    ];

    // fail loudly if the fixture cannot be encoded instead of passing false to the crawler
    return new JsonCrawler(json_encode($json, \JSON_THROW_ON_ERROR));
}
456702
}

src/Symfony/Component/JsonPath/composer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
],
1818
"require": {
1919
"php": ">=8.2",
20+
"symfony/polyfill-ctype": "~1.8",
2021
"symfony/polyfill-mbstring": "~1.0"
2122
},
2223
"require-dev": {

0 commit comments

Comments
 (0)
0