[JsonPath] Better handling of Unicode chars in expressions · symfony/symfony@cefc5d9 · GitHub
[go: up one dir, main page]

Skip to content
8000

Commit cefc5d9

Browse files
[JsonPath] Better handling of Unicode chars in expressions
1 parent 0795d65 commit cefc5d9

File tree

4 files changed

+357
-2
lines changed

4 files changed

+357
-2
lines changed

src/Symfony/Component/JsonPath/JsonCrawler.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ private function evaluateBracket(string $expr, mixed $value): array
230230

231231
// quoted strings for object keys
232232
if (preg_match('/^([\'"])(.*)\1$/', $expr, $matches)) {
233-
$key = stripslashes($matches[2]);
233+
$key = JsonPathUtils::unescapeString($matches[2], $matches[1]);
234234

235235
return \array_key_exists($key, $value) ? [$value[$key]] : [];
236236
}
@@ -335,7 +335,7 @@ private function evaluateScalar(string $expr, array $context): mixed
335335

336336
// string literals
337337
if (preg_match('/^([\'"])(.*)\1$/', $expr, $matches)) {
338-
return $matches[2];
338+
return JsonPathUtils::unescapeString($matches[2], $matches[1]);
339339
}
340340

341341
// current node references

src/Symfony/Component/JsonPath/JsonPathUtils.php

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,4 +85,80 @@ public static function findSmallestDeserializableStringAndPath(array $tokens, mi
8585
'tokens' => $remainingTokens,
8686
];
8787
}
88+
89+
/**
 * Resolves the backslash escape sequences of a quoted JSONPath string literal.
 *
 * Double-quoted literals are first handed to json_decode(), which handles every
 * JSON escape (including "\uXXXX" and surrogate pairs). When that fails, or for
 * single-quoted literals, the string is scanned manually.
 *
 * In the manual scan, unknown escapes and a trailing lone backslash are kept as-is.
 *
 * @param string $str       the literal's content, without the surrounding quotes
 * @param string $quoteChar the quote character that delimited the literal (' or ")
 */
public static function unescapeString(string $str, string $quoteChar): string
{
    if ('"' === $quoteChar) {
        // try JSON decoding first for unicode sequences
        $decoded = json_decode('"' . $str . '"', true);

        if (is_string($decoded)) {
            return $decoded;
        }
    }

    $result = '';
    $length = strlen($str);

    for ($i = 0; $i < $length; ++$i) {
        // plain byte, or a backslash with nothing after it: copy verbatim
        if ('\\' !== $str[$i] || $i + 1 >= $length) {
            $result .= $str[$i];

            continue;
        }

        $result .= match ($str[$i + 1]) {
            '"' => '"',
            "'" => "'",
            '\\' => '\\',
            '/' => '/',
            // PHP has no "\b" escape in double-quoted strings ("\b" is a backslash
            // followed by "b"), so the backspace character must be spelled as a byte
            'b' => "\x08",
            'f' => "\f",
            'n' => "\n",
            'r' => "\r",
            't' => "\t",
            // advances $i past the hex digits it consumes
            'u' => self::unescapeUnicodeSequence($str, $length, $i),
            default => $str[$i] . $str[$i + 1], // keep the backslash
        };

        // skip the character that followed the backslash
        ++$i;
    }

    return $result;
}
130+
131+
/**
 * Decodes the "\uXXXX" escape that starts at offset $i of $str.
 *
 * $i must point at the backslash. The caller skips the "u" itself once this method
 * returns; on success $i is additionally advanced past the four hex digits, or past
 * the whole second escape when a high/low surrogate pair is combined.
 *
 * Malformed input (too few characters, non-hex digits, or a code point mb_chr()
 * cannot encode, e.g. a lone surrogate) is treated as literal text: the two
 * characters "\u" are returned and $i is left untouched, so whatever follows is
 * copied verbatim by the caller.
 *
 * @param int $length byte length of $str
 * @param int $i      offset of the backslash; updated in place
 */
private static function unescapeUnicodeSequence(string $str, int $length, int &$i): string
{
    // Return backslash AND "u": the caller skips one character after us, so
    // returning only the backslash would silently drop the "u".
    $literal = $str[$i] . $str[$i + 1];

    if ($i + 5 >= $length) {
        // not enough characters for Unicode escape, treat as literal
        return $literal;
    }

    $hex = substr($str, $i + 2, 4);
    if (!ctype_xdigit($hex)) {
        // invalid hex, treat as literal
        return $literal;
    }

    $codepoint = hexdec($hex);

    // high surrogate followed by enough room for a second "\uXXXX" escape
    $mayBePair = 0xD800 <= $codepoint && $codepoint <= 0xDBFF
        && $i + 11 < $length
        && '\\' === $str[$i + 6]
        && 'u' === $str[$i + 7];

    if ($mayBePair) {
        $lowHex = substr($str, $i + 8, 4);
        $lowSurrogate = ctype_xdigit($lowHex) ? hexdec($lowHex) : null;

        if (null !== $lowSurrogate && 0xDC00 <= $lowSurrogate && $lowSurrogate <= 0xDFFF) {
            $combined = mb_chr(0x10000 + (($codepoint & 0x3FF) << 10) + ($lowSurrogate & 0x3FF), 'UTF-8');

            if (false !== $combined) {
                $i += 10; // skip surrogate pair

                return $combined;
            }
        }
    }

    $char = mb_chr($codepoint, 'UTF-8');
    if (false === $char) {
        // not encodable as UTF-8 (e.g. unpaired surrogate), treat as literal
        return $literal;
    }

    // single Unicode character, skip the hex digits
    $i += 4;

    return $char;
}
88164
}

src/Symfony/Component/JsonPath/Tests/JsonCrawlerTest.php

Lines changed: 278 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,260 @@ public function testAcceptsJsonPath()
404404
$this->assertSame('red', $result[0]['color']);
405405
}
406406

407+
/**
 * Bracket selectors written with double quotes must resolve their escape sequences
 * before the key lookup happens.
 *
 * @dataProvider provideUnicodeEscapeSequencesProvider
 */
public function testUnicodeEscapeSequences(string $jsonPath, array $expected)
{
    $crawler = self::getUnicodeDocumentCrawler();
    $actual = $crawler->find($jsonPath);

    $this->assertSame($expected, $actual);
}
414+
415+
/**
 * Data sets for double-quoted bracket selectors.
 *
 * Each row is [JSONPath expression, expected matches]. The expressions are PHP
 * single-quoted strings, so the backslash escapes reach the crawler untouched.
 */
public static function provideUnicodeEscapeSequencesProvider(): array
{
    return [
        // \uXXXX escapes
        ['$["caf\u00e9"]', ['coffee']],
        ['$["\u65e5\u672c"]', ['Japan']],
        ['$["M\u00fcller"]', []],

        // surrogate pair
        ['$["emoji\ud83d\ude00"]', ['smiley']],

        // single-character escapes
        ['$["tab\there"]', ['with tab']],
        ['$["new\nline"]', ['with newline']],
        ['$["quote\"here"]', ['with quote']],
        ['$["backslash\\\\here"]', ['with backslash']],
        ['$["apostrophe\'here"]', ['with apostrophe']],

        // control character
        ['$["control\u0001char"]', ['with control char']],

        // escaped ASCII letter mixed with plain text
        ['$["\u0063af\u00e9"]', ['coffee']],
    ];
}
468+
469+
/**
 * Same lookups as the double-quoted variant, but with single-quoted selectors.
 *
 * @dataProvider provideSingleQuotedStringProvider
 */
public function testSingleQuotedStrings(string $jsonPath, array $expected)
{
    $crawler = self::getUnicodeDocumentCrawler();
    $actual = $crawler->find($jsonPath);

    $this->assertSame($expected, $actual);
}
476+
477+
/**
 * Data sets for single-quoted bracket selectors.
 *
 * Each row is [JSONPath expression, expected matches]. The expressions are PHP
 * double-quoted strings, hence the doubled backslashes.
 */
public static function provideSingleQuotedStringProvider(): array
{
    return [
        // \uXXXX escapes and an unescaped double quote
        ["$['caf\\u00e9']", ['coffee']],
        ["$['\\u65e5\\u672c']", ['Japan']],
        ["$['quote\"here']", ['with quote']],
        ["$['M\\u00fcller']", []],

        // surrogate pair
        ["$['emoji\\ud83d\\ude00']", ['smiley']],

        // single-character escapes
        ["$['tab\\there']", ['with tab']],
        ["$['quote\\\"here']", ['with quote']],
        ["$['backslash\\\\here']", ['with backslash']],
        ["$['apostrophe\\'here']", ['with apostrophe']],

        // control character
        ["$['control\\u0001char']", ['with control char']],

        // escaped ASCII letter mixed with plain text
        ["$['\\u0063af\\u00e9']", ['coffee']],
    ];
}
530+
531+
/**
 * String literals inside filter expressions must be unescaped before comparison.
 *
 * @dataProvider provideFilterWithUnicodeProvider
 */
public function testFilterWithUnicodeStrings(string $jsonPath, int $expectedCount, string $expectedCountry)
{
    $matches = self::getUnicodeDocumentCrawler()->find($jsonPath);

    $this->assertCount($expectedCount, $matches);

    // nothing further to inspect when no match is expected
    if ($expectedCount <= 0) {
        return;
    }

    $this->assertSame($expectedCountry, $matches[0]['country']);
}
544+
545+
/**
 * Data sets for filter expressions comparing against string literals.
 *
 * Each row is [JSONPath expression, expected match count, country of the first match].
 */
public static function provideFilterWithUnicodeProvider(): array
{
    return [
        ['$.users[?(@.name == "caf\u00e9")]', 1, 'France'],
        ['$.users[?(@.name == "\u65e5\u672c\u592a\u90ce")]', 1, 'Japan'],
        ['$.users[?(@.name == "Jos\u00e9")]', 1, 'Spain'],
        ['$.users[?(@.name == "John")]', 1, 'USA'],
        ['$.users[?(@.name == "NonExistent\u0020Name")]', 0, ''],
    ];
}
575+
576+
/**
 * Malformed "\u" escapes must not make the crawler throw; only the result type is checked.
 *
 * @dataProvider provideInvalidUnicodeSequenceProvider
 */
public function testInvalidUnicodeSequencesAreProcessedAsLiterals(string $jsonPath)
{
    $result = self::getUnicodeDocumentCrawler()->find($jsonPath);

    $this->assertIsArray($result, 'invalid unicode sequence should be treated as literal and not throw');
}
583+
584+
/**
 * JSONPath expressions containing malformed "\u" escapes, one expression per data set.
 */
public static function provideInvalidUnicodeSequenceProvider(): array
{
    return [
        ['$["test\uZZZZ"]'], // non-hex digits
        ['$["test\u123"]'],  // only three hex digits
        ['$["test\u"]'],     // no digits at all
    ];
}
598+
599+
/**
 * Multi-segment paths whose every key is written with Unicode escapes.
 *
 * @dataProvider provideComplexUnicodePath
 */
public function testComplexUnicodePaths(string $jsonPath, array $expected)
{
    $users = [
        ['名前' => 'テスト', 'ID' => 1],
        ['名前' => 'サンプル', 'ID' => 2],
    ];

    $document = [
        'データ' => ['ユーザー' => $users],
        'special🔑' => ['value💎' => 'treasure'],
    ];

    $crawler = new JsonCrawler(json_encode($document));
    $actual = $crawler->find($jsonPath);

    $this->assertSame($expected, $actual);
}
620+
621+
/**
 * Data sets for nested lookups through escaped keys.
 *
 * Each row is [JSONPath expression, expected matches].
 */
public static function provideComplexUnicodePath(): array
{
    return [
        // index selector on the nested list
        ['$["\u30c7\u30fc\u30bf"]["\u30e6\u30fc\u30b6\u30fc"][0]["\u540d\u524d"]', ['テスト']],

        // keys containing surrogate pairs
        ['$["special\ud83d\udd11"]["value\ud83d\udc8e"]', ['treasure']],

        // wildcard selector on the nested list
        ['$["\u30c7\u30fc\u30bf"]["\u30e6\u30fc\u30b6\u30fc"][*]["\u540d\u524d"]', ['テスト', 'サンプル']],
    ];
}
638+
639+
/**
 * A key made only of characters that need a surrogate pair each must still be found.
 */
public function testSurrogatePairHandling()
{
    $crawler = new JsonCrawler(json_encode([
        '𝒽𝑒𝓁𝓁𝑜' => 'mathematical script hello',
    ]));

    // mathematical script "hello" requires surrogate pairs for each character
    $escapedPath = '$["\ud835\udcbd\ud835\udc52\ud835\udcc1\ud835\udcc1\ud835\udc5c"]';

    $this->assertSame(['mathematical script hello'], $crawler->find($escapedPath));
}
648+
649+
/**
 * A quote character of the other kind may appear unescaped inside a quoted selector.
 */
public function testMixedQuoteTypes()
{
    $crawler = new JsonCrawler(json_encode([
        'key"with"quotes' => 'value1',
        "key'with'apostrophes" => 'value2',
    ]));

    // double quotes inside a single-quoted selector
    $this->assertSame(['value1'], $crawler->find('$[\'key"with"quotes\']'));

    // apostrophes inside a double-quoted selector
    $this->assertSame(['value2'], $crawler->find('$["key\'with\'apostrophes"]'));
}
660+
407661
private static function getBookstoreCrawler(): JsonCrawler
408662
{
409663
return new JsonCrawler(<<<JSON
@@ -453,4 +707,28 @@ private static function getSimpleCollectionCrawler(): JsonCrawler
453707
{"a": [3, 5, 1, 2, 4, 6]}
454708
JSON);
455709
}
710+
711+
/**
 * Builds the fixture shared by the Unicode/escape-sequence tests.
 *
 * The top-level keys contain non-ASCII text, an emoji, whitespace and control
 * characters, quotes and a backslash; "users" holds records for filter tests.
 */
private static function getUnicodeDocumentCrawler(): JsonCrawler
{
    $json = [
        'café' => 'coffee',
        '日本' => 'Japan',
        'emoji😀' => 'smiley',
        // The TAB is written as an escape: a raw whitespace character inside the
        // literal is invisible and easily turned into a space, which would break
        // the '$["tab\there"]' lookups.
        "tab\there" => 'with tab',
        "new\nline" => 'with newline',
        'quote"here' => 'with quote',
        'backslash\\here' => 'with backslash',
        'apostrophe\'here' => 'with apostrophe',
        "control\x01char" => 'with control char',
        'users' => [
            ['name' => 'café', 'country' => 'France'],
            ['name' => '日本太郎', 'country' => 'Japan'],
            ['name' => 'John', 'country' => 'USA'],
            ['name' => 'Müller', 'country' => 'Germany'],
            ['name' => 'José', 'country' => 'Spain'],
        ],
    ];

    return new JsonCrawler(json_encode($json));
}
456734
}

src/Symfony/Component/JsonPath/composer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
],
1818
"require": {
1919
"php": ">=8.2",
20+
"symfony/polyfill-ctype": "~1.8",
2021
"symfony/polyfill-mbstring": "~1.0"
2122
},
2223
"require-dev": {

0 commit comments

Comments
 (0)
0