[JsonPath] Better handling of Unicode chars in expressions · symfony/symfony@f12c05c · GitHub
[go: up one dir, main page]

Skip to content
10000

Commit f12c05c

Browse files
[JsonPath] Better handling of Unicode chars in expressions
1 parent 0795d65 commit f12c05c

File tree

3 files changed

+355
-2
lines changed

3 files changed

+355
-2
lines changed

src/Symfony/Component/JsonPath/JsonCrawler.php

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -230,14 +230,29 @@ private function evaluateBracket(string $expr, mixed $value): array
230230

231231
// quoted strings for object keys
232232
if (preg_match('/^([\'"])(.*)\1$/', $expr, $matches)) {
233-
$key = stripslashes($matches[2]);
233+
$key = $this->unescapeString($matches[2], $matches[1]);
234234

235235
return \array_key_exists($key, $value) ? [$value[$key]] : [];
236236
}
237237

238238
throw new \LogicException(\sprintf('Unsupported bracket expression "%s".', $expr));
239239
}
240240

241+
/**
 * Resolves escape sequences in the content of a quoted string literal.
 *
 * Double-quoted content is first handed to json_decode(), wrapped in double
 * quotes. When that yields null, or when the literal was not double-quoted,
 * JsonPathUtils::unescapeString() is used instead.
 *
 * @param string $str       the literal's content, without its surrounding quotes
 * @param string $quoteChar the quote character that delimited the literal
 */
private function unescapeString(string $str, string $quoteChar): string
{
    if ('"' !== $quoteChar) {
        return JsonPathUtils::unescapeString($str);
    }

    // the content is re-wrapped in double quotes so that it forms a JSON string document
    $decoded = json_decode('"'.$str.'"', true);

    return $decoded ?? JsonPathUtils::unescapeString($str);
}
255+
241256
private function evaluateFilter(string $expr, mixed $value): array
242257
{
243258
if (!\is_array($value)) {
@@ -335,7 +350,7 @@ private function evaluateScalar(string $expr, array $context): mixed
335350

336351
// string literals
337352
if (preg_match('/^([\'"])(.*)\1$/', $expr, $matches)) {
338-
return $matches[2];
353+
return $this->unescapeString($matches[2], $matches[1]);
339354
}
340355

341356
// current node references

src/Symfony/Component/JsonPath/JsonPathUtils.php

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,4 +85,96 @@ public static function findSmallestDeserializableStringAndPath(array $tokens, mi
8585
'tokens' => $remainingTokens,
8686
];
8787
}
88+
89+
/**
 * Resolves backslash escape sequences in the content of a quoted string.
 *
 * Recognised escapes are \" \' \\ \/ \b \f \n \r \t and \uXXXX. A high
 * surrogate escape immediately followed by a low surrogate escape is combined
 * into a single code point. Everything else (an unknown escape, a trailing
 * backslash, a truncated or non-hexadecimal \u sequence, a surrogate that is
 * not part of a valid pair) is copied to the output unchanged.
 *
 * @param string $str the string content, without its surrounding quotes
 *
 * @return string the content with every recognised escape sequence resolved
 */
public static function unescapeString(string $str): string
{
    $result = '';
    $length = \strlen($str);

    for ($i = 0; $i < $length; ++$i) {
        // plain byte, or a backslash with nothing after it
        if ('\\' !== $str[$i] || $i + 1 >= $length) {
            $result .= $str[$i];
            continue;
        }

        $simple = match ($str[$i + 1]) {
            '"' => '"',
            "'" => "'",
            '\\' => '\\',
            '/' => '/',
            // PHP has no "\b" escape: "\b" would be a literal backslash followed by "b"
            'b' => "\x08",
            'f' => "\f",
            'n' => "\n",
            'r' => "\r",
            't' => "\t",
            default => null,
        };

        if (null !== $simple) {
            $result .= $simple;
            ++$i; // skip the escaped character
            continue;
        }

        // "\uXXXX" occupies offsets $i to $i + 5
        if ('u' === $str[$i + 1] && $i + 5 < $length) {
            $hex = substr($str, $i + 2, 4);

            if (ctype_xdigit($hex)) {
                $codepoint = (int) hexdec($hex);

                // a surrogate pair "\uXXXX\uXXXX" occupies offsets $i to $i + 11
                if ($codepoint >= 0xD800 && $codepoint <= 0xDBFF
                    && $i + 11 < $length
                    && '\\' === $str[$i + 6] && 'u' === $str[$i + 7]
                ) {
                    $lowHex = substr($str, $i + 8, 4);

                    if (ctype_xdigit($lowHex)) {
                        $lowSurrogate = (int) hexdec($lowHex);

                        if ($lowSurrogate >= 0xDC00 && $lowSurrogate <= 0xDFFF) {
                            $combined = 0x10000 + (($codepoint & 0x3FF) << 10) + ($lowSurrogate & 0x3FF);
                            $result .= mb_chr($combined, 'UTF-8');
                            $i += 11; // skip both escape sequences
                            continue;
                        }
                    }
                }

                $char = mb_chr($codepoint, 'UTF-8');

                // mb_chr() returns false for surrogates; appending false would silently drop the sequence
                if (false !== $char) {
                    $result .= $char;
                    $i += 5;
                    continue;
                }
            }
        }

        // unknown, malformed or unrepresentable escape: keep the backslash, the
        // characters after it are copied as-is by the following iterations
        $result .= $str[$i];
    }

    return $result;
}
88180
}

src/Symfony/Component/JsonPath/Tests/JsonCrawlerTest.php

Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,228 @@ public function testAcceptsJsonPath()
404404
$this->assertSame('red', $result[0]['color']);
405405
}
406406

407+
/**
 * Asserts that find() returns the expected values for double-quoted bracket
 * keys written with escape sequences.
 *
 * @dataProvider provideUnicodeEscapeSequencesProvider
 */
public function testUnicodeEscapeSequences(string $jsonPath, array $expected)
{
    $crawler = self::getUnicodeDocumentCrawler();
    $actual = $crawler->find($jsonPath);

    $this->assertSame($expected, $actual);
}
414+
415+
public static function provideUnicodeEscapeSequencesProvider(): array
{
    // each row: [JSONPath expression, expected result of find()]
    return [
        // \uXXXX escapes inside the Basic Multilingual Plane
        ['$["caf\u00e9"]', ['coffee']],
        ['$["\u65e5\u672c"]', ['Japan']],
        ['$["M\u00fcller"]', []],

        // surrogate pair
        ['$["emoji\ud83d\ude00"]', ['smiley']],

        // single-character escapes
        ['$["tab\there"]', ['with tab']],
        ['$["new\nline"]', ['with newline']],
        ['$["quote\"here"]', ['with quote']],
        ['$["backslash\\\\here"]', ['with backslash']],
        ['$["apostrophe\'here"]', ['with apostrophe']],

        // control character
        ['$["control\u0001char"]', ['with control char']],

        // escaped ASCII letter mixed with plain text
        ['$["\u0063af\u00e9"]', ['coffee']],
    ];
}
468+
469+
/**
 * Asserts that find() returns the expected values for single-quoted bracket keys.
 *
 * @dataProvider provideSingleQuotedStringProvider
 */
public function testSingleQuotedStrings(string $jsonPath, array $expected)
{
    $crawler = self::getUnicodeDocumentCrawler();
    $actual = $crawler->find($jsonPath);

    $this->assertSame($expected, $actual);
}
476+
477+
public static function provideSingleQuotedStringProvider(): array
{
    // each row: [JSONPath expression, expected result of find()]
    return [
        ['$[\'caf\u00e9\']', ['coffee']],
        ['$[\'\u65e5\u672c\']', ['Japan']],
        ['$[\'quote"here\']', ['with quote']],
        ['$[\'apostrophe\\\'here\']', ['with apostrophe']],
    ];
}
498+
499+
/**
 * Asserts the number of users matched by a filter that compares the name with
 * a string literal and, when there is a match, the country of the first one.
 *
 * @dataProvider provideFilterWithUnicodeProvider
 */
public function testFilterWithUnicodeStrings(string $jsonPath, int $expectedCount, string $expectedCountry)
{
    $matches = self::getUnicodeDocumentCrawler()->find($jsonPath);

    $this->assertCount($expectedCount, $matches);

    // nothing further to inspect when no match is expected
    if ($expectedCount <= 0) {
        return;
    }

    $this->assertSame($expectedCountry, $matches[0]['country']);
}
512+
513+
public static function provideFilterWithUnicodeProvider(): array
{
    // each row: [JSONPath expression, expected match count, expected country of the first match]
    return [
        ['$.users[?(@.name == "caf\u00e9")]', 1, 'France'],
        ['$.users[?(@.name == "\u65e5\u672c\u592a\u90ce")]', 1, 'Japan'],
        ['$.users[?(@.name == "Jos\u00e9")]', 1, 'Spain'],
        ['$.users[?(@.name == "John")]', 1, 'USA'],
        ['$.users[?(@.name == "NonExistent\u0020Name")]', 0, ''],
    ];
}
543+
544+
/**
 * Asserts that find() still returns an array when the key contains a malformed \u escape.
 *
 * @dataProvider provideInvalidUnicodeSequenceProvider
 */
public function testInvalidUnicodeSequencesAreProcessedAsLiterals(string $jsonPath)
{
    $result = self::getUnicodeDocumentCrawler()->find($jsonPath);

    $this->assertIsArray($result, 'invalid unicode sequence should be treated as literal and not throw');
}
551+
552+
public static function provideInvalidUnicodeSequenceProvider(): array
{
    return [
        // non-hexadecimal digits
        ['$["test\uZZZZ"]'],
        // only three hexadecimal digits
        ['$["test\u123"]'],
        // no digits at all
        ['$["test\u"]'],
    ];
}
566+
567+
/**
 * Asserts that escaped keys resolve through several nesting levels of a
 * document whose keys are Japanese text or contain emoji.
 *
 * @dataProvider provideComplexUnicodePath
 */
public function testComplexUnicodePaths(string $jsonPath, array $expected)
{
    $users = [
        ['名前' => 'テスト', 'ID' => 1],
        ['名前' => 'サンプル', 'ID' => 2],
    ];

    $document = [
        'データ' => ['ユーザー' => $users],
        'special🔑' => ['value💎' => 'treasure'],
    ];

    $crawler = new JsonCrawler(json_encode($document));
    $found = $crawler->find($jsonPath);

    $this->assertSame($expected, $found);
}
588+
589+
public static function provideComplexUnicodePath(): array
{
    // each row: [JSONPath expression, expected result of find()]
    return [
        // escaped keys combined with a numeric index
        ['$["\u30c7\u30fc\u30bf"]["\u30e6\u30fc\u30b6\u30fc"][0]["\u540d\u524d"]', ['テスト']],
        // surrogate pairs at two nesting levels
        ['$["special\ud83d\udd11"]["value\ud83d\udc8e"]', ['treasure']],
        // escaped keys combined with a wildcard
        ['$["\u30c7\u30fc\u30bf"]["\u30e6\u30fc\u30b6\u30fc"][*]["\u540d\u524d"]', ['テスト', 'サンプル']],
    ];
}
606+
607+
public function testSurrogatePairHandling()
{
    // mathematical script "hello": every character is written as a surrogate pair in the path below
    $document = json_encode(['𝒽𝑒𝓁𝓁𝑜' => 'mathematical script hello']);
    $path = '$["\ud835\udcbd\ud835\udc52\ud835\udcc1\ud835\udcc1\ud835\udc5c"]';

    $found = (new JsonCrawler($document))->find($path);

    $this->assertSame(['mathematical script hello'], $found);
}
616+
617+
public function testMixedQuoteTypes()
{
    $crawler = new JsonCrawler(json_encode([
        'key"with"quotes' => 'value1',
        "key'with'apostrophes" => 'value2',
    ]));

    // unescaped double quotes inside a single-quoted key
    $this->assertSame(['value1'], $crawler->find('$[\'key"with"quotes\']'));

    // unescaped apostrophes inside a double-quoted key
    $this->assertSame(['value2'], $crawler->find('$["key\'with\'apostrophes"]'));
}
628+
407629
private static function getBookstoreCrawler(): JsonCrawler
408630
{
409631
return new JsonCrawler(<<<JSON
@@ -453,4 +675,28 @@ private static function getSimpleCollectionCrawler(): JsonCrawler
453675
{"a": [3, 5, 1, 2, 4, 6]}
454676
JSON);
455677
}
678+
679+
/**
 * Builds a crawler over a document whose top-level keys contain accented,
 * CJK, emoji, whitespace, quote, backslash and control characters, plus a
 * "users" list used by the filter tests.
 */
private static function getUnicodeDocumentCrawler(): JsonCrawler
{
    $json = [
        'café' => 'coffee',
        '日本' => 'Japan',
        'emoji😀' => 'smiley',
        // written as an escape so the tab is visible and cannot be turned into a space by an editor
        "tab\there" => 'with tab',
        "new\nline" => 'with newline',
        'quote"here' => 'with quote',
        'backslash\\here' => 'with backslash',
        'apostrophe\'here' => 'with apostrophe',
        "control\x01char" => 'with control char',
        'users' => [
            ['name' => 'café', 'country' => 'France'],
            ['name' => '日本太郎', 'country' => 'Japan'],
            ['name' => 'John', 'country' => 'USA'],
            ['name' => 'Müller', 'country' => 'Germany'],
            ['name' => 'José', 'country' => 'Spain'],
        ],
    ];

    // fail loudly on an encoding error instead of handing false to the crawler
    return new JsonCrawler(json_encode($json, \JSON_THROW_ON_ERROR));
}
456702
}

0 commit comments

Comments
 (0)
0