From 43902c4bee55a0597ed42817c7c00662f75d3454 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Doeraene?= Date: Mon, 16 Jun 2025 14:01:24 +0200 Subject: [PATCH 1/2] Add more tests for Unicode case-insensitivity in regexes. --- .../javalib/util/regex/RegexEngineTest.scala | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/test-suite/shared/src/test/scala/org/scalajs/testsuite/javalib/util/regex/RegexEngineTest.scala b/test-suite/shared/src/test/scala/org/scalajs/testsuite/javalib/util/regex/RegexEngineTest.scala index 4cc2a720b4..651afeade9 100644 --- a/test-suite/shared/src/test/scala/org/scalajs/testsuite/javalib/util/regex/RegexEngineTest.scala +++ b/test-suite/shared/src/test/scala/org/scalajs/testsuite/javalib/util/regex/RegexEngineTest.scala @@ -1976,8 +1976,52 @@ class RegexEngineTest { val s = compile("s", CaseInsensitive | UnicodeCase) assertMatches(s, "s") assertMatches(s, "S") - assertMatches(s, "\u017F") // ſ LATIN SMALL LETTER LONG S + assertMatches(s, "\u017F") // ſ LATIN SMALL LETTER LONG S; 017F folds to 's' assertNotMatches(s, "t") + + val ranges = compile("[g-l\uFB00\u0175-\u0182\u0540-\u0550\u1F68-\u1F8E\u1FAA-\u1FAF\u2126]", + CaseInsensitive | UnicodeCase) + // g-l + assertMatches(ranges, "H") + assertMatches(ranges, "\u212A") // K KELVIN SIGN, folds to 'k' + // FB00 + assertMatches(ranges, "\uFB00") // ff LATIN SMALL LIGATURE FF + // 0175-0182 (contains 017F which folds to 's') + if (!executingInJVM) { + // https://bugs.openjdk.org/browse/JDK-8360459 + assertMatches(ranges, "s") + assertMatches(ranges, "S") + } + assertMatches(ranges, "\u017F") + assertMatches(ranges, "\u0180") // in range; does not participate in case folding + // 0540-0550 + assertMatches(ranges, "\u0547") // in range + assertMatches(ranges, "\u0577") // 0547 folds to 0577 + // 1F68-1F8E + assertMatches(ranges, "\u1F65") // 1F6D folds to 1F65 + assertMatches(ranges, "\u1F6D") // in range + assertMatches(ranges, "\u1F82") // 1F8A 
folds to 1F82, and 1F82 is also in range + // 1FAA-1FAF + assertMatches(ranges, "\u1FA4") // 1FAC folds to 1FA4 only in simple case folding + // 2126 + assertMatches(ranges, "\u2126") // in the set + assertMatches(ranges, "\u03C9") // 2126 folds to 03C9 + assertMatches(ranges, "\u03A9") // 03A9 also folds to 03C9 + // No matches + assertNotMatches(ranges, "t") + assertNotMatches(ranges, "ff") // the two-character string "ff" would match FB00 only under full case folding + + // Demonstrate that the JVM recognizes 017F as folding to 's' if the range is ASCII + val rangeWithASCII_S = compile("[P-U]", CaseInsensitive | UnicodeCase) + assertMatches(rangeWithASCII_S, "s") + assertMatches(rangeWithASCII_S, "S") + assertMatches(rangeWithASCII_S, "\u017F") + + // Demonstrate that the JVM recognizes 017F as folding to 's' if it is not a range + val nonRangeWith_017F = compile("[\u017F\u0184]", CaseInsensitive | UnicodeCase) + assertMatches(nonRangeWith_017F, "s") + assertMatches(nonRangeWith_017F, "S") + assertMatches(nonRangeWith_017F, "\u017F") } @Test def wordBoundary(): Unit = { From 5696a1a533b0169a5be484c659d96e36a5db5b9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Doeraene?= Date: Mon, 16 Jun 2025 18:06:56 +0200 Subject: [PATCH 2/2] Do not rely on `Formatter` in the debug prints of `RegexEngineTest`. `Formatter` is an entire beast of its own. It is a poor fit for the debugging output of another area of the test suite. As a side effect, the hex digits in the `x escapes are now lowercase (`toHexString`) instead of uppercase (`%02X`). 
--- .../javalib/util/regex/RegexEngineTest.scala | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/test-suite/shared/src/test/scala/org/scalajs/testsuite/javalib/util/regex/RegexEngineTest.scala b/test-suite/shared/src/test/scala/org/scalajs/testsuite/javalib/util/regex/RegexEngineTest.scala index 651afeade9..d174bda341 100644 --- a/test-suite/shared/src/test/scala/org/scalajs/testsuite/javalib/util/regex/RegexEngineTest.scala +++ b/test-suite/shared/src/test/scala/org/scalajs/testsuite/javalib/util/regex/RegexEngineTest.scala @@ -54,11 +54,12 @@ class RegexEngineTest { private def debugEscape(pattern: String): String = { pattern.flatMap { - case '\t' => "`t" - case '\n' => "`n" - case '\r' => "`r" - case c if c < ' ' => "`x%02X".format(c.toInt) - case c => c.toString() + case '\t' => "`t" + case '\n' => "`n" + case '\r' => "`r" + case c if c < 0x10 => "`x0" + c.toInt.toHexString + case c if c < ' ' => "`x" + c.toInt.toHexString + case c => c.toString() } }