Bug 1957384 - Allow Unicode whitespace characters intervening between… · jwidar/LatencyZeroGithub@6338019 · GitHub
[go: up one dir, main page]

Skip to content

Commit 6338019

Browse files
committed
Bug 1957384 - Allow Unicode whitespace characters intervening between punctuation and letter in ::first-letter range. r=dshin
And allow multiple punctuation elements, with potential intervening whitespace, both before and after the letter. Tests to be added once web-platform-tests/wpt#51361 is merged. Differential Revision: https://phabricator.services.mozilla.com/D243674
1 parent 3b02038 commit 6338019

File tree

3 files changed

+124
-59
lines changed

3 files changed

+124
-59
lines changed

dom/base/nsContentUtils.cpp

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1919,33 +1919,6 @@ nsIBidiKeyboard* nsContentUtils::GetBidiKeyboard() {
19191919
return sBidiKeyboard;
19201920
}
19211921

1922-
/**
1923-
* This is used to determine whether a character is in one of the classes
1924-
* which CSS says should be part of the first-letter. Currently, that is
1925-
* all punctuation classes (P*). Note that this is a change from CSS2
1926-
* which excluded Pc and Pd.
1927-
*
1928-
* https://www.w3.org/TR/css-pseudo-4/#first-letter-pseudo
1929-
* "Punctuation (i.e, characters that belong to the Punctuation (P*) Unicode
1930-
* general category [UAX44]) [...]"
1931-
*/
1932-
1933-
// static
1934-
bool nsContentUtils::IsFirstLetterPunctuation(uint32_t aChar) {
1935-
switch (mozilla::unicode::GetGeneralCategory(aChar)) {
1936-
case HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION: /* Pc */
1937-
case HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION: /* Pd */
1938-
case HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION: /* Pe */
1939-
case HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION: /* Pf */
1940-
case HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION: /* Pi */
1941-
case HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION: /* Po */
1942-
case HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION: /* Ps */
1943-
return true;
1944-
default:
1945-
return false;
1946-
}
1947-
}
1948-
19491922
// static
19501923
bool nsContentUtils::IsAlphanumeric(uint32_t aChar) {
19511924
nsUGenCategory cat = mozilla::unicode::GetGenCategory(aChar);

dom/base/nsContentUtils.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -809,11 +809,6 @@ class nsContentUtils {
809809
static const nsDependentSubstring TrimWhitespace(const nsAString& aStr,
810810
bool aTrimTrailing = true);
811811

812-
/**
813-
* Returns true if aChar is of class Ps, Pi, Po, Pf, or Pe.
814-
*/
815-
static bool IsFirstLetterPunctuation(uint32_t aChar);
816-
817812
/**
818813
* Returns true if aChar is of class Lu, Ll, Lt, Lm, Lo, Nd, Nl or No
819814
*/

layout/generic/nsTextFrame.cpp

Lines changed: 124 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -8402,15 +8402,61 @@ std::pair<int32_t, int32_t> nsTextFrame::GetOffsets() const {
84028402
return std::make_pair(GetContentOffset(), GetContentEnd());
84038403
}
84048404

8405-
static int32_t FindEndOfPunctuationRun(const nsTextFragment* aFrag,
8406-
const gfxTextRun* aTextRun,
8407-
gfxSkipCharsIterator* aIter,
8408-
int32_t aOffset, int32_t aStart,
8409-
int32_t aEnd) {
8405+
// Returns true if aChar may appear in the punctuation run that precedes the
// actual letter of a ::first-letter range.
//
// Every Unicode punctuation general category (Pc, Pd, Pe, Pf, Pi, Po, Ps) is
// accepted; any other category yields false.
// See https://www.w3.org/TR/css-pseudo-4/#first-letter-pseudo
static bool IsFirstLetterPrefixPunctuation(uint32_t aChar) {
8406+
switch (mozilla::unicode::GetGeneralCategory(aChar)) {
8407+
case HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION: /* Pc */
8408+
case HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION: /* Pd */
8409+
case HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION: /* Pe */
8410+
case HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION: /* Pf */
8411+
case HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION: /* Pi */
8412+
case HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION: /* Po */
8413+
case HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION: /* Ps */
8414+
return true;
8415+
default:
8416+
return false;
8417+
}
8418+
}
8419+
8420+
// Returns true if aChar may appear in the punctuation run that follows the
// actual letter of a ::first-letter range.
//
// Accepts Pc, Pe, Pf, Pi and Po. Unlike the prefix predicate, dash (Pd) and
// opening (Ps) punctuation are NOT accepted here.
// NOTE(review): presumably this mirrors the css-pseudo-4 definition of the
// first-letter suffix -- confirm against the spec text.
static bool IsFirstLetterSuffixPunctuation(uint32_t aChar) {
8421+
switch (mozilla::unicode::GetGeneralCategory(aChar)) {
8422+
case HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION: /* Pc */
8423+
case HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION: /* Pe */
8424+
case HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION: /* Pf */
8425+
case HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION: /* Pi */
8426+
case HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION: /* Po */
8427+
return true;
8428+
default:
8429+
return false;
8430+
}
8431+
}
8432+
8433+
// Scans forward over consecutive clusters that begin with a first-letter
// prefix punctuation character.
//
// aFrag     text fragment the characters are read from
// aTextRun  text run passed through to FindClusterEnd
// aIter     skip-chars iterator; repositioned as clusters are consumed
// aOffset   base offset into aFrag; indices below are relative to it
// aStart    index (relative to aOffset) at which scanning begins
// aEnd      end offset; scanning stops once aOffset + i reaches it
//
// Returns the index (relative to aOffset) of the first character that does
// not start a prefix-punctuation cluster, or the index at which the scan hit
// the aEnd limit.
static int32_t FindEndOfPrefixPunctuationRun(const nsTextFragment* aFrag,
8434+
const gfxTextRun* aTextRun,
8435+
gfxSkipCharsIterator* aIter,
8436+
int32_t aOffset, int32_t aStart,
8437+
int32_t aEnd) {
84108438
int32_t i;
8439+
for (i = aStart; i < aEnd - aOffset; ++i) {
8440+
if (IsFirstLetterPrefixPunctuation(
8441+
aFrag->ScalarValueAt(AssertedCast<uint32_t>(aOffset + i)))) {
8442+
// Extend over the whole cluster that this punctuation character starts.
// NOTE(review): FindClusterEnd is understood to leave the iterator on the
// last character of the cluster, so the loop's ++i then steps past it.
aIter->SetOriginalOffset(aOffset + i);
8443+
FindClusterEnd(aTextRun, aEnd, aIter);
8444+
i = aIter->GetOriginalOffset() - aOffset;
8445+
} else {
8446+
break;
8447+
}
8448+
}
8449+
return i;
8450+
}
84118451

8452+
static int32_t FindEndOfSuffixPunctuationRun(const nsTextFragment* aFrag,
8453+
const gfxTextRun* aTextRun,
8454+
gfxSkipCharsIterator* aIter,
8455+
int32_t aOffset, int32_t aStart,
8456+
int32_t aEnd) {
8457+
int32_t i;
84128458
for (i = aStart; i < aEnd - aOffset; ++i) {
8413-
if (nsContentUtils::IsFirstLetterPunctuation(
8459+
if (IsFirstLetterSuffixPunctuation(
84148460
aFrag->ScalarValueAt(AssertedCast<uint32_t>(aOffset + i)))) {
84158461
aIter->SetOriginalOffset(aOffset + i);
84168462
FindClusterEnd(aTextRun, aEnd, aIter);
@@ -8440,7 +8486,6 @@ static bool FindFirstLetterRange(const nsTextFragment* aFrag,
84408486
const gfxTextRun* aTextRun, int32_t aOffset,
84418487
const gfxSkipCharsIterator& aIter,
84428488
int32_t* aLength) {
8443-
int32_t i;
84448489
int32_t length = *aLength;
84458490
int32_t endOffset = aOffset + length;
84468491
gfxSkipCharsIterator iter(aIter);
@@ -8464,25 +8509,39 @@ static bool FindFirstLetterRange(const nsTextFragment* aFrag,
84648509
return false;
84658510
};
84668511

8467-
// skip leading whitespace, then consume clusters that start with punctuation
8468-
i = FindEndOfPunctuationRun(
8469-
aFrag, aTextRun, &iter, aOffset,
8470-
GetTrimmableWhitespaceCount(aFrag, aOffset, length, 1), endOffset);
8471-
if (i == length) {
8472-
return false;
8473-
}
8512+
// Skip any trimmable leading whitespace.
8513+
int32_t i = GetTrimmableWhitespaceCount(aFrag, aOffset, length, 1);
8514+
while (true) {
8515+
// Scan past any leading punctuation. This leaves `j` at the first
8516+
// non-punctuation character.
8517+
int32_t j = FindEndOfPrefixPunctuationRun(aFrag, aTextRun, &iter, aOffset,
8518+
i, endOffset);
8519+
if (j == length) {
8520+
return false;
8521+
}
84748522

8475-
// skip space/no-break-space after punctuation
8476-
while (i < length) {
8477-
char16_t ch = aFrag->CharAt(AssertedCast<uint32_t>(aOffset + i));
8478-
if (ch == ' ' || ch == CH_NBSP) {
8479-
++i;
8480-
} else {
8523+
// Scan past any Unicode whitespace characters after punctuation.
8524+
while (j < length) {
8525+
char16_t ch = aFrag->CharAt(AssertedCast<uint32_t>(aOffset + j));
8526+
// The spec says to allow "characters that belong to the `Zs` Unicode
8527+
// general category _other than_ U+3000" here.
8528+
if (unicode::GetGeneralCategory(ch) ==
8529+
HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR &&
8530+
ch != 0x3000) {
8531+
++j;
8532+
} else {
8533+
break;
8534+
}
8535+
}
8536+
if (j == length) {
8537+
return false;
8538+
}
8539+
if (j == i) {
8540+
// If no whitespace was found, we've finished the first-letter prefix;
8541+
// if there was some, then go back to check for more punctuation.
84818542
break;
84828543
}
8483-
}
8484-
if (i == length) {
8485-
return false;
8544+
i = j;
84868545
}
84878546

84888547
// If the next character is not a letter, number or symbol, there is no
@@ -8495,7 +8554,7 @@ static bool FindFirstLetterRange(const nsTextFragment* aFrag,
84958554
return true;
84968555
}
84978556

8498-
// consume another cluster (the actual first letter)
8557+
// Consume another cluster (the actual first letter):
84998558

85008559
// For complex scripts such as Indic and SEAsian, where first-letter
85018560
// should extend to entire orthographic "syllable" clusters, we don't
@@ -8566,9 +8625,12 @@ static bool FindFirstLetterRange(const nsTextFragment* aFrag,
85668625
break;
85678626
}
85688627

8628+
// NOTE that FindClusterEnd sets the iterator to the last character that is
8629+
// part of the cluster, NOT to the first character beyond it.
85698630
iter.SetOriginalOffset(aOffset + i);
85708631
FindClusterEnd(aTextRun, endOffset, &iter, allowSplitLigature);
85718632

8633+
// Index of the last character included in the first-letter cluster.
85728634
i = iter.GetOriginalOffset() - aOffset;
85738635

85748636
// Heuristic for Indic scripts that like to form conjuncts:
@@ -8616,9 +8678,44 @@ static bool FindFirstLetterRange(const nsTextFragment* aFrag,
86168678
}
86178679
}
86188680

8619-
// consume clusters that start with punctuation
8620-
i = FindEndOfPunctuationRun(aFrag, aTextRun, &iter, aOffset, i + 1,
8621-
endOffset);
8681+
// When we reach here, `i` points to the last character of the first-letter
8682+
// cluster, NOT to the first character beyond it. Advance to the next char,
8683+
// ready to check for following whitespace/punctuation:
8684+
++i;
8685+
8686+
while (i < length) {
8687+
// Skip over whitespace, except for word separator characters, before the
8688+
// check for following punctuation. But remember the position before the
8689+
// whitespace, in case we need to reset.
8690+
const int32_t preWS = i;
8691+
while (i < length) {
8692+
char16_t ch = aFrag->CharAt(AssertedCast<uint32_t>(aOffset + i));
8693+
// The spec says the first-letter suffix includes "any intervening
8694+
// typographic space -- characters belonging to the Zs Unicode general
8695+
// category other than U+3000 IDEOGRAPHIC SPACE or a word separator",
8696+
// where "word separator" includes U+0020 and U+00A0.
8697+
if (ch == 0x0020 || ch == 0x00A0 || ch == 0x3000 ||
8698+
unicode::GetGeneralCategory(ch) !=
8699+
HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR) {
8700+
break;
8701+
} else {
8702+
++i;
8703+
}
8704+
}
8705+
8706+
// Consume clusters that start with punctuation.
8707+
const int32_t prePunct = i;
8708+
i = FindEndOfSuffixPunctuationRun(aFrag, aTextRun, &iter, aOffset, i,
8709+
endOffset);
8710+
8711+
// If we didn't find punctuation here, then we also don't want to include
8712+
// any preceding whitespace, so reset our index.
8713+
if (i == prePunct) {
8714+
i = preWS;
8715+
break;
8716+
}
8717+
}
8718+
86228719
if (i < length) {
86238720
*aLength = i;
86248721
}

0 commit comments

Comments
 (0)
0