@@ -8402,15 +8402,61 @@ std::pair<int32_t, int32_t> nsTextFrame::GetOffsets() const {
8402
8402
return std::make_pair (GetContentOffset (), GetContentEnd ());
8403
8403
}
8404
8404
8405
- static int32_t FindEndOfPunctuationRun (const nsTextFragment* aFrag,
8406
- const gfxTextRun* aTextRun,
8407
- gfxSkipCharsIterator* aIter,
8408
- int32_t aOffset, int32_t aStart,
8409
- int32_t aEnd) {
8405
+ static bool IsFirstLetterPrefixPunctuation (uint32_t aChar) {
8406
+ switch (mozilla::unicode::GetGeneralCategory (aChar)) {
8407
+ case HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION: /* Pc */
8408
+ case HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION: /* Pd */
8409
+ case HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION: /* Pe */
8410
+ case HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION: /* Pf */
8411
+ case HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION: /* Pi */
8412
+ case HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION: /* Po */
8413
+ case HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION: /* Ps */
8414
+ return true ;
8415
+ default :
8416
+ return false ;
8417
+ }
8418
+ }
8419
+
8420
+ static bool IsFirstLetterSuffixPunctuation (uint32_t aChar) {
8421
+ switch (mozilla::unicode::GetGeneralCategory (aChar)) {
8422
+ case HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION: /* Pc */
8423
+ case HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION: /* Pe */
8424
+ case HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION: /* Pf */
8425
+ case HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION: /* Pi */
8426
+ case HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION: /* Po */
8427
+ return true ;
8428
+ default :
8429
+ return false ;
8430
+ }
8431
+ }
8432
+
8433
+ static int32_t FindEndOfPrefixPunctuationRun (const nsTextFragment* aFrag,
8434
+ const gfxTextRun* aTextRun,
8435
+ gfxSkipCharsIterator* aIter,
8436
+ int32_t aOffset, int32_t aStart,
8437
+ int32_t aEnd) {
8410
8438
int32_t i;
8439
+ for (i = aStart; i < aEnd - aOffset; ++i) {
8440
+ if (IsFirstLetterPrefixPunctuation (
8441
+ aFrag->ScalarValueAt (AssertedCast<uint32_t >(aOffset + i)))) {
8442
+ aIter->SetOriginalOffset (aOffset + i);
8443
+ FindClusterEnd (aTextRun, aEnd, aIter);
8444
+ i = aIter->GetOriginalOffset () - aOffset;
8445
+ } else {
8446
+ break ;
8447
+ }
8448
+ }
8449
+ return i;
8450
+ }
8411
8451
8452
+ static int32_t FindEndOfSuffixPunctuationRun (const nsTextFragment* aFrag,
8453
+ const gfxTextRun* aTextRun,
8454
+ gfxSkipCharsIterator* aIter,
8455
+ int32_t aOffset, int32_t aStart,
8456
+ int32_t aEnd) {
8457
+ int32_t i;
8412
8458
for (i = aStart; i < aEnd - aOffset; ++i) {
8413
- if (nsContentUtils::IsFirstLetterPunctuation (
8459
+ if (IsFirstLetterSuffixPunctuation (
8414
8460
aFrag->ScalarValueAt (AssertedCast<uint32_t >(aOffset + i)))) {
8415
8461
aIter->SetOriginalOffset (aOffset + i);
8416
8462
FindClusterEnd (aTextRun, aEnd, aIter);
@@ -8440,7 +8486,6 @@ static bool FindFirstLetterRange(const nsTextFragment* aFrag,
8440
8486
const gfxTextRun* aTextRun, int32_t aOffset,
8441
8487
const gfxSkipCharsIterator& aIter,
8442
8488
int32_t * aLength) {
8443
- int32_t i;
8444
8489
int32_t length = *aLength;
8445
8490
int32_t endOffset = aOffset + length;
8446
8491
gfxSkipCharsIterator iter (aIter);
@@ -8464,25 +8509,39 @@ static bool FindFirstLetterRange(const nsTextFragment* aFrag,
8464
8509
return false ;
8465
8510
};
8466
8511
8467
- // skip leading whitespace, then consume clusters that start with punctuation
8468
- i = FindEndOfPunctuationRun (
8469
- aFrag, aTextRun, &iter, aOffset,
8470
- GetTrimmableWhitespaceCount (aFrag, aOffset, length, 1 ), endOffset);
8471
- if (i == length) {
8472
- return false ;
8473
- }
8512
+ // Skip any trimmable leading whitespace.
8513
+ int32_t i = GetTrimmableWhitespaceCount (aFrag, aOffset, length, 1 );
8514
+ while (true ) {
8515
+ // Scan past any leading punctuation. This leaves `j` at the first
8516
+ // non-punctuation character.
8517
+ int32_t j = FindEndOfPrefixPunctuationRun (aFrag, aTextRun, &iter, aOffset,
8518
+ i, endOffset);
8519
+ if (j == length) {
8520
+ return false ;
8521
+ }
8474
8522
8475
- // skip space/no-break-space after punctuation
8476
- while (i < length) {
8477
- char16_t ch = aFrag->CharAt (AssertedCast<uint32_t >(aOffset + i));
8478
- if (ch == ' ' || ch == CH_NBSP) {
8479
- ++i;
8480
- } else {
8523
+ // Scan past any Unicode whitespace characters after punctuation.
8524
+ while (j < length) {
8525
+ char16_t ch = aFrag->CharAt (AssertedCast<uint32_t >(aOffset + j));
8526
+ // The spec says to allow "characters that belong to the `Zs` Unicode
8527
+ // general category _other than_ U+3000" here.
8528
+ if (unicode::GetGeneralCategory (ch) ==
8529
+ HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR &&
8530
+ ch != 0x3000 ) {
8531
+ ++j;
8532
+ } else {
8533
+ break ;
8534
+ }
8535
+ }
8536
+ if (j == length) {
8537
+ return false ;
8538
+ }
8539
+ if (j == i) {
8540
+ // If no whitespace was found, we've finished the first-letter prefix;
8541
+ // if there was some, then go back to check for more punctuation.
8481
8542
break ;
8482
8543
}
8483
- }
8484
- if (i == length) {
8485
- return false ;
8544
+ i = j;
8486
8545
}
8487
8546
8488
8547
// If the next character is not a letter, number or symbol, there is no
@@ -8495,7 +8554,7 @@ static bool FindFirstLetterRange(const nsTextFragment* aFrag,
8495
8554
return true ;
8496
8555
}
8497
8556
8498
- // consume another cluster (the actual first letter)
8557
+ // Consume another cluster (the actual first letter):
8499
8558
8500
8559
// For complex scripts such as Indic and SEAsian, where first-letter
8501
8560
// should extend to entire orthographic "syllable" clusters, we don't
@@ -8566,9 +8625,12 @@ static bool FindFirstLetterRange(const nsTextFragment* aFrag,
8566
8625
break ;
8567
8626
}
8568
8627
8628
+ // NOTE that FindClusterEnd sets the iterator to the last character that is
8629
+ // part of the cluster, NOT to the first character beyond it.
8569
8630
iter.SetOriginalOffset (aOffset + i);
8570
8631
FindClusterEnd (aTextRun, endOffset, &iter, allowSplitLigature);
8571
8632
8633
+ // Index of the last character included in the first-letter cluster.
8572
8634
i = iter.GetOriginalOffset () - aOffset;
8573
8635
8574
8636
// Heuristic for Indic scripts that like to form conjuncts:
@@ -8616,9 +8678,44 @@ static bool FindFirstLetterRange(const nsTextFragment* aFrag,
8616
8678
}
8617
8679
}
8618
8680
8619
- // consume clusters that start with punctuation
8620
- i = FindEndOfPunctuationRun (aFrag, aTextRun, &iter, aOffset, i + 1 ,
8621
- endOffset);
8681
+ // When we reach here, `i` points to the last character of the first-letter
8682
+ // cluster, NOT to the first character beyond it. Advance to the next char,
8683
+ // ready to check for following whitespace/punctuation:
8684
+ ++i;
8685
+
8686
+ while (i < length) {
8687
+ // Skip over whitespace, except for word separator characters, before the
8688
+ // check for following punctuation. But remember the position before the
8689
+ // whitespace, in case we need to reset.
8690
+ const int32_t preWS = i;
8691
+ while (i < length) {
8692
+ char16_t ch = aFrag->CharAt (AssertedCast<uint32_t >(aOffset + i));
8693
+ // The spec says the first-letter suffix includes "any intervening
8694
+ // typographic space -- characters belonging to the Zs Unicode general
8695
+ // category other than U+3000 IDEOGRAPHIC SPACE or a word separator",
8696
+ // where "word separator" includes U+0020 and U+00A0.
8697
+ if (ch == 0x0020 || ch == 0x00A0 || ch == 0x3000 ||
8698
+ unicode::GetGeneralCategory (ch) !=
8699
+ HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR) {
8700
+ break ;
8701
+ } else {
8702
+ ++i;
8703
+ }
8704
+ }
8705
+
8706
+ // Consume clusters that start with punctuation.
8707
+ const int32_t prePunct = i;
8708
+ i = FindEndOfSuffixPunctuationRun (aFrag, aTextRun, &iter, aOffset, i,
8709
+ endOffset);
8710
+
8711
+ // If we didn't find punctuation here, then we also don't want to include
8712
+ // any preceding whitespace, so reset our index.
8713
+ if (i == prePunct) {
8714
+ i = preWS;
8715
+ break ;
8716
+ }
8717
+ }
8718
+
8622
8719
if (i < length) {
8623
8720
*aLength = i;
8624
8721
}
0 commit comments