8000 ICU-22885 Add parsing of approximately sign by sffc · Pull Request #3454 · unicode-org/icu · GitHub
[go: up one dir, main page]

Skip to content

ICU-22885 Add parsing of approximately sign #3454

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
ICU-22885 Add parsing of approximately sign
  • Loading branch information
sffc committed Mar 26, 2025
commit f17e5c0e0ee31b8e97422500c3cf66c92ff77325
4 changes: 4 additions & 0 deletions icu4c/source/common/static_unicode_sets.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,11 @@ void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
U_ASSERT(gUnicodeSets[PERCENT_SIGN] != nullptr);
U_ASSERT(gUnicodeSets[PERMILLE_SIGN] != nullptr);

// The following don't currently have parseLenients in data.
U_ASSERT(gUnicodeSets[INFINITY_SIGN] == nullptr);
gUnicodeSets[INFINITY_SIGN] = new UnicodeSet(u"[∞]", status);
U_ASSERT(gUnicodeSets[APPROXIMATELY_SIGN] == nullptr);
gUnicodeSets[APPROXIMATELY_SIGN] = new UnicodeSet(u"[∼~≈≃約]", status); // this set was manually curated
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how is this set determeind? What does it based on ? having "約" in this set is strange? Could we have a comments about this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I claim no moral authority for how this set was formed beyond "this set was manually curated".

What I did was open the xml files and look for characters used in the approximately pattern in various locales.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so you mean these characters are gathered by looking at the content of some xml files and some particuarl field in those xml files? If so, could you point out which XML files and which particular fields about HOW you "manually curated" ? Give a little bit more details of how you did that

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, here is what I tried

find common/main/* |xargs egrep approximatelySign |egrep -v "↑↑↑|unconfirmed"|cut -d '>' -f 2|cut -d '<' -f 1|sort -u
-
~
∼
≃
≈
ca.
dáàshì
dáàṣì
約

so... maybe adding comment as "This set of characters is gathered from the values of approximatelySign element of CLDR common/main/*.xml files." /

if (U_FAILURE(status)) { return; }

U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr);
Expand Down
1 change: 1 addition & 0 deletions icu4c/source/common/static_unicode_sets.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ enum Key {
PERCENT_SIGN,
PERMILLE_SIGN,
INFINITY_SIGN,
APPROXIMATELY_SIGN,

// Currency Symbols
DOLLAR_SIGN,
Expand Down
8 changes: 8 additions & 0 deletions icu4c/source/i18n/numparse_affixes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type, UChar32 cp,
case TYPE_PLUS_SIGN:
addMatcher(fWarehouse.plusSign());
break;
case TYPE_APPROXIMATELY_SIGN:
addMatcher(fWarehouse.approximatelySign());
break;
case TYPE_PERCENT:
addMatcher(fWarehouse.percent());
break;
Expand All @@ -97,6 +100,7 @@ void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type, UChar32 cp,
case TYPE_CURRENCY_TRIPLE:
case TYPE_CURRENCY_QUAD:
case TYPE_CURRENCY_QUINT:
case TYPE_CURRENCY_OVERFLOW:
// All currency symbols use the same matcher
addMatcher(fWarehouse.currency(status));
break;
Expand Down Expand Up @@ -142,6 +146,10 @@ NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() {
return fPlusSign = {fSetupData->dfs, true};
}

NumberParseMatcher& AffixTokenMatcherWarehouse::approximatelySign() {
return fApproximatelySign = {fSetupData->dfs, true};
}

NumberParseMatcher& AffixTokenMatcherWarehouse::percent() {
return fPercent = {fSetupData->dfs};
}
Expand Down
3 changes: 3 additions & 0 deletions icu4c/source/i18n/numparse_affixes.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ class U_I18N_API AffixTokenMatcherWarehouse : public UMemory {

NumberParseMatcher& plusSign();

NumberParseMatcher& approximatelySign();

NumberParseMatcher& percent();

NumberParseMatcher& permille();
Expand All @@ -108,6 +110,7 @@ class U_I18N_API AffixTokenMatcherWarehouse : public UMemory {
// NOTE: These are default-constructed and should not be used until initialized.
MinusSignMatcher fMinusSign;
PlusSignMatcher fPlusSign;
ApproximatelySignMatcher fApproximatelySign;
PercentMatcher fPercent;
PermilleMatcher fPermille;
CombinedCurrencyMatcher fCurrency;
Expand Down
1 change: 1 addition & 0 deletions icu4c/source/i18n/numparse_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString&
parser->addMatcher(parser->fLocalMatchers.decimal = {symbols, grouper, parseFlags});
parser->addMatcher(parser->fLocalMatchers.minusSign = {symbols, false});
parser->addMatcher(parser->fLocalMatchers.plusSign = {symbols, false});
parser->addMatcher(parser->fLocalMatchers.approximatelySign = {symbols, false});
parser->addMatcher(parser->fLocalMatchers.percent = {symbols});
parser->addMatcher(parser->fLocalMatchers.permille = {symbols});
parser->addMatcher(parser->fLocalMatchers.nan = {symbols});
Expand Down
1 change: 1 addition & 0 deletions icu4c/source/i18n/numparse_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class U_I18N_API NumberParserImpl : public MutableMatcherCollection, public UMem
PercentMatcher percent;
PermilleMatcher permille;
PlusSignMatcher plusSign;
ApproximatelySignMatcher approximatelySign;
DecimalMatcher decimal;
ScientificMatcher scientific;
CombinedCurrencyMatcher currency;
Expand Down
14 changes: 14 additions & 0 deletions icu4c/source/i18n/numparse_symbols.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,4 +195,18 @@ void PlusSignMatcher::accept(StringSegment& segment, ParsedNumber& result) const
}


ApproximatelySignMatcher::ApproximatelySignMatcher(const DecimalFormatSymbols& dfs, bool allowTrailing)
: SymbolMatcher(dfs.getConstSymbol(DecimalFormatSymbols::kApproximatelySignSymbol), unisets::APPROXIMATELY_SIGN),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

line wrap for these two lines please

fAllowTrailing(allowTrailing) {
}

bool ApproximatelySignMatcher::isDisabled(const ParsedNumber& result) const {
return !fAllowTrailing && result.seenNumber();
}

void ApproximatelySignMatcher::accept(StringSegment& segment, ParsedNumber& result) const {
result.setCharsConsumed(segment);
}


#endif /* #if !UCONFIG_NO_FORMATTING */
17 changes: 17 additions & 0 deletions icu4c/source/i18n/numparse_symbols.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,23 @@ class U_I18N_API PlusSignMatcher : public SymbolMatcher {
bool fAllowTrailing;
};


// Exported as U_I18N_API for tests
class U_I18N_API ApproximatelySignMatcher : public SymbolMatcher {
public:
ApproximatelySignMatcher() = default; // WARNING: Leaves the object in an unusable state

ApproximatelySignMatcher(const DecimalFormatSymbols& dfs, bool allowTrailing);

protected:
bool isDisabled(const ParsedNumber& result) const override;

void accept(StringSegment& segment, ParsedNumber& result) const override;

private:
bool fAllowTrailing;
};

} // namespace numparse::impl
U_NAMESPACE_END

Expand Down
8 changes: 8 additions & 0 deletions icu4c/source/test/intltest/numbertest_parse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,10 @@ void NumberParserTest::testBasic() {
{3, u" 1,234", u"a0", 35, 1234.}, // should not hang
{3, u"NaN", u"0", 3, NAN},
{3, u"NaN E5", u"0", 6, NAN},
{3, u"~100", u"~0", 4, 100.0},
{3, u" ~ 100", u"~0", 6, 100.0},
{3, u"≈100", u"~0", 4, 100.0},
8000 {3, u"100≈", u"~0", 3, 100.0},
{3, u"0", u"0", 1, 0.0}};

parse_flags_t parseFlags = PARSE_FLAG_IGNORE_CASE | PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES;
Expand Down Expand Up @@ -180,6 +184,10 @@ void NumberParserTest::testBasic() {
assertEquals("Strict Parse failed: " + message,
cas.expectedResultDouble, resultObject.getDouble(status));
}

if (status.errDataIfFailureAndReset("parsing test failed")) {
continue;
}
}
}

Expand Down
28 changes: 26 additions & 2 deletions icu4c/source/test/intltest/numfmtst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ void NumberFormatTest::runIndexedTest( int32_t index, UBool exec, const char* &n
TESTCASE_AUTO(TestFormatAttributes);
TESTCASE_AUTO(TestFieldPositionIterator);
TESTCASE_AUTO(TestDecimal);
TESTCASE_AUTO(TestDecimalFormatParse7E);
TESTCASE_AUTO(TestCurrencyFractionDigits);
TESTCASE_AUTO(TestExponentParse);
TESTCASE_AUTO(TestExplicitParents);
Expand Down Expand Up @@ -6981,6 +6982,29 @@ void NumberFormatTest::TestDecimal() {

}

void NumberFormatTest::TestDecimalFormatParse7E() {
UErrorCode status = U_ZERO_ERROR;
UnicodeString testdata = u"~";
icu::Formattable result;
icu::DecimalFormat dfmt(testdata, status);
if (U_SUCCESS(status)) {
dfmt.parse(testdata, result, status);
}

// Test basic behavior
status = U_ZERO_ERROR;
dfmt = icu::DecimalFormat(u"~0", status);
ASSERT_SUCCESS(status);
dfmt.parse(u"200", result, status);
ASSERT_EQUALS(status, U_INVALID_FORMAT_ERROR);
status = U_ZERO_ERROR;
dfmt.parse(u"≈200", result, status);
ASSERT_SUCCESS(status);
if (result.getInt64() != 200) {
errln(UnicodeString(u"Got unexpected parse result: ") + DoubleToUnicodeString(result.getInt64()));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

line wrap please

}
}

void NumberFormatTest::TestCurrencyFractionDigits() {
UErro 8000 rCode status = U_ZERO_ERROR;
UnicodeString text1, text2;
Expand Down Expand Up @@ -10048,7 +10072,7 @@ void NumberFormatTest::Test13733_StrictAndLenient() {
parsedStrictValue = ca_strict->getNumber().getInt64();
}
assertEquals("Strict parse of " + inputString + " using " + patternString,
parsedStrictValue, cas.expectedStrictParse);
cas.expectedStrictParse, parsedStrictValue);

ppos.setIndex(0);
df.setLenient(true);
Expand All @@ -10058,7 +10082,7 @@ void NumberFormatTest::Test13733_StrictAndLenient() {
parsedLenientValue = ca_lenient->getNumber().getInt64();
}
assertEquals("Lenient parse of " + inputString + " using " + patternString,
parsedLenientValue, cas.expectedLenientParse);
cas.expectedLenientParse, parsedLenientValue);
}
}

Expand Down
1 change: 1 addition & 0 deletions icu4c/source/test/intltest/numfmtst.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ class NumberFormatTest: public CalendarTimeZoneTest {
void TestLenientParse();

void TestDecimal();
void TestDecimalFormatParse7E();
void TestCurrencyFractionDigits();

void TestExponentParse();
Expand Down
0