From 1ff2df58046b6fe26f11c3288a4d63a465f443b5 Mon Sep 17 00:00:00 2001 From: Fredrik Roubert Date: Fri, 6 Jun 2025 14:02:08 +0200 Subject: [PATCH] ICU-20392 Split the Locale payload into nested and heap allocated. All the most commonly used Locale objects have very little payload, most of them don't use any extensions, don't use a language tag longer than 3 characters and don't use more than a single variant. There's room for all that data in a simple 32 byte large payload object, which can be nested directly in the Locale object. Any payload larger than that can instead be heap allocated as needed, in order to save storage for the most commonly used objects while retaining the ability to create arbitrarily large and complex Locale objects. This reduces the storage requirements for all Locale objects. For nested payloads, this reduction is from 224 bytes to 48 bytes. For payloads that need to be heap allocated, the reduction depends on several factors, but for most cases there's some reduction. There are also cases where this refactoring actually increases the storage used, because CharString allocates more storage than necessary. There are a number of ways in which this could be improved upon, such as optimizing CharString to not allocate more than necessary when copying a string of known length, not allocating any empty CharString objects or possibly replacing CharString with a new class for fixed length strings. The public API remains unchanged but the operations which can lead to U_MEMORY_ALLOCATION_ERROR change. --- icu4c/source/common/locid.cpp | 440 ++++++++++++-------- icu4c/source/common/unicode/locid.h | 138 +++--- icu4c/source/test/depstest/dependencies.txt | 2 + icu4c/source/test/depstest/depstest.py | 7 +- 4 files changed, 369 insertions(+), 218 deletions(-) diff --git a/icu4c/source/common/locid.cpp b/icu4c/source/common/locid.cpp index 54a685d84927..82d2a6ff2ea3 100644 --- a/icu4c/source/common/locid.cpp +++ b/icu4c/source/common/locid.cpp @@ -33,7 +33,9 @@ #include #include +#include #include +#include #include "unicode/bytestream.h" #include "unicode/locid.h" @@ -232,9 +234,88 @@ locale_get_default() return Locale::getDefault().getName(); } +namespace { + +template +void copy_to_array(T* that, std::string_view sv) { + auto& field = that->*FIELD; + constexpr size_t capacity = std::extent_v>; + static_assert(capacity > 0); + if (!sv.empty()) { + U_ASSERT(sv.size() < capacity); + uprv_memcpy(field, sv.data(), sv.size()); + } + field[sv.size()] = '\0'; +} + +} // namespace U_NAMESPACE_BEGIN +Locale::Nest::Nest() : language{'\0'}, script{'\0'}, region{'\0'}, variantBegin{0}, baseName{'\0'} {} +Locale::Nest::~Nest() = default; + +Locale::Nest::Nest(const Nest& other) { + uprv_memcpy(this, &other, sizeof *this); +} + +Locale::Nest& Locale::Nest::operator=(const Nest& other) { + if (this != &other) { + uprv_memcpy(this, &other, sizeof *this); + } + return *this; +} + +void Locale::Nest::init(std::string_view language, + std::string_view script, + std::string_view region, + uint8_t variantBegin) { + copy_to_array<&Nest::language>(this, language); + copy_to_array<&Nest::script>(this, script); + copy_to_array<&Nest::region>(this, region); + this->variantBegin = variantBegin; +} + +struct Locale::Heap : public UMemory { + char language[ULOC_LANG_CAPACITY]; + char script[ULOC_SCRIPT_CAPACITY]; + char region[ULOC_COUNTRY_CAPACITY]; + int32_t variantBegin; + CharString fullName; + CharString baseName; + + const char* getLanguage() const { return language; } + const char* getScript() const { return script; } + const char* getRegion() const { return region; } + const char* getVariant() const { return variantBegin == 0 ? "" : getBaseName() + variantBegin; } + const char* getFullName() const { return fullName.data(); } + const char* getBaseName() const { return baseName.isEmpty() ? getFullName() : baseName.data(); } + + Heap(std::string_view language, + std::string_view script, + std::string_view region, + int32_t variantBegin) + : variantBegin(variantBegin), fullName(), baseName() { + copy_to_array<&Heap::language>(this, language); + copy_to_array<&Heap::script>(this, script); + copy_to_array<&Heap::region>(this, region); + } + + Heap(const Heap& other, UErrorCode& status) + : variantBegin(other.variantBegin), + fullName(other.fullName, status), + baseName(other.baseName, status) { + uprv_memcpy(language, other.language, sizeof language); + uprv_memcpy(script, other.script, sizeof script); + uprv_memcpy(region, other.region, sizeof region); + } + + // Move should be done on the std::unique_ptr object that owns this. + Heap(Heap&&) noexcept = delete; + + ~Heap() = default; +}; + UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Locale) /*Character separating the posix id fields*/ @@ -243,22 +324,10 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Locale) #define SEP_CHAR '_' #define NULL_CHAR '\0' -Locale::~Locale() -{ - if ((baseName != fullName) && (baseName != fullNameBuffer)) { - uprv_free(baseName); - } - baseName = nullptr; - /*if fullName is on the heap, we free it*/ - if (fullName != fullNameBuffer) - { - uprv_free(fullName); - fullName = nullptr; - } -} +Locale::~Locale() = default; Locale::Locale() - : UObject(), fullName(fullNameBuffer), baseName(nullptr) + : UObject(), payload() { init(nullptr, false); } @@ -269,9 +338,8 @@ Locale::Locale() * the default locale.) */ Locale::Locale(Locale::ELocaleType) - : UObject(), fullName(fullNameBuffer), baseName(nullptr) + : UObject(), payload() { - setToBogus(); } @@ -279,7 +347,7 @@ Locale::Locale( const char * newLanguage, const char * newCountry, const char * newVariant, const char * newKeywords) - : UObject(), fullName(fullNameBuffer), baseName(nullptr) + : UObject(), payload() { if( (newLanguage==nullptr) && (newCountry == nullptr) && (newVariant == nullptr) ) { @@ -300,7 +368,6 @@ Locale::Locale( const char * newLanguage, { lsize = static_cast(uprv_strlen(newLanguage)); if ( lsize < 0 || lsize > ULOC_STRING_LIMIT ) { // int32 wrap - setToBogus(); return; } } @@ -312,7 +379,6 @@ Locale::Locale( const char * newLanguage, { csize = static_cast(uprv_strlen(newCountry)); if ( csize < 0 || csize > ULOC_STRING_LIMIT ) { // int32 wrap - setToBogus(); return; } } @@ -329,7 +395,6 @@ Locale::Locale( const char * newLanguage, // remove trailing _'s vsize = static_cast(uprv_strlen(newVariant)); if ( vsize < 0 || vsize > ULOC_STRING_LIMIT ) { // int32 wrap - setToBogus(); return; } while( (vsize>1) && (newVariant[vsize-1] == SEP_CHAR) ) @@ -342,7 +407,6 @@ Locale::Locale( const char * newLanguage, { ksize = static_cast(uprv_strlen(newKeywords)); if ( ksize < 0 || ksize > ULOC_STRING_LIMIT ) { - setToBogus(); return; } } @@ -383,7 +447,6 @@ Locale::Locale( const char * newLanguage, if (U_FAILURE(status)) { // Something went wrong with appending, etc. - setToBogus(); return; } // Parse it, because for example 'language' might really be a complete @@ -393,13 +456,13 @@ Locale::Locale( const char * newLanguage, } Locale::Locale(const Locale &other) - : UObject(other), fullName(fullNameBuffer), baseName(nullptr) + : UObject(other), payload() { *this = other; } Locale::Locale(Locale&& other) noexcept - : UObject(other), fullName(fullNameBuffer), baseName(fullName) { + : UObject(other), payload() { *this = std::move(other); } @@ -408,64 +471,27 @@ Locale& Locale::operator=(const Locale& other) { return *this; } - setToBogus(); - - if (other.fullName == other.fullNameBuffer) { - uprv_strcpy(fullNameBuffer, other.fullNameBuffer); - } else if (other.fullName == nullptr) { - fullName = nullptr; + if (other.isBogus()) { + setToBogus(); + } else if (const Nest* nest = std::get_if(&other.payload)) { + payload.emplace(*nest); + } else if (const std::unique_ptr* heap = std::get_if>(&other.payload)) { + U_ASSERT(*heap); + if (UErrorCode status = U_ZERO_ERROR; + !payload.emplace>(std::make_unique(**heap, status)) || + U_FAILURE(status)) { + setToBogus(); + } } else { - fullName = uprv_strdup(other.fullName); - if (fullName == nullptr) return *this; - } - - if (other.baseName == other.fullName) { - baseName = fullName; - } else if (other.baseName != nullptr) { - baseName = uprv_strdup(other.baseName); - if (baseName == nullptr) return *this; + UPRV_UNREACHABLE_EXIT; } - uprv_strcpy(language, other.language); - uprv_strcpy(script, other.script); - uprv_strcpy(country, other.country); - - variantBegin = other.variantBegin; - fIsBogus = other.fIsBogus; - return *this; } Locale& Locale::operator=(Locale&& other) noexcept { - if ((baseName != fullName) && (baseName != fullNameBuffer)) uprv_free(baseName); - if (fullName != fullNameBuffer) uprv_free(fullName); - - if (other.fullName == other.fullNameBuffer || other.baseName == other.fullNameBuffer) { - uprv_strcpy(fullNameBuffer, other.fullNameBuffer); - } - if (other.fullName == other.fullNameBuffer) { - fullName = fullNameBuffer; - } else { - fullName = other.fullName; - } - - if (other.baseName == other.fullNameBuffer) { - baseName = fullNameBuffer; - } else if (other.baseName == other.fullName) { - baseName = fullName; - } else { - baseName = other.baseName; - } - - uprv_strcpy(language, other.language); - uprv_strcpy(script, other.script); - uprv_strcpy(country, other.country); - - variantBegin = other.variantBegin; - fIsBogus = other.fIsBogus; - - other.baseName = other.fullName = other.fullNameBuffer; - + payload = std::move(other.payload); + other.setToBogus(); return *this; } @@ -1836,16 +1862,8 @@ Locale& Locale::init(const char* localeID, UBool canonicalize) /*This function initializes a Locale from a C locale ID*/ Locale& Locale::init(StringPiece localeID, UBool canonicalize) { - fIsBogus = false; /* Free our current storage */ - if ((baseName != fullName) && (baseName != fullNameBuffer)) { - uprv_free(baseName); - } - baseName = nullptr; - if(fullName != fullNameBuffer) { - uprv_free(fullName); - fullName = fullNameBuffer; - } + Nest& nest = payload.emplace(); // not a loop: // just an easy way to have a common error-exit @@ -1859,9 +1877,6 @@ Locale& Locale::init(StringPiece localeID, UBool canonicalize) int32_t length; UErrorCode err; - /* preset all fields to empty */ - language[0] = script[0] = country[0] = 0; - const auto parse = [canonicalize](std::string_view localeID, char* name, int32_t nameCapacity, @@ -1879,18 +1894,23 @@ Locale& Locale::init(StringPiece localeID, UBool canonicalize) }; // "canonicalize" the locale ID to ICU/Java format + char* fullName = nest.baseName; err = U_ZERO_ERROR; - length = parse(localeID, fullName, sizeof fullNameBuffer, err); + length = parse(localeID, fullName, sizeof Nest::baseName, err); - if (err == U_BUFFER_OVERFLOW_ERROR || length >= static_cast(sizeof(fullNameBuffer))) { - U_ASSERT(baseName == nullptr); + std::unique_ptr fullNameBuffer; + if (err == U_BUFFER_OVERFLOW_ERROR || length >= static_cast(sizeof Nest::baseName)) { /*Go to heap for the fullName if necessary*/ - char* newFullName = static_cast(uprv_malloc(sizeof(char) * (length + 1))); - if (newFullName == nullptr) { + fullNameBuffer = std::make_unique(); + if (!fullNameBuffer) { break; // error: out of memory } - fullName = newFullName; err = U_ZERO_ERROR; + int32_t capacity; + fullName = fullNameBuffer->getAppendBuffer(length + 1, length + 1, capacity, err); + if (U_FAILURE(err)) { + break; // error: out of memory + } length = parse(localeID, fullName, length + 1, err); } if(U_FAILURE(err) || err == U_STRING_NOT_TERMINATED_WARNING) { @@ -1898,7 +1918,10 @@ Locale& Locale::init(StringPiece localeID, UBool canonicalize) break; } - variantBegin = length; + std::string_view language; + std::string_view script; + std::string_view region; + int32_t variantBegin = length; /* after uloc_getName/canonicalize() we know that only '_' are separators */ /* But _ could also appeared in timezone such as "en@timezone=America/Los_Angeles" */ @@ -1924,7 +1947,7 @@ Locale& Locale::init(StringPiece localeID, UBool canonicalize) fieldLen[fieldIdx - 1] = length - static_cast(field[fieldIdx - 1] - fullName); } - if (fieldLen[0] >= static_cast(sizeof(language))) + if (fieldLen[0] >= ULOC_LANG_CAPACITY) { break; // error: the language field is too long } @@ -1932,22 +1955,19 @@ Locale& Locale::init(StringPiece localeID, UBool canonicalize) variantField = 1; /* Usually the 2nd one, except when a script or country is also used. */ if (fieldLen[0] > 0) { /* We have a language */ - uprv_memcpy(language, fullName, fieldLen[0]); - language[fieldLen[0]] = 0; + language = {fullName, static_cast(fieldLen[0])}; } if (fieldLen[1] == 4 && uprv_isASCIILetter(field[1][0]) && uprv_isASCIILetter(field[1][1]) && uprv_isASCIILetter(field[1][2]) && uprv_isASCIILetter(field[1][3])) { /* We have at least a script */ - uprv_memcpy(script, field[1], fieldLen[1]); - script[fieldLen[1]] = 0; + script = {field[1], static_cast(fieldLen[1])}; variantField++; } if (fieldLen[variantField] == 2 || fieldLen[variantField] == 3) { /* We have a country */ - uprv_memcpy(country, field[variantField], fieldLen[variantField]); - country[fieldLen[variantField]] = 0; + region = {field[variantField], static_cast(fieldLen[variantField])}; variantField++; } else if (fieldLen[variantField] == 0) { variantField++; /* script or country empty but variant in next field (i.e. en__POSIX) */ @@ -1958,6 +1978,29 @@ Locale& Locale::init(StringPiece localeID, UBool canonicalize) variantBegin = static_cast(field[variantField] - fullName); } + if (Nest::fits(length, language, script, region)) { + U_ASSERT(fullName == nest.baseName); + U_ASSERT(!fullNameBuffer); + nest.init(language, script, region, variantBegin); + } else { + std::unique_ptr& heap = payload.emplace>( + std::make_unique(language, script, region, variantBegin)); + if (!heap) { + break; // error: out of memory + } + if (fullName == nest.baseName) { + U_ASSERT(!fullNameBuffer); + heap->fullName.copyFrom({fullName, length}, err); + } else { + U_ASSERT(fullNameBuffer); + fullNameBuffer->append(fullName, length, err); + heap->fullName = std::move(*fullNameBuffer); + } + if (U_FAILURE(err)) { + break; + } + } + err = U_ZERO_ERROR; initBaseName(err); if (U_FAILURE(err)) { @@ -2000,29 +2043,52 @@ Locale::initBaseName(UErrorCode &status) { if (U_FAILURE(status)) { return; } - U_ASSERT(baseName==nullptr || baseName==fullName); + U_ASSERT(!isBogus()); + + std::unique_ptr* heap = std::get_if>(&payload); + if (heap != nullptr && *heap && !(*heap)->baseName.isEmpty()) { + return; + } + + const char *fullName = getName(); const char *atPtr = uprv_strchr(fullName, '@'); const char *eqPtr = uprv_strchr(fullName, '='); if (atPtr && eqPtr && atPtr < eqPtr) { // Key words exist. int32_t baseNameLength = static_cast(atPtr - fullName); - char* newBaseName = static_cast(uprv_malloc(baseNameLength + 1)); - if (newBaseName == nullptr) { - status = U_MEMORY_ALLOCATION_ERROR; - return; + if (baseNameLength == 0) { return; } + + if (heap == nullptr) { + // There are keywords, so the payload needs to be moved from Nest + // to Heap so that it can get a baseName. + const Nest* nest = std::get_if(&payload); + U_ASSERT(nest != nullptr); + std::unique_ptr copy = std::make_unique(nest->language, + nest->script, + nest->region, + nest->variantBegin); + if (!copy) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + copy->fullName.copyFrom(fullName, status); + if (U_FAILURE(status)) { return; } + heap = &payload.emplace>(std::move(copy)); + if (!*heap) { + status = U_MEMORY_ALLOCATION_ERROR; + setToBogus(); + return; + } } - baseName = newBaseName; - uprv_strncpy(baseName, fullName, baseNameLength); - baseName[baseNameLength] = 0; // The original computation of variantBegin leaves it equal to the length // of fullName if there is no variant. It should instead be // the length of the baseName. - if (variantBegin > baseNameLength) { - variantBegin = baseNameLength; + if ((*heap)->variantBegin > baseNameLength) { + (*heap)->variantBegin = baseNameLength; } - } else { - baseName = fullName; + + (*heap)->baseName.copyFrom({fullName, baseNameLength}, status); } } @@ -2036,20 +2102,7 @@ Locale::hashCode() const void Locale::setToBogus() { /* Free our current storage */ - if((baseName != fullName) && (baseName != fullNameBuffer)) { - uprv_free(baseName); - } - baseName = nullptr; - if(fullName != fullNameBuffer) { - uprv_free(fullName); - fullName = fullNameBuffer; - } - *fullNameBuffer = 0; - *language = 0; - *script = 0; - *country = 0; - fIsBogus = true; - variantBegin = 0; + payload.emplace(); } const Locale& U_EXPORT2 @@ -2664,54 +2717,62 @@ Locale::setKeywordValue(StringPiece keywordName, status = U_ZERO_ERROR; } - int32_t length = static_cast(uprv_strlen(getName())); - int32_t capacity = fullName == fullNameBuffer ? ULOC_FULLNAME_CAPACITY : length + 1; - - const char* start = locale_getKeywordsStart(fullName); - int32_t offset = start == nullptr ? length : start - fullName; - - for (;;) { - // Remove -1 from the capacity so that this function can guarantee NUL termination. - CheckedArrayByteSink sink(fullName + offset, capacity - offset - 1); - - UErrorCode bufferStatus = U_ZERO_ERROR; - int32_t reslen = ulocimp_setKeywordValue( - {fullName + offset, static_cast(length - offset)}, - keywordName, - keywordValue, - sink, - bufferStatus); + CharString localeID(getName(), -1, status); + ulocimp_setKeywordValue(keywordName, keywordValue, localeID, status); + if (U_FAILURE(status)) { return; } - if (bufferStatus == U_BUFFER_OVERFLOW_ERROR) { - capacity = reslen + offset + 1; - char* newFullName = static_cast(uprv_malloc(capacity)); - if (newFullName == nullptr) { + Nest* nest = std::get_if(&payload); + if (locale_getKeywordsStart(localeID.toStringPiece()) == nullptr) { + if (nest == nullptr) { + // There are no longer any keywords left, so it might now be + // possible to move the payload from Heap to Nest. + std::unique_ptr* heap = std::get_if>(&payload); + U_ASSERT(heap != nullptr); + U_ASSERT(*heap); + std::string_view language = (*heap)->language; + std::string_view script = (*heap)->script; + std::string_view region = (*heap)->region; + if (Nest::fits(localeID.length(), language, script, region)) { + std::unique_ptr save = std::move(*heap); + U_ASSERT(save); + Nest& nest = payload.emplace(); + localeID.extract(nest.baseName, sizeof Nest::baseName, status); + nest.init(language, script, region, save->variantBegin); + } else { + (*heap)->baseName.clear(); + (*heap)->fullName = std::move(localeID); + } + } + } else { + std::unique_ptr* heap = nullptr; + if (nest != nullptr) { + // A keyword has been added, so the payload now needs to be moved + // from Nest to Heap so that it can get a baseName. + std::unique_ptr copy = std::make_unique(nest->language, + nest->script, + nest->region, + nest->variantBegin); + if (!copy) { status = U_MEMORY_ALLOCATION_ERROR; return; } - uprv_memcpy(newFullName, fullName, length + 1); - if (fullName != fullNameBuffer) { - if (baseName == fullName) { - baseName = newFullName; // baseName should not point to freed memory. - } - // if fullName is already on the heap, need to free it. - uprv_free(fullName); + heap = &payload.emplace>(std::move(copy)); + if (!*heap) { + status = U_MEMORY_ALLOCATION_ERROR; + setToBogus(); + return; } - fullName = newFullName; - continue; + } else { + heap = std::get_if>(&payload); + U_ASSERT(heap != nullptr); + U_ASSERT(*heap); } + (*heap)->fullName = std::move(localeID); - if (U_FAILURE(bufferStatus)) { - status = bufferStatus; - return; + if ((*heap)->baseName.isEmpty()) { + // Has added the first keyword, meaning that the fullName is no longer also the baseName. + initBaseName(status); } - u_terminateChars(fullName, capacity, reslen + offset, &status); - break; - } - - if (baseName == fullName) { - // May have added the first keyword, meaning that the fullName is no longer also the baseName. - initBaseName(status); } } @@ -2744,9 +2805,50 @@ Locale::setUnicodeKeywordValue(StringPiece keywordName, setKeywordValue(*legacy_key, value, status); } -const char * +const char* +Locale::getCountry() const { + return getField<&Nest::getRegion, &Heap::getRegion>(); +} + +const char* +Locale::getLanguage() const { + return getField<&Nest::getLanguage, &Heap::getLanguage>(); +} + +const char* +Locale::getScript() const { + return getField<&Nest::getScript, &Heap::getScript>(); +} + +const char* +Locale::getVariant() const { + return getField<&Nest::getVariant, &Heap::getVariant>(); +} + +const char* +Locale::getName() const { + return getField<&Nest::getBaseName, &Heap::getFullName>(); +} + +const char* Locale::getBaseName() const { - return baseName; + return getField<&Nest::getBaseName, &Heap::getBaseName>(); +} + +template +const char* Locale::getField() const { + if (isBogus()) { + return ""; + } + if (const Nest* nest = std::get_if(&payload)) { + return (nest->*NEST)(); + } + if (const std::unique_ptr* heap = std::get_if>(&payload)) { + U_ASSERT(*heap); + return ((**heap).*HEAP)(); + } + UPRV_UNREACHABLE_EXIT; } Locale::Iterator::~Iterator() = default; diff --git a/icu4c/source/common/unicode/locid.h b/icu4c/source/common/unicode/locid.h index 5ec5f33cb620..9e1a341147ca 100644 --- a/icu4c/source/common/unicode/locid.h +++ b/icu4c/source/common/unicode/locid.h @@ -35,6 +35,10 @@ #if U_SHOW_CPLUSPLUS_API +#include +#include +#include + #include "unicode/bytestream.h" #include "unicode/localpointer.h" #include "unicode/strenum.h" @@ -469,7 +473,7 @@ class U_COMMON_API Locale : public UObject { * @return An alias to the code * @stable ICU 2.0 */ - inline const char * getLanguage( ) const; + const char* getLanguage() const; /** * Returns the locale's ISO-15924 abbreviation script code. @@ -478,21 +482,21 @@ class U_COMMON_API Locale : public UObject { * @see uscript_getCode * @stable ICU 2.8 */ - inline const char * getScript( ) const; + const char* getScript() const; /** * Returns the locale's ISO-3166 country code. * @return An alias to the code * @stable ICU 2.0 */ - inline const char * getCountry( ) const; + const char* getCountry() const; /** * Returns the locale's variant code. * @return An alias to the code * @stable ICU 2.0 */ - inline const char * getVariant( ) const; + const char* getVariant() const; /** * Returns the programmatic name of the entire locale, with the language, @@ -502,7 +506,7 @@ class U_COMMON_API Locale : public UObject { * @return A pointer to "name". * @stable ICU 2.0 */ - inline const char * getName() const; + const char* getName() const; /** * Returns the programmatic name of the entire locale as getName() would return, @@ -511,7 +515,7 @@ class U_COMMON_API Locale : public UObject { * @see getName * @stable ICU 2.8 */ - const char * getBaseName() const; + const char* getBaseName() const; /** * Add the likely subtags for this Locale, per the algorithm described @@ -1157,17 +1161,74 @@ class U_COMMON_API Locale : public UObject { */ static Locale* getLocaleCache(); - char language[ULOC_LANG_CAPACITY]; - char script[ULOC_SCRIPT_CAPACITY]; - char country[ULOC_COUNTRY_CAPACITY]; - int32_t variantBegin; - char* fullName; - char fullNameBuffer[ULOC_FULLNAME_CAPACITY]; - // name without keywords - char* baseName; - void initBaseName(UErrorCode& status); + /** + * Locale data that can be nested directly within the std::variant object. + * @internal + */ + struct U_COMMON_API Nest { + static constexpr size_t SIZE = 32; + + char language[4]; + char script[5]; + char region[4]; + uint8_t variantBegin; + char baseName[SIZE - sizeof language - sizeof script - sizeof region - sizeof variantBegin]; + + const char* getLanguage() const { return language; } + const char* getScript() const { return script; } + const char* getRegion() const { return region; } + const char* getVariant() const { return variantBegin == 0 ? "" : getBaseName() + variantBegin; } + const char* getBaseName() const { return baseName; } + + // Doesn't inherit from UMemory, shouldn't be heap allocated. + static void* U_EXPORT2 operator new(size_t) noexcept = delete; + static void* U_EXPORT2 operator new[](size_t) noexcept = delete; + + Nest(); + ~Nest(); + + Nest(const Nest& other); + Nest& operator=(const Nest& other); + + void init(std::string_view language, + std::string_view script, + std::string_view region, + uint8_t variantBegin); + + static bool fits(int32_t length, + std::string_view language, + std::string_view script, + std::string_view region) { + return length < static_cast(sizeof Nest::baseName) && + language.size() < sizeof Nest::language && + script.size() < sizeof Nest::script && + region.size() < sizeof Nest::region; + } + }; + static_assert(sizeof(Nest) == Nest::SIZE); + + /** + * Locale data that needs to be heap allocated in a std::unique_ptr object. + * @internal + */ + struct Heap; + + std::variant> payload; - UBool fIsBogus; + /** + * Call a field getter function on either Nest or Heap in payload. + * (This is kind of std::visit but simpler and without exceptions.) + * + * @tparam NEST Pointer to the Nest getter function. + * @tparam HEAP Pointer to the Heap getter function. + * @return the result from the getter, or the empty string if isBogus(). + * @internal + */ + template + const char* getField() const; + + void initBaseName(UErrorCode& status); static const Locale &getLocale(int locid); @@ -1199,36 +1260,6 @@ Locale::toLanguageTag(UErrorCode& status) const return result; } -inline const char * -Locale::getCountry() const -{ - return country; -} - -inline const char * -Locale::getLanguage() const -{ - return language; -} - -inline const char * -Locale::getScript() const -{ - return script; -} - -inline const char * -Locale::getVariant() const -{ - return isBogus() ? "" : &baseName[variantBegin]; -} - -inline const char * -Locale::getName() const -{ - return fullName; -} - template inline void Locale::getKeywords(OutputIterator iterator, UErrorCode& status) const { @@ -1287,11 +1318,24 @@ Locale::getUnicodeKeywordValue(StringPiece keywordName, UErrorCode& status) cons inline UBool Locale::isBogus() const { - return fIsBogus; + return std::holds_alternative(payload); } U_NAMESPACE_END +/// @cond DOXYGEN_IGNORE +// Export an explicit template instantiation of the payload std::variant. +// (When building DLLs for Windows this is required.) +#if U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN +#if defined(U_REAL_MSVC) && defined(_MSVC_STL_VERSION) +template class U_COMMON_API + std::_Variant_storage_>; +#endif +template class U_COMMON_API + std::variant>; +#endif +/// @endcond + #endif /* U_SHOW_CPLUSPLUS_API */ #endif diff --git a/icu4c/source/test/depstest/dependencies.txt b/icu4c/source/test/depstest/dependencies.txt index caf3c57cdbe8..deced67cff80 100644 --- a/icu4c/source/test/depstest/dependencies.txt +++ b/icu4c/source/test/depstest/dependencies.txt @@ -67,6 +67,8 @@ group: c_strings # Additional symbols in an optimized build. __strcpy_chk __strncpy_chk __strcat_chk __strncat_chk __rawmemchr __memcpy_chk __memmove_chk __memset_chk + # glibc + __isoc23_strtol __isoc23_strtoul group: c_string_formatting atoi atol strtod strtod_l strtol strtoul diff --git a/icu4c/source/test/depstest/depstest.py b/icu4c/source/test/depstest/depstest.py index f993308fbd38..dccfa17bd50c 100755 --- a/icu4c/source/test/depstest/depstest.py +++ b/icu4c/source/test/depstest/depstest.py @@ -112,14 +112,17 @@ def _ReadLibrary(root_path, library_name): ("common/umutex.o", "__once_proxy"), ("common/umutex.o", "__tls_get_addr"), ("common/unifiedcache.o", "std::__throw_system_error(int)"), - # Some of the MessageFormat 2 modules reference exception-related symbols + # Some of the ICU4C modules reference exception-related symbols # in instantiations of the `std::get()` method that gets an alternative # from a `std::variant`. # These instantiations of `std::get()` are only called by compiler-generated # code (the implementations of built-in `swap()` methods for types # that include a `std::variant`; and `std::__detail::__variant::__gen_vtable_impl()`, - # which constructs vtables. The MessageFormat 2 code itself only calls + # which constructs vtables. The ICU4C code itself only calls # `std::get_if()`, which is exception-free; never `std::get()`. + ("common/locid.o", "typeinfo for std::exception"), + ("common/locid.o", "vtable for std::exception"), + ("common/locid.o", "std::exception::~exception()"), ("i18n/messageformat2_data_model.o", "typeinfo for std::exception"), ("i18n/messageformat2_data_model.o", "vtable for std::exception"), ("i18n/messageformat2_data_model.o", "std::exception::~exception()"),