30
30
#include " velocypack/Parser.h"
31
31
#include " velocypack/velocypack-aliases.h"
32
32
33
- #include " utils/locale_utils.hpp"
34
33
#include " utils/vpack_utils.hpp"
34
+ #include " utils/icu_locale_utils.hpp"
35
35
36
36
namespace {
37
37
38
38
using namespace irs ;
39
39
40
- constexpr VPackStringRef LOCALE_PARAM_NAME{" locale" };
40
+ constexpr VPackStringRef LOCALE_PARAM_NAME {" locale" };
41
41
42
42
bool parse_vpack_options (
43
43
const VPackSlice slice,
44
44
analysis::collation_token_stream::options_t & options) {
45
45
46
- if (!slice.isObject () && !slice. isString () ) {
46
+ if (!slice.isObject ()) {
47
47
IR_FRMT_ERROR (
48
- " Slice for collation_token_stream is not an object or string " );
48
+ " Slice for collation_token_stream is not an object" );
49
49
return false ;
50
50
}
51
51
52
52
try {
53
- switch (slice.type ()) {
53
+
54
+ auto locale_slice = slice.get (LOCALE_PARAM_NAME);
55
+
56
+ switch (locale_slice.type ()) {
54
57
case VPackValueType::String:
55
- return locale_utils::icu_locale (get_string<string_ref>(slice), options.locale ); // required
58
+ {
59
+ bool res = icu_locale_utils::get_locale_from_str (get_string<string_ref>(locale_slice),
60
+ options.locale ,
61
+ false , // true - new format of locale string
62
+ &options.unicode );
63
+ if (res && options.unicode == icu_locale_utils::Unicode::UTF8) {
64
+ return true ;
65
+ } else {
66
+ return false ;
67
+ }
68
+ }
56
69
case VPackValueType::Object:
57
70
{
58
- auto param_name_slice = slice.get (LOCALE_PARAM_NAME);
59
- if (param_name_slice.isString ()) {
60
- if (!locale_utils::icu_locale (get_string<string_ref>(param_name_slice), options.locale )) {
61
- return false ;
62
- }
63
-
64
- return true ;
65
- }
71
+ return icu_locale_utils::get_locale_from_vpack (locale_slice,
72
+ options.locale ,
73
+ &options.unicode );
74
+
66
75
}
67
76
[[fallthrough]];
68
77
default :
69
78
IR_FRMT_ERROR (
70
79
" Missing '%s' while constructing collation_token_stream"
71
- " from VPack arguments " ,
80
+ " or value is not a string or an object " ,
72
81
LOCALE_PARAM_NAME.data ());
73
82
}
74
83
} catch (const VPackException& ex) {
@@ -109,11 +118,13 @@ bool make_vpack_config(
109
118
const analysis::collation_token_stream::options_t & options,
110
119
VPackBuilder* builder) {
111
120
112
- VPackObjectBuilder object (builder);
121
+ VPackBuilder locale_builder;
122
+ icu_locale_utils::locale_to_vpack (options.locale , &locale_builder, &options.unicode );
123
+
124
+ // locale
125
+ VPackObjectBuilder locale_obj (builder);
113
126
{
114
- // locale
115
- const auto & locale_name = locale_utils::name (options.locale );
116
- builder->add
F438
span>(LOCALE_PARAM_NAME, VPackValue (locale_name));
127
+ builder->add (LOCALE_PARAM_NAME.data (), locale_builder.slice ());
117
128
}
118
129
119
130
return true ;
@@ -138,37 +149,6 @@ bool normalize_vpack_config(const string_ref& args, std::string& config) {
138
149
return false ;
139
150
}
140
151
141
- // //////////////////////////////////////////////////////////////////////////////
142
- // / @brief args is a language to use for normalizing
143
- // //////////////////////////////////////////////////////////////////////////////
144
- analysis::analyzer::ptr make_text (const string_ref& args) {
145
- try {
146
- analysis::collation_token_stream::options_t options;
147
-
148
- if (locale_utils::icu_locale (args, options.locale )) {// interpret 'args' as a locale name
149
- return memory::make_unique<analysis::collation_token_stream>(
150
- std::move (options));
151
- }
152
- } catch (...) {
153
- std::string err_msg = static_cast <std::string>(args);
154
- IR_FRMT_ERROR (
155
- " Caught error while constructing collation_token_stream TEXT arguments: %s" ,
156
- err_msg.c_str ());
157
- }
158
-
159
- return nullptr ;
160
- }
161
-
162
- bool normalize_text_config (const string_ref& args,
163
- std::string& definition) {
164
- std::locale locale;
165
- if (locale_utils::icu_locale (args, locale)){
166
- definition = locale_utils::name (locale);
167
- return true ;
168
- }
169
- return false ;
170
- }
171
-
172
152
analysis::analyzer::ptr make_json (const string_ref& args) {
173
153
try {
174
154
if (args.null ()) {
@@ -206,15 +186,13 @@ bool normalize_json_config(const string_ref& args, std::string& definition) {
206
186
ex.what ());
207
187
} catch (...) {
208
188
IR_FRMT_ERROR (
209
- " Caught error while normalizing text_token_normalizing_stream from JSON" );
189
+ " Caught error while normalizing collation_token_stream from JSON" );
210
190
}
211
191
return false ;
212
192
}
213
193
214
194
REGISTER_ANALYZER_JSON (analysis::collation_token_stream, make_json,
215
195
normalize_json_config);
216
- REGISTER_ANALYZER_TEXT (analysis::collation_token_stream, make_text,
217
- normalize_text_config);
218
196
REGISTER_ANALYZER_VPACK (analysis::collation_token_stream, make_vpack,
219
197
normalize_vpack_config);
220
198
@@ -226,27 +204,19 @@ namespace analysis {
226
204
constexpr size_t MAX_TOKEN_SIZE = 1 << 15 ;
227
205
228
206
struct collation_token_stream ::state_t {
229
- icu::Locale icu_locale;
230
207
const options_t options;
231
208
std::unique_ptr<icu::Collator> collator;
232
209
std::string utf8_buf;
233
210
byte_type term_buf[MAX_TOKEN_SIZE];
234
211
235
- state_t (const options_t & opts)
236
- : icu_locale(" C" ),
212
+ state_t (const options_t & opts) :
237
213
options (opts) {
238
- // NOTE: use of the default constructor for Locale() or
239
- // use of Locale::createFromName(nullptr)
240
- // causes a memory leak with Boost 1.58, as detected by valgrind
241
- icu_locale.setToBogus (); // set to uninitialized
242
214
}
243
215
};
244
216
245
217
/* static*/ void collation_token_stream::init () {
246
218
REGISTER_ANALYZER_JSON (collation_token_stream, make_json,
247
219
normalize_json_config);
248
- REGISTER_ANALYZER_TEXT (collation_token_stream, make_text,
249
- normalize_text_config);
250
220
REGISTER_ANALYZER_VPACK (collation_token_stream, make_vpack,
251
221
normalize_vpack_config);
252
222
}
@@ -257,7 +227,7 @@ void collation_token_stream::state_deleter_t::operator()(state_t* p) const noexc
257
227
258
228
/* static*/ analyzer::ptr collation_token_stream::make (
259
229
const string_ref& locale) {
260
- return make_text (locale);
230
+ return make_json (locale);
261
231
}
262
232
263
233
collation_token_stream::collation_token_stream (
@@ -268,19 +238,10 @@ collation_token_stream::collation_token_stream(
268
238
}
269
239
270
240
bool collation_token_stream::reset (const string_ref& data) {
271
- if (state_->icu_locale .isBogus ()) {
272
- state_->icu_locale = icu::Locale (
273
- std::string (locale_utils::language (state_->options .locale )).c_str (),
274
- std::string (locale_utils::country (state_->options .locale )).c_str ());
275
-
276
- if (state_->icu_locale .isBogus ()) {
277
- return false ;
278
- }
279
- }
280
241
281
242
if (!state_->collator ) {
282
243
auto err = UErrorCode::U_ZERO_ERROR;
283
- state_->collator .reset (icu::Collator::createInstance (state_->icu_locale , err));
244
+ state_->collator .reset (icu::Collator::createInstance (state_->options . locale , err));
284
245
285
246
if (!U_SUCCESS (err) || !state_->collator ) {
286
247
state_->collator .reset ();
@@ -289,26 +250,12 @@ bool collation_token_stream::reset(const string_ref& data) {
289
250
}
290
251
}
291
252
292
- // ...........................................................................
293
- // convert encoding to UTF8 for use with ICU
294
- // ...........................................................................
295
- string_ref data_utf8_ref;
296
- if (locale_utils::is_utf8 (state_->options .locale )) {
297
- data_utf8_ref = data;
298
- } else {
299
- // valid conversion since 'locale_' was created with internal unicode encoding
300
- if (!locale_utils::append_internal (state_->utf8_buf , data, state_->options .locale )) {
301
- return false ; // UTF8 conversion failure
302
- }
303
- data_utf8_ref = state_->utf8_buf ;
304
- }
305
-
306
- if (data_utf8_ref.size () > static_cast <uint32_t >(std::numeric_limits<int32_t >::max ())) {
253
+ if (data.size () > static_cast <uint32_t >(std::numeric_limits<int32_t >::max ())) {
307
254
return false ; // ICU UnicodeString signatures can handle at most INT32_MAX
308
255
}
309
256
310
257
const icu::UnicodeString icu_token = icu::UnicodeString::fromUTF8 (
311
- icu::StringPiece (data_utf8_ref .c_str (), static_cast <int32_t >(data_utf8_ref .size ())));
258
+ icu::StringPiece (data .c_str (), static_cast <int32_t >(data .size ())));
312
259
313
260
int32_t term_size = state_->collator ->getSortKey (
314
261
icu_token, state_->term_buf , sizeof state_->term_buf );
0 commit comments