8000 Feature/iresearch update (#14823) · arangodb/arangodb@d4bea91 · GitHub
[go: up one dir, main page]

Skip to content

Commit d4bea91

Browse files
authored
Feature/iresearch update (#14823)
1 parent 0c2fbee commit d4bea91

File tree

186 files changed

+105104
-104257
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

186 files changed

+105104
-104257
lines changed

3rdParty/iresearch/CHANGELOG

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,18 @@
11
master
22
-------------------------
33

4+
* Fix threading related issues
5+
6+
* Add "object" locale parsing for the `collation_token_stream` to support defining locale variant
7+
and keywords
8+
49
v1.1 (2021-08-25)
510
-------------------------
611

12+
* Get rid of `utf8_path` in favor of `std::filesystem::path`.
13+
14+
* Fix sporadic "error while reading compact" failures.
15+
716
* Rework Compression API to return `std::unique_ptr` instead of `std::shared_ptr`.
817

918
* Rework Analyzer API to return `std::unique_ptr` instead of `std::shared_ptr`.

3rdParty/iresearch/core/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,6 @@ set(IResearch_core_sources
147147
./utils/async_utils.cpp
148148
./utils/thread_utils.cpp
149149
./utils/attributes.cpp
150-
./utils/attribute_store.cpp
151150
./utils/automaton_utils.cpp
152151
./utils/bit_packing.cpp
153152
./utils/encryption.cpp
@@ -169,6 +168,7 @@ set(IResearch_core_sources
169168
./utils/version_utils.cpp
170169
./utils/utf8_path.cpp
171170
./utils/locale_utils.cpp
171+
./utils/icu_locale_utils.cpp
172172
./utils/log.cpp
173173
./utils/result.cpp
174174
./utils/so_utils.cpp

3rdParty/iresearch/core/analysis/analyzers.hpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,6 @@ class IRESEARCH_API analyzers {
9696
////////////////////////////////////////////////////////////////////////////////
9797
/// @brief find an analyzer by name, or nullptr if not found
9898
/// indirect call to <class>::make(...)
99-
/// requires use of DECLARE_FACTORY() in class definition
10099
/// NOTE: make(...) MUST be defined in CPP to ensure proper code scope
101100
////////////////////////////////////////////////////////////////////////////////
102101
static result get(
@@ -109,7 +108,6 @@ class IRESEARCH_API analyzers {
109108
////////////////////////////////////////////////////////////////////////////////
110109
/// @brief find an analyzer by name, or nullptr if not found
111110
/// indirect call to <class>::make(...)
112-
/// requires use of DECLARE_FACTORY() in class definition
113111
/// NOTE: make(...) MUST be defined in CPP to ensure proper code scope
114112
////////////////////////////////////////////////////////////////////////////////
115113
static analyzer::ptr get(

3rdParty/iresearch/core/analysis/collation_token_stream.cpp

Lines changed: 36 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -30,45 +30,54 @@
3030
#include "velocypack/Parser.h"
3131
#include "velocypack/velocypack-aliases.h"
3232

33-
#include "utils/locale_utils.hpp"
3433
#include "utils/vpack_utils.hpp"
34+
#include "utils/icu_locale_utils.hpp"
3535

3636
namespace {
3737

3838
using namespace irs;
3939

40-
constexpr VPackStringRef LOCALE_PARAM_NAME{"locale"};
40+
constexpr VPackStringRef LOCALE_PARAM_NAME {"locale"};
4141

4242
bool parse_vpack_options(
4343
const VPackSlice slice,
4444
analysis::collation_token_stream::options_t& options) {
4545

46-
if (!slice.isObject() && !slice.isString()) {
46+
if (!slice.isObject()) {
4747
IR_FRMT_ERROR(
48-
"Slice for collation_token_stream is not an object or string");
48+
"Slice for collation_token_stream is not an object");
4949
return false;
5050
}
5151

5252
try {
53-
switch (slice.type()) {
53+
54+
auto locale_slice = slice.get(LOCALE_PARAM_NAME);
55+
56+
switch (locale_slice.type()) {
5457
case VPackValueType::String:
55-
return locale_utils::icu_locale(get_string<string_ref>(slice), options.locale); // required
58+
{
59+
bool res = icu_locale_utils::get_locale_from_str(get_string<string_ref>(locale_slice),
60+
options.locale,
61+
false, // true - new format of locale string
62+
&options.unicode);
63+
if (res && options.unicode == icu_locale_utils::Unicode::UTF8) {
64+
return true;
65+
} else {
66+
return false;
67+
}
68+
}
5669
case VPackValueType::Object:
5770
{
58-
auto param_name_slice = slice.get(LOCALE_PARAM_NAME);
59-
if (param_name_slice.isString()) {
60-
if (!locale_utils::icu_locale(get_string<string_ref>(param_name_slice), options.locale)) {
61-
return false;
62-
}
63-
64-
return true;
65-
}
71+
return icu_locale_utils::get_locale_from_vpack(locale_slice,
72+
options.locale,
73+
&options.unicode);
74+
6675
}
6776
[[fallthrough]];
6877
default:
6978
IR_FRMT_ERROR(
7079
"Missing '%s' while constructing collation_token_stream"
71-
"from VPack arguments",
80+
"or value is not a string or an object",
7281
LOCALE_PARAM_NAME.data());
7382
}
7483
} catch(const VPackException& ex) {
@@ -109,11 +118,13 @@ bool make_vpack_config(
109118
const analysis::collation_token_stream::options_t& options,
110119
VPackBuilder* builder) {
111120

112-
VPackObjectBuilder object(builder);
121+
VPackBuilder locale_builder;
122+
icu_locale_utils::locale_to_vpack(options.locale, &locale_builder, &options.unicode);
123+
124+
// locale
125+
VPackObjectBuilder locale_obj(builder);
113126
{
114-
// locale
115-
const auto& locale_name = locale_utils::name(options.locale);
116-
builder->add(LOCALE_PARAM_NAME, VPackValue(locale_name));
127+
builder->add(LOCALE_PARAM_NAME.data(), locale_builder.slice());
117128
}
118129

119130
return true;
@@ -138,37 +149,6 @@ bool normalize_vpack_config(const string_ref& args, std::string& config) {
138149
return false;
139150
}
140151

141-
////////////////////////////////////////////////////////////////////////////////
142-
/// @brief args is a language to use for normalizing
143-
////////////////////////////////////////////////////////////////////////////////
144-
analysis::analyzer::ptr make_text(const string_ref& args) {
145-
try {
146-
analysis::collation_token_stream::options_t options;
147-
148-
if (locale_utils::icu_locale(args, options.locale)) {// interpret 'args' as a locale name
149-
return memory::make_unique<analysis::collation_token_stream>(
150-
std::move(options));
151-
}
152-
} catch (...) {
153-
std::string err_msg = static_cast<std::string>(args);
154-
IR_FRMT_ERROR(
155-
"Caught error while constructing collation_token_stream TEXT arguments: %s",
156-
err_msg.c_str());
157-
}
158-
159-
return nullptr;
160-
}
161-
162-
bool normalize_text_config(const string_ref& args,
163-
std::string& definition) {
164-
std::locale locale;
165-
if (locale_utils::icu_locale(args, locale)){
166-
definition = locale_utils::name(locale);
167-
return true;
168-
}
169-
return false;
170-
}
171-
172152
analysis::analyzer::ptr make_json(const string_ref& args) {
173153
try {
174154
if (args.null()) {
@@ -206,15 +186,13 @@ bool normalize_json_config(const string_ref& args, std::string& definition) {
206186
ex.what());
207187
} catch (...) {
208188
IR_FRMT_ERROR(
209-
"Caught error while normalizing text_token_normalizing_stream from JSON");
189+
"Caught error while normalizing collation_token_stream from JSON");
210190
}
211191
return false;
212192
}
213193

214194
REGISTER_ANALYZER_JSON(analysis::collation_token_stream, make_json,
215195
normalize_json_config);
216-
REGISTER_ANALYZER_TEXT(analysis::collation_token_stream, make_text,
217-
normalize_text_config);
218196
REGISTER_ANALYZER_VPACK(analysis::collation_token_stream, make_vpack,
219197
normalize_vpack_config);
220198

@@ -226,27 +204,19 @@ namespace analysis {
226204
constexpr size_t MAX_TOKEN_SIZE = 1 << 15;
227205

228206
struct collation_token_stream::state_t {
229-
icu::Locale icu_locale;
230207
const options_t options;
231208
std::unique_ptr<icu::Collator> collator;
232209
std::string utf8_buf;
233210
byte_type term_buf[MAX_TOKEN_SIZE];
234211

235-
state_t(const options_t& opts)
236-
: icu_locale("C"),
212+
state_t(const options_t& opts) :
237213
options(opts) {
238-
// NOTE: use of the default constructor for Locale() or
239-
// use of Locale::createFromName(nullptr)
240-
// causes a memory leak with Boost 1.58, as detected by valgrind
241-
icu_locale.setToBogus(); // set to uninitialized
242214
}
243215
};
244216

245217
/*static*/ void collation_token_stream::init() {
246218
REGISTER_ANALYZER_JSON(collation_token_stream, make_json,
247219
normalize_json_config);
248-
REGISTER_ANALYZER_TEXT(collation_token_stream, make_text,
249-
normalize_text_config);
250220
REGISTER_ANALYZER_VPACK(collation_token_stream, make_vpack,
251221
normalize_vpack_config);
252222
}
@@ -257,7 +227,7 @@ void collation_token_stream::state_deleter_t::operator()(state_t* p) const noexc
257227

258228
/*static*/ analyzer::ptr collation_token_stream::make(
259229
const string_ref& locale) {
260-
return make_text(locale);
230+
return make_json(locale);
261231
}
262232

263233
collation_token_stream::collation_token_stream(
@@ -268,19 +238,10 @@ collation_token_stream::collation_token_stream(
268238
}
269239

270240
bool collation_token_stream::reset(const string_ref& data) {
271-
if (state_->icu_locale.isBogus()) {
272-
state_->icu_locale = icu::Locale(
273-
std::string(locale_utils::language(state_->options.locale)).c_str(),
274-
std::string(locale_utils::country(state_->options.locale)).c_str());
275-
276-
if (state_->icu_locale.isBogus()) {
277-
return false;
278-
}
279-
}
280241

281242
if (!state_->collator) {
282243
auto err = UErrorCode::U_ZERO_ERROR;
283-
state_->collator.reset(icu::Collator::createInstance(state_->icu_locale, err));
244+
state_->collator.reset(icu::Collator::createInstance(state_->options.locale, err));
284245

285246
if (!U_SUCCESS(err) || !state_->collator) {
286247
state_->collator.reset();
@@ -289,26 +250,12 @@ bool collation_token_stream::reset(const string_ref& data) {
289250
}
290251
}
291252

292-
// ...........................................................................
293-
// convert encoding to UTF8 for use with ICU
294-
// ...........................................................................
295-
string_ref data_utf8_ref;
296-
if (locale_utils::is_utf8(state_->options.locale)) {
297-
data_utf8_ref = data;
298-
} else {
299-
// valid conversion since 'locale_' was created with internal unicode encoding
300-
if (!locale_utils::append_internal(state_->utf8_buf, data, state_->options.locale)) {
301-
return false; // UTF8 conversion failure
302-
}
303-
data_utf8_ref = state_->utf8_buf;
304-
}
305-
306-
if (data_utf8_ref.size() > static_cast<uint32_t>(std::numeric_limits<int32_t>::max())) {
253+
if (data.size() > static_cast<uint32_t>(std::numeric_limits<int32_t>::max())) {
307254
return false; // ICU UnicodeString signatures can handle at most INT32_MAX
308255
}
309256

310257
const icu::UnicodeString icu_token = icu::UnicodeString::fromUTF8(
311-
icu::StringPiece(data_utf8_ref.c_str(), static_cast<int32_t>(data_utf8_ref.size())));
258+
icu::StringPiece(data.c_str(), static_cast<int32_t>(data.size())));
312259

313260
int32_t term_size = state_->collator->getSortKey(
314261
icu_token, state_->term_buf, sizeof state_->term_buf);

3rdParty/iresearch/core/analysis/collation_token_stream.hpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,12 @@
2323
#ifndef IRESEARCH_COLLATION_TOKEN_STREAM_H
2424
#define IRESEARCH_COLLATION_TOKEN_STREAM_H
2525

26+
#include <unicode/locid.h>
27+
2628
#include "analyzers.hpp"
2729
#include "token_attributes.hpp"
2830
#include "utils/frozen_attributes.hpp"
31+
#include "utils/icu_locale_utils.hpp"
2932

3033
namespace iresearch {
3134
namespace analysis {
@@ -38,7 +41,14 @@ class collation_token_stream final
3841
private util::noncopyable {
3942
public:
4043
struct options_t {
41-
std::locale locale;
44+
// NOTE: use of the default constructor for Locale() or
45+
// use of Locale::createFromName(nullptr)
46+
// causes a memory leak with Boost 1.58, as detected by valgrind
47+
options_t() : locale("C"), unicode(icu_locale_utils::Unicode::UTF8) {
48+
locale.setToBogus();
49+
}
50+
icu::Locale locale;
51+
icu_locale_utils::Unicode unicode;
4252
};
4353

4454
static constexpr string_ref type_name() noexcept {

3rdParty/iresearch/core/analysis/ngram_token_stream.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ class ngram_token_stream_base
131131
template<ngram_token_stream_base::InputType StreamType>
132132
class ngram_token_stream: public ngram_token_stream_base {
133133
public:
134-
DECLARE_FACTORY(const ngram_token_stream_base::Options& options);
134+
static ptr make(const ngram_token_stream_base::Options& options);
135135

136136
ngram_token_stream(const ngram_token_stream_base::Options& options);
137137

3rdParty/iresearch/core/analysis/pipeline_token_stream.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ bool normalize_vpack_config(const VPackSlice slice, VPackBuilder* builder) {
188188
{
189189
VPackArrayBuilder array(builder, PIPELINE_PARAM_NAME.data());
190190
{
191-
for (const auto analyzer : options) {
191+
for (const auto& analyzer : options) {
192192
VPackObjectBuilder analyzers_obj(builder);
193193
{
194194
builder->add(TYPE_PARAM_NAME, VPackValue(analyzer.first));

0 commit comments

Comments
 (0)
0