Bug fix/internal issue #589 (#9244) · reynoldsm88/arangodb@384c431 · GitHub
[go: up one dir, main page]

Skip to content

Commit 384c431

Browse files
authored
Bug fix/internal issue arangodb#589 (arangodb#9244)
* update iresearch from upstream * adjust tfidf/bm25 to original behaviour * update iresearch from upstream * fix compilation issues * reduce number of virtual calls * upgrade iresearch from upstream * add js test
1 parent f3c9c2b commit 384c431

File tree

83 files changed

+103284
-101581
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

83 files changed

+103284
-101581
lines changed

3rdParty/iresearch/core/CMakeLists.txt

Lines changed: 121 additions & 121 deletions
Large diffs are not rendered by default.

3rdParty/iresearch/core/analysis/analyzer.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
#include "analysis/token_stream.hpp"
2828
#include "utils/type_id.hpp"
29+
#include "utils/text_format.hpp"
2930

3031
NS_ROOT
3132
NS_BEGIN(analysis)
@@ -53,6 +54,14 @@ class IRESEARCH_API analyzer: public token_stream {
5354

5455
const type_id& type() const NOEXCEPT { return *type_; }
5556

57+
/// @brief serialise the configuration of this analyzer in the requested format
/// @param format requested text format (unused by this default implementation)
/// @param definition output string, emptied by this default implementation
/// @return always false here, i.e. no definition was produced
virtual bool to_string(const ::irs::text_format::type_id& /*format*/,
                       std::string& definition) const {
  // never hand back stale content together with a 'false' result
  definition.clear();
  return false;
}
62+
63+
protected:
64+
5665
private:
5766
const type_id* type_;
5867
};

3rdParty/iresearch/core/analysis/delimited_token_stream.cpp

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@
2121
/// @author Vasiliy Nabatchikov
2222
////////////////////////////////////////////////////////////////////////////////
2323

24-
#include <rapidjson/rapidjson/document.h> // for rapidjson::Document
24+
#include <rapidjson/rapidjson/document.h> // for rapidjson::Document, rapidjson::Value
25+
#include <rapidjson/rapidjson/writer.h> // for rapidjson::Writer
26+
#include <rapidjson/rapidjson/stringbuffer.h> // for rapidjson::StringBuffer
2527

2628
#include "delimited_token_stream.hpp"
2729

@@ -115,7 +117,8 @@ irs::analysis::analyzer::ptr make_json(const irs::string_ref& args) {
115117
case rapidjson::kStringType:
116118
return irs::analysis::delimited_token_stream::make(json.GetString());
117119
case rapidjson::kObjectType:
118-
if (json.HasMember(delimiterParamName.c_str()) && json[delimiterParamName.c_str()].IsString()) {
120+
if (json.HasMember(delimiterParamName.c_str()) &&
121+
json[delimiterParamName.c_str()].IsString()) {
119122
return irs::analysis::delimited_token_stream::make(json[delimiterParamName.c_str()].GetString());
120123
}
121124
default: {} // fall through
@@ -130,6 +133,36 @@ irs::analysis::analyzer::ptr make_json(const irs::string_ref& args) {
130133
return nullptr;
131134
}
132135

136+
///////////////////////////////////////////////////////////////////////////////
137+
/// @brief builds analyzer config from internal options in json format
138+
/// @param delimiter reference to analyzer options storage
139+
/// @param definition string for storing json document with config
140+
///////////////////////////////////////////////////////////////////////////////
141+
bool make_json_config(const irs::bstring& delimiter, std::string& definition) {
142+
rapidjson::Document json;
143+
json.SetObject();
144+
145+
rapidjson::Document::AllocatorType& allocator = json.GetAllocator();
146+
147+
// delimiter
148+
{
149+
auto delimiterStringRef = ::irs::ref_cast<char>(delimiter);
150+
json.AddMember(rapidjson::Value::StringRefType(delimiterParamName.c_str(),
151+
static_cast<rapidjson::SizeType>(delimiterParamName.size())),
152+
rapidjson::Value(rapidjson::StringRef(delimiterStringRef.c_str(),
153+
static_cast<rapidjson::SizeType>(delimiterStringRef.size()))),
154+
allocator);
155+
}
156+
157+
//output json to string
158+
rapidjson::StringBuffer buffer;
159+
rapidjson::Writer< rapidjson::StringBuffer> writer(buffer);
160+
json.Accept(writer);
161+
definition = buffer.GetString();
162+
return true;
163+
}
164+
165+
133166
////////////////////////////////////////////////////////////////////////////////
134167
/// @brief args is a delimiter to use for tokenization
135168
////////////////////////////////////////////////////////////////////////////////
@@ -139,6 +172,14 @@ irs::analysis::analyzer::ptr make_text(const irs::string_ref& args) {
139172
);
140173
}
141174

175+
////////////////////////////////////////////////////////////////////////////////
176+
/// @brief build config string in 'text' format
177+
////////////////////////////////////////////////////////////////////////////////
178+
bool make_text_config(const irs::bstring& delimiter, std::string& definition) {
179+
definition = irs::ref_cast<char>(delimiter);
180+
return true;
181+
}
182+
142183
REGISTER_ANALYZER_JSON(irs::analysis::delimited_token_stream, make_json);
143184
REGISTER_ANALYZER_TEXT(irs::analysis::delimited_token_stream, make_text);
144185

@@ -213,6 +254,18 @@ bool delimited_token_stream::reset(const string_ref& data) {
213254
return true;
214255
}
215256

257+
bool delimited_token_stream::to_string(
258+
const ::irs::text_format::type_id& format,
259+
std::string& definition) const {
260+
if (::irs::text_format::json == format) {
261+
return make_json_config(delim_buf_, definition);
262+
} else if (::irs::text_format::text == format) {
263+
return make_text_config(delim_buf_, definition);
264+
}
265+
return false;
266+
}
267+
268+
216269
NS_END // analysis
217270
NS_END // ROOT
218271

3rdParty/iresearch/core/analysis/delimited_token_stream.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ class delimited_token_stream: public analyzer, util::noncopyable {
4848
static void init(); // for triggering registration in a static build
4949
virtual bool next() override;
5050
virtual bool reset(const string_ref& data) override;
51+
virtual bool to_string(const ::irs::text_format::type_id& format,
52+
std::string& definition) const override;
5153

5254
private:
5355
class term_attribute final: public irs::term_attribute {

3rdParty/iresearch/core/analysis/ngram_token_stream.cpp

Lines changed: 64 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
////////////////////////////////////////////////////////////////////////////////
2323

2424
#include <rapidjson/rapidjson/document.h> // for rapidjson::Document
25+
#include <rapidjson/rapidjson/writer.h> // for rapidjson::Writer
26+
#include <rapidjson/rapidjson/stringbuffer.h> // for rapidjson::StringBuffer
2527

2628
#include "ngram_token_stream.hpp"
2729

@@ -65,9 +67,9 @@ bool get_bool(
6567
return true;
6668
}
6769

68-
static const irs::string_ref minParamName = "min";
69-
static const irs::string_ref maxParamName = "max";
70-
static const irs::string_ref preserveOriginalParamName = "preserveOriginal";
70+
const irs::string_ref minParamName = "min";
71+
const irs::string_ref maxParamName = "max";
72+
const irs::string_ref preserveOriginalParamName = "preserveOriginal";
7173

7274
////////////////////////////////////////////////////////////////////////////////
7375
/// @brief args is a jSON encoded object with the following attributes:
@@ -77,7 +79,6 @@ static const irs::string_ref preserveOriginalParamName = "preserveOriginal";
7779
////////////////////////////////////////////////////////////////////////////////
7880
irs::analysis::analyzer::ptr make_json(const irs::string_ref& args) {
7981
rapidjson::Document json;
80-
8182
if (json.Parse(args.c_str(), args.size()).HasParseError()) {
8283
IR_FRMT_ERROR(
8384
"Invalid jSON arguments passed while constructing ngram_token_stream, arguments: %s",
@@ -127,10 +128,47 @@ irs::analysis::analyzer::ptr make_json(const irs::string_ref& args) {
127128
}
128129

129130
return irs::analysis::ngram_token_stream::make(
130-
size_t(min), size_t(max), preserve_original
131+
irs::analysis::ngram_token_stream::options_t(size_t(min), size_t(max), preserve_original)
131132
);
132133
}
133134

135+
///////////////////////////////////////////////////////////////////////////////
136+
/// @brief builds analyzer config from internal options in json format
137+
///////////////////////////////////////////////////////////////////////////////
138+
bool make_json_config(const irs::analysis::ngram_token_stream::options_t& options,
139+
std::string& definition) {
140+
rapidjson::Document json;
141+
json.SetObject();
142+
rapidjson::Document::AllocatorType& allocator = json.GetAllocator();
143+
144+
// ensure disambiguating casts below are safe. Casts required for clang compiler on Mac
145+
static_assert(sizeof(uint64_t) >= sizeof(size_t), "sizeof(uint64_t) >= sizeof(size_t)");
146+
//min_gram
147+
json.AddMember(rapidjson::Value::StringRefType(minParamName.c_str(),
148+
static_cast<rapidjson::SizeType>(minParamName.size())),
149+
rapidjson::Value(static_cast<uint64_t>(options.min_gram)),
150+
allocator);
151+
152+
//max_gram
153+
json.AddMember(rapidjson::Value::StringRefType(maxParamName.c_str(),
154+
static_cast<rapidjson::SizeType>(maxParamName.size())),
155+
rapidjson::Value(static_cast<uint64_t>(options.max_gram)),
156+
allocator);
157+
158+
//preserve_original
159+
json.AddMember(rapidjson::Value::StringRefType(preserveOriginalParamName.c_str(),
160+
static_cast<rapidjson::SizeType>(preserveOriginalParamName.size())),
161+
rapidjson::Value(options.preserve_original),
162+
allocator);
163+
164+
//output json to string
165+
rapidjson::StringBuffer buffer;
166+
rapidjson::Writer< rapidjson::StringBuffer> writer(buffer);
167+
json.Accept(writer);
168+
definition = buffer.GetString();
169+
return true;
170+
}
171+
134172
REGISTER_ANALYZER_JSON(irs::analysis::ngram_token_stream, make_json);
135173

136174
NS_END
@@ -139,25 +177,21 @@ NS_ROOT
139177
NS_BEGIN(analysis)
140178

141179
/*static*/ analyzer::ptr ngram_token_stream::make(
142-
size_t min, size_t max, bool preserve_original
180+
const options_t& options
143181
) {
144-
return std::make_shared<ngram_token_stream>(min, max, preserve_original);
182+
return std::make_shared<ngram_token_stream>(options);
145183
}
146184

147185
/*static*/ void ngram_token_stream::init() {
148186
REGISTER_ANALYZER_JSON(ngram_token_stream, make_json); // match registration above
149187
}
150188

151189
ngram_token_stream::ngram_token_stream(
152-
size_t min_gram,
153-
size_t max_gram,
154-
bool preserve_original
190+
const options_t& options
155191
) : analyzer(ngram_token_stream::type()),
156-
min_gram_(min_gram),
157-
max_gram_(max_gram),
158-
preserve_original_(preserve_original) {
159-
min_gram_ = std::max(min_gram_, size_t(1));
160-
max_gram_ = std::max(max_gram_, min_gram_);
192+
options_(options) {
193+
options_.min_gram = std::max(options_.min_gram, size_t(1));
194+
options_.max_gram = std::max(options_.max_gram, options_.min_gram);
161195

162196
attrs_.emplace(offset_);
163197
attrs_.emplace(inc_);
@@ -166,10 +200,10 @@ ngram_token_stream::ngram_token_stream(
166200

167201
//FIXME UTF-8 support
168202
bool ngram_token_stream::next() NOEXCEPT {
169-
if (length_ < min_gram_) {
203+
if (length_ < options_.min_gram) {
170204
++begin_;
171205

172-
if (data_.end() < begin_ + min_gram_) {
206+
if (data_.end() < begin_ + options_.min_gram) {
173207
if (emit_original_) {
174208
// emit the original input if it's not yet emitted
175209
term_.value(data_);
@@ -186,11 +220,11 @@ bool ngram_token_stream::next() NOEXCEPT {
186220
}
187221

188222
return false;
189-
} else if (data_.end() < begin_ + max_gram_) {
223+
} else if (data_.end() < begin_ + options_.max_gram) {
190224
assert(begin_ <= data_.end());
191225
length_ = size_t(std::distance(begin_, data_.end()));
192226
} else {
193-
length_ = max_gram_;
227+
length_ = options_.max_gram;
194228
}
195229

196230
++offset_.start;
@@ -222,12 +256,21 @@ bool ngram_token_stream::reset(const irs::string_ref& value) NOEXCEPT {
222256
data_ = ref_cast<byte_type>(value);
223257
begin_ = data_.begin()-1;
224258
length_ = 0;
225-
emit_original_ = data_.size() > max_gram_ && preserve_original_;
226-
assert(length_ < min_gram_);
259+
emit_original_ = data_.size() > options_.max_gram && options_.preserve_original;
260+
assert(length_ < options_.min_gram);
227261

228262
return true;
229263
}
230264

265+
bool ngram_token_stream::to_string(
266+
const ::irs::text_format::type_id& format,
267+
std::string& definition) const {
268+
if (::irs::text_format::json == format) {
269+
return make_json_config(options_, definition);
270+
}
271+
return false;
272+
}
273+
231274
DEFINE_ANALYZER_TYPE_NAMED(ngram_token_stream, "ngram")
232275

233276
NS_END // analysis

3rdParty/iresearch/core/analysis/ngram_token_stream.hpp

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -37,49 +37,56 @@ NS_BEGIN(analysis)
3737
////////////////////////////////////////////////////////////////////////////////
3838
class ngram_token_stream: public analyzer, util::noncopyable {
3939
public:
40+
/// @brief settings controlling ngram generation
struct options_t {
  /// @param min value stored as 'min_gram'
  /// @param max value stored as 'max_gram'
  /// @param original value stored as 'preserve_original'
  options_t(size_t min, size_t max, bool original)
    : min_gram(min),
      max_gram(max),
      preserve_original(original) {
  }

  size_t min_gram; // lower bound of the ngram length
  size_t max_gram; // upper bound of the ngram length
  bool preserve_original; // emit input data as a token
};
51+
4052
DECLARE_ANALYZER_TYPE();
4153

42-
DECLARE_FACTORY(size_t min_gram, size_t max_gram, bool preserve_original);
54+
DECLARE_FACTORY(const options_t& options);
4355

4456
static void init(); // for triggering registration in a static build
4557

46-
ngram_token_stream(size_t n, bool preserve_original)
47-
: ngram_token_stream(n, n, preserve_original) {
48-
}
58+
//ngram_token_stream(size_t n, bool preserve_original)
59+
// : ngram_token_stream(n, n, preserve_original) {
60+
//}
4961

50-
ngram_token_stream(size_t min_gram, size_t max_gram, bool preserve_original);
62+
ngram_token_stream(const options_t& options);
5163

5264
virtual const attribute_view& attributes() const NOEXCEPT override {
5365
return attrs_;
5466
}
5567

5668
virtual bool next() NOEXCEPT override;
57-
5869
virtual bool reset(const string_ref& data) NOEXCEPT override;
59-
60-
size_t min_gram() const NOEXCEPT { return min_gram_; }
61-
62-
size_t max_gram() const NOEXCEPT { return max_gram_; }
63-
64-
bool preserve_original() const NOEXCEPT { return preserve_original_; }
70+
virtual bool to_string(const ::irs::text_format::type_id& format,
71+
std::string& definition) const override;
72+
size_t min_gram() const NOEXCEPT { return options_.min_gram; }
73+
size_t max_gram() const NOEXCEPT { return options_.max_gram; }
74+
bool preserve_original() const NOEXCEPT { return options_.preserve_original; }
6575

6676
private:
6777
class term_attribute final: public irs::term_attribute {
6878
public:
6979
void value(const bytes_ref& value) { value_ = value; }
7080
};
7181

82+
options_t options_;
7283
attribute_view attrs_;
73-
size_t min_gram_;
74-
size_t max_gram_;
7584
bytes_ref data_; // data to process
7685
increment inc_;
7786
offset offset_;
7887
term_attribute term_;
7988
const byte_type* begin_{};
8089
size_t length_{};
81-
82-
bool preserve_original_{ false }; // emit input data as a token
8390
bool emit_original_{ false };
8491
}; // ngram_token_stream
8592

0 commit comments

Comments
 (0)
0