Add some arangosearch tests by gnusi · Pull Request #14664 · arangodb/arangodb · GitHub
Merged
6 changes: 4 additions & 2 deletions 3rdParty/iresearch/CHANGELOG
@@ -1,9 +1,11 @@
master
-------------------------

v1.1 (2021-08-18)
v1.1 (2021-08-25)
-------------------------

* Rework Compression API to return `std::unique_ptr` instead of `std::shared_ptr`.

* Rework Analyzer API to return `std::unique_ptr` instead of `std::shared_ptr`.

* Derive `null_token_stream`, `string_token_stream`, `numeric_token_stream` and `null_token_stream`
@@ -28,7 +30,7 @@ v1.1 (2021-08-18)

* Replace RapidJSON with Velocypack for analyzers and scorers serialization and deserialization

* Add new `1_4` segment format utilizing new columnstore implementation.
* Add new `1_4` segment format utilizing new columnstore and term dictionary index format.

* Add new columnstore implementation based on sparse bitset format.

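Note on the `std::unique_ptr` entries above: returning `std::unique_ptr` from the Compression and Analyzer factories is the more flexible contract, since a caller can keep exclusive ownership or promote the result to shared ownership, while a `std::shared_ptr` result can never be turned back into unique ownership. A minimal illustrative sketch (the `analyzer`/`make_analyzer` names are placeholders, not the actual iresearch API):

// Illustrative sketch only -- not the iresearch Compression/Analyzer API.
#include <iostream>
#include <memory>
#include <string>

struct analyzer {                      // hypothetical stand-in interface
  virtual ~analyzer() = default;
  virtual std::string name() const = 0;
};

struct delimiter_analyzer final : analyzer {
  std::string name() const override { return "delimiter"; }
};

// Factory hands out unique ownership; callers decide whether to share.
std::unique_ptr<analyzer> make_analyzer() {
  return std::make_unique<delimiter_analyzer>();
}

int main() {
  auto exclusive = make_analyzer();                    // sole owner, no ref-count overhead
  std::shared_ptr<analyzer> shared = make_analyzer();  // implicit promotion when sharing is needed
  std::cout << exclusive->name() << " " << shared->name() << "\n";
}
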
11 changes: 8 additions & 3 deletions 3rdParty/iresearch/core/analysis/collation_token_stream.cpp
@@ -233,7 +233,8 @@ struct collation_token_stream::state_t {
byte_type term_buf[MAX_TOKEN_SIZE];

state_t(const options_t& opts)
: icu_locale("C"), options(opts) {
: icu_locale("C"),
options(opts) {
// NOTE: use of the default constructor for Locale() or
// use of Locale::createFromName(nullptr)
// causes a memory leak with Boost 1.58, as detected by valgrind
@@ -250,6 +251,10 @@ struct collation_token_stream::state_t {
normalize_vpack_config);
}

void collation_token_stream::state_deleter_t::operator()(state_t* p) const noexcept {
delete p;
}

/*static*/ analyzer::ptr collation_token_stream::make(
const string_ref& locale) {
return make_text(locale);
@@ -258,8 +263,8 @@ struct collation_token_stream::state_t {
collation_token_stream::collation_token_stream(
const options_t& options)
: analyzer{irs::type<collation_token_stream>::get()},
state_(memory::make_unique<state_t>(options)),
term_eof_(true) {
state_{new state_t(options)},
term_eof_{true} {
}

bool collation_token_stream::reset(const string_ref& data) {
5 changes: 4 additions & 1 deletion 3rdParty/iresearch/core/analysis/collation_token_stream.hpp
@@ -62,6 +62,9 @@ class collation_token_stream final

private:
struct state_t;
struct state_deleter_t {
void operator()(state_t*) const noexcept;
};

using attributes = std::tuple<
increment,
@@ -70,7 +73,7 @@
term_attribute>; // token value with evaluated quotes

attributes attrs_;
std::shared_ptr<state_t> state_;
std::unique_ptr<state_t, state_deleter_t> state_;
bool term_eof_;
}; // collation_token_stream
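
The change above, repeated for the other analyzers in this PR, swaps a `std::shared_ptr<state_t>` pimpl member for a `std::unique_ptr<state_t, state_deleter_t>` whose deleter is declared in the header and defined in the .cpp once `state_t` is complete; that keeps single ownership without requiring the full `state_t` definition wherever the analyzer is destroyed. A self-contained sketch of the same pattern, with illustrative `widget`/`impl` names rather than iresearch types:

// widget.hpp -- illustrative pimpl with a custom deleter (not iresearch code)
#include <cstdio>
#include <memory>

class widget {
 public:
  widget();
  int value() const;

 private:
  struct impl;                               // incomplete type at this point
  struct impl_deleter {
    void operator()(impl*) const noexcept;   // declared only; defined where impl is complete
  };
  std::unique_ptr<impl, impl_deleter> impl_; // works even though impl is incomplete here
};

// widget.cpp -- impl is a complete type below, so the deleter may delete it
struct widget::impl {
  int value = 42;
};

void widget::impl_deleter::operator()(impl* p) const noexcept {
  delete p;  // complete type here, unlike std::default_delete in the header-only case
}

widget::widget() : impl_{new impl{}} {}

int widget::value() const { return impl_->value; }

int main() {
  widget w;
  std::printf("%d\n", w.value());  // prints 42; the state is released via impl_deleter
}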

3rdParty/iresearch/core/analysis/segmentation_token_stream.cpp
@@ -284,12 +284,15 @@ struct segmentation_token_stream::state_t {
iterator_t end;
};

void segmentation_token_stream::state_deleter_t::operator()(state_t* p) const noexcept {
delete p;
}

REGISTER_ANALYZER_VPACK(segmentation_token_stream, make_vpack,
normalize_vpack_config);
REGISTER_ANALYZER_JSON(segmentation_token_stream, make_json,
normalize_json_config);


/*static*/ void segmentation_token_stream::init() {
REGISTER_ANALYZER_VPACK(segmentation_token_stream, make_vpack,
normalize_vpack_config); // match registration above
@@ -300,7 +303,8 @@ REGISTER_ANALYZER_JSON(segmentation_token_stream, make_json,
segmentation_token_stream::segmentation_token_stream(
segmentation_token_stream::options_t&& options)
: analyzer{ irs::type<segmentation_token_stream>::get() },
state_(memory::make_unique<state_t>()), options_(options) {
state_{new state_t()},
options_{options} {
}

bool segmentation_token_stream::next() {
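
As in the collation analyzer, the constructor above now builds the state with a bare `new state_t()` instead of `memory::make_unique<state_t>()`. That follows from the deleter change rather than style: `std::make_unique`-style helpers always pair the object with `std::default_delete`, so a `unique_ptr` carrying a custom deleter has to be constructed from a raw pointer. A short illustration under those assumptions (placeholder types, not the iresearch ones):

// Illustrative only: why a unique_ptr with a custom deleter is built from `new`.
#include <memory>

struct state_t { int x = 0; };                 // placeholder for the real analyzer state

struct state_deleter_t {
  void operator()(state_t* p) const noexcept { delete p; }
};

int main() {
  // A unique_ptr with a custom deleter is constructed from a raw pointer.
  std::unique_ptr<state_t, state_deleter_t> a{new state_t()};

  // std::make_unique always yields unique_ptr<state_t, std::default_delete<state_t>>,
  // a different type that does not convert to the custom-deleter member above.
  std::unique_ptr<state_t> b = std::make_unique<state_t>();

  return a->x + b->x;
}
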
3rdParty/iresearch/core/analysis/segmentation_token_stream.hpp
@@ -19,6 +19,7 @@
///
/// @author Andrei Lobov
////////////////////////////////////////////////////////////////////////////////

#ifndef IRESEARCH_SEGMENTATION_TOKEN_STREAM_H
#define IRESEARCH_SEGMENTATION_TOKEN_STREAM_H

@@ -28,7 +29,6 @@
#include "token_attributes.hpp"
#include "utils/frozen_attributes.hpp"


namespace iresearch {
namespace analysis {
class segmentation_token_stream final
@@ -56,16 +56,18 @@ class segmentation_token_stream final
virtual bool next() override;
virtual bool reset(const string_ref& data) override;


private:
using attributes = std::tuple<
increment,
offset,
term_attribute>;

struct state_t;
struct state_deleter_t {
void operator()(state_t*) const noexcept;
};

std::shared_ptr<state_t> state_;
std::unique_ptr<state_t, state_deleter_t> state_;
options_t options_;
std::string term_buf_; // buffer for value if value cannot be referenced directly
attributes attrs_;
28 changes: 17 additions & 11 deletions 3rdParty/iresearch/core/analysis/text_token_normalizing_stream.cpp
@@ -67,11 +67,15 @@ namespace analysis {
struct text_token_normalizing_stream::state_t {
icu::UnicodeString data;
icu::Locale icu_locale;
std::shared_ptr<const icu::Normalizer2> normalizer;
const options_t options;
std::string term_buf; // used by reset()
std::shared_ptr<icu::Transliterator> transliterator;
state_t(const options_t& opts): icu_locale("C"), options(opts) {
const icu::Normalizer2* normalizer; // reusable object owned by ICU
std::unique_ptr<icu::Transliterator> transliterator;

explicit state_t(const options_t& opts)
: icu_locale{"C"},
options{opts},
normalizer{} {
// NOTE: use of the default constructor for Locale() or
// use of Locale::createFromName(nullptr)
// causes a memory leak with Boost 1.58, as detected by valgrind
@@ -343,11 +347,16 @@ REGISTER_ANALYZER_VPACK(analysis::text_token_normalizing_stream, make_vpack,
namespace iresearch {
namespace analysis {

void text_token_normalizing_stream::state_deleter_t::operator()(
state_t* p) const noexcept {
delete p;
}

text_token_normalizing_stream::text_token_normalizing_stream(
const options_t& options)
: analyzer{irs::type<text_token_normalizing_stream>::get()},
state_(memory::make_unique<state_t>(options)),
term_eof_(true) {
state_{new state_t{options}},
term_eof_{true} {
}

/*static*/ void text_token_normalizing_stream::init() {
@@ -390,12 +399,10 @@ bool text_token_normalizing_stream::reset(const string_ref& data) {

if (!state_->normalizer) {
// reusable object owned by ICU
state_->normalizer.reset(
icu::Normalizer2::getNFCInstance(err), [](const icu::Normalizer2*)->void{}
);
state_->normalizer = icu::Normalizer2::getNFCInstance(err);

if (!U_SUCCESS(err) || !state_->normalizer) {
state_->normalizer.reset();
state_->normalizer = nullptr;

return false;
}
@@ -408,8 +415,7 @@ bool text_token_normalizing_stream::reset(const string_ref& data) {

// reusable object owned by *this
state_->transliterator.reset(icu::Transliterator::createInstance(
collationRule, UTransDirection::UTRANS_FORWARD, err
));
collationRule, UTransDirection::UTRANS_FORWARD, err));

if (!U_SUCCESS(err) || !state_->transliterator) {
state_->transliterator.reset();
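
In the `reset()` hunk above the NFC normalizer is now held as a plain `const icu::Normalizer2*` rather than a `shared_ptr` with a no-op deleter, because `icu::Normalizer2::getNFCInstance()` hands back an ICU-owned singleton that callers must never delete. A small standalone sketch of that ownership rule (requires ICU; the sample string is only for illustration):

// Minimal ICU example of the non-owning Normalizer2 usage shown above.
#include <unicode/normalizer2.h>
#include <unicode/unistr.h>
#include <iostream>
#include <string>

int main() {
  UErrorCode err = U_ZERO_ERROR;
  // Not owned by us: getNFCInstance() returns a process-wide instance managed by ICU.
  const icu::Normalizer2* nfc = icu::Normalizer2::getNFCInstance(err);
  if (U_FAILURE(err) || nfc == nullptr) {
    return 1;
  }

  // "Cafe" + U+0301 (combining acute) as UTF-8; NFC composes it into a single 'é'.
  icu::UnicodeString input = icu::UnicodeString::fromUTF8("Cafe\xCC\x81");
  icu::UnicodeString normalized = nfc->normalize(input, err);
  if (U_FAILURE(err)) {
    return 1;
  }

  std::string utf8;
  normalized.toUTF8String(utf8);
  std::cout << utf8 << "\n";  // "Café"
  // No delete here: the Normalizer2 instance stays owned by ICU.
  return 0;
}
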
3rdParty/iresearch/core/analysis/text_token_normalizing_stream.hpp
@@ -46,8 +46,6 @@ class text_token_normalizing_stream final
bool accent{true}; // no extra normalization
};

struct state_t;

static constexpr string_ref type_name() noexcept { return "norm"; }
static void init(); // for trigering registration in a static build
static ptr make(const string_ref& locale);
@@ -66,8 +64,13 @@ class text_token_normalizing_stream final
payload, // raw token value
term_attribute>; // token value with evaluated quotes

struct state_t;
struct state_deleter_t {
void operator()(state_t*) const noexcept;
};

attributes attrs_;
std::shared_ptr<state_t> state_;
std::unique_ptr<state_t, state_deleter_t> state_;
bool term_eof_;
};

11 changes: 5 additions & 6 deletions 3rdParty/iresearch/core/analysis/text_token_stemming_stream.cpp
@@ -212,8 +212,9 @@ namespace analysis {

text_token_stemming_stream::text_token_stemming_stream(const std::locale& locale)
: analyzer{irs::type<text_token_stemming_stream>::get()},
locale_(locale),
term_eof_(true) {
locale_{locale},
stemmer_{nullptr, &sb_stemmer_delete},
term_eof_{true} {
}

/*static*/ void text_token_stemming_stream::init() {
@@ -243,10 +244,8 @@ bool text_token_stemming_stream::reset(const string_ref& data) {
if (!stemmer_) {
stemmer_.reset(
sb_stemmer_new(
std::string(locale_utils::language(locale_)).c_str(), nullptr // defaults to utf-8
),
[](sb_stemmer* ptr)->void{ sb_stemmer_delete(ptr); }
);
std::string(locale_utils::language(locale_)).c_str(),
nullptr)); // defaults to utf-8
}

auto& term = std::get<term_attribute>(attrs_);
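
The stemmer member becomes `std::unique_ptr<sb_stemmer, void(*)(sb_stemmer*)>`: a unique_ptr whose deleter is a plain function pointer, which therefore has to be supplied in the constructor (`stemmer_{nullptr, &sb_stemmer_delete}`) even while the pointer is still null. The same idiom sketched with a standard-library C handle instead of libstemmer, so the example stays self-contained:

// Illustrative only: function-pointer deleter wrapping a C-style handle,
// mirroring std::unique_ptr<sb_stemmer, void(*)(sb_stemmer*)> above.
#include <cstdio>
#include <memory>

using file_ptr = std::unique_ptr<std::FILE, int (*)(std::FILE*)>;

int main() {
  // With a function-pointer deleter the deleter must be supplied explicitly,
  // even for an initially empty pointer -- the counterpart of
  // stemmer_{nullptr, &sb_stemmer_delete} in the constructor above.
  file_ptr file{nullptr, &std::fclose};

  // Acquire the resource lazily via reset(), as reset() does with sb_stemmer_new().
  file.reset(std::fopen("example.txt", "w"));
  if (file) {
    std::fputs("hello\n", file.get());
  }
  // std::fclose runs automatically when `file` goes out of scope,
  // and only if the pointer is non-null.
  return 0;
}
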
3rdParty/iresearch/core/analysis/text_token_stemming_stream.hpp
@@ -62,8 +62,8 @@ class text_token_stemming_stream final

attributes attrs_;
std::locale locale_;
std::shared_ptr<sb_stemmer> stemmer_;
std::string term_buf_; // buffer for the last evaluated term
std::unique_ptr<sb_stemmer, void(*)(sb_stemmer*)> stemmer_;
bool term_eof_;
};
