Add some arangosearch tests by gnusi · Pull Request #14664 · arangodb/arangodb · GitHub
Merged
6 changes: 4 additions & 2 deletions 3rdParty/iresearch/CHANGELOG
@@ -1,9 +1,11 @@
master
-------------------------

v1.1 (2021-08-18)
v1.1 (2021-08-25)
-------------------------

* Rework Compression API to return `std::unique_ptr` instead of `std::shared_ptr`.

* Rework Analyzer API to return `std::unique_ptr` instead of `std::shared_ptr`.

* Derive `null_token_stream`, `string_token_stream`, `numeric_token_stream` and `null_token_stream`
@@ -28,7 +30,7 @@ v1.1 (2021-08-18)

* Replace RapidJSON with Velocypack for analyzers and scorers serialization and deserialization

* Add new `1_4` segment format utilizing new columnstore implementation.
* Add new `1_4` segment format utilizing new columnstore and term dictionary index format.

* Add new columnstore implementation based on sparse bitset format.

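Note on the `std::unique_ptr` entries above: returning `std::unique_ptr` from the Compression and Analyzer factories is the more flexible contract, since a caller can keep exclusive ownership or promote the result to shared ownership, while a `std::shared_ptr` result can never be turned back into unique ownership. A minimal illustrative sketch (the `analyzer`/`make_analyzer` names are placeholders, not the actual iresearch API):

// Illustrative sketch only -- not the iresearch Compression/Analyzer API.
#include <iostream>
#include <memory>
#include <string>

struct analyzer {                      // hypothetical stand-in interface
  virtual ~analyzer() = default;
  virtual std::string name() const = 0;
};

struct delimiter_analyzer final : analyzer {
  std::string name() const override { return "delimiter"; }
};

// Factory hands out unique ownership; callers decide whether to share.
std::unique_ptr<analyzer> make_analyzer() {
  return std::make_unique<delimiter_analyzer>();
}

int main() {
  auto exclusive = make_analyzer();                    // sole owner, no ref-count overhead
  std::shared_ptr<analyzer> shared = make_analyzer();  // implicit promotion when sharing is needed
  std::cout << exclusive->name() << " " << shared->name() << "\n";
}
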
11 changes: 8 additions & 3 deletions 3rdParty/iresearch/core/analysis/collation_token_stream.cpp
@@ -233,7 +233,8 @@ struct collation_token_stream::state_t {
byte_type term_buf[MAX_TOKEN_SIZE];

state_t(const options_t& opts)
: icu_locale("C"), options(opts) {
: icu_locale("C"),
options(opts) {
// NOTE: use of the default constructor for Locale() or
// use of Locale::createFromName(nullptr)
// causes a memory leak with Boost 1.58, as detected by valgrind
@@ -250,6 +251,10 @@ struct collation_token_stream::state_t {
normalize_vpack_config);
}

void collation_token_stream::state_deleter_t::operator()(state_t* p) const noexcept {
delete p;
}

/*static*/ analyzer::ptr collation_token_stream::make(
const string_ref& locale) {
return make_text(locale);
@@ -258,8 +263,8 @@ struct collation_token_stream::state_t {
collation_token_stream::collation_token_stream(
const options_t& options)
: analyzer{irs::type<collation_token_stream>::get()},
state_(memory::make_unique<state_t>(options)),
term_eof_(true) {
state_{new state_t(options)},
term_eof_{true} {
}

bool collation_token_stream::reset(const string_ref& data) {
5 changes: 4 additions & 1 deletion 3rdParty/iresearch/core/analysis/collation_token_stream.hpp
@@ -62,6 +62,9 @@ class collation_token_stream final

private:
struct state_t;
struct state_deleter_t {
void operator()(state_t*) const noexcept;
};

using attributes = std::tuple<
increment,
@@ -70,7 +73,7 @@
term_attribute>; // token value with evaluated quotes

attributes attrs_;
std::shared_ptr<state_t> state_;
std::unique_ptr<state_t, state_deleter_t> state_;
bool term_eof_;
}; // collation_token_stream
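
The change above, repeated for the other analyzers in this PR, swaps a `std::shared_ptr<state_t>` pimpl member for a `std::unique_ptr<state_t, state_deleter_t>` whose deleter is declared in the header and defined in the .cpp once `state_t` is complete; that keeps single ownership without requiring the full `state_t` definition wherever the analyzer is destroyed. A self-contained sketch of the same pattern, with illustrative `widget`/`impl` names rather than iresearch types:

// widget.hpp -- illustrative pimpl with a custom deleter (not iresearch code)
#include <cstdio>
#include <memory>

class widget {
 public:
  widget();
  int value() const;

 private:
  struct impl;                               // incomplete type at this point
  struct impl_deleter {
    void operator()(impl*) const noexcept;   // declared only; defined where impl is complete
  };
  std::unique_ptr<impl, impl_deleter> impl_; // works even though impl is incomplete here
};

// widget.cpp -- impl is a complete type below, so the deleter may delete it
struct widget::impl {
  int value = 42;
};

void widget::impl_deleter::operator()(impl* p) const noexcept {
  delete p;  // complete type here, unlike std::default_delete in the header-only case
}

widget::widget() : impl_{new impl{}} {}

int widget::value() const { return impl_->value; }

int main() {
  widget w;
  std::printf("%d\n", w.value());  // prints 42; the state is released via impl_deleter
}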

3rdParty/iresearch/core/analysis/segmentation_token_stream.cpp
@@ -284,12 +284,15 @@ struct segmentation_token_stream::state_t {
iterator_t end;
};

void segmentation_token_stream::state_deleter_t::operator()(state_t* p) const noexcept {
delete p;
}

REGISTER_ANALYZER_VPACK(segmentation_token_stream, make_vpack,
normalize_vpack_config);
REGISTER_ANALYZER_JSON(segmentation_token_stream, make_json,
normalize_json_config);


/*static*/ void segmentation_token_stream::init() {
REGISTER_ANALYZER_VPACK(segmentation_token_stream, make_vpack,
normalize_vpack_config); // match registration above
@@ -300,7 +303,8 @@ REGISTER_ANALYZER_JSON(segmentation_token_stream, make_json,
segmentation_token_stream::segmentation_token_stream(
segmentation_token_stream::options_t&& options)
: analyzer{ irs::type<segmentation_token_stream>::get() },
state_(memory::make_unique<state_t>()), options_(options) {
state_{new state_t()},
options_{options} {
}

bool segmentation_token_stream::next() {
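
As in the collation analyzer, the constructor above now builds the state with a bare `new state_t()` instead of `memory::make_unique<state_t>()`. That follows from the deleter change rather than style: `std::make_unique`-style helpers always pair the object with `std::default_delete`, so a `unique_ptr` carrying a custom deleter has to be constructed from a raw pointer. A short illustration under those assumptions (placeholder types, not the iresearch ones):

// Illustrative only: why a unique_ptr with a custom deleter is built from `new`.
#include <memory>

struct state_t { int x = 0; };                 // placeholder for the real analyzer state

struct state_deleter_t {
  void operator()(state_t* p) const noexcept { delete p; }
};

int main() {
  // A unique_ptr with a custom deleter is constructed from a raw pointer.
  std::unique_ptr<state_t, state_deleter_t> a{new state_t()};

  // std::make_unique always yields unique_ptr<state_t, std::default_delete<state_t>>,
  // a different type that does not convert to the custom-deleter member above.
  std::unique_ptr<state_t> b = std::make_unique<state_t>();

  return a->x + b->x;
}
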
3rdParty/iresearch/core/analysis/segmentation_token_stream.hpp
@@ -19,6 +19,7 @@
///
/// @author Andrei Lobov
////////////////////////////////////////////////////////////////////////////////

#ifndef IRESEARCH_SEGMENTATION_TOKEN_STREAM_H
#define IRESEARCH_SEGMENTATION_TOKEN_STREAM_H

@@ -28,7 +29,6 @@
#include "token_attributes.hpp"
#include "utils/frozen_attributes.hpp"


namespace iresearch {
namespace analysis {
class segmentation_token_stream final
@@ -56,16 +56,18 @@ class segmentation_token_stream final
virtual bool next() override;
virtual bool reset(const string_ref& data) override;


private:
using attributes = std::tuple<
increment,
offset,
term_attribute>;

struct state_t;
struct state_deleter_t {
void operator()(state_t*) const noexcept;
};

std::shared_ptr<state_t> state_;
std::unique_ptr<state_t, state_deleter_t> state_;
options_t options_;
std::string term_buf_; // buffer for value if value cannot be referenced directly
attributes attrs_;
28 changes: 17 additions & 11 deletions 3rdParty/iresearch/core/analysis/text_token_normalizing_stream.cpp
@@ -67,11 +67,15 @@ namespace analysis {
struct text_token_normalizing_stream::state_t {
icu::UnicodeString data;
icu::Locale icu_locale;
std::shared_ptr<const icu::Normalizer2> normalizer;
const options_t options;
std::string term_buf; // used by reset()
std::shared_ptr<icu::Transliterator> transliterator;
state_t(const options_t& opts): icu_locale("C"), options(opts) {
const icu::Normalizer2* normalizer; // reusable object owned by ICU
std::unique_ptr<icu::Transliterator> transliterator;

explicit state_t(const options_t& opts)
: icu_locale{"C"},
options{opts},
normalizer{} {
// NOTE: use of the default constructor for Locale() or
// use of Locale::createFromName(nullptr)
// causes a memory leak with Boost 1.58, as detected by valgrind
@@ -343,11 +347,16 @@ REGISTER_ANALYZER_VPACK(analysis::text_token_normalizing_stream, make_vpack,
namespace iresearch {
namespace analysis {

void text_token_normalizing_stream::state_deleter_t::operator()(
state_t* p) const noexcept {
delete p;
}

text_token_normalizing_stream::text_token_normalizing_stream(
const options_t& options)
: analyzer{irs::type<text_token_normalizing_stream>::get()},
state_(memory::make_unique<state_t>(options)),
term_eof_(true) {
state_{new state_t{options}},
term_eof_{true} {
}

/*static*/ void text_token_normalizing_stream::init() {
@@ -390,12 +399,10 @@ bool text_token_normalizing_stream::reset(const string_ref& data) {

if (!state_->normalizer) {
// reusable object owned by ICU
state_->normalizer.reset(
icu::Normalizer2::getNFCInstance(err), [](const icu::Normalizer2*)->void{}
);
state_->normalizer = icu::Normalizer2::getNFCInstance(err);

if (!U_SUCCESS(err) || !state_->normalizer) {
state_->normalizer.reset();
state_->normalizer = nullptr;

return false;
}
@@ -408,8 +415,7 @@ bool text_token_normalizing_stream::reset(const string_ref& data) {

// reusable object owned by *this
state_->transliterator.reset(icu::Transliterator::createInstance(
collationRule, UTransDirection::UTRANS_FORWARD, err
));
collationRule, UTransDirection::UTRANS_FORWARD, err));

if (!U_SUCCESS(err) || !state_->transliterator) {
state_->transliterator.reset();
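
In the `reset()` hunk above the NFC normalizer is now held as a plain `const icu::Normalizer2*` rather than a `shared_ptr` with a no-op deleter, because `icu::Normalizer2::getNFCInstance()` hands back an ICU-owned singleton that callers must never delete. A small standalone sketch of that ownership rule (requires ICU; the sample string is only for illustration):

// Minimal ICU example of the non-owning Normalizer2 usage shown above.
#include <unicode/normalizer2.h>
#include <unicode/unistr.h>
#include <iostream>
#include <string>

int main() {
  UErrorCode err = U_ZERO_ERROR;
  // Not owned by us: getNFCInstance() returns a process-wide instance managed by ICU.
  const icu::Normalizer2* nfc = icu::Normalizer2::getNFCInstance(err);
  if (U_FAILURE(err) || nfc == nullptr) {
    return 1;
  }

  // "Cafe" + U+0301 (combining acute) as UTF-8; NFC composes it into a single 'é'.
  icu::UnicodeString input = icu::UnicodeString::fromUTF8("Cafe\xCC\x81");
  icu::UnicodeString normalized = nfc->normalize(input, err);
  if (U_FAILURE(err)) {
    return 1;
  }

  std::string utf8;
  normalized.toUTF8String(utf8);
  std::cout << utf8 << "\n";  // "Café"
  // No delete here: the Normalizer2 instance stays owned by ICU.
  return 0;
}
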
3rdParty/iresearch/core/analysis/text_token_normalizing_stream.hpp
@@ -46,8 +46,6 @@ class text_token_normalizing_stream final
bool accent{true}; // no extra normalization
};

struct state_t;

static constexpr string_ref type_name() noexcept { return "norm"; }
static void init(); // for trigering registration in a static build
static ptr make(const string_ref& locale);
@@ -66,8 +64,13 @@ class text_token_normalizing_stream final
payload, // raw token value
term_attribute>; // token value with evaluated quotes

struct state_t;
struct state_deleter_t {
void operator()(state_t*) const noexcept;
};

attributes attrs_;
std::shared_ptr<state_t> state_;
std::unique_ptr<state_t, state_deleter_t> state_;
bool term_eof_;
};

11 changes: 5 additions & 6 deletions 3rdParty/iresearch/core/analysis/text_token_stemming_stream.cpp
@@ -212,8 +212,9 @@ namespace analysis {

text_token_stemming_stream::text_token_stemming_stream(const std::locale& locale)
: analyzer{irs::type<text_token_stemming_stream>::get()},
locale_(locale),
term_eof_(true) {
locale_{locale},
stemmer_{nullptr, &sb_stemmer_delete},
term_eof_{true} {
}

/*static*/ void text_token_stemming_stream::init() {
@@ -243,10 +244,8 @@ bool text_token_stemming_stream::reset(const string_ref& data) {
if (!stemmer_) {
stemmer_.reset(
sb_stemmer_new(
std::string(locale_utils::language(locale_)).c_str(), nullptr // defaults to utf-8
),
[](sb_stemmer* ptr)->void{ sb_stemmer_delete(ptr); }
);
std::string(locale_utils::language(locale_)).c_str(),
nullptr)); // defaults to utf-8
}

auto& term = std::get<term_attribute>(attrs_);
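
The stemmer member becomes `std::unique_ptr<sb_stemmer, void(*)(sb_stemmer*)>`: a unique_ptr whose deleter is a plain function pointer, which therefore has to be supplied in the constructor (`stemmer_{nullptr, &sb_stemmer_delete}`) even while the pointer is still null. The same idiom sketched with a standard-library C handle instead of libstemmer, so the example stays self-contained:

// Illustrative only: function-pointer deleter wrapping a C-style handle,
// mirroring std::unique_ptr<sb_stemmer, void(*)(sb_stemmer*)> above.
#include <cstdio>
#include <memory>

using file_ptr = std::unique_ptr<std::FILE, int (*)(std::FILE*)>;

int main() {
  // With a function-pointer deleter the deleter must be supplied explicitly,
  // even for an initially empty pointer -- the counterpart of
  // stemmer_{nullptr, &sb_stemmer_delete} in the constructor above.
  file_ptr file{nullptr, &std::fclose};

  // Acquire the resource lazily via reset(), as reset() does with sb_stemmer_new().
  file.reset(std::fopen("example.txt", "w"));
  if (file) {
    std::fputs("hello\n", file.get());
  }
  // std::fclose runs automatically when `file` goes out of scope,
  // and only if the pointer is non-null.
  return 0;
}
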
3rdParty/iresearch/core/analysis/text_token_stemming_stream.hpp
@@ -62,8 +62,8 @@ class text_token_stemming_stream final

attributes attrs_;
std::locale locale_;
std::shared_ptr<sb_stemmer> stemmer_;
std::string term_buf_; // buffer for the last evaluated term
std::unique_ptr<sb_stemmer, void(*)(sb_stemmer*)> stemmer_;
bool term_eof_;
};
