feat: add filter capability to DocumentArray by JoanFM · Pull Request #1051 · docarray/docarray · GitHub
[go: up one dir, main page]

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docarray/base_document/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,6 @@ def __str__(self):
_console.print(self)

return capture.get().strip()

def _get_string_for_regex_filter(self):
    """Return the string representation of this document.

    Judging by the method name, this is the text that regex-based filter
    conditions are matched against; here it is simply `str(self)`.

    :return: the result of `str(self)`
    """
    return str(self)
46 changes: 46 additions & 0 deletions docarray/documents/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,22 @@ class MultiModalDoc(BaseDocument):
text_doc=Text(text="hello world, how are you doing?"),
)
mmdoc.text_doc.text = mmdoc.text_doc.url.load()

This Document can be compared against another Document of the same type or a string.
When compared against another object of the same type, the pydantic BaseModel
equality check will apply which checks the equality of every attribute,
including `id`. When compared against a str, it will check the equality
of the `text` attribute against the given string.

.. code-block:: python

from docarray.documents import Text

doc = Text(text='This is the main text', url='exampleurl.com')
doc2 = Text(text='This is the main text', url='exampleurl.com')

doc == 'This is the main text' # True
doc == doc2 # False, their ids are not equivalent
"""

text: Optional[str] = None
Expand All @@ -79,3 +95,33 @@ def validate(
if isinstance(value, str):
value = cls(text=value)
return super().validate(value)

def __eq__(self, other: Any) -> bool:
    """Compare this document against a string or another object.

    A `str` operand is compared with the `text` attribute only. Any other
    operand is delegated to the parent class's equality check.

    :param other: the string or object to compare against
    :return: whether `other` is considered equal to this document
    """
    if not isinstance(other, str):
        # BaseModel has a default equality
        return super().__eq__(other)
    return self.text == other

def __contains__(self, item: str) -> bool:
    """
    Check whether `item` is a substring of the `text` attribute, so that the
    `in` operator works on a `Text` the same way it does on a `str`.

    .. code-block:: python

        from docarray.documents import Text

        t = Text(text='this is my text document')
        assert 'text' in t
        assert 'docarray' not in t

    :param item: the string to look for inside the `text` attribute
    :return: True if `item` is a substring of `text`; False if it is not,
        or if `text` is None
    """
    if self.text is None:
        # No text at all: nothing can be contained in it.
        return False
    # Use the `in` operator rather than calling `str.__contains__` directly.
    return item in self.text

def _get_string_for_regex_filter(self):
    """Return the `text` attribute of this document.

    Judging by the method name, this is the text that regex-based filter
    conditions are matched against. The value is returned as-is, so it is
    None when `text` is None.

    :return: the value of `self.text`
    """
    return self.text
66 changes: 66 additions & 0 deletions docarray/utils/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import json

from typing import Union, Dict, List


from docarray.array.abstract_array import AnyDocumentArray
from docarray.array.array import DocumentArray


def filter(
    docs: AnyDocumentArray,
    query: Union[str, Dict, List[Dict]],
) -> AnyDocumentArray:
    """
    Filter the Documents in `docs` according to the given filter query.

    EXAMPLE USAGE

    .. code-block:: python

        from docarray import DocumentArray, BaseDocument
        from docarray.documents import Text, Image
        from docarray.utils.filter import filter


        class MyDocument(BaseDocument):
            caption: Text
            image: Image
            price: int


        docs = DocumentArray[MyDocument](
            [
                MyDocument(
                    caption='A tiger in the jungle',
                    image=Image(url='tigerphoto.png'),
                    price=100,
                ),
                MyDocument(
                    caption='A swimming turtle',
                    image=Image(url='turtlepic.png'),
                    price=50,
                ),
                MyDocument(
                    caption='A couple birdwatching with binoculars',
                    image=Image(url='binocularsphoto.png'),
                    price=30,
                ),
            ]
        )
        query = {
            '$and': {
                'image.url': {'$regex': 'photo'},
                'price': {'$lte': 50},
            }
        }

        results = filter(docs, query)
        assert len(results) == 1
        assert results[0].price == 30
        assert results[0].caption == 'A couple birdwatching with binoculars'
        assert results[0].image.url == 'binocularsphoto.png'

    :param docs: the DocumentArray where to apply the filter
    :param query: the query to filter by. A `str` query is parsed as JSON
        before being handed to the query parser. An empty query (e.g. `{}`,
        `[]`, `''` or `None`) disables filtering.
    :return: A DocumentArray containing the Documents
        in `docs` that fulfill the filter conditions in the `query`. When
        the query is empty, `docs` itself is returned unchanged.
    """
    from docarray.utils.query_language.query_parser import QueryParser

    if not query:
        # Nothing to filter on: hand the input array back untouched.
        return docs

    if isinstance(query, str):
        # String queries are expected to be JSON-encoded.
        query = json.loads(query)

    parser = QueryParser(query)
    return DocumentArray(d for d in docs if parser.evaluate(d))
Empty file.
Loading
0