diff --git a/docarray/array/array.py b/docarray/array/array.py index 62a66b0b725..b75be68aef5 100644 --- a/docarray/array/array.py +++ b/docarray/array/array.py @@ -7,16 +7,22 @@ Generic, Iterable, List, + Optional, + Sequence, Type, TypeVar, Union, + cast, + overload, ) +import numpy as np from typing_inspect import is_union_type from docarray.array.abstract_array import AnyDocumentArray from docarray.base_document import AnyDocument, BaseDocument from docarray.typing import NdArray +from docarray.utils.misc import is_torch_available if TYPE_CHECKING: from pydantic import BaseConfig @@ -30,6 +36,7 @@ T = TypeVar('T', bound='DocumentArray') T_doc = TypeVar('T_doc', bound=BaseDocument) +IndexIterType = Union[slice, Iterable[int], Iterable[bool], None] def _delegate_meth_to_data(meth_name: str) -> Callable: @@ -49,6 +56,17 @@ def _delegate_meth(self, *args, **kwargs): return _delegate_meth +def _is_np_int(item: Any) -> bool: + dtype = getattr(item, 'dtype', None) + ndim = getattr(item, 'ndim', None) + if dtype is not None and ndim is not None: + try: + return ndim == 0 and np.issubdtype(dtype, np.integer) + except TypeError: + return False + return False # this is unreachable, but mypy wants it + + class DocumentArray(AnyDocumentArray, Generic[T_doc]): """ DocumentArray is a container of Documents. @@ -66,6 +84,7 @@ class DocumentArray(AnyDocumentArray, Generic[T_doc]): .. code-block:: python from docarray import BaseDocument, DocumentArray from docarray.typing import NdArray, ImageUrl + from typing import Optional class Image(BaseDocument): @@ -79,32 +98,169 @@ class Image(BaseDocument): If your DocumentArray is homogeneous (i.e. follows the same schema), you can access - fields at the DocumentArray level (for example `da.tensor`). You can also set - fields, with `da.tensor = np.random.random([10, 100])` + fields at the DocumentArray level (for example `da.tensor` or `da.url`). + You can also set fields, with `da.tensor = np.random.random([10, 100])`: + + + .. 
code-block:: python + print(da.url) + # [ImageUrl('http://url.com/foo.png', host_type='domain'), ...] + import numpy as np + + da.tensor = np.random.random([10, 100]) + print(da.tensor) + # [NdArray([0.11299577, 0.47206767, 0.481723 , 0.34754724, 0.15016037, + # 0.88861321, 0.88317666, 0.93845579, 0.60486676, ... ]), ...] + + + You can index into a DocumentArray like a numpy array or torch tensor: + + + .. code-block:: python + da[0] # index by position + da[0:5:2] # index by slice + da[[0, 2, 3]] # index by list of indices + da[True, False, True, True, ...] # index by boolean mask + + """ document_type: Type[BaseDocument] = AnyDocument def __init__( self, - docs: Iterable[BaseDocument] = list(), + docs: Optional[Iterable[BaseDocument]] = None, tensor_type: Type['AbstractTensor'] = NdArray, ): - self._data = [doc_ for doc_ in docs] + self._data = list(docs) if docs is not None else [] self.tensor_type = tensor_type def __len__(self): return len(self._data) + @overload + def __getitem__(self: T, item: int) -> BaseDocument: + ... + + @overload + def __getitem__(self: T, item: IndexIterType) -> T: + ... 
+ def __getitem__(self, item): + item = self._normalize_index_item(item) + if type(item) == slice: return self.__class__(self._data[item]) - else: + + if isinstance(item, int): return self._data[item] + if item is None: + return self + + # _normalize_index_item() guarantees the line below is correct + head = item[0] # type: ignore + if isinstance(head, bool): + return self._get_from_mask(item) + elif isinstance(head, int): + return self._get_from_indices(item) + else: + raise TypeError(f'Invalid type {type(head)} for indexing') + + def __setitem__(self: T, key: IndexIterType, value: Union[T, BaseDocument]): + key_norm = self._normalize_index_item(key) + + if isinstance(key_norm, int): + value_int = cast(BaseDocument, value) + self._data[key_norm] = value_int + elif isinstance(key_norm, slice): + value_slice = cast(T, value) + self._data[key_norm] = value_slice + else: + # _normalize_index_item() guarantees the line below is correct + head = key_norm[0] # type: ignore + if isinstance(head, bool): + key_norm_ = cast(Iterable[bool], key_norm) + value_ = cast(Sequence[BaseDocument], value) # this is not strictly true + # set_by_mask requires value_ to have getitem which + # _normalize_index_item() ensures + return self._set_by_mask(key_norm_, value_) + elif isinstance(head, int): + key_norm__ = cast(Iterable[int], key_norm) + return self._set_by_indices(key_norm__, value) + else: + raise TypeError(f'Invalid type {type(head)} for indexing') + def __iter__(self): return iter(self._data) + @staticmethod + def _normalize_index_item( + item: Any, + ) -> Union[int, slice, Iterable[int], Iterable[bool], None]: + # basic index types + if item is None or isinstance(item, (int, slice, tuple, list)): + return item + + # numpy index types + if _is_np_int(item): + return item.item() + + index_has_getitem = hasattr(item, '__getitem__') + is_valid_bulk_index = index_has_getitem and isinstance(item, Iterable) + if not is_valid_bulk_index: + raise ValueError(f'Invalid index type 
{type(item)}') + + if isinstance(item, np.ndarray) and ( + item.dtype == np.bool_ or np.issubdtype(item.dtype, np.integer) + ): + return item.tolist() + + # torch index types + torch_available = is_torch_available() + if torch_available: + import torch + else: + raise ValueError(f'Invalid index type {type(item)}') + allowed_torch_dtypes = [ + torch.bool, + torch.int64, + ] + if isinstance(item, torch.Tensor) and (item.dtype in allowed_torch_dtypes): + return item.tolist() + + return item + + def _get_from_indices(self: T, item: Iterable[int]) -> T: + results = [] + for ix in item: + results.append(self._data[ix]) + return self.__class__(results) + + def _set_by_indices(self: T, item: Iterable[int], value: Iterable[BaseDocument]): + # here we cannot use _get_offset_to_doc() because we need to change the doc + # that a given offset points to, not just retrieve it. + # Future optimization idea: _data could be List[DocContainer], where + # DocContainer points to the doc. Then we could use _get_offset_to_container() + # to swap the doc in the container. 
+ for ix, doc_to_set in zip(item, value): + try: + self._data[ix] = doc_to_set + except KeyError: + raise IndexError(f'Index {ix} is out of range') + + def _get_from_mask(self: T, item: Iterable[bool]) -> T: + return self.__class__( + (doc for doc, mask_value in zip(self, item) if mask_value) + ) + + def _set_by_mask(self: T, item: Iterable[bool], value: Sequence[BaseDocument]): + i_value = 0 + for i, mask_value in zip(range(len(self)), item): + if mask_value: + self._data[i] = value[i_value] + i_value += 1 + append = _delegate_meth_to_data('append') extend = _delegate_meth_to_data('extend') insert = _delegate_meth_to_data('insert') diff --git a/docarray/array/array_stacked.py b/docarray/array/array_stacked.py index 1a1bb122a36..5e58447c75c 100644 --- a/docarray/array/array_stacked.py +++ b/docarray/array/array_stacked.py @@ -6,10 +6,12 @@ Iterable, List, Mapping, + Tuple, Type, TypeVar, Union, cast, + overload, ) from docarray.array.abstract_array import AnyDocumentArray @@ -34,6 +36,7 @@ TorchTensor = None # type: ignore T = TypeVar('T', bound='DocumentArrayStacked') +IndexIterType = Union[slice, Iterable[int], Iterable[bool], None] class DocumentArrayStacked(AnyDocumentArray): @@ -61,29 +64,37 @@ def __init__( self: T, docs: DocumentArray, ): - self._columns: Dict[str, Union['DocumentArrayStacked', AbstractTensor]] = {} + self._doc_columns: Dict[str, 'DocumentArrayStacked'] = {} + self._tensor_columns: Dict[str, AbstractTensor] = {} self.from_document_array(docs) def from_document_array(self: T, docs: DocumentArray): self._docs = docs self.tensor_type = self._docs.tensor_type - self._columns = self._create_columns(docs, tensor_type=self.tensor_type) + self._doc_columns, self._tensor_columns = self._create_columns( + docs, tensor_type=self.tensor_type + ) @classmethod - def _from_columns( + def _from_da_and_columns( cls: Type[T], docs: DocumentArray, - columns: Mapping[str, Union['DocumentArrayStacked', AbstractTensor]], + doc_columns: Dict[str, 
'DocumentArrayStacked'], + tensor_columns: Dict[str, AbstractTensor], ) -> T: + """Create a DocumentArrayStacked from a DocumentArray + and an associated dict of columns""" # below __class_getitem__ is called explicitly instead # of doing DocumentArrayStacked[docs.document_type] # because mypy has issues with class[...] notation at runtime. # see bug here: https://github.com/python/mypy/issues/13026 # as of 2023-01-05 it should be fixed on mypy master, though, see # here: https://github.com/python/typeshed/issues/4819#issuecomment-1354506442 - da_stacked = DocumentArray.__class_getitem__(cls.document_type)([]).stack() - da_stacked._columns = columns + + da_stacked: T = DocumentArray.__class_getitem__(cls.document_type)([]).stack() + da_stacked._doc_columns = doc_columns + da_stacked._tensor_columns = tensor_columns da_stacked._docs = docs return da_stacked @@ -92,15 +103,14 @@ def to(self: T, device: str) -> T: :param device: the device to move the data to """ - for field in self._columns.keys(): - col = self._columns[field] - if isinstance(col, AbstractTensor): - self._columns[field] = col.__class__._docarray_from_native( - col.get_comp_backend().to_device(col, device) - ) - else: # recursive call - col_docarray = cast(T, col) - col_docarray.to(device) + for field in self._tensor_columns.keys(): + col_tens: AbstractTensor = self._tensor_columns[field] + self._tensor_columns[field] = col_tens.__class__._docarray_from_native( + col_tens.get_comp_backend().to_device(col_tens, device) + ) + for field in self._doc_columns.keys(): + col_doc: 'DocumentArrayStacked' = self._doc_columns[field] + col_doc.to(device) return self @classmethod @@ -131,14 +141,15 @@ def _get_columns_schema( @classmethod def _create_columns( cls: Type[T], docs: DocumentArray, tensor_type: Type[AbstractTensor] - ) -> Dict[str, Union['DocumentArrayStacked', AbstractTensor]]: + ) -> Tuple[Dict[str, 'DocumentArrayStacked'], Dict[str, AbstractTensor]]: if len(docs) == 0: - return {} + return {}, {} 
column_schema = cls._get_columns_schema(tensor_type) - columns: Dict[str, Union[DocumentArrayStacked, AbstractTensor]] = dict() + doc_columns: Dict[str, DocumentArrayStacked] = dict() + tensor_columns: Dict[str, AbstractTensor] = dict() for field, type_ in column_schema.items(): if issubclass(type_, AbstractTensor): @@ -146,7 +157,7 @@ def _create_columns( column_shape = ( (len(docs), *tensor.shape) if tensor is not None else (len(docs),) ) - columns[field] = type_._docarray_from_native( + tensor_columns[field] = type_._docarray_from_native( type_.get_comp_backend().empty( column_shape, dtype=tensor.dtype if hasattr(tensor, 'dtype') else None, @@ -159,14 +170,14 @@ def _create_columns( if val is None: val = tensor_type.get_comp_backend().none_value() - cast(AbstractTensor, columns[field])[i] = val - setattr(doc, field, columns[field][i]) + cast(AbstractTensor, tensor_columns[field])[i] = val + setattr(doc, field, tensor_columns[field][i]) del val elif issubclass(type_, BaseDocument): - columns[field] = getattr(docs, field).stack() + doc_columns[field] = getattr(docs, field).stack() - return columns + return doc_columns, tensor_columns def _get_array_attribute( self: T, @@ -178,8 +189,10 @@ def _get_array_attribute( :return: Returns a list of the field value for each document in the array like container """ - if field in self._columns.keys(): - return self._columns[field] + if field in self._doc_columns.keys(): + return self._doc_columns[field] + elif field in self._tensor_columns.keys(): + return self._tensor_columns[field] else: return getattr(self._docs, field) @@ -193,30 +206,117 @@ def _set_array_attribute( :param field: name of the fields to extract :values: the values to set at the DocumentArray level """ - if field in self._columns.keys() and not isinstance(values, List): - self._columns[field] = values + if field in self._doc_columns.keys() and not isinstance(values, List): + values_ = cast(T, values) + self._doc_columns[field] = values_ + elif field in 
self._tensor_columns.keys() and not isinstance(values, List): + values__ = cast(AbstractTensor, values) + self._tensor_columns[field] = values__ else: setattr(self._docs, field, values) - def __getitem__(self, item): # note this should handle slices - if isinstance(item, slice): - return self._get_slice(item) + @overload + def __getitem__(self: T, item: int) -> BaseDocument: + ... + + @overload + def __getitem__(self: T, item: IndexIterType) -> T: + ... + + def __getitem__(self, item): + if item is None: + return self # PyTorch behaviour + # multiple docs case + if isinstance(item, (slice, Iterable)): + item_ = cast(Iterable, item) + return self._get_from_data_and_columns(item_) + # single doc case doc = self._docs[item] - # NOTE: this could be speed up by using a cache - for field in self._columns.keys(): - setattr(doc, field, self._columns[field][item]) + for field in self._doc_columns.keys(): + setattr(doc, field, self._doc_columns[field][item]) + for field in self._tensor_columns.keys(): + setattr(doc, field, self._tensor_columns[field][item]) + return doc + + def __setitem__( + self: T, key: Union[int, IndexIterType], value: Union[T, BaseDocument] + ): + # multiple docs case + if isinstance(key, (slice, Iterable)): + return self._set_data_and_columns(key, value) + # single doc case + doc = self._docs[key] + for field in self._doc_columns.keys(): + setattr(doc, field, self._doc_columns[field][key]) + for field in self._doc_columns.keys(): + setattr(doc, field, self._doc_columns[field][key]) return doc - def _get_slice(self: T, item: slice) -> T: - """Return a slice of the DocumentArrayStacked + def _get_from_data_and_columns(self: T, item: Union[Tuple, Iterable]) -> T: + """Delegates the access to the data and the columns, + and combines into a stacked da. + + :param item: the item used as index. 
Needs to be a valid index for both + DocumentArray (data) and column types (torch/tensorflow/numpy tensors) + :return: a DocumentArrayStacked, indexed according to `item` + """ + if isinstance(item, tuple): + item = list(item) + # get documents + docs_indexed = self._docs[item] + # get doc columns + doc_columns_indexed = {k: col[item] for k, col in self._doc_columns.items()} + doc_columns_indexed_ = cast( + Dict[str, 'DocumentArrayStacked'], doc_columns_indexed + ) + # get tensor columns + tensor_columns_indexed = { + k: col[item] for k, col in self._tensor_columns.items() + } + return self._from_da_and_columns( + docs_indexed, doc_columns_indexed_, tensor_columns_indexed + ) + + def _set_data_and_columns( + self: T, + index_item: Union[Tuple, Iterable, slice], + value: Union[T, BaseDocument], + ): + """Delegates the setting to the data and the columns. - :param item: the slice to apply - :return: a DocumentArrayStacked + :param index_item: the key used as index. Needs to be a valid index for both + DocumentArray (data) and column types (torch/tensorflow/numpy tensors) + :param value: the value to set at the `index_item` location """ + if isinstance(index_item, tuple): + index_item = list(index_item) + + # set data and prepare columns + doc_cols_to_set: Dict[str, DocumentArrayStacked] + tens_cols_to_set: Dict[str, AbstractTensor] + if isinstance(value, DocumentArray): + self._docs[index_item] = value + doc_cols_to_set, tens_cols_to_set = self._create_columns( + value, self.tensor_type + ) + elif isinstance(value, BaseDocument): + self._docs[index_item] = value + doc_cols_to_set, tens_cols_to_set = self._create_columns( + DocumentArray.__class_getitem__(self.document_type)([value]), + self.tensor_type, + ) + elif isinstance(value, DocumentArrayStacked): + self._docs[index_item] = value._docs + doc_cols_to_set = value._doc_columns + tens_cols_to_set = value._tensor_columns + else: + raise TypeError(f'Can not set a DocumentArrayStacked with {type(value)}') - columns_sliced = {k: 
col[item] for k, col in self._columns.items()} - columns_sliced_ = cast(Dict[str, Union[AbstractTensor, T]], columns_sliced) - return self._from_columns(self._docs[item], columns_sliced_) + # set columns + for col_key in self._doc_columns.keys(): + self._doc_columns[col_key][index_item] = doc_cols_to_set[col_key] + for col_key in self._tensor_columns.keys(): + self._tensor_columns[col_key][index_item] = tens_cols_to_set[col_key] def __iter__(self): for i in range(len(self)): @@ -233,10 +333,11 @@ def from_protobuf(cls: Type[T], pb_msg: 'DocumentArrayStackedProto') -> T: cls.document_type.from_protobuf(doc_proto) for doc_proto in pb_msg.list_.docs ) - da = cls(DocumentArray([])) + da: T = cls(DocumentArray([])) da._docs = docs - da._columns = pb_msg.columns + da._doc_columns = pb_msg.doc_columns + da._tensor_columns = pb_msg.tensor_columns return da def to_protobuf(self) -> 'DocumentArrayStackedProto': @@ -244,23 +345,25 @@ def to_protobuf(self) -> 'DocumentArrayStackedProto': from docarray.proto import ( DocumentArrayProto, DocumentArrayStackedProto, - UnionArrayProto, + NdArrayProto, ) da_proto = DocumentArrayProto() for doc in self: da_proto.docs.append(doc.to_protobuf()) - columns_proto: Dict[str, UnionArrayProto] = dict() - for field, column in self._columns.items(): - if isinstance(column, DocumentArrayStacked): - columns_proto[field] = UnionArrayProto( - document_array=DocumentArrayProto(stack=column.to_protobuf()) - ) - elif isinstance(column, AbstractTensor): - columns_proto[field] = UnionArrayProto(ndarray=column.to_protobuf()) - - return DocumentArrayStackedProto(list_=da_proto, columns=columns_proto) + doc_columns_proto: Dict[str, DocumentArrayStackedProto] = dict() + tens_columns_proto: Dict[str, NdArrayProto] = dict() + for field, col_doc in self._doc_columns.items(): + doc_columns_proto[field] = col_doc.to_protobuf() + for field, col_tens in self._tensor_columns.items(): + tens_columns_proto[field] = col_tens.to_protobuf() + + return 
DocumentArrayStackedProto( + list_=da_proto, + doc_columns=doc_columns_proto, + tensor_columns=tens_columns_proto, + ) def unstack(self: T) -> DocumentArray: """Convert DocumentArrayStacked into a DocumentArray. @@ -268,18 +371,26 @@ def unstack(self: T) -> DocumentArray: Note this destroys the arguments and returns a new DocumentArray """ for i, doc in enumerate(self._docs): - for field in self._columns.keys(): - val = self._columns[field] - setattr(doc, field, val[i]) + for field in self._doc_columns.keys(): + val_doc = self._doc_columns[field] + setattr(doc, field, val_doc[i]) + + for field in self._tensor_columns.keys(): + val_tens = self._tensor_columns[field] + setattr(doc, field, val_tens[i]) # NOTE: here we might need to copy the tensor # see here # https://discuss.pytorch.org/t/what-happened-to-a-view-of-a-tensor # -when-the-original-tensor-is-deleted/167294 # noqa: E501 - for field in list(self._columns.keys()): + for field in list(self._doc_columns.keys()): + # list needed here otherwise we are modifying the dict while iterating + del self._doc_columns[field] + + for field in list(self._tensor_columns.keys()): # list needed here otherwise we are modifying the dict while iterating - del self._columns[field] + del self._tensor_columns[field] da_list = self._docs return da_list diff --git a/docarray/base_document/mixins/proto.py b/docarray/base_document/mixins/proto.py index ba2a9ac39bd..3d0ef70be15 100644 --- a/docarray/base_document/mixins/proto.py +++ b/docarray/base_document/mixins/proto.py @@ -8,12 +8,6 @@ if TYPE_CHECKING: from docarray.proto import DocumentProto, NodeProto -try: - import torch # noqa: F401 -except ImportError: - torch_imported = False -else: - torch_imported = True T = TypeVar('T', bound='ProtoMixin') diff --git a/docarray/display/document_array_summary.py b/docarray/display/document_array_summary.py index 1f32b9f970e..7899fbbe452 100644 --- a/docarray/display/document_array_summary.py +++ b/docarray/display/document_array_summary.py 
@@ -60,18 +60,15 @@ def _get_stacked_fields(da: 'DocumentArrayStacked') -> List[str]: stacked, i.e. all the fields that are of type AbstractTensor. Nested field paths are separated by dot, such as: 'attr.nested_attr'. """ - from docarray.array import DocumentArrayStacked - fields = [] - for field_name, value in da._columns.items(): - if isinstance(value, AbstractTensor): - fields.append(field_name) - elif isinstance(value, DocumentArrayStacked): - fields.extend( - [ - f'{field_name}.{x}' - for x in DocumentArraySummary._get_stacked_fields(da=value) - ] - ) + for field_name, value_tens in da._tensor_columns.items(): + fields.append(field_name) + for field_name, value_doc in da._doc_columns.items(): + fields.extend( + [ + f'{field_name}.{x}' + for x in DocumentArraySummary._get_stacked_fields(da=value_doc) + ] + ) return fields diff --git a/docarray/documents/audio.py b/docarray/documents/audio.py index c27eb175a6f..dcf3e47b3fa 100644 --- a/docarray/documents/audio.py +++ b/docarray/documents/audio.py @@ -7,14 +7,12 @@ from docarray.typing.bytes.audio_bytes import AudioBytes from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.typing.tensor.audio.audio_tensor import AudioTensor +from docarray.utils.misc import is_torch_available -try: +torch_available = is_torch_available() +if torch_available: import torch - torch_available = True -except ImportError: - torch_available = False - T = TypeVar('T', bound='Audio') diff --git a/docarray/documents/image.py b/docarray/documents/image.py index 84af89d562b..4b75185f90d 100644 --- a/docarray/documents/image.py +++ b/docarray/documents/image.py @@ -6,16 +6,14 @@ from docarray.typing import AnyEmbedding, ImageBytes, ImageUrl from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.typing.tensor.image.image_tensor import ImageTensor +from docarray.utils.misc import is_torch_available T = TypeVar('T', bound='Image') -try: +torch_available = is_torch_available() +if 
torch_available: import torch - torch_available = True -except ImportError: - torch_available = False - class Image(BaseDocument): """ diff --git a/docarray/documents/point_cloud.py b/docarray/documents/point_cloud.py index 3d22858efa9..23ec687ce57 100644 --- a/docarray/documents/point_cloud.py +++ b/docarray/documents/point_cloud.py @@ -5,14 +5,12 @@ from docarray.base_document import BaseDocument from docarray.typing import AnyEmbedding, AnyTensor, PointCloud3DUrl from docarray.typing.tensor.abstract_tensor import AbstractTensor +from docarray.utils.misc import is_torch_available -try: +torch_available = is_torch_available() +if torch_available: import torch - torch_available = True -except ImportError: - torch_available = False - T = TypeVar('T', bound='PointCloud3D') diff --git a/docarray/documents/video.py b/docarray/documents/video.py index 7beba15b2fd..3912ebc51ae 100644 --- a/docarray/documents/video.py +++ b/docarray/documents/video.py @@ -8,14 +8,12 @@ from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.typing.tensor.video.video_tensor import VideoTensor from docarray.typing.url.video_url import VideoUrl +from docarray.utils.misc import is_torch_available -try: +torch_available = is_torch_available() +if torch_available: import torch - torch_available = True -except ImportError: - torch_available = False - T = TypeVar('T', bound='Video') diff --git a/docarray/proto/__init__.py b/docarray/proto/__init__.py index 6d749ebde51..70984d4eff4 100644 --- a/docarray/proto/__init__.py +++ b/docarray/proto/__init__.py @@ -7,7 +7,6 @@ DocumentProto, NdArrayProto, NodeProto, - UnionArrayProto, ) else: from docarray.proto.pb2.docarray_pb2 import ( @@ -16,7 +15,6 @@ DocumentProto, NdArrayProto, NodeProto, - UnionArrayProto, ) __all__ = [ @@ -26,5 +24,4 @@ 'NodeProto', 'DocumentArrayStackedProto', 'DocumentArrayProto', - 'UnionArrayProto', ] diff --git a/docarray/proto/docarray.proto b/docarray/proto/docarray.proto index 
122e866be98..aa7731023c8 100644 --- a/docarray/proto/docarray.proto +++ b/docarray/proto/docarray.proto @@ -84,14 +84,9 @@ message DocumentArrayProto { repeated DocumentProto docs = 1; // a list of Documents } -message UnionArrayProto { // represent the column of the da stacked - oneof content { - DocumentArrayStackedProto document_array = 1; - NdArrayProto ndarray = 2; - } -} - message DocumentArrayStackedProto{ DocumentArrayProto list_ = 1; // a list of Documents - map<string, UnionArrayProto> columns = 2; // a dict of columns + map<string, DocumentArrayStackedProto> doc_columns = 2; // a dict of document columns + map<string, NdArrayProto> tensor_columns = 3; // a dict of tensor columns + } \ No newline at end of file diff --git a/docarray/proto/pb/docarray_pb2.py b/docarray/proto/pb/docarray_pb2.py index 5e1630db76d..e6842593369 100644 --- a/docarray/proto/pb/docarray_pb2.py +++ b/docarray/proto/pb/docarray_pb2.py @@ -16,7 +16,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xcb\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12+\n\x08\x64ocument\x18\x07 \x01(\x0b\x32\x17.docarray.DocumentProtoH\x00\x12\x36\n\x0e\x64ocument_array\x18\x08 
\x01(\x0b\x32\x1c.docarray.DocumentArrayProtoH\x00\x12*\n\x04list\x18\t \x01(\x0b\x32\x1a.google.protobuf.ListValueH\x00\x12)\n\x03set\x18\n \x01(\x0b\x32\x1a.google.protobuf.ListValueH\x00\x12+\n\x05tuple\x18\x0b \x01(\x0b\x32\x1a.google.protobuf.ListValueH\x00\x12\'\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x17.google.protobuf.StructH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"\x82\x01\n\rDocumentProto\x12/\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32!.docarray.DocumentProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\";\n\x12\x44ocumentArrayProto\x12%\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x17.docarray.DocumentProto\"\x86\x01\n\x0fUnionArrayProto\x12=\n\x0e\x64ocument_array\x18\x01 \x01(\x0b\x32#.docarray.DocumentArrayStackedProtoH\x00\x12)\n\x07ndarray\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x42\t\n\x07\x63ontent\"\xd6\x01\n\x19\x44ocumentArrayStackedProto\x12+\n\x05list_\x18\x01 \x01(\x0b\x32\x1c.docarray.DocumentArrayProto\x12\x41\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x30.docarray.DocumentArrayStackedProto.ColumnsEntry\x1aI\n\x0c\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12(\n\x05value\x18\x02 \x01(\x0b\x32\x19.docarray.UnionArrayProto:\x02\x38\x01\x62\x06proto3' + b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 
\x03(\x0b\x32\x16.docarray.KeyValuePair\"\xcb\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12+\n\x08\x64ocument\x18\x07 \x01(\x0b\x32\x17.docarray.DocumentProtoH\x00\x12\x36\n\x0e\x64ocument_array\x18\x08 \x01(\x0b\x32\x1c.docarray.DocumentArrayProtoH\x00\x12*\n\x04list\x18\t \x01(\x0b\x32\x1a.google.protobuf.ListValueH\x00\x12)\n\x03set\x18\n \x01(\x0b\x32\x1a.google.protobuf.ListValueH\x00\x12+\n\x05tuple\x18\x0b \x01(\x0b\x32\x1a.google.protobuf.ListValueH\x00\x12\'\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x17.google.protobuf.StructH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"\x82\x01\n\rDocumentProto\x12/\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32!.docarray.DocumentProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\";\n\x12\x44ocumentArrayProto\x12%\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x17.docarray.DocumentProto\"\x88\x03\n\x19\x44ocumentArrayStackedProto\x12+\n\x05list_\x18\x01 \x01(\x0b\x32\x1c.docarray.DocumentArrayProto\x12H\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32\x33.docarray.DocumentArrayStackedProto.DocColumnsEntry\x12N\n\x0etensor_columns\x18\x03 \x03(\x0b\x32\x36.docarray.DocumentArrayStackedProto.TensorColumnsEntry\x1aV\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.docarray.DocumentArrayStackedProto:\x02\x38\x01\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x62\x06proto3' ) _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) @@ -26,8 +26,10 @@ DESCRIPTOR._options = None _DOCUMENTPROTO_DATAENTRY._options = None 
_DOCUMENTPROTO_DATAENTRY._serialized_options = b'8\001' - _DOCUMENTARRAYSTACKEDPROTO_COLUMNSENTRY._options = None - _DOCUMENTARRAYSTACKEDPROTO_COLUMNSENTRY._serialized_options = b'8\001' + _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._options = None + _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_options = b'8\001' + _DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._options = None + _DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_options = b'8\001' _DENSENDARRAYPROTO._serialized_start = 58 _DENSENDARRAYPROTO._serialized_end = 123 _NDARRAYPROTO._serialized_start = 125 @@ -44,10 +46,10 @@ _DOCUMENTPROTO_DATAENTRY._serialized_end = 976 _DOCUMENTARRAYPROTO._serialized_start = 978 _DOCUMENTARRAYPROTO._serialized_end = 1037 - _UNIONARRAYPROTO._serialized_start = 1040 - _UNIONARRAYPROTO._serialized_end = 1174 - _DOCUMENTARRAYSTACKEDPROTO._serialized_start = 1177 - _DOCUMENTARRAYSTACKEDPROTO._serialized_end = 1391 - _DOCUMENTARRAYSTACKEDPROTO_COLUMNSENTRY._serialized_start = 1318 - _DOCUMENTARRAYSTACKEDPROTO_COLUMNSENTRY._serialized_end = 1391 + _DOCUMENTARRAYSTACKEDPROTO._serialized_start = 1040 + _DOCUMENTARRAYSTACKEDPROTO._serialized_end = 1432 + _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_start = 1268 + _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_end = 1354 + _DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_start = 1356 + _DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_end = 1432 # @@protoc_insertion_point(module_scope) diff --git a/docarray/proto/pb2/docarray_pb2.py b/docarray/proto/pb2/docarray_pb2.py index 75b97c6f50c..bda933ee03b 100644 --- a/docarray/proto/pb2/docarray_pb2.py +++ b/docarray/proto/pb2/docarray_pb2.py @@ -3,9 +3,11 @@ # source: docarray.proto """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import message as _message from google.protobuf import reflection as 
_reflection from google.protobuf import symbol_database as _symbol_database + # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -13,682 +15,175 @@ from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2 - -DESCRIPTOR = _descriptor.FileDescriptor( - name='docarray.proto', - package='docarray', - syntax='proto3', - serialized_options=None, - create_key=_descriptor._internal_create_key, - serialized_pb=b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xcb\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12+\n\x08\x64ocument\x18\x07 \x01(\x0b\x32\x17.docarray.DocumentProtoH\x00\x12\x36\n\x0e\x64ocument_array\x18\x08 \x01(\x0b\x32\x1c.docarray.DocumentArrayProtoH\x00\x12*\n\x04list\x18\t \x01(\x0b\x32\x1a.google.protobuf.ListValueH\x00\x12)\n\x03set\x18\n \x01(\x0b\x32\x1a.google.protobuf.ListValueH\x00\x12+\n\x05tuple\x18\x0b \x01(\x0b\x32\x1a.google.protobuf.ListValueH\x00\x12\'\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x17.google.protobuf.StructH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"\x82\x01\n\rDocumentProto\x12/\n\x04\x64\x61ta\x18\x01 
\x03(\x0b\x32!.docarray.DocumentProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\";\n\x12\x44ocumentArrayProto\x12%\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x17.docarray.DocumentProto\"\x86\x01\n\x0fUnionArrayProto\x12=\n\x0e\x64ocument_array\x18\x01 \x01(\x0b\x32#.docarray.DocumentArrayStackedProtoH\x00\x12)\n\x07ndarray\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x42\t\n\x07\x63ontent\"\xd6\x01\n\x19\x44ocumentArrayStackedProto\x12+\n\x05list_\x18\x01 \x01(\x0b\x32\x1c.docarray.DocumentArrayProto\x12\x41\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x30.docarray.DocumentArrayStackedProto.ColumnsEntry\x1aI\n\x0c\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12(\n\x05value\x18\x02 \x01(\x0b\x32\x19.docarray.UnionArrayProto:\x02\x38\x01\x62\x06proto3' - , - dependencies=[google_dot_protobuf_dot_struct__pb2.DESCRIPTOR,]) - - - - -_DENSENDARRAYPROTO = _descriptor.Descriptor( - name='DenseNdArrayProto', - full_name='docarray.DenseNdArrayProto', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='buffer', full_name='docarray.DenseNdArrayProto.buffer', index=0, - number=1, type=12, cpp_type=9, label=1, - has_default_value=False, default_value=b"", - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='shape', full_name='docarray.DenseNdArrayProto.shape', index=1, - number=2, type=13, cpp_type=3, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='dtype', 
full_name='docarray.DenseNdArrayProto.dtype', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=58, - serialized_end=123, -) - - -_NDARRAYPROTO = _descriptor.Descriptor( - name='NdArrayProto', - full_name='docarray.NdArrayProto', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='dense', full_name='docarray.NdArrayProto.dense', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='parameters', full_name='docarray.NdArrayProto.parameters', index=1, - number=2, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=125, - serialized_end=228, -) - - -_KEYVALUEPAIR = _descriptor.Descriptor( - name='KeyValuePair', - full_name='docarray.KeyValuePair', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - 
fields=[ - _descriptor.FieldDescriptor( - name='key', full_name='docarray.KeyValuePair.key', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='value', full_name='docarray.KeyValuePair.value', index=1, - number=2, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=230, - serialized_end=320, +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"Z\n\x0cKeyValuePair\x12#\n\x03key\x18\x01 \x01(\x0b\x32\x16.google.protobuf.Value\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.google.protobuf.Value\";\n\x10GenericDictValue\x12\'\n\x07\x65ntries\x18\x01 \x03(\x0b\x32\x16.docarray.KeyValuePair\"\xcb\x03\n\tNodeProto\x12\x0e\n\x04text\x18\x01 \x01(\tH\x00\x12\x11\n\x07integer\x18\x02 \x01(\x05H\x00\x12\x0f\n\x05\x66loat\x18\x03 \x01(\x01H\x00\x12\x11\n\x07\x62oolean\x18\x04 \x01(\x08H\x00\x12\x0e\n\x04\x62lob\x18\x05 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x06 
\x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12+\n\x08\x64ocument\x18\x07 \x01(\x0b\x32\x17.docarray.DocumentProtoH\x00\x12\x36\n\x0e\x64ocument_array\x18\x08 \x01(\x0b\x32\x1c.docarray.DocumentArrayProtoH\x00\x12*\n\x04list\x18\t \x01(\x0b\x32\x1a.google.protobuf.ListValueH\x00\x12)\n\x03set\x18\n \x01(\x0b\x32\x1a.google.protobuf.ListValueH\x00\x12+\n\x05tuple\x18\x0b \x01(\x0b\x32\x1a.google.protobuf.ListValueH\x00\x12\'\n\x04\x64ict\x18\x0c \x01(\x0b\x32\x17.google.protobuf.StructH\x00\x12\x0e\n\x04type\x18\r \x01(\tH\x01\x42\t\n\x07\x63ontentB\x0f\n\rdocarray_type\"\x82\x01\n\rDocumentProto\x12/\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32!.docarray.DocumentProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\";\n\x12\x44ocumentArrayProto\x12%\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x17.docarray.DocumentProto\"\x88\x03\n\x19\x44ocumentArrayStackedProto\x12+\n\x05list_\x18\x01 \x01(\x0b\x32\x1c.docarray.DocumentArrayProto\x12H\n\x0b\x64oc_columns\x18\x02 \x03(\x0b\x32\x33.docarray.DocumentArrayStackedProto.DocColumnsEntry\x12N\n\x0etensor_columns\x18\x03 \x03(\x0b\x32\x36.docarray.DocumentArrayStackedProto.TensorColumnsEntry\x1aV\n\x0f\x44ocColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x32\n\x05value\x18\x02 \x01(\x0b\x32#.docarray.DocumentArrayStackedProto:\x02\x38\x01\x1aL\n\x12TensorColumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProto:\x02\x38\x01\x62\x06proto3' ) -_GENERICDICTVALUE = _descriptor.Descriptor( - name='GenericDictValue', - full_name='docarray.GenericDictValue', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='entries', full_name='docarray.GenericDictValue.entries', index=0, - number=1, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, 
containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=322, - serialized_end=381, +_DENSENDARRAYPROTO = DESCRIPTOR.message_types_by_name['DenseNdArrayProto'] +_NDARRAYPROTO = DESCRIPTOR.message_types_by_name['NdArrayProto'] +_KEYVALUEPAIR = DESCRIPTOR.message_types_by_name['KeyValuePair'] +_GENERICDICTVALUE = DESCRIPTOR.message_types_by_name['GenericDictValue'] +_NODEPROTO = DESCRIPTOR.message_types_by_name['NodeProto'] +_DOCUMENTPROTO = DESCRIPTOR.message_types_by_name['DocumentProto'] +_DOCUMENTPROTO_DATAENTRY = _DOCUMENTPROTO.nested_types_by_name['DataEntry'] +_DOCUMENTARRAYPROTO = DESCRIPTOR.message_types_by_name['DocumentArrayProto'] +_DOCUMENTARRAYSTACKEDPROTO = DESCRIPTOR.message_types_by_name[ + 'DocumentArrayStackedProto' +] +_DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY = ( + _DOCUMENTARRAYSTACKEDPROTO.nested_types_by_name['DocColumnsEntry'] ) - - -_NODEPROTO = _descriptor.Descriptor( - name='NodeProto', - full_name='docarray.NodeProto', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='text', full_name='docarray.NodeProto.text', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='integer', full_name='docarray.NodeProto.integer', index=1, - number=2, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, 
- is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='float', full_name='docarray.NodeProto.float', index=2, - number=3, type=1, cpp_type=5, label=1, - has_default_value=False, default_value=float(0), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='boolean', full_name='docarray.NodeProto.boolean', index=3, - number=4, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='blob', full_name='docarray.NodeProto.blob', index=4, - number=5, type=12, cpp_type=9, label=1, - has_default_value=False, default_value=b"", - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='ndarray', full_name='docarray.NodeProto.ndarray', index=5, - number=6, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='document', full_name='docarray.NodeProto.document', index=6, - number=7, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, 
create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='document_array', full_name='docarray.NodeProto.document_array', index=7, - number=8, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='list', full_name='docarray.NodeProto.list', index=8, - number=9, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='set', full_name='docarray.NodeProto.set', index=9, - number=10, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='tuple', full_name='docarray.NodeProto.tuple', index=10, - number=11, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='dict', full_name='docarray.NodeProto.dict', index=11, - number=12, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='type', 
full_name='docarray.NodeProto.type', index=12, - number=13, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - _descriptor.OneofDescriptor( - name='content', full_name='docarray.NodeProto.content', - index=0, containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[]), - _descriptor.OneofDescriptor( - name='docarray_type', full_name='docarray.NodeProto.docarray_type', - index=1, containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[]), - ], - serialized_start=384, - serialized_end=843, +_DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY = ( + _DOCUMENTARRAYSTACKEDPROTO.nested_types_by_name['TensorColumnsEntry'] ) - - -_DOCUMENTPROTO_DATAENTRY = _descriptor.Descriptor( - name='DataEntry', - full_name='docarray.DocumentProto.DataEntry', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='key', full_name='docarray.DocumentProto.DataEntry.key', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='value', full_name='docarray.DocumentProto.DataEntry.value', index=1, - number=2, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, 
extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=b'8\001', - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=912, - serialized_end=976, +DenseNdArrayProto = _reflection.GeneratedProtocolMessageType( + 'DenseNdArrayProto', + (_message.Message,), + { + 'DESCRIPTOR': _DENSENDARRAYPROTO, + '__module__': 'docarray_pb2' + # @@protoc_insertion_point(class_scope:docarray.DenseNdArrayProto) + }, ) - -_DOCUMENTPROTO = _descriptor.Descriptor( - name='DocumentProto', - full_name='docarray.DocumentProto', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='data', full_name='docarray.DocumentProto.data', index=0, - number=1, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[_DOCUMENTPROTO_DATAENTRY, ], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=846, - serialized_end=976, -) - - -_DOCUMENTARRAYPROTO = _descriptor.Descriptor( - name='DocumentArrayProto', - full_name='docarray.DocumentArrayProto', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='docs', full_name='docarray.DocumentArrayProto.docs', index=0, - number=1, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, 
file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=978, - serialized_end=1037, -) - - -_UNIONARRAYPROTO = _descriptor.Descriptor( - name='UnionArrayProto', - full_name='docarray.UnionArrayProto', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='document_array', full_name='docarray.UnionArrayProto.document_array', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='ndarray', full_name='docarray.UnionArrayProto.ndarray', index=1, - number=2, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - _descriptor.OneofDescriptor( - name='content', full_name='docarray.UnionArrayProto.content', - index=0, containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[]), - ], - serialized_start=1040, - serialized_end=1174, -) - - -_DOCUMENTARRAYSTACKEDPROTO_COLUMNSENTRY = _descriptor.Descriptor( - name='ColumnsEntry', - full_name='docarray.DocumentArrayStackedProto.ColumnsEntry', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - 
_descriptor.FieldDescriptor( - name='key', full_name='docarray.DocumentArrayStackedProto.ColumnsEntry.key', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='value', full_name='docarray.DocumentArrayStackedProto.ColumnsEntry.value', index=1, - number=2, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=b'8\001', - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1318, - serialized_end=1391, -) - -_DOCUMENTARRAYSTACKEDPROTO = _descriptor.Descriptor( - name='DocumentArrayStackedProto', - full_name='docarray.DocumentArrayStackedProto', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='list_', full_name='docarray.DocumentArrayStackedProto.list_', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='columns', full_name='docarray.DocumentArrayStackedProto.columns', index=1, - number=2, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, 
extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[_DOCUMENTARRAYSTACKEDPROTO_COLUMNSENTRY, ], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1177, - serialized_end=1391, -) - -_NDARRAYPROTO.fields_by_name['dense'].message_type = _DENSENDARRAYPROTO -_NDARRAYPROTO.fields_by_name['parameters'].message_type = google_dot_protobuf_dot_struct__pb2._STRUCT -_KEYVALUEPAIR.fields_by_name['key'].message_type = google_dot_protobuf_dot_struct__pb2._VALUE -_KEYVALUEPAIR.fields_by_name['value'].message_type = google_dot_protobuf_dot_struct__pb2._VALUE -_GENERICDICTVALUE.fields_by_name['entries'].message_type = _KEYVALUEPAIR -_NODEPROTO.fields_by_name['ndarray'].message_type = _NDARRAYPROTO -_NODEPROTO.fields_by_name['document'].message_type = _DOCUMENTPROTO -_NODEPROTO.fields_by_name['document_array'].message_type = _DOCUMENTARRAYPROTO -_NODEPROTO.fields_by_name['list'].message_type = google_dot_protobuf_dot_struct__pb2._LISTVALUE -_NODEPROTO.fields_by_name['set'].message_type = google_dot_protobuf_dot_struct__pb2._LISTVALUE -_NODEPROTO.fields_by_name['tuple'].message_type = google_dot_protobuf_dot_struct__pb2._LISTVALUE -_NODEPROTO.fields_by_name['dict'].message_type = google_dot_protobuf_dot_struct__pb2._STRUCT -_NODEPROTO.oneofs_by_name['content'].fields.append( - _NODEPROTO.fields_by_name['text']) -_NODEPROTO.fields_by_name['text'].containing_oneof = _NODEPROTO.oneofs_by_name['content'] -_NODEPROTO.oneofs_by_name['content'].fields.append( - _NODEPROTO.fields_by_name['integer']) -_NODEPROTO.fields_by_name['integer'].containing_oneof = _NODEPROTO.oneofs_by_name['content'] -_NODEPROTO.oneofs_by_name['content'].fields.append( - _NODEPROTO.fields_by_name['float']) -_NODEPROTO.fields_by_name['float'].containing_oneof = _NODEPROTO.oneofs_by_name['content'] 
-_NODEPROTO.oneofs_by_name['content'].fields.append( - _NODEPROTO.fields_by_name['boolean']) -_NODEPROTO.fields_by_name['boolean'].containing_oneof = _NODEPROTO.oneofs_by_name['content'] -_NODEPROTO.oneofs_by_name['content'].fields.append( - _NODEPROTO.fields_by_name['blob']) -_NODEPROTO.fields_by_name['blob'].containing_oneof = _NODEPROTO.oneofs_by_name['content'] -_NODEPROTO.oneofs_by_name['content'].fields.append( - _NODEPROTO.fields_by_name['ndarray']) -_NODEPROTO.fields_by_name['ndarray'].containing_oneof = _NODEPROTO.oneofs_by_name['content'] -_NODEPROTO.oneofs_by_name['content'].fields.append( - _NODEPROTO.fields_by_name['document']) -_NODEPROTO.fields_by_name['document'].containing_oneof = _NODEPROTO.oneofs_by_name['content'] -_NODEPROTO.oneofs_by_name['content'].fields.append( - _NODEPROTO.fields_by_name['document_array']) -_NODEPROTO.fields_by_name['document_array'].containing_oneof = _NODEPROTO.oneofs_by_name['content'] -_NODEPROTO.oneofs_by_name['content'].fields.append( - _NODEPROTO.fields_by_name['list']) -_NODEPROTO.fields_by_name['list'].containing_oneof = _NODEPROTO.oneofs_by_name['content'] -_NODEPROTO.oneofs_by_name['content'].fields.append( - _NODEPROTO.fields_by_name['set']) -_NODEPROTO.fields_by_name['set'].containing_oneof = _NODEPROTO.oneofs_by_name['content'] -_NODEPROTO.oneofs_by_name['content'].fields.append( - _NODEPROTO.fields_by_name['tuple']) -_NODEPROTO.fields_by_name['tuple'].containing_oneof = _NODEPROTO.oneofs_by_name['content'] -_NODEPROTO.oneofs_by_name['content'].fields.append( - _NODEPROTO.fields_by_name['dict']) -_NODEPROTO.fields_by_name['dict'].containing_oneof = _NODEPROTO.oneofs_by_name['content'] -_NODEPROTO.oneofs_by_name['docarray_type'].fields.append( - _NODEPROTO.fields_by_name['type']) -_NODEPROTO.fields_by_name['type'].containing_oneof = _NODEPROTO.oneofs_by_name['docarray_type'] -_DOCUMENTPROTO_DATAENTRY.fields_by_name['value'].message_type = _NODEPROTO -_DOCUMENTPROTO_DATAENTRY.containing_type = _DOCUMENTPROTO 
-_DOCUMENTPROTO.fields_by_name['data'].message_type = _DOCUMENTPROTO_DATAENTRY -_DOCUMENTARRAYPROTO.fields_by_name['docs'].message_type = _DOCUMENTPROTO -_UNIONARRAYPROTO.fields_by_name['document_array'].message_type = _DOCUMENTARRAYSTACKEDPROTO -_UNIONARRAYPROTO.fields_by_name['ndarray'].message_type = _NDARRAYPROTO -_UNIONARRAYPROTO.oneofs_by_name['content'].fields.append( - _UNIONARRAYPROTO.fields_by_name['document_array']) -_UNIONARRAYPROTO.fields_by_name['document_array'].containing_oneof = _UNIONARRAYPROTO.oneofs_by_name['content'] -_UNIONARRAYPROTO.oneofs_by_name['content'].fields.append( - _UNIONARRAYPROTO.fields_by_name['ndarray']) -_UNIONARRAYPROTO.fields_by_name['ndarray'].containing_oneof = _UNIONARRAYPROTO.oneofs_by_name['content'] -_DOCUMENTARRAYSTACKEDPROTO_COLUMNSENTRY.fields_by_name['value'].message_type = _UNIONARRAYPROTO -_DOCUMENTARRAYSTACKEDPROTO_COLUMNSENTRY.containing_type = _DOCUMENTARRAYSTACKEDPROTO -_DOCUMENTARRAYSTACKEDPROTO.fields_by_name['list_'].message_type = _DOCUMENTARRAYPROTO -_DOCUMENTARRAYSTACKEDPROTO.fields_by_name['columns'].message_type = _DOCUMENTARRAYSTACKEDPROTO_COLUMNSENTRY -DESCRIPTOR.message_types_by_name['DenseNdArrayProto'] = _DENSENDARRAYPROTO -DESCRIPTOR.message_types_by_name['NdArrayProto'] = _NDARRAYPROTO -DESCRIPTOR.message_types_by_name['KeyValuePair'] = _KEYVALUEPAIR -DESCRIPTOR.message_types_by_name['GenericDictValue'] = _GENERICDICTVALUE -DESCRIPTOR.message_types_by_name['NodeProto'] = _NODEPROTO -DESCRIPTOR.message_types_by_name['DocumentProto'] = _DOCUMENTPROTO -DESCRIPTOR.message_types_by_name['DocumentArrayProto'] = _DOCUMENTARRAYPROTO -DESCRIPTOR.message_types_by_name['UnionArrayProto'] = _UNIONARRAYPROTO -DESCRIPTOR.message_types_by_name['DocumentArrayStackedProto'] = _DOCUMENTARRAYSTACKEDPROTO -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -DenseNdArrayProto = _reflection.GeneratedProtocolMessageType('DenseNdArrayProto', (_message.Message,), { - 'DESCRIPTOR' : _DENSENDARRAYPROTO, - '__module__' : 
'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DenseNdArrayProto) - }) _sym_db.RegisterMessage(DenseNdArrayProto) -NdArrayProto = _reflection.GeneratedProtocolMessageType('NdArrayProto', (_message.Message,), { - 'DESCRIPTOR' : _NDARRAYPROTO, - '__module__' : 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.NdArrayProto) - }) +NdArrayProto = _reflection.GeneratedProtocolMessageType( + 'NdArrayProto', + (_message.Message,), + { + 'DESCRIPTOR': _NDARRAYPROTO, + '__module__': 'docarray_pb2' + # @@protoc_insertion_point(class_scope:docarray.NdArrayProto) + }, +) _sym_db.RegisterMessage(NdArrayProto) -KeyValuePair = _reflection.GeneratedProtocolMessageType('KeyValuePair', (_message.Message,), { - 'DESCRIPTOR' : _KEYVALUEPAIR, - '__module__' : 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.KeyValuePair) - }) +KeyValuePair = _reflection.GeneratedProtocolMessageType( + 'KeyValuePair', + (_message.Message,), + { + 'DESCRIPTOR': _KEYVALUEPAIR, + '__module__': 'docarray_pb2' + # @@protoc_insertion_point(class_scope:docarray.KeyValuePair) + }, +) _sym_db.RegisterMessage(KeyValuePair) -GenericDictValue = _reflection.GeneratedProtocolMessageType('GenericDictValue', (_message.Message,), { - 'DESCRIPTOR' : _GENERICDICTVALUE, - '__module__' : 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.GenericDictValue) - }) +GenericDictValue = _reflection.GeneratedProtocolMessageType( + 'GenericDictValue', + (_message.Message,), + { + 'DESCRIPTOR': _GENERICDICTVALUE, + '__module__': 'docarray_pb2' + # @@protoc_insertion_point(class_scope:docarray.GenericDictValue) + }, +) _sym_db.RegisterMessage(GenericDictValue) -NodeProto = _reflection.GeneratedProtocolMessageType('NodeProto', (_message.Message,), { - 'DESCRIPTOR' : _NODEPROTO, - '__module__' : 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.NodeProto) - }) +NodeProto = _reflection.GeneratedProtocolMessageType( + 'NodeProto', + (_message.Message,), + { + 
'DESCRIPTOR': _NODEPROTO, + '__module__': 'docarray_pb2' + # @@protoc_insertion_point(class_scope:docarray.NodeProto) + }, +) _sym_db.RegisterMessage(NodeProto) -DocumentProto = _reflection.GeneratedProtocolMessageType('DocumentProto', (_message.Message,), { - - 'DataEntry' : _reflection.GeneratedProtocolMessageType('DataEntry', (_message.Message,), { - 'DESCRIPTOR' : _DOCUMENTPROTO_DATAENTRY, - '__module__' : 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentProto.DataEntry) - }) - , - 'DESCRIPTOR' : _DOCUMENTPROTO, - '__module__' : 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentProto) - }) +DocumentProto = _reflection.GeneratedProtocolMessageType( + 'DocumentProto', + (_message.Message,), + { + 'DataEntry': _reflection.GeneratedProtocolMessageType( + 'DataEntry', + (_message.Message,), + { + 'DESCRIPTOR': _DOCUMENTPROTO_DATAENTRY, + '__module__': 'docarray_pb2' + # @@protoc_insertion_point(class_scope:docarray.DocumentProto.DataEntry) + }, + ), + 'DESCRIPTOR': _DOCUMENTPROTO, + '__module__': 'docarray_pb2' + # @@protoc_insertion_point(class_scope:docarray.DocumentProto) + }, +) _sym_db.RegisterMessage(DocumentProto) _sym_db.RegisterMessage(DocumentProto.DataEntry) -DocumentArrayProto = _reflection.GeneratedProtocolMessageType('DocumentArrayProto', (_message.Message,), { - 'DESCRIPTOR' : _DOCUMENTARRAYPROTO, - '__module__' : 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentArrayProto) - }) +DocumentArrayProto = _reflection.GeneratedProtocolMessageType( + 'DocumentArrayProto', + (_message.Message,), + { + 'DESCRIPTOR': _DOCUMENTARRAYPROTO, + '__module__': 'docarray_pb2' + # @@protoc_insertion_point(class_scope:docarray.DocumentArrayProto) + }, +) _sym_db.RegisterMessage(DocumentArrayProto) -UnionArrayProto = _reflection.GeneratedProtocolMessageType('UnionArrayProto', (_message.Message,), { - 'DESCRIPTOR' : _UNIONARRAYPROTO, - '__module__' : 'docarray_pb2' - # 
@@protoc_insertion_point(class_scope:docarray.UnionArrayProto) - }) -_sym_db.RegisterMessage(UnionArrayProto) - -DocumentArrayStackedProto = _reflection.GeneratedProtocolMessageType('DocumentArrayStackedProto', (_message.Message,), { - - 'ColumnsEntry' : _reflection.GeneratedProtocolMessageType('ColumnsEntry', (_message.Message,), { - 'DESCRIPTOR' : _DOCUMENTARRAYSTACKEDPROTO_COLUMNSENTRY, - '__module__' : 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentArrayStackedProto.ColumnsEntry) - }) - , - 'DESCRIPTOR' : _DOCUMENTARRAYSTACKEDPROTO, - '__module__' : 'docarray_pb2' - # @@protoc_insertion_point(class_scope:docarray.DocumentArrayStackedProto) - }) +DocumentArrayStackedProto = _reflection.GeneratedProtocolMessageType( + 'DocumentArrayStackedProto', + (_message.Message,), + { + 'DocColumnsEntry': _reflection.GeneratedProtocolMessageType( + 'DocColumnsEntry', + (_message.Message,), + { + 'DESCRIPTOR': _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY, + '__module__': 'docarray_pb2' + # @@protoc_insertion_point(class_scope:docarray.DocumentArrayStackedProto.DocColumnsEntry) + }, + ), + 'TensorColumnsEntry': _reflection.GeneratedProtocolMessageType( + 'TensorColumnsEntry', + (_message.Message,), + { + 'DESCRIPTOR': _DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY, + '__module__': 'docarray_pb2' + # @@protoc_insertion_point(class_scope:docarray.DocumentArrayStackedProto.TensorColumnsEntry) + }, + ), + 'DESCRIPTOR': _DOCUMENTARRAYSTACKEDPROTO, + '__module__': 'docarray_pb2' + # @@protoc_insertion_point(class_scope:docarray.DocumentArrayStackedProto) + }, +) _sym_db.RegisterMessage(DocumentArrayStackedProto) -_sym_db.RegisterMessage(DocumentArrayStackedProto.ColumnsEntry) - - -_DOCUMENTPROTO_DATAENTRY._options = None -_DOCUMENTARRAYSTACKEDPROTO_COLUMNSENTRY._options = None +_sym_db.RegisterMessage(DocumentArrayStackedProto.DocColumnsEntry) +_sym_db.RegisterMessage(DocumentArrayStackedProto.TensorColumnsEntry) + +if _descriptor._USE_C_DESCRIPTORS == False: 
+ + DESCRIPTOR._options = None + _DOCUMENTPROTO_DATAENTRY._options = None + _DOCUMENTPROTO_DATAENTRY._serialized_options = b'8\001' + _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._options = None + _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_options = b'8\001' + _DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._options = None + _DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_options = b'8\001' + _DENSENDARRAYPROTO._serialized_start = 58 + _DENSENDARRAYPROTO._serialized_end = 123 + _NDARRAYPROTO._serialized_start = 125 + _NDARRAYPROTO._serialized_end = 228 + _KEYVALUEPAIR._serialized_start = 230 + _KEYVALUEPAIR._serialized_end = 320 + _GENERICDICTVALUE._serialized_start = 322 + _GENERICDICTVALUE._serialized_end = 381 + _NODEPROTO._serialized_start = 384 + _NODEPROTO._serialized_end = 843 + _DOCUMENTPROTO._serialized_start = 846 + _DOCUMENTPROTO._serialized_end = 976 + _DOCUMENTPROTO_DATAENTRY._serialized_start = 912 + _DOCUMENTPROTO_DATAENTRY._serialized_end = 976 + _DOCUMENTARRAYPROTO._serialized_start = 978 + _DOCUMENTARRAYPROTO._serialized_end = 1037 + _DOCUMENTARRAYSTACKEDPROTO._serialized_start = 1040 + _DOCUMENTARRAYSTACKEDPROTO._serialized_end = 1432 + _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_start = 1268 + _DOCUMENTARRAYSTACKEDPROTO_DOCCOLUMNSENTRY._serialized_end = 1354 + _DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_start = 1356 + _DOCUMENTARRAYSTACKEDPROTO_TENSORCOLUMNSENTRY._serialized_end = 1432 # @@protoc_insertion_point(module_scope) diff --git a/docarray/typing/tensor/abstract_tensor.py b/docarray/typing/tensor/abstract_tensor.py index 5ae2d0e7b6b..d8a10a0292e 100644 --- a/docarray/typing/tensor/abstract_tensor.py +++ b/docarray/typing/tensor/abstract_tensor.py @@ -235,14 +235,17 @@ def get_comp_backend() -> AbstractComputationalBackend: """The computational backend compatible with this tensor type.""" ... 
- def __getitem__(self, item): + @abc.abstractmethod + def __getitem__(self: T, item) -> T: """Get a slice of this tensor.""" ... + @abc.abstractmethod def __setitem__(self, index, value): """Set a slice of this tensor.""" ... + @abc.abstractmethod def __iter__(self): """Iterate over the elements of this tensor.""" ... diff --git a/docarray/typing/url/data/05978.jpg b/docarray/typing/url/data/05978.jpg new file mode 100644 index 00000000000..3f0bf32e01d Binary files /dev/null and b/docarray/typing/url/data/05978.jpg differ diff --git a/docarray/utils/misc.py b/docarray/utils/misc.py new file mode 100644 index 00000000000..939d2fe08c9 --- /dev/null +++ b/docarray/utils/misc.py @@ -0,0 +1,10 @@ +try: + import torch # noqa: F401 +except ImportError: + torch_imported = False +else: + torch_imported = True + + +def is_torch_available(): + return torch_imported diff --git a/tests/units/array/test_array_stacked.py b/tests/units/array/test_array_stacked.py index b67ea0df9e6..dd9ed921c59 100644 --- a/tests/units/array/test_array_stacked.py +++ b/tests/units/array/test_array_stacked.py @@ -43,7 +43,7 @@ def test_stack_setter(batch): def test_stack_optional(batch): - assert (batch._columns['tensor'] == torch.zeros(10, 3, 224, 224)).all() + assert (batch._tensor_columns['tensor'] == torch.zeros(10, 3, 224, 224)).all() assert (batch.tensor == torch.zeros(10, 3, 224, 224)).all() @@ -57,17 +57,17 @@ class Image(BaseDocument): batch = batch.stack() - assert (batch._columns['tensor'] == np.zeros((10, 3, 224, 224))).all() + assert (batch._tensor_columns['tensor'] == np.zeros((10, 3, 224, 224))).all() assert (batch.tensor == np.zeros((10, 3, 224, 224))).all() - assert batch.tensor.ctypes.data == batch._columns['tensor'].ctypes.data + assert batch.tensor.ctypes.data == batch._tensor_columns['tensor'].ctypes.data batch.unstack() def test_stack(batch): - assert (batch._columns['tensor'] == torch.zeros(10, 3, 224, 224)).all() + assert (batch._tensor_columns['tensor'] == torch.zeros(10, 
3, 224, 224)).all() assert (batch.tensor == torch.zeros(10, 3, 224, 224)).all() - assert batch._columns['tensor'].data_ptr() == batch.tensor.data_ptr() + assert batch._tensor_columns['tensor'].data_ptr() == batch.tensor.data_ptr() for doc, tensor in zip(batch, batch.tensor): assert doc.tensor.data_ptr() == tensor.data_ptr() @@ -90,13 +90,14 @@ class MMdoc(BaseDocument): batch = batch.stack() assert ( - batch._columns['img']._columns['tensor'] == torch.zeros(10, 3, 224, 224) + batch._doc_columns['img']._tensor_columns['tensor'] + == torch.zeros(10, 3, 224, 224) ).all() assert (batch.img.tensor == torch.zeros(10, 3, 224, 224)).all() assert ( - batch._columns['img']._columns['tensor'].data_ptr() + batch._doc_columns['img']._tensor_columns['tensor'].data_ptr() == batch.img.tensor.data_ptr() ) @@ -221,8 +222,8 @@ class Image(BaseDocument): for i in range(len(da)): assert (da[i].tensor == tensor).all() - assert 'tensor' in da._columns.keys() - assert isinstance(da._columns['tensor'], tensor_type) + assert 'tensor' in da._tensor_columns.keys() + assert isinstance(da._tensor_columns['tensor'], tensor_type) def test_any_tensor_with_optional(): @@ -242,8 +243,8 @@ class TopDoc(BaseDocument): for i in range(len(da)): assert (da.img[i].tensor == tensor).all() - assert 'tensor' in da.img._columns.keys() - assert isinstance(da.img._columns['tensor'], TorchTensor) + assert 'tensor' in da.img._tensor_columns.keys() + assert isinstance(da.img._tensor_columns['tensor'], TorchTensor) def test_dict_stack(): @@ -288,7 +289,7 @@ class MyDoc(BaseDocument): [MyDoc(embedding=np.zeros(10)) for _ in range(10)] ).stack() - assert 'embedding' in da._columns.keys() + assert 'embedding' in da._tensor_columns.keys() assert (da.embedding == np.zeros((10, 10))).all() @@ -301,7 +302,7 @@ class MyDoc(BaseDocument): [MyDoc(tensor=None) for _ in range(10)], tensor_type=tensor_backend ).stack() - assert 'tensor' in da._columns.keys() + assert 'tensor' in da._tensor_columns.keys() def test_to_device(): 
diff --git a/tests/units/array/test_indexing.py b/tests/units/array/test_indexing.py new file mode 100644 index 00000000000..ba53a7bbc06 --- /dev/null +++ b/tests/units/array/test_indexing.py @@ -0,0 +1,235 @@ +import numpy as np +import pytest +import torch + +from docarray import DocumentArray +from docarray.documents import Text +from docarray.typing import TorchTensor + + +@pytest.fixture() +def da(): + texts = [f'hello {i}' for i in range(10)] + tensors = [torch.ones((4,)) * i for i in range(10)] + return DocumentArray[Text]( + [Text(text=text, embedding=tens) for text, tens in zip(texts, tensors)], + tensor_type=TorchTensor, + ) + + +@pytest.fixture() +def da_to_set(): + texts = [f'hello {2*i}' for i in range(5)] + tensors = [torch.ones((4,)) * i * 2 for i in range(5)] + return DocumentArray[Text]( + [Text(text=text, embedding=tens) for text, tens in zip(texts, tensors)], + tensor_type=TorchTensor, + ) + + +########### +# getitem +########### + + +@pytest.mark.parametrize('stack', [True, False]) +def test_simple_getitem(stack, da): + if stack: + da = da.stack() + + assert torch.all(da[0].embedding == torch.zeros((4,))) + assert da[0].text == 'hello 0' + + +@pytest.mark.parametrize('stack', [True, False]) +def test_get_none(stack, da): + if stack: + da = da.stack() + + assert da[None] is da + + +@pytest.mark.parametrize('stack', [True, False]) +@pytest.mark.parametrize('index', [(1, 2, 3, 4, 6), [1, 2, 3, 4, 6]]) +def test_iterable_getitem(stack, da, index): + if stack: + da = da.stack() + + indexed_da = da[index] + + for pos, d in zip(index, indexed_da): + assert d.text == f'hello {pos}' + assert torch.all(d.embedding == torch.ones((4,)) * pos) + + +@pytest.mark.parametrize('stack', [True, False]) +@pytest.mark.parametrize('index_dtype', [torch.int64]) +def test_torchtensor_getitem(stack, da, index_dtype): + if stack: + da = da.stack() + + index = torch.tensor([1, 2, 3, 4, 6], dtype=index_dtype) + + indexed_da = da[index] + + for pos, d in zip(index, 
indexed_da): + assert d.text == f'hello {pos}' + assert torch.all(d.embedding == torch.ones((4,)) * pos) + + +@pytest.mark.parametrize('stack', [True, False]) +@pytest.mark.parametrize('index_dtype', [np.int, np.int_, np.int32, np.int64]) +def test_nparray_getitem(stack, da, index_dtype): + if stack: + da = da.stack() + + index = np.array([1, 2, 3, 4, 6], dtype=index_dtype) + + indexed_da = da[index] + for pos, d in zip(index, indexed_da): + assert d.text == f'hello {pos}' + assert torch.all(d.embedding == torch.ones((4,)) * pos) + + +@pytest.mark.parametrize('stack', [True, False]) +@pytest.mark.parametrize( + 'index', + [ + [False, True, True, True, True, False, True, False, False, False], + (False, True, True, True, True, False, True, False, False, False), + torch.tensor([0, 1, 1, 1, 1, 0, 1, 0, 0, 0], dtype=torch.bool), + np.array([0, 1, 1, 1, 1, 0, 1, 0, 0, 0], dtype=np.bool), + ], +) +def test_boolmask_getitem(stack, da, index): + if stack: + da = da.stack() + + indexed_da = da[index] + + mask_true_idx = [1, 2, 3, 4, 6] + + for pos, d in zip(mask_true_idx, indexed_da): + assert d.text == f'hello {pos}' + assert torch.all(d.embedding == torch.ones((4,)) * pos) + + +########### +# setitem +########### + + +@pytest.mark.parametrize('stack_left', [True, False]) +def test_simple_setitem(stack_left, da, da_to_set): + if stack_left: + da = da.stack() + + da[0] = da_to_set[0] + + assert torch.all(da[0].embedding == da_to_set[0].embedding) + assert da[0].text == da_to_set[0].text + + +@pytest.mark.parametrize('stack_left', [True, False]) +@pytest.mark.parametrize('stack_right', [True, False]) +@pytest.mark.parametrize('index', [(1, 2, 3, 4, 6), [1, 2, 3, 4, 6]]) +def test_iterable_setitem(stack_left, stack_right, da, da_to_set, index): + if stack_left: + da = da.stack() + if stack_right: + da_to_set = da_to_set.stack() + + da[index] = da_to_set + + i_da_to_set = 0 + for i, d in enumerate(da): + if i in index: + d_reference = da_to_set[i_da_to_set] + assert d.text == 
d_reference.text + assert torch.all(d.embedding == d_reference.embedding) + i_da_to_set += 1 + else: + assert d.text == f'hello {i}' + assert torch.all(d.embedding == torch.ones((4,)) * i) + + +@pytest.mark.parametrize('stack_left', [True, False]) +@pytest.mark.parametrize('stack_right', [True, False]) +@pytest.mark.parametrize('index_dtype', [torch.int64]) +def test_torchtensor_setitem(stack_left, stack_right, da, da_to_set, index_dtype): + if stack_left: + da = da.stack() + if stack_right: + da_to_set = da_to_set.stack() + + index = torch.tensor([1, 2, 3, 4, 6], dtype=index_dtype) + + da[index] = da_to_set + + i_da_to_set = 0 + for i, d in enumerate(da): + if i in index: + d_reference = da_to_set[i_da_to_set] + assert d.text == d_reference.text + assert torch.all(d.embedding == d_reference.embedding) + i_da_to_set += 1 + else: + assert d.text == f'hello {i}' + assert torch.all(d.embedding == torch.ones((4,)) * i) + + +@pytest.mark.parametrize('stack_left', [True, False]) +@pytest.mark.parametrize('stack_right', [True, False]) +@pytest.mark.parametrize('index_dtype', [np.int, np.int_, np.int32, np.int64]) +def test_nparray_setitem(stack_left, stack_right, da, da_to_set, index_dtype): + if stack_left: + da = da.stack() + if stack_right: + da_to_set = da_to_set.stack() + + index = np.array([1, 2, 3, 4, 6], dtype=index_dtype) + + da[index] = da_to_set + + i_da_to_set = 0 + for i, d in enumerate(da): + if i in index: + d_reference = da_to_set[i_da_to_set] + assert d.text == d_reference.text + assert torch.all(d.embedding == d_reference.embedding) + i_da_to_set += 1 + else: + assert d.text == f'hello {i}' + assert torch.all(d.embedding == torch.ones((4,)) * i) + + +@pytest.mark.parametrize('stack_left', [True, False]) +@pytest.mark.parametrize('stack_right', [True, False]) +@pytest.mark.parametrize( + 'index', + [ + [False, True, True, True, True, False, True, False, False, False], + (False, True, True, True, True, False, True, False, False, False), + torch.tensor([0, 
1, 1, 1, 1, 0, 1, 0, 0, 0], dtype=torch.bool), + np.array([0, 1, 1, 1, 1, 0, 1, 0, 0, 0], dtype=np.bool), + ], +) +def test_boolmask_setitem(stack_left, stack_right, da, da_to_set, index): + if stack_left: + da = da.stack() + if stack_right: + da_to_set = da_to_set.stack() + + da[index] = da_to_set + + mask_true_idx = [1, 2, 3, 4, 6] + i_da_to_set = 0 + for i, d in enumerate(da): + if i in mask_true_idx: + d_reference = da_to_set[i_da_to_set] + assert d.text == d_reference.text + assert torch.all(d.embedding == d_reference.embedding) + i_da_to_set += 1 + else: + assert d.text == f'hello {i}' + assert torch.all(d.embedding == torch.ones((4,)) * i)