docarray · JohannesMessner · Feb 7, 2023 · Feb 2, 2023 · Feb 2, 2023 · Feb 2, 2023
diff --git a/docarray/array/array.py b/docarray/array/array.py
@@ -7,16 +7,22 @@
     Generic,
     Iterable,
     List,
+    Optional,
+    Sequence,
     Type,
     TypeVar,
     Union,
+    cast,
+    overload,
 )
 
+import numpy as np
 from typing_inspect import is_union_type
 
 from docarray.array.abstract_array import AnyDocumentArray
 from docarray.base_document import AnyDocument, BaseDocument
 from docarray.typing import NdArray
+from docarray.utils.misc import is_torch_available
 
 if TYPE_CHECKING:
     from pydantic import BaseConfig
@@ -30,6 +36,7 @@
 
 T = TypeVar('T', bound='DocumentArray')
 T_doc = TypeVar('T_doc', bound=BaseDocument)
+IndexIterType = Union[slice, Iterable[int], Iterable[bool], None]
 
 
 def _delegate_meth_to_data(meth_name: str) -> Callable:
@@ -49,6 +56,17 @@ def _delegate_meth(self, *args, **kwargs):
     return _delegate_meth
 
 
+def _is_np_int(item: Any) -> bool:
+    dtype = getattr(item, 'dtype', None)
+    ndim = getattr(item, 'ndim', None)
+    if dtype is not None and ndim is not None:
+        try:
+            return ndim == 0 and np.issubdtype(dtype, np.integer)
+        except TypeError:
+            return False
+    return False  # this is unreachable, but mypy wants it
+
+
 class DocumentArray(AnyDocumentArray, Generic[T_doc]):
     """
      DocumentArray is a container of Documents.
@@ -66,6 +84,7 @@ class DocumentArray(AnyDocumentArray, Generic[T_doc]):
     .. code-block:: python
         from docarray import BaseDocument, DocumentArray
         from docarray.typing import NdArray, ImageUrl
+        from typing import Optional
 
 
         class Image(BaseDocument):
@@ -79,32 +98,169 @@ class Image(BaseDocument):
 
 
     If your DocumentArray is homogeneous (i.e. follows the same schema), you can access
-    fields at the DocumentArray level (for example `da.tensor`). You can also set
-    fields, with `da.tensor = np.random.random([10, 100])`
+    fields at the DocumentArray level (for example `da.tensor` or `da.url`).
+    You can also set fields, with `da.tensor = np.random.random([10, 100])`:
+
+
+    .. code-block:: python
+        print(da.url)
+        # [ImageUrl('http://url.com/foo.png', host_type='domain'), ...]
+        import numpy as np
+
+        da.tensor = np.random.random([10, 100])
+        print(da.tensor)
+        # [NdArray([0.11299577, 0.47206767, 0.481723  , 0.34754724, 0.15016037,
+        #          0.88861321, 0.88317666, 0.93845579, 0.60486676, ... ]), ...]
+
+
+    You can index into a DocumentArray like a numpy array or torch tensor:
+
+
+    .. code-block:: python
+        da[0]  # index by position
+        da[0:5:2]  # index by slice
+        da[[0, 2, 3]]  # index by list of indices
+        da[True, False, True, True, ...]  # index by boolean mask
+
+
     """
 
     document_type: Type[BaseDocument] = AnyDocument
 
     def __init__(
         self,
-        docs: Iterable[BaseDocument] = list(),
+        docs: Optional[Iterable[BaseDocument]] = None,
         tensor_type: Type['AbstractTensor'] = NdArray,
     ):
-        self._data = [doc_ for doc_ in docs]
+        self._data = list(docs) if docs is not None else []
         self.tensor_type = tensor_type
 
     def __len__(self):
         return len(self._data)
 
+    @overload
+    def __getitem__(self: T, item: int) -> BaseDocument:
+        ...
+
+    @overload
+    def __getitem__(self: T, item: IndexIterType) -> T:
+        ...
+    
     def __getitem__(self, item):
+        item = self._normalize_index_item(item)
+
         if type(item) == slice:
             return self.__class__(self._data[item])
-        else:
+
+        if isinstance(item, int):
             return self._data[item]
 
+        if item is None:
+            return self
+
+        # _normalize_index_item() guarantees the line below is correct
+        head = item[0]  # type: ignore
+        if isinstance(head, bool):
+            return self._get_from_mask(item)
+        elif isinstance(head, int):
+            return self._get_from_indices(item)
+        else:
+            raise TypeError(f'Invalid type {type(head)} for indexing')
+
+    def __setitem__(self: T, key: IndexIterType, value: Union[T, BaseDocument]):
+        key_norm = self._normalize_index_item(key)
+
+        if isinstance(key_norm, int):
+            value_int = cast(BaseDocument, value)
+            self._data[key_norm] = value_int
+        elif isinstance(key_norm, slice):
+            value_slice = cast(T, value)
+            self._data[key_norm] = value_slice
+        else:
+            # _normalize_index_item() guarantees the line below is correct
+            head = key_norm[0]  # type: ignore
+            if isinstance(head, bool):
+                key_norm_ = cast(Iterable[bool], key_norm)
+                value_ = cast(Sequence[BaseDocument], value)  # this is no strictly true
+                # set_by_mask requires value_ to have getitem which
+                # _normalize_index_item() ensures
+                return self._set_by_mask(key_norm_, value_)
+            elif isinstance(head, int):
+                key_norm__ = cast(Iterable[int], key_norm)
+                return self._set_by_indices(key_norm__, value)
+            else:
+                raise TypeError(f'Invalid type {type(head)} for indexing')
+
     def __iter__(self):
         return iter(self._data)
 
+    @staticmethod
+    def _normalize_index_item(
+        item: Any,
+    ) -> Union[int, slice, Iterable[int], Iterable[bool], None]:
+        # basic index types
+        if item is None or isinstance(item, (int, slice, tuple, list)):
+            return item
+
+        # numpy index types
+        if _is_np_int(item):
+            return item.item()
+
+        index_has_getitem = hasattr(item, '__getitem__')
+        is_valid_bulk_index = index_has_getitem and isinstance(item, Iterable)
+        if not is_valid_bulk_index:
+            raise ValueError(f'Invalid index type {type(item)}')
+
+        if isinstance(item, np.ndarray) and (
+            item.dtype == np.bool_ or np.issubdtype(item.dtype, np.integer)
+        ):
+            return item.tolist()
+
+        # torch index types
+        torch_available = is_torch_available()
+        if torch_available:
+            import torch
+        else:
+            raise ValueError(f'Invalid index type {type(item)}')
+        allowed_torch_dtypes = [
+            torch.bool,
+            torch.int64,
+        ]
+        if isinstance(item, torch.Tensor) and (item.dtype in allowed_torch_dtypes):
+            return item.tolist()
+
+        return item
+
+    def _get_from_indices(self: T, item: Iterable[int]) -> T:
+        results = []
+        for ix in item:
+            results.append(self._data[ix])
+        return self.__class__(results)
+
+    def _set_by_indices(self: T, item: Iterable[int], value: Iterable[BaseDocument]):
+        # here we cannot use _get_offset_to_doc() because we need to change the doc
+        # that a given offset points to, not just retrieve it.
+        # Future optimization idea: _data could be List[DocContainer], where
+        # DocContainer points to the doc. Then we could use _get_offset_to_container()
+        # to swap the doc in the container.
+        for ix, doc_to_set in zip(item, value):
+            try:
+                self._data[ix] = doc_to_set
+            except KeyError:
+                raise IndexError(f'Index {ix} is out of range')
+
+    def _get_from_mask(self: T, item: Iterable[bool]) -> T:
+        return self.__class__(
+            (doc for doc, mask_value in zip(self, item) if mask_value)
+        )
+
+    def _set_by_mask(self: T, item: Iterable[bool], value: Sequence[BaseDocument]):
+        i_value = 0
+        for i, mask_value in zip(range(len(self)), item):
+            if mask_value:
+                self._data[i] = value[i_value]
+                i_value += 1
+
     append = _delegate_meth_to_data('append')
     extend = _delegate_meth_to_data('extend')
     insert = _delegate_meth_to_data('insert')