From a45226893de839b97a7d6ba1210f1a313a2bc491 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 3 Jan 2023 12:47:36 +0100 Subject: [PATCH 01/26] feat: add video url and tensors to proto Signed-off-by: anna-charlotte --- docarray/proto/docarray.proto | 6 ++++++ docarray/proto/pb2/docarray_pb2.py | 28 ++++++++++++++-------------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/docarray/proto/docarray.proto b/docarray/proto/docarray.proto index 0646453294e..39f8354b223 100644 --- a/docarray/proto/docarray.proto +++ b/docarray/proto/docarray.proto @@ -69,6 +69,12 @@ message NodeProto { NdArrayProto audio_torch_tensor = 16; + string video_url = 17; + + NdArrayProto video_ndarray = 18; + + NdArrayProto video_torch_tensor = 19; + } } diff --git a/docarray/proto/pb2/docarray_pb2.py b/docarray/proto/pb2/docarray_pb2.py index 1d5fb2d954b..da5d3df5a46 100644 --- a/docarray/proto/pb2/docarray_pb2.py +++ b/docarray/proto/pb2/docarray_pb2.py @@ -15,7 +15,7 @@ from google.protobuf import struct_pb2 as google_dot_protobuf_dot_struct__pb2 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"\x8e\x04\n\tNodeProto\x12\x0e\n\x04\x62lob\x18\x01 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12\x0e\n\x04text\x18\x03 \x01(\tH\x00\x12)\n\x06nested\x18\x04 \x01(\x0b\x32\x17.docarray.DocumentProtoH\x00\x12.\n\x06\x63hunks\x18\x05 \x01(\x0b\x32\x1c.docarray.DocumentArrayProtoH\x00\x12+\n\tembedding\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12\x11\n\x07\x61ny_url\x18\x07 \x01(\tH\x00\x12\x13\n\timage_url\x18\x08 
\x01(\tH\x00\x12\x12\n\x08text_url\x18\t \x01(\tH\x00\x12\x0c\n\x02id\x18\n \x01(\tH\x00\x12.\n\x0ctorch_tensor\x18\x0b \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12\x12\n\x08mesh_url\x18\x0c \x01(\tH\x00\x12\x19\n\x0fpoint_cloud_url\x18\r \x01(\tH\x00\x12\x13\n\taudio_url\x18\x0e \x01(\tH\x00\x12/\n\raudio_ndarray\x18\x0f \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12\x34\n\x12\x61udio_torch_tensor\x18\x10 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x42\t\n\x07\x63ontent\"\x82\x01\n\rDocumentProto\x12/\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32!.docarray.DocumentProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\";\n\x12\x44ocumentArrayProto\x12%\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x17.docarray.DocumentProto\"\x86\x01\n\x0fUnionArrayProto\x12=\n\x0e\x64ocument_array\x18\x01 \x01(\x0b\x32#.docarray.DocumentArrayStackedProtoH\x00\x12)\n\x07ndarray\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x42\t\n\x07\x63ontent\"\xd6\x01\n\x19\x44ocumentArrayStackedProto\x12+\n\x05list_\x18\x01 \x01(\x0b\x32\x1c.docarray.DocumentArrayProto\x12\x41\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x30.docarray.DocumentArrayStackedProto.ColumnsEntry\x1aI\n\x0c\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12(\n\x05value\x18\x02 \x01(\x0b\x32\x19.docarray.UnionArrayProto:\x02\x38\x01\x62\x06proto3' + b'\n\x0e\x64ocarray.proto\x12\x08\x64ocarray\x1a\x1cgoogle/protobuf/struct.proto\"A\n\x11\x44\x65nseNdArrayProto\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\r\x12\r\n\x05\x64type\x18\x03 \x01(\t\"g\n\x0cNdArrayProto\x12*\n\x05\x64\x65nse\x18\x01 \x01(\x0b\x32\x1b.docarray.DenseNdArrayProto\x12+\n\nparameters\x18\x02 \x01(\x0b\x32\x17.google.protobuf.Struct\"\x8a\x05\n\tNodeProto\x12\x0e\n\x04\x62lob\x18\x01 \x01(\x0cH\x00\x12)\n\x07ndarray\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12\x0e\n\x04text\x18\x03 \x01(\tH\x00\x12)\n\x06nested\x18\x04 
\x01(\x0b\x32\x17.docarray.DocumentProtoH\x00\x12.\n\x06\x63hunks\x18\x05 \x01(\x0b\x32\x1c.docarray.DocumentArrayProtoH\x00\x12+\n\tembedding\x18\x06 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12\x11\n\x07\x61ny_url\x18\x07 \x01(\tH\x00\x12\x13\n\timage_url\x18\x08 \x01(\tH\x00\x12\x12\n\x08text_url\x18\t \x01(\tH\x00\x12\x0c\n\x02id\x18\n \x01(\tH\x00\x12.\n\x0ctorch_tensor\x18\x0b \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12\x12\n\x08mesh_url\x18\x0c \x01(\tH\x00\x12\x19\n\x0fpoint_cloud_url\x18\r \x01(\tH\x00\x12\x13\n\taudio_url\x18\x0e \x01(\tH\x00\x12/\n\raudio_ndarray\x18\x0f \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12\x34\n\x12\x61udio_torch_tensor\x18\x10 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12\x13\n\tvideo_url\x18\x11 \x01(\tH\x00\x12/\n\rvideo_ndarray\x18\x12 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x12\x34\n\x12video_torch_tensor\x18\x13 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x42\t\n\x07\x63ontent\"\x82\x01\n\rDocumentProto\x12/\n\x04\x64\x61ta\x18\x01 \x03(\x0b\x32!.docarray.DocumentProto.DataEntry\x1a@\n\tDataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\"\n\x05value\x18\x02 \x01(\x0b\x32\x13.docarray.NodeProto:\x02\x38\x01\";\n\x12\x44ocumentArrayProto\x12%\n\x04\x64ocs\x18\x01 \x03(\x0b\x32\x17.docarray.DocumentProto\"\x86\x01\n\x0fUnionArrayProto\x12=\n\x0e\x64ocument_array\x18\x01 \x01(\x0b\x32#.docarray.DocumentArrayStackedProtoH\x00\x12)\n\x07ndarray\x18\x02 \x01(\x0b\x32\x16.docarray.NdArrayProtoH\x00\x42\t\n\x07\x63ontent\"\xd6\x01\n\x19\x44ocumentArrayStackedProto\x12+\n\x05list_\x18\x01 \x01(\x0b\x32\x1c.docarray.DocumentArrayProto\x12\x41\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x30.docarray.DocumentArrayStackedProto.ColumnsEntry\x1aI\n\x0c\x43olumnsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12(\n\x05value\x18\x02 \x01(\x0b\x32\x19.docarray.UnionArrayProto:\x02\x38\x01\x62\x06proto3' ) _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) @@ -32,17 +32,17 @@ _NDARRAYPROTO._serialized_start = 125 
_NDARRAYPROTO._serialized_end = 228 _NODEPROTO._serialized_start = 231 - _NODEPROTO._serialized_end = 757 - _DOCUMENTPROTO._serialized_start = 760 - _DOCUMENTPROTO._serialized_end = 890 - _DOCUMENTPROTO_DATAENTRY._serialized_start = 826 - _DOCUMENTPROTO_DATAENTRY._serialized_end = 890 - _DOCUMENTARRAYPROTO._serialized_start = 892 - _DOCUMENTARRAYPROTO._serialized_end = 951 - _UNIONARRAYPROTO._serialized_start = 954 - _UNIONARRAYPROTO._serialized_end = 1088 - _DOCUMENTARRAYSTACKEDPROTO._serialized_start = 1091 - _DOCUMENTARRAYSTACKEDPROTO._serialized_end = 1305 - _DOCUMENTARRAYSTACKEDPROTO_COLUMNSENTRY._serialized_start = 1232 - _DOCUMENTARRAYSTACKEDPROTO_COLUMNSENTRY._serialized_end = 1305 + _NODEPROTO._serialized_end = 881 + _DOCUMENTPROTO._serialized_start = 884 + _DOCUMENTPROTO._serialized_end = 1014 + _DOCUMENTPROTO_DATAENTRY._serialized_start = 950 + _DOCUMENTPROTO_DATAENTRY._serialized_end = 1014 + _DOCUMENTARRAYPROTO._serialized_start = 1016 + _DOCUMENTARRAYPROTO._serialized_end = 1075 + _UNIONARRAYPROTO._serialized_start = 1078 + _UNIONARRAYPROTO._serialized_end = 1212 + _DOCUMENTARRAYSTACKEDPROTO._serialized_start = 1215 + _DOCUMENTARRAYSTACKEDPROTO._serialized_end = 1429 + _DOCUMENTARRAYSTACKEDPROTO_COLUMNSENTRY._serialized_start = 1356 + _DOCUMENTARRAYSTACKEDPROTO_COLUMNSENTRY._serialized_end = 1429 # @@protoc_insertion_point(module_scope) From 3ccb697212346ba4dab387e06292c6905151b8f1 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 3 Jan 2023 13:12:28 +0100 Subject: [PATCH 02/26] feat: add video url and video ndarray Signed-off-by: anna-charlotte --- docarray/typing/__init__.py | 11 +- docarray/typing/tensor/video/video_ndarray.py | 59 ++++++++++ docarray/typing/url/__init__.py | 11 +- docarray/typing/url/video_url.py | 92 +++++++++++++++ tests/units/typing/url/test_video_url.py | 108 ++++++++++++++++++ 5 files changed, 278 insertions(+), 3 deletions(-) create mode 100644 docarray/typing/tensor/video/video_ndarray.py create mode 100644 
docarray/typing/url/video_url.py create mode 100644 tests/units/typing/url/test_video_url.py diff --git a/docarray/typing/__init__.py b/docarray/typing/__init__.py index 4cedcc689fc..1f16a4ec3ee 100644 --- a/docarray/typing/__init__.py +++ b/docarray/typing/__init__.py @@ -3,6 +3,7 @@ from docarray.typing.tensor.embedding.embedding import Embedding from docarray.typing.tensor.ndarray import NdArray from docarray.typing.tensor.tensor import AnyTensor +from docarray.typing.tensor.video import VideoNdArray from docarray.typing.url import ( AnyUrl, AudioUrl, @@ -10,17 +11,20 @@ Mesh3DUrl, PointCloud3DUrl, TextUrl, + VideoUrl, ) __all__ = [ - 'AudioNdArray', 'NdArray', + 'AudioNdArray', + 'VideoNdArray', 'Embedding', 'ImageUrl', 'AudioUrl', 'TextUrl', 'Mesh3DUrl', 'PointCloud3DUrl', + 'VideoUrl', 'AnyUrl', 'ID', 'AnyTensor', @@ -33,5 +37,8 @@ else: from docarray.typing.tensor import TorchEmbedding, TorchTensor # noqa: F401 from docarray.typing.tensor.audio.audio_torch_tensor import AudioTorchTensor # noqa + from docarray.typing.tensor.video.video_torch_tensor import VideoTorchTensor # noqa - __all__.extend(['AudioTorchTensor', 'TorchEmbedding', 'TorchTensor']) + __all__.extend( + ['AudioTorchTensor', 'TorchEmbedding', 'TorchTensor', 'VideoTorchTensor'] + ) diff --git a/docarray/typing/tensor/video/video_ndarray.py b/docarray/typing/tensor/video/video_ndarray.py new file mode 100644 index 00000000000..f44b332770b --- /dev/null +++ b/docarray/typing/tensor/video/video_ndarray.py @@ -0,0 +1,59 @@ +from typing import TypeVar + +import numpy as np + +from docarray.typing.tensor.ndarray import NdArray +from docarray.typing.tensor.video.abstract_video_tensor import AbstractVideoTensor + +T = TypeVar('T', bound='VideoNdArray') + + +class VideoNdArray(AbstractVideoTensor, NdArray): + """ + Subclass of NdArray, to represent a video tensor. + + Additionally, this allows storing such a tensor as a .wav audio file. + + EXAMPLE USAGE + + .. 
code-block:: python + + from typing import Optional + from pydantic import parse_obj_as + from docarray import Document + from docarray.typing import AudioNdArray, AudioUrl + import numpy as np + + + class MyAudioDoc(Document): + title: str + audio_tensor: Optional[AudioNdArray] + url: Optional[AudioUrl] + + + # from tensor + doc_1 = MyAudioDoc( + title='my_first_audio_doc', + audio_tensor=np.random.rand(1000, 2), + ) + doc_1.audio_tensor.save_to_wav_file(file_path='path/to/file_1.wav') + # from url + doc_2 = MyAudioDoc( + title='my_second_audio_doc', + url='https://github.com/docarray/docarray/tree/feat-add-audio-v2/tests/toydata/hello.wav', + ) + doc_2.audio_tensor = parse_obj_as(AudioNdArray, doc_2.url.load()) + doc_2.audio_tensor.save_to_wav_file(file_path='path/to/file_2.wav') + """ + + _PROTO_FIELD_NAME = 'video_ndarray' + + def check_shape(self) -> None: + if self.ndim != 4 or self.shape[-1] != 3 or self.dtype != np.uint8: + raise ValueError( + f'expects `` with dtype=uint8 and ndim=4 and the last dimension is 3, ' + f'but receiving {self.shape} in {self.dtype}' + ) + + def to_numpy(self) -> np.ndarray: + return self diff --git a/docarray/typing/url/__init__.py b/docarray/typing/url/__init__.py index 29efa353c16..b1a4416744d 100644 --- a/docarray/typing/url/__init__.py +++ b/docarray/typing/url/__init__.py @@ -4,5 +4,14 @@ from docarray.typing.url.text_url import TextUrl from docarray.typing.url.url_3d.mesh_url import Mesh3DUrl from docarray.typing.url.url_3d.point_cloud_url import PointCloud3DUrl +from docarray.typing.url.video_url import VideoUrl -__all__ = ['ImageUrl', 'AudioUrl', 'AnyUrl', 'TextUrl', 'Mesh3DUrl', 'PointCloud3DUrl'] +__all__ = [ + 'ImageUrl', + 'AudioUrl', + 'AnyUrl', + 'TextUrl', + 'Mesh3DUrl', + 'PointCloud3DUrl', + 'VideoUrl', +] diff --git a/docarray/typing/url/video_url.py b/docarray/typing/url/video_url.py new file mode 100644 index 00000000000..bcfdec16191 --- /dev/null +++ b/docarray/typing/url/video_url.py @@ -0,0 +1,92 @@ +from 
typing import TYPE_CHECKING, Any, Tuple, Type, TypeVar, Union + +import numpy as np +from pydantic.tools import parse_obj_as + +from docarray.typing.tensor.video import VideoNdArray +from docarray.typing.url.any_url import AnyUrl + +if TYPE_CHECKING: + from pydantic import BaseConfig + from pydantic.fields import ModelField + + from docarray.proto import NodeProto + +T = TypeVar('T', bound='VideoUrl') + +VIDEO_FILE_FORMATS = ['mp4'] + + +class VideoUrl(AnyUrl): + """ + URL to a .wav file. + Can be remote (web) URL, or a local file path. + """ + + def _to_node_protobuf(self: T) -> 'NodeProto': + """Convert Document into a NodeProto protobuf message. This function should + be called when the Document is nested into another Document that needs to + be converted into a protobuf + :return: the nested item protobuf message + """ + from docarray.proto import NodeProto + + return NodeProto(video_url=str(self)) + + @classmethod + def validate( + cls: Type[T], + value: Union[T, np.ndarray, Any], + field: 'ModelField', + config: 'BaseConfig', + ) -> T: + url = super().validate(value, field, config) + has_video_extension = any(ext in url for ext in VIDEO_FILE_FORMATS) + if not has_video_extension: + raise ValueError( + f'Video URL must have one of the following extensions:' + f'{VIDEO_FILE_FORMATS}' + ) + return cls(str(url), scheme=None) + + def load( + self: T, only_keyframes: bool = False, **kwargs + ) -> Union[VideoNdArray, Tuple[VideoNdArray, VideoNdArray]]: + """ + Load the data from the url into a numpy.ndarray. + + + + :param only_keyframes: if True keep only the keyframes, if False keep all frames + and store the indices of the keyframes in :attr:`.tags` + :param kwargs: supports all keyword arguments that are being supported by + av.open() as described in: + https://pyav.org/docs/stable/api/_globals.html?highlight=open#av.open + :return: np.ndarray representing the audio file content, list of key frame + indices if only_keyframe False. 
+ """ + import av + + with av.open(self, **kwargs) as container: + if only_keyframes: + stream = container.streams.video[0] + stream.codec_context.skip_frame = 'NONKEY' + + frames = [] + keyframe_indices = [] + for i, frame in enumerate(container.decode(video=0)): + + img = frame.to_image() + frames.append(img) + if not only_keyframes and frame.key_frame == 1: + keyframe_indices.append(i) + + frames = parse_obj_as(VideoNdArray, np.moveaxis(np.stack(frames), 1, 2)) + + if only_keyframes: + return frames + else: + indices = parse_obj_as( + VideoNdArray, np.ndarray(keyframe_indices, dtype=np.int32) + ) + return frames, indices diff --git a/tests/units/typing/url/test_video_url.py b/tests/units/typing/url/test_video_url.py new file mode 100644 index 00000000000..39ad487e8fc --- /dev/null +++ b/tests/units/typing/url/test_video_url.py @@ -0,0 +1,108 @@ +from typing import Optional + +import numpy as np +import pytest +from pydantic.tools import parse_obj_as, schema_json_of + +from docarray import BaseDocument +from docarray.document.io.json import orjson_dumps +from docarray.typing import VideoNdArray, VideoTorchTensor, VideoUrl +from tests import TOYDATA_DIR + +LOCAL_VIDEO_FILE = str(TOYDATA_DIR / 'mov_bbb.mp4') +REMOTE_VIDEO_FILE = 'https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/mov_bbb.mp4?raw=true' # noqa: E501 + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_url', + [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE], +) +def test_load_with_only_keyframes_false(file_url): + url = parse_obj_as(VideoUrl, file_url) + tensor, indices = url.load(only_keyframes=False) + + assert isinstance(tensor, np.ndarray) + assert isinstance(tensor, VideoNdArray) + + assert isinstance(indices, np.ndarray) + assert isinstance(indices, VideoNdArray) + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_url', + [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE], +) +def test_load_with_only_keyframes_true(file_url): + url = 
parse_obj_as(VideoUrl, file_url) + tensor = url.load(only_keyframes=True) + + assert isinstance(tensor, np.ndarray) + assert isinstance(tensor, VideoNdArray) + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_url', + [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE], +) +def test_load_video_url_to_video_torch_tensor_field(file_url): + class MyVideoDoc(BaseDocument): + video_url: VideoUrl + tensor: Optional[VideoTorchTensor] + + doc = MyVideoDoc(video_url=file_url) + doc.tensor = doc.video_url.load(only_keyframes=True) + + assert isinstance(doc.tensor, np.ndarray) + assert isinstance(doc.tensor, VideoNdArray) + + +def test_json_schema(): + schema_json_of(VideoUrl) + + +def test_dump_json(): + url = parse_obj_as(VideoUrl, REMOTE_VIDEO_FILE) + orjson_dumps(url) + + +@pytest.mark.parametrize( + 'path_to_file', + [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE], +) +def test_validation(path_to_file): + url = parse_obj_as(VideoUrl, path_to_file) + assert isinstance(url, VideoUrl) + assert isinstance(url, str) + + +@pytest.mark.parametrize( + 'path_to_file', + [ + 'illegal', + 'https://www.google.com', + 'my/local/text/file.txt', + 'my/local/text/file.png', + 'my/local/file.mp3', + ], +) +def test_illegal_validation(path_to_file): + with pytest.raises(ValueError, match='VideoUrl'): + parse_obj_as(VideoUrl, path_to_file) + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize( + 'file_url', + [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE], +) +def test_proto_video_url(file_url): + uri = parse_obj_as(VideoUrl, file_url) + proto = uri._to_node_protobuf() + assert str(proto).startswith('video_url') From dc957d19bc20bf7072f0dda8591b5de1d668406e Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 4 Jan 2023 10:27:31 +0100 Subject: [PATCH 03/26] feat: add video torch tensor and tests Signed-off-by: anna-charlotte --- docarray/__init__.py | 3 +- docarray/predefined_document/__init__.py | 3 +- docarray/predefined_document/video.py | 31 +++++++ 
docarray/typing/tensor/video/__init__.py | 12 +++ .../tensor/video/abstract_video_tensor.py | 65 ++++++++++++++ docarray/typing/tensor/video/video_ndarray.py | 54 ++++-------- docarray/typing/tensor/video/video_tensor.py | 13 +++ .../typing/tensor/video/video_torch_tensor.py | 43 +++++++++ docarray/typing/url/video_url.py | 13 ++- .../predefined_document/test_video.py | 43 +++++++++ .../units/typing/tensor/test_video_tensor.py | 87 +++++++++++++++++++ 11 files changed, 323 insertions(+), 44 deletions(-) create mode 100644 docarray/predefined_document/video.py create mode 100644 docarray/typing/tensor/video/__init__.py create mode 100644 docarray/typing/tensor/video/abstract_video_tensor.py create mode 100644 docarray/typing/tensor/video/video_tensor.py create mode 100644 docarray/typing/tensor/video/video_torch_tensor.py create mode 100644 tests/integrations/predefined_document/test_video.py create mode 100644 tests/units/typing/tensor/test_video_tensor.py diff --git a/docarray/__init__.py b/docarray/__init__.py index f5a2e8f7893..f54c3ad460d 100644 --- a/docarray/__init__.py +++ b/docarray/__init__.py @@ -2,7 +2,7 @@ from docarray.array.array import DocumentArray from docarray.document.document import BaseDocument -from docarray.predefined_document import Audio, Image, Mesh3D, PointCloud3D, Text +from docarray.predefined_document import Audio, Image, Mesh3D, PointCloud3D, Text, Video __all__ = [ 'BaseDocument', @@ -12,4 +12,5 @@ 'Text', 'Mesh3D', 'PointCloud3D', + 'Video', ] diff --git a/docarray/predefined_document/__init__.py b/docarray/predefined_document/__init__.py index cf67088fc2c..6dcec5276a6 100644 --- a/docarray/predefined_document/__init__.py +++ b/docarray/predefined_document/__init__.py @@ -3,5 +3,6 @@ from docarray.predefined_document.mesh import Mesh3D from docarray.predefined_document.point_cloud import PointCloud3D from docarray.predefined_document.text import Text +from docarray.predefined_document.video import Video -__all__ = ['Text', 'Image', 
'Audio', 'Mesh3D', 'PointCloud3D'] +__all__ = ['Text', 'Image', 'Audio', 'Mesh3D', 'PointCloud3D', 'Video'] diff --git a/docarray/predefined_document/video.py b/docarray/predefined_document/video.py new file mode 100644 index 00000000000..0536a543d1b --- /dev/null +++ b/docarray/predefined_document/video.py @@ -0,0 +1,31 @@ +from typing import Optional, TypeVar + +from docarray.document import BaseDocument +from docarray.typing import AnyTensor, Embedding +from docarray.typing.tensor.video.video_tensor import VideoTensor +from docarray.typing.url.video_url import VideoUrl + +T = TypeVar('T', bound='Video') + + +class Video(BaseDocument): + """ + Document for handling video. + The Video Document can contain a VideoUrl (`Video.url`), a VideoTensor + (`Video.tensor`), an AnyTensor ('Video.key_frame_indices), and an Embedding + (`Video.embedding`). + + EXAMPLE USAGE: + + You can use this Document directly: + + You can extend this Document: + + You can use this Document for composition: + + """ + + url: Optional[VideoUrl] + tensor: Optional[VideoTensor] + key_frame_indices: Optional[AnyTensor] + embedding: Optional[Embedding] diff --git a/docarray/typing/tensor/video/__init__.py b/docarray/typing/tensor/video/__init__.py new file mode 100644 index 00000000000..b2fb90cd1e5 --- /dev/null +++ b/docarray/typing/tensor/video/__init__.py @@ -0,0 +1,12 @@ +from docarray.typing.tensor.video.video_ndarray import VideoNdArray + +__all__ = ['VideoNdArray'] + +try: + import torch # noqa: F401 +except ImportError: + pass +else: + from docarray.typing.tensor.video.video_torch_tensor import VideoTorchTensor # noqa + + __all__.extend(['VideoTorchTensor']) diff --git a/docarray/typing/tensor/video/abstract_video_tensor.py b/docarray/typing/tensor/video/abstract_video_tensor.py new file mode 100644 index 00000000000..f9134037710 --- /dev/null +++ b/docarray/typing/tensor/video/abstract_video_tensor.py @@ -0,0 +1,65 @@ +from abc import ABC, abstractmethod +from typing import BinaryIO, 
Dict, Generator, Optional, Tuple, Type, TypeVar, Union + +import numpy as np + +from docarray.typing.tensor.abstract_tensor import AbstractTensor + +T = TypeVar('T', bound='AbstractVideoTensor') + + +class AbstractVideoTensor(AbstractTensor, ABC): + @abstractmethod + def to_numpy(self) -> np.ndarray: + """ + Convert video tensor to numpy.ndarray. + """ + ... + + def save_to_file( + self: 'T', + file_path: Union[str, BinaryIO], + frame_rate: int = 30, + codec: str = 'h264', + ) -> None: + """ + Save video tensor to a .wav file. Mono/stereo is preserved. + + + :param file_path: path to a .wav file. If file is a string, open the file by + that name, otherwise treat it as a file-like object. + :param frame_rate: frames per second. + :param codec: the name of a decoder/encoder. + """ + np_tensor = self.to_numpy() + + video_tensor = np.moveaxis(np.clip(np_tensor, 0, 255), 1, 2).astype('uint8') + + import av + + with av.open(file_path, mode='w') as container: + stream = container.add_stream(codec, rate=frame_rate) + stream.width = np_tensor.shape[1] + stream.height = np_tensor.shape[2] + stream.pix_fmt = 'yuv420p' + + for b in video_tensor: + frame = av.VideoFrame.from_ndarray(b, format='rgb24') + for packet in stream.encode(frame): + container.mux(packet) + + for packet in stream.encode(): + container.mux(packet) + + @classmethod + def generator_from_webcam( + cls: Type['T'], + height_width: Optional[Tuple[int, int]] = None, + show_window: bool = True, + window_title: str = 'webcam', + fps: int = 30, + exit_key: int = 27, + exit_event=None, + tags: Optional[Dict] = None, + ) -> Generator['T', None, None]: + ... 
diff --git a/docarray/typing/tensor/video/video_ndarray.py b/docarray/typing/tensor/video/video_ndarray.py index f44b332770b..5362bb05dc1 100644 --- a/docarray/typing/tensor/video/video_ndarray.py +++ b/docarray/typing/tensor/video/video_ndarray.py @@ -1,4 +1,4 @@ -from typing import TypeVar +from typing import TYPE_CHECKING, Any, List, Tuple, Type, TypeVar, Union import numpy as np @@ -7,53 +7,37 @@ T = TypeVar('T', bound='VideoNdArray') +if TYPE_CHECKING: + from pydantic import BaseConfig + from pydantic.fields import ModelField + class VideoNdArray(AbstractVideoTensor, NdArray): """ Subclass of NdArray, to represent a video tensor. - - Additionally, this allows storing such a tensor as a .wav audio file. + Adds video-specific features to the tensor. EXAMPLE USAGE - .. code-block:: python - - from typing import Optional - from pydantic import parse_obj_as - from docarray import Document - from docarray.typing import AudioNdArray, AudioUrl - import numpy as np - - - class MyAudioDoc(Document): - title: str - audio_tensor: Optional[AudioNdArray] - url: Optional[AudioUrl] - - - # from tensor - doc_1 = MyAudioDoc( - title='my_first_audio_doc', - audio_tensor=np.random.rand(1000, 2), - ) - doc_1.audio_tensor.save_to_wav_file(file_path='path/to/file_1.wav') - # from url - doc_2 = MyAudioDoc( - title='my_second_audio_doc', - url='https://github.com/docarray/docarray/tree/feat-add-audio-v2/tests/toydata/hello.wav', - ) - doc_2.audio_tensor = parse_obj_as(AudioNdArray, doc_2.url.load()) - doc_2.audio_tensor.save_to_wav_file(file_path='path/to/file_2.wav') """ _PROTO_FIELD_NAME = 'video_ndarray' - def check_shape(self) -> None: - if self.ndim != 4 or self.shape[-1] != 3 or self.dtype != np.uint8: + @classmethod + def validate( + cls: Type[T], + value: Union[T, np.ndarray, List[Any], Tuple[Any], Any], + field: 'ModelField', + config: 'BaseConfig', + ) -> T: + array = super().validate(value=value, field=field, config=config) + if array.ndim not in [3, 4] or array.shape[-1] 
!= 3: raise ValueError( - f'expects `` with dtype=uint8 and ndim=4 and the last dimension is 3, ' - f'but receiving {self.shape} in {self.dtype}' + f'Expects tensor with 3 or 4 dimensions and the last dimension equal' + f' to 3, but received {array.shape} in {array.dtype}' ) + else: + return array def to_numpy(self) -> np.ndarray: return self diff --git a/docarray/typing/tensor/video/video_tensor.py b/docarray/typing/tensor/video/video_tensor.py new file mode 100644 index 00000000000..ddf8cad3ee6 --- /dev/null +++ b/docarray/typing/tensor/video/video_tensor.py @@ -0,0 +1,13 @@ +from typing import Union + +from docarray.typing.tensor.video.video_ndarray import VideoNdArray + +try: + import torch # noqa: F401 +except ImportError: + VideoTensor = VideoNdArray + +else: + from docarray.typing.tensor.video.video_torch_tensor import VideoTorchTensor + + VideoTensor = Union[VideoNdArray, VideoTorchTensor] # type: ignore diff --git a/docarray/typing/tensor/video/video_torch_tensor.py b/docarray/typing/tensor/video/video_torch_tensor.py new file mode 100644 index 00000000000..0bc755f8467 --- /dev/null +++ b/docarray/typing/tensor/video/video_torch_tensor.py @@ -0,0 +1,43 @@ +from typing import TYPE_CHECKING, Any, List, Tuple, Type, TypeVar, Union + +import numpy as np + +from docarray.typing.tensor.torch_tensor import TorchTensor, metaTorchAndNode +from docarray.typing.tensor.video.abstract_video_tensor import AbstractVideoTensor + +T = TypeVar('T', bound='VideoTorchTensor') + +if TYPE_CHECKING: + from pydantic import BaseConfig + from pydantic.fields import ModelField + + +class VideoTorchTensor(AbstractVideoTensor, TorchTensor, metaclass=metaTorchAndNode): + """ + Subclass of TorchTensor, to represent a video tensor. + Adds video-specific features to the tensor. 
+ + EXAMPLE USAGE + + """ + + _PROTO_FIELD_NAME = 'video_torch_tensor' + + @classmethod + def validate( + cls: Type[T], + value: Union[T, np.ndarray, List[Any], Tuple[Any], Any], + field: 'ModelField', + config: 'BaseConfig', + ) -> T: + tensor = super().validate(value=value, field=field, config=config) + if tensor.ndim not in [3, 4] or tensor.shape[-1] != 3: + raise ValueError( + f'Expects tensor with 3 or 4 dimensions and the last dimension equal ' + f'to 3, but received {tensor.shape} in {tensor.dtype}' + ) + else: + return tensor + + def to_numpy(self) -> np.ndarray: + return self.cpu().detach().numpy() diff --git a/docarray/typing/url/video_url.py b/docarray/typing/url/video_url.py index bcfdec16191..877a17536b3 100644 --- a/docarray/typing/url/video_url.py +++ b/docarray/typing/url/video_url.py @@ -50,15 +50,17 @@ def validate( return cls(str(url), scheme=None) def load( - self: T, only_keyframes: bool = False, **kwargs - ) -> Union[VideoNdArray, Tuple[VideoNdArray, VideoNdArray]]: + self: T, only_keyframes: bool = False, dtype: str = 'int32', **kwargs + ) -> Union[VideoNdArray, Tuple[VideoNdArray, np.ndarray]]: """ - Load the data from the url into a numpy.ndarray. + Load the data from the url into a VideoNdArray or Tuple of VideoNdArray and + np.ndarray. :param only_keyframes: if True keep only the keyframes, if False keep all frames and store the indices of the keyframes in :attr:`.tags` + :param dtype: Data-type of the returned array; default: int32. 
:param kwargs: supports all keyword arguments that are being supported by av.open() as described in: https://pyav.org/docs/stable/api/_globals.html?highlight=open#av.open @@ -86,7 +88,4 @@ def load( if only_keyframes: return frames else: - indices = parse_obj_as( - VideoNdArray, np.ndarray(keyframe_indices, dtype=np.int32) - ) - return frames, indices + return frames, np.ndarray(keyframe_indices, dtype=dtype) diff --git a/tests/integrations/predefined_document/test_video.py b/tests/integrations/predefined_document/test_video.py new file mode 100644 index 00000000000..77acdab4cd1 --- /dev/null +++ b/tests/integrations/predefined_document/test_video.py @@ -0,0 +1,43 @@ +import os + +import numpy as np +import pytest + +from docarray import Video +from docarray.typing import VideoNdArray +from tests import TOYDATA_DIR + +LOCAL_VIDEO_FILE = str(TOYDATA_DIR / 'mov_bbb.mp4') +REMOTE_VIDEO_FILE = 'https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/mov_bbb.mp4?raw=true' # noqa: E501 + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize('file_url', [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE]) +def test_video(file_url): + video = Video(url=file_url) + video.tensor, video.key_frame_indices = video.url.load() + + assert isinstance(video.tensor, np.ndarray) + assert isinstance(video.tensor, VideoNdArray) + assert isinstance(video.key_frame_indices, np.ndarray) + + +@pytest.mark.slow +@pytest.mark.internet +@pytest.mark.parametrize('file_url', [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE]) +def test_save_video_ndarray(file_url, tmpdir): + tmp_file = str(tmpdir / 'tmp.mp4') + + video = Video(url=file_url) + video.tensor, _ = video.url.load() + + assert isinstance(video.tensor, np.ndarray) + assert isinstance(video.tensor, VideoNdArray) + + video.tensor.save_to_file(tmp_file) + assert os.path.isfile(tmp_file) + + video_from_file = Video(url=tmp_file) + video_from_file.tensor = video_from_file.url.load() + assert np.allclose(video.tensor, 
video_from_file.tensor) diff --git a/tests/units/typing/tensor/test_video_tensor.py b/tests/units/typing/tensor/test_video_tensor.py new file mode 100644 index 00000000000..bbc94ddaf4d --- /dev/null +++ b/tests/units/typing/tensor/test_video_tensor.py @@ -0,0 +1,87 @@ +import os + +import numpy as np +import pytest +import torch +from pydantic.tools import parse_obj_as + +from docarray import BaseDocument +from docarray.typing import VideoNdArray, VideoTorchTensor + + +@pytest.mark.parametrize( + 'tensor,cls_video_tensor,cls_tensor', + [ + (torch.zeros(1, 224, 224, 3), VideoTorchTensor, torch.Tensor), + (np.zeros((1, 224, 224, 3)), VideoNdArray, np.ndarray), + ], +) +def test_set_video_tensor(tensor, cls_video_tensor, cls_tensor): + class MyVideoDoc(BaseDocument): + tensor: cls_video_tensor + + doc = MyVideoDoc(tensor=tensor) + + assert isinstance(doc.tensor, cls_video_tensor) + assert isinstance(doc.tensor, cls_tensor) + assert (doc.tensor == tensor).all() + + +@pytest.mark.parametrize( + 'cls_tensor,tensor', + [ + (VideoNdArray, np.zeros((1, 224, 224, 3))), + (VideoTorchTensor, torch.zeros(1, 224, 224, 3)), + (VideoTorchTensor, np.zeros((1, 224, 224, 3))), + ], +) +def test_validation(cls_tensor, tensor): + arr = parse_obj_as(cls_tensor, tensor) + assert isinstance(arr, cls_tensor) + + +@pytest.mark.parametrize( + 'cls_tensor,tensor', + [ + (VideoNdArray, torch.zeros(1, 224, 224, 3)), + (VideoTorchTensor, torch.zeros(224, 3)), + (VideoTorchTensor, torch.zeros(1, 224, 224, 100)), + (VideoNdArray, 'hello'), + (VideoTorchTensor, 'hello'), + ], +) +def test_illegal_validation(cls_tensor, tensor): + match = str(cls_tensor).split('.')[-1][:-2] + with pytest.raises(ValueError, match=match): + parse_obj_as(cls_tensor, tensor) + + +@pytest.mark.parametrize( + 'cls_tensor,tensor,proto_key', + [ + ( + VideoTorchTensor, + torch.zeros(1, 224, 224, 3), + VideoTorchTensor._PROTO_FIELD_NAME, + ), + (VideoNdArray, np.zeros((1, 224, 224, 3)), VideoNdArray._PROTO_FIELD_NAME), + ], 
+) +def test_proto_tensor(cls_tensor, tensor, proto_key): + tensor = parse_obj_as(cls_tensor, tensor) + proto = tensor._to_node_protobuf() + assert str(proto).startswith(proto_key) + + +@pytest.mark.parametrize( + 'cls_tensor,tensor', + [ + (VideoTorchTensor, torch.zeros(1, 224, 224, 3)), + (VideoNdArray, np.zeros((1, 224, 224, 3))), + ], +) +def test_save_video_tensor_to_file(cls_tensor, tensor, tmpdir): + tmp_file = str(tmpdir / 'tmp.mp4') + video_tensor = parse_obj_as(cls_tensor, tensor) + video_tensor.save_to_file(tmp_file) + assert os.path.isfile(tmp_file) From fc869203c3c62c3328ec7d48b9a138e848e155e0 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 4 Jan 2023 10:51:39 +0100 Subject: [PATCH 04/26] fix: mypy checks Signed-off-by: anna-charlotte --- .../typing/tensor/video/abstract_video_tensor.py | 15 +-------------- docarray/typing/url/video_url.py | 8 +++++--- pyproject.toml | 4 ++++ 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/docarray/typing/tensor/video/abstract_video_tensor.py b/docarray/typing/tensor/video/abstract_video_tensor.py index f9134037710..4cb5d7be9b2 100644 --- a/docarray/typing/tensor/video/abstract_video_tensor.py +++ b/docarray/typing/tensor/video/abstract_video_tensor.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import BinaryIO, Dict, Generator, Optional, Tuple, Type, TypeVar, Union +from typing import BinaryIO, TypeVar, Union import numpy as np @@ -50,16 +50,3 @@ def save_to_file( for packet in stream.encode(): container.mux(packet) - - @classmethod - def generator_from_webcam( - cls: Type['T'], - height_width: Optional[Tuple[int, int]] = None, - show_window: bool = True, - window_title: str = 'webcam', - fps: int = 30, - exit_key: int = 27, - exit_event=None, - tags: Optional[Dict] = None, - ) -> Generator['T', None, None]: - ... 
diff --git a/docarray/typing/url/video_url.py b/docarray/typing/url/video_url.py index 877a17536b3..e424c1d5935 100644 --- a/docarray/typing/url/video_url.py +++ b/docarray/typing/url/video_url.py @@ -83,9 +83,11 @@ def load( if not only_keyframes and frame.key_frame == 1: keyframe_indices.append(i) - frames = parse_obj_as(VideoNdArray, np.moveaxis(np.stack(frames), 1, 2)) + frames_vid: VideoNdArray = parse_obj_as( + VideoNdArray, np.moveaxis(np.stack(frames), 1, 2) + ) if only_keyframes: - return frames + return frames_vid else: - return frames, np.ndarray(keyframe_indices, dtype=dtype) + return frames_vid, np.ndarray(keyframe_indices, dtype=dtype) diff --git a/pyproject.toml b/pyproject.toml index 1d29532b696..b2ff89bef7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,10 @@ exclude = ['docarray/proto'] plugins = "pydantic.mypy" check_untyped_defs = true +[[tool.mypy.overrides]] +module = "av" +ignore_missing_imports = true + [[tool.mypy.overrides]] module = "trimesh" ignore_missing_imports = true From 8a55e0b99dcb475bb99c8dc88bd4dcf3ed267965 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 4 Jan 2023 12:17:23 +0100 Subject: [PATCH 05/26] chore: add av to video extra Signed-off-by: anna-charlotte --- poetry.lock | 57 +++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 2 ++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 5996317fe67..1e95b3fa905 100644 --- a/poetry.lock +++ b/poetry.lock @@ -90,6 +90,14 @@ docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "zope.interface"] tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins"] +[[package]] +name = "av" +version = "10.0.0" +description = "Pythonic bindings for FFmpeg's 
libraries." +category = "main" +optional = false +python-versions = "*" + [[package]] name = "babel" version = "2.11.0" @@ -1668,11 +1676,12 @@ common = ["protobuf"] image = ["pillow", "types-pillow"] mesh = ["trimesh"] torch = ["torch"] +video = ["av"] [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "b1aa40aea6ec7f56a8c3b511fd2ce96ed217c6fc81d6f8dd931e519cc0774154" +content-hash = "1856e2f5fdadf5b4cbd1100c6593ac71b72323aa1a4704bf40235265019f3424" [metadata.files] anyio = [ @@ -1721,6 +1730,52 @@ attrs = [ {file = "attrs-22.1.0-py2.py3-none-any.whl", hash = "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c"}, {file = "attrs-22.1.0.tar.gz", hash = "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6"}, ] +av = [ + {file = "av-10.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d19bb54197155d045a2b683d993026d4bcb06e31c2acad0327e3e8711571899c"}, + {file = "av-10.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7dba96a85cd37315529998e6dbbe3fa05c2344eb19a431dc24996be030a904ee"}, + {file = "av-10.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27d6d38c7c8d46d578c008ffcb8aad1eae14d0621fff41f4ad62395589045fe4"}, + {file = "av-10.0.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:51037f4bde03daf924236af4f444e17345792ad7f6f70760a5e5863407e14f2b"}, + {file = "av-10.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0577a38664e453b4ffb63d616a0d23c295827b16ae96a090e89527a753de8718"}, + {file = "av-10.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:07c971573035d22ce50069d3f2bbdb4d6d02d626ab13db12fda3ce519cda3f22"}, + {file = "av-10.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e5085d11345484c0097898994bb3f515002e7e1deeb43dd11d30dd6f45402c49"}, + {file = "av-10.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:157bde3ffd1615a9006b56e4daf3b46848d3ee2bd46b0394f7568e43ed7ab5a9"}, + {file = 
"av-10.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:115e144d5a1f205378a4b3a3657b7ed3e45918ebe5d2003a891e45984e8f443a"}, + {file = "av-10.0.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7a7d6e2b3fbda6464f74fe010dbcff361394bb014b0cb4aa4dc9f2bb713ce882"}, + {file = "av-10.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69fd5a38395191a0f4b71adf31057ff177c9f0762914d73d8797742339ad67d0"}, + {file = "av-10.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:836d69a9543d284976b229cc8d4343ffcfc0bbaf05239e13fb7e613b13d5291d"}, + {file = "av-10.0.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:eba192274538617bbe60097a013d83637f1a5ba9844bbbcf3ca7e43c6499b9d5"}, + {file = "av-10.0.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1301e4cf1a2c899851073720cd541066c8539b64f9eb0d52216f8d0a59f20429"}, + {file = "av-10.0.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eebd5aa9d8b1e33e715c5409544a712f13ec805bb0110d75f394ff28d2fb64ad"}, + {file = "av-10.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:04cd0ce13a87870fb0a0ea4673f04934af2b9ac7ae844eafe92e2c19c092ab11"}, + {file = "av-10.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:10facb5b933551dd6a30d8015bc91eef5d1c864ee86aa3463ffbaff1a99f6c6a"}, + {file = "av-10.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:088636ded03724a2ab51136f6f4be0bc457bdb3c0d2ac7158792fe81150d4c1a"}, + {file = "av-10.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ff0f7d3b1003a9ed0d06038f3f521a5ff0d3e056ec5111e2a78e303f98b815a7"}, + {file = "av-10.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ccaf786e747b126a5b3b9a8f5ffbb6a20c5f528775cc7084c95732ca72606fba"}, + {file = "av-10.0.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c579d718b52beb812ea2a7bd68f812d0920b00937804d52d31d41bb71aa5557"}, + {file = 
"av-10.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2cfd39baa5d82768d2a8898de7bfd450a083ef22b837d57e5dc1b6de3244218"}, + {file = "av-10.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:81b5264d9752f49286bc1dc4d2cc66187418c4948a326dbed837c766c9892139"}, + {file = "av-10.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:16bd82b63d0b4c1b855b3c36b13337f7cdc5925bd8284fab893bdf6c290fc3a9"}, + {file = "av-10.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a6c8f3f8c26d35eefe45b849c81fd0816ba4b6f589baec7357c25b4c5537d3c4"}, + {file = "av-10.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91ea46fea7259abdfabe00b0ed3a9ca18e7fff7ce80d2a2c66a28f797cce838a"}, + {file = "av-10.0.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a62edd533d330aa61902ae8cd82966affa487fa337a0c4f58ae8866ccb5d31c0"}, + {file = "av-10.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b67b7d028c9cf68215376662fd2e0be6ca0cc02d32d3ed8514fec67b12db9cbd"}, + {file = "av-10.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:0f9c88062ebfd2ce547c522b64f79e487ed2b0a6a9d6693c801b28df0d944607"}, + {file = "av-10.0.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:63dbafcd02415127d97509523bc285f1ab260988f87b744d7fb1baee6ffbdf96"}, + {file = "av-10.0.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2ea4424d0be62fe18c843420284a0907bcb38d577062d62c4b75a8e940e6057"}, + {file = "av-10.0.0-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8b6326fd0755761e3ee999e4bf90339e869fe71d548b679fee89157858b8d04a"}, + {file = "av-10.0.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3fae238751ec0db6377b2106e13762ca84dbe104bd44c1ce9b424163aef4ab5"}, + {file = "av-10.0.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:86bb3f6e8cce62ad18cd34eb2eadd091d99f51b40be81c929b53fbd8fecf6d90"}, + {file = 
"av-10.0.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f7b508813abbc100162d305a1ac9b2dd16e5128d56f2ac69639fc6a4b5aca69e"}, + {file = "av-10.0.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98cc376199c0aa6e9365d03e0f4e67cfb209e40fe9c0cf566372f9daf2a0c779"}, + {file = "av-10.0.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b459ca0ef25c1a0e370112556bdc5b7752f76dc9bd497acaf3e653171e4b946"}, + {file = "av-10.0.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab930735112c1f788cc4d47c42c59ba0dd214d815aa906e1addf39af91d15194"}, + {file = "av-10.0.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:13fe0b48b9211539323ecebbf84154c86c72d16723c6d0af76e29ae5c3a614b2"}, + {file = "av-10.0.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c2eeec7beaebfe9e2213b3c94b482381187d0afdcb632f93239b44dc668b97df"}, + {file = "av-10.0.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3dac2a8b0791c3373270e32f6cd27e6b60628565a188e40a5d9660d3aab05e33"}, + {file = "av-10.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1cdede2325cb750b5bf79238bbf06f9c2a70b757b12726003769a43493b7233a"}, + {file = "av-10.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:9788e6e15db0910fb8e1548ba7540799d07066177710590a5794a524c4910e05"}, + {file = "av-10.0.0.tar.gz", hash = "sha256:8afd3d5610e1086f3b2d8389d66672ea78624516912c93612de64dcaa4c67e05"}, +] babel = [ {file = "Babel-2.11.0-py3-none-any.whl", hash = "sha256:1ad3eca1c885218f6dce2ab67291178944f810a10a9b5f3cb8382a5a232b64fe"}, {file = "Babel-2.11.0.tar.gz", hash = "sha256:5ef4b3226b0180dedded4229651c8b0e1a3a6a2837d45a073272f313e4cf97f6"}, diff --git a/pyproject.toml b/pyproject.toml index b2ff89bef7d..2ed336c0bcb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,11 +17,13 @@ types-pillow = {version = "^9.3.0.1", optional = true } 
trimesh = {version = "^3.17.1", optional = true} typing-inspect = "^0.8.0" types-requests = "^2.28.11.6" +av = "^10.0.0" [tool.poetry.extras] common = ["protobuf"] torch = ["torch"] image = ["pillow", "types-pillow"] +video = ["av"] mesh = ["trimesh"] [tool.poetry.dev-dependencies] From 5cb098a33ed965839c6bc86eede8f3f15f0137b9 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 4 Jan 2023 15:16:34 +0100 Subject: [PATCH 06/26] fix: allow dim 3 Signed-off-by: anna-charlotte --- docarray/typing/tensor/video/abstract_video_tensor.py | 9 ++++++--- docarray/typing/url/video_url.py | 9 +++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docarray/typing/tensor/video/abstract_video_tensor.py b/docarray/typing/tensor/video/abstract_video_tensor.py index 4cb5d7be9b2..2751f393278 100644 --- a/docarray/typing/tensor/video/abstract_video_tensor.py +++ b/docarray/typing/tensor/video/abstract_video_tensor.py @@ -33,7 +33,7 @@ def save_to_file( """ np_tensor = self.to_numpy() - video_tensor = np.moveaxis(np.clip(np_tensor, 0, 255), 1, 2).astype('uint8') + video_tensor = np.moveaxis(np.clip(np_tensor, 0, 255), -3, -2).astype('uint8') import av @@ -43,8 +43,11 @@ def save_to_file( stream.height = np_tensor.shape[2] stream.pix_fmt = 'yuv420p' - for b in video_tensor: - frame = av.VideoFrame.from_ndarray(b, format='rgb24') + if video_tensor.ndim == 3: + video_tensor = np.expand_dims(video_tensor, axis=0) + + for vid in video_tensor: + frame = av.VideoFrame.from_ndarray(vid) for packet in stream.encode(frame): container.mux(packet) diff --git a/docarray/typing/url/video_url.py b/docarray/typing/url/video_url.py index e424c1d5935..0c705af5917 100644 --- a/docarray/typing/url/video_url.py +++ b/docarray/typing/url/video_url.py @@ -50,7 +50,7 @@ def validate( return cls(str(url), scheme=None) def load( - self: T, only_keyframes: bool = False, dtype: str = 'int32', **kwargs + self: T, only_keyframes: bool = False, **kwargs ) -> Union[VideoNdArray, 
Tuple[VideoNdArray, np.ndarray]]: """ Load the data from the url into a VideoNdArray or Tuple of VideoNdArray and @@ -60,7 +60,6 @@ def load( :param only_keyframes: if True keep only the keyframes, if False keep all frames and store the indices of the keyframes in :attr:`.tags` - :param dtype: Data-type of the returned array; default: int32. :param kwargs: supports all keyword arguments that are being supported by av.open() as described in: https://pyav.org/docs/stable/api/_globals.html?highlight=open#av.open @@ -83,11 +82,9 @@ def load( if not only_keyframes and frame.key_frame == 1: keyframe_indices.append(i) - frames_vid: VideoNdArray = parse_obj_as( - VideoNdArray, np.moveaxis(np.stack(frames), 1, 2) - ) + frames_vid = parse_obj_as(VideoNdArray, np.moveaxis(np.stack(frames), -3, -2)) if only_keyframes: return frames_vid else: - return frames_vid, np.ndarray(keyframe_indices, dtype=dtype) + return frames_vid, np.ndarray(keyframe_indices) From 3ba1f788101e061f2b91b9a2b6f398186bef1632 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 5 Jan 2023 14:10:11 +0100 Subject: [PATCH 07/26] test: wip video load and save Signed-off-by: anna-charlotte --- .../tensor/video/abstract_video_tensor.py | 25 ++++----- docarray/typing/url/video_url.py | 11 ++-- .../predefined_document/test_video.py | 53 +++++++++++++++---- 3 files changed, 59 insertions(+), 30 deletions(-) diff --git a/docarray/typing/tensor/video/abstract_video_tensor.py b/docarray/typing/tensor/video/abstract_video_tensor.py index 2751f393278..5b89ca2368f 100644 --- a/docarray/typing/tensor/video/abstract_video_tensor.py +++ b/docarray/typing/tensor/video/abstract_video_tensor.py @@ -19,37 +19,34 @@ def to_numpy(self) -> np.ndarray: def save_to_file( self: 'T', file_path: Union[str, BinaryIO], - frame_rate: int = 30, + frame_rate: int = 24, codec: str = 'h264', ) -> None: """ - Save video tensor to a .wav file. Mono/stereo is preserved. + Save video tensor to a .mp4 file. 
- - :param file_path: path to a .wav file. If file is a string, open the file by + :param file_path: path to a .mp4 file. If file is a string, open the file by that name, otherwise treat it as a file-like object. :param frame_rate: frames per second. :param codec: the name of a decoder/encoder. """ np_tensor = self.to_numpy() - - video_tensor = np.moveaxis(np.clip(np_tensor, 0, 255), -3, -2).astype('uint8') - + print(f"np_tensor[0][:2] = {np_tensor[0][:2]}") + video_tensor = np_tensor.astype('uint8') import av with av.open(file_path, mode='w') as container: - stream = container.add_stream(codec, rate=frame_rate) - stream.width = np_tensor.shape[1] - stream.height = np_tensor.shape[2] - stream.pix_fmt = 'yuv420p' - if video_tensor.ndim == 3: video_tensor = np.expand_dims(video_tensor, axis=0) + stream = container.add_stream(codec, rate=frame_rate) + stream.height = video_tensor.shape[-3] + stream.width = video_tensor.shape[-2] + for vid in video_tensor: - frame = av.VideoFrame.from_ndarray(vid) + frame = av.VideoFrame.from_ndarray(vid, format='rgb24') for packet in stream.encode(frame): container.mux(packet) - for packet in stream.encode(): + for packet in stream.encode(None): container.mux(packet) diff --git a/docarray/typing/url/video_url.py b/docarray/typing/url/video_url.py index 0c705af5917..932112d8bcd 100644 --- a/docarray/typing/url/video_url.py +++ b/docarray/typing/url/video_url.py @@ -56,8 +56,6 @@ def load( Load the data from the url into a VideoNdArray or Tuple of VideoNdArray and np.ndarray. 
- - :param only_keyframes: if True keep only the keyframes, if False keep all frames and store the indices of the keyframes in :attr:`.tags` :param kwargs: supports all keyword arguments that are being supported by @@ -75,16 +73,17 @@ def load( frames = [] keyframe_indices = [] + for i, frame in enumerate(container.decode(video=0)): - img = frame.to_image() - frames.append(img) + frame_np = frame.to_ndarray(format='rgb24') + frames.append(frame_np) if not only_keyframes and frame.key_frame == 1: keyframe_indices.append(i) - frames_vid = parse_obj_as(VideoNdArray, np.moveaxis(np.stack(frames), -3, -2)) + frames_vid = parse_obj_as(VideoNdArray, np.stack(frames)) if only_keyframes: return frames_vid else: - return frames_vid, np.ndarray(keyframe_indices) + return frames_vid, np.array(keyframe_indices) diff --git a/tests/integrations/predefined_document/test_video.py b/tests/integrations/predefined_document/test_video.py index 77acdab4cd1..f31f4ae3b88 100644 --- a/tests/integrations/predefined_document/test_video.py +++ b/tests/integrations/predefined_document/test_video.py @@ -25,19 +25,52 @@ def test_video(file_url): @pytest.mark.slow @pytest.mark.internet -@pytest.mark.parametrize('file_url', [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE]) +@pytest.mark.parametrize('file_url', [LOCAL_VIDEO_FILE]) # , REMOTE_VIDEO_FILE]) def test_save_video_ndarray(file_url, tmpdir): - tmp_file = str(tmpdir / 'tmp.mp4') + tmp_file = str(TOYDATA_DIR / 'tmp.mp4') - video = Video(url=file_url) - video.tensor, _ = video.url.load() + video_1 = Video(url=file_url) + assert video_1.url == file_url - assert isinstance(video.tensor, np.ndarray) - assert isinstance(video.tensor, VideoNdArray) + video_1.tensor, _ = video_1.url.load() + assert isinstance(video_1.tensor, np.ndarray) + assert isinstance(video_1.tensor, VideoNdArray) - video.tensor.save_to_file(tmp_file) + # from PIL import Image + # Image.fromarray(video_1.tensor[0]).show() + + video_1.tensor.save_to_file(tmp_file) assert 
os.path.isfile(tmp_file) + print(f"video_1.tensor[0][:2] = {video_1.tensor[0][:2]}") + + video_2 = Video(url=tmp_file) + video_2.tensor, _ = video_2.url.load() + video_2.tensor.save_to_file(str(TOYDATA_DIR / 'tmp_2.mp4')) + + # video_3 = Video(url=str(tmpdir / f'tmp_2.mp4')) + # video_3.tensor, _ = video_3.url.load() + # video_3.tensor.save_to_file(str(tmpdir / f'tmp_3.mp4')) + # + # video_4 = Video(url=str(tmpdir / f'tmp_3.mp4')) + # video_4.tensor, _ = video_4.url.load() + # video_4.tensor.save_to_file(str(tmpdir / f'tmp_4.mp4')) + # + # video_5 = Video(url=str(tmpdir / f'tmp_4.mp4')) + # video_5.tensor, _ = video_5.url.load() + # video_5.tensor.save_to_file(str(tmpdir / f'tmp_5.mp4')) + # + # video_6 = Video(url=str(tmpdir / f'tmp_5.mp4')) + # video_6.tensor, _ = video_6.url.load() + # video_6.tensor.save_to_file(str(tmpdir / f'tmp_6.mp4')) + # + print(f"video_2.tensor[0][:2] = {video_2.tensor[0][:2]}") + # print(f"video_3.tensor[0][:2] = {video_3.tensor[0][:2]}") + # print(f"video_4.tensor[0][:2] = {video_3.tensor[0][:2]}") + # print(f"video_5.tensor[0][:2] = {video_3.tensor[0][:2]}") + # print(f"video_6.tensor[0][:2] = {video_3.tensor[0][:2]}") - video_from_file = Video(url=tmp_file) - video_from_file.tensor = video_from_file.url.load() - assert np.allclose(video.tensor, video_from_file.tensor) + # Image.fromarray(video_1.tensor[0]).show() + assert isinstance(video_1.tensor, np.ndarray) + assert isinstance(video_1.tensor, VideoNdArray) + assert video_1.tensor.shape == video_2.tensor.shape + assert np.allclose(video_1.tensor, video_2.tensor, atol=100) From be639262a383385a6562dd708e542c08540b7d32 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Fri, 6 Jan 2023 10:11:41 +0100 Subject: [PATCH 08/26] refactor: move to numpy to computational backend Signed-off-by: anna-charlotte --- docarray/computation/abstract_comp_backend.py | 13 +++++++++++++ docarray/computation/numpy_backend.py | 4 ++++ docarray/computation/torch_backend.py | 5 +++++ 
.../typing/tensor/video/abstract_video_tensor.py | 12 ++---------- docarray/typing/tensor/video/video_ndarray.py | 3 --- docarray/typing/tensor/video/video_torch_tensor.py | 3 --- 6 files changed, 24 insertions(+), 16 deletions(-) diff --git a/docarray/computation/abstract_comp_backend.py b/docarray/computation/abstract_comp_backend.py index a58582d658c..7420d7daf29 100644 --- a/docarray/computation/abstract_comp_backend.py +++ b/docarray/computation/abstract_comp_backend.py @@ -2,6 +2,8 @@ from abc import ABC, abstractmethod from typing import List, Optional, Tuple, TypeVar, Union +import numpy as np + # In practice all of the below will be the same type TTensor = TypeVar('TTensor') TTensorRetrieval = TypeVar('TTensorRetrieval') @@ -29,6 +31,17 @@ def stack( @staticmethod @abstractmethod def n_dim(array: 'TTensor') -> int: + """ + Get the number of the array dimensions. + """ + ... + + @staticmethod + @abstractmethod + def to_numpy(array: 'TTensor') -> np.ndarray: + """ + Convert array to np.ndarray. + """ ... 
class Retrieval(ABC, typing.Generic[TTensorRetrieval]): diff --git a/docarray/computation/numpy_backend.py b/docarray/computation/numpy_backend.py index d5950a70f17..51f0ea7d3bf 100644 --- a/docarray/computation/numpy_backend.py +++ b/docarray/computation/numpy_backend.py @@ -44,6 +44,10 @@ def stack( def n_dim(array: 'np.ndarray') -> int: return array.ndim + @staticmethod + def to_numpy(array: 'np.ndarray') -> np.ndarray: + return array + class Retrieval(AbstractComputationalBackend.Retrieval[np.ndarray]): """ Abstract class for retrieval and ranking functionalities diff --git a/docarray/computation/torch_backend.py b/docarray/computation/torch_backend.py index 52f7ea879c3..402b36350fb 100644 --- a/docarray/computation/torch_backend.py +++ b/docarray/computation/torch_backend.py @@ -1,5 +1,6 @@ from typing import List, Optional, Tuple, Union +import numpy as np import torch from docarray.computation.abstract_comp_backend import AbstractComputationalBackend @@ -43,6 +44,10 @@ def stack( def n_dim(array: 'torch.Tensor') -> int: return array.ndim + @staticmethod + def to_numpy(array: 'torch.Tensor') -> np.ndarray: + return array.cpu().detach().numpy() + class Retrieval(AbstractComputationalBackend.Retrieval[torch.Tensor]): """ Abstract class for retrieval and ranking functionalities diff --git a/docarray/typing/tensor/video/abstract_video_tensor.py b/docarray/typing/tensor/video/abstract_video_tensor.py index 5b89ca2368f..0f4118a5d59 100644 --- a/docarray/typing/tensor/video/abstract_video_tensor.py +++ b/docarray/typing/tensor/video/abstract_video_tensor.py @@ -1,4 +1,4 @@ -from abc import ABC, abstractmethod +from abc import ABC from typing import BinaryIO, TypeVar, Union import numpy as np @@ -9,13 +9,6 @@ class AbstractVideoTensor(AbstractTensor, ABC): - @abstractmethod - def to_numpy(self) -> np.ndarray: - """ - Convert video tensor to numpy.ndarray. - """ - ... 
- def save_to_file( self: 'T', file_path: Union[str, BinaryIO], @@ -30,8 +23,7 @@ def save_to_file( :param frame_rate: frames per second. :param codec: the name of a decoder/encoder. """ - np_tensor = self.to_numpy() - print(f"np_tensor[0][:2] = {np_tensor[0][:2]}") + np_tensor = self.get_comp_backend().to_numpy(array=self) # type: ignore video_tensor = np_tensor.astype('uint8') import av diff --git a/docarray/typing/tensor/video/video_ndarray.py b/docarray/typing/tensor/video/video_ndarray.py index 5362bb05dc1..10a608ab743 100644 --- a/docarray/typing/tensor/video/video_ndarray.py +++ b/docarray/typing/tensor/video/video_ndarray.py @@ -38,6 +38,3 @@ def validate( ) else: return array - - def to_numpy(self) -> np.ndarray: - return self diff --git a/docarray/typing/tensor/video/video_torch_tensor.py b/docarray/typing/tensor/video/video_torch_tensor.py index 0bc755f8467..05f56bf792d 100644 --- a/docarray/typing/tensor/video/video_torch_tensor.py +++ b/docarray/typing/tensor/video/video_torch_tensor.py @@ -38,6 +38,3 @@ def validate( ) else: return tensor - - def to_numpy(self) -> np.ndarray: - return self.cpu().detach().numpy() From 395a495aaf09e452b9ac7e663dc1f234eaadcbd9 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 11 Jan 2023 09:36:52 +0100 Subject: [PATCH 09/26] fix: video load and save Signed-off-by: anna-charlotte --- docarray/predefined_document/video.py | 10 +++-- .../tensor/video/abstract_video_tensor.py | 45 ++++++++++++++----- docarray/typing/url/video_url.py | 38 +++++++++------- 3 files changed, 62 insertions(+), 31 deletions(-) diff --git a/docarray/predefined_document/video.py b/docarray/predefined_document/video.py index 0536a543d1b..765407f6e75 100644 --- a/docarray/predefined_document/video.py +++ b/docarray/predefined_document/video.py @@ -2,6 +2,7 @@ from docarray.document import BaseDocument from docarray.typing import AnyTensor, Embedding +from docarray.typing.tensor.audio.audio_tensor import AudioTensor from 
docarray.typing.tensor.video.video_tensor import VideoTensor from docarray.typing.url.video_url import VideoUrl @@ -11,9 +12,9 @@ class Video(BaseDocument): """ Document for handling video. - The Video Document can contain a VideoUrl (`Video.url`), a VideoTensor - (`Video.tensor`), an AnyTensor ('Video.key_frame_indices), and an Embedding - (`Video.embedding`). + The Video Document can contain a VideoUrl (`Video.url`), an AudioTensor + (`Video.audio_tensor`), a VideoTensor (`Video.video_tensor`), an AnyTensor + ('Video.key_frame_indices), and an Embedding (`Video.embedding`). EXAMPLE USAGE: @@ -26,6 +27,7 @@ class Video(BaseDocument): """ url: Optional[VideoUrl] - tensor: Optional[VideoTensor] + audio_tensor: Optional[AudioTensor] + video_tensor: Optional[VideoTensor] key_frame_indices: Optional[AnyTensor] embedding: Optional[Embedding] diff --git a/docarray/typing/tensor/video/abstract_video_tensor.py b/docarray/typing/tensor/video/abstract_video_tensor.py index 0f4118a5d59..b5d8a79b26c 100644 --- a/docarray/typing/tensor/video/abstract_video_tensor.py +++ b/docarray/typing/tensor/video/abstract_video_tensor.py @@ -1,44 +1,65 @@ from abc import ABC -from typing import BinaryIO, TypeVar, Union +from typing import BinaryIO, Optional, TypeVar, Union import numpy as np from docarray.typing.tensor.abstract_tensor import AbstractTensor +from docarray.typing.tensor.audio.audio_tensor import AudioTensor T = TypeVar('T', bound='AbstractVideoTensor') class AbstractVideoTensor(AbstractTensor, ABC): - def save_to_file( + def save_to_mp4_file( self: 'T', file_path: Union[str, BinaryIO], - frame_rate: int = 24, - codec: str = 'h264', + audio_tensor: Optional[AudioTensor] = None, + video_frame_rate: int = 30, + video_codec: str = 'h264', + audio_frame_rate: int = 48000, + audio_codec: str = 'aac', + audio_format: str = 'fltp', ) -> None: """ Save video tensor to a .mp4 file. :param file_path: path to a .mp4 file. 
If file is a string, open the file by that name, otherwise treat it as a file-like object. - :param frame_rate: frames per second. - :param codec: the name of a decoder/encoder. + :param video_frame_rate: frames per second. + :param video_codec: the name of a decoder/encoder. """ + import av + np_tensor = self.get_comp_backend().to_numpy(array=self) # type: ignore video_tensor = np_tensor.astype('uint8') - import av with av.open(file_path, mode='w') as container: if video_tensor.ndim == 3: video_tensor = np.expand_dims(video_tensor, axis=0) - stream = container.add_stream(codec, rate=frame_rate) - stream.height = video_tensor.shape[-3] - stream.width = video_tensor.shape[-2] + stream_video = container.add_stream(video_codec, rate=video_frame_rate) + stream_video.height = video_tensor.shape[-3] + stream_video.width = video_tensor.shape[-2] + + if audio_tensor is not None: + stream_audio = container.add_stream(audio_codec) + audio_np = audio_tensor.get_comp_backend().to_numpy(array=audio_tensor) + audio_layout = 'stereo' if audio_np.shape[-2] == 2 else 'mono' + + for i, audio in enumerate(audio_np): + frame = av.AudioFrame.from_ndarray( + array=audio, format=audio_format, layout=audio_layout + ) + frame.rate = audio_frame_rate + for packet in stream_audio.encode(frame): + container.mux(packet) for vid in video_tensor: frame = av.VideoFrame.from_ndarray(vid, format='rgb24') - for packet in stream.encode(frame): + for packet in stream_video.encode(frame): container.mux(packet) - for packet in stream.encode(None): + for packet in stream_audio.encode(None): + container.mux(packet) + for packet in stream_video.encode(None): container.mux(packet) diff --git a/docarray/typing/url/video_url.py b/docarray/typing/url/video_url.py index 932112d8bcd..b84e79360ff 100644 --- a/docarray/typing/url/video_url.py +++ b/docarray/typing/url/video_url.py @@ -3,6 +3,7 @@ import numpy as np from pydantic.tools import parse_obj_as +from docarray.typing import AudioNdArray, NdArray from 
docarray.typing.tensor.video import VideoNdArray from docarray.typing.url.any_url import AnyUrl @@ -50,19 +51,20 @@ def validate( return cls(str(url), scheme=None) def load( - self: T, only_keyframes: bool = False, **kwargs - ) -> Union[VideoNdArray, Tuple[VideoNdArray, np.ndarray]]: + self: T, only_keyframes: bool = False, audio_format: str = 'fltp', **kwargs + ) -> Union[VideoNdArray, Tuple[AudioNdArray, VideoNdArray, NdArray]]: """ - Load the data from the url into a VideoNdArray or Tuple of VideoNdArray and - np.ndarray. + Load the data from the url into a VideoNdArray or Tuple of AudioNdArray, + VideoNdArray and NdArray. :param only_keyframes: if True keep only the keyframes, if False keep all frames and store the indices of the keyframes in :attr:`.tags` :param kwargs: supports all keyword arguments that are being supported by av.open() as described in: https://pyav.org/docs/stable/api/_globals.html?highlight=open#av.open - :return: np.ndarray representing the audio file content, list of key frame - indices if only_keyframe False. + :return: AudioNdArray representing the audio content, VideoNdArray representing + the images of the video, NdArray of key frame indices if only_keyframe + False, else only VideoNdArray representing the keyframes. 
""" import av @@ -71,19 +73,25 @@ def load( stream = container.streams.video[0] stream.codec_context.skip_frame = 'NONKEY' - frames = [] + audio_frames = [] + video_frames = [] keyframe_indices = [] - for i, frame in enumerate(container.decode(video=0)): + for frame in container.decode(): + if type(frame) == av.audio.frame.AudioFrame: + audio_frames.append(frame.to_ndarray(format=audio_format)) + elif type(frame) == av.video.frame.VideoFrame: + video_frames.append(frame.to_ndarray(format='rgb24')) - frame_np = frame.to_ndarray(format='rgb24') - frames.append(frame_np) - if not only_keyframes and frame.key_frame == 1: - keyframe_indices.append(i) + if not only_keyframes and frame.key_frame == 1: + curr_index = len(video_frames) + keyframe_indices.append(curr_index) - frames_vid = parse_obj_as(VideoNdArray, np.stack(frames)) + video = parse_obj_as(VideoNdArray, np.stack(video_frames)) if only_keyframes: - return frames_vid + return video else: - return frames_vid, np.array(keyframe_indices) + audio = parse_obj_as(AudioNdArray, np.stack(audio_frames)) + indices = parse_obj_as(NdArray, keyframe_indices) + return audio, video, indices From 406ec8084b054abb4932fdeb6258a73e4a02056e Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 11 Jan 2023 09:37:55 +0100 Subject: [PATCH 10/26] test: adjust tests Signed-off-by: anna-charlotte --- .../predefined_document/test_video.py | 56 +++++++++---------- .../units/typing/tensor/test_video_tensor.py | 2 +- tests/units/typing/url/test_video_url.py | 4 +- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/tests/integrations/predefined_document/test_video.py b/tests/integrations/predefined_document/test_video.py index f31f4ae3b88..719f0dc2295 100644 --- a/tests/integrations/predefined_document/test_video.py +++ b/tests/integrations/predefined_document/test_video.py @@ -4,7 +4,7 @@ import pytest from docarray import Video -from docarray.typing import VideoNdArray +from docarray.typing import AudioNdArray, NdArray, 
VideoNdArray from tests import TOYDATA_DIR LOCAL_VIDEO_FILE = str(TOYDATA_DIR / 'mov_bbb.mp4') @@ -15,62 +15,62 @@ @pytest.mark.internet @pytest.mark.parametrize('file_url', [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE]) def test_video(file_url): - video = Video(url=file_url) - video.tensor, video.key_frame_indices = video.url.load() + vid = Video(url=file_url) + vid.audio_tensor, vid.video_tensor, vid.key_frame_indices = vid.url.load() - assert isinstance(video.tensor, np.ndarray) - assert isinstance(video.tensor, VideoNdArray) - assert isinstance(video.key_frame_indices, np.ndarray) + assert isinstance(vid.audio_tensor, AudioNdArray) + assert isinstance(vid.video_tensor, VideoNdArray) + assert isinstance(vid.key_frame_indices, NdArray) @pytest.mark.slow @pytest.mark.internet -@pytest.mark.parametrize('file_url', [LOCAL_VIDEO_FILE]) # , REMOTE_VIDEO_FILE]) +@pytest.mark.parametrize('file_url', [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE]) def test_save_video_ndarray(file_url, tmpdir): tmp_file = str(TOYDATA_DIR / 'tmp.mp4') video_1 = Video(url=file_url) assert video_1.url == file_url - video_1.tensor, _ = video_1.url.load() - assert isinstance(video_1.tensor, np.ndarray) - assert isinstance(video_1.tensor, VideoNdArray) + audio_tensor, video_1.video_tensor, _ = video_1.url.load() + assert isinstance(video_1.video_tensor, np.ndarray) + assert isinstance(video_1.video_tensor, VideoNdArray) # from PIL import Image # Image.fromarray(video_1.tensor[0]).show() - video_1.tensor.save_to_file(tmp_file) + video_1.video_tensor.save_to_mp4_file(file_path=tmp_file, audio_tensor=audio_tensor) assert os.path.isfile(tmp_file) - print(f"video_1.tensor[0][:2] = {video_1.tensor[0][:2]}") - - video_2 = Video(url=tmp_file) - video_2.tensor, _ = video_2.url.load() - video_2.tensor.save_to_file(str(TOYDATA_DIR / 'tmp_2.mp4')) - - # video_3 = Video(url=str(tmpdir / f'tmp_2.mp4')) + print(f"\nvideo_1.tensor[0][:2] = {video_1.video_tensor[0][:2]}") + # + # video_2 = Video(url=tmp_file) + # 
video_2.tensor, _ = video_2.url.load() + # video_2.tensor.save_to_file(str(tmpdir / 'tmp_2.mp4')) + # + # video_3 = Video(url=str(tmpdir / f'tmp.mp4')) # video_3.tensor, _ = video_3.url.load() # video_3.tensor.save_to_file(str(tmpdir / f'tmp_3.mp4')) # - # video_4 = Video(url=str(tmpdir / f'tmp_3.mp4')) + # video_4 = Video(url=str(tmpdir / f'tmp.mp4')) # video_4.tensor, _ = video_4.url.load() # video_4.tensor.save_to_file(str(tmpdir / f'tmp_4.mp4')) # - # video_5 = Video(url=str(tmpdir / f'tmp_4.mp4')) + # video_5 = Video(url=str(tmpdir / f'tmp.mp4')) # video_5.tensor, _ = video_5.url.load() # video_5.tensor.save_to_file(str(tmpdir / f'tmp_5.mp4')) # - # video_6 = Video(url=str(tmpdir / f'tmp_5.mp4')) + # video_6 = Video(url=str(tmpdir / f'tmp.mp4')) # video_6.tensor, _ = video_6.url.load() # video_6.tensor.save_to_file(str(tmpdir / f'tmp_6.mp4')) # - print(f"video_2.tensor[0][:2] = {video_2.tensor[0][:2]}") + # print(f"video_2.tensor[0][:2] = {video_2.tensor[0][:2]}") # print(f"video_3.tensor[0][:2] = {video_3.tensor[0][:2]}") # print(f"video_4.tensor[0][:2] = {video_3.tensor[0][:2]}") # print(f"video_5.tensor[0][:2] = {video_3.tensor[0][:2]}") # print(f"video_6.tensor[0][:2] = {video_3.tensor[0][:2]}") - - # Image.fromarray(video_1.tensor[0]).show() - assert isinstance(video_1.tensor, np.ndarray) - assert isinstance(video_1.tensor, VideoNdArray) - assert video_1.tensor.shape == video_2.tensor.shape - assert np.allclose(video_1.tensor, video_2.tensor, atol=100) + # + # # Image.fromarray(video_1.tensor[0]).show() + # assert isinstance(video_1.tensor, np.ndarray) + # assert isinstance(video_1.tensor, VideoNdArray) + # # assert video_1.tensor.shape == video_2.tensor.shape + # # assert np.allclose(video_1.tensor, video_2.tensor) diff --git a/tests/units/typing/tensor/test_video_tensor.py b/tests/units/typing/tensor/test_video_tensor.py index bbc94ddaf4d..08b6d5847d7 100644 --- a/tests/units/typing/tensor/test_video_tensor.py +++ 
b/tests/units/typing/tensor/test_video_tensor.py @@ -83,5 +83,5 @@ def test_proto_tensor(cls_tensor, tensor, proto_key): def test_save_video_tensor_to_file(cls_tensor, tensor, tmpdir): tmp_file = str(tmpdir / 'tmp.mp4') video_tensor = parse_obj_as(cls_tensor, tensor) - video_tensor.save_to_file(tmp_file) + video_tensor.save_to_mp4_file(tmp_file) assert os.path.isfile(tmp_file) diff --git a/tests/units/typing/url/test_video_url.py b/tests/units/typing/url/test_video_url.py index 39ad487e8fc..b59622390c8 100644 --- a/tests/units/typing/url/test_video_url.py +++ b/tests/units/typing/url/test_video_url.py @@ -6,7 +6,7 @@ from docarray import BaseDocument from docarray.document.io.json import orjson_dumps -from docarray.typing import VideoNdArray, VideoTorchTensor, VideoUrl +from docarray.typing import NdArray, VideoNdArray, VideoTorchTensor, VideoUrl from tests import TOYDATA_DIR LOCAL_VIDEO_FILE = str(TOYDATA_DIR / 'mov_bbb.mp4') @@ -27,7 +27,7 @@ def test_load_with_only_keyframes_false(file_url): assert isinstance(tensor, VideoNdArray) assert isinstance(indices, np.ndarray) - assert isinstance(indices, VideoNdArray) + assert isinstance(indices, NdArray) @pytest.mark.slow From 091e79ae88314220d9d54a842029adc387750c13 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 11 Jan 2023 14:11:29 +0100 Subject: [PATCH 11/26] fix: video load and save and add docstrings Signed-off-by: anna-charlotte --- .../tensor/video/abstract_video_tensor.py | 46 ++++++++++++-- docarray/typing/url/audio_url.py | 2 +- docarray/typing/url/video_url.py | 62 +++++++++++++++++-- .../predefined_document/test_video.py | 56 ----------------- .../units/typing/tensor/test_video_tensor.py | 36 +++++++++-- tests/units/typing/url/test_video_url.py | 23 ++++--- 6 files changed, 144 insertions(+), 81 deletions(-) diff --git a/docarray/typing/tensor/video/abstract_video_tensor.py b/docarray/typing/tensor/video/abstract_video_tensor.py index b5d8a79b26c..aa3b0c4951c 100644 --- 
a/docarray/typing/tensor/video/abstract_video_tensor.py +++ b/docarray/typing/tensor/video/abstract_video_tensor.py @@ -13,9 +13,9 @@ class AbstractVideoTensor(AbstractTensor, ABC): def save_to_mp4_file( self: 'T', file_path: Union[str, BinaryIO], - audio_tensor: Optional[AudioTensor] = None, - video_frame_rate: int = 30, + video_frame_rate: int = 24, video_codec: str = 'h264', + audio_tensor: Optional[AudioTensor] = None, audio_frame_rate: int = 48000, audio_codec: str = 'aac', audio_format: str = 'fltp', @@ -25,8 +25,40 @@ def save_to_mp4_file( :param file_path: path to a .mp4 file. If file is a string, open the file by that name, otherwise treat it as a file-like object. - :param video_frame_rate: frames per second. - :param video_codec: the name of a decoder/encoder. + :param video_frame_rate: video frames per second. + :param video_codec: the name of a video decoder/encoder. + :param audio_tensor: AudioTensor that should be added as soundtrack. + :param audio_frame_rate: audio frames per second. + :param audio_codec: the name of an audio decoder/encoder. + :param audio_format: the name of one of the audio formats supported by PyAV, + such as 'flt', 'fltp', 's16' or 's16p'. + + EXAMPLE USAGE + + .. 
code-block:: python + import numpy as np + + from docarray import BaseDocument + from docarray.typing.tensor.audio.audio_tensor import AudioTensor + from docarray.typing.tensor.video.video_tensor import VideoTensor + + + class MyDoc(BaseDocument): + video_tensor: VideoTensor + audio_tensor: AudioTensor + + + doc = MyDoc( + video_tensor=np.random.randint(low=0, high=256, size=(10, 200, 300, 3)), + audio_tensor=np.random.randn(100, 1, 1024).astype("float32"), + ) + + doc.video_tensor.save_to_mp4_file( + file_path="toydata/mp_.mp4", + audio_tensor=doc.audio_tensor, + audio_format="flt", + ) + """ import av @@ -51,15 +83,17 @@ def save_to_mp4_file( array=audio, format=audio_format, layout=audio_layout ) frame.rate = audio_frame_rate + frame.pts = audio.shape[-1] * i for packet in stream_audio.encode(frame): container.mux(packet) + for packet in stream_audio.encode(None): + container.mux(packet) + for vid in video_tensor: frame = av.VideoFrame.from_ndarray(vid, format='rgb24') for packet in stream_video.encode(frame): container.mux(packet) - for packet in stream_audio.encode(None): - container.mux(packet) for packet in stream_video.encode(None): container.mux(packet) diff --git a/docarray/typing/url/audio_url.py b/docarray/typing/url/audio_url.py index 6e9e25a7e7e..1646b4eb0e0 100644 --- a/docarray/typing/url/audio_url.py +++ b/docarray/typing/url/audio_url.py @@ -62,7 +62,7 @@ def load(self: T, dtype: str = 'float32') -> AudioNdArray: .. 
code-block:: python - from docarray import Document + from docarray import BaseDocument import numpy as np from docarray.typing import AudioUrl diff --git a/docarray/typing/url/video_url.py b/docarray/typing/url/video_url.py index b84e79360ff..c660abda121 100644 --- a/docarray/typing/url/video_url.py +++ b/docarray/typing/url/video_url.py @@ -51,20 +51,69 @@ def validate( return cls(str(url), scheme=None) def load( - self: T, only_keyframes: bool = False, audio_format: str = 'fltp', **kwargs + self: T, only_keyframes: bool = False, **kwargs ) -> Union[VideoNdArray, Tuple[AudioNdArray, VideoNdArray, NdArray]]: """ Load the data from the url into a VideoNdArray or Tuple of AudioNdArray, VideoNdArray and NdArray. - :param only_keyframes: if True keep only the keyframes, if False keep all frames - and store the indices of the keyframes in :attr:`.tags` + :param only_keyframes: if True keep only the keyframes, if False return all + frames, key frame indices and audio. :param kwargs: supports all keyword arguments that are being supported by av.open() as described in: https://pyav.org/docs/stable/api/_globals.html?highlight=open#av.open + :return: AudioNdArray representing the audio content, VideoNdArray representing the images of the video, NdArray of key frame indices if only_keyframe False, else only VideoNdArray representing the keyframes. + + + EXAMPLE USAGE + + .. 
code-block:: python + + from typing import Optional + + from docarray import BaseDocument + + from docarray.typing import VideoUrl, VideoNdArray, AudioNdArray, NdArray + + + class MyDoc(BaseDocument): + video_url: VideoUrl + video: Optional[VideoNdArray] + audio: Optional[AudioNdArray] + key_frame_indices: Optional[NdArray] + + + doc = MyDoc(video_url='toydata/mov_bbb.mp4') + doc.audio, doc.video, doc.key_frame_indices = doc.video_url.load() + + assert isinstance(doc.video, VideoNdArray) + assert isinstance(doc.audio, AudioNdArray) + assert isinstance(doc.key_frame_indices, NdArray) + + You can load only the key frames: + + .. code-block:: python + + from typing import Optional + + from docarray import BaseDocument + + from docarray.typing import VideoUrl, VideoNdArray + + + class MyDoc(BaseDocument): + video_url: VideoUrl + video_key_frames: Optional[VideoNdArray] + + + doc = MyDoc(video_url='toydata/mov_bbb.mp4') + doc.video_key_frames = doc.video_url.load(only_keyframes=True) + + assert isinstance(doc.video_key_frames, VideoNdArray) + """ import av @@ -79,7 +128,7 @@ def load( for frame in container.decode(): if type(frame) == av.audio.frame.AudioFrame: - audio_frames.append(frame.to_ndarray(format=audio_format)) + audio_frames.append(frame.to_ndarray()) elif type(frame) == av.video.frame.VideoFrame: video_frames.append(frame.to_ndarray(format='rgb24')) @@ -92,6 +141,9 @@ def load( if only_keyframes: return video else: - audio = parse_obj_as(AudioNdArray, np.stack(audio_frames)) + if len(audio_frames) == 0: + audio = parse_obj_as(AudioNdArray, np.array(audio_frames)) + else: + audio = parse_obj_as(AudioNdArray, np.stack(audio_frames)) indices = parse_obj_as(NdArray, keyframe_indices) return audio, video, indices diff --git a/tests/integrations/predefined_document/test_video.py b/tests/integrations/predefined_document/test_video.py index 719f0dc2295..09e9795fce6 100644 --- a/tests/integrations/predefined_document/test_video.py +++ 
b/tests/integrations/predefined_document/test_video.py @@ -1,6 +1,3 @@ -import os - -import numpy as np import pytest from docarray import Video @@ -21,56 +18,3 @@ def test_video(file_url): assert isinstance(vid.audio_tensor, AudioNdArray) assert isinstance(vid.video_tensor, VideoNdArray) assert isinstance(vid.key_frame_indices, NdArray) - - -@pytest.mark.slow -@pytest.mark.internet -@pytest.mark.parametrize('file_url', [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE]) -def test_save_video_ndarray(file_url, tmpdir): - tmp_file = str(TOYDATA_DIR / 'tmp.mp4') - - video_1 = Video(url=file_url) - assert video_1.url == file_url - - audio_tensor, video_1.video_tensor, _ = video_1.url.load() - assert isinstance(video_1.video_tensor, np.ndarray) - assert isinstance(video_1.video_tensor, VideoNdArray) - - # from PIL import Image - # Image.fromarray(video_1.tensor[0]).show() - - video_1.video_tensor.save_to_mp4_file(file_path=tmp_file, audio_tensor=audio_tensor) - assert os.path.isfile(tmp_file) - print(f"\nvideo_1.tensor[0][:2] = {video_1.video_tensor[0][:2]}") - # - # video_2 = Video(url=tmp_file) - # video_2.tensor, _ = video_2.url.load() - # video_2.tensor.save_to_file(str(tmpdir / 'tmp_2.mp4')) - # - # video_3 = Video(url=str(tmpdir / f'tmp.mp4')) - # video_3.tensor, _ = video_3.url.load() - # video_3.tensor.save_to_file(str(tmpdir / f'tmp_3.mp4')) - # - # video_4 = Video(url=str(tmpdir / f'tmp.mp4')) - # video_4.tensor, _ = video_4.url.load() - # video_4.tensor.save_to_file(str(tmpdir / f'tmp_4.mp4')) - # - # video_5 = Video(url=str(tmpdir / f'tmp.mp4')) - # video_5.tensor, _ = video_5.url.load() - # video_5.tensor.save_to_file(str(tmpdir / f'tmp_5.mp4')) - # - # video_6 = Video(url=str(tmpdir / f'tmp.mp4')) - # video_6.tensor, _ = video_6.url.load() - # video_6.tensor.save_to_file(str(tmpdir / f'tmp_6.mp4')) - # - # print(f"video_2.tensor[0][:2] = {video_2.tensor[0][:2]}") - # print(f"video_3.tensor[0][:2] = {video_3.tensor[0][:2]}") - # print(f"video_4.tensor[0][:2] = 
{video_3.tensor[0][:2]}") - # print(f"video_5.tensor[0][:2] = {video_3.tensor[0][:2]}") - # print(f"video_6.tensor[0][:2] = {video_3.tensor[0][:2]}") - # - # # Image.fromarray(video_1.tensor[0]).show() - # assert isinstance(video_1.tensor, np.ndarray) - # assert isinstance(video_1.tensor, VideoNdArray) - # # assert video_1.tensor.shape == video_2.tensor.shape - # # assert np.allclose(video_1.tensor, video_2.tensor) diff --git a/tests/units/typing/tensor/test_video_tensor.py b/tests/units/typing/tensor/test_video_tensor.py index 08b6d5847d7..99ec0454fa1 100644 --- a/tests/units/typing/tensor/test_video_tensor.py +++ b/tests/units/typing/tensor/test_video_tensor.py @@ -6,7 +6,12 @@ from pydantic.tools import parse_obj_as from docarray import BaseDocument -from docarray.typing import VideoNdArray, VideoTorchTensor +from docarray.typing import ( + AudioNdArray, + AudioTorchTensor, + VideoNdArray, + VideoTorchTensor, +) @pytest.mark.parametrize( @@ -74,14 +79,33 @@ def test_proto_tensor(cls_tensor, tensor, proto_key): @pytest.mark.parametrize( - 'cls_tensor,tensor', + 'video_tensor', [ - (VideoTorchTensor, torch.zeros(1, 224, 224, 3)), - (VideoNdArray, np.zeros((1, 224, 224, 3))), + parse_obj_as(VideoTorchTensor, torch.zeros(1, 224, 224, 3)), + parse_obj_as(VideoNdArray, np.zeros((1, 224, 224, 3))), ], ) -def test_save_video_tensor_to_file(cls_tensor, tensor, tmpdir): +def test_save_video_tensor_to_file(video_tensor, tmpdir): tmp_file = str(tmpdir / 'tmp.mp4') - video_tensor = parse_obj_as(cls_tensor, tensor) video_tensor.save_to_mp4_file(tmp_file) assert os.path.isfile(tmp_file) + + +@pytest.mark.parametrize( + 'video_tensor', + [ + parse_obj_as(VideoTorchTensor, torch.zeros(1, 224, 224, 3)), + parse_obj_as(VideoNdArray, np.zeros((1, 224, 224, 3))), + ], +) +@pytest.mark.parametrize( + 'audio_tensor', + [ + parse_obj_as(AudioTorchTensor, torch.randn(100, 1, 1024).to(torch.float32)), + parse_obj_as(AudioNdArray, np.random.randn(100, 1, 1024).astype('float32')), + ], +) 
+def test_save_video_tensor_to_file_including_audio(video_tensor, audio_tensor, tmpdir): + tmp_file = str(tmpdir / 'tmp.mp4') + video_tensor.save_to_mp4_file(tmp_file, audio_tensor=audio_tensor) + assert os.path.isfile(tmp_file) diff --git a/tests/units/typing/url/test_video_url.py b/tests/units/typing/url/test_video_url.py index b59622390c8..40882b11221 100644 --- a/tests/units/typing/url/test_video_url.py +++ b/tests/units/typing/url/test_video_url.py @@ -6,7 +6,13 @@ from docarray import BaseDocument from docarray.document.io.json import orjson_dumps -from docarray.typing import NdArray, VideoNdArray, VideoTorchTensor, VideoUrl +from docarray.typing import ( + AudioNdArray, + NdArray, + VideoNdArray, + VideoTorchTensor, + VideoUrl, +) from tests import TOYDATA_DIR LOCAL_VIDEO_FILE = str(TOYDATA_DIR / 'mov_bbb.mp4') @@ -21,10 +27,13 @@ ) def test_load_with_only_keyframes_false(file_url): url = parse_obj_as(VideoUrl, file_url) - tensor, indices = url.load(only_keyframes=False) + audio, video, indices = url.load(only_keyframes=False) + + assert isinstance(audio, np.ndarray) + assert isinstance(audio, AudioNdArray) - assert isinstance(tensor, np.ndarray) - assert isinstance(tensor, VideoNdArray) + assert isinstance(video, np.ndarray) + assert isinstance(video, VideoNdArray) assert isinstance(indices, np.ndarray) assert isinstance(indices, NdArray) @@ -38,10 +47,10 @@ def test_load_with_only_keyframes_false(file_url): ) def test_load_with_only_keyframes_true(file_url): url = parse_obj_as(VideoUrl, file_url) - tensor = url.load(only_keyframes=True) + key_frames = url.load(only_keyframes=True) - assert isinstance(tensor, np.ndarray) - assert isinstance(tensor, VideoNdArray) + assert isinstance(key_frames, np.ndarray) + assert isinstance(key_frames, VideoNdArray) @pytest.mark.slow From e4106a8dcec6f4353f1c49b85945b6b674b91519 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 11 Jan 2023 15:29:45 +0100 Subject: [PATCH 12/26] fix: fix some imports after merging 
Signed-off-by: anna-charlotte --- docarray/documents/__init__.py | 3 ++- tests/integrations/predefined_document/test_video.py | 2 +- tests/units/typing/url/test_video_url.py | 7 ++++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/docarray/documents/__init__.py b/docarray/documents/__init__.py index 31f2313de4b..052992fc1f6 100644 --- a/docarray/documents/__init__.py +++ b/docarray/documents/__init__.py @@ -3,5 +3,6 @@ from docarray.documents.mesh import Mesh3D from docarray.documents.point_cloud import PointCloud3D from docarray.documents.text import Text +from docarray.documents.video import Video -__all__ = ['Text', 'Image', 'Audio', 'Mesh3D', 'PointCloud3D'] +__all__ = ['Text', 'Image', 'Audio', 'Mesh3D', 'PointCloud3D', 'Video'] diff --git a/tests/integrations/predefined_document/test_video.py b/tests/integrations/predefined_document/test_video.py index 09e9795fce6..2522ba2801e 100644 --- a/tests/integrations/predefined_document/test_video.py +++ b/tests/integrations/predefined_document/test_video.py @@ -1,6 +1,6 @@ import pytest -from docarray import Video +from docarray.documents import Video from docarray.typing import AudioNdArray, NdArray, VideoNdArray from tests import TOYDATA_DIR diff --git a/tests/units/typing/url/test_video_url.py b/tests/units/typing/url/test_video_url.py index 40882b11221..b468160b6ee 100644 --- a/tests/units/typing/url/test_video_url.py +++ b/tests/units/typing/url/test_video_url.py @@ -2,10 +2,11 @@ import numpy as np import pytest +import torch from pydantic.tools import parse_obj_as, schema_json_of from docarray import BaseDocument -from docarray.document.io.json import orjson_dumps +from docarray.base_document.io.json import orjson_dumps from docarray.typing import ( AudioNdArray, NdArray, @@ -67,8 +68,8 @@ class MyVideoDoc(BaseDocument): doc = MyVideoDoc(video_url=file_url) doc.tensor = doc.video_url.load(only_keyframes=True) - assert isinstance(doc.tensor, np.ndarray) - assert isinstance(doc.tensor, 
VideoNdArray) + assert isinstance(doc.tensor, torch.Tensor) + assert isinstance(doc.tensor, VideoTorchTensor) def test_json_schema(): From 23ee9308e972b4bc37c5a63c23c0c9e65e880c6d Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 11 Jan 2023 16:30:09 +0100 Subject: [PATCH 13/26] docs: add doc strings and fix example urls Signed-off-by: anna-charlotte --- docarray/documents/audio.py | 8 +++--- docarray/documents/video.py | 51 +++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/docarray/documents/audio.py b/docarray/documents/audio.py index c543a0778fb..776020bc964 100644 --- a/docarray/documents/audio.py +++ b/docarray/documents/audio.py @@ -24,7 +24,7 @@ class Audio(BaseDocument): # use it directly audio = Audio( - url='https://github.com/docarray/docarray/tree/feat-add-audio-v2/tests/toydata/hello.wav?raw=true' + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/hello.wav?raw=true' ) audio.tensor = audio.url.load() model = MyEmbeddingModel() @@ -43,12 +43,12 @@ class MyAudio(Audio): audio = MyAudio( - url='https://github.com/docarray/docarray/tree/feat-add-audio-v2/tests/toydata/hello.wav?raw=true' + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/hello.wav?raw=true' ) audio.tensor = audio.url.load() model = MyEmbeddingModel() audio.embedding = model(audio.tensor) - audio.name = 'my first audio' + audio.name = Text(text='my first audio') You can use this Document for composition: @@ -66,7 +66,7 @@ class MultiModalDoc(Document): mmdoc = MultiModalDoc( audio=Audio( - url='https://github.com/docarray/docarray/tree/feat-add-audio-v2/tests/toydata/hello.wav?raw=true' + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/hello.wav?raw=true' ), text=Text(text='hello world, how are you doing?'), ) diff --git a/docarray/documents/video.py b/docarray/documents/video.py index dffac71efd6..e085e1d43bd 100644 --- a/docarray/documents/video.py 
+++ b/docarray/documents/video.py @@ -20,10 +20,61 @@ class Video(BaseDocument): You can use this Document directly: + .. code-block:: python + + from docarray.documents import Video + + # use it directly + vid = Video( + url='https://github.com/docarray/docarray/tree/feat-add-video-v2/tests/toydata/mov_bbb.mp4?raw=true' + ) + vid.audio_tensor, vid.video_tensor, vid.key_frame_indices = vid.url.load() + model = MyEmbeddingModel() + vid.embedding = model(vid.video_tensor) + You can extend this Document: + .. code-block:: python + + from typing import Optional + + from docarray.documents import Text, Video + + + # extend it + class MyVideo(Video): + name: Optional[Text] + + + video = MyVideo( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/mov_bbb.mp4?raw=true' + ) + video.video_tensor = video.url.load(only_keyframes=True) + model = MyEmbeddingModel() + video.embedding = model(video.video_tensor) + video.name = Text(text='my first video') + You can use this Document for composition: + .. 
code-block:: python + + from docarray import BaseDocument + from docarray.documents import Text, Video + + + # compose it + class MultiModalDoc(BaseDocument): + video: Video + text: Text + + + mmdoc = MultiModalDoc( + video=Video( + url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/mov_bbb.mp4?raw=true' + ), + text=Text(text='hello world, how are you doing?'), + ) + mmdoc.video.video_tensor = mmdoc.video.url.load(only_keyframes=True) """ url: Optional[VideoUrl] From 7ab8dbd41af0d7ef7bb1a8b89a07d5b2e81af4ce Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 11 Jan 2023 16:49:01 +0100 Subject: [PATCH 14/26] docs: small fixes in docs Signed-off-by: anna-charlotte --- docarray/__init__.py | 5 ++++- docarray/computation/abstract_comp_backend.py | 7 ++++--- docarray/computation/numpy_backend.py | 2 +- docarray/computation/torch_backend.py | 2 +- docarray/documents/video.py | 3 ++- docarray/typing/tensor/video/abstract_video_tensor.py | 4 ++-- docarray/typing/url/video_url.py | 8 ++++++-- 7 files changed, 20 insertions(+), 11 deletions(-) diff --git a/docarray/__init__.py b/docarray/__init__.py index ae8a65b853e..54eb3a3bdf7 100644 --- a/docarray/__init__.py +++ b/docarray/__init__.py @@ -3,4 +3,7 @@ from docarray.array.array import DocumentArray from docarray.base_document.document import BaseDocument -__all__ = ['BaseDocument', 'DocumentArray'] +__all__ = [ + 'BaseDocument', + 'DocumentArray', +] diff --git a/docarray/computation/abstract_comp_backend.py b/docarray/computation/abstract_comp_backend.py index 97660398e4f..4a213777b11 100644 --- a/docarray/computation/abstract_comp_backend.py +++ b/docarray/computation/abstract_comp_backend.py @@ -1,8 +1,9 @@ import typing from abc import ABC, abstractmethod -from typing import List, Optional, Tuple, TypeVar, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, TypeVar, Union -import numpy as np +if TYPE_CHECKING: + import numpy as np # In practice all of the below will be the 
same type TTensor = TypeVar('TTensor') @@ -38,7 +39,7 @@ def n_dim(array: 'TTensor') -> int: @staticmethod @abstractmethod - def to_numpy(array: 'TTensor') -> np.ndarray: + def to_numpy(array: 'TTensor') -> 'np.ndarray': """ Convert array to np.ndarray. """ diff --git a/docarray/computation/numpy_backend.py b/docarray/computation/numpy_backend.py index 05e1186f6d3..c8b3745bbdb 100644 --- a/docarray/computation/numpy_backend.py +++ b/docarray/computation/numpy_backend.py @@ -65,7 +65,7 @@ def n_dim(array: 'np.ndarray') -> int: return array.ndim @staticmethod - def to_numpy(array: 'np.ndarray') -> np.ndarray: + def to_numpy(array: 'np.ndarray') -> 'np.ndarray': return array @staticmethod diff --git a/docarray/computation/torch_backend.py b/docarray/computation/torch_backend.py index 4e9fcf59b7e..fe2fe4a5266 100644 --- a/docarray/computation/torch_backend.py +++ b/docarray/computation/torch_backend.py @@ -66,7 +66,7 @@ def n_dim(array: 'torch.Tensor') -> int: return array.ndim @staticmethod - def to_numpy(array: 'torch.Tensor') -> np.ndarray: + def to_numpy(array: 'torch.Tensor') -> 'np.ndarray': return array.cpu().detach().numpy() @staticmethod diff --git a/docarray/documents/video.py b/docarray/documents/video.py index e085e1d43bd..99ec9733d6e 100644 --- a/docarray/documents/video.py +++ b/docarray/documents/video.py @@ -14,7 +14,8 @@ class Video(BaseDocument): Document for handling video. The Video Document can contain a VideoUrl (`Video.url`), an AudioTensor (`Video.audio_tensor`), a VideoTensor (`Video.video_tensor`), an AnyTensor - ('Video.key_frame_indices), and an AnyEmbedding (`Video.embedding`). + representing the indices of the video's key frames (`Video.key_frame_indices`), + and an AnyEmbedding (`Video.embedding`). 
EXAMPLE USAGE: diff --git a/docarray/typing/tensor/video/abstract_video_tensor.py b/docarray/typing/tensor/video/abstract_video_tensor.py index aa3b0c4951c..36a2f53a413 100644 --- a/docarray/typing/tensor/video/abstract_video_tensor.py +++ b/docarray/typing/tensor/video/abstract_video_tensor.py @@ -13,9 +13,9 @@ class AbstractVideoTensor(AbstractTensor, ABC): def save_to_mp4_file( self: 'T', file_path: Union[str, BinaryIO], + audio_tensor: Optional[AudioTensor] = None, video_frame_rate: int = 24, video_codec: str = 'h264', - audio_tensor: Optional[AudioTensor] = None, audio_frame_rate: int = 48000, audio_codec: str = 'aac', audio_format: str = 'fltp', @@ -25,9 +25,9 @@ def save_to_mp4_file( :param file_path: path to a .mp4 file. If file is a string, open the file by that name, otherwise treat it as a file-like object. + :param audio_tensor: AudioTensor containing the video's soundtrack. :param video_frame_rate: video frames per second. :param video_codec: the name of a video decoder/encoder. - :param audio_tensor: AudioTensor that should be added as soundtrack. :param audio_frame_rate: audio frames per second. :param audio_codec: the name of an audio decoder/encoder. 
:param audio_format: the name of one of the audio formats supported by PyAV, diff --git a/docarray/typing/url/video_url.py b/docarray/typing/url/video_url.py index c660abda121..96b085ab43f 100644 --- a/docarray/typing/url/video_url.py +++ b/docarray/typing/url/video_url.py @@ -86,7 +86,9 @@ class MyDoc(BaseDocument): key_frame_indices: Optional[NdArray] - doc = MyDoc(video_url='toydata/mov_bbb.mp4') + doc = MyDoc( + video_url='https://github.com/docarray/docarray/tree/feat-add-video-v2/tests/toydata/mov_bbb.mp4?raw=true' + ) doc.audio, doc.video, doc.key_frame_indices = doc.video_url.load() assert isinstance(doc.video, VideoNdArray) @@ -109,7 +111,9 @@ class MyDoc(BaseDocument): video_key_frames: Optional[VideoNdArray] - doc = MyDoc(video_url='toydata/mov_bbb.mp4') + doc = MyDoc( + video_url='https://github.com/docarray/docarray/tree/feat-add-video-v2/tests/toydata/mov_bbb.mp4?raw=true' + ) doc.video_key_frames = doc.video_url.load(only_keyframes=True) assert isinstance(doc.video_key_frames, VideoNdArray) From 5295dd1f783490001527f93d02e1adf39fac2ffc Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Wed, 11 Jan 2023 17:44:48 +0100 Subject: [PATCH 15/26] refactor: rename save to mp4 file to save Signed-off-by: anna-charlotte --- docarray/typing/tensor/video/abstract_video_tensor.py | 4 ++-- tests/units/typing/tensor/test_video_tensor.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docarray/typing/tensor/video/abstract_video_tensor.py b/docarray/typing/tensor/video/abstract_video_tensor.py index 36a2f53a413..8916902fb74 100644 --- a/docarray/typing/tensor/video/abstract_video_tensor.py +++ b/docarray/typing/tensor/video/abstract_video_tensor.py @@ -10,7 +10,7 @@ class AbstractVideoTensor(AbstractTensor, ABC): - def save_to_mp4_file( + def save( self: 'T', file_path: Union[str, BinaryIO], audio_tensor: Optional[AudioTensor] = None, @@ -53,7 +53,7 @@ class MyDoc(BaseDocument): audio_tensor=np.random.randn(100, 1, 1024).astype("float32"), ) - 
doc.video_tensor.save_to_mp4_file( + doc.video_tensor.save( file_path="toydata/mp_.mp4", audio_tensor=doc.audio_tensor, audio_format="flt", diff --git a/tests/units/typing/tensor/test_video_tensor.py b/tests/units/typing/tensor/test_video_tensor.py index 99ec0454fa1..214fcdf6e12 100644 --- a/tests/units/typing/tensor/test_video_tensor.py +++ b/tests/units/typing/tensor/test_video_tensor.py @@ -87,7 +87,7 @@ def test_proto_tensor(cls_tensor, tensor, proto_key): ) def test_save_video_tensor_to_file(video_tensor, tmpdir): tmp_file = str(tmpdir / 'tmp.mp4') - video_tensor.save_to_mp4_file(tmp_file) + video_tensor.save(tmp_file) assert os.path.isfile(tmp_file) @@ -107,5 +107,5 @@ def test_save_video_tensor_to_file(video_tensor, tmpdir): ) def test_save_video_tensor_to_file_including_audio(video_tensor, audio_tensor, tmpdir): tmp_file = str(tmpdir / 'tmp.mp4') - video_tensor.save_to_mp4_file(tmp_file, audio_tensor=audio_tensor) + video_tensor.save(tmp_file, audio_tensor=audio_tensor) assert os.path.isfile(tmp_file) From b3f2ccb2d00bad3da52c3a94168be54697227dc5 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Mon, 16 Jan 2023 13:44:51 +0100 Subject: [PATCH 16/26] feat: add shape method to comp backend Signed-off-by: anna-charlotte --- docarray/computation/abstract_comp_backend.py | 8 ++++++ docarray/computation/numpy_backend.py | 4 +++ docarray/computation/torch_backend.py | 4 +++ .../numpy_backend/test_basics.py | 25 ++++++++++++++++++ .../torch_backend/test_basics.py | 26 +++++++++++++++++++ 5 files changed, 67 insertions(+) diff --git a/docarray/computation/abstract_comp_backend.py b/docarray/computation/abstract_comp_backend.py index 4a213777b11..f5cac384e47 100644 --- a/docarray/computation/abstract_comp_backend.py +++ b/docarray/computation/abstract_comp_backend.py @@ -37,6 +37,14 @@ def n_dim(array: 'TTensor') -> int: """ ... + @staticmethod + @abstractmethod + def shape(array: 'TTensor') -> Tuple: + """ + Get the shape of the array. + """ + ... 
+ @staticmethod @abstractmethod def to_numpy(array: 'TTensor') -> 'np.ndarray': diff --git a/docarray/computation/numpy_backend.py b/docarray/computation/numpy_backend.py index c8b3745bbdb..b02c050aee5 100644 --- a/docarray/computation/numpy_backend.py +++ b/docarray/computation/numpy_backend.py @@ -64,6 +64,10 @@ def to_device( def n_dim(array: 'np.ndarray') -> int: return array.ndim + @staticmethod + def shape(array: 'np.ndarray') -> Tuple: + return array.shape + @staticmethod def to_numpy(array: 'np.ndarray') -> 'np.ndarray': return array diff --git a/docarray/computation/torch_backend.py b/docarray/computation/torch_backend.py index fe2fe4a5266..176f887d6ae 100644 --- a/docarray/computation/torch_backend.py +++ b/docarray/computation/torch_backend.py @@ -65,6 +65,10 @@ def to_device( def n_dim(array: 'torch.Tensor') -> int: return array.ndim + @staticmethod + def shape(array: 'torch.Tensor') -> Tuple: + return array.size() + @staticmethod def to_numpy(array: 'torch.Tensor') -> 'np.ndarray': return array.cpu().detach().numpy() diff --git a/tests/units/computation_backends/numpy_backend/test_basics.py b/tests/units/computation_backends/numpy_backend/test_basics.py index 1873889f3a5..29ed9ec001a 100644 --- a/tests/units/computation_backends/numpy_backend/test_basics.py +++ b/tests/units/computation_backends/numpy_backend/test_basics.py @@ -7,3 +7,28 @@ def test_to_device(): with pytest.raises(NotImplementedError): NumpyCompBackend.to_device(np.random.rand(10, 3), 'meta') + + +@pytest.mark.parametrize( + 'array,result', + [ + (np.zeros((5)), 1), + (np.zeros((1, 5)), 2), + (np.zeros((5, 5)), 2), + (np.zeros(()), 0), + ], +) +def test_n_dim(array, result): + assert NumpyCompBackend.n_dim(array) == result + + +@pytest.mark.parametrize( + 'array,result', + [ + (np.zeros((10,)), (10,)), + (np.zeros((5, 5)), (5, 5)), + (np.zeros(()), ()), + ], +) +def test_shape(array, result): + assert NumpyCompBackend.shape(array) == result diff --git 
a/tests/units/computation_backends/torch_backend/test_basics.py b/tests/units/computation_backends/torch_backend/test_basics.py index 14f337df429..a98cca72a84 100644 --- a/tests/units/computation_backends/torch_backend/test_basics.py +++ b/tests/units/computation_backends/torch_backend/test_basics.py @@ -1,3 +1,4 @@ +import pytest import torch from docarray.computation.torch_backend import TorchCompBackend @@ -8,3 +9,28 @@ def test_to_device(): assert t.device == torch.device('cpu') t = TorchCompBackend.to_device(t, 'meta') assert t.device == torch.device('meta') + + +@pytest.mark.parametrize( + 'array,result', + [ + (torch.zeros((5)), 1), + (torch.zeros((1, 5)), 2), + (torch.zeros((5, 5)), 2), + (torch.zeros(()), 0), + ], +) +def test_n_dim(array, result): + assert TorchCompBackend.n_dim(array) == result + + +@pytest.mark.parametrize( + 'array,result', + [ + (torch.zeros((10,)), (10,)), + (torch.zeros((5, 5)), (5, 5)), + (torch.zeros(()), ()), + ], +) +def test_shape(array, result): + assert TorchCompBackend.shape(array) == result From 20ecf2cd59f86651ecdf26a660389c6b3573115c Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Mon, 16 Jan 2023 13:48:01 +0100 Subject: [PATCH 17/26] refactor: move validate shape to video tensor mixin Signed-off-by: anna-charlotte --- docarray/typing/tensor/video/video_ndarray.py | 14 +++------ ..._video_tensor.py => video_tensor_mixin.py} | 29 +++++++++++++++---- .../typing/tensor/video/video_torch_tensor.py | 12 ++------ 3 files changed, 31 insertions(+), 24 deletions(-) rename docarray/typing/tensor/video/{abstract_video_tensor.py => video_tensor_mixin.py} (81%) diff --git a/docarray/typing/tensor/video/video_ndarray.py b/docarray/typing/tensor/video/video_ndarray.py index 10a608ab743..97c07afbdca 100644 --- a/docarray/typing/tensor/video/video_ndarray.py +++ b/docarray/typing/tensor/video/video_ndarray.py @@ -3,7 +3,7 @@ import numpy as np from docarray.typing.tensor.ndarray import NdArray -from 
docarray.typing.tensor.video.abstract_video_tensor import AbstractVideoTensor +from docarray.typing.tensor.video.video_tensor_mixin import VideoTensorMixin T = TypeVar('T', bound='VideoNdArray') @@ -12,7 +12,7 @@ from pydantic.fields import ModelField -class VideoNdArray(AbstractVideoTensor, NdArray): +class VideoNdArray(NdArray, VideoTensorMixin): """ Subclass of NdArray, to represent a video tensor. Adds video-specific features to the tensor. @@ -30,11 +30,5 @@ def validate( field: 'ModelField', config: 'BaseConfig', ) -> T: - array = super().validate(value=value, field=field, config=config) - if array.ndim not in [3, 4] or array.shape[-1] != 3: - raise ValueError( - f'Expects tensor with 3 or 4 dimensions and the last dimension equal' - f' to 3, but received {array.shape} in {array.dtype}' - ) - else: - return array + tensor = super().validate(value=value, field=field, config=config) + return VideoTensorMixin.validate_shape(cls, value=tensor) diff --git a/docarray/typing/tensor/video/abstract_video_tensor.py b/docarray/typing/tensor/video/video_tensor_mixin.py similarity index 81% rename from docarray/typing/tensor/video/abstract_video_tensor.py rename to docarray/typing/tensor/video/video_tensor_mixin.py index 8916902fb74..01d5184f824 100644 --- a/docarray/typing/tensor/video/abstract_video_tensor.py +++ b/docarray/typing/tensor/video/video_tensor_mixin.py @@ -1,15 +1,34 @@ -from abc import ABC -from typing import BinaryIO, Optional, TypeVar, Union +from typing import TYPE_CHECKING, BinaryIO, Optional, Type, TypeVar, Union import numpy as np -from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.typing.tensor.audio.audio_tensor import AudioTensor -T = TypeVar('T', bound='AbstractVideoTensor') +if TYPE_CHECKING: + from docarray.typing import VideoNdArray, VideoTorchTensor -class AbstractVideoTensor(AbstractTensor, ABC): +T = TypeVar('T', bound='VideoTensorMixin') + + +class VideoTensorMixin: + @staticmethod + def validate_shape( + cls: 
Union[Type['VideoTorchTensor'], Type['VideoNdArray']], value: 'T' + ) -> 'T': + comp_backend = cls.get_comp_backend() + + if ( + comp_backend.n_dim(value) not in [3, 4] # type: ignore + or comp_backend.shape(value)[-1] != 3 # type: ignore + ): + raise ValueError( + f'Expects tensor with 3 or 4 dimensions and the last dimension equal ' + f'to 3, but received {comp_backend.shape(value)}.' # type: ignore + ) + else: + return value + def save( self: 'T', file_path: Union[str, BinaryIO], diff --git a/docarray/typing/tensor/video/video_torch_tensor.py b/docarray/typing/tensor/video/video_torch_tensor.py index 05f56bf792d..5e2953b4231 100644 --- a/docarray/typing/tensor/video/video_torch_tensor.py +++ b/docarray/typing/tensor/video/video_torch_tensor.py @@ -3,7 +3,7 @@ import numpy as np from docarray.typing.tensor.torch_tensor import TorchTensor, metaTorchAndNode -from docarray.typing.tensor.video.abstract_video_tensor import AbstractVideoTensor +from docarray.typing.tensor.video.video_tensor_mixin import VideoTensorMixin T = TypeVar('T', bound='VideoTorchTensor') @@ -12,7 +12,7 @@ from pydantic.fields import ModelField -class VideoTorchTensor(AbstractVideoTensor, TorchTensor, metaclass=metaTorchAndNode): +class VideoTorchTensor(TorchTensor, VideoTensorMixin, metaclass=metaTorchAndNode): """ Subclass of TorchTensor, to represent a video tensor. Adds video-specific features to the tensor. 
@@ -31,10 +31,4 @@ def validate( config: 'BaseConfig', ) -> T: tensor = super().validate(value=value, field=field, config=config) - if tensor.ndim not in [3, 4] or tensor.shape[-1] != 3: - raise ValueError( - f'Expects tensor with 3 or 4 dimensions and the last dimension equal ' - f'to 3, but received {tensor.shape} in {tensor.dtype}' - ) - else: - return tensor + return VideoTensorMixin.validate_shape(cls, value=tensor) From 711d1057331e9ef4ce02af23b21336953ea7411f Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Mon, 16 Jan 2023 14:52:14 +0100 Subject: [PATCH 18/26] refactor: extract private load and make separate methods for frames Signed-off-by: anna-charlotte --- docarray/typing/url/video_url.py | 113 ++++++++++++++--------- tests/units/typing/url/test_video_url.py | 10 +- 2 files changed, 75 insertions(+), 48 deletions(-) diff --git a/docarray/typing/url/video_url.py b/docarray/typing/url/video_url.py index 96b085ab43f..2013f22feb4 100644 --- a/docarray/typing/url/video_url.py +++ b/docarray/typing/url/video_url.py @@ -50,22 +50,65 @@ def validate( ) return cls(str(url), scheme=None) - def load( - self: T, only_keyframes: bool = False, **kwargs - ) -> Union[VideoNdArray, Tuple[AudioNdArray, VideoNdArray, NdArray]]: + def _load( + self: T, skip_type: str, **kwargs + ) -> Tuple[AudioNdArray, VideoNdArray, NdArray]: """ - Load the data from the url into a VideoNdArray or Tuple of AudioNdArray, - VideoNdArray and NdArray. + Load the data from the url into a Tuple of AudioNdArray, VideoNdArray and + NdArray. + + :param skip_type: determines what video frames to discard. + :param kwargs: supports all keyword arguments that are being supported by + av.open() as described in: + https://pyav.org/docs/stable/api/_globals.html?highlight=open#av.open + + :return: AudioNdArray representing the audio content, VideoNdArray representing + the images of the video, NdArray of the key frame indices. 
+ + """ + import av + + with av.open(self, **kwargs) as container: + stream = container.streams.video[0] + stream.codec_context.skip_frame = skip_type + + audio_frames = [] + video_frames = [] + keyframe_indices = [] + + for frame in container.decode( + video=0, audio=0 if skip_type != 'NONKEY' else [] + ): + if type(frame) == av.audio.frame.AudioFrame: + audio_frames.append(frame.to_ndarray()) + elif type(frame) == av.video.frame.VideoFrame: + video_frames.append(frame.to_ndarray(format='rgb24')) + + if frame.key_frame == 1: + curr_index = len(video_frames) + keyframe_indices.append(curr_index) + + if len(audio_frames) == 0: + audio = parse_obj_as(AudioNdArray, np.array(audio_frames)) + else: + audio = parse_obj_as(AudioNdArray, np.stack(audio_frames)) + + video = parse_obj_as(VideoNdArray, np.stack(video_frames)) + indices = parse_obj_as(NdArray, keyframe_indices) + + return audio, video, indices + + def load(self: T, **kwargs) -> Tuple[AudioNdArray, VideoNdArray, NdArray]: + """ + Load the data from the url into a Tuple of AudioNdArray, VideoNdArray and + NdArray. - :param only_keyframes: if True keep only the keyframes, if False return all - frames, key frame indices and audio. :param kwargs: supports all keyword arguments that are being supported by av.open() as described in: https://pyav.org/docs/stable/api/_globals.html?highlight=open#av.open :return: AudioNdArray representing the audio content, VideoNdArray representing - the images of the video, NdArray of key frame indices if only_keyframe - False, else only VideoNdArray representing the keyframes. + the images of the video, NdArray of the key frame indices. 
 EXAMPLE USAGE @@ -95,7 +138,21 @@ class MyDoc(BaseDocument): assert isinstance(doc.audio, AudioNdArray) assert isinstance(doc.key_frame_indices, NdArray) - You can load only the key frames: + """ + return self._load(skip_type='DEFAULT', **kwargs) + + def load_key_frames(self: T, **kwargs) -> VideoNdArray: + """ + Load the key frames of the video from the url into a VideoNdArray, + discarding all non-key frames. + + :param kwargs: supports all keyword arguments that are being supported by + av.open() as described in: + https://pyav.org/docs/stable/api/_globals.html?highlight=open#av.open + + :return: VideoNdArray representing the keyframes. + + EXAMPLE USAGE .. code-block:: python @@ -114,40 +171,10 @@ class MyDoc(BaseDocument): doc = MyDoc( video_url='https://github.com/docarray/docarray/tree/feat-add-video-v2/tests/toydata/mov_bbb.mp4?raw=true' ) - doc.video_key_frames = doc.video_url.load(only_keyframes=True) + doc.video_key_frames = doc.video_url.load_key_frames() assert isinstance(doc.video_key_frames, VideoNdArray) """ - import av - - with av.open(self, **kwargs) as container: - if only_keyframes: - stream = container.streams.video[0] - stream.codec_context.skip_frame = 'NONKEY' - - audio_frames = [] - video_frames = [] - keyframe_indices = [] - - for frame in container.decode(): - if type(frame) == av.audio.frame.AudioFrame: - audio_frames.append(frame.to_ndarray()) - elif type(frame) == av.video.frame.VideoFrame: - video_frames.append(frame.to_ndarray(format='rgb24')) - - if not only_keyframes and frame.key_frame == 1: - curr_index = len(video_frames) - keyframe_indices.append(curr_index) - - video = parse_obj_as(VideoNdArray, np.stack(video_frames)) - - if only_keyframes: - return video - else: - if len(audio_frames) == 0: - audio = parse_obj_as(AudioNdArray, np.array(audio_frames)) - else: - audio = parse_obj_as(AudioNdArray, np.stack(audio_frames)) - indices = parse_obj_as(NdArray, keyframe_indices) - return audio, video, indices + _, key_frames, _ =
self._load(skip_type='NONKEY', **kwargs) + return key_frames diff --git a/tests/units/typing/url/test_video_url.py b/tests/units/typing/url/test_video_url.py index b468160b6ee..02ae5119a59 100644 --- a/tests/units/typing/url/test_video_url.py +++ b/tests/units/typing/url/test_video_url.py @@ -26,9 +26,9 @@ 'file_url', [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE], ) -def test_load_with_only_keyframes_false(file_url): +def test_load(file_url): url = parse_obj_as(VideoUrl, file_url) - audio, video, indices = url.load(only_keyframes=False) + audio, video, indices = url.load() assert isinstance(audio, np.ndarray) assert isinstance(audio, AudioNdArray) @@ -46,9 +46,9 @@ def test_load_with_only_keyframes_false(file_url): 'file_url', [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE], ) -def test_load_with_only_keyframes_true(file_url): +def test_load_key_frames(file_url): url = parse_obj_as(VideoUrl, file_url) - key_frames = url.load(only_keyframes=True) + key_frames = url.load_key_frames() assert isinstance(key_frames, np.ndarray) assert isinstance(key_frames, VideoNdArray) @@ -66,7 +66,7 @@ class MyVideoDoc(BaseDocument): tensor: Optional[VideoTorchTensor] doc = MyVideoDoc(video_url=file_url) - doc.tensor = doc.video_url.load(only_keyframes=True) + doc.tensor = doc.video_url.load_key_frames() assert isinstance(doc.tensor, torch.Tensor) assert isinstance(doc.tensor, VideoTorchTensor) From 0c9c1fdba6065a5e4b08f5fc4d0042749f5a2a11 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Mon, 16 Jan 2023 15:16:50 +0100 Subject: [PATCH 19/26] fix: use torch shape instead of size method Signed-off-by: anna-charlotte --- docarray/computation/torch_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docarray/computation/torch_backend.py b/docarray/computation/torch_backend.py index 176f887d6ae..66ba4d592f3 100644 --- a/docarray/computation/torch_backend.py +++ b/docarray/computation/torch_backend.py @@ -67,7 +67,7 @@ def n_dim(array: 'torch.Tensor') -> int: @staticmethod def 
shape(array: 'torch.Tensor') -> Tuple: - return array.size() + return array.shape @staticmethod def to_numpy(array: 'torch.Tensor') -> 'np.ndarray': From e3a465ce11c76029fd8071e193edb78125770610 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Mon, 16 Jan 2023 17:05:47 +0100 Subject: [PATCH 20/26] fix: add typehint to shape in comp backend Signed-off-by: anna-charlotte --- docarray/computation/abstract_comp_backend.py | 2 +- docarray/computation/numpy_backend.py | 2 +- docarray/computation/torch_backend.py | 4 ++-- tests/units/computation_backends/numpy_backend/test_basics.py | 4 +++- tests/units/computation_backends/torch_backend/test_basics.py | 4 +++- 5 files changed, 10 insertions(+), 6 deletions(-) diff --git a/docarray/computation/abstract_comp_backend.py b/docarray/computation/abstract_comp_backend.py index f5cac384e47..0c015f75d6b 100644 --- a/docarray/computation/abstract_comp_backend.py +++ b/docarray/computation/abstract_comp_backend.py @@ -39,7 +39,7 @@ def n_dim(array: 'TTensor') -> int: @staticmethod @abstractmethod - def shape(array: 'TTensor') -> Tuple: + def shape(array: 'TTensor') -> Tuple[int, ...]: """ Get the shape of the array. 
""" diff --git a/docarray/computation/numpy_backend.py b/docarray/computation/numpy_backend.py index b02c050aee5..5fb3135b9f1 100644 --- a/docarray/computation/numpy_backend.py +++ b/docarray/computation/numpy_backend.py @@ -65,7 +65,7 @@ def n_dim(array: 'np.ndarray') -> int: return array.ndim @staticmethod - def shape(array: 'np.ndarray') -> Tuple: + def shape(array: 'np.ndarray') -> Tuple[int, ...]: return array.shape @staticmethod diff --git a/docarray/computation/torch_backend.py b/docarray/computation/torch_backend.py index 66ba4d592f3..df68e73c18e 100644 --- a/docarray/computation/torch_backend.py +++ b/docarray/computation/torch_backend.py @@ -66,8 +66,8 @@ def n_dim(array: 'torch.Tensor') -> int: return array.ndim @staticmethod - def shape(array: 'torch.Tensor') -> Tuple: - return array.shape + def shape(array: 'torch.Tensor') -> Tuple[int, ...]: + return tuple(array.shape) @staticmethod def to_numpy(array: 'torch.Tensor') -> 'np.ndarray': diff --git a/tests/units/computation_backends/numpy_backend/test_basics.py b/tests/units/computation_backends/numpy_backend/test_basics.py index 29ed9ec001a..7a20ad32f2f 100644 --- a/tests/units/computation_backends/numpy_backend/test_basics.py +++ b/tests/units/computation_backends/numpy_backend/test_basics.py @@ -31,4 +31,6 @@ def test_n_dim(array, result): ], ) def test_shape(array, result): - assert NumpyCompBackend.shape(array) == result + shape = NumpyCompBackend.shape(array) + assert shape == result + assert type(shape) == tuple diff --git a/tests/units/computation_backends/torch_backend/test_basics.py b/tests/units/computation_backends/torch_backend/test_basics.py index a98cca72a84..68afaacf212 100644 --- a/tests/units/computation_backends/torch_backend/test_basics.py +++ b/tests/units/computation_backends/torch_backend/test_basics.py @@ -33,4 +33,6 @@ def test_n_dim(array, result): ], ) def test_shape(array, result): - assert TorchCompBackend.shape(array) == result + shape = TorchCompBackend.shape(array) + 
assert shape == result + assert type(shape) == tuple From 40eac9357150e7a849b7038bf0d07b13f6215bc2 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Mon, 16 Jan 2023 17:11:42 +0100 Subject: [PATCH 21/26] docs: add supported strings for skip type Signed-off-by: anna-charlotte --- docarray/typing/url/video_url.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docarray/typing/url/video_url.py b/docarray/typing/url/video_url.py index 2013f22feb4..fff2dda5d18 100644 --- a/docarray/typing/url/video_url.py +++ b/docarray/typing/url/video_url.py @@ -57,7 +57,8 @@ def _load( Load the data from the url into a Tuple of AudioNdArray, VideoNdArray and NdArray. - :param skip_type: determines what video frames to discard. + :param skip_type: determines what video frames to discard. Supported strings + are: 'NONE', 'DEFAULT', 'NONREF', 'BIDIR', 'NONINTRA', 'NONKEY', 'ALL'. :param kwargs: supports all keyword arguments that are being supported by av.open() as described in: https://pyav.org/docs/stable/api/_globals.html?highlight=open#av.open From a700f308a743e7ada0f90f3bc5d97852ee5d47cd Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 17 Jan 2023 10:03:50 +0100 Subject: [PATCH 22/26] fix: apply suggestions from code review Signed-off-by: anna-charlotte --- docarray/documents/video.py | 18 +++++++++--------- .../predefined_document/test_video.py | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docarray/documents/video.py b/docarray/documents/video.py index 99ec9733d6e..dd011b796fc 100644 --- a/docarray/documents/video.py +++ b/docarray/documents/video.py @@ -1,8 +1,8 @@ from typing import Optional, TypeVar from docarray.base_document import BaseDocument +from docarray.documents import Audio from docarray.typing import AnyEmbedding, AnyTensor -from docarray.typing.tensor.audio.audio_tensor import AudioTensor from docarray.typing.tensor.video.video_tensor import VideoTensor from docarray.typing.url.video_url import VideoUrl @@ 
-12,10 +12,10 @@ class Video(BaseDocument): """ Document for handling video. - The Video Document can contain a VideoUrl (`Video.url`), an AudioTensor - (`Video.audio_tensor`), a VideoTensor (`Video.video_tensor`), an AnyTensor - representing the indices of the video's key frames (`Video.key_frame_indices`), - and an AnyEmbedding (`Video.embedding`). + The Video Document can contain a VideoUrl (`Video.url`), an Audio Document + (`Video.audio`), a VideoTensor (`Video.video_tensor`), an AnyTensor representing + the indices of the video's key frames (`Video.key_frame_indices`) and an + AnyEmbedding (`Video.embedding`). EXAMPLE USAGE: @@ -29,7 +29,7 @@ class Video(BaseDocument): vid = Video( url='https://github.com/docarray/docarray/tree/feat-add-video-v2/tests/toydata/mov_bbb.mp4?raw=true' ) - vid.audio_tensor, vid.video_tensor, vid.key_frame_indices = vid.url.load() + vid.audio.tensor, vid.video_tensor, vid.key_frame_indices = vid.url.load() model = MyEmbeddingModel() vid.embedding = model(vid.video_tensor) @@ -50,7 +50,7 @@ class MyVideo(Video): video = MyVideo( url='https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/mov_bbb.mp4?raw=true' ) - video.video_tensor = video.url.load(only_keyframes=True) + video.video_tensor = video.url.load_key_frames() model = MyEmbeddingModel() video.embedding = model(video.video_tensor) video.name = Text(text='my first video') @@ -75,11 +75,11 @@ class MultiModalDoc(BaseDocument): ), text=Text(text='hello world, how are you doing?'), ) - mmdoc.video.video_tensor = mmdoc.video.url.load(only_keyframes=True) + mmdoc.video.video_tensor = mmdoc.video.url.load_key_frames() """ url: Optional[VideoUrl] - audio_tensor: Optional[AudioTensor] + audio: Optional[Audio] = Audio() video_tensor: Optional[VideoTensor] key_frame_indices: Optional[AnyTensor] embedding: Optional[AnyEmbedding] diff --git a/tests/integrations/predefined_document/test_video.py b/tests/integrations/predefined_document/test_video.py index 
2522ba2801e..85cc451e851 100644 --- a/tests/integrations/predefined_document/test_video.py +++ b/tests/integrations/predefined_document/test_video.py @@ -13,8 +13,8 @@ @pytest.mark.parametrize('file_url', [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE]) def test_video(file_url): vid = Video(url=file_url) - vid.audio_tensor, vid.video_tensor, vid.key_frame_indices = vid.url.load() + vid.audio.tensor, vid.video_tensor, vid.key_frame_indices = vid.url.load() - assert isinstance(vid.audio_tensor, AudioNdArray) + assert isinstance(vid.audio.tensor, AudioNdArray) assert isinstance(vid.video_tensor, VideoNdArray) assert isinstance(vid.key_frame_indices, NdArray) From 07ceae8eb85e67888e742fef66390b5e43d9c2b7 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 17 Jan 2023 11:15:17 +0100 Subject: [PATCH 23/26] fix: small change to trigger ci again Signed-off-by: anna-charlotte --- docarray/typing/tensor/video/video_tensor_mixin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docarray/typing/tensor/video/video_tensor_mixin.py b/docarray/typing/tensor/video/video_tensor_mixin.py index 01d5184f824..95ec7a9dfb5 100644 --- a/docarray/typing/tensor/video/video_tensor_mixin.py +++ b/docarray/typing/tensor/video/video_tensor_mixin.py @@ -17,7 +17,6 @@ def validate_shape( cls: Union[Type['VideoTorchTensor'], Type['VideoNdArray']], value: 'T' ) -> 'T': comp_backend = cls.get_comp_backend() - if ( comp_backend.n_dim(value) not in [3, 4] # type: ignore or comp_backend.shape(value)[-1] != 3 # type: ignore From c2e129d32973638de6344cf60ff15720130fc19c Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 17 Jan 2023 11:22:14 +0100 Subject: [PATCH 24/26] fix: extract shape var Signed-off-by: anna-charlotte --- docarray/typing/tensor/video/video_tensor_mixin.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docarray/typing/tensor/video/video_tensor_mixin.py b/docarray/typing/tensor/video/video_tensor_mixin.py index 95ec7a9dfb5..6decae2ca3f 100644 --- 
a/docarray/typing/tensor/video/video_tensor_mixin.py +++ b/docarray/typing/tensor/video/video_tensor_mixin.py @@ -17,13 +17,11 @@ def validate_shape( cls: Union[Type['VideoTorchTensor'], Type['VideoNdArray']], value: 'T' ) -> 'T': comp_backend = cls.get_comp_backend() - if ( - comp_backend.n_dim(value) not in [3, 4] # type: ignore - or comp_backend.shape(value)[-1] != 3 # type: ignore - ): + shape = comp_backend.shape(value) # type: ignore + if comp_backend.n_dim(value) not in [3, 4] or shape[-1] != 3: # type: ignore raise ValueError( f'Expects tensor with 3 or 4 dimensions and the last dimension equal ' - f'to 3, but received {comp_backend.shape(value)}.' # type: ignore + f'to 3, but received {shape}.' ) else: return value From d50ae67fcb2b9462304a1b99d8fd622b431680e6 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 17 Jan 2023 14:42:17 +0100 Subject: [PATCH 25/26] fix: introduce compbackendinterface Signed-off-by: anna-charlotte --- docarray/typing/tensor/video/video_ndarray.py | 2 +- .../typing/tensor/video/video_tensor_mixin.py | 26 ++++++++++++------- .../typing/tensor/video/video_torch_tensor.py | 2 +- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/docarray/typing/tensor/video/video_ndarray.py b/docarray/typing/tensor/video/video_ndarray.py index 97c07afbdca..5cf6efc0057 100644 --- a/docarray/typing/tensor/video/video_ndarray.py +++ b/docarray/typing/tensor/video/video_ndarray.py @@ -31,4 +31,4 @@ def validate( config: 'BaseConfig', ) -> T: tensor = super().validate(value=value, field=field, config=config) - return VideoTensorMixin.validate_shape(cls, value=tensor) + return cls.validate_shape(value=tensor) diff --git a/docarray/typing/tensor/video/video_tensor_mixin.py b/docarray/typing/tensor/video/video_tensor_mixin.py index 6decae2ca3f..ac69cd5dea6 100644 --- a/docarray/typing/tensor/video/video_tensor_mixin.py +++ b/docarray/typing/tensor/video/video_tensor_mixin.py @@ -1,3 +1,4 @@ +import abc from typing import TYPE_CHECKING, 
BinaryIO, Optional, Type, TypeVar, Union import numpy as np @@ -5,23 +6,28 @@ from docarray.typing.tensor.audio.audio_tensor import AudioTensor if TYPE_CHECKING: - from docarray.typing import VideoNdArray, VideoTorchTensor - + from docarray.typing.tensor.abstract_tensor import AbstractTensor T = TypeVar('T', bound='VideoTensorMixin') +TT = TypeVar('TT', bound='AbstractTensor') -class VideoTensorMixin: +class CompBackendInterface(abc.ABC): @staticmethod - def validate_shape( - cls: Union[Type['VideoTorchTensor'], Type['VideoNdArray']], value: 'T' - ) -> 'T': - comp_backend = cls.get_comp_backend() - shape = comp_backend.shape(value) # type: ignore - if comp_backend.n_dim(value) not in [3, 4] or shape[-1] != 3: # type: ignore + @abc.abstractmethod + def get_comp_backend(): + """The computational backend compatible with this tensor type.""" + ... + + +class VideoTensorMixin(CompBackendInterface, abc.ABC): + @classmethod + def validate_shape(cls: Type['T'], value: 'T') -> 'T': + comp_be = cls.get_comp_backend() + if comp_be.n_dim(value) not in [3, 4] or comp_be.shape(value)[-1] != 3: raise ValueError( f'Expects tensor with 3 or 4 dimensions and the last dimension equal ' - f'to 3, but received {shape}.' + f'to 3, but received {comp_be.shape(value)}.' 
) else: return value diff --git a/docarray/typing/tensor/video/video_torch_tensor.py b/docarray/typing/tensor/video/video_torch_tensor.py index 5e2953b4231..60dce18da3f 100644 --- a/docarray/typing/tensor/video/video_torch_tensor.py +++ b/docarray/typing/tensor/video/video_torch_tensor.py @@ -31,4 +31,4 @@ def validate( config: 'BaseConfig', ) -> T: tensor = super().validate(value=value, field=field, config=config) - return VideoTensorMixin.validate_shape(cls, value=tensor) + return cls.validate_shape(value=tensor) From 2e365e6017d19f013360b76a465a06e050783803 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 17 Jan 2023 15:19:10 +0100 Subject: [PATCH 26/26] fix: revert previous pr and fix for mypy Signed-off-by: anna-charlotte --- .../typing/tensor/video/video_tensor_mixin.py | 24 ++++++------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/docarray/typing/tensor/video/video_tensor_mixin.py b/docarray/typing/tensor/video/video_tensor_mixin.py index ac69cd5dea6..1d4c2206e9d 100644 --- a/docarray/typing/tensor/video/video_tensor_mixin.py +++ b/docarray/typing/tensor/video/video_tensor_mixin.py @@ -1,33 +1,23 @@ import abc -from typing import TYPE_CHECKING, BinaryIO, Optional, Type, TypeVar, Union +from typing import BinaryIO, Optional, Type, TypeVar, Union import numpy as np +from docarray.typing.tensor.abstract_tensor import AbstractTensor from docarray.typing.tensor.audio.audio_tensor import AudioTensor -if TYPE_CHECKING: - from docarray.typing.tensor.abstract_tensor import AbstractTensor +T = TypeVar('T', bound='AbstractTensor') -T = TypeVar('T', bound='VideoTensorMixin') -TT = TypeVar('TT', bound='AbstractTensor') - -class CompBackendInterface(abc.ABC): - @staticmethod - @abc.abstractmethod - def get_comp_backend(): - """The computational backend compatible with this tensor type.""" - ... 
- - -class VideoTensorMixin(CompBackendInterface, abc.ABC): +class VideoTensorMixin(AbstractTensor, abc.ABC): @classmethod def validate_shape(cls: Type['T'], value: 'T') -> 'T': comp_be = cls.get_comp_backend() - if comp_be.n_dim(value) not in [3, 4] or comp_be.shape(value)[-1] != 3: + shape = comp_be.shape(value) # type: ignore + if comp_be.n_dim(value) not in [3, 4] or shape[-1] != 3: # type: ignore raise ValueError( f'Expects tensor with 3 or 4 dimensions and the last dimension equal ' - f'to 3, but received {comp_be.shape(value)}.' + f'to 3, but received {shape}.' ) else: return value