DiffSharp
diff --git a/src/DiffSharp.Backends.Reference/Reference.RawTensor.fs b/src/DiffSharp.Backends.Reference/Reference.RawTensor.fs
Lines changed: 18 additions & 21 deletions
diff --git a/src/DiffSharp.Backends.Torch/Torch.RawTensor.fs b/src/DiffSharp.Backends.Torch/Torch.RawTensor.fs
Lines changed: 3 additions & 3 deletions
diff --git a/src/DiffSharp.Core/Extensions.fs b/src/DiffSharp.Core/Extensions.fs
Lines changed: 6 additions & 0 deletions
diff --git a/src/DiffSharp.Core/RawTensor.fs b/src/DiffSharp.Core/RawTensor.fs
Lines changed: 1 addition & 1 deletion
diff --git a/src/DiffSharp.Core/Shape.fs b/src/DiffSharp.Core/Shape.fs
Lines changed: 7 additions & 4 deletions
@@ -530,20 +530,17 @@ module internal RawTensorCPU =
         let result = Array.map (fun t -> t ** t2value) t1value
         (result, t1.Shape)
 
-    let inline MatMulT2T2(t1: RawTensorCPU< ^T >, t2: RawTensor) : (^T[] * Shape) =
-        Shape.checkCanMatmul t1.Shape t2.Shape
-        let t1rows, t1cols = t1.Shape.[0], t1.Shape.[1]
-        let t2rows, t2cols = t2.Shape.[0], t2.Shape.[1]
+    /// Batched matrix multiplication of two CPU raw tensors.
+    /// Both shapes must have at least two dimensions (validated by Shape.checkCanMatmul); the leading
+    /// batch dimensions must be identical (no broadcasting is performed here) and the trailing two
+    /// dimensions must be compatible matrices.
+    /// Returns the flat result values together with the result shape: batch dims followed by [| t1rows; t2cols |].
+    /// Fails if the shapes are incompatible; the downcast of t2 fails if it is not a RawTensorCPU of the same element type.
+    let inline MatMulTT(t1: RawTensorCPU< ^T >, t2: RawTensor) : (^T[] * Shape) =
+        let (t1BatchPart, t1MatrixPart), (t2BatchPart, t2MatrixPart) = Shape.checkCanMatmul t1.Shape t2.Shape
+        if t1BatchPart <> t2BatchPart then failwithf "Cannot matrix multiply raw tensors with shapes %A, %A - mismatch batching" t1.Shape t2.Shape
+        let t1rows, t1cols = t1MatrixPart.[0], t1MatrixPart.[1]
+        let t2rows, t2cols = t2MatrixPart.[0], t2MatrixPart.[1]
         let t1value = t1.Values
-        let t2value = t2.GetTypedValues()        
-        let result = Array.zeroCreate (t1rows*t2cols) 
-        for i in 0 .. t1rows - 1 do
-            for j in 0 .. t2cols - 1 do
-                let mutable acc = zero
-                for k in 0..t2rows-1 do 
-                    acc <- acc + t1value.[i*t1cols + k] * t2value.[k*t2cols + j]
-                result.[i*t2cols + j] <- acc
-        (result,[| t1rows; t2cols |])
+        let t2value = (t2 :?> RawTensorCPU< ^T >).Values
+        let newShape = Array.append t1BatchPart [| t1rows; t2cols |]
+        let nb = shapeLength t1BatchPart
+        // Number of elements in one matrix of each operand, i.e. the flat offset between consecutive batch entries
+        let t1stride, t2stride = t1rows*t1cols, t2rows*t2cols
+        let values =
+            Array.initFlat3D nb t1rows t2cols (fun b i j ->
+                // Accumulate with plain (+) rather than Array.sumBy: sumBy adds with checked arithmetic, which
+                // would raise OverflowException for integer dtypes where the previous loop wrapped, and it
+                // needed a freshly allocated index array for every output element.
+                let mutable acc = LanguagePrimitives.GenericZero< ^T >
+                for k in 0 .. t2rows-1 do
+                    acc <- acc + t1value.[b*t1stride + i*t1cols + k] * t2value.[b*t2stride + k*t2cols + j]
+                acc)
+        (values, newShape)
 
     let inline MaxPool1D(t1: RawTensorCPU< ^T >, kernelSize, stride, padding) : RawTensorCPU< ^T > * RawTensorCPU< int > =
         let batchSize, channels, inputSize, outputSize, outputShape =
@@ -895,7 +892,7 @@ type RawTensorFloat32(values: float32[], shape:Shape, device) =
     override t1.PowTT(t2) = RawTensorCPU.PowTT(t1, t2) |> create
     override t1.PowT0T(t2) = RawTensorCPU.PowT0T(t1, t2) |> create
     override t1.PowTT0(t2) = RawTensorCPU.PowTT0(t1, t2) |> create
-    override t1.MatMulT2T2(t2) = RawTensorCPU.MatMulT2T2(t1, t2) |> create
+    override t1.MatMulTT(t2) = RawTensorCPU.MatMulTT(t1, t2) |> create
     override t1.MaxPool1D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool1D(t1, kernelSize, stride, padding) in result :> _, indices :> _
     override t1.MaxPool2D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool2D(t1, kernelSize, stride, padding) in result :> _, indices :> _
     override t1.MaxPool3D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool3D(t1, kernelSize, stride, padding) in result :> _, indices :> _
@@ -980,7 +977,7 @@ type RawTensorFloat64(values: double[], shape:Shape, device) =
     override t1.PowTT(t2) = RawTensorCPU.PowTT(t1, t2) |> create
     override t1.PowT0T(t2) = RawTensorCPU.PowT0T(t1, t2) |> create
     override t1.PowTT0(t2) = RawTensorCPU.PowTT0(t1, t2) |> create
-    override t1.MatMulT2T2(t2) = RawTensorCPU.MatMulT2T2(t1, t2) |> create
+    override t1.MatMulTT(t2) = RawTensorCPU.MatMulTT(t1, t2) |> create
     override t1.MaxPool1D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool1D(t1, kernelSize, stride, padding) in result :> _, indices :> _
     override t1.MaxPool2D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool2D(t1, kernelSize, stride, padding) in result :> _, indices :> _
     override t1.MaxPool3D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool3D(t1, kernelSize, stride, padding) in result :> _, indices :> _
@@ -1061,7 +1058,7 @@ type RawTensorInt8(values: int8[], shape:Shape, device) =
     override t1.DivTT(t2) = RawTensorCPU.DivTT(t1, t2) |> create
     override t1.DivT0T(t2) = RawTensorCPU.DivT0T(t1, t2) |> create
     override t1.DivTT0(t2) = RawTensorCPU.DivTT0(t1, t2) |> create
-    override t1.MatMulT2T2(t2) = RawTensorCPU.MatMulT2T2(t1, t2) |> create
+    override t1.MatMulTT(t2) = RawTensorCPU.MatMulTT(t1, t2) |> create
     override t1.MaxPool1D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool1D(t1, kernelSize, stride, padding) in result :> _, indices :> _
     override t1.MaxPool2D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool2D(t1, kernelSize, stride, padding) in result :> _, indices :> _
     override t1.MaxPool3D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool3D(t1, kernelSize, stride, padding) in result :> _, indices :> _
@@ -1143,7 +1140,7 @@ type RawTensorByte(values: byte[], shape:Shape, device) =
     override t1.DivTT(t2) = RawTensorCPU.DivTT(t1, t2) |> create
     override t1.DivT0T(t2) = RawTensorCPU.DivT0T(t1, t2) |> create
     override t1.DivTT0(t2) = RawTensorCPU.DivTT0(t1, t2) |> create
-    override t1.MatMulT2T2(t2) = RawTensorCPU.MatMulT2T2(t1, t2) |> create
+    override t1.MatMulTT(t2) = RawTensorCPU.MatMulTT(t1, t2) |> create
     override t1.MaxPool1D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool1D(t1, kernelSize, stride, padding) in result :> _, indices :> _
     override t1.MaxPool2D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool2D(t1, kernelSize, stride, padding) in result :> _, indices :> _
     override t1.MaxPool3D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool3D(t1, kernelSize, stride, padding) in result :> _, indices :> _
@@ -1225,7 +1222,7 @@ type RawTensorInt16(values: int16[], shape:Shape, device) =
     override t1.DivTT(t2) = RawTensorCPU.DivTT(t1, t2) |> create
     override t1.DivT0T(t2) = RawTensorCPU.DivT0T(t1, t2) |> create
     override t1.DivTT0(t2) = RawTensorCPU.DivTT0(t1, t2) |> create
-    override t1.MatMulT2T2(t2) = RawTensorCPU.MatMulT2T2(t1, t2) |> create
+    override t1.MatMulTT(t2) = RawTensorCPU.MatMulTT(t1, t2) |> create
     override t1.MaxPool1D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool1D(t1, kernelSize, stride, padding) in result :> _, indices :> _
     override t1.MaxPool2D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool2D(t1, kernelSize, stride, padding) in result :> _, indices :> _
     override t1.MaxPool3D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool3D(t1, kernelSize, stride, padding) in result :> _, indices :> _
@@ -1307,7 +1304,7 @@ type RawTensorInt32(values: int32[], shape:Shape, device) =
     override t1.DivTT(t2) = RawTensorCPU.DivTT(t1, t2) |> create
     override t1.DivT0T(t2) = RawTensorCPU.DivT0T(t1, t2) |> create
     override t1.DivTT0(t2) = RawTensorCPU.DivTT0(t1, t2) |> create
-    override t1.MatMulT2T2(t2) = RawTensorCPU.MatMulT2T2(t1, t2) |> create
+    override t1.MatMulTT(t2) = RawTensorCPU.MatMulTT(t1, t2) |> create
     override t1.MaxPool1D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool1D(t1, kernelSize, stride, padding) in result :> _, indices :> _
     override t1.MaxPool2D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool2D(t1, kernelSize, stride, padding) in result :> _, indices :> _
     override t1.MaxPool3D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool3D(t1, kernelSize, stride, padding) in result :> _, indices :> _
@@ -1389,7 +1386,7 @@ type RawTensorInt64(values: int64[], shape:Shape, device) =
     override t1.DivTT(t2) = RawTensorCPU.DivTT(t1, t2) |> create
     override t1.DivT0T(t2) = RawTensorCPU.DivT0T(t1, t2) |> create
     override t1.DivTT0(t2) = RawTensorCPU.DivTT0(t1, t2) |> create
-    override t1.MatMulT2T2(t2) = RawTensorCPU.MatMulT2T2(t1, t2) |> create
+    override t1.MatMulTT(t2) = RawTensorCPU.MatMulTT(t1, t2) |> create
     override t1.MaxPool1D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool1D(t1, kernelSize, stride, padding) in result :> _, indices :> _
     override t1.MaxPool2D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool2D(t1, kernelSize, stride, padding) in result :> _, indices :> _
     override t1.MaxPool3D(kernelSize, stride, padding) = let result, indices = RawTensorCPU.MaxPool3D(t1, kernelSize, stride, padding) in result :> _, indices :> _
@@ -1478,7 +1475,7 @@ type RawTensorBool(values: bool[], shape:Shape, device) =
     override t1.DivTT(t2) = opNotSupported2 "DivTT" t1.Dtype t2.Dtype
     override t1.DivT0T(t2) = opNotSupported2 "DivT0T" t1.Dtype t2.Dtype
     override t1.DivTT0(t2) = opNotSupported2 "DivTT0" t1.Dtype t2.Dtype
-    override t1.MatMulT2T2(t2) = opNotSupported2 "MatMulT2T2" t1.Dtype t2.Dtype
+    override t1.MatMulTT(t2) = opNotSupported2 "MatMulTT" t1.Dtype t2.Dtype
     override t1.MaxPool1D(_kernelSize, _stride, _padding) = opNotSupported "MaxPool1D" t1.Dtype
     override t1.MaxPool2D(_kernelSize, _stride, _padding) = opNotSupported "MaxPool2D" t1.Dtype
     override t1.MaxPool3D(_kernelSize, _stride, _padding) = opNotSupported "MaxPool3D" t1.Dtype
 
@@ -523,11 +523,11 @@ type TorchRawTensor(tt: TorchTensor, shape: Shape, dtype: Dtype, device: Device)
         let result = tt.Pow(t2v)
         t1.MakeLike(result)
 
-    override t1.MatMulT2T2(t2) = 
+    override t1.MatMulTT(t2) = 
         match dtype with 
-        | Dtype.Bool -> opNotSupported2 "MatMulT2T2" dtype t2.Dtype
+        | Dtype.Bool -> opNotSupported2 "MatMulTT" dtype t2.Dtype
         | _ ->  
-        Shape.checkCanMatmul t1.Shape t2.Shape
+        let _, _ = Shape.checkCanMatmul t1.Shape t2.Shape
         let result =
             // "addmm for CUDA tensors only supports floating-point types. Try converting the tensors with .float()" | const char *
             match t1.DeviceType, dtype with 
 
@@ -40,6 +40,12 @@ module Array =
         else
             counts |> Array.ofSeq |> Array.map (fun (KeyValue(k, v)) -> k, v) |> Array.unzip
 
+    /// Builds a flat array of length i*j laid out row by row: the element at flat index r*j + c is `f r c`.
+    let initFlat2D i j f =
+        Array.init (i*j) (fun flat ->
+            let row = flat / j
+            let col = flat % j
+            f row col)
+
+    /// Builds a flat array of length i*j*k with the last index varying fastest: the element at flat index (a*j + b)*k + c is `f a b c`.
+    let initFlat3D i j k f =
+        Array.init (i*j*k) (fun flat ->
+            let outer = flat / j / k
+            let middle = (flat / k) % j
+            let inner = flat % k
+            f outer middle inner)
+
 /// Contains extensions to the F# Seq module. 
 module Seq =
 
 
@@ -408,7 +408,7 @@ type RawTensor() =
     abstract member PowTT0: t2: RawTensor -> RawTensor
 
     /// Returns the matrix multiplication of two tensors
-    abstract member MatMulT2T2: t2: RawTensor -> RawTensor
+    abstract member MatMulTT: t2: RawTensor -> RawTensor
 
     /// Returns the 1D maxpool of a tensor and its chosen maximum indices
     abstract member MaxPool1D: kernelSize: int * stride: int * padding: int -> RawTensor * RawTensor
 
@@ -459,10 +459,13 @@ module rec Shape =
         if not (contains shape1 shape2) then failwithf "Expecting shape1 to contain shape2, received %A, %A" shape1 shape2
         if location.Length <> shape1.Length then failwithf "Expecting location of the same length as shape1, received %A, %A" (location.Length) shape1
 
-    /// Checks if the given shape is appropriate for a matmul operation.
-    let checkCanMatmul (shape1: Shape) (shape2: Shape) =
-        if shape1.Length <> 2 || shape2.Length <> 2 then failwithf "Expecting two 2d Tensors, received Tensors with shapes %A, %A" shape1 shape2
-        if shape1.[1] <> shape2.[0] then failwithf "Cannot multiply Tensors with shapes %A, %A" shape1 shape2
+    /// Checks that two shapes can be matrix-multiplied: each must have at least two dimensions, and the last
+    /// dimension of shape1 must equal the second-to-last dimension of shape2. Fails with a descriptive message otherwise.
+    /// Returns ((batch1, matrix1), (batch2, matrix2)), where each shape is split into its leading batch
+    /// dimensions and its trailing two matrix dimensions. The batch parts are not compared here.
+    let checkCanMatmul (shape1:int[]) (shape2:int[]) =
+        if shape1.Length < 2 || shape2.Length < 2 then failwithf "Expecting two Tensors with at least 2 dimensions, received Tensors with shapes %A, %A" shape1 shape2
+        let aBatchPart, aMatrixPart = Array.splitAt (shape1.Length-2) shape1
+        let bBatchPart, bMatrixPart = Array.splitAt (shape2.Length-2) shape2
+        if aMatrixPart.[1] <> bMatrixPart.[0] then failwithf "Cannot matrix multiply tensors with shapes %A, %A - mismatch in matrix dimension" shape1 shape2
+        (aBatchPart, aMatrixPart), (bBatchPart, bMatrixPart)
 
     /// Checks if the given shape is appropriate for a dot product operation.
     let checkCanDot (shape1: Shape) (shape2: Shape) =