tensorflow
diff --git a/src/backends/webgpu/src/kernels/matmul_packed_webgpu.ts b/src/backends/webgpu/src/kernels/matmul_packed_webgpu.ts
Lines changed: 105 additions & 81 deletions
diff --git a/src/backends/webgpu/src/kernels/matmul_webgpu.ts b/src/backends/webgpu/src/kernels/matmul_webgpu.ts
Lines changed: 69 additions & 42 deletions
@@ -15,8 +15,94 @@
  * =============================================================================
  */
 
+import {matMulHeader} from './matmul_webgpu';
 import {WebGPUProgram} from './webgpu_program';
 
+/**
+ * Builds the GLSL source for the "packed" tiled matrix multiply, in which each
+ * invocation accumulates a `workPerThread` x `workPerThread` patch of the
+ * output instead of a single element.
+ *
+ * The returned source starts with `matMulHeader`, declares the shared tile
+ * buffers `mm_Asub` / `mm_Bsub`, and defines `mm_matMul`. It only prototypes
+ * `mm_readA`, `mm_readB` and `mm_write`; the program that embeds this source
+ * must define them. `TileSize` is referenced but not declared here, so it has
+ * to be provided by whatever surrounds this source in the final shader.
+ *
+ * @param workPerThread Number of output rows and of output columns handled by
+ *     one invocation; interpolated verbatim as the GLSL constant
+ *     `WorkPerThread`.
+ * @returns GLSL source text to be interpolated into a program's `userCode`.
+ */
+export function makeMatMulPackedSource(workPerThread: number): string {
+  return `
+    ${matMulHeader}
+
+    const uint TileSide = TileSize.x;  // TileSize.x == TileSize.y
+    const uint WorkPerThread = ${workPerThread};
+    shared float mm_Asub[TileSide * WorkPerThread][TileSide * WorkPerThread];
+    shared float mm_Bsub[TileSide * WorkPerThread][TileSide * WorkPerThread];
+
+    void mm_matMul(uint dimAOuter, uint dimInner, uint dimBOuter) {
+      uint row = gl_LocalInvocationID.y;   // 0..local_size_y
+      uint col = gl_LocalInvocationID.x;   // 0..local_size_x
+      uint tileRow = row * WorkPerThread;  // 0..TileSide, stride by local_size
+      uint tileCol = col * WorkPerThread;  // 0..TileSide
+
+      // 0..AOuter, stride by tileSize
+      uint globalRow = TileSide * gl_WorkGroupID.y + tileRow;
+      uint globalCol = TileSide * gl_WorkGroupID.x + tileCol;
+
+      uint numTiles = (dimInner - 1) / TileSize.x + 1;
+
+      float acc[WorkPerThread][WorkPerThread];
+      float ACached;
+      float BCached[WorkPerThread];
+
+      // Without this initialization strange values show up in acc.
+      for (uint innerRow = 0; innerRow < WorkPerThread; innerRow++) {
+        for (uint innerCol = 0; innerCol < WorkPerThread; innerCol++) {
+          acc[innerRow][innerCol] = 0.0;
+        }
+      }
+
+      // Loop over shared dimension.
+      for (uint t = 0; t < numTiles; t++) {
+        // Load one tile of A and B into local memory.
+        for (uint innerRow = 0; innerRow < WorkPerThread; innerRow++) {
+          for (uint innerCol = 0; innerCol < WorkPerThread; innerCol++) {
+            uint inputRow = tileRow + innerRow;
+            uint inputCol = tileCol + innerCol;
+
+            mm_Asub[inputRow][inputCol] = mm_readA(
+                globalRow + innerRow,
+                t * TileSize.x + tileCol + innerCol);
+            mm_Bsub[inputRow][inputCol] = mm_readB(
+                t * TileSize.x + tileRow + innerRow,
+                globalCol + innerCol);
+          }
+        }
+
+        barrier();
+
+        // Compute acc values for a single thread.
+        for (uint k = 0; k < TileSize.x; k++) {
+          for (uint inner = 0; inner < WorkPerThread; inner++) {
+            BCached[inner] = mm_Bsub[k][tileCol + inner];
+          }
+
+          for (uint innerRow = 0; innerRow < WorkPerThread; innerRow++) {
+            ACached = mm_Asub[tileRow + innerRow][k];
+            for (uint innerCol = 0; innerCol < WorkPerThread; innerCol++) {
+              acc[innerRow][innerCol] += ACached * BCached[innerCol];
+            }
+          }
+        }
+
+        barrier();
+      }
+
+      // Write back only the elements that fall inside the output bounds.
+      for (uint innerRow = 0; innerRow < WorkPerThread; innerRow++) {
+        for (uint innerCol = 0; innerCol < WorkPerThread; innerCol++) {
+          if ((globalCol + innerCol) < dimBOuter &&
+              (globalRow + innerRow) < dimAOuter) {
+            mm_write(globalRow + innerRow,
+                     globalCol + innerCol,
+                     acc[innerRow][innerCol]);
+          }
+        }
+      }
+    }
+  `;
+}
+
 export class MatMulPackedProgram implements WebGPUProgram {
   outputShape: number[];
   userCode: string;
@@ -38,92 +124,30 @@ export class MatMulPackedProgram implements WebGPUProgram {
     // about boundary conditions when loading from Asub / Bsub when tiles fit
     // neatly inside of output. May slightly improve performance.
     this.userCode = `
-      const uint WorkPerThread = ${workPerThread};
-      shared float Asub[TileSize.x * WorkPerThread][TileSize.x * WorkPerThread];
-      shared float Bsub[TileSize.x * WorkPerThread][TileSize.x * WorkPerThread];
+      ${makeMatMulPackedSource(workPerThread)}
 
-      void main() {
-        uint row = gl_LocalInvocationID.y; // 0..local_size_x
-        uint col = gl_LocalInvocationID.x; // 0..local_size_y
-        uint tileRow = row * WorkPerThread; // 0..TileSize, stride by local_size
-        uint tileCol = col * WorkPerThread; // 0..TileSize
-        
-        // 0..AOuter, stride by tileSize
-        uint globalRow = TileSize.x*gl_WorkGroupID.y + tileRow; 
-        uint globalCol = TileSize.x*gl_WorkGroupID.x + tileCol;
-
-        uint numTiles = (dimInner - 1) / TileSize.x + 1;
-
-        float acc[WorkPerThread][WorkPerThread];
-        float ACached;
-        float BCached[WorkPerThread];
-
-        // Without this initialization strange values show up in acc.
-        for(uint innerRow=0; innerRow<WorkPerThread; innerRow++) {
-          for(uint innerCol=0; innerCol<WorkPerThread; innerCol++) {
-            acc[innerRow][innerCol] = 0.0;
-          }
+      float mm_readA(uint row, uint col) {
+        if (row < dimAOuter && col < dimInner) {
+          return A[row * dimInner + col];
+        } else {
+          return 0.0;
         }
+      }
 
-        // Loop over shared dimension.
-        for(uint t=0; t<numTiles; t++) { 
-          // Load one tile of A and B into local memory.
-          for(uint innerRow=0; innerRow<WorkPerThread; innerRow++) {
-            for(uint innerCol=0; innerCol<WorkPerThread; innerCol++) {
-              uint inputRow = tileRow + innerRow;
-              uint inputCol = tileCol + innerCol;
-              
-              uint AColumnIndex = t * TileSize.x + tileCol + innerCol;
-              uint AFlatIndex = 
-                (globalRow + innerRow) * dimInner + AColumnIndex;
-
-              if(AColumnIndex < dimInner && AFlatIndex < dimAOuter * dimInner) {
-                Asub[inputRow][inputCol] = A[AFlatIndex];
-              } else {
-                Asub[inputRow][inputCol] = 0.0;
-              }
-
-              uint BRowIndex = t * TileSize.x + tileRow + innerRow;
-              uint BFlatIndex = BRowIndex * dimBOuter + (globalCol + innerCol);
-
-              if(BRowIndex < dimInner && BFlatIndex < dimInner * dimBOuter) {
-                Bsub[inputRow][inputCol] = B[BFlatIndex];
-              } else {
-                Bsub[inputRow][inputCol] = 0.0; 
-              }
-            }
-          }
-
-          barrier();
-
-          // Compute acc values for a single thread.
-          for(uint k=0; k<TileSize.x; k++) {
-            for(uint inner=0; inner<WorkPerThread; inner++) {
-              BCached[inner] = Bsub[k][tileCol + inner];
-            }
-            
-            for(uint innerRow=0; innerRow<WorkPerThread; innerRow++) {
-              ACached = Asub[tileRow + innerRow][k];
-              for(uint innerCol=0; innerCol<WorkPerThread; innerCol++) {
-                acc[innerRow][innerCol] += ACached * BCached[innerCol];
-              }
-            }
-          }
-
-          barrier();
+      float mm_readB(uint row, uint col) {
+        if (row < dimInner && col < dimBOuter) {
+          return B[row * dimBOuter + col];
+        } else {
+          return 0.0;
         }
+      }
 
-        for (uint innerRow=0; innerRow<WorkPerThread; innerRow++) {
-          for (uint innerCol=0; innerCol<WorkPerThread; innerCol++) {
-            uint globalFlatIndex = 
-              (globalRow + innerRow) * dimBOuter + (globalCol + innerCol);
-            
-            if((globalCol + innerCol) < dimBOuter && 
-              (globalRow + innerRow) < dimAOuter) {
-              setOutput(globalFlatIndex, acc[innerRow][innerCol]);
-            }
-          }
-        }
+      void mm_write(uint row, uint col, float value) {
+        setOutput(row * dimBOuter + col, value);
+      }
+
+      void main() {
+        mm_matMul(dimAOuter, dimInner, dimBOuter);
       }
     `;
   }
 
@@ -17,69 +17,96 @@
 
 import {WebGPUProgram} from './webgpu_program';
 
-export class MatMulProgram implements WebGPUProgram {
-  outputShape: number[];
-  userCode: string;
-  dispatch: [number, number, number];
-  variableNames = ['A', 'B'];
-  uniforms = 'uint dimAOuter, dimInner, dimBOuter, batch;';
-  tileSize: [number, number] = [16, 16];  // Must be square.
+/**
+ * GLSL forward declarations shared by the matmul shader sources.
+ *
+ * Prototypes the three accessor hooks (`mm_readA`, `mm_readB`, `mm_write`) and
+ * the `mm_matMul` entry routine, so that source calling the hooks can appear
+ * before the program-specific definitions of those hooks.
+ */
+export const matMulHeader = `
+  float mm_readA(uint row, uint col);
+  float mm_readB(uint row, uint col);
+  void mm_write(uint row, uint col, float value);
+  void mm_matMul(uint dimAOuter, uint dimInner, uint dimBOuter);`;
 
-  constructor(outputShape: [number, number, number]) {
-    this.outputShape = outputShape;
-    this.dispatch = [
-      Math.ceil(outputShape[1] / this.tileSize[0]),
-      Math.ceil(outputShape[2] / this.tileSize[1]), 1
-    ];
+/**
+ * Builds the GLSL source for the tiled matrix multiply in which each
+ * invocation accumulates exactly one output element.
+ *
+ * The returned source starts with `matMulHeader`, declares the shared tile
+ * buffers `mm_Asub` / `mm_Bsub`, and defines `mm_matMul`. It only prototypes
+ * `mm_readA`, `mm_readB` and `mm_write`; the program that embeds this source
+ * must define them. `TileSize` is referenced but not declared here, so it has
+ * to be provided by whatever surrounds this source in the final shader.
+ *
+ * @returns GLSL source text to be interpolated into a program's `userCode`.
+ */
+export function makeMatMulSource(): string {
+  return `
+    ${matMulHeader}
 
-    this.userCode = `
-      shared float Asub[TileSize.x][TileSize.x];
-      shared float Bsub[TileSize.x][TileSize.x];
+    const uint TileSide = TileSize.x;  // TileSize.x == TileSize.y
+    shared float mm_Asub[TileSide][TileSide];
+    shared float mm_Bsub[TileSide][TileSide];
 
-      void main() {
-        uint localRow = gl_LocalInvocationID.x; // < TileSize.x
-        uint localCol = gl_LocalInvocationID.y; // < TileSize.x
-        uint globalRow = TileSize.x*gl_WorkGroupID.x + localRow; // < dimAOuter
-        uint globalCol = TileSize.x*gl_WorkGroupID.y + localCol; // < dimInner
+      void mm_matMul(uint dimAOuter, uint dimInner, uint dimBOuter) {
+        uint localRow = gl_LocalInvocationID.x;  // 0..TileSide
+        uint localCol = gl_LocalInvocationID.y;  // 0..TileSide
+        uint globalRow = TileSize.x * gl_WorkGroupID.x + localRow;  // AOuter
+        uint globalCol = TileSize.x * gl_WorkGroupID.y + localCol;  // BOuter
 
         float acc = 0.0;
 
         uint numTiles = (dimInner - 1) / TileSize.x + 1;
 
-        for (uint t=0; t<numTiles; t++) {
+        for (uint t = 0; t < numTiles; t++) {
           // Load one tile of A and B into local memory
-          uint tiledACol = TileSize.x*t + localCol;
-          uint tiledBRow = TileSize.x*t + localRow;
-
-          uint AFlatIndex = globalRow * dimInner + tiledACol;
-          if(AFlatIndex < dimAOuter * dimInner) {
-            Asub[localRow][localCol] = A[AFlatIndex];
-          } else {
-            Asub[localRow][localCol] = 0.0;
-          }
-
-          uint BFlatIndex = tiledBRow * dimBOuter + globalCol;
-          if(BFlatIndex < dimInner * dimBOuter) {
-            Bsub[localRow][localCol] = B[BFlatIndex];
-          } else {
-            Bsub[localRow][localCol] = 0.0;
-          }
+          uint tiledACol = TileSize.x * t + localCol;
+          uint tiledBRow = TileSize.x * t + localRow;
+          mm_Asub[localRow][localCol] = mm_readA(globalRow, tiledACol);
+          mm_Bsub[localRow][localCol] = mm_readB(tiledBRow, globalCol);
 
           // Synchronise to make sure the tile is loaded
           barrier();
 
-          for (uint k=0; k<TileSize.x; k++) {
-            acc += Asub[localRow][k] * Bsub[k][localCol];
+          for (uint k = 0; k < TileSize.x; k++) {
+            acc += mm_Asub[localRow][k] * mm_Bsub[k][localCol];
           }
 
           // Synchronise before loading the next tile
           barrier();
         }
 
         if (globalCol < dimBOuter && globalRow < dimAOuter) {
-          setOutput(globalRow * dimBOuter + globalCol, acc);
+          mm_write(globalRow, globalCol, acc);
+        }
+      }
+  `;
+}
+
+/**
+ * WebGPU program for the unpacked tiled matrix multiply.
+ *
+ * `userCode` is the shared matmul source from `makeMatMulSource()` followed by
+ * this program's definitions of the `mm_readA` / `mm_readB` / `mm_write` hooks
+ * and a `main` that calls `mm_matMul` with the dimension uniforms.
+ */
+export class MatMulProgram implements WebGPUProgram {
+  outputShape: number[];
+  userCode: string;
+  dispatch: [number, number, number];
+  variableNames = ['A', 'B'];
+  uniforms = 'uint dimAOuter, dimInner, dimBOuter, batch;';
+  tileSize: [number, number] = [16, 16];  // Must be square.
+
+  /**
+   * @param outputShape Shape of the result; entries 1 and 2 size the dispatch
+   *     grid. NOTE(review): presumably [batch, dimAOuter, dimBOuter] -- confirm
+   *     against the caller.
+   */
+  constructor(outputShape: [number, number, number]) {
+    this.outputShape = outputShape;
+    // One workgroup per tile of the output, rounded up so that partial tiles
+    // at the edges are still covered.
+    this.dispatch = [
+      Math.ceil(outputShape[1] / this.tileSize[0]),
+      Math.ceil(outputShape[2] / this.tileSize[1]), 1
+    ];
+
+    // The hooks below return 0.0 for out-of-range reads and write through
+    // setOutput using a row-major flat index.
+    this.userCode = `
+      ${makeMatMulSource()}
+
+      float mm_readA(uint row, uint col) {
+        if (row < dimAOuter && col < dimInner) {
+          return A[row * dimInner + col];
+        } else {
+          return 0.0;
+        }
+      }
+
+      float mm_readB(uint row, uint col) {
+        if (row < dimInner && col < dimBOuter) {
+          return B[row * dimBOuter + col];
+        } else {
+          return 0.0;
         }
       }
+
+      void mm_write(uint row, uint col, float value) {
+        setOutput(row * dimBOuter + col, value);
+      }
+
+      void main() {
+        mm_matMul(dimAOuter, dimInner, dimBOuter);
+      }
     `;
   }
-}
+}