webgpu: uniforms, makeBindGroup/Layout, non-square tiles (#1689) · tensorflow/tfjs-core@7f26888 · GitHub
[go: up one dir, main page]

Skip to content
This repository was archived by the owner on Aug 15, 2019. It is now read-only.

Commit 7f26888

Browse files
kainino0x and annxingyuan
authored and committed
webgpu: uniforms, makeBindGroup/Layout, non-square tiles (#1689)
INTERNAL
1 parent 95a2139 commit 7f26888

File tree

4 files changed

+170
-104
lines changed

4 files changed

+170
-104
lines changed

src/backends/webgpu/src/backend_webgpu.ts

Lines changed: 66 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
import './flags_webgpu';
2121

22-
import {DataMover, DataType, ENV, KernelBackend, Rank, ShapeMap, Tensor, tensor1d, Tensor3D, util} from '@tensorflow/tfjs-core';
22+
import {DataMover, DataType, ENV, KernelBackend, Rank, ShapeMap, Tensor, Tensor3D, util} from '@tensorflow/tfjs-core';
2323
import * as shaderc from '@webgpu/shaderc';
2424

2525
import * as binary_op from './kernels/binary_op_webgpu';
@@ -32,8 +32,7 @@ import * as webgpu_program from './kernels/webgpu_program';
3232
import {WebGPUBinary} from './kernels/webgpu_program';
3333

3434
type TensorInfo = {
35-
shape: number[],
36-
dtype: DataType,
35+
byteSize: number,
3736
values: Float32Array|Int32Array|Uint8Array,
3837
id: number,
3938
buffer: GPUBuffer
@@ -75,28 +74,31 @@ export class WebGPUBackend extends KernelBackend {
7574
private tensorMap = new WeakMap<DataId, TensorInfo>();
7675

7776
disposeData(dataId: DataId): void {
78-
// Tensor disposal logic.
77+
if (!this.tensorMap.has(dataId)) {
78+
throw new Error(`Tensor ${dataId} was not registered!`);
79+
}
80+
81+
const info = this.tensorMap.get(dataId);
82+
this.destroyBuffer(info.byteSize, info.buffer);
7983
}
8084

81-
private createBuffer(size: number) {
82-
return this.device.createBuffer({
83-
size,
84-
usage: GPUBufferUsage.TRANSFER_SRC | GPUBufferUsage.TRANSFER_DST |
85-
GPUBufferUsage.STORAGE,
86-
});
85+
private createBuffer(
86+
size: number,
87+
usage: GPUBufferUsage = GPUBufferUsage.STORAGE |
88+
GPUBufferUsage.TRANSFER_SRC | GPUBufferUsage.TRANSFER_DST) {
89+
return this.device.createBuffer({size, usage});
8790
}
8891

89-
private setBufferData(
90-
buffer: GPUBuffer, data: Float32Array|Int32Array|Uint8Array) {
91-
buffer.setSubData(0, data);
92+
private destroyBuffer(byteSize: number, buffer: GPUBuffer) {
93+
// TODO: recycle deleted buffers
94+
buffer.destroy();
9295
}
9396

9497
register(dataId: object, shape: number[], dtype: DataType): void {
9598
if (!this.tensorMap.has(dataId)) {
96-
const buffer = this.createBuffer(
97-
util.sizeFromShape(shape) * util.bytesPerElement(dtype));
98-
99-
this.tensorMap.set(dataId, {shape, dtype, values: null, id: -1, buffer});
99+
const byteSize = util.sizeFromShape(shape) * util.bytesPerElement(dtype);
100+
const buffer = this.createBuffer(byteSize);
101+
this.tensorMap.set(dataId, {byteSize, values: null, id: -1, buffer});
100102
}
101103
}
102104

@@ -107,7 +109,7 @@ export class WebGPUBackend extends KernelBackend {
107109

108110
const info = this.tensorMap.get(dataId);
109111
info.values = values;
110-
this.setBufferData(info.buffer, values);
112+
info.buffer.setSubData(0, values);
111113
this.tensorMap.set(dataId, info);
112114
}
113115

@@ -118,15 +120,11 @@ export class WebGPUBackend extends KernelBackend {
118120
}
119121

120122
private async getBufferData(info: TensorInfo): Promise<ArrayBuffer> {
121-
const size =
122-
util.sizeFromShape(info.shape) * util.bytesPerElement(info.dtype);
123-
const staging = this.device.createBuffer({
124-
size,
125-
usage: GPUBufferUsage.TRANSFER_DST | GPUBufferUsage.MAP_READ,
126-
});
123+
const staging = this.createBuffer(
124+
info.byteSize, GPUBufferUsage.TRANSFER_DST | GPUBufferUsage.MAP_READ);
127125
{
128126
const encoder = this.device.createCommandEncoder({});
129-
encoder.copyBufferToBuffer(info.buffer, 0, staging, 0, size);
127+
encoder.copyBufferToBuffer(info.buffer, 0, staging, 0, info.byteSize);
130128
this.commandQueue.push(encoder);
131129
this.submitQueue();
132130
}
@@ -158,36 +156,40 @@ export class WebGPUBackend extends KernelBackend {
158156
return Tensor.make(shape, {}, dtype, this) as T;
159157
}
160158

159+
private tensorToBinding(tensor?: Tensor): webgpu_program.BindingInfo {
160+
if (!tensor) {
161+
return null;
162+
}
163+
164+
const tensorData = this.tensorMap.get(tensor.dataId);
165+
166+
return {
167+
resource: {
168+
offset: 0,
169+
size: tensor.size * util.bytesPerElement(tensor.dtype),
170+
buffer: tensorData.buffer
171+
}
172+
};
173+
}
174+
161175
private compileAndRun<
162176
K extends {dtype: DataType, size: number, dataId: {}, shape: number[]}>(
163-
program: webgpu_program.WebGPUProgram, inputs: Tensor[],
164-
output?: Tensor): K {
177+
program: webgpu_program.WebGPUProgram, inputs: Tensor[], output?: Tensor,
178+
uniforms?: webgpu_program.BindingInfo): K {
165179
if (output == null) {
166180
output = this.makeOutputArray(program.outputShape, inputs[0].dtype);
167181
}
168182
const key = webgpu_program.makeShaderKey(program);
169183
const {bindGroupLayout, pipeline} = this.getAndSavePipeline(key, () => {
170184
return webgpu_program.compileProgram(
171185
this.compiler, this.shaderc.shader_kind.compute, this.compileOpts,
172-
this.device, program, inputs, output);
186+
this.device, program, inputs, output, uniforms);
173187
});
174188

175189
// Creating bind groups on the fly should never be a bottleneck.
176-
const bg = this.device.createBindGroup({
177-
layout: bindGroupLayout,
178-
bindings: inputs.concat(output).map((tensor, i: number) => {
179-
const tensorData = this.tensorMap.get(tensor.dataId);
180-
181-
return {
182-
binding: i,
183-
resource: {
184-
offset: 0,
185-
size: tensor.size * util.bytesPerElement(tensor.dtype),
186-
buffer: tensorData.buffer
187-
}
188-
};
189-
})
190-
});
190+
const bg = webgpu_program.makeBindGroup(
191+
this.device, bindGroupLayout, inputs.map(t => this.tensorToBinding(t)),
192+
this.tensorToBinding(output), uniforms);
191193

192194
const encoder = this.device.createCommandEncoder({});
193195
const pass = encoder.beginComputePass();
@@ -204,6 +206,17 @@ export class WebGPUBackend extends KernelBackend {
204206
return output as {} as K;
205207
}
206208

209+
private makeUniforms(data: Uint32Array): webgpu_program.BindingInfo {
210+
const dimensionsBuffer = this.createBuffer(
211+
data.byteLength,
212+
GPUBufferUsage.TRANSFER_DST | GPUBufferUsage.UNIFORM);
213+
dimensionsBuffer.setSubData(0, data);
214+
215+
return {
216+
resource: {offset: 0, size: data.byteLength, buffer: dimensionsBuffer}
217+
};
218+
}
219+
207220
pad<T extends Tensor>(
208221
x: T, paddings: Array<[number, number]>, constantValue: number): T {
209222
const program = new PadProgram(x.shape, paddings, constantValue);
@@ -244,12 +257,17 @@ export class WebGPUBackend extends KernelBackend {
244257
const output =
245258
Tensor.make([batch, outerShapeA, outerShapeB], {}, a.dtype, this) as
246259
Tensor3D;
247-
248260
const program = new MatMulProgram(output.shape);
249-
const dimensions =
250-
tensor1d([outerShapeA, sharedDim, outerShapeB, batch], 'int32');
251-
// TODO: dispose mnkb
252261

253-
return this.compileAndRun(program, [a, b, dimensions], output) as Tensor3D;
262+
const dimensionsData =
263+
new Uint32Array([outerShapeA, sharedDim, outerShapeB, batch]);
264+
const dimensions = this.makeUniforms(dimensionsData);
265+
266+
const result =
267+
this.compileAndRun(program, [a, b], output, dimensions) as Tensor3D;
268+
269+
this.destroyBuffer(dimensionsData.byteLength, dimensions.resource.buffer);
270+
271+
return result;
254272
}
255273
}

src/backends/webgpu/src/kernels/matmul_webgpu.ts

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -21,53 +21,51 @@ export class MatMulProgram implements WebGPUProgram {
2121
outputShape: number[];
2222
userCode: string;
2323
dispatch: [number, number, number];
24-
variableNames = ['A', 'B', 'Dimensions'];
25-
tileSize = 8;
24+
variableNames = ['A', 'B'];
25+
uniforms = 'uint dimAOuter, dimInner, dimBOuter, batch;';
26+
tileSize: [number, number] = [16, 16]; // Must be square.
2627

2728
constructor(outputShape: [number, number, number]) {
2829
this.outputShape = outputShape;
2930
this.dispatch = [
30-
Math.ceil(outputShape[1] / this.tileSize),
31-
Math.ceil(outputShape[2] / this.tileSize), 1
31+
Math.ceil(outputShape[1] / this.tileSize[0]),
32+
Math.ceil(outputShape[2] / this.tileSize[1]), 1
3233
];
3334

3435
this.userCode = `
35-
shared float Asub[TileSize][TileSize];
36-
shared float Bsub[TileSize][TileSize];
36+
shared float Asub[TileSize.x][TileSize.x];
37+
shared float Bsub[TileSize.x][TileSize.x];
3738
3839
void main() {
39-
// M is A outer, N is shared, K is B outer
40-
uint M = Dimensions[0], N = Dimensions[1],
41-
K = Dimensions[2], batch = Dimensions[3];
42-
uint row = gl_LocalInvocationID.x; // Local row ID (max: TileSize)
43-
uint col = gl_LocalInvocationID.y; // Local col ID (max: TileSize)
44-
uint globalRow = TileSize*gl_WorkGroupID.x + row; // Row ID of C (0..M)
45-
uint globalCol = TileSize*gl_WorkGroupID.y + col; // Col ID of C (0..N)
40+
uint localRow = gl_LocalInvocationID.x; // < TileSize.x
41+
uint localCol = gl_LocalInvocationID.y; // < TileSize.x
42+
uint globalRow = TileSize.x*gl_WorkGroupID.x + localRow; // < dimAOuter
43+
uint globalCol = TileSize.x*gl_WorkGroupID.y + localCol; // < dimInner
4644
4745
float acc = 0.0;
4846
49-
uint numTiles = (N - 1)/TileSize + 1;
47+
uint numTiles = (dimInner - 1) / TileSize.x + 1;
5048
5149
for (uint t=0; t<numTiles; t++) {
5250
// Load one tile of A and B into local memory
53-
uint tiledRow = TileSize*t + row;
54-
uint tiledCol = TileSize*t + col;
55-
Asub[row][col] = A[globalRow*N + tiledCol];
56-
Bsub[row][col] = B[tiledRow*K + globalCol];
51+
uint tiledACol = TileSize.x*t + localCol;
52+
uint tiledBRow = TileSize.x*t + localRow;
53+
Asub[localRow][localCol] = A[globalRow * dimInner + tiledACol];
54+
Bsub[localRow][localCol] = B[tiledBRow * dimBOuter + globalCol];
5755
5856
// Synchronise to make sure the tile is loaded
5957
barrier();
6058
61-
for (uint k=0; k<TileSize; k++) {
62-
acc += Asub[row][k] * Bsub[k][col];
59+
for (uint k=0; k<TileSize.x; k++) {
60+
acc += Asub[localRow][k] * Bsub[k][localCol];
6361
}
6462
6563
// Synchronise before loading the next tile
6664
barrier();
6765
}
6866
69-
if(globalCol < K && globalRow < M) {
70-
setOutput(globalRow*K + globalCol, acc);
67+
if (globalCol < dimBOuter && globalRow < dimAOuter) {
68+
setOutput(globalRow * dimBOuter + globalCol, acc);
7169
}
7270
}
7371
`;

src/backends/webgpu/src/kernels/webgpu_program.ts

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ export interface WebGPUProgram {
2626
// Dispatch determines the layout of thread groups.
2727
dispatch: [number, number, number];
2828
variableNames: string[];
29-
tileSize?: number;
29+
uniforms?: string;
30+
tileSize?: [number, number?, number?];
3031
}
3132

3233
export interface WebGPUBinary {
@@ -38,35 +39,62 @@ export interface TensorData {
3839
dtype: DataType;
3940
}
4041

42+
export interface BindingInfo {
43+
resource: {offset: number, size: number, buffer: GPUBuffer};
44+
}
45+
46+
export const makeBindGroup =
47+
(device: GPUDevice, bindGroupLayout: GPUBindGroupLayout,
48+
inputs: BindingInfo[], output: BindingInfo, uniforms?: BindingInfo) => {
49+
const bindings = [output, ...inputs];
50+
if (uniforms) {
51+
bindings.push(uniforms);
52+
}
53+
return device.createBindGroup({
54+
layout: bindGroupLayout,
55+
bindings: bindings.map((b, i) => ({binding: i, resource: b.resource})),
56+
});
57+
};
58+
59+
const makeBindGroupLayout =
60+
(device: GPUDevice, inputs: Tensor[], output: Tensor,
61+
uniforms?: BindingInfo): GPUBindGroupLayout => {
62+
const bindings = Array(1 + inputs.length).fill({
63+
visibility: GPUShaderStageBit.COMPUTE,
64+
type: 'storage-buffer' as GPUBindingType
65+
});
66+
if (uniforms) {
67+
bindings.push({
68+
visibility: GPUShaderStageBit.COMPUTE,
69+
type: 'uniform-buffer' as GPUBindingType
70+
});
71+
}
72+
return device.createBindGroupLayout({
73+
bindings: bindings.map((b, i) => ({binding: i, ...b})),
74+
});
75+
};
76+
4177
export const compileProgram =
4278
(shaderCompiler: shaderc.Compiler, shaderKind: shaderc.ShaderKind,
4379
compileOptions: shaderc.CompileOptions, device: GPUDevice,
44-
program: WebGPUProgram, inputs: Tensor[],
45-
output: Tensor): WebGPUBinary => {
46-
const bindings =
47-
inputs.concat(output).map((input: Tensor, idx: number) => {
48-
return {
49-
binding: idx,
50-
visibility: GPUShaderStageBit.COMPUTE,
51-
type: 'storage-buffer'
52-
} as GPUBindGroupLayoutBinding;
53-
});
80+
program: WebGPUProgram, inputs: Tensor[], output: Tensor,
81+
uniforms?: BindingInfo): WebGPUBinary => {
5482
const inputsData = inputs.map((input: Tensor) => {
5583
return {dtype: input.dtype, shape: input.shape};
5684
});
5785
const outputData = {dtype: output.dtype, shape: output.shape};
5886

59-
const source = shader_preprocessor.makeShader(
60-
inputsData, program.variableNames, outputData, program.userCode,
61-
program.tileSize);
87+
const source =
88+
shader_preprocessor.makeShader(inputsData, outputData, program);
6289
const result = shaderCompiler.CompileGlslToSpv(
6390
source, shaderKind, 'file', 'main', compileOptions);
6491
const error = result.GetErrorMessage();
6592
if (error.length) {
6693
throw new Error(`Shader compilation failed: ${error}`);
6794
}
95+
const bindGroupLayout =
96+
makeBindGroupLayout(device, inputs, output, uniforms);
6897
const code = result.GetBinary();
69-
const bindGroupLayout = device.createBindGroupLayout({bindings});
7098
const layout =
7199
device.createPipelineLayout({bindGroupLayouts: [bindGroupLayout]});
72100
const module = device.createShaderModule({code});
@@ -79,4 +107,4 @@ export const compileProgram =
79107
export function makeShaderKey(program: WebGPUProgram): string {
80108
const key = program.userCode;
81109
return key;
82-
};
110+
}

0 commit comments

Comments (0)