IBMZ-Linux-OSS-Python
diff --git a/‎third_party/xla/xla/backends/gpu/runtime/BUILD
Lines changed: 6 additions & 1 deletion b/‎third_party/xla/xla/backends/gpu/runtime/BUILD
Lines changed: 6 additions & 1 deletion
diff --git a/‎third_party/xla/xla/backends/gpu/runtime/kernel_thunk.cc
Lines changed: 36 additions & 15 deletions b/‎third_party/xla/xla/backends/gpu/runtime/kernel_thunk.cc
Lines changed: 36 additions & 15 deletions
diff --git a/‎third_party/xla/xla/backends/gpu/runtime/kernel_thunk.h
Lines changed: 3 additions & 0 deletions b/‎third_party/xla/xla/backends/gpu/runtime/kernel_thunk.h
Lines changed: 3 additions & 0 deletions
diff --git a/‎third_party/xla/xla/backends/gpu/runtime/kernel_thunk_test.cc
Lines changed: 53 additions & 3 deletions b/‎third_party/xla/xla/backends/gpu/runtime/kernel_thunk_test.cc
Lines changed: 53 additions & 3 deletions
diff --git a/‎third_party/xla/xla/backends/gpu/runtime/thunk.proto
Lines changed: 5 additions & 10 deletions b/‎third_party/xla/xla/backends/gpu/runtime/thunk.proto
Lines changed: 5 additions & 10 deletions
diff --git a/‎third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
Lines changed: 6 additions & 0 deletions b/‎third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
Lines changed: 6 additions & 0 deletions
@@ -788,6 +788,7 @@ cc_library(
     hdrs = ["kernel_thunk.h"],
     deps = [
         ":thunk",
+        "//xla:shape_util",
         "//xla:types",
         "//xla/codegen/emitters:kernel_arguments",
         "//xla/hlo/ir:hlo",
@@ -1860,9 +1861,12 @@ tf_proto_library(
         "thunk.proto",
     ],
     protodeps = [
+        # keep sorted
+        "//xla:xla_data_proto",
         "//xla/service:buffer_assignment_proto",
+        "//xla/service/gpu:launch_dimensions_proto",
+        "//xla/stream_executor:launch_dim_proto",
         "//xla/stream_executor/gpu:gpu_blas_lt_proto",
-        "//xla:xla_data_proto",
     ],
 )
 
@@ -1874,6 +1878,7 @@ cc_library(
         ":conditional_thunk",
         ":copy_thunk",
         ":gemm_thunk",
+        ":kernel_thunk",
         ":sequential_thunk",
         ":thunk",
         ":thunk_proto_cc",
 
@@ -22,6 +22,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 #include <variant>
+#include <vector>
 
 #include "absl/container/inlined_vector.h"
 #include "absl/log/check.h"
@@ -38,6 +39,7 @@ limitations under the License.
 #include "xla/service/gpu/kernels/custom_kernel.h"
 #include "xla/service/gpu/launch_dimensions.h"
 #include "xla/service/gpu/stream_executor_util.h"
+#include "xla/shape.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/gpu/tma_metadata.h"
 #include "xla/stream_executor/kernel.h"
@@ -53,16 +55,6 @@ namespace gpu {
 // KernelThunk
 //===----------------------------------------------------------------------===//
 
-namespace {
-Dim3DProto Dim3DToProto(const se::Dim3D& dim) {
-  Dim3DProto proto;
-  proto.set_x(dim.x);
-  proto.set_y(dim.y);
-  proto.set_z(dim.z);
-  return proto;
-}
-}  // namespace
-
 KernelThunk::KernelThunk(
     Thunk::ThunkInfo thunk_info, std::string kernel_name,
     absl::Span<const emitters::KernelArgument> kernel_arguments,
@@ -102,17 +94,46 @@ absl::StatusOr<ThunkProto> KernelThunk::ToProto() const {
     kernel_proto->add_written(written);
   }
   kernel_proto->set_kernel_name(kernel_name_);
-  *kernel_proto->mutable_launch_block_counts() =
-      Dim3DToProto(launch_dimensions_.block_counts());
-  *kernel_proto->mutable_launch_thread_counts_per_block() =
-      Dim3DToProto(launch_dimensions_.thread_counts_per_block());
+  *kernel_proto->mutable_launch_dimensions() = launch_dimensions_.ToProto();
   if (cluster_dim_) {
-    *kernel_proto->mutable_cluster_dim() = Dim3DToProto(*cluster_dim_);
+    *kernel_proto->mutable_cluster_dim() = cluster_dim_->ToProto();
   }
   kernel_proto->set_shmem_bytes(shmem_bytes_);
   return proto;
 }
 
+// Reconstructs a KernelThunk from its serialized `KernelThunkProto`.
+//
+// `thunk_info` is handed over to the KernelThunk constructor. Every entry of
+// `proto.args()` is resolved against `buffer_allocations` via
+// `BufferAllocation::Slice::FromProto`.
+//
+// Returns an InvalidArgumentError when `written` and `args` differ in size,
+// and propagates any error produced while deserializing the launch
+// dimensions, the cluster dimension, or an argument slice.
+absl::StatusOr<std::unique_ptr<KernelThunk>> KernelThunk::FromProto(
+    ThunkInfo thunk_info, const KernelThunkProto& proto,
+    absl::Span<const BufferAllocation> buffer_allocations) {
+  TF_ASSIGN_OR_RETURN(LaunchDimensions launch_dimensions,
+                      LaunchDimensions::FromProto(proto.launch_dimensions()));
+
+  // The cluster dimension is optional in the proto; leave it disengaged when
+  // the field is absent.
+  std::optional<stream_executor::ClusterDim> cluster_dim;
+  if (proto.has_cluster_dim()) {
+    TF_ASSIGN_OR_RETURN(
+        cluster_dim.emplace(),
+        stream_executor::ClusterDim::FromProto(proto.cluster_dim()));
+  }
+
+  // `written` is indexed in lockstep with `args` below, so both repeated
+  // fields must have the same number of entries.
+  if (proto.written().size() != proto.args().size()) {
+    return absl::InvalidArgumentError(
+        "Proto fields `written` and `args` need to have the same cardinality.");
+  }
+
+  std::vector<emitters::KernelArgument> arguments;
+  arguments.reserve(proto.args().size());
+  for (int i = 0; i < proto.args().size(); ++i) {
+    TF_ASSIGN_OR_RETURN(BufferAllocation::Slice slice,
+                        BufferAllocation::Slice::FromProto(proto.args().at(i),
+                                                           buffer_allocations));
+    const bool written = proto.written().at(i);
+    // KernelThunkProto carries no shape for its arguments, so a
+    // default-constructed Shape is used as a placeholder.
+    arguments.push_back(emitters::KernelArgument{Shape{}, slice, written});
+  }
+
+  // `thunk_info` was taken by value; move it into the thunk instead of copying
+  // it a second time.
+  return std::make_unique<KernelThunk>(std::move(thunk_info),
+                                       proto.kernel_name(), arguments,
+                                       launch_dimensions, cluster_dim,
+                                       proto.shmem_bytes());
+}
+
 absl::Status KernelThunk::Initialize(const InitializeParams& params) {
   absl::MutexLock lock(&mutex_);
 
 
@@ -84,6 +84,9 @@ class KernelThunk : public Thunk {
   std::string ToString(int indent) const override;
 
   absl::StatusOr<ThunkProto> ToProto() const override;
+  // Builds a KernelThunk from `proto`. The argument slices stored in `proto`
+  // are resolved against `buffer_allocations`. Returns an error status when
+  // `proto` cannot be deserialized.
+  static absl::StatusOr<std::unique_ptr<KernelThunk>> FromProto(
+      ThunkInfo thunk_info, const KernelThunkProto& proto,
+      absl::Span<const BufferAllocation> buffer_allocations);
 
   absl::Status Initialize(const InitializeParams& params) override;
   absl::Status ExecuteOnStream(const ExecuteParams& params) override;
 
@@ -15,10 +15,13 @@ limitations under the License.
 
 #include "xla/backends/gpu/runtime/kernel_thunk.h"
 
+#include <array>
+#include <memory>
 #include <optional>
 #include <vector>
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/strings/string_view.h"
 #include "xla/backends/gpu/runtime/thunk.h"
@@ -138,13 +141,60 @@ TEST(KernelThunkTest, ToProto) {
           written: false
           written: true
           kernel_name: "kernel123"
-          launch_block_counts { x: 32 y: 31 z: 30 }
-          launch_thread_counts_per_block { x: 256 y: 255 z: 254 }
-          cluster_dim { x: 8 y: 7 z: 6 }
+          launch_dimensions {
+            block_counts { coordinates { x: 32 y: 31 z: 30 } }
+            thread_counts_per_block { coordinates { x: 256 y: 255 z: 254 } }
+          }
+          cluster_dim { coordinates { x: 8 y: 7 z: 6 } }
           shmem_bytes: 1024
         }
       )pb"));
 }
 
+// Serializes a KernelThunk with ToProto(), rebuilds it with FromProto() and
+// checks that the observable properties of the thunk survive the round trip.
+TEST(KernelThunkTest, ToAndFromProto) {
+  Thunk::ThunkInfo thunk_info;
+  thunk_info.profile_annotation = "DotGeneral";
+  thunk_info.execution_stream_id = 123;
+
+  // Each allocation carries the index of its position in the array, so the two
+  // slices below refer to distinguishable allocations. If both used index 0
+  // the round trip could not detect an argument resolved against the wrong
+  // allocation.
+  std::array allocations{
+      BufferAllocation{/*index=*/0, /*size=*/1024, /*color=*/0},
+      BufferAllocation{/*index=*/1, /*size=*/256, /*color=*/0}};
+
+  // Note that slices keep a pointer to the allocation. Therefore `allocations`
+  // shouldn't be mutated afterwards.
+  BufferAllocation::Slice slice0(&allocations.at(0), /*offset=*/0,
+                                 /*size=*/1024);
+  BufferAllocation::Slice slice1(&allocations.at(1), /*offset=*/0,
+                                 /*size=*/256);
+
+  std::vector<emitters::KernelArgument> kernel_arguments = {
+      emitters::KernelArgument(ShapeUtil::MakeShape(F32, {1024}), slice0,
+                               /*written=*/false),
+      emitters::KernelArgument(ShapeUtil::MakeShape(F32, {256}), slice1,
+                               /*written=*/true)};
+
+  LaunchDimensions launch_dimensions(se::BlockDim(32, 31, 30),
+                                     se::ThreadDim(256, 255, 254));
+  se::ClusterDim cluster_dim(8, 7, 6);
+  constexpr absl::string_view kKernelName = "kernel123";
+  constexpr int kSharedMemoryBytes = 1024;
+  KernelThunk thunk(thunk_info, std::string{kKernelName}, kernel_arguments,
+                    launch_dimensions, cluster_dim, kSharedMemoryBytes,
+                    /*tma_metadata=*/std::nullopt);
+  TF_ASSERT_OK_AND_ASSIGN(ThunkProto proto, thunk.ToProto());
+  ASSERT_TRUE(proto.has_kernel_thunk());
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<KernelThunk> reconstructed_thunk,
+      KernelThunk::FromProto(thunk_info, proto.kernel_thunk(), allocations));
+
+  EXPECT_THAT(reconstructed_thunk->cluster_dim(), cluster_dim);
+  EXPECT_THAT(reconstructed_thunk->kernel_name(), kKernelName);
+  EXPECT_THAT(reconstructed_thunk->launch_dimensions(), launch_dimensions);
+  EXPECT_THAT(reconstructed_thunk->shmem_bytes(), kSharedMemoryBytes);
+  EXPECT_THAT(reconstructed_thunk->written(),
+              ::testing::ElementsAre(false, true));
+  EXPECT_THAT(reconstructed_thunk->arguments(),
+              ::testing::ElementsAre(slice0, slice1));
+}
 }  // namespace
 }  // namespace xla::gpu
@@ -18,7 +18,9 @@ syntax = "proto3";
 package xla.gpu;
 
 import "xla/service/buffer_assignment.proto";
+import "xla/service/gpu/launch_dimensions.proto";
 import "xla/stream_executor/gpu/gpu_blas_lt.proto";
+import "xla/stream_executor/launch_dim.proto";
 import "xla/xla_data.proto";
 
 // Contains basic pieces of information that every thunk type has.
@@ -55,20 +57,13 @@ message WhileThunkProto {
   optional int64 trip_count = 4;
 }
 
-message Dim3DProto {
-  int64 x = 1;
-  int64 y = 2;
-  int64 z = 3;
-}
-
 message KernelThunkProto {
   repeated xla.buffer_assignment.BufferAllocationSliceProto args = 1;
+  // Must contain exactly one entry per element of `args`;
+  // KernelThunk::FromProto rejects protos where the two sizes differ.
   repeated bool written = 2;
   string kernel_name = 3;
-  Dim3DProto launch_block_counts = 4;
-  Dim3DProto launch_thread_counts_per_block = 5;
-  optional Dim3DProto cluster_dim = 6;
-  int64 shmem_bytes = 7;
+  LaunchDimensionsProto launch_dimensions = 4;
+  // Only set when the thunk has a cluster dimension.
+  // NOTE(review): `cluster_dim` and `shmem_bytes` changed field numbers
+  // (6 -> 5 and 7 -> 6) and field 4 changed its message type; confirm that no
+  // protos serialized with the previous layout need to stay readable.
+  optional stream_executor.ClusterDimProto cluster_dim = 5;
+  int64 shmem_bytes = 6;
 }
 
 message GemmThunkProto {
 
@@ -24,6 +24,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/conditional_thunk.h"
 #include "xla/backends/gpu/runtime/copy_thunk.h"
 #include "xla/backends/gpu/runtime/gemm_thunk.h"
+#include "xla/backends/gpu/runtime/kernel_thunk.h"
 #include "xla/backends/gpu/runtime/sequential_thunk.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/triangular_solve_thunk.h"
@@ -90,6 +91,11 @@ absl::StatusOr<std::unique_ptr<Thunk>> DeserializeThunkProto(
                                            thunk_proto.triangular_solve_thunk(),
                                            buffer_allocations);
   }
+
+  if (thunk_proto.has_kernel_thunk()) {
+    return KernelThunk::FromProto(
+        std::move(thunk_info), thunk_proto.kernel_thunk(), buffer_allocations);
+  }
   return absl::InvalidArgumentError("Unknown thunk type found in ThunkProto.");
 }