[DEBUG] Remove has CUDA by mzzchy · Pull Request #153349 · pytorch/pytorch · GitHub
[go: up one dir, main page]

Skip to content

[DEBUG] Remove has CUDA #153349

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion aten/src/ATen/detail/MTIAHooksInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
FAIL_MTIAHOOKS_FUNC(__func__);
}

virtual PyObject* memorySnapshot() const {
// Returns a memory snapshot as a Python object. When `local_path` is
// provided, the snapshot is presumably also written to that file --
// TODO(review): confirm against the MTIA backend implementation.
// This default implementation fails (no MTIA hooks registered) and
// returns nullptr.
virtual PyObject* memorySnapshot(const std::optional<std::string>& local_path) const {
FAIL_MTIAHOOKS_FUNC(__func__);
return nullptr;
}
Expand Down
2 changes: 2 additions & 0 deletions buckbuild.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ THIRD_PARTY_LIBS = {
"ruy": ["//third-party/ruy:ruy_xplat_lib", "//third_party:ruy_lib"],
"sleef_arm": ["//third-party/sleef:sleef_arm", "//third_party:sleef_arm"],
"typing-extensions": ["//third-party/typing-extensions:typing-extensions", "//third_party:typing-extensions"],
"nlohmann-json": ["fbsource//third-party/nlohmann-json:nlohmann-json", "//third_party:nlohmann-json"],
}

def third_party(name):
Expand Down Expand Up @@ -1736,6 +1737,7 @@ def define_buck_targets(
deps = [
third_party("glog"),
third_party("kineto"),
third_party("nlohmann-json"),
],
exported_deps = [
":aten_cpu",
Expand Down
1 change: 1 addition & 0 deletions build_variables.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ libtorch_profiler_sources = [
"torch/csrc/profiler/collection.cpp",
"torch/csrc/profiler/data_flow.cpp",
"torch/csrc/profiler/kineto_shim.cpp",
"torch/csrc/mtia/profiler/MTIAMemoryProfiler.cpp",
"torch/csrc/profiler/kineto_client_interface.cpp",
"torch/csrc/profiler/orchestration/observer.cpp",
"torch/csrc/profiler/orchestration/python_tracer.cpp",
Expand Down
4 changes: 2 additions & 2 deletions torch/csrc/autograd/profiler_python.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1162,7 +1162,7 @@ class PythonMemoryTracer final : public python_tracer::PythonMemoryTracerBase {
~PythonMemoryTracer() override = default;
void start() override;
void stop() override;
void export_memory_history(const std::string path) override;
void export_memory_history(const std::string& path) override;
};

static void toggle_memory_tracing(bool enable) {
Expand Down Expand Up @@ -1197,7 +1197,7 @@ void PythonMemoryTracer::start() {
toggle_memory_tracing(true);
}

void PythonMemoryTracer::export_memory_history(const std::string path) {
void PythonMemoryTracer::export_memory_history(const std::string& path) {
PyGILState_STATE gil_state = PyGILState_Ensure();
THPObjectPtr torch_cuda_memory_module(
PyImport_ImportModule("torch.cuda.memory"));
Expand Down
3 changes: 2 additions & 1 deletion torch/csrc/mtia/Module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ void initModule(PyObject* module) {
});

m.def("_mtia_memorySnapshot", []() {
PyObject* raw_pyobject = at::detail::getMTIAHooks().memorySnapshot();
PyObject* raw_pyobject =
at::detail::getMTIAHooks().memorySnapshot(std::nullopt);
return py::reinterpret_steal<py::object>(raw_pyobject);
});

Expand Down
35 changes: 35 additions & 0 deletions torch/csrc/mtia/profiler/MTIAMemoryProfiler.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#include <ATen/Context.h>
#include <ATen/detail/MTIAHooksInterface.h>
#include <nlohmann/json.hpp>
#include <torch/csrc/mtia/profiler/MTIAMemoryProfiler.h>

using json = nlohmann::json;

namespace torch::mtia {

// Begin recording MTIA allocator memory history (via the device hooks)
// so that a snapshot can be exported later.
void MTIAMemoryProfiler::start() {
  // "all"/"all" selects recording of all event/context kinds; 150000 is
  // the maximum number of retained entries -- TODO(review): confirm the
  // exact parameter semantics against the MTIA hooks implementation.
  at::detail::getMTIAHooks().recordMemoryHistory("all", "all", 150000);
}

// Export the recorded memory history to `path`.
void MTIAMemoryProfiler::export_memory_history(const std::string& path) {
  // NOTE(review): memorySnapshot() returns a PyObject* which is discarded
  // here; confirm the hook returns nullptr (or no new reference) when a
  // path is supplied, otherwise this leaks a Python reference.
  at::detail::getMTIAHooks().memorySnapshot(path);
}

// Stop recording memory history; std::nullopt disables capture.
void MTIAMemoryProfiler::stop() {
  at::detail::getMTIAHooks().recordMemoryHistory(std::nullopt, "all", 0);
}

// Factory used by the profiler orchestration layer to create the tracer.
std::unique_ptr<torch::profiler::impl::python_tracer::PythonMemoryTracerBase>
getMemoryTracer() {
  return std::make_unique<MTIAMemoryProfiler>();
}

// Register the MTIA memory tracer with the profiler, but only when MTIA
// hooks are actually compiled into this binary.
void initMemoryProfiler() {
  if (at::detail::isMTIAHooksBuilt()) {
    torch::profiler::impl::python_tracer::registerMemoryTracer(
        &getMemoryTracer);
  }
}
} // namespace torch::mtia
20 changes: 20 additions & 0 deletions torch/csrc/mtia/profiler/MTIAMemoryProfiler.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#pragma once
#include <torch/csrc/profiler/orchestration/python_tracer.h>

namespace torch::mtia {
// Pull in only the name this header needs. A `using namespace` directive
// at header scope would leak every python_tracer name into all includers.
using torch::profiler::impl::python_tracer::PythonMemoryTracerBase;

// Registers the MTIA memory tracer with the profiler orchestration layer
// (no-op when MTIA hooks are not built into this binary).
void initMemoryProfiler();

// Factory returning a fresh MTIA memory tracer instance.
std::unique_ptr<PythonMemoryTracerBase> getMemoryTracer();

// Memory tracer backed by the MTIA device hooks: start()/stop() toggle
// allocator history recording; export_memory_history() dumps a snapshot
// to the given path.
class MTIAMemoryProfiler final : public PythonMemoryTracerBase {
 public:
  MTIAMemoryProfiler() = default;
  ~MTIAMemoryProfiler() override = default;
  void start() override;
  void stop() override;
  void export_memory_history(const std::string& path) override;
};

} // namespace torch::mtia
5 changes: 4 additions & 1 deletion torch/csrc/profiler/kineto_client_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <ATen/Context.h>
#include <libkineto.h>
#include <torch/csrc/autograd/profiler_kineto.h>
#include <torch/csrc/mtia/profiler/MTIAMemoryProfiler.h>
#include <torch/csrc/profiler/kineto_client_interface.h>
#include <chrono>
#include <thread>
Expand All @@ -23,7 +24,9 @@ using namespace torch::autograd::profiler;

class LibKinetoClient : public libkineto::ClientInterface {
public:
void init() override {}
// Kineto client initialization hook: installs the MTIA memory profiler
// (a no-op inside initMemoryProfiler() when MTIA hooks are not built).
void init() override {
::torch::mtia::initMemoryProfiler();
}

void prepare(
bool report_input_shapes = false,
Expand Down
2 changes: 1 addition & 1 deletion torch/csrc/profiler/orchestration/python_tracer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ struct NoOpMemoryPythonTracer : public PythonMemoryTracerBase {
~NoOpMemoryPythonTracer() override = default;
void start() override {}
void stop() override {}
void export_memory_history(const std::string path) override {}
void export_memory_history(const std::string&) override {}
};

} // namespace
Expand Down
2 changes: 1 addition & 1 deletion torch/csrc/profiler/orchestration/python_tracer.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ struct TORCH_API PythonMemoryTracerBase {

virtual void start() = 0;
virtual void stop() = 0;
virtual void export_memory_history(const std::string path) = 0;
virtual void export_memory_history(const std::string& path) = 0;
};

using MakeMemoryFn = std::unique_ptr<PythonMemoryTracerBase> (*)();
Expand Down
43 changes: 43 additions & 0 deletions torch/csrc/profiler/python/combined_traceback.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,49 @@ struct PythonTraceback : public CapturedTraceback::Python {

} // namespace

// Symbolize each captured traceback into a JSON array of frame objects
// ({"name", "filename", "line"} per frame). Repeated CapturedTraceback
// pointers are deduplicated so each unique traceback is symbolized only
// once; the output order matches `to_symbolize` element-for-element.
std::vector<nlohmann::json> json_symbolize(
    std::vector<CapturedTraceback*>& to_symbolize) {
  // Map each distinct traceback pointer to its index in `unique_frames`.
  std::unordered_map<CapturedTraceback*, uint64_t> cached_frames;
  std::vector<CapturedTraceback*> unique_frames;
  for (const auto& sc : to_symbolize) {
    auto it = cached_frames.find(sc);
    if (it == cached_frames.end()) {
      cached_frames.insert({sc, unique_frames.size()});
      unique_frames.push_back(sc);
    }
  }
  auto s = symbolize(unique_frames);

  std::string line_s = "line";
  std::string name_s = "name";
  std::string filename_s = "filename";
  std::vector<nlohmann::json> all_frames;
  // Preallocate: exactly one JSON object per symbolized frame.
  all_frames.reserve(s.all_frames.size());
  for (const auto& f : s.all_frames) {
    nlohmann::json d;
    d[name_s] = f.funcname;
    d[filename_s] = f.filename;
    d[line_s] = f.lineno;
    all_frames.emplace_back(std::move(d));
  }

  // One JSON array of frame objects per unique traceback.
  std::vector<nlohmann::json> py_unique_frames;
  py_unique_frames.reserve(s.tracebacks.size());
  for (const auto& t : s.tracebacks) {
    nlohmann::json l;
    for (const auto& e : t) {
      l.emplace_back(all_frames.at(e));
    }
    py_unique_frames.push_back(std::move(l));
  }

  // Expand back out to the caller's (possibly repeated) order.
  std::vector<nlohmann::json> result;
  result.reserve(to_symbolize.size());
  for (const auto& sc : to_symbolize) {
    result.push_back(py_unique_frames.at(cached_frames.at(sc)));
  }
  return result;
}

std::vector<py::object> py_symbolize(
std::vector<CapturedTraceback*>& to_symbolize) {
// we dedup repeated to_symbolize objects to prevent
Expand Down
5 changes: 5 additions & 0 deletions torch/csrc/profiler/python/combined_traceback.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <torch/csrc/profiler/combined_traceback.h>

#include <nlohmann/json.hpp>
#include <pybind11/pybind11.h>
#include <torch/csrc/utils/pybind.h>

Expand All @@ -14,6 +15,10 @@ namespace torch {
TORCH_API std::vector<pybind11::object> py_symbolize(
std::vector<CapturedTraceback*>& to_symbolize);

// Return the call stacks in JSON format so that they can be used within C++
TORCH_API std::vector<nlohmann::json> json_symbolize(
std::vector<CapturedTraceback*>& to_symbolize);

// requires GIL to be held, frees any pending free frames
TORCH_PYTHON_API void freeDeadCapturedTracebackFrames();

Expand Down
Loading
0