8000 [10/N] Fix extra warnings brought by clang-tidy-17 (#139385) · pytorch/pytorch@7f387fa · GitHub
[go: up one dir, main page]

Skip to content

Commit 7f387fa

Browse files
cyyever authored and pytorchmergebot committed
[10/N] Fix extra warnings brought by clang-tidy-17 (#139385)
Fixes #ISSUE_NUMBER Pull Request resolved: #139385 Approved by: https://github.com/Skylion007
1 parent 3242049 commit 7f387fa

21 files changed

+53
-43
lines changed

.lintrunner.toml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,9 @@ exclude_patterns = [
224224
'**/fb/**',
225225
'**/generated/**',
226226
'**/*pb.h',
227+
'**/*inl.h',
228+
'aten/src/ATen/CPUFixedAllocator.h',
229+
'aten/src/ATen/Parallel*.h',
227230
'c10/xpu/**/*.h',
228231
'c10/xpu/**/*.cpp',
229232
'c10/benchmark/intrusive_ptr_benchmark.cpp',
@@ -236,15 +239,12 @@ exclude_patterns = [
236239
'c10/util/strong_type.h',
237240
'c10/util/SmallVector.h',
238241
'c10/util/win32-headers.h',
239-
'c10/util/*inl.h',
240242
'c10/test/**/*.h',
241243
'third_party/**/*',
242244
'torch/csrc/api/include/torch/nn/modules/common.h',
243245
'torch/csrc/api/include/torch/linalg.h',
244-
'torch/csrc/api/include/torch/nn/pimpl-inl.h',
245246
'torch/csrc/autograd/generated/**',
246247
'torch/csrc/distributed/**/*.cu',
247-
'torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h',
248248
'torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp',
249249
'torch/csrc/distributed/c10d/WinSockUtils.hpp',
250250
'torch/csrc/distributed/c10d/quantization/quantization_gpu.h',
@@ -253,7 +253,6 @@ exclude_patterns = [
253253
'torch/csrc/jit/**/*',
254254
'torch/csrc/jit/serialization/mobile_bytecode_generated.h',
255255
'torch/csrc/utils/pythoncapi_compat.h',
256-
'torch/csrc/utils/throughput_benchmark-inl.h',
257256
]
258257
init_command = [
259258
'python3',

aten/src/ATen/Version.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@ std::string get_mkl_version() {
2424
{
2525
// Magic buffer number is from MKL documentation
2626
// https://software.intel.com/en-us/mkl-developer-reference-c-mkl-get-version-string
27-
char buf[198];
28-
mkl_get_version_string(buf, 198);
29-
version = buf;
27+
version.resize(198,'\0');
28+
mkl_get_version_string(version.data(), 198);
29+
version.resize(strlen(version.c_str()));
3030
}
3131
#else
3232
version = "MKL not found";

aten/src/ATen/WrapDimUtils.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ inline int64_t maybe_wrap_dim(
3535
// if necessary
3636
return dim;
3737
}
38-
return maybe_wrap_dim(dim, tensor_sizes[0].size());
38+
return maybe_wrap_dim(dim, static_cast<int64_t>(tensor_sizes[0].size()));
3939
}
4040

4141
// Given an array of dimensions `dims` of length `ndims`, this function "Wraps"

aten/src/ATen/core/ivalue.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1360,6 +1360,7 @@ struct TORCH_API IValue final {
13601360
Payload(Payload&&) = delete;
13611361
Payload& operator=(const Payload&) = delete;
13621362
Payload& operator=(Payload&&) = delete;
1363+
// NOLINTNEXTLINE(modernize-use-equals-default)
13631364
~Payload() {}
13641365
};
13651366

aten/src/ATen/cpu/vml.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,12 +108,12 @@ static_assert(
108108
#define IMPLEMENT_VML_MKL_STUB(op, mklop, type, mkltype) \
109109
template <> \
110110
inline void v##op(type * out, const type * in, int64_t size) { \
111-
int64_t max_mkl_ind = std::numeric_limits<MKL_INT>::max(); \
111+
auto constexpr max_mkl_ind = std::numeric_limits<MKL_INT>::max(); \
112112
if (size <= static_cast<int64_t>(max_mkl_ind)) { \
113113
vm##mkltype##mklop( \
114114
size, in, out, VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE); \
115115
} else { \
116-
MKL_INT ind = 0; \
116+
int64_t ind = 0; \
117117
int64_t chunks = size / max_mkl_ind; \
118118
int64_t rest = size % max_mkl_ind; \
119119
for (; ind < chunks; ind++) { \

aten/src/ATen/cuda/CUDAGraph.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
8282
// in a capture to run on the same device, but this is a limitation of CUDAGraph,
8383
// not CUDA itself. We can straightforwardly modify CUDAGraph to support multi-device
8484
// captures if needed.
85-
int capture_dev_{};
85+
c10::DeviceIndex capture_dev_{};
8686
};
8787

8888
} // namespace cuda

aten/src/ATen/cuda/CachingHostAllocator.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ struct CUDACachingHostAllocatorImpl
9898
pinned_use_cuda_host_register()) {
9999
void* ptr = block->ptr_;
100100
AT_CUDA_CHECK(cudaHostUnregister(ptr));
101+
// NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
101102
std::free(ptr);
102103
} else {
103104
AT_CUDA_CHECK(cudaFreeHost(block->ptr_));
@@ -136,8 +137,8 @@ struct CUDACachingHostAllocatorImpl
136137

137138
TaskThreadPool* getThreadPool() {
138139
static TaskThreadPool* pool = new TaskThreadPool(
139-
c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
140-
pinned_max_register_threads());
140+
static_cast<int>(c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
141+
pinned_max_register_threads()));
141142
return pool;
142143
}
143144

@@ -157,6 +158,7 @@ struct CUDACachingHostAllocatorImpl
157158
uintptr_t alignedStart =
158159
(((uintptr_t)start + pageSize - 1) & ~(pageSize - 1));
159160
for (uintptr_t p = alignedStart; p < ((uintptr_t)end); p += pageSize) {
161+
// NOLINTNEXTLINE(performance-no-int-to-ptr)
160162
memset((void*)p, 0, 1);
161163
}
162164
}
@@ -180,6 +182,7 @@ struct CUDACachingHostAllocatorImpl
180182
// Here we do regular allocation, pre-fault/map the pages, and then do
181183
// cudaHostRegister with GPU mapping flags to lock the pages, so we
182184
// can minimize the cost for the cuda global lock.
185+
// NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
183186
*ptr = std::malloc(roundSize);
184187

185188
// Parallelize the mapping/registering of pages to reduce wall time

aten/src/ATen/cuda/tunable/GemmCommon.h

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,7 @@ static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t siz
8282

8383
template <typename T>
8484
struct GemmParams : OpParams {
85-
GemmParams() {
86-
duplicate_inputs_ = false;
87-
}
85+
GemmParams() = default;
8886

8987
std::string Signature() const override {
9088
return fmt::sprintf("%c%c_%ld_%ld_%ld", transa, transb, m, n, k);
@@ -140,7 +138,9 @@ struct GemmParams : OpParams {
140138
void Delete() {
141139
c10::cuda::CUDACachingAllocator::raw_delete(c);
142140
if (duplicate_inputs_) {
141+
// NOLINTNEXTLINE(*const-cast*)
143142
c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(a));
143+
// NOLINTNEXTLINE(*const-cast*)
144144
c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(b));
145145
}
146146
}
@@ -164,7 +164,7 @@ struct GemmParams : OpParams {
164164
T* c;
165165
int64_t ldc;
166166
private:
167-
bool duplicate_inputs_;
167+
bool duplicate_inputs_{false};
168168
};
169169

170170
template <typename T>
@@ -248,14 +248,14 @@ struct GemmAndBiasParams : OpParams {
248248
const T* bias;
249249
at::cuda::blas::GEMMAndBiasActivationEpilogue activation;
250250
private:
251-
bool duplicate_inputs_;
251+
bool duplicate_inputs_{false};
252252
};
253253

254254
template <typename T>
255255
struct GemmStridedBatchedParams : OpParams {
256-
GemmStridedBatchedParams() {
257-
duplicate_inputs_ = false;
258-
}
256+
GemmStridedBatchedParams() = default;
257+
GemmStridedBatchedParams(const GemmStridedBatchedParams&) = default;
258+
GemmStridedBatchedParams& operator=(const GemmStridedBatchedParams&) = default;
259259

260260
std::string Signature() const override {
261261
return fmt::sprintf("%c%c_%ld_%ld_%ld_B_%ld", transa, transb, m, n, k, batch);
@@ -300,7 +300,9 @@ struct GemmStridedBatchedParams : OpParams {
300300
if (duplicate_inputs) {
301301
size_t a_size = GetSizeA();
302302
size_t b_size = GetSizeB();
303+
// NOLINTNEXTLINE(*const-cast*)
303304
copy->a = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(a_size));
305+
// NOLINTNEXTLINE(*const-cast*)
304306
copy->b = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(b_size));
305307
copy->duplicate_inputs_ = true;
306308
}
@@ -311,7 +313,9 @@ struct GemmStridedBatchedParams : OpParams {
311313
void Delete() {
312314
c10::cuda::CUDACachingAllocator::raw_delete(c);
313315
if (duplicate_inputs_) {
316+
// NOLINTNEXTLINE(*const-cast*)
314317
c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(a));
318+
// NOLINTNEXTLINE(*const-cast*)
315319
c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(b));
316320
}
317321
}
@@ -339,14 +343,12 @@ struct GemmStridedBatchedParams : OpParams {
339343
int64_t stride_c;
340344
int64_t batch;
341345
private:
342-
bool duplicate_inputs_;
346+
bool duplicate_inputs_{false};
343347
};
344348

345349
template <typename T>
346350
struct ScaledGemmParams : OpParams {
347-
ScaledGemmParams() {
348-
duplicate_inputs_ = false;
349-
}
351+
ScaledGemmParams() = default;
350352

351353
std::string Signature() const override {
352354
return fmt::sprintf("%c%c_%ld_%ld_%ld", transa, transb, m, n, k);
@@ -402,7 +404,9 @@ struct ScaledGemmParams : OpParams {
402404
void Delete() {
403405
c10::cuda::CUDACachingAllocator::raw_delete(c);
404406
if (duplicate_inputs_) {
407+
// NOLINTNEXTLINE(*const-cast*)
405408
c10::cuda::CUDACachingAllocator::raw_delete(const_cast<void*>(a));
409+
// NOLINTNEXTLINE(*const-cast*)
406410
c10::cuda::CUDACachingAllocator::raw_delete(const_cast<void*>(b));
407411
}
408412
}
@@ -433,7 +437,7 @@ struct ScaledGemmParams : OpParams {
433437
void* amax_ptr;
434438
bool use_fast_accum;
435439
private:
436-
bool duplicate_inputs_;
440+
bool duplicate_inputs_{false};
437441
};
438442

439443
} // namespace at::cuda::tunable

aten/src/ATen/cuda/tunable/TunableOp.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,6 @@ class TunableOp {
284284
};
285285

286286
struct OpParams {
287-
OpParams() = default;
288287
virtual ~OpParams() = default;
289288
virtual std::string Signature() const = 0;
290289
};

aten/src/ATen/functorch/DynamicLayer.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ DynamicLayer popDynamicLayer() {
232232

233233
int64_t pushDynamicLayer(DynamicLayer&& dynamic_layer) {
234234
auto& dynamicLayerStack = dynamicLayerStackAccessor();
235-
int64_t layerId = 1 + dynamicLayerStack.size();
235+
int64_t layerId = static_cast<int64_t>(1 + dynamicLayerStack.size());
236236
TORCH_INTERNAL_ASSERT(layerId == dynamic_layer.layerId());
237237
dynamicLayerStack.emplace_back(std::move(dynamic_layer));
238238

@@ -256,7 +256,7 @@ int64_t initAndPushDynamicLayer(
256256
std::optional<bool> prev_fwd_grad_mode,
257257
std::optional<bool> functionalize_add_back_views) {
258258
const auto& dynamicLayerStack = dynamicLayerStackAccessor();
259-
const auto layerId = 1 + dynamicLayerStack.size();
259+
const int64_t layerId = static_cast<int64_t>(1 + dynamicLayerStack.size());
260260
DynamicLayer new_layer(transform_type, layerId, std::move(batch_size), randomness, prev_grad_mode, prev_fwd_grad_mode, functionalize_add_back_views);
261261
// NB: this function should be called while holding the GIL to avoid races
262262
new_layer.interpreter().set_is_alive(true);
@@ -459,7 +459,7 @@ static void dynamicLayerFrontFallback(
459459

460460
// Unwrap escaped GradWrappers
461461
auto num_args = op.schema().arguments().size();
462-
foreachTensorInplace(*stack, stack->size() - num_args, stack->size(), unwrapIfDead);
462+
foreachTensorInplace(*stack, static_cast<int64_t>(stack->size() - num_args), static_cast<int64_t>(stack->size()), unwrapIfDead);
463463

464464
auto& layer = dynamicLayerStack.back();
465465
layer.interpreter().process(op, stack);

0 commit comments

Comments (0)