pytorch/pytorch
Commit 51286ec

Update base for Update on "Connect Tensor.__ipow__ to pow_ method"
The `pow_` method should be connected to `Tensor.__ipow__` so that the operator `**=` works correctly. Part of #58742 [ghstack-poisoned]
2 parents 3074761 + e175065 commit 51286ec
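
For context, the stack being rebased here connects Tensor.__ipow__ to the in-place pow_ method so that the **= operator updates the tensor in place. A minimal sketch of the resulting behavior (tensor values made up for illustration):

    import torch

    t = torch.tensor([1.0, 2.0, 3.0])
    t **= 2   # dispatches through Tensor.__ipow__ to pow_, updating t in place
    print(t)  # tensor([1., 4., 9.])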

155 files changed, +20547 −1394 lines


.github/scripts/gitutils.py

Lines changed: 1 addition & 3 deletions
@@ -248,9 +248,7 @@ def push(self, branch: str, dry_run: bool, retry: int = 3) -> None:
                 else:
                     self._run_git("push", self.remote, branch)
             except RuntimeError as e:
-                # Check if push were rejected because branch is stale
-                if len(e.args) == 0 or re.search(r"\[rejected\].+\(fetch first\)\n", e.args[0]) is None:
-                    raise
+                print(f"{cnt} push attempt failed with {e}")
                 self.fetch()
                 self._run_git("rebase", f"{self.remote}/{branch}")
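
The printed cnt comes from a retry loop that the hunk doesn't show. A hedged sketch of the surrounding control flow (standalone; run_git is a hypothetical stand-in for GitRepo._run_git, and the early return on success is an assumption):

    import subprocess

    def run_git(*args: str) -> None:
        # hypothetical stand-in for GitRepo._run_git
        subprocess.check_output(("git",) + args)

    def push_with_retry(remote: str, branch: str, dry_run: bool = False, retry: int = 3) -> None:
        for cnt in range(retry):
            try:
                if dry_run:
                    run_git("push", "--dry-run", remote, branch)
                else:
                    run_git("push", remote, branch)
                return  # assumed: stop retrying once a push succeeds
            except subprocess.CalledProcessError as e:
                # after this change every failed push is retried after a
                # fetch + rebase, not only pushes rejected as stale
                print(f"{cnt} push attempt failed with {e}")
                run_git("fetch", remote)
                run_git("rebase", f"{remote}/{branch}")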

.github/scripts/trymerge.py

Lines changed: 17 additions & 10 deletions
@@ -384,6 +384,14 @@ def parse_args() -> Any:
     parser.add_argument("pr_num", type=int)
     return parser.parse_args()

+def can_skip_internal_checks(pr: "GitHubPR", comment_id: Optional[int] = None) -> bool:
+    if comment_id is None:
+        return False
+    comment = pr.get_comment_by_id(comment_id)
+    if comment.editor_login is not None:
+        return False
+    return comment.author_login == "facebook-github-bot"
+

 @dataclass
 class GitHubComment:
@@ -640,7 +648,7 @@ def has_internal_changes(self) -> bool:
             return False
         return checks[checkrun_name] != "SUCCESS"

-    def merge_ghstack_into(self, repo: GitRepo, force: bool) -> None:
+    def merge_ghstack_into(self, repo: GitRepo, force: bool, comment_id: Optional[int] = None) -> None:
         assert self.is_ghstack_pr()
         approved_by = self.get_approved_by()
         # For ghstack, cherry-pick commits based from origin
@@ -661,7 +669,7 @@ def merge_ghstack_into(self, repo: GitRepo, force: bool) -> None:
                 continue
             approved_by = pr.get_approved_by()
             # Raises exception if matching rule is not found
-            find_matching_merge_rule(pr, repo, force=force)
+            find_matching_merge_rule(pr, repo, force=force, skip_internal_checks=can_skip_internal_checks(self, comment_id))

             # Adding the url here makes it clickable within the Github UI
             approved_by_urls = ', '.join(prefix_with_github_url(login) for login in approved_by)
@@ -670,11 +678,9 @@ def merge_ghstack_into(self, repo: GitRepo, force: bool) -> None:
         msg += f"\nApproved by: {approved_by_urls}\n"
         repo.amend_commit_message(msg)

-    def merge_into(self, repo: GitRepo, *, force: bool = False, dry_run: bool = False) -> None:
+    def merge_into(self, repo: GitRepo, *, force: bool = False, dry_run: bool = False, comment_id: Optional[int] = None) -> None:
         # Raises exception if matching rule is not found
-        find_matching_merge_rule(self, repo, force=force)
-        if self.has_internal_changes():
-            raise RuntimeError("This PR must be landed via phabricator")
+        find_matching_merge_rule(self, repo, force=force, skip_internal_checks=can_skip_internal_checks(self, comment_id))
         if repo.current_branch() != self.default_branch():
             repo.checkout(self.default_branch())
         if not self.is_ghstack_pr():
@@ -688,7 +694,7 @@ def merge_into(self, repo: GitRepo, *, force: bool = False, dry_run: bool = Fals
             repo._run_git("merge", "--squash", pr_branch_name)
             repo._run_git("commit", f"--author=\"{self.get_author()}\"", "-m", msg)
         else:
-            self.merge_ghstack_into(repo, force)
+            self.merge_ghstack_into(repo, force, comment_id=comment_id)

         repo.push(self.default_branch(), dry_run)
         if not dry_run:
@@ -823,9 +829,10 @@ def post_comment(msg: str) -> None:
     expected_association = "CONTRIBUTOR" if pr.is_base_repo_private() else "MEMBER"
     if author_association != expected_association and author_association != "OWNER":
         return post_comment(f"Will not revert as @{author_login} is not a {expected_association}, but {author_association}")
+    skip_internal_checks = can_skip_internal_checks(pr, comment_id)

     # Raises exception if matching rule is not found, but ignores all status checks
-    find_matching_merge_rule(pr, repo, force=True)
+    find_matching_merge_rule(pr, repo, force=True, skip_internal_checks=skip_internal_checks)
     commit_sha = pr.get_merge_commit()
     if commit_sha is None:
         commits = repo.commits_resolving_gh_pr(pr.pr_num)
@@ -834,7 +841,7 @@ def post_comment(msg: str) -> None:
         commit_sha = commits[0]
     msg = repo.commit_message(commit_sha)
     rc = RE_DIFF_REV.search(msg)
-    if rc is not None:
+    if rc is not None and not can_skip_internal_checks:
         raise RuntimeError(f"Can't revert PR that was landed via phabricator as {rc.group(1)}")
     repo.checkout(pr.default_branch())
     repo.revert(commit_sha)
@@ -913,7 +920,7 @@ def handle_exception(e: Exception, msg: str = "Merge failed") -> None:
         handle_exception(e)
     else:
         try:
-            pr.merge_into(repo, dry_run=args.dry_run, force=args.force)
+            pr.merge_into(repo, dry_run=args.dry_run, force=args.force, comment_id=args.comment_id)
         except Exception as e:
             handle_exception(e)
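
The comment_id threaded through merge_into, merge_ghstack_into, and the revert path all funnels into the new helper: internal checks may be skipped only when the triggering comment was authored by facebook-github-bot and never edited. A small self-contained model of that rule (FakeComment is a hypothetical stand-in for GitHubComment):

    from typing import Optional

    class FakeComment:
        # hypothetical stand-in for GitHubComment
        def __init__(self, author_login: str, editor_login: Optional[str]) -> None:
            self.author_login = author_login
            self.editor_login = editor_login

    def can_skip(comment: Optional[FakeComment]) -> bool:
        if comment is None:                   # no comment id supplied
            return False
        if comment.editor_login is not None:  # edited comments are not trusted
            return False
        return comment.author_login == "facebook-github-bot"

    assert can_skip(None) is False
    assert can_skip(FakeComment("facebook-github-bot", None)) is True
    assert can_skip(FakeComment("facebook-github-bot", "someone-else")) is False
    assert can_skip(FakeComment("random-user", None)) is False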

.github/workflows/revert.yml

Lines changed: 2 additions & 0 deletions
@@ -37,3 +37,5 @@ jobs:
           else
             python3 .github/scripts/trymerge.py --revert "${PR_NUM}"
           fi
+
+    concurrency: try-revert

.github/workflows/trymerge.yml

Lines changed: 13 additions & 1 deletion
@@ -31,11 +31,23 @@ jobs:
           GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
           FORCE: ${{ github.event.client_payload.force}}
           ON_GREEN: ${{ github.event.client_payload.on_green}}
+          COMMENT_ID: ${{ github.event.client_payload.comment_id }}
         run: |
+          set -ex
           if [ -n "${FORCE}" ]; then
-            python3 .github/scripts/trymerge.py --force "${PR_NUM}"
+            if [ -n "${COMMENT_ID}" ]; then
+              python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}"
+            else
+              python3 .github/scripts/trymerge.py --force "${PR_NUM}"
+            fi
           elif [ -n "${ON_GREEN}" ]; then
             python3 .github/scripts/trymerge.py --on-green "${PR_NUM}"
+          elif [ -n "${COMMENT_ID}" ]; then
+            python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}"
           else
             python3 .github/scripts/trymerge.py "${PR_NUM}"
           fi
+
+    # TODO: Separate merge on green merges from regular merges to not hold up try-merge workflows overall concurrency
+    # NOTE: force pushes are also put in their concurrency group to put them higher than regular merges
+    concurrency: try-merge-${{ github.event.client_payload.force}}-${{ github.event.client_payload.on_green }}
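
These branches assume trymerge.py accepts a --comment-id flag; the diff shows the flag being consumed (args.comment_id) but not defined. A hedged argparse sketch consistent with the invocations above (the flag set is inferred, not the repo's actual definitions apart from pr_num):

    import argparse

    def parse_args() -> argparse.Namespace:
        parser = argparse.ArgumentParser("Try to merge or revert a PR")
        parser.add_argument("--force", action="store_true")
        parser.add_argument("--on-green", action="store_true")
        parser.add_argument("--revert", action="store_true")
        parser.add_argument("--comment-id", type=int, default=None)  # inferred from usage
        parser.add_argument("pr_num", type=int)  # confirmed by the trymerge.py hunk above
        return parser.parse_args()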

CMakeLists.txt

Lines changed: 26 additions & 17 deletions
@@ -97,25 +97,34 @@ if(APPLE)
   # Determine if we can link against MPSGraph
   set(MPS_FOUND OFF)
   execute_process(
-    COMMAND bash -c "xcrun --sdk macosx --show-sdk-path"
-    OUTPUT_VARIABLE _macosx_sdk_path
+    COMMAND bash -c "xcodebuild -sdk macosx -version SDKVersion"
+    RESULT_VARIABLE _exit_code
+    OUTPUT_VARIABLE _macosx_sdk_version
     OUTPUT_STRIP_TRAILING_WHITESPACE)
-  set(_MPS_supported_os_version OFF)
-  if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Darwin"
-     AND DARWIN_MAJOR_VERSION VERSION_GREATER_EQUAL 21
-     AND DARWIN_MINOR_VERSION VERSION_GREATER_EQUAL 3)
-    set(_MPS_supported_os_version ON)
-  endif()
-  set(_SDK_SEARCH_PATH "${_macosx_sdk_path}/System/Library/Frameworks/")
-  set(_FRAMEWORK_SEARCH_PATH "/System/Library/Frameworks/")
-
-  find_library(_MPS_fwrk_path_ NAMES MetalPerformanceShadersGraph MetalPerformanceShaders PATHS ${_FRAMEWORK_SEARCH_PATH} NO_DEFAULT_PATH)
-  find_library(_MPS_sdk_path_ NAMES MetalPerformanceShadersGraph MetalPerformanceShaders PATHS ${_SDK_SEARCH_PATH} NO_DEFAULT_PATH)
-
-  if(_MPS_supported_os_version AND _MPS_fwrk_path_ AND _MPS_sdk_path_)
-    set(MPS_FOUND ON)
-    message(STATUS "MPSGraph framework found")
+  if(_exit_code EQUAL 0)
+    set(_MPS_supported_os_version OFF)
+    if(_macosx_sdk_version VERSION_GREATER_EQUAL 12.3)
+      set(_MPS_supported_os_version ON)
+    endif()
+    message(STATUS "sdk version: ${_macosx_sdk_version}, mps supported: ${_MPS_supported_os_version}")
+    execute_process(
+      COMMAND bash -c "xcrun --sdk macosx --show-sdk-path"
+      OUTPUT_VARIABLE _macosx_sdk_path
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+    set(_SDK_SEARCH_PATH "${_macosx_sdk_path}/System/Library/Frameworks/")
+    set(_FRAMEWORK_SEARCH_PATH "/System/Library/Frameworks/")
+
+    find_library(_MPS_fwrk_path_ NAMES MetalPerformanceShadersGraph MetalPerformanceShaders PATHS ${_FRAMEWORK_SEARCH_PATH} NO_DEFAULT_PATH)
+    find_library(_MPS_sdk_path_ NAMES MetalPerformanceShadersGraph MetalPerformanceShaders PATHS ${_SDK_SEARCH_PATH} NO_DEFAULT_PATH)
+
+    if(_MPS_supported_os_version AND _MPS_fwrk_path_ AND _MPS_sdk_path_)
+      set(MPS_FOUND ON)
+      message(STATUS "MPSGraph framework found")
+    else()
+      message(STATUS "MPSGraph framework not found")
+    endif()
   else()
+    message(STATUS "MPS: unable to get MacOS sdk version")
     message(STATUS "MPSGraph framework not found")
   endif()
 endif()

aten/src/ATen/FunctionalTensorWrapper.cpp

Lines changed: 4 additions & 4 deletions
@@ -17,10 +17,10 @@ void FunctionalTensorWrapper::set_constructor_metadata() {
   // For now I'm retroactively setting this in functorch,
   // but once Open Multiple Dispatch lands we should be able to calculate this in core.
   level_ = -1;
-  // shallow_copy_from overwrites the storage and dispatch keyset...
-  auto functional_storage = storage_;
-  shallow_copy_from(value_.getIntrusivePtr());
-  storage_ = functional_storage;
+  // mirror all of the generic tensor metadata onto the wrapper
+  copy_generic_tensor_metadata(value_.getIntrusivePtr().get(), this);
+  refresh_numel();
+  refresh_contiguous();
   storage_access_should_throw_ = false;
   key_set_ = c10::DispatchKeySet(c10::DispatchKey::Functionalize) | value_.key_set();
   // All of the keys corresponding to functorch transforms should not be copied over.

aten/src/ATen/core/ivalue.h

Lines changed: 7 additions & 5 deletions
@@ -750,15 +750,17 @@ struct TORCH_API IValue final {
   // Scalar, which gets encoded as either an Int, a Double or a ComplexDouble
   IValue(const at::Scalar& s) : IValue() {
     if (s.isFloatingPoint()) {
-      *this = s.toDouble();
+      tag = Tag::Double;
+      payload.u.as_double = s.toDouble();
     } else if (s.isComplex()) {
       *this = s.toComplexDouble();
     } else if (s.isBoolean()) {
-      *this = s.toBool();
-    } else if (s.isIntegral(false)) {
-      *this = s.toLong();
+      tag = Tag::Bool;
+      payload.u.as_bool = s.toBool();
     } else {
-      TORCH_CHECK(false, "Unknown type in Scalar");
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(s.isIntegral(false), "Unknown type in Scalar");
+      tag = Tag::Int;
+      payload.u.as_int = s.toLong();
     }
   }

aten/src/ATen/core/ivalue_inl.h

Lines changed: 1 addition & 1 deletion
@@ -1179,7 +1179,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
         continue;
       }
       c10::Device device = storage->device();
-      if (!device.is_cpu()) {
+      if (!device.is_cpu() && !device.is_meta()) {
         TORCH_CHECK_VALUE(
             device.type() == impl.type(),
             "Expected all data ptrs to be on a device of type ",

aten/src/ATen/core/jit_type.h

Lines changed: 4 additions & 4 deletions
@@ -787,7 +787,7 @@ struct TORCH_API TensorType : public SharedType {
   static const TypeKind Kind = TypeKind::TensorType;

   static std::vector<int64_t> contiguousStridesOf(
-      at::IntArrayRef sizes,
+      at::IntArrayRef in_sizes,
       at::MemoryFormat memory_format = MemoryFormat::Contiguous) {
     auto contiguous_fn = [](const at::IntArrayRef& sizes,
                             const std::vector<int64_t>& dim_order) {
@@ -804,18 +804,18 @@ struct TORCH_API TensorType : public SharedType {
       return strides;
     };

-    std::vector<int64_t> dim_order(sizes.size());
+    std::vector<int64_t> dim_order(in_sizes.size());
     if (memory_format == MemoryFormat::ChannelsLast) {
       dim_order = {1, 3, 2, 0};
     } else if (memory_format == MemoryFormat::ChannelsLast3d) {
       dim_order = {1, 4, 3, 2, 0};
     } else {
-      auto ndims = sizes.size();
+      auto ndims = in_sizes.size();
       for (size_t i = 0; i < ndims; i++) {
         dim_order[i] = ndims - i - 1; // Reverse
       }
     }
-    return contiguous_fn(sizes, dim_order);
+    return contiguous_fn(in_sizes, dim_order);
   }

 private:
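
The renamed in_sizes only feeds the dim_order bookkeeping; the stride rule itself is simple. A hedged Python model of the computation (it ignores any zero- or one-sized-dimension special cases the C++ lambda may handle, which this hunk doesn't show):

    def contiguous_strides_of(in_sizes, dim_order):
        # dim_order[0] is the fastest-moving dimension (stride 1)
        strides = [0] * len(in_sizes)
        stride = 1
        for d in dim_order:
            strides[d] = stride
            stride *= in_sizes[d]
        return strides

    # plain contiguous: reversed dimension order, as in the else branch above
    assert contiguous_strides_of([2, 3, 4, 5], [3, 2, 1, 0]) == [60, 20, 5, 1]
    # ChannelsLast: dim_order = {1, 3, 2, 0} gives NHWC-style strides for NCHW sizes
    assert contiguous_strides_of([2, 3, 4, 5], [1, 3, 2, 0]) == [60, 1, 15, 3]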

aten/src/ATen/core/library.cpp

Lines changed: 2 additions & 2 deletions
@@ -7,15 +7,15 @@ namespace torch {
 namespace {
   // TODO: Consider representing debug info as a struct instead so you
   // don't have to allocate strings all the time
-  std::string debugString(const std::string& file, uint32_t line) {
+  std::string debugString(const char* file, uint32_t line) {
 #ifdef STRIP_ERROR_MESSAGES
     return std::string();
 #else
     return c10::str("registered at ", file, ":", line);
 #endif
   }

-  std::string debugString(std::string debug, const std::string& file, uint32_t line) {
+  std::string debugString(std::string debug, const char* file, uint32_t line) {
 #ifdef STRIP_ERROR_MESSAGES
     return std::string();
 #else

aten/src/ATen/cpu/vec/functional_base.h

Lines changed: 63 additions & 7 deletions
@@ -8,7 +8,7 @@

 namespace at { namespace vec {

-// TODO: Make this more efficient
+// slow path
 template <typename scalar_t, typename Op>
 inline scalar_t vec_reduce_all(
     const Op& vec_fun,
@@ -27,6 +27,62 @@ inline scalar_t vec_reduce_all(
   return acc_arr[0];
 }

+template <typename scalar_t, typename Op>
+struct VecReduceAllSIMD {
+  static inline scalar_t apply(const Op& vec_fun, Vectorized<scalar_t> acc_vec) {
+    return vec_reduce_all(vec_fun, acc_vec, Vectorized<scalar_t>::size());
+  }
+};
+
+#if defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE)
+#if defined(CPU_CAPABILITY_AVX2)
+template <typename Op>
+struct VecReduceAllSIMD<float, Op> {
+  static inline float apply(const Op& vec_fun, Vectorized<float> acc_vec) {
+    using Vec = Vectorized<float>;
+    Vec v = acc_vec;
+    // 128-bit shuffle
+    Vec v1 = _mm256_permute2f128_ps(v, v, 0x1);
+    v = vec_fun(v, v1);
+    // 64-bit shuffle
+    v1 = _mm256_shuffle_ps(v, v, 0x4E);
+    v = vec_fun(v, v1);
+    // 32-bit shuffle
+    v1 = _mm256_shuffle_ps(v, v, 0xB1);
+    v = vec_fun(v, v1);
+    return _mm256_cvtss_f32(v);
+  }
+};
+#endif // defined(CPU_CAPABILITY_AVX2)
+#if defined(CPU_CAPABILITY_AVX512)
+template <typename Op>
+struct VecReduceAllSIMD<float, Op> {
+  static inline float apply(const Op& vec_fun, Vectorized<float> acc_vec) {
+    using Vec = Vectorized<float>;
+    Vec v = acc_vec;
+    // 256-bit shuffle
+    Vec v1 = _mm512_shuffle_f32x4(v, v, 0x4E);
+    v = vec_fun(v, v1);
+    // 128-bit shuffle
+    v1 = _mm512_shuffle_f32x4(v, v, 0xB1);
+    v = vec_fun(v, v1);
+    // 64-bit shuffle
+    v1 = _mm512_shuffle_ps(v, v, 0x4E);
+    v = vec_fun(v, v1);
+    // 32-bit shuffle
+    v1 = _mm512_shuffle_ps(v, v, 0xB1);
+    v = vec_fun(v, v1);
+    return _mm512_cvtss_f32(v);
+  }
+};
+#endif // defined(CPU_CAPABILITY_AVX512)
+#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE)
+
+template <typename scalar_t, typename Op>
+inline scalar_t vec_reduce_all(const Op& vec_fun, Vectorized<scalar_t> acc_vec) {
+  return VecReduceAllSIMD<scalar_t, Op>::apply(vec_fun, acc_vec);
+}
+
 template <typename scalar_t, typename Op>
 inline scalar_t reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) {
   using Vec = vec::Vectorized<scalar_t>;
@@ -42,7 +98,7 @@ inline scalar_t reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size
     Vec data_vec = Vec::loadu(data + d, size - d);
     acc_vec = Vec::set(acc_vec, vec_fun(acc_vec, data_vec), size - d);
   }
-  return vec_reduce_all(vec_fun, acc_vec, Vec::size());
+  return vec_reduce_all(vec_fun, acc_vec);
 }

 // similar to reduce_all, but reduces into two outputs
@@ -70,8 +126,8 @@ inline std::pair<scalar_t, scalar_t> reduce2_all(const Op1& vec_fun1, const Op2&
     acc_vec2 = Vec::set(acc_vec2, vec_fun2(acc_vec2, data_vec), size - d);
   }
   return std::pair<scalar_t, scalar_t>(
-      vec_reduce_all(vec_fun1, acc_vec1, Vec::size()),
-      vec_reduce_all(vec_fun2, acc_vec2, Vec::size()));
+      vec_reduce_all(vec_fun1, acc_vec1),
+      vec_reduce_all(vec_fun2, acc_vec2));
 }

 template <typename scalar_t, typename MapOp, typename ReduceOp>
@@ -95,7 +151,7 @@ inline scalar_t map_reduce_all(
     data_vec = map_fun(data_vec);
     acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
   }
-  return vec_reduce_all(red_fun, acc_vec, Vec::size());
+  return vec_reduce_all(red_fun, acc_vec);
 }

 template <typename scalar_t, typename MapOp, typename ReduceOp>
@@ -126,7 +182,7 @@ inline scalar_t map2_reduce_all(
     data_vec = map_fun(data_vec, data2_vec);
     acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
   }
-  return vec_reduce_all(red_fun, acc_vec, Vec::size());
+  return vec_reduce_all(red_fun, acc_vec);
 }

 template <typename scalar_t, typename MapOp, typename ReduceOp>
@@ -162,7 +218,7 @@ inline scalar_t map3_reduce_all(
     data_vec = map_fun(data_vec, data2_vec, data3_vec);
     acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
   }
-  return vec_reduce_all(red_fun, acc_vec, Vec::size());
+  return vec_reduce_all(red_fun, acc_vec);
 }

 template <typename scalar_t, typename Op>
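
Each shuffle pairs every lane with a partner, so the horizontal reduction finishes in log2(lanes) combine steps instead of a scalar loop. A pure-Python model of the AVX2 float path (8 lanes; the lane pairings mirror the 0x1 / 0x4E / 0xB1 shuffles above):

    def simd_reduce_all(op, v):
        assert len(v) == 8
        v = [op(v[i], v[(i + 4) % 8]) for i in range(8)]  # 128-bit shuffle: swap halves
        v = [op(v[i], v[i ^ 2]) for i in range(8)]        # 64-bit shuffle: swap pairs
        v = [op(v[i], v[i ^ 1]) for i in range(8)]        # 32-bit shuffle: swap neighbours
        return v[0]  # every lane now holds the full reduction

    x = [3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0, 6.0]
    assert simd_reduce_all(max, x) == max(x)
    assert simd_reduce_all(lambda a, b: a + b, x) == sum(x)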
