Update base for Update on "Trace attention inference patterns with p=0, cleanup" · pytorch/pytorch@870bb37

Commit 870bb37

Update base for Update on "Trace attention inference patterns with p=0, cleanup"
When dropout is traced in inference, it creates a clone() instead of the training pattern of rand() etc. This was partially addressed manually in #108141; however, that did not cover all of the patterns that include dropout, and there is no reason we should have to specify them manually. This change updates the generated inference patterns to trace with dropout_p = 0.0.

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx peterbell10 ipiszy ngimel yf225 chenyang78 kadeng muchulee8 aakhundov

[ghstack-poisoned]
2 parents 227e6da + d4990ad commit 870bb37
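For context, a minimal sketch of the behavior the commit message describes (illustrative only, not the repo's actual pattern-generation code): tracing an attention-style function with dropout_p = 0.0 in eval mode yields a graph with no rand()/bernoulli nodes, so a single traced inference pattern can replace the manually written variants.

import torch
from torch.fx.experimental.proxy_tensor import make_fx

def attn(q, k, v, dropout_p=0.0):
    # Softmax attention followed by dropout. With p=0.0 and training=False,
    # dropout lowers to a no-op/clone rather than the rand()-based training
    # pattern, which is what the generated inference patterns now match.
    scores = torch.softmax(q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5, dim=-1)
    return torch.nn.functional.dropout(scores, p=dropout_p, training=False) @ v

q = k = v = torch.randn(2, 4, 8)
print(make_fx(attn)(q, k, v).graph)  # contains no rand()/bernoulli ops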

File tree

278 files changed: +13148 -11496 lines. Directories touched include:
  • variables
  • _export
  • _inductor
  • _numpy
  • _prims_common
  • _subclasses
  • ao/quantization/pt2e/representation
  • autograd
  • csrc
  • distributed
  • export
  • fx
  • jit
  • onnx/_internal/fx
  • testing/_internal

    .ci/docker/common/install_onnx.sh

    Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ pip_install \
   transformers==4.32.1

 pip_install coloredlogs packaging
-retry pip_install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --no-cache-dir --no-input ort-nightly==1.16.0.dev20230908001
+retry pip_install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --no-cache-dir --no-input ort-nightly==1.16.0.dev20230912006

 pip_install onnx==1.14.1
 pip_install onnxscript-preview==0.1.0.dev20230828 --no-deps

    .ci/pytorch/build.sh

    Lines changed: 8 additions & 0 deletions
@@ -159,6 +159,14 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* && -z "$TORCH_CUDA_ARCH_LIST" ]]; then
   exit 1
 fi

+# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
+# memory to build and will OOM
+if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ "$TORCH_CUDA_ARCH_LIST" == *"8.6"* || "$TORCH_CUDA_ARCH_LIST" == *"8.0"* ]]; then
+  echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
+  echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
+  export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
+fi
+
 if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
   export CC=clang
   export CXX=clang++
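As a rough Python rendering of that heuristic (an illustration; the CI uses the shell arithmetic above):

import os

def flash_attention_max_jobs() -> int:
    # Mirrors MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))": reserve two CPUs,
    # then take a third of the rest, since each FlashAttention translation
    # unit needs several gigabytes of memory to compile.
    usable = max((os.cpu_count() or 1) - 2, 1)
    return max(usable // 3, 1)

print(flash_attention_max_jobs())  # e.g. 10 on a 32-CPU builder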

    .circleci/scripts/binary_populate_env.sh

    Lines changed: 2 additions & 2 deletions
@@ -155,8 +155,8 @@ EOL

 # nproc doesn't exist on darwin
 if [[ "$(uname)" != Darwin ]]; then
-  # Because most Circle executors only have 20 CPUs, using more causes OOMs w/ Ninja and nvcc parallelization
-  MEMORY_LIMIT_MAX_JOBS=18
+  # This was lowered from 18 to 12 to avoid OOMs when compiling FlashAttentionV2
+  MEMORY_LIMIT_MAX_JOBS=12
   NUM_CPUS=$(( $(nproc) - 2 ))

 # Defaults here for **binary** linux builds so they can be changed in one place

    .github/scripts/filter_test_configs.py

    Lines changed: 7 additions & 6 deletions
@@ -410,16 +410,17 @@ def process_jobs(
         if target_job in (TEST_JOB_NAME, BUILD_AND_TEST_JOB_NAME):
             target_cfg = m.group("cfg")

-            return _filter_jobs(
+            # NB: There can be multiple unstable configurations, i.e. inductor, inductor_huggingface
+            test_matrix = _filter_jobs(
                 test_matrix=test_matrix,
                 issue_type=issue_type,
                 target_cfg=target_cfg,
            )
-
-        warnings.warn(
-            f"Found a matching {issue_type.value} issue {target_url} for {workflow} / {job_name}, "
-            + f"but the name {target_job_cfg} is invalid"
-        )
+        else:
+            warnings.warn(
+                f"Found a matching {issue_type.value} issue {target_url} for {workflow} / {job_name}, "
+                + f"but the name {target_job_cfg} is invalid"
+            )

     # Found no matching target, return the same input test matrix
     return test_matrix
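The shape of the fix, condensed (hypothetical helper and variable names, not the script's full logic): the old code returned from the first matching target, so a second unstable config was never applied; the new code accumulates into test_matrix and only warns when the target name is invalid.

import warnings

def _filter_jobs_stub(test_matrix, target_cfg):
    # Hypothetical stand-in for _filter_jobs: mark matching configs unstable.
    for job in test_matrix["include"]:
        if job.get("config") == target_cfg:
            job["unstable"] = "unstable"
    return test_matrix

def mark_unstable(test_matrix, targets):
    for target_cfg, valid in targets:
        if valid:
            test_matrix = _filter_jobs_stub(test_matrix, target_cfg)
        else:
            warnings.warn(f"unstable target {target_cfg!r} is invalid")
    return test_matrix

matrix = {"include": [{"config": "inductor"}, {"config": "inductor_huggingface"}]}
# Both targets are applied now, instead of returning after the first one.
print(mark_unstable(matrix, [("inductor", True), ("inductor_huggingface", True)]))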

    .github/scripts/test_filter_test_configs.py

    Lines changed: 56 additions & 1 deletion
@@ -102,6 +102,30 @@
         "manywheel-py3_8-cuda11_8-build",
         "",
     ],
+    "inductor / cuda12.1-py3.10-gcc9-sm86 / test (inductor)": [
+        "pytorchbot",
+        "107079",
+        "https://github.com/pytorch/pytorch/issues/107079",
+        "inductor",
+        "cuda12.1-py3.10-gcc9-sm86",
+        "test (inductor)",
+    ],
+    "inductor / cuda12.1-py3.10-gcc9-sm86 / test (inductor_huggingface)": [
+        "pytorchbot",
+        "109153",
+        "https://github.com/pytorch/pytorch/issues/109153",
+        "inductor",
+        "cuda12.1-py3.10-gcc9-sm86",
+        "test (inductor_huggingface)",
+    ],
+    "inductor / cuda12.1-py3.10-gcc9-sm86 / test (inductor_huggingface_dynamic)": [
+        "pytorchbot",
+        "109154",
+        "https://github.com/pytorch/pytorch/issues/109154",
+        "inductor",
+        "cuda12.1-py3.10-gcc9-sm86",
+        "test (inductor_huggingface_dynamic)",
+    ],
 }

 MOCKED_PR_INFO = {
@@ -569,6 +593,37 @@ def test_mark_unstable_jobs(self, mock_download_json: Any) -> None:
             "expected": '{"include": [{"config": "default", "unstable": "unstable"}]}',
             "description": "Both binary build and test jobs are unstable",
         },
+        {
+            "workflow": "inductor",
+            "job_name": "cuda12.1-py3.10-gcc9-sm86 / build",
+            "test_matrix": """
+                { include: [
+                    { config: "inductor" },
+                    { config: "inductor_huggingface", shard: 1 },
+                    { config: "inductor_huggingface", shard: 2 },
+                    { config: "inductor_timm", shard: 1 },
+                    { config: "inductor_timm", shard: 2 },
+                    { config: "inductor_torchbench" },
+                    { config: "inductor_huggingface_dynamic" },
+                    { config: "inductor_torchbench_dynamic" },
+                    { config: "inductor_distributed" },
+                ]}
+            """,
+            "expected": """
+                { "include": [
+                    { "config": "inductor", "unstable": "unstable" },
+                    { "config": "inductor_huggingface", "shard": 1, "unstable": "unstable" },
+                    { "config": "inductor_huggingface", "shard": 2, "unstable": "unstable" },
+                    { "config": "inductor_timm", "shard": 1 },
+                    { "config": "inductor_timm", "shard": 2 },
+                    { "config": "inductor_torchbench" },
+                    { "config": "inductor_huggingface_dynamic", "unstable": "unstable" },
+                    { "config": "inductor_torchbench_dynamic" },
+                    { "config": "inductor_distributed" }
+                ]}
+            """,
+            "description": "Marking multiple unstable configurations",
+        },
     ]

     for case in testcases:
@@ -577,7 +632,7 @@ def test_mark_unstable_jobs(self, mock_download_json: Any) -> None:
         test_matrix = yaml.safe_load(case["test_matrix"])

         filtered_test_matrix = mark_unstable_jobs(workflow, job_name, test_matrix)
-        self.assertEqual(case["expected"], json.dumps(filtered_test_matrix))
+        self.assertEqual(json.loads(case["expected"]), filtered_test_matrix)

 @mock.patch("subprocess.check_output")
 def test_perform_misc_tasks(self, mocked_subprocess: Any) -> None:
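The assertEqual change at the end is a robustness fix: comparing parsed objects rather than serialized strings makes the test insensitive to key order and whitespace in the expected JSON. A minimal illustration:

import json

expected = '{"config": "inductor", "unstable": "unstable"}'
actual = {"unstable": "unstable", "config": "inductor"}

assert json.loads(expected) == actual   # structural comparison passes
assert expected != json.dumps(actual)   # string comparison would not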

    .github/workflows/build-triton-wheel.yml

    Lines changed: 3 additions & 3 deletions
@@ -131,7 +131,7 @@ jobs:
     needs: build-wheel
     container:
       image: continuumio/miniconda3:4.12.0
-    environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
+    environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
     steps:
       - uses: actions/checkout@v3

@@ -244,7 +244,7 @@ jobs:
     needs: build-conda
     container:
       image: continuumio/miniconda3:4.12.0
-    environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
+    environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
     steps:
       - uses: actions/checkout@v3

@@ -283,7 +283,7 @@ jobs:
       run: |
         set -ex

-        if [[ "${UPLOAD_CHANNEL}" = "nightly" ]]; then
+        if [[ "${UPLOAD_CHANNEL:-nightly}" == "nightly" ]]; then
           export ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN}"
         else
           export ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN_TEST}"
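The ${UPLOAD_CHANNEL:-nightly} change makes the channel default to nightly when the variable is unset or empty. A Python analogue of that shell expansion (same variable and token names as the workflow above):

import os

# ${UPLOAD_CHANNEL:-nightly}: fall back when unset or empty.
channel = os.environ.get("UPLOAD_CHANNEL") or "nightly"
token_env = "CONDA_PYTORCHBOT_TOKEN" if channel == "nightly" else "CONDA_PYTORCHBOT_TOKEN_TEST"
print(channel, token_env)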

    .lintrunner.toml

    Lines changed: 0 additions & 14 deletions
@@ -195,37 +195,23 @@ include_patterns = [
 exclude_patterns = [
     '**/fb/**',
     'torch/_inductor/index_propagation.py',
-    'torch/_inductor/coordinate_descent_tuner.py',
     'torch/_inductor/debug.py',
-    'torch/_inductor/hooks.py',
     'torch/_inductor/bounds.py',
-    'torch/_inductor/config.py',
     'torch/_inductor/ir.py',
-    'torch/_inductor/codecache.py',
-    'torch/_inductor/test_operators.py',
-    'torch/_inductor/inductor_prims.py',
     'torch/_inductor/scheduler.py',
     'torch/_inductor/exc.py',
     'torch/_inductor/sizevars.py',
-    'torch/_inductor/triton_helpers.py',
     'torch/_inductor/freezing.py',
     'torch/_inductor/pattern_matcher.py',
     'torch/_inductor/fx_utils.py',
-    'torch/_inductor/virtualized.py',
-    'torch/_inductor/cuda_properties.py',
     'torch/_inductor/codegen/triton_foreach.py',
-    'torch/_inductor/codegen/__init__.py',
     'torch/_inductor/codegen/cpp.py',
     'torch/_inductor/codegen/triton.py',
     'torch/_inductor/fx_passes/split_cat.py',
-    'torch/_inductor/fx_passes/binary_folding.py',
-    'torch/_inductor/fx_passes/replace_random.py',
     'torch/_inductor/fx_passes/joint_graph.py',
     'torch/_inductor/fx_passes/pad_mm.py',
-    'torch/_inductor/fx_passes/__init__.py',
     'torch/_inductor/fx_passes/group_batch_fusion.py',
     'torch/_inductor/fx_passes/pre_grad.py',
-    'torch/_inductor/fx_passes/freezing_patterns.py',
 ]
 command = [
     'python3',

    CMakeLists.txt

    Lines changed: 1 addition & 1 deletion
@@ -730,7 +730,7 @@ include(cmake/Dependencies.cmake)
 cmake_dependent_option(
   USE_FLASH_ATTENTION
   "Whether to build the flash_attention kernel for scaled dot product attention" ON
-  "USE_CUDA AND NOT ROCM AND NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.6" OFF)
+  "USE_CUDA AND NOT ROCM AND NOT MSVC AND NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.6" OFF)

 # Flash Attention2 will error while building for sm52 while Mem Eff Attention won't
 cmake_dependent_option(
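Whether the FlashAttention kernels can actually run also depends on the compute architectures the binary was built for (per the build.sh comment above, they are only built for CUDA 8.0+). A quick runtime check, assuming a CUDA build of PyTorch:

import torch

# Architectures this binary was compiled for, e.g. ['sm_80', 'sm_86'];
# empty on CPU-only builds.
print(torch.cuda.get_arch_list())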

    aten/src/ATen/CMakeLists.txt

    Lines changed: 2 additions & 0 deletions
@@ -161,6 +161,7 @@ file(GLOB native_utils_cpp "native/utils/*.cpp")

 # flash_attention sources
 file(GLOB flash_attention_cuda_cu "native/transformers/cuda/flash_attn/*.cu")
+file(GLOB flash_attention_cuda_kernels_cu "native/transformers/cuda/flash_attn/kernels/*.cu")
 file(GLOB flash_attention_cuda_cpp "native/transformers/cuda/flash_attn/*.cpp")

 #Mem_eff attention sources
@@ -170,6 +171,7 @@ file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention

 if(USE_FLASH_ATTENTION)
   list(APPEND native_transformers_cuda_cu ${flash_attention_cuda_cu})
+  list(APPEND native_transformers_cuda_cu ${flash_attention_cuda_kernels_cu})
   list(APPEND native_transformers_cuda_cpp ${flash_attention_cuda_cpp})
 endif()

    aten/src/ATen/core/interned_strings.h

    Lines changed: 2 additions & 1 deletion
@@ -340,7 +340,8 @@ namespace c10 {
   _(attr, output_layouts) \
   _(attr, allowzero) \
   _(attr, seen_none) \
-  _(attr, overload_name)
+  _(attr, overload_name) \
+  _(attr, node_stack_idx)

 enum class _keys : unique_t {
 #define DEFINE_KEY(ns, s) ns##_##s,
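For context, entries in this macro become interned c10 attribute symbols (attr::node_stack_idx here) used to name attributes on JIT IR nodes. From Python, attribute names on a scripted graph's nodes can be inspected like this (a toy graph; the new internal attribute will not appear in it):

import torch

@torch.jit.script
def f(x: torch.Tensor) -> torch.Tensor:
    return x + 1

# Each IR node may carry named attributes, keyed by interned symbols.
for node in f.graph.nodes():
    print(node.kind(), node.attributeNames())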
