Update on "torch.sgn for complex tensors" · pytorch/pytorch@e154c5d · GitHub
[go: up one dir, main page]

Skip to content

Commit e154c5d

Browse files
committed
Update on "torch.sgn for complex tensors"
resolves #36323 by adding `torch.sgn` for complex tensors. `torch.sgn` returns `x/abs(x)` for `x != 0` and returns `0 + 0j` for `x == 0`. This PR doesn't test the correctness of the gradients. It will be done as a part of auditing all the ops in the future once we decide the autograd behavior (JAX vs TF) and add gradcheck. Differential Revision: [D23460526](https://our.internmc.facebook.com/intern/diff/D23460526) [ghstack-poisoned]
2 parents bd99fc9 + 5973b44 commit e154c5d

File tree

208 files changed

+8482
-5433
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

208 files changed

+8482
-5433
lines changed

.circleci/cimodel/data/pytorch_build_definitions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ def gen_dependent_configs(xenial_parent_config):
191191
restrict_phases=["test"],
192192
gpu_resource=gpu,
193193
parent_build=xenial_parent_config,
194-
is_important=xenial_parent_config.is_important,
194+
is_important=False,
195195
)
196196

197197
configs.append(c)
@@ -353,7 +353,7 @@ def instantiate_configs():
353353
):
354354
c.dependent_tests = gen_docs_configs(c)
355355

356-
if cuda_version == "10.1" and python_version == "3.6" and not is_libtorch:
356+
if cuda_version == "10.2" and python_version == "3.6" and not is_libtorch:
357357
c.dependent_tests = gen_dependent_configs(c)
358358

359359
if (

.circleci/config.yml

Lines changed: 29 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1011,35 +1011,6 @@ jobs:
10111011
<<: *binary_checkout
10121012
- run:
10131013
<<: *binary_populate_env
1014-
- run:
1015-
name: Install unbuffer and ts
1016-
command: |
1017-
set -eux -o pipefail
1018-
source /env
1019-
OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
1020-
if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
1021-
retry yum -q -y install epel-release
1022-
retry yum -q -y install expect moreutils
1023-
elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
1024-
retry apt-get update
1025-
retry apt-get -y install expect moreutils
1026-
retry conda install -y -c eumetsat expect
1027-
retry conda install -y cmake
1028-
fi
1029-
- run:
1030-
name: Update compiler to devtoolset7
1031-
command: |
1032-
set -eux -o pipefail
1033-
source /env
1034-
if [[ "$DESIRED_DEVTOOLSET" == 'devtoolset7' ]]; then
1035-
source "/builder/update_compiler.sh"
1036-
1037-
# Env variables are not persisted into the next step
1038-
echo "export PATH=$PATH" >> /env
1039-
echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> /env
1040-
else
1041-
echo "Not updating compiler"
1042-
fi
10431014
- run:
10441015
name: Build
10451016
no_output_timeout: "1h"
@@ -1059,7 +1030,6 @@ jobs:
10591030
python3 -mpip install requests && \
10601031
SCRIBE_GRAPHQL_ACCESS_TOKEN=${SCRIBE_GRAPHQL_ACCESS_TOKEN} \
10611032
python3 /pytorch/.circleci/scripts/upload_binary_size_to_scuba.py || exit 0
1062-
10631033
- persist_to_workspace:
10641034
root: /
10651035
paths: final_pkgs
@@ -6326,71 +6296,71 @@ workflows:
63266296
- /release\/.*/
63276297
build_environment: "pytorch-linux-xenial-cuda10.1-cudnn7-py3-gcc7-build"
63286298
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.1-cudnn7-py3-gcc7"
6299+
- pytorch_linux_build:
6300+
name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build
6301+
requires:
6302+
- "docker-pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
6303+
build_environment: "pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7-build"
6304+
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
6305+
- pytorch_linux_test:
6306+
name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test
6307+
requires:
6308+
- pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build
6309+
build_environment: "pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7-test"
6310+
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
6311+
use_cuda_docker_runtime: "1"
6312+
resource_class: gpu.medium
63296313
- pytorch_linux_test:
6330-
name: pytorch_linux_xenial_cuda10_1_cudnn7_py3_multigpu_test
6314+
name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_multigpu_test
63316315
requires:
6332-
- pytorch_linux_xenial_cuda10_1_cudnn7_py3_gcc7_build
6316+
- pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build
63336317
filters:
63346318
branches:
63356319
only:
63366320
- master
63376321
- /ci-all\/.*/
63386322
- /release\/.*/
6339-
build_environment: "pytorch-linux-xenial-cuda10.1-cudnn7-py3-multigpu-test"
6340-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.1-cudnn7-py3-gcc7"
6323+
build_environment: "pytorch-linux-xenial-cuda10.2-cudnn7-py3-multigpu-test"
6324+
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
63416325
use_cuda_docker_runtime: "1"
63426326
resource_class: gpu.large
63436327
- pytorch_linux_test:
6344-
name: pytorch_linux_xenial_cuda10_1_cudnn7_py3_nogpu_NO_AVX2_test
6328+
name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_nogpu_NO_AVX2_test
63456329
requires:
6346-
- pytorch_linux_xenial_cuda10_1_cudnn7_py3_gcc7_build
6330+
- pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build
63476331
filters:
63486332
branches:
63496333
only:
63506334
- master
63516335
- /ci-all\/.*/
63526336
- /release\/.*/
6353-
build_environment: "pytorch-linux-xenial-cuda10.1-cudnn7-py3-nogpu-NO_AVX2-test"
6354-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.1-cudnn7-py3-gcc7"
6337+
build_environment: "pytorch-linux-xenial-cuda10.2-cudnn7-py3-nogpu-NO_AVX2-test"
6338+
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
63556339
resource_class: large
63566340
- pytorch_linux_test:
6357-
name: pytorch_linux_xenial_cuda10_1_cudnn7_py3_nogpu_NO_AVX_test
6341+
name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_nogpu_NO_AVX_test
63586342
requires:
6359-
- pytorch_linux_xenial_cuda10_1_cudnn7_py3_gcc7_build
6343+
- pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build
63606344
filters:
63616345
branches:
63626346
only:
63636347
- master
63646348
- /ci-all\/.*/
63656349
- /release\/.*/
6366-
build_environment: "pytorch-linux-xenial-cuda10.1-cudnn7-py3-nogpu-NO_AVX-test"
6367-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.1-cudnn7-py3-gcc7"
6350+
build_environment: "pytorch-linux-xenial-cuda10.2-cudnn7-py3-nogpu-NO_AVX-test"
6351+
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
63686352
resource_class: large
63696353
- pytorch_linux_test:
6370-
name: pytorch_linux_xenial_cuda10_1_cudnn7_py3_slow_test
6354+
name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_slow_test
63716355
requires:
6372-
- pytorch_linux_xenial_cuda10_1_cudnn7_py3_gcc7_build
6356+
- pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build
63736357
filters:
63746358
branches:
63756359
only:
63766360
- master
63776361
- /ci-all\/.*/
63786362
- /release\/.*/
6379-
build_environment: "pytorch-linux-xenial-cuda10.1-cudnn7-py3-slow-test"
6380-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.1-cudnn7-py3-gcc7"
6381-
use_cuda_docker_runtime: "1"
6382-
resource_class: gpu.medium
6383-
- pytorch_linux_build:
6384-
name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build
6385-
requires:
6386-
- "docker-pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
6387-
build_environment: "pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7-build"
6388-
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
6389-
- pytorch_linux_test:
6390-
name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test
6391-
requires:
6392-
- pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build
6393-
build_environment: "pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7-test"
6363+
build_environment: "pytorch-linux-xenial-cuda10.2-cudnn7-py3-slow-test"
63946364
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
63956365
use_cuda_docker_runtime: "1"
63966366
resource_class: gpu.medium

.circleci/scripts/binary_linux_build.sh

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ set -eux -o pipefail
55
source /env
66

77
# Defaults here so they can be changed in one place
8-
export MAX_JOBS=12
8+
export MAX_JOBS=${MAX_JOBS:-$(nproc --ignore=1)}
99

1010
# Parse the parameters
1111
if [[ "$PACKAGE_TYPE" == 'conda' ]]; then
@@ -16,15 +16,5 @@ else
1616
build_script='manywheel/build.sh'
1717
fi
1818

19-
# We want to call unbuffer, which calls tclsh which finds the expect
20-
# package. The expect was installed by yum into /usr/bin so we want to
21-
# find /usr/bin/tclsh, but this is shadowed by /opt/conda/bin/tclsh in
22-
# the conda docker images, so we prepend it to the path here.
23-
if [[ "$PACKAGE_TYPE" == 'conda' ]]; then
24-
mkdir /just_tclsh_bin
25-
ln -s /usr/bin/tclsh /just_tclsh_bin/tclsh
26-
export PATH=/just_tclsh_bin:$PATH
27-
fi
28-
2919
# Build the package
30-
SKIP_ALL_TESTS=1 unbuffer "/builder/$build_script" | ts
20+
SKIP_ALL_TESTS=1 stdbuf -i0 -o0 -e0 "/builder/$build_script"

.circleci/verbatim-sources/job-specs/binary-job-specs.yml

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -7,35 +7,6 @@
77
<<: *binary_checkout
88
- run:
99
<<: *binary_populate_env
10-
- run:
11-
name: Install unbuffer and ts
12-
command: |
13-
set -eux -o pipefail
14-
source /env
15-
OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
16-
if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
17-
retry yum -q -y install epel-release
18-
retry yum -q -y install expect moreutils
19-
elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
20-
retry apt-get update
21-
retry apt-get -y install expect moreutils
22-
retry conda install -y -c eumetsat expect
23-
retry conda install -y cmake
24-
fi
25-
- run:
26-
name: Update compiler to devtoolset7
27-
command: |
28-
set -eux -o pipefail
29-
source /env
30-
if [[ "$DESIRED_DEVTOOLSET" == 'devtoolset7' ]]; then
31-
source "/builder/update_compiler.sh"
32-
33-
# Env variables are not persisted into the next step
34-
echo "export PATH=$PATH" >> /env
35-
echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> /env
36-
else
37-
echo "Not updating compiler"
38-
fi
3910
- run:
4011
name: Build
4112
no_output_timeout: "1h"
@@ -55,7 +26,6 @@
5526
python3 -mpip install requests && \
5627
SCRIBE_GRAPHQL_ACCESS_TOKEN=${SCRIBE_GRAPHQL_ACCESS_TOKEN} \
5728
python3 /pytorch/.circleci/scripts/upload_binary_size_to_scuba.py || exit 0
58-
5929
- persist_to_workspace:
6030
root: /
6131
paths: final_pkgs

.github/workflows/lint.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,8 @@ jobs:
144144
# Run Clang-Tidy
145145
# The negative filters below are to exclude files that include onnx_pb.h or
146146
# caffe2_pb.h, otherwise we'd have to build protos as part of this CI job.
147+
# FunctionsManual.cpp is excluded to keep this diff clean. It will be fixed
148+
# in a follow up PR.
147149
python tools/clang_tidy.py \
148150
--verbose \
149151
--paths torch/csrc/ \
@@ -157,6 +159,7 @@ jobs:
157159
-g"-torch/csrc/onnx/init.cpp" \
158160
-g"-torch/csrc/cuda/nccl.*" \
159161
-g"-torch/csrc/cuda/python_nccl.cpp" \
162+
-g"-torch/csrc/autograd/FunctionsManual.cpp" \
160163
"$@" > ${GITHUB_WORKSPACE}/clang-tidy-output.txt
161164
162165
cat ${GITHUB_WORKSPACE}/clang-tidy-output.txt

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ torch/nn/functional.pyi
5757
torch/csrc/autograd/generated/*
5858
# Listed manually because some files in this directory are not generated
5959
torch/testing/_internal/generated/annotated_fn_args.py
60+
torch/testing/_internal/data/*.pt
6061
torch/csrc/cudnn/cuDNN.cpp
6162
torch/csrc/generated
6263
torch/csrc/generic/TensorMethods.cpp

CMakeLists.txt

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ cmake_dependent_option(
136136
CAFFE2_USE_MSVC_STATIC_RUNTIME "Using MSVC static runtime libraries" ON
137137
"NOT BUILD_SHARED_LIBS" OFF)
138138
option(BUILD_TEST "Build C++ test binaries (need gtest and gbenchmark)" OFF)
139+
option(BUILD_STATIC_RUNTIME_BENCHMARK "Build C++ binaries for static runtime benchmarks (need gbenchmark)" OFF)
139140
option(BUILD_MOBILE_BENCHMARKS "Build C++ test binaries for mobile (ARM) targets(need gtest and gbenchmark)" OFF)
140141
option(BUILD_MOBILE_TEST "Build C++ test binaries for mobile (ARM) targets(need gtest and gbenchmark)" OFF)
141142
option(BUILD_JNI "Build JNI bindings" OFF)
@@ -609,6 +610,21 @@ if(USE_ASAN)
609610
string(APPEND CMAKE_LINKER_FLAGS_DEBUG " -fsanitize=address")
610611
endif()
611612

613+
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
614+
include(CheckCSourceCompiles)
615+
check_c_source_compiles("#include <arm_neon.h>
616+
int main() {
617+
float a[] = {1.0, 1.0};
618+
vld1q_f32_x2(a);
619+
return 0;
620+
}" HAS_VLD1)
621+
622+
if(NOT HAS_VLD1)
623+
string(APPEND CMAKE_CXX_FLAGS " -DMISSING_ARM_VLD1")
624+
endif()
625+
endif()
626+
627+
612628
# Add code coverage flags to supported compilers
613629
if(CODE_COVERAGE)
614630
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")

aten/src/ATen/BatchedFallback.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,15 +51,15 @@ void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Sta
5151
const auto& schema = op.schema();
5252
const auto num_returns = schema.returns().size();
5353
TORCH_CHECK(!schema.is_mutable() && !schema.hasAnyAliasInfo(),
54-
"Batching rule not implemented for ", schema, "; ",
54+
"Batching rule not implemented for ", schema.operator_name(), "; ",
5555
"the fallback path doesn't work on in-place or view ops.");
5656
TORCH_CHECK(areAllReturnsTensors(schema) && !areAnyArgumentsTensorList(schema),
57-
"Batching rule not implemented for ", schema, ". ",
57+
"Batching rule not implemented for ", schema.operator_name(), ". ",
5858
"We could not generate a fallback.");
5959
TORCH_CHECK(num_returns >= 1,
60-
"Batching rule not implemented for ", schema, ". ",
60+
"Batching rule not implemented for ", schema.operator_name(), ". ",
6161
"The fallback path does not support operations with no returns.");
62-
TORCH_WARN("Batching rule not implemented for ", schema, " falling back "
62+
TORCH_WARN("Batching rule not implemented for ", schema.operator_name(), " falling back "
6363
"to slow (for loop and stack) implementation");
6464

6565
const auto num_arguments = schema.arguments().size();

aten/src/ATen/Context.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,14 @@ void Context::alertNotDeterministic(c10::string_view const& caller) {
7878
}
7979
}
8080

81+
bool Context::allowTF32CuDNN() const {
82+
return allow_tf32_cudnn;
83+
}
84+
85+
void Context::setAllowTF32CuDNN(bool b) {
86+
allow_tf32_cudnn = b;
87+
}
88+
8189
static const char cublas_config_var_name[] = "CUBLAS_WORKSPACE_CONFIG";
8290
static const char* const cublas_deterministic_configs[] = { ":4096:8", ":16:8" };
8391

aten/src/ATen/Context.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ class CAFFE2_API Context {
115115
bool deterministic() const;
116116
void setDeterministic(bool);
117117
void alertNotDeterministic(c10::string_view const& caller);
118+
bool allowTF32CuDNN() const;
119+
void setAllowTF32CuDNN(bool);
118120
bool allowTF32CuBLAS() const;
119121
void setAllowTF32CuBLAS(bool);
120122
void alertCuBLASConfigNotDeterministic();
@@ -146,6 +148,7 @@ class CAFFE2_API Context {
146148
bool deterministic_cudnn = false;
147149
bool _deterministic = false;
148150
bool benchmark_cudnn = false;
151+
bool allow_tf32_cudnn = true;
149152
bool allow_tf32_cublas = true;
150153
bool enabled_mkldnn = true;
151154
#ifdef C10_MOBILE

aten/src/ATen/NumericUtils.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ inline C10_HOST_DEVICE bool _isnan(T val) {
3434
}
3535

3636
template <typename T,
37-
typename std::enable_if<c10::is_complex_t<T>::value, int>::type = 0>
37+
typename std::enable_if<c10::is_complex<T>::value, int>::type = 0>
3838
inline bool _isnan(T val) {
3939
return std::isnan(val.real()) || std::isnan(val.imag());
4040
}

aten/src/ATen/autocast_mode.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,8 @@ TORCH_LIBRARY_IMPL(_, Autocast, m) {
255255
}
256256

257257
TORCH_LIBRARY_IMPL(aten, Autocast, m) {
258-
KERNEL(ADD_NS(_convolution), "_convolution", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t, bool, bool, bool), fp16)
258+
KERNEL(ADD_NS(_convolution), "_convolution.deprecated", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t, bool, bool, bool), fp16)
259+
KERNEL(ADD_NS(_convolution), "_convolution", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t, bool, bool, bool, bool), fp16)
259260
KERNEL(ADD_NS(_convolution_nogroup), "_convolution_nogroup", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef), fp16)
260261
KERNEL(ADD_NS(conv1d), "conv1d", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), fp16)
261262
KERNEL(ADD_NS(conv2d), "conv2d", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), fp16)
@@ -267,8 +268,10 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) {
267268
KERNEL(ADD_NS(convolution), "convolution", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t), fp16)
268269
KERNEL(ADD_NS(cudnn_convolution), "cudnn_convolution.deprecated", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool), fp16)
269270
KERNEL(ADD_NS(cudnn_convolution_transpose), "cudnn_convolution_transpose.deprecated", Tensor (const Tensor &, const Tensor &, const c10::optional<Tensor>&, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool), fp16)
270-
KERNEL(ADD_NS(cudnn_convolution), "cudnn_convolution", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool), fp16)
271-
KERNEL(ADD_NS(cudnn_convolution_transpose), "cudnn_convolution_transpose", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool), fp16)
271+
KERNEL(ADD_NS(cudnn_convolution), "cudnn_convolution.deprecated2", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool), fp16)
272+
KERNEL(ADD_NS(cudnn_convolution_transpose), "cudnn_convolution_transpose.deprecated2", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool), fp16)
273+
KERNEL(ADD_NS(cudnn_convolution), "cudnn_convolution", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool, bool), fp16)
274+
KERNEL(ADD_NS(cudnn_convolution_transpose), "cudnn_convolution_transpose", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool, bool), fp16)
272275
KERNEL(ADD_NS(prelu), "prelu", Tensor (const Tensor &, const Tensor &), fp16)
273276
KERNEL(ADD_NS(addmm), "addmm", Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar, Scalar), fp16)
274277
KERNEL(ADD_NS(addmv), "addmv", Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar, Scalar), fp16)

aten/src/ATen/core/aten_interned_strings.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,8 @@ _(aten, clamp_min) \
238238
_(aten, clone) \
239239
_(aten, coalesce) \
240240
_(aten, combinations) \
241+
_(aten, _conj) \
242+
_(aten, conj) \
241243
_(aten, complex) \
242244
_(aten, polar) \
243245
_(aten, constant_pad_nd) \

aten/src/ATen/core/jit_type.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,7 @@ struct CAFFE2_API SymbolicShape {
507507
};
508508

509509
template <typename T>
510-
struct CAFFE2_API VaryingShape {
510+
struct VaryingShape {
511511
using ListOfOptionalElements = std::vector<c10::optional<T>>;
512512
VaryingShape(const std::vector<T>& vec)
513513
: VaryingShape(ListOfOptionalElements(vec.begin(), vec.end())) {}
@@ -548,7 +548,7 @@ struct CAFFE2_API VaryingShape {
548548
return dims_;
549549
}
550550

551-
VaryingShape merge(const VaryingShape& other) const;
551+
CAFFE2_API VaryingShape merge(const VaryingShape& other) const;
552552

553553
c10::optional<std::vector<T>> concrete_sizes() const {
554554
if (!dims_) {

0 commit comments

Comments (0)