From de0edc09175ecb88d2851691af8e0ffb6ac99dde Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Fri, 22 Oct 2021 16:09:53 +0100
Subject: [PATCH 1/2] [caffe2] Remove Operator::newstyle_outputs_

`OperatorBase` maintains `output_tensors_` and `newstyle_outputs_`
which hold the same list of tensors except one is
`vector<caffe2::Tensor>` and the other is `List<at::Tensor>`.

This instead maintains only `output_tensors_` and handles the
conversions inside of export_caffe2_op_to_c10.

[ghstack-poisoned]
---
 caffe2/contrib/aten/aten_op_template.h |  2 +-
 caffe2/core/export_caffe2_op_to_c10.h  |  7 ++++-
 caffe2/core/operator.cc                | 13 ++++-----
 caffe2/core/operator.h                 | 39 +++++++++-----------------
 4 files changed, 27 insertions(+), 34 deletions(-)
diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h
index 68b1feda93b7..7cc76cad1f5b 100644
--- a/caffe2/contrib/aten/aten_op_template.h
+++ b/caffe2/contrib/aten/aten_op_template.h
@@ -1,7 +1,7 @@
 #pragma once
 #include <unordered_map>
 #include <string>
-#include <ATen/ATen.h>
+#include <ATen/Functions.h>
 #include <c10/macros/Macros.h>
 #include <caffe2/core/context.h>
 #include <caffe2/core/operator.h>
diff --git a/caffe2/core/export_caffe2_op_to_c10.h b/caffe2/core/export_caffe2_op_to_c10.h
index bac3b0fd5846..5cc6e1d60634 100644
--- a/caffe2/core/export_caffe2_op_to_c10.h
+++ b/caffe2/core/export_caffe2_op_to_c10.h
@@ -30,7 +30,12 @@ inline c10::List<at::Tensor> _call_caffe2_op(
     c10::List<at::Tensor>&& outputs) {
   Caffe2Operator op(schema, std::move(inputs), std::move(outputs), -1);
   op.Run(-1);
-  return std::move(op).move_newstyle_outputs();
+  auto op_outputs = std::move(op).move_output_tensors();
+  TORCH_INTERNAL_ASSERT(outputs.size() == op_outputs.size());
+  for (auto i : c10::irange(op_outputs.size())) {
+    outputs[i] = at::Tensor(std::move(op_outputs[i]));
+  }
+  return std::move(outputs);
 }
 
 // This function is inline in the hope that compilers optimizing for speed will
diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc
index e25c92a6d607..aa203d5d6d69 100644
--- a/caffe2/core/operator.cc
+++ b/caffe2/core/operator.cc
@@ -59,10 +59,6 @@ OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws)
       device_option_(
           operator_def.has_device_option() ? operator_def.device_option()
                                            : DeviceOption()),
-#if defined(EXPOSE_C2_OPS) || \
-    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-      newstyle_outputs_(),
-#endif
       input_size_(operator_def.input_size()),
       event_(std::make_unique<Event>(device_option_)) {
   static GlobalInitIsCalledGuard guard;
@@ -124,14 +120,17 @@ compute_input_size_(const std::vector<c10::IValue>& inputs) {
 OperatorBase::OperatorBase(
     const c10::FunctionSchema& fn_schema,
     std::vector<c10::IValue> inputs,
-    c10::List<at::Tensor> outputs)
+    const c10::List<at::Tensor> &outputs)
     // NOLINTNEXTLINE(performance-move-const-arg)
     : fn_schema_(make_unique<c10::FunctionSchema>(std::move(fn_schema))),
       newstyle_inputs_(std::move(inputs)),
-      newstyle_outputs_(std::move(outputs)),
       input_size_(compute_input_size_(newstyle_inputs_)) {
   input_tensors_.resize(input_size_);
-  output_tensors_.resize(newstyle_outputs_.size());
+
+  output_tensors_.reserve(outputs_.size());
+  for (auto i : c10::irange(outputs.size())) {
+    output_tensors_.emplace_back(outputs.extract(i));
+  }
 }
 #endif
 
diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h
index b670845f290b..f9a8d0209da7 100644
--- a/caffe2/core/operator.h
+++ b/caffe2/core/operator.h
@@ -69,7 +69,7 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
   explicit OperatorBase(
       const c10::FunctionSchema& schema,
       std::vector<c10::IValue> inputs,
-      c10::List<at::Tensor> outputs);
+      const c10::List<at::Tensor> &outputs);
 #endif
 
   virtual ~OperatorBase() noexcept;
@@ -245,15 +245,12 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
     }
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-    at::Tensor output = newstyle_outputs_[idx];
-    if (!output.defined() || caffe2::Tensor(output).GetDeviceType() != type) {
+    auto &output = output_tensors_[idx];
+    if (!output.defined() || output.GetDeviceType() != type) {
       // Fix tensor type
-      Tensor tensor = Tensor(type);
-      output = at::Tensor(std::move(tensor.getIntrusivePtr()));
+      output = Tensor(type);
     }
-    output_tensors_[idx] = caffe2::Tensor(output);
-    newstyle_outputs_[idx] = std::move(output);
-    return &output_tensors_[idx];
+    return &output;
 #else
     CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
 #endif
@@ -275,9 +272,6 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
     if (!isLegacyOperator()) {
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-      newstyle_outputs_[idx] = at::Tensor(tensor);
-
-      // also update the tensor in the hack
       output_tensors_[idx] = std::move(tensor);
 #else
       CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
@@ -305,16 +299,12 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
     }
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-    at::Tensor output = newstyle_outputs_[idx];
-    Tensor tensor = output.defined()
-        ? GetSizedTensorWithOptions(caffe2::Tensor(output), dims, options)
+    auto &output = output_tensors_[idx];
+    output = output.defined()
+        ? GetSizedTensorWithOptions(std::move(output), dims, options)
         : caffe2::empty(dims, options);
-    // assign it back in case it changed
-    output = at::Tensor(std::move(tensor.getIntrusivePtr()));
 
-    output_tensors_[idx] = caffe2::Tensor(output);
-    newstyle_outputs_[idx] = std::move(output);
-    return &output_tensors_[idx];
+    return &output;
 #else
     CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
 #endif
@@ -429,7 +419,7 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
     }
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-    return newstyle_outputs_.size();
+    return output_tensors_.size();
 #else
     CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2");
 #endif
@@ -594,8 +584,8 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
 
 #if defined(EXPOSE_C2_OPS) || \
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
-  c10::List<at::Tensor> move_newstyle_outputs() && {
-    return std::move(newstyle_outputs_);
+  std::vector<caffe2::Tensor> move_output_tensors() && {
+    return std::move(output_tensors_);
   }
 #endif
 
@@ -615,7 +605,6 @@ class TORCH_API OperatorBase : public Observable<OperatorBase> {
     !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
   std::unique_ptr<const c10::FunctionSchema> fn_schema_;
   vector<c10::IValue> newstyle_inputs_;
-  c10::List<at::Tensor> newstyle_outputs_;
 #endif
   // HACK
   // We preserve the fact that Output() returns Tensor*
@@ -814,9 +803,9 @@ class Operator : public OperatorBase {
   explicit Operator(
       const c10::FunctionSchema& fn_schema,
       std::vector<c10::IValue> inputs,
-      c10::List<at::Tensor> outputs,
+      const c10::List<at::Tensor> &outputs,
       StreamId stream = 0)
-      : OperatorBase(fn_schema, std::move(inputs), std::move(outputs)) {
+      : OperatorBase(fn_schema, std::move(inputs), outputs) {
     // In the constructor, we switch to the device so that the child class
     // constructors will run on that device.
     context_.SwitchToDevice(stream);

From 7bff0a1d7355e3e3d7e55e957bb9589528d9fd1d Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Fri, 22 Oct 2021 16:14:38 +0100
Subject: [PATCH 2/2] Update on "[caffe2] Remove Operator::newstyle_outputs_"

`OperatorBase` maintains `output_tensors_` and `newstyle_outputs_`
which hold the same list of tensors except one is
`vector<caffe2::Tensor>` and the other is `List<at::Tensor>`.

This instead maintains only `output_tensors_` and handles the
conversions inside of export_caffe2_op_to_c10.

[ghstack-poisoned]
---
 caffe2/core/operator.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc
index aa203d5d6d69..7711e0956f6e 100644
--- a/caffe2/core/operator.cc
+++ b/caffe2/core/operator.cc
@@ -127,7 +127,7 @@ OperatorBase::OperatorBase(
       input_size_(compute_input_size_(newstyle_inputs_)) {
   input_tensors_.resize(input_size_);
 
-  output_tensors_.reserve(outputs_.size());
+  output_tensors_.reserve(outputs.size());
   for (auto i : c10::irange(outputs.size())) {
     output_tensors_.emplace_back(outputs.extract(i));
   }