Merge commit for internal changes · evdevdev/tensorflow@2e7ea3d · GitHub


Commit 2e7ea3d

Author: Vijay Vasudevan
Merge commit for internal changes
2 parents 02fb157 + ff57d12, commit 2e7ea3d

File tree: 36 files changed (+1064, -213 lines)

WORKSPACE

Lines changed: 2 additions & 2 deletions
@@ -21,8 +21,8 @@ new_http_archive(
 
 new_http_archive(
   name = "eigen_archive",
-  url = "https://bitbucket.org/eigen/eigen/get/d861b41.tar.gz",
-  sha256 = "8a22dd6597243592a6cb9e5aaed3c16b94848f0f6742312c45e3bbb35d33935d",
+  url = "https://bitbucket.org/eigen/eigen/get/fb2fa05.tar.gz",
+  sha256 = "8aacd8065d52528af1a22d6b72925dbb6b9fb8f25e46769481dd06d3edf63bbd",
   build_file = "eigen.BUILD",
 )
 

eigen.BUILD

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 package(default_visibility = ["//visibility:public"])
 
-archive_dir = "eigen-eigen-d861b41b1996"
+archive_dir = "eigen-eigen-fb2fa0527077"
 
 cc_library(
   name = "eigen",

tensorflow/core/BUILD

Lines changed: 5 additions & 17 deletions
@@ -773,23 +773,13 @@ cc_library(
     deps = ["//tensorflow/core/platform/default/build_config:protos_cc"],
 )
 
-cc_library(
-    name = "copy_tensor",
-    deps = [
-        ":lib",
-        ":protos_cc",
-        ":stream_executor",
-        "//third_party/eigen3",
-    ],
-)
-
 tf_cuda_library(
     name = "core_cpu_internal",
     srcs = glob(
         [
             "client/**/*.cc",
-            "common_runtime/**/*.h",  # TODO(josh11b): exclude common_runtime/gpu/
-            "common_runtime/**/*.cc",
+            "common_runtime/*.h",
+            "common_runtime/*.cc",
             "graph/**/*.h",
             "graph/**/*.cc",
             "public/session.h",
@@ -800,16 +790,14 @@ tf_cuda_library(
         exclude = [
             "**/*test*",
             "**/*main.cc",
-            "common_runtime/gpu/*.cc",
-            "common_runtime/copy_tensor.cc",
             "common_runtime/gpu_device_factory.cc",
             "common_runtime/direct_session.cc",
             "common_runtime/direct_session.h",
         ],
     ),
     hdrs = glob(
         [
-            "common_runtime/**/*.h",  # TODO(josh11b): exclude common_runtime/gpu/
+            "common_runtime/*.h",
             "graph/**/*.h",
         ],
         exclude = [
@@ -819,7 +807,6 @@ tf_cuda_library(
     ),
     copts = tf_copts(),
     deps = [
-        ":copy_tensor",
         ":framework",
         ":framework_internal",
         ":lib",
@@ -861,14 +848,14 @@ tf_cuda_library(
     name = "gpu_runtime",
     srcs = glob(
         [
-            "common_runtime/gpu/*.h",
             "common_runtime/gpu/*.cc",
         ],
         exclude = [
            "**/*main.cc",
            "**/*test.cc",
         ],
     ),
+    hdrs = glob(["common_runtime/gpu/*.h"]),
     copts = tf_copts(),
     cuda_deps = [
         ":cuda",
@@ -1020,6 +1007,7 @@ tf_cc_tests(
         ":direct_session",
         ":framework",
         ":framework_internal",
+        ":gpu_runtime",
         ":kernels",
         ":lib",
         ":lib_internal",
tensorflow/core/common_runtime/copy_tensor.cc (new file)

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+
+#include <vector>
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/tracing.h"
+
+namespace tensorflow {
+namespace {
+
+static bool initialization_done = false;
+
+struct RegistrationInfo {
+  RegistrationInfo(DeviceType s, DeviceType r, CopyTensor::CopyFunction cf)
+      : sender_device_type(s), receiver_device_type(r), copy_function(cf) {}
+  DeviceType sender_device_type;
+  DeviceType receiver_device_type;
+  CopyTensor::CopyFunction copy_function;
+};
+
+// We use a vector instead of a map since we expect there to be very
+// few registrations.
+std::vector<RegistrationInfo>* MutableRegistry() {
+  static std::vector<RegistrationInfo>* registry =
+      new std::vector<RegistrationInfo>;
+  return registry;
+}
+
+}  // namespace
+
+// static
+void CopyTensor::ViaDMA(const string& edge_name,
+                        DeviceContext* send_dev_context,
+                        DeviceContext* recv_dev_context, Device* src,
+                        Device* dst, const AllocatorAttributes src_alloc_attr,
+                        const AllocatorAttributes dst_alloc_attr,
+                        const Tensor* input, Tensor* output,
+                        StatusCallback done) {
+  initialization_done = true;
+  port::Tracing::ScopedAnnotation annotation(edge_name);
+  VLOG(1) << "CopyViaDMA " << edge_name;
+  const size_t total_bytes = input->TotalBytes();
+
+  // Note that 0-size tensors have no backing buffer.
+  if (total_bytes > 0) {
+    const DeviceType src_device_type(src_alloc_attr.on_host()
+                                         ? DEVICE_CPU
+                                         : src->attributes().device_type());
+    const DeviceType dst_device_type(dst_alloc_attr.on_host()
+                                         ? DEVICE_CPU
+                                         : dst->attributes().device_type());
+    const bool non_cpu_src = src_device_type != DeviceType(DEVICE_CPU);
+    const bool non_cpu_dst = dst_device_type != DeviceType(DEVICE_CPU);
+
+    if (non_cpu_src) {
+      if (non_cpu_dst) {
+        // Device to device copy.  Look through registry for an appropriate
+        // CopyFunction.
+        std::vector<RegistrationInfo>* registry = MutableRegistry();
+        for (const RegistrationInfo& ri : *registry) {
+          if (ri.sender_device_type == src_device_type &&
+              ri.receiver_device_type == dst_device_type) {
+            ri.copy_function(send_dev_context, recv_dev_context, src, dst,
+                             src_alloc_attr, dst_alloc_attr, input, output,
+                             done);
+            return;
+          }
+        }
+
+        // TODO(josh11b): If no CopyFunction is found, we currently fail
+        // but we could copy between devices via CPU.
+        done(errors::Unimplemented(
+            "No function registered to copy from devices of type ",
+            src_device_type.type(), " to devices of type ",
+            dst_device_type.type()));
+      } else {
+        // Device to host copy.
+        return send_dev_context->CopyDeviceTensorToCPU(input, edge_name, src,
+                                                       output, done);
+      }
+    } else if (non_cpu_dst) {
+      // Host to Device copy.
+      // Note that this is already an async copy.
+      recv_dev_context->CopyCPUTensorToDevice(input, dst, output, done);
+    } else {
+      *output = *input;
+      done(Status::OK());
+    }
+  } else {
+    // buffer is empty
+    done(Status::OK());
+  }
+}
+
+// static
+Status CopyTensor::Register(DeviceType sender_device_type,
+                            DeviceType receiver_device_type,
+                            CopyFunction copy_function) {
+  if (initialization_done) {
+    return errors::FailedPrecondition(
+        "May only register CopyTensor functions before the first tensor "
+        "is copied.");
+  }
+  std::vector<RegistrationInfo>* registry = MutableRegistry();
+  registry->emplace_back(sender_device_type, receiver_device_type,
+                         copy_function);
+  return Status::OK();
+}
+
+}  // namespace tensorflow
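A hedged caller sketch for orientation: the device, context, tensor, and attribute variables below are hypothetical stand-ins, not part of this commit; only the CopyTensor::ViaDMA signature and the callback shape come from the code above.

// Hypothetical caller (illustrative only): copy a GPU tensor to host
// memory and check the status once the copy has actually finished.
Tensor out;
CopyTensor::ViaDMA("edge_1;src:GPU:0;dst:CPU:0",  // edge_name, debug only
                   send_ctx, recv_ctx, gpu_device, cpu_device,
                   AllocatorAttributes(), host_alloc_attrs, &in, &out,
                   [](const Status& s) {
                     // Invoked only once the copy is actually complete,
                     // possibly after ViaDMA itself has returned.
                     TF_CHECK_OK(s);
                   });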
tensorflow/core/common_runtime/copy_tensor.h (new file)

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COPY_TENSOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_COPY_TENSOR_H_
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+class CopyTensor {
+ public:
+  typedef void (*CopyFunction)(DeviceContext* send_dev_context,
+                               DeviceContext* recv_dev_context, Device* src,
+                               Device* dst,
+                               const AllocatorAttributes src_alloc_attr,
+                               const AllocatorAttributes dst_alloc_attr,
+                               const Tensor* input, Tensor* output,
+                               StatusCallback done);
+
+  // Copies "input" to "output" between devices accessible to the
+  // local process via some DMA-like method.  "edge_name" is the name
+  // of the tensor being copied, for debugging purposes.  Depending on
+  // the type of devices and memory in use, the copy may be performed
+  // synchronously or asynchronously.  'done' will be invoked only
+  // after the copy is actually complete.
+  static void ViaDMA(const string& edge_name, DeviceContext* send_dev_context,
+                     DeviceContext* recv_dev_context, Device* src, Device* dst,
+                     const AllocatorAttributes src_alloc_attr,
+                     const AllocatorAttributes dst_alloc_attr,
+                     const Tensor* input, Tensor* output, StatusCallback done);
+
+  // Register a function for copying between two specific DeviceTypes.
+  static Status Register(DeviceType sender_device_type,
+                         DeviceType receiver_device_type,
+                         CopyFunction copy_function);
+
+  // Object used to call Register() at static-initialization time.
+  class Registration {
+   public:
+    Registration(DeviceType sender_device_type,
+                 DeviceType receiver_device_type, CopyFunction copy_function) {
+      TF_QCHECK_OK(
+          Register(sender_device_type, receiver_device_type, copy_function));
+    }
+  };
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_COPY_TENSOR_H_
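Since the Registration helper above is the intended way to populate the copy-function registry before the first ViaDMA call closes it, here is a minimal sketch of a backend registering a copy function at static-initialization time. The "EXAMPLE_A"/"EXAMPLE_B" device-type names and the empty copy body are hypothetical; this commit itself registers nothing.

// Hypothetical backend registration (illustrative only).
static void ExampleCopy(DeviceContext* send_dev_context,
                        DeviceContext* recv_dev_context, Device* src,
                        Device* dst, const AllocatorAttributes src_alloc_attr,
                        const AllocatorAttributes dst_alloc_attr,
                        const Tensor* input, Tensor* output,
                        StatusCallback done) {
  // ... issue the device-to-device DMA, then signal completion ...
  done(Status::OK());
}

// Constructed before main(), so Register() runs while registration is
// still open (i.e. before the first tensor is copied).
static CopyTensor::Registration example_registration(
    DeviceType("EXAMPLE_A"), DeviceType("EXAMPLE_B"), ExampleCopy);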

tensorflow/core/common_runtime/gpu/dma_helper.h renamed to tensorflow/core/common_runtime/dma_helper.h

Lines changed: 7 additions & 5 deletions
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_
-#define TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_
+#ifndef TENSORFLOW_COMMON_RUNTIME_DMA_HELPER_H_
+#define TENSORFLOW_COMMON_RUNTIME_DMA_HELPER_H_
 
 #include "tensorflow/core/public/tensor.h"
 
-// For internal use only. Visibility should be limited to brain/framework.
-
 namespace tensorflow {
+
+// For TensorFlow internal use only.
 class DMAHelper {
  public:
   static bool CanUseDMA(const Tensor* t) { return t->CanUseDMA(); }
@@ -29,5 +29,7 @@ class DMAHelper {
   static TensorBuffer* buffer(Tensor* t) { return t->buf_; }
   static const TensorBuffer* buffer(const Tensor* t) { return t->buf_; }
 };
+
 }  // namespace tensorflow
-#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_
+
+#endif  // TENSORFLOW_COMMON_RUNTIME_DMA_HELPER_H_

tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc

Lines changed: 13 additions & 2 deletions
@@ -159,12 +159,14 @@ void* GPUBFCAllocator::AllocateRawInternal(size_t unused_alignment,
   // Start searching from the first bin for the smallest chunk that fits
   // rounded_bytes.
   Bin* b = it->second;
-  for (GPUBFCAllocator::Chunk* chunk : b->free_chunks) {
+  for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end();
+       ++citer) {
+    GPUBFCAllocator::Chunk* chunk = (*citer);
     DCHECK(!chunk->in_use());
     if (chunk->size >= rounded_bytes) {
       // We found an existing chunk that fits us that wasn't in use, so remove
       // it from the free bin structure prior to using.
-      RemoveFreeChunkFromBin(chunk);
+      RemoveFreeChunkIterFromBin(&b->free_chunks, citer);
 
       // If we can break the size of the chunk into two reasonably
       // large pieces, do so.
@@ -299,6 +301,15 @@ void GPUBFCAllocator::InsertFreeChunkIntoBin(GPUBFCAllocator::Chunk* c) {
   new_bin->free_chunks.insert(c);
 }
 
+void GPUBFCAllocator::RemoveFreeChunkIterFromBin(
+    GPUBFCAllocator::Bin::FreeChunkSet* free_chunks,
+    const GPUBFCAllocator::Bin::FreeChunkSet::iterator& citer) {
+  GPUBFCAllocator::Chunk* c = *citer;
+  CHECK(!c->in_use() && c->bin);
+  free_chunks->erase(citer);
+  c->bin = nullptr;
+}
+
 void GPUBFCAllocator::RemoveFreeChunkFromBin(GPUBFCAllocator::Chunk* c) {
   CHECK(!c->in_use() && c->bin);
   int count = c->bin->free_chunks.erase(c);
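The new RemoveFreeChunkIterFromBin exists because the allocation path above already holds an iterator into the bin's sorted free-chunk set, so it can erase by iterator rather than paying RemoveFreeChunkFromBin's extra erase-by-key search. A standalone sketch of that pattern, using a plain std::set instead of the allocator's types:

// Erase-by-iterator vs. erase-by-key on an ordered set.
#include <iostream>
#include <set>

int main() {
  std::set<int> free_sizes = {256, 1024, 4096};
  const int rounded_bytes = 512;
  // Scan for the smallest entry that fits, as AllocateRawInternal does.
  for (auto it = free_sizes.begin(); it != free_sizes.end(); ++it) {
    if (*it >= rounded_bytes) {
      std::cout << "taking chunk of size " << *it << "\n";
      free_sizes.erase(it);  // amortized O(1): no second O(log n) lookup
      break;                 // 'it' is now invalid, so stop the scan
    }
  }
  return 0;
}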

tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h

Lines changed: 14 additions & 12 deletions
@@ -125,17 +125,6 @@ class GPUBFCAllocator : public VisitableAllocator {
       return dbg;
     }
   };
-
-  Chunk* AllocateNewChunk(size_t num_bytes);
-  void SplitChunk(Chunk* c, size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
-  void Merge(Chunk* c1, Chunk* c2) EXCLUSIVE_LOCKS_REQUIRED(lock_);
-  void FreeAndMaybeCoalesce(Chunk* c) EXCLUSIVE_LOCKS_REQUIRED(lock_);
-  void InsertFreeChunkIntoBin(Chunk* c) EXCLUSIVE_LOCKS_REQUIRED(lock_);
-  void RemoveFreeChunkFromBin(Chunk* c);
-  void DeleteChunk(Chunk* c) EXCLUSIVE_LOCKS_REQUIRED(lock_);
-
-  void DumpMemoryLog(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
-
   // A Bin is a collection of similar-sized free chunks.
   struct Bin {
     // All chunks in this bin have >= bin_size memory.
@@ -151,13 +140,26 @@ class GPUBFCAllocator : public VisitableAllocator {
       }
     };
 
+    typedef std::set<Chunk*, ChunkComparator> FreeChunkSet;
     // List of free chunks within the bin, sorted by chunk size.
     // Chunk * not owned.
-    std::set<Chunk*, ChunkComparator> free_chunks;
+    FreeChunkSet free_chunks;
 
     explicit Bin(size_t bs) : bin_size(bs) {}
   };
 
+  Chunk* AllocateNewChunk(size_t num_bytes);
+  void SplitChunk(Chunk* c, size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  void Merge(Chunk* c1, Chunk* c2) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  void FreeAndMaybeCoalesce(Chunk* c) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  void InsertFreeChunkIntoBin(Chunk* c) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
+                                  const Bin::FreeChunkSet::iterator& c);
+  void RemoveFreeChunkFromBin(Chunk* c);
+  void DeleteChunk(Chunk* c) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  void DumpMemoryLog(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
   GPUAllocatorRetry retry_helper_;
 
   // Structures immutable after construction