Merge commit for internal changes · evdevdev/tensorflow@2e7ea3d · GitHub


Commit 2e7ea3d

Author: Vijay Vasudevan
Merge commit for internal changes
2 parents 02fb157 + ff57d12, commit 2e7ea3d

File tree: 36 files changed (+1064, -213 lines)

WORKSPACE

Lines changed: 2 additions & 2 deletions
@@ -21,8 +21,8 @@ new_http_archive(
 
 new_http_archive(
   name = "eigen_archive",
-  url = "https://bitbucket.org/eigen/eigen/get/d861b41.tar.gz",
-  sha256 = "8a22dd6597243592a6cb9e5aaed3c16b94848f0f6742312c45e3bbb35d33935d",
+  url = "https://bitbucket.org/eigen/eigen/get/fb2fa05.tar.gz",
+  sha256 = "8aacd8065d52528af1a22d6b72925dbb6b9fb8f25e46769481dd06d3edf63bbd",
   build_file = "eigen.BUILD",
 )
 

eigen.BUILD

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 package(default_visibility = ["//visibility:public"])
 
-archive_dir = "eigen-eigen-d861b41b1996"
+archive_dir = "eigen-eigen-fb2fa0527077"
 
 cc_library(
   name = "eigen",

tensorflow/core/BUILD

Lines changed: 5 additions & 17 deletions
@@ -773,23 +773,13 @@ cc_library(
     deps = ["//tensorflow/core/platform/default/build_config:protos_cc"],
 )
 
-cc_library(
-    name = "copy_tensor",
-    deps = [
-        ":lib",
-        ":protos_cc",
-        ":stream_executor",
-        "//third_party/eigen3",
-    ],
-)
-
 tf_cuda_library(
     name = "core_cpu_internal",
     srcs = glob(
         [
             "client/**/*.cc",
-            "common_runtime/**/*.h",  # TODO(josh11b): exclude common_runtime/gpu/
-            "common_runtime/**/*.cc",
+            "common_runtime/*.h",
+            "common_runtime/*.cc",
             "graph/**/*.h",
             "graph/**/*.cc",
             "public/session.h",
@@ -800,16 +790,14 @@ tf_cuda_library(
         exclude = [
             "**/*test*",
             "**/*main.cc",
-            "common_runtime/gpu/*.cc",
-            "common_runtime/copy_tensor.cc",
             "common_runtime/gpu_device_factory.cc",
             "common_runtime/direct_session.cc",
             "common_runtime/direct_session.h",
         ],
     ),
     hdrs = glob(
         [
-            "common_runtime/**/*.h",  # TODO(josh11b): exclude common_runtime/gpu/
+            "common_runtime/*.h",
             "graph/**/*.h",
         ],
         exclude = [
@@ -819,7 +807,6 @@ tf_cuda_library(
     ),
     copts = tf_copts(),
     deps = [
-        ":copy_tensor",
         ":framework",
         ":framework_internal",
         ":lib",
@@ -861,14 +848,14 @@ tf_cuda_library(
     name = "gpu_runtime",
     srcs = glob(
         [
-            "common_runtime/gpu/*.h",
             "common_runtime/gpu/*.cc",
         ],
         exclude = [
            "**/*main.cc",
            "**/*test.cc",
         ],
     ),
+    hdrs = glob(["common_runtime/gpu/*.h"]),
     copts = tf_copts(),
     cuda_deps = [
         ":cuda",
@@ -1020,6 +1007,7 @@ tf_cc_tests(
         ":direct_session",
         ":framework",
         ":framework_internal",
+        ":gpu_runtime",
         ":kernels",
         ":lib",
         ":lib_internal",
tensorflow/core/common_runtime/copy_tensor.cc (new file)

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+
+#include <vector>
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/tracing.h"
+
+namespace tensorflow {
+namespace {
+
+static bool initialization_done = false;
+
+struct RegistrationInfo {
+  RegistrationInfo(DeviceType s, DeviceType r, CopyTensor::CopyFunction cf)
+      : sender_device_type(s), receiver_device_type(r), copy_function(cf) {}
+  DeviceType sender_device_type;
+  DeviceType receiver_device_type;
+  CopyTensor::CopyFunction copy_function;
+};
+
+// We use a vector instead of a map since we expect there to be very
+// few registrations.
+std::vector<RegistrationInfo>* MutableRegistry() {
+  static std::vector<RegistrationInfo>* registry =
+      new std::vector<RegistrationInfo>;
+  return registry;
+}
+
+}  // namespace
+
+// static
+void CopyTensor::ViaDMA(const string& edge_name,
+                        DeviceContext* send_dev_context,
+                        DeviceContext* recv_dev_context, Device* src,
+                        Device* dst, const AllocatorAttributes src_alloc_attr,
+                        const AllocatorAttributes dst_alloc_attr,
+                        const Tensor* input, Tensor* output,
+                        StatusCallback done) {
+  initialization_done = true;
+  port::Tracing::ScopedAnnotation annotation(edge_name);
+  VLOG(1) << "CopyViaDMA " << edge_name;
+  const size_t total_bytes = input->TotalBytes();
+
+  // Note that 0-size tensors have no backing buffer.
+  if (total_bytes > 0) {
+    const DeviceType src_device_type(src_alloc_attr.on_host()
+                                         ? DEVICE_CPU
+                                         : src->attributes().device_type());
+    const DeviceType dst_device_type(dst_alloc_attr.on_host()
+                                         ? DEVICE_CPU
+                                         : dst->attributes().device_type());
+    const bool non_cpu_src = src_device_type != DeviceType(DEVICE_CPU);
+    const bool non_cpu_dst = dst_device_type != DeviceType(DEVICE_CPU);
+
+    if (non_cpu_src) {
+      if (non_cpu_dst) {
+        // Device to device copy.  Look through registry for an appropriate
+        // CopyFunction.
+        std::vector<RegistrationInfo>* registry = MutableRegistry();
+        for (const RegistrationInfo& ri : *registry) {
+          if (ri.sender_device_type == src_device_type &&
+              ri.receiver_device_type == dst_device_type) {
+            ri.copy_function(send_dev_context, recv_dev_context, src, dst,
+                             src_alloc_attr, dst_alloc_attr, input, output,
+                             done);
+            return;
+          }
+        }
+
+        // TODO(josh11b): If no CopyFunction is found, we currently fail
+        // but we could copy between devices via CPU.
+        done(errors::Unimplemented(
+            "No function registered to copy from devices of type ",
+            src_device_type.type(), " to devices of type ",
+            dst_device_type.type()));
+      } else {
+        // Device to host copy.
+        return send_dev_context->CopyDeviceTensorToCPU(input, edge_name, src,
+                                                       output, done);
+      }
+    } else if (non_cpu_dst) {
+      // Host to Device copy.
+      // Note that this is already an async copy.
+      recv_dev_context->CopyCPUTensorToDevice(input, dst, output, done);
+    } else {
+      *output = *input;
+      done(Status::OK());
+    }
+  } else {
+    // buffer is empty
+    done(Status::OK());
+  }
+}
+
+// static
+Status CopyTensor::Register(DeviceType sender_device_type,
+                            DeviceType receiver_device_type,
+                            CopyFunction copy_function) {
+  if (initialization_done) {
+    return errors::FailedPrecondition(
+        "May only register CopyTensor functions before the first tensor "
+        "is copied.");
+  }
+  std::vector<RegistrationInfo>* registry = MutableRegistry();
+  registry->emplace_back(sender_device_type, receiver_device_type,
+                         copy_function);
+  return Status::OK();
+}
+
+}  // namespace tensorflow
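A hedged caller sketch for orientation: the device, context, tensor, and attribute variables below are hypothetical stand-ins, not part of this commit; only the CopyTensor::ViaDMA signature and the callback shape come from the code above.

// Hypothetical caller (illustrative only): copy a GPU tensor to host
// memory and check the status once the copy has actually finished.
Tensor out;
CopyTensor::ViaDMA("edge_1;src:GPU:0;dst:CPU:0",  // edge_name, debug only
                   send_ctx, recv_ctx, gpu_device, cpu_device,
                   AllocatorAttributes(), host_alloc_attrs, &in, &out,
                   [](const Status& s) {
                     // Invoked only once the copy is actually complete,
                     // possibly after ViaDMA itself has returned.
                     TF_CHECK_OK(s);
                   });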
tensorflow/core/common_runtime/copy_tensor.h (new file)

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COPY_TENSOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_COPY_TENSOR_H_
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+class CopyTensor {
+ public:
+  typedef void (*CopyFunction)(DeviceContext* send_dev_context,
+                               DeviceContext* recv_dev_context, Device* src,
+                               Device* dst,
+                               const AllocatorAttributes src_alloc_attr,
+                               const AllocatorAttributes dst_alloc_attr,
+                               const Tensor* input, Tensor* output,
+                               StatusCallback done);
+
+  // Copies "input" to "output" between devices accessible to the
+  // local process via some DMA-like method.  "edge_name" is the name
+  // of the tensor being copied, for debugging purposes.  Depending on
+  // the type of devices and memory in use, the copy may be performed
+  // synchronously or asynchronously.  'done' will be invoked only
+  // after the copy is actually complete.
+  static void ViaDMA(const string& edge_name, DeviceContext* send_dev_context,
+                     DeviceContext* recv_dev_context, Device* src, Device* dst,
+                     const AllocatorAttributes src_alloc_attr,
+                     const AllocatorAttributes dst_alloc_attr,
+                     const Tensor* input, Tensor* output, StatusCallback done);
+
+  // Register a function for copying between two specific DeviceTypes.
+  static Status Register(DeviceType sender_device_type,
+                         DeviceType receiver_device_type,
+                         CopyFunction copy_function);
+
+  // Object used to call Register() at static-initialization time.
+  class Registration {
+   public:
+    Registration(DeviceType sender_device_type,
+                 DeviceType receiver_device_type, CopyFunction copy_function) {
+      TF_QCHECK_OK(
+          Register(sender_device_type, receiver_device_type, copy_function));
+    }
+  };
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_COPY_TENSOR_H_
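Since the Registration helper above is the intended way to populate the copy-function registry before the first ViaDMA call closes it, here is a minimal sketch of a backend registering a copy function at static-initialization time. The "EXAMPLE_A"/"EXAMPLE_B" device-type names and the empty copy body are hypothetical; this commit itself registers nothing.

// Hypothetical backend registration (illustrative only).
static void ExampleCopy(DeviceContext* send_dev_context,
                        DeviceContext* recv_dev_context, Device* src,
                        Device* dst, const AllocatorAttributes src_alloc_attr,
                        const AllocatorAttributes dst_alloc_attr,
                        const Tensor* input, Tensor* output,
                        StatusCallback done) {
  // ... issue the device-to-device DMA, then signal completion ...
  done(Status::OK());
}

// Constructed before main(), so Register() runs while registration is
// still open (i.e. before the first tensor is copied).
static CopyTensor::Registration example_registration(
    DeviceType("EXAMPLE_A"), DeviceType("EXAMPLE_B"), ExampleCopy);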

tensorflow/core/common_runtime/gpu/dma_helper.h renamed to tensorflow/core/common_runtime/dma_helper.h

Lines changed: 7 additions & 5 deletions
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_
-#define TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_
+#ifndef TENSORFLOW_COMMON_RUNTIME_DMA_HELPER_H_
+#define TENSORFLOW_COMMON_RUNTIME_DMA_HELPER_H_
 
 #include "tensorflow/core/public/tensor.h"
 
-// For internal use only. Visibility should be limited to brain/framework.
-
 namespace tensorflow {
+
+// For TensorFlow internal use only.
 class DMAHelper {
  public:
   static bool CanUseDMA(const Tensor* t) { return t->CanUseDMA(); }
@@ -29,5 +29,7 @@ class DMAHelper {
   static TensorBuffer* buffer(Tensor* t) { return t->buf_; }
   static const TensorBuffer* buffer(const Tensor* t) { return t->buf_; }
 };
+
 }  // namespace tensorflow
-#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_
+
+#endif  // TENSORFLOW_COMMON_RUNTIME_DMA_HELPER_H_

tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc

Lines changed: 13 additions & 2 deletions
@@ -159,12 +159,14 @@ void* GPUBFCAllocator::AllocateRawInternal(size_t unused_alignment,
   // Start searching from the first bin for the smallest chunk that fits
   // rounded_bytes.
   Bin* b = it->second;
-  for (GPUBFCAllocator::Chunk* chunk : b->free_chunks) {
+  for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end();
+       ++citer) {
+    GPUBFCAllocator::Chunk* chunk = (*citer);
     DCHECK(!chunk->in_use());
     if (chunk->size >= rounded_bytes) {
       // We found an existing chunk that fits us that wasn't in use, so remove
       // it from the free bin structure prior to using.
-      RemoveFreeChunkFromBin(chunk);
+      RemoveFreeChunkIterFromBin(&b->free_chunks, citer);
 
       // If we can break the size of the chunk into two reasonably
       // large pieces, do so.
@@ -299,6 +301,15 @@ void GPUBFCAllocator::InsertFreeChunkIntoBin(GPUBFCAllocator::Chunk* c) {
   new_bin->free_chunks.insert(c);
 }
 
+void GPUBFCAllocator::RemoveFreeChunkIterFromBin(
+    GPUBFCAllocator::Bin::FreeChunkSet* free_chunks,
+    const GPUBFCAllocator::Bin::FreeChunkSet::iterator& citer) {
+  GPUBFCAllocator::Chunk* c = *citer;
+  CHECK(!c->in_use() && c->bin);
+  free_chunks->erase(citer);
+  c->bin = nullptr;
+}
+
 void GPUBFCAllocator::RemoveFreeChunkFromBin(GPUBFCAllocator::Chunk* c) {
   CHECK(!c->in_use() && c->bin);
   int count = c->bin->free_chunks.erase(c);
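The new RemoveFreeChunkIterFromBin exists because the allocation path above already holds an iterator into the bin's sorted free-chunk set, so it can erase by iterator rather than paying RemoveFreeChunkFromBin's extra erase-by-key search. A standalone sketch of that pattern, using a plain std::set instead of the allocator's types:

// Erase-by-iterator vs. erase-by-key on an ordered set.
#include <iostream>
#include <set>

int main() {
  std::set<int> free_sizes = {256, 1024, 4096};
  const int rounded_bytes = 512;
  // Scan for the smallest entry that fits, as AllocateRawInternal does.
  for (auto it = free_sizes.begin(); it != free_sizes.end(); ++it) {
    if (*it >= rounded_bytes) {
      std::cout << "taking chunk of size " << *it << "\n";
      free_sizes.erase(it);  // amortized O(1): no second O(log n) lookup
      break;                 // 'it' is now invalid, so stop the scan
    }
  }
  return 0;
}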

tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h

Lines changed: 14 additions & 12 deletions
@@ -125,17 +125,6 @@ class GPUBFCAllocator : public VisitableAllocator {
       return dbg;
     }
   };
-
-  Chunk* AllocateNewChunk(size_t num_bytes);
-  void SplitChunk(Chunk* c, size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
-  void Merge(Chunk* c1, Chunk* c2) EXCLUSIVE_LOCKS_REQUIRED(lock_);
-  void FreeAndMaybeCoalesce(Chunk* c) EXCLUSIVE_LOCKS_REQUIRED(lock_);
-  void InsertFreeChunkIntoBin(Chunk* c) EXCLUSIVE_LOCKS_REQUIRED(lock_);
-  void RemoveFreeChunkFromBin(Chunk* c);
-  void DeleteChunk(Chunk* c) EXCLUSIVE_LOCKS_REQUIRED(lock_);
-
-  void DumpMemoryLog(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
-
   // A Bin is a collection of similar-sized free chunks.
   struct Bin {
     // All chunks in this bin have >= bin_size memory.
@@ -151,13 +140,26 @@ class GPUBFCAllocator : public VisitableAllocator {
       }
     };
 
+    typedef std::set<Chunk*, ChunkComparator> FreeChunkSet;
     // List of free chunks within the bin, sorted by chunk size.
     // Chunk * not owned.
-    std::set<Chunk*, ChunkComparator> free_chunks;
+    FreeChunkSet free_chunks;
 
     explicit Bin(size_t bs) : bin_size(bs) {}
   };
 
+  Chunk* AllocateNewChunk(size_t num_bytes);
+  void SplitChunk(Chunk* c, size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  void Merge(Chunk* c1, Chunk* c2) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  void FreeAndMaybeCoalesce(Chunk* c) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  void InsertFreeChunkIntoBin(Chunk* c) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
+                                  const Bin::FreeChunkSet::iterator& c);
+  void RemoveFreeChunkFromBin(Chunk* c);
+  void DeleteChunk(Chunk* c) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  void DumpMemoryLog(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
   GPUAllocatorRetry retry_helper_;
 
   // Structures immutable after construction