8000 Merge commit for internal changes · CJavaScala/tensorflow@47e38b0 · GitHub
[go: up one dir, main page]

Skip to content

Commit 47e38b0

Browse files
committed
Merge commit for internal changes
2 parents a2c5e74 + d45ed5c commit 47e38b0

File tree

39 files changed

+261
-217
lines changed

39 files changed

+261
-217
lines changed

configure

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,12 +64,12 @@ while true; do
6464
# Retry
6565
done
6666

67-
# Find out where the CUDNN library is installed
67+
# Find out where the cuDNN library is installed
6868
while true; do
6969
fromuser=""
7070
if [ -z "$CUDNN_INSTALL_PATH" ]; then
7171
default_cudnn_path=${CUDA_TOOLKIT_PATH}
72-
read -p "Please specify the location where CUDNN 6.5 V2 library is installed. Refer to README.md for more details. [Default is $default_cudnn_path]: " CUDNN_INSTALL_PATH
72+
read -p "Please specify the location where cuDNN v2 library is installed. Refer to README.md for more details. [Default is $default_cudnn_path]: " CUDNN_INSTALL_PATH
7373
fromuser="1"
7474
if [ -z "$CUDNN_INSTALL_PATH" ]; then
7575
CUDNN_INSTALL_PATH=$default_cudnn_path
@@ -81,7 +81,7 @@ while true; do
8181
if [ -e "$CUDNN_INSTALL_PATH/libcudnn.so.6.5" -o -e "$CUDNN_INSTALL_PATH/lib64/libcudnn.so.6.5" ]; then
8282
break
8383
fi
84-
echo "Invalid path to CUDNN 6.5 V2 toolkit. Neither of the following two files can be found:"
84+
echo "Invalid path to cuDNN v2 toolkit. Neither of the following two files can be found:"
8585
echo "$CUDNN_INSTALL_PATH/lib64/libcudnn.so.6.5"
8686
echo "$CUDNN_INSTALL_PATH/libcudnn.so.6.5"
8787
if [ -z "$fromuser" ]; then
@@ -96,7 +96,7 @@ cat > third_party/gpus/cuda/cuda.config <<EOF
9696
# at the moment.
9797
CUDA_TOOLKIT_PATH="$CUDA_TOOLKIT_PATH"
9898
99-
# CUDNN_INSTALL_PATH refers to the CUDNN toolkit. The cudnn header and library
99+
# CUDNN_INSTALL_PATH refers to the cuDNN toolkit. The cuDNN header and library
100100
# files can be either in this directory, or under include/ and lib64/
101101
# directories separately.
102102
CUDNN_INSTALL_PATH="$CUDNN_INSTALL_PATH"

tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
119119
GPUBFCAllocator a(0, 1 << 30);
120120

121121
float* first_ptr = a.Allocate<float>(1024);
122-
a.Deallocate(first_ptr);
122+
a.DeallocateRaw(first_ptr);
123123
for (int i = 0; i < 1024; ++i) {
124124
// Allocate several buffers of different sizes, and then clean them
125125
// all up. We should be able to repeat this endlessly without
@@ -130,18 +130,18 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
130130
double* t3 = a.Allocate<double>(2048);
131131
float* t4 = a.Allocate<float>(10485760);
132132

133-
a.Deallocate(t1);
134-
a.Deallocate(t2);
135-
a.Deallocate(t3);
136-
a.Deallocate(t4);
133+
a.DeallocateRaw(t1);
134+
a.DeallocateRaw(t2);
135+
a.DeallocateRaw(t3);
136+
a.DeallocateRaw(t4);
137137
}
138138

139139
// At the end, we should have coalesced all memory into one region
140140
// starting at the beginning, so validate that allocating a pointer
141141
// starts from this region.
142142
float* first_ptr_after = a.Allocate<float>(1024);
143143
EXPECT_EQ(first_ptr, first_ptr_after);
144-
a.Deallocate(first_ptr_after);
144+
a.DeallocateRaw(first_ptr_after);
145145
}
146146

147147
TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) {
@@ -160,7 +160,7 @@ TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
160160
float* t1 = a.Allocate<float>(1);
161161
EXPECT_EQ(4, a.RequestedSize(t1));
162162
EXPECT_EQ(256, a.AllocatedSize(t1));
163-
a.Deallocate(t1);
163+
a.DeallocateRaw(t1);
164164
}
165165

166166
TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
@@ -172,7 +172,7 @@ TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
172172

173173
EXPECT_NE(nullptr, first_ptr);
174174
EXPECT_EQ(nullptr, second_ptr);
175-
a.Deallocate(first_ptr);
175+
a.DeallocateRaw(first_ptr);
176176
}
177177

178178
} // namespace

tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) {
8181
stream_exec->SynchronousMemcpy(&gpu_hdr_ptr, &pi, sizeof(float)));
8282

8383
// Expect error on free.
84-
a.Deallocate(gpu_array);
84+
a.DeallocateRaw(gpu_array);
8585
},
8686
"");
8787
}
@@ -114,7 +114,7 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
114114
stream_exec->SynchronousMemcpy(&gpu_ftr_ptr, &pi, sizeof(float)));
115115

116116
// Expect error on free.
117-
a.Deallocate(gpu_array);
117+
a.DeallocateRaw(gpu_array);
118118
},
119119
"");
120120
}
@@ -149,7 +149,7 @@ TEST(GPUDebugAllocatorTest, ResetToNan) {
149149
ASSERT_EQ(1.0, cpu_array_result[0]);
150150

151151
// Free the array
152-
a.Deallocate(gpu_array);
152+
a.DeallocateRaw(gpu_array);
153153

154154
// All values should be reset to nan.
155155
ASSERT_TRUE(
@@ -192,7 +192,7 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
192192
ASSERT_EQ(1.0, cpu_array_result[0]);
193193

194194
// Free the array
195-
a.Deallocate(gpu_array);
195+
a.DeallocateRaw(gpu_array);
196196

197197
// All values should be reset to nan.
198198
ASSERT_TRUE(
@@ -214,7 +214,7 @@ TEST(GPUDebugAllocatorTest, AllocatedVsRequested) {
214214
float* t1 = a.Allocate<float>(1);
215215
EXPECT_EQ(4, a.RequestedSize(t1));
216216
EXPECT_EQ(256, a.AllocatedSize(t1));
217-
a.Deallocate(t1);
217+
a.DeallocateRaw(t1);
218218
}
219219

220220
} // namespace tensorflow

tensorflow/core/common_runtime/gpu/gpu_region_allocator_test.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@ TEST(GPURegionAllocatorTest, Simple) {
4848
}
4949
float* t1 = a.Allocate<float>(1024);
5050
double* t2 = a.Allocate<double>(1048576);
51-
a.Deallocate(t1);
52-
a.Deallocate(t2);
51+
a.Deallocate(t1, 1024);
52+
a.Deallocate(t2, 1048576);
5353
}
5454

5555
TEST(GPURegionAllocatorTest, CheckMemLeak) {
@@ -77,7 +77,7 @@ TEST(GPURegionAllocatorTest, AllocatedVsRequested) {
7777
// Minimum allocation size if 256
7878
EXPECT_EQ(256, a.AllocatedSize(t1));
7979

80-
a.Deallocate(t1);
80+
a.Deallocate(t1, 1);
8181
}
8282

8383
} // namespace

tensorflow/core/common_runtime/gpu/gpu_util.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev,
100100
}
101101
tensor_ref.Unref();
102102
port::CopyFromArray(proto->mutable_tensor_content(), mb, num_bytes);
103-
alloc->Deallocate<char>(mb);
103+
alloc->Deallocate<char>(mb, num_bytes);
104104
done(Status::OK());
105105
});
106106
} else {

tensorflow/core/framework/allocator.h

Lines changed: 67 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ limitations under the License.
2121

2222
#include <limits>
2323

24+
#include "tensorflow/core/framework/numeric_types.h"
25+
#include "tensorflow/core/framework/type_traits.h"
2426
#include "tensorflow/core/platform/logging.h"
2527
#include "tensorflow/core/platform/port.h"
2628

@@ -66,9 +68,11 @@ class Allocator {
6668
// REQUIRES: "ptr" was previously returned by a call to AllocateRaw
6769
virtual void DeallocateRaw(void* ptr) = 0;
6870

69-
// Convenience functions to do typed allocation. Note that these functions
70-
// do not invoke C++ constructors or destructors. May return NULL if the
71-
// tensor has too many elements to represent in a single allocation.
71+
// Convenience functions to do typed allocation. C++ constructors
72+
// and destructors are invoked for complex types if necessary,
73+
// depending on the concrete Allocator implementation. May return
74+
// NULL if the tensor has too many elements to represent in a single
75+
// allocation.
7276
template <typename T>
7377
T* Allocate(size_t num_elements) {
7478
return Allocate<T>(num_elements, AllocationAttributes());
@@ -86,12 +90,17 @@ class Allocator {
8690

8791
void* p = AllocateRaw(32 /* align to 32 byte boundary */,
8892
sizeof(T) * num_elements, allocation_attr);
89-
return reinterpret_cast<T*>(p);
93+
T* typed_p = reinterpret_cast<T*>(p);
94+
if (typed_p) RunCtor<T>(typed_p, num_elements);
95+
return typed_p;
9096
}
9197

9298
template <typename T>
93-
void Deallocate(T* ptr) {
94-
DeallocateRaw(ptr);
99+
void Deallocate(T* ptr, size_t num_elements) {
100+
if (ptr) {
101+
RunDtor<T>(ptr, num_elements);
102+
DeallocateRaw(ptr);
103+
}
95104
}
96105

97106
// Returns true if this allocator tracks the sizes of allocations.
@@ -122,11 +131,63 @@ class Allocator {
122131
// allocated by this allocator.
123132
virtual size_t AllocatedSize(void* ptr) { return RequestedSize(ptr); }
124133

134+
// is_simple<T>::value if T[] can be safely constructed and destructed
135+
// without running T() and ~T(). We do not use std::is_trivial<T>
136+
// directly because std::complex<float> is not trival but its array
137+
// can be constructed and destructed without running its default ctor
138+
// and dtor.
139+
template <typename T>
140+
struct is_simple {
141+
static const bool value = std::is_trivial<T>::value ||
142+
std::is_same<T, complex64>::value ||
143+
is_quantized<T>::value;
144+
};
145+
146+
private:
147+
// No constructors or destructors are run for simple types
148+
template <typename T>
149+
void RunCtor(T* p, size_t n) {
150+
static_assert(is_simple<T>::value, "T is not a simple type.");
151+
}
152+
153+
template <typename T>
154+
void RunDtor(T* p, size_t n) {}
155+
156+
// custom constructors and destructors that can be overridden for
157+
// non-standard allocators
158+
159+
// Runs string's default constructor for p[0], p[1], ..., p[n-1].
160+
virtual void RunStringCtor(string* p, size_t n) {
161+
for (size_t i = 0; i < n; ++p, ++i) new (p) string();
162+
}
163+
164+
// Runs string's default destructor for p[0], p[1], ..., p[n-1].
165+
virtual void RunStringDtor(string* p, size_t n) {
166+
for (size_t i = 0; i < n; ++p, ++i) p->~string();
167+
}
168+
125169
// TODO(jeff): Maybe provide some interface to give info about
126170
// current allocation state (total number of bytes available for
127171
// allocation, number of bytes free on device, etc.)
128172
};
129173

174+
template <>
175+
struct Allocator::is_simple<bfloat16> {
176+
static const bool value = true;
177+
};
178+
179+
// Allocator-specific constructors and destructors are used for
180+
// strings
181+
template <>
182+
inline void Allocator::RunCtor(string* p, size_t n) {
183+
RunStringCtor(p, n);
184+
}
185+
186+
template <>
187+
inline void Allocator::RunDtor(string* p, size_t n) {
188+
RunStringDtor(p, n);
189+
}
190+
130191
// A tensorflow Op may need access to different kinds of memory that
131192
// are not simply a function of the device to which the Op has been
132193
// assigned. For example, an Op executing on a GPU may still need

tensorflow/core/framework/allocator_test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ TEST(CPUAllocatorTest, Simple) {
3535
}
3636
float* t1 = a->Allocate<float>(1024);
3737
double* t2 = a->Allocate<double>(1048576);
38-
a->Deallocate(t1);
39-
a->Deallocate(t2);
38+
a->Deallocate(t1, 1024);
39+
a->Deallocate(t2, 1048576);
4040
}
4141

4242
// Define a struct that we will use to observe behavior in the unit tests

tensorflow/core/framework/tensor.cc

Lines changed: 5 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -76,36 +76,13 @@ class Buffer : public TensorBuffer {
7676
TF_DISALLOW_COPY_AND_ASSIGN(Buffer);
7777
};
7878

79-
// is_simple<T>::value if T[] can be safely constructed and destructed
80-
// without running T() and ~T(). We do not use std::is_trivial<T>
81-
// directly because std::complex<float> is not trival but its array
82-
// can be constructed and destructed without running its default ctor
83-
// and dtor.
84-
template <typename T>
85-
struct is_simple {
86-
static const bool value = std::is_trivial<T>::value ||
87-
std::is_same<T, complex64>::value ||
88-
is_quantized<T>::value;
89-
};
90-
91-
template <>
92-
struct is_simple<bfloat16> {
93-
static const bool value = true;
94-
};
95-
9679
// A set of helper functions depending on T.
9780
template <typename T>
9881
struct Helper {
9982
// By default, we assume T is a simple type (float, int32, etc.)
100-
static_assert(is_simple<T>::value, "T is not a simple type.");
83+
static_assert(Allocator::is_simple<T>::value, "T is not a simple type.");
10184
typedef protobuf::RepeatedField<T> RepeatedFieldType;
10285

103-
// No constructor to run.
104-
static void RunCtor(T* p, int n) {}
105-
106-
// No destructor to run.
107-
static void RunDtor(T* p, int n) {}
108-
10986
// Encoder of simple type T to a string. We do a copy.
11087
template <typename Destination>
11188
static void Encode(TensorBuffer* in, int64 n, Destination* out) {
@@ -142,16 +119,6 @@ struct Helper<string> {
142119
// Proto message uses RepeatedFieldType to hold repeated T.
143120
typedef protobuf::RepeatedPtrField<string> RepeatedFieldType;
144121

145-
// Runs string's default constructor for p[0], p[1], ..., p[n-1].
146-
static void RunCtor(string* p, int n) {
147-
for (int i = 0; i < n; ++p, ++i) new (p) string();
148-
}
149-
150-
// Runs T's default destructor for p[0], p[1], ..., p[n-1].
151-
static void RunDtor(string* p, int n) {
152-
for (int i = 0; i < n; ++p, ++i) p->~string();
153-
}
154-
155122
// Encodes "n" elements of type string stored in "in" into Cord
156123
// "out", which is usually the TensorProto::tensor_content.
157124
template <typename Destination>
@@ -273,23 +240,16 @@ struct ProtoHelper<bfloat16> {
273240

274241
template <typename T>
275242
Buffer<T>::Buffer(Allocator* a, int64 n)
276-
: alloc_(a), data_(a->Allocate<T>(n)), elem_(n) {
277-
if (data_) Helper<T>::RunCtor(data_, elem_);
278-
}
243+
: alloc_(a), data_(a->Allocate<T>(n)), elem_(n) {}
279244

280245
template <typename T>
281246
Buffer<T>::Buffer(Allocator* a, int64 n,
282247
const AllocationAttributes& allocation_attr)
283-
: alloc_(a), data_(a->Allocate<T>(n, allocation_attr)), elem_(n) {
284-
if (data_) Helper<T>::RunCtor(data_, elem_);
285-
}
248+
: alloc_(a), data_(a->Allocate<T>(n, allocation_attr)), elem_(n) {}
286249

287250
template <typename T>
288251
Buffer<T>::~Buffer() {
289-
if (data_) {
290-
Helper<T>::RunDtor(data_, elem_);
291-
alloc_->Deallocate<T>(data_);
292-
}
252+
alloc_->Deallocate<T>(data_, elem_);
293253
}
294254

295255
// Allocates a T[n] buffer. Fills in the buffer with repeated values
@@ -542,7 +502,7 @@ size_t Tensor::TotalBytes() const {
542502
}
543503

544504
bool Tensor::CanUseDMA() const {
545-
CASES(dtype(), return is_simple<T>::value);
505+
CASES(dtype(), return Allocator::is_simple<T>::value);
546506
return false; // Makes compiler happy.
547507
}
548508

0 commit comments

Comments (0)