Commit 03dba169 — authored by Benoit Steiner, committed by TensorFlower Gardener

Improved the performance of full reductions on GPU.

NEW
BM_fullReduction/10        4591       4595     153149  20.8M items/s
BM_fullReduction/64        5073       5075     100000  770.0M items/s
BM_fullReduction/512       9067       9070      75263  26.9G items/s
BM_fullReduction/4k      243984     244125       2868  64.0G items/s
BM_fullReduction/5k      359125     359273       1951  64.8G items/s

OLD
BM_fullReduction/10        9085       9087      74395  10.5M items/s
BM_fullReduction/64        9478       9478      72014  412.1M items/s
BM_fullReduction/512      14643      14646      46902  16.7G items/s
BM_fullReduction/4k      260338     260384       2678  60.0G items/s
BM_fullReduction/5k      385076     385178       1818  60.5G items/s
Change: 124290852
Parent commit: c32ef5a6
package(default_visibility = ["//visibility:public"])
archive_dir = "eigen-eigen-0c0b79ecd74c"
archive_dir = "eigen-eigen-62a2305d5734"
cc_library(
name = "eigen",
......
......@@ -7,7 +7,7 @@
include (ExternalProject)
set(eigen_archive_hash "0c0b79ecd74c")
set(eigen_archive_hash "62a2305d5734")
set(eigen_INCLUDE_DIRS
${CMAKE_CURRENT_BINARY_DIR}
......@@ -16,7 +16,7 @@ set(eigen_INCLUDE_DIRS
${tensorflow_source_dir}/third_party/eigen3
)
set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz)
set(eigen_HASH SHA256=b4b5884b03bd4bae114d02b36e2435ad1504ed8e51431d16c876b6f6a365882b)
set(eigen_HASH SHA256=d5da5c60f7225bc2f104f3494323b929e68e3a188ccf01dcee61df32ff536888)
set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen)
set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install)
......
......@@ -123,19 +123,20 @@ class EigenAllocator : public ::Eigen::Allocator {
#else
class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
public:
EigenCudaStreamDevice() : scratch_(nullptr) { Eigen::initializeDeviceProp(); }
~EigenCudaStreamDevice() {
if (scratch_) {
deallocate(scratch_);
EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
Eigen::initializeDeviceProp();
}
~EigenCudaStreamDevice() {
}
void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
int gpu_id, ::tensorflow::Allocator* alloc) {
int gpu_id, ::tensorflow::Allocator* alloc, char* scratch) {
if (LogMemory::IsEnabled()) {
operation_ = context->op_kernel().name() + "/EigenAllocator";
step_id_ = context->step_id();
}
assert(!scratch_);
scratch_ = scratch;
semaphore_ =
reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize);
stream_ = cuda_stream;
allocator_ = alloc;
device_prop_ = &Eigen::m_deviceProperties[gpu_id];
......@@ -172,12 +173,15 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
// Return a pointer to a per stream scratchpad of 1024 bytes residing
// in global memory.
void* scratchpad() const {
if (scratch_ == nullptr) {
scratch_ = allocate(1024);
}
return scratch_;
}
// Return a semaphore. The semaphore is initially initialized to 0, and
// each kernel using it is responsible for resetting to 0 upon completion
// to maintain the invariant that the semaphore is always equal to 0 upon
// each kernel start.
unsigned int* semaphore() const { return semaphore_; }
private:
struct AsyncFreeData {
AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o,
......@@ -205,7 +209,8 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
const cudaStream_t* stream_; // Not owned.
const cudaDeviceProp* device_prop_; // Not owned.
::tensorflow::Allocator* allocator_; // Not owned.
mutable void* scratch_;
mutable char* scratch_;
mutable unsigned int* semaphore_;
TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
};
......@@ -262,6 +267,16 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
streams_.push_back({stream, host_to_device_stream, device_to_host_stream,
device_to_device_stream});
perftools::gputools::DeviceMemory<char> mem =
executor_->AllocateArray<char>(Eigen::kCudaScratchSize +
sizeof(unsigned int));
scratch_.push_back(static_cast<char*>(mem.opaque()));
bool ok = executor_->SynchronousMemZero(
&mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
if (!ok) {
LOG(FATAL) << "Failed to initialize device " << gpu_id;
}
device_contexts_.push_back(
new GPUDeviceContext(i, stream, host_to_device_stream,
device_to_host_stream, device_to_device_stream));
......@@ -486,9 +501,10 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
public:
ConcretePerOpGpuDevice() : device_(nullptr) {}
void Reinitialize(OpKernelContext* context, gpu::Stream* stream,
Allocator* base_allocator, ::tensorflow::EventMgr* em) {
Allocator* base_allocator, ::tensorflow::EventMgr* em,
char* scratch) {
allocator_.Reinitialize(context, stream, base_allocator, em);
device_.Reinitialize(stream, &allocator_);
device_.Reinitialize(stream, &allocator_, scratch);
}
const Eigen::GpuDevice& device() const override { return device_; }
......@@ -503,8 +519,9 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
ConcretePerOpGpuDevice() : device_(&stream_device_) {}
void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
int gpu_id, Allocator* base_allocator) {
stream_device_.Reinitialize(context, cuda_stream, gpu_id, base_allocator);
int gpu_id, Allocator* base_allocator, char* scratch) {
stream_device_.Reinitialize(context, cuda_stream, gpu_id, base_allocator,
scratch);
}
const Eigen::GpuDevice& device() const override { return device_; }
......@@ -524,11 +541,12 @@ void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
DCHECK(concrete_device);
#if defined(__GCUDACC__) || defined(__GCUDACC_HOST__)
concrete_device->Reinitialize(context, streams_[stream_id].compute, allocator,
em_.get());
em_.get(), scratch_[stream_id]);
#else
const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>(
streams_[stream_id].compute->implementation()->CudaStreamMemberHack());
concrete_device->Reinitialize(context, cuda_stream, gpu_id_, allocator);
concrete_device->Reinitialize(context, cuda_stream, gpu_id_, allocator,
scratch_[stream_id]);
#endif
}
......
......@@ -91,6 +91,7 @@ class BaseGPUDevice : public LocalDevice {
gpu::Stream* device_to_device;
};
gtl::InlinedVector<StreamGroup, 4> streams_;
gtl::InlinedVector<char*, 4> scratch_;
std::vector<GPUDeviceContext*> device_contexts_;
GpuDeviceInfo* gpu_device_info_ = nullptr;
mutex trace_mu_;
......
......@@ -6,8 +6,8 @@
def tf_workspace(path_prefix = "", tf_repo_name = ""):
native.new_http_archive(
name = "eigen_archive",
url = "https://bitbucket.org/eigen/eigen/get/0c0b79ecd74c.tar.gz",
sha256 = "b4b5884b03bd4bae114d02b36e2435ad1504ed8e51431d16c876b6f6a365882b",
url = "https://bitbucket.org/eigen/eigen/get/62a2305d5734.tar.gz",
sha256 = "d5da5c60f7225bc2f104f3494323b929e68e3a188ccf01dcee61df32ff536888",
build_file = path_prefix + "eigen.BUILD",
)
......
#include "eigen-eigen-0c0b79ecd74c/Eigen/Cholesky"
#include "eigen-eigen-62a2305d5734/Eigen/Cholesky"
#include "eigen-eigen-0c0b79ecd74c/Eigen/Core"
#include "eigen-eigen-62a2305d5734/Eigen/Core"
#include "eigen-eigen-0c0b79ecd74c/Eigen/Eigenvalues"
#include "eigen-eigen-62a2305d5734/Eigen/Eigenvalues"
#include "eigen-eigen-0c0b79ecd74c/Eigen/LU"
#include "eigen-eigen-62a2305d5734/Eigen/LU"
#include "eigen-eigen-0c0b79ecd74c/Eigen/QR"
#include "eigen-eigen-62a2305d5734/Eigen/QR"
#include "eigen-eigen-0c0b79ecd74c/unsupported/Eigen/CXX11/Tensor"
#include "eigen-eigen-62a2305d5734/unsupported/Eigen/CXX11/Tensor"
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册