Commit 03dba169 — authored by Benoit Steiner, committed by TensorFlower Gardener

Improved the performance of full reductions on GPU.

NEW
BM_fullReduction/10        4591       4595     153149  20.8M items/s
BM_fullReduction/64        5073       5075     100000  770.0M items/s
BM_fullReduction/512       9067       9070      75263  26.9G items/s
BM_fullReduction/4k      243984     244125       2868  64.0G items/s
BM_fullReduction/5k      359125     359273       1951  64.8G items/s

OLD
BM_fullReduction/10        9085       9087      74395  10.5M items/s
BM_fullReduction/64        9478       9478      72014  412.1M items/s
BM_fullReduction/512      14643      14646      46902  16.7G items/s
BM_fullReduction/4k      260338     260384       2678  60.0G items/s
BM_fullReduction/5k      385076     385178       1818  60.5G items/s
Change: 124290852
Parent commit: c32ef5a6
package(default_visibility = ["//visibility:public"])
archive_dir = "eigen-eigen-0c0b79ecd74c"
archive_dir = "eigen-eigen-62a2305d5734"
cc_library(
name = "eigen",
......
......@@ -7,7 +7,7 @@
include (ExternalProject)
set(eigen_archive_hash "0c0b79ecd74c")
set(eigen_archive_hash "62a2305d5734")
set(eigen_INCLUDE_DIRS
${CMAKE_CURRENT_BINARY_DIR}
......@@ -16,7 +16,7 @@ set(eigen_INCLUDE_DIRS
${tensorflow_source_dir}/third_party/eigen3
)
set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz)
set(eigen_HASH SHA256=b4b5884b03bd4bae114d02b36e2435ad1504ed8e51431d16c876b6f6a365882b)
set(eigen_HASH SHA256=d5da5c60f7225bc2f104f3494323b929e68e3a188ccf01dcee61df32ff536888)
set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen)
set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install)
......
......@@ -123,19 +123,20 @@ class EigenAllocator : public ::Eigen::Allocator {
#else
class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
public:
EigenCudaStreamDevice() : scratch_(nullptr) { Eigen::initializeDeviceProp(); }
~EigenCudaStreamDevice() {
if (scratch_) {
deallocate(scratch_);
EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
Eigen::initializeDeviceProp();
}
~EigenCudaStreamDevice() {
}
void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
int gpu_id, ::tensorflow::Allocator* alloc) {
int gpu_id, ::tensorflow::Allocator* alloc, char* scratch) {
if (LogMemory::IsEnabled()) {
operation_ = context->op_kernel().name() + "/EigenAllocator";
step_id_ = context->step_id();
}
assert(!scratch_);
scratch_ = scratch;
semaphore_ =
reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize);
stream_ = cuda_stream;
allocator_ = alloc;
device_prop_ = &Eigen::m_deviceProperties[gpu_id];
......@@ -172,12 +173,15 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
// Return a pointer to a per stream scratchpad of 1024 bytes residing
// in global memory.
void* scratchpad() const {
if (scratch_ == nullptr) {
scratch_ = allocate(1024);
}
return scratch_;
}
// Return a semaphore. The semaphore is initially initialized to 0, and
// each kernel using it is responsible for resetting to 0 upon completion
// to maintain the invariant that the semaphore is always equal to 0 upon
// each kernel start.
unsigned int* semaphore() const { return semaphore_; }
private:
struct AsyncFreeData {
AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o,
......@@ -205,7 +209,8 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
const cudaStream_t* stream_; // Not owned.
const cudaDeviceProp* device_prop_; // Not owned.
::tensorflow::Allocator* allocator_; // Not owned.
mutable void* scratch_;
mutable char* scratch_;
mutable unsigned int* semaphore_;
TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
};
......@@ -262,6 +267,16 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
streams_.push_back({stream, host_to_device_stream, device_to_host_stream,
device_to_device_stream});
perftools::gputools::DeviceMemory<char> mem =
executor_->AllocateArray<char>(Eigen::kCudaScratchSize +
sizeof(unsigned int));
scratch_.push_back(static_cast<char*>(mem.opaque()));
bool ok = executor_->SynchronousMemZero(
&mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
if (!ok) {
LOG(FATAL) << "Failed to initialize device " << gpu_id;
}
device_contexts_.push_back(
new GPUDeviceContext(i, stream, host_to_device_stream,
device_to_host_stream, device_to_device_stream));
......@@ -486,9 +501,10 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
public:
ConcretePerOpGpuDevice() : device_(nullptr) {}
void Reinitialize(OpKernelContext* context, gpu::Stream* stream,
Allocator* base_allocator, ::tensorflow::EventMgr* em) {
Allocator* base_allocator, ::tensorflow::EventMgr* em,
char* scratch) {
allocator_.Reinitialize(context, stream, base_allocator, em);
device_.Reinitialize(stream, &allocator_);
device_.Reinitialize(stream, &allocator_, scratch);
}
const Eigen::GpuDevice& device() const override { return device_; }
......@@ -503,8 +519,9 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
ConcretePerOpGpuDevice() : device_(&stream_device_) {}
void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
int gpu_id, Allocator* base_allocator) {
stream_device_.Reinitialize(context, cuda_stream, gpu_id, base_allocator);
int gpu_id, Allocator* base_allocator, char* scratch) {
stream_device_.Reinitialize(context, cuda_stream, gpu_id, base_allocator,
scratch);
}
const Eigen::GpuDevice& device() const override { return device_; }
......@@ -524,11 +541,12 @@ void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
DCHECK(concrete_device);
#if defined(__GCUDACC__) || defined(__GCUDACC_HOST__)
concrete_device->Reinitialize(context, streams_[stream_id].compute, allocator,
em_.get());
em_.get(), scratch_[stream_id]);
#else
const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>(
streams_[stream_id].compute->implementation()->CudaStreamMemberHack());
concrete_device->Reinitialize(context, cuda_stream, gpu_id_, allocator);
concrete_device->Reinitialize(context, cuda_stream, gpu_id_, allocator,
scratch_[stream_id]);
#endif
}
......
......@@ -91,6 +91,7 @@ class BaseGPUDevice : public LocalDevice {
gpu::Stream* device_to_device;
};
gtl::InlinedVector<StreamGroup, 4> streams_;
gtl::InlinedVector<char*, 4> scratch_;
std::vector<GPUDeviceContext*> device_contexts_;
GpuDeviceInfo* gpu_device_info_ = nullptr;
mutex trace_mu_;
......
......@@ -6,8 +6,8 @@
def tf_workspace(path_prefix = "", tf_repo_name = ""):
native.new_http_archive(
name = "eigen_archive",
url = "https://bitbucket.org/eigen/eigen/get/0c0b79ecd74c.tar.gz",
sha256 = "b4b5884b03bd4bae114d02b36e2435ad1504ed8e51431d16c876b6f6a365882b",
url = "https://bitbucket.org/eigen/eigen/get/62a2305d5734.tar.gz",
sha256 = "d5da5c60f7225bc2f104f3494323b929e68e3a188ccf01dcee61df32ff536888",
build_file = path_prefix + "eigen.BUILD",
)
......
#include "eigen-eigen-0c0b79ecd74c/Eigen/Cholesky"
#include "eigen-eigen-62a2305d5734/Eigen/Cholesky"
#include "eigen-eigen-0c0b79ecd74c/Eigen/Core"
#include "eigen-eigen-62a2305d5734/Eigen/Core"
#include "eigen-eigen-0c0b79ecd74c/Eigen/Eigenvalues"
#include "eigen-eigen-62a2305d5734/Eigen/Eigenvalues"
#include "eigen-eigen-0c0b79ecd74c/Eigen/LU"
#include "eigen-eigen-62a2305d5734/Eigen/LU"
#include "eigen-eigen-0c0b79ecd74c/Eigen/QR"
#include "eigen-eigen-62a2305d5734/Eigen/QR"
#include "eigen-eigen-0c0b79ecd74c/unsupported/Eigen/CXX11/Tensor"
#include "eigen-eigen-62a2305d5734/unsupported/Eigen/CXX11/Tensor"
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册