diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu index 5f64856a1a5b816afaeaed0e0d63b74ed5d6aa85..60b266f08fb2d4217c5933902d69de96fc2abe22 100644 --- a/paddle/fluid/operators/math/concat.cu +++ b/paddle/fluid/operators/math/concat.cu @@ -70,7 +70,7 @@ __global__ void KernelConcat(T** inputs, const int input_col, const int output_rows, const int output_cols, T* output) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - float inv_input_col = 1.0 / input_col; + double inv_input_col = 1.0 / input_col; for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { int split = tid_x * inv_input_col; int in_offset = tid_x - split * input_col; @@ -113,7 +113,7 @@ __global__ void KernelConcatGrad(const T* input, const int input_row, const int input_col, const int output_cols, T** outputs) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - float inv_input_col = 1.0 / input_col; + double inv_input_col = 1.0 / input_col; for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { int split = tid_x * inv_input_col; int in_offset = tid_x - split * input_col; @@ -145,8 +145,8 @@ class ConcatFunctor { int cols = input[0].numel() / rows; int out_rows = rows, out_cols = 0; - paddle::framework::Vector inputs_data(num * sizeof(T*) / 2); - paddle::framework::Vector inputs_cols(num + 1); + framework::Vector inputs_data(num * sizeof(T*) / 2); + framework::Vector inputs_cols(num + 1); inputs_cols[0] = 0; T** inputs_ptr = reinterpret_cast(inputs_data.data()); @@ -168,15 +168,14 @@ class ConcatFunctor { // computation // set the thread block and grid according to CurrentDeviceId const int kThreadsPerBlock = 1024; - int block_cols = std::min(out_cols, kThreadsPerBlock); - int block_rows = std::max(kThreadsPerBlock / block_cols, 1); + int block_cols = kThreadsPerBlock; + if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((out_cols + 31) >> 5) << 5; + } + int block_rows = kThreadsPerBlock / block_cols; dim3 block_size = dim3(block_cols, block_rows, 1); - int dev_id = paddle::platform::GetCurrentDeviceId(); - int multi_process = paddle::platform::GetCUDAMultiProcessors(dev_id); - int max_threads_per_mp = - paddle::platform::GetCUDAMaxThreadsPerMultiProcessor(dev_id); - int max_threads = multi_process * max_threads_per_mp; + int max_threads = context.GetMaxPhysicalThreadCount(); int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); int grid_cols = @@ -218,8 +217,8 @@ class ConcatGradFunctor { int input_col = 0; bool sameShape = true; - paddle::framework::Vector outputs_data(num * sizeof(T*) / 2); - paddle::framework::Vector outputs_cols(num + 1); + framework::Vector outputs_data(num * sizeof(T*) / 2); + framework::Vector outputs_cols(num + 1); outputs_cols[0] = 0; T** outputs_ptr = reinterpret_cast(outputs_data.data()); @@ -239,12 +238,20 @@ class ConcatGradFunctor { // computation const int kThreadsPerBlock = 1024; - int block_cols = std::min(input_col, kThreadsPerBlock); - int block_rows = std::max(kThreadsPerBlock / block_cols, 1); + int block_cols = kThreadsPerBlock; + if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((input_col + 31) >> 5) << 5; + } + int block_rows = kThreadsPerBlock / block_cols; dim3 block_size = dim3(block_cols, block_rows, 1); - int grid_cols = (input_col + block_cols - 1) / block_cols; - int grid_rows = (input_row + block_rows - 1) / block_rows; + int max_threads = context.GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + int grid_cols = + std::min((input_col + block_cols - 1) / block_cols, max_blocks); + int grid_rows = + std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1)); dim3 grid_size = dim3(grid_cols, grid_rows, 1); if (sameShape) { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 7da6e04d0a8b81bcb5fb6b105ebdd5b908cf8f1d..583a3e740e10e3bd863c547d8f918c7a7b2563f0 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -121,6 +121,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { SetDeviceId(place_.device); + multi_process = GetCUDAMultiProcessors(place_.device); + max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_->Reinitialize(&stream_, place); @@ -154,6 +156,10 @@ void CUDADeviceContext::Wait() const { PADDLE_ENFORCE(cudaGetLastError()); } +int CUDADeviceContext::GetMaxPhysicalThreadCount() const { + return multi_process * max_threads_per_mp; +} + Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { return eigen_device_.get(); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index a294ba5101528c9ac0007bdcfc5255a0c2674aad..918243ccfe35907e4ff7b80969c514790e8354ff 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return place in the device context. */ Place GetPlace() const override; + /*! \brief Return the max physical thread count in the device context */ + int GetMaxPhysicalThreadCount() const; + /*! \brief Return eigen device in the device context. */ Eigen::GpuDevice* eigen_device() const; @@ -100,6 +103,9 @@ class CUDADeviceContext : public DeviceContext { cudaStream_t stream_; cudnnHandle_t cudnn_handle_; cublasHandle_t cublas_handle_; + + int multi_process; + int max_threads_per_mp; }; template <>