diff --git a/.gitignore b/.gitignore
index 4f21fefda9f64a0392881971a715b97c234030e3..351b8204100dfd71e94cb3efa2e946b44b9e4285 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,4 @@ CMakeFiles
 cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
+paddle/pybind/pybind.h
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index c57537be4bf67a8db6a49669ab8d2ed1b1324bdc..f8a64a786611ef872dbbfced10919e00c4d46715 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -22,14 +22,14 @@ namespace framework {
 template <>
 Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
     platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return *device_context_->get_eigen_device<Eigen::DefaultDevice>();
+  return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
 }
 
 #ifndef PADDLE_ONLY_CPU
 template <>
 Eigen::GpuDevice& ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return *device_context_->get_eigen_device<Eigen::GpuDevice>();
+  return *device_context_.get_eigen_device<Eigen::GpuDevice>();
 }
 #endif
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index adae7bfc3d7d31b1ed0373f01db4ef80343a08f7..b7c9c39402d57daf0aec97d98535ac8a8d9c0150 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -366,7 +366,7 @@ struct EigenDeviceConverter {
 class ExecutionContext : public InferShapeContext {
  public:
   ExecutionContext(const OperatorBase& op, const Scope& scope,
-                   const platform::DeviceContext* device_context)
+                   const platform::DeviceContext& device_context)
       : InferShapeContext(op, scope), device_context_(device_context) {}
 
   template <typename PlaceType,
             typename DeviceType =
                 typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
   DeviceType& GetEigenDevice() const;
 
-  platform::Place GetPlace() const { return device_context_->GetPlace(); }
+  platform::Place GetPlace() const { return device_context_.GetPlace(); }
 
-  const platform::DeviceContext* device_context() const {
+  const platform::DeviceContext& device_context() const {
     return device_context_;
   }
 
@@ -401,7 +401,8 @@ class ExecutionContext : public InferShapeContext {
     return res;
   }
 
-  const platform::DeviceContext* device_context_;
+ private:
+  const platform::DeviceContext& device_context_;
 };
 
 template <>
@@ -461,7 +462,7 @@ class OperatorWithKernel : public OperatorBase {
   void Run(const Scope& scope,
            const platform::DeviceContext& dev_ctx) const final {
     auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
-    opKernel->Compute(ExecutionContext(*this, scope, &dev_ctx));
+    opKernel->Compute(ExecutionContext(*this, scope, dev_ctx));
   }
 
   static std::unordered_map<std::string, OpKernelMap>&
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
index 1e86fc3d166077265e0f433a6712b0665ea5a152..def4b01da098fc960ce7c0e497732fbcc2579945 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -19,12 +19,13 @@ namespace operators {
 namespace math {
 
 template <>
-void gemm<platform::CPUPlace, float>(const CBLAS_TRANSPOSE transA,
+void gemm<platform::CPUPlace, float>(const platform::DeviceContext& context,
+                                     const CBLAS_TRANSPOSE transA,
                                      const CBLAS_TRANSPOSE transB, const int M,
                                      const int N, const int K,
                                      const float alpha, const float* A,
-                                     const float* B, const float beta, float* C,
-                                     platform::DeviceContext* context) {
+                                     const float* B, const float beta,
+                                     float* C) {
   int lda = (transA == CblasNoTrans) ? K : M;
   int ldb = (transB == CblasNoTrans) ? N : K;
   int ldc = N;
@@ -33,13 +34,13 @@ void gemm<platform::CPUPlace, float>(const CBLAS_TRANSPOSE transA,
 }
 
 template <>
-void gemm<platform::CPUPlace, double>(const CBLAS_TRANSPOSE transA,
+void gemm<platform::CPUPlace, double>(const platform::DeviceContext& context,
+                                      const CBLAS_TRANSPOSE transA,
                                       const CBLAS_TRANSPOSE transB, const int M,
                                       const int N, const int K,
                                       const double alpha, const double* A,
                                       const double* B, const double beta,
-                                      double* C,
-                                      platform::DeviceContext* context) {
+                                      double* C) {
   int lda = (transA == CblasNoTrans) ? K : M;
   int ldb = (transB == CblasNoTrans) ? N : K;
   int ldc = N;
@@ -48,13 +49,10 @@ void gemm<platform::CPUPlace, double>(const CBLAS_TRANSPOSE transA,
 }
 
 template <>
-void matmul<platform::CPUPlace, float>(const framework::Tensor& matrix_a,
-                                       bool trans_a,
-                                       const framework::Tensor& matrix_b,
-                                       bool trans_b, float alpha,
-                                       framework::Tensor* matrix_out,
-                                       float beta,
-                                       platform::DeviceContext* context) {
+void matmul<platform::CPUPlace, float>(
+    const platform::DeviceContext& context, const framework::Tensor& matrix_a,
+    bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha,
+    framework::Tensor* matrix_out, float beta) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
   auto dim_out = matrix_out->dims();
@@ -74,18 +72,15 @@ void matmul<platform::CPUPlace, float>(const framework::Tensor& matrix_a,
   CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
 
   gemm<platform::CPUPlace, float>(
-      transA, transB, M, N, K, alpha, matrix_a.data<float>(),
-      matrix_b.data<float>(), beta, matrix_out->data<float>(), context);
+      context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
+      matrix_b.data<float>(), beta, matrix_out->data<float>());
 }
 
 template <>
-void matmul<platform::CPUPlace, double>(const framework::Tensor& matrix_a,
-                                        bool trans_a,
-                                        const framework::Tensor& matrix_b,
-                                        bool trans_b, double alpha,
-                                        framework::Tensor* matrix_out,
-                                        double beta,
-                                        platform::DeviceContext* context) {
+void matmul<platform::CPUPlace, double>(
+    const platform::DeviceContext& context, const framework::Tensor& matrix_a,
+    bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha,
+    framework::Tensor* matrix_out, double beta) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
   auto dim_out = matrix_out->dims();
@@ -105,8 +100,8 @@ void matmul<platform::CPUPlace, double>(const framework::Tensor& matrix_a,
   CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
 
   gemm<platform::CPUPlace, double>(
-      transA, transB, M, N, K, alpha, matrix_a.data<double>(),
-      matrix_b.data<double>(), beta, matrix_out->data<double>(), context);
+      context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
+      matrix_b.data<double>(), beta, matrix_out->data<double>());
 }
 
 }  // namespace math
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index da40b27c948918e4997f4a046d2145552296158b..71563b77b4b262c3f1e17ae7c4381da56ba780a3 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -19,12 +19,13 @@ namespace operators {
 namespace math {
 
 template <>
-void gemm<platform::GPUPlace, float>(const CBLAS_TRANSPOSE transA,
+void gemm<platform::GPUPlace, float>(const platform::DeviceContext& context,
+                                     const CBLAS_TRANSPOSE transA,
                                      const CBLAS_TRANSPOSE transB, const int M,
                                      const int N, const int K,
                                      const float alpha, const float* A,
-                                     const float* B, const float beta, float* C,
-                                     platform::DeviceContext* context) {
+                                     const float* B, const float beta,
+                                     float* C) {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   int lda = (transA == CblasNoTrans) ? K : M;
@@ -35,18 +36,19 @@ void gemm<platform::GPUPlace, float>(const CBLAS_TRANSPOSE transA,
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
 
   PADDLE_ENFORCE(platform::dynload::cublasSgemm(
-      reinterpret_cast<platform::CUDADeviceContext*>(context)->cublas_handle(),
+      reinterpret_cast<const platform::CUDADeviceContext&>(context)
+          .cublas_handle(),
       cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
 }
 
 template <>
-void gemm<platform::GPUPlace, double>(const CBLAS_TRANSPOSE transA,
+void gemm<platform::GPUPlace, double>(const platform::DeviceContext& context,
+                                      const CBLAS_TRANSPOSE transA,
                                       const CBLAS_TRANSPOSE transB, const int M,
                                       const int N, const int K,
                                       const double alpha, const double* A,
                                       const double* B, const double beta,
-                                      double* C,
-                                      platform::DeviceContext* context) {
+                                      double* C) {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   int lda = (transA == CblasNoTrans) ? K : M;
@@ -56,18 +58,16 @@ void gemm<platform::GPUPlace, double>(const CBLAS_TRANSPOSE transA,
   cublasOperation_t cuTransB =
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   PADDLE_ENFORCE(platform::dynload::cublasDgemm(
-      reinterpret_cast<platform::CUDADeviceContext*>(context)->cublas_handle(),
+      reinterpret_cast<const platform::CUDADeviceContext&>(context)
+          .cublas_handle(),
       cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
 }
 
 template <>
-void matmul<platform::GPUPlace, float>(const framework::Tensor& matrix_a,
-                                       bool trans_a,
-                                       const framework::Tensor& matrix_b,
-                                       bool trans_b, float alpha,
-                                       framework::Tensor* matrix_out,
-                                       float beta,
-                                       platform::DeviceContext* context) {
+void matmul<platform::GPUPlace, float>(
+    const platform::DeviceContext& context, const framework::Tensor& matrix_a,
+    bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha,
+    framework::Tensor* matrix_out, float beta) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
   auto dim_out = matrix_out->dims();
@@ -87,18 +87,15 @@ void matmul<platform::GPUPlace, float>(const framework::Tensor& matrix_a,
   CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
 
   gemm<platform::GPUPlace, float>(
-      transA, transB, M, N, K, alpha, matrix_a.data<float>(),
-      matrix_b.data<float>(), beta, matrix_out->data<float>(), context);
+      context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
+      matrix_b.data<float>(), beta, matrix_out->data<float>());
 }
 
 template <>
-void matmul<platform::GPUPlace, double>(const framework::Tensor& matrix_a,
-                                        bool trans_a,
-                                        const framework::Tensor& matrix_b,
-                                        bool trans_b, double alpha,
-                                        framework::Tensor* matrix_out,
-                                        double beta,
-                                        platform::DeviceContext* context) {
+void matmul<platform::GPUPlace, double>(
+    const platform::DeviceContext& context, const framework::Tensor& matrix_a,
+    bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha,
+    framework::Tensor* matrix_out, double beta) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
   auto dim_out = matrix_out->dims();
@@ -118,8 +115,8 @@ void matmul<platform::GPUPlace, double>(const framework::Tensor& matrix_a,
   CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
 
   gemm<platform::GPUPlace, double>(
-      transA, transB, M, N, K, alpha, matrix_a.data<double>(),
-      matrix_b.data<double>(), beta, matrix_out->data<double>(), context);
+      context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
+      matrix_b.data<double>(), beta, matrix_out->data<double>());
 }
 
 }  // namespace math
diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h
index 155589fadb3ed9f59160a750d546dd8093a56cbe..d8518e77fa7b4abdbcf08b7983013c24806e14ca 100644
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -66,16 +66,16 @@ namespace math {
 // For more detailed info, please refer to
 // http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
 template <typename Place, typename T>
-void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
-          const int M, const int N, const int K, const T alpha, const T* A,
-          const T* B, const T beta, T* C, platform::DeviceContext* context);
+void gemm(const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
+          const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+          const T alpha, const T* A, const T* B, const T beta, T* C);
 
 // matrix multiply with continuous memory
 template <typename Place, typename T>
-void matmul(const framework::Tensor& matrix_a, bool trans_a,
+void matmul(const platform::DeviceContext& context,
+            const framework::Tensor& matrix_a, bool trans_a,
             const framework::Tensor& matrix_b, bool trans_b, T alpha,
-            framework::Tensor* matrix_out, T beta,
-            platform::DeviceContext* context);
+            framework::Tensor* matrix_out, T beta);
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc
index 6c020c4ff7285b43bc5836d80c173d3a068e72b3..7e339457f7f08ff16162f399064a4b4dca594d7f 100644
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
@@ -15,8 +15,7 @@ TEST(math_function, notrans_mul_trans) {
   memcpy(input1_ptr, arr, 6 * sizeof(float));
 
   auto* gpu_place = new paddle::platform::GPUPlace(0);
-  paddle::platform::DeviceContext* context =
-      new paddle::platform::CUDADeviceContext(*gpu_place);
+  paddle::platform::CUDADeviceContext context(*gpu_place);
 
   input1_gpu.CopyFrom<float>(input1, *gpu_place);
   input2_gpu.CopyFrom<float>(input1, *gpu_place);
@@ -24,7 +23,7 @@ TEST(math_function, notrans_mul_trans) {
   out_gpu.mutable_data<float>({2, 2}, *gpu_place);
 
   paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
-      input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0, context);
+      context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
 
   out.CopyFrom<float>(out_gpu, *cpu_place);
 
@@ -33,6 +32,7 @@ TEST(math_function, notrans_mul_trans) {
   EXPECT_EQ(out_ptr[1], 14);
   EXPECT_EQ(out_ptr[2], 14);
   EXPECT_EQ(out_ptr[3], 50);
+  delete gpu_place;
 }
 
 TEST(math_function, trans_mul_notrans) {
@@ -48,8 +48,7 @@ TEST(math_function, trans_mul_notrans) {
   memcpy(input1_ptr, arr, 6 * sizeof(float));
 
   auto* gpu_place = new paddle::platform::GPUPlace(0);
-  paddle::platform::DeviceContext* context =
-      new paddle::platform::CUDADeviceContext(*gpu_place);
+  paddle::platform::CUDADeviceContext context(*gpu_place);
 
   input1_gpu.CopyFrom<float>(input1, *gpu_place);
   input2_gpu.CopyFrom<float>(input1, *gpu_place);
@@ -57,7 +56,7 @@ TEST(math_function, trans_mul_notrans) {
   out_gpu.mutable_data<float>({3, 3}, *gpu_place);
 
   paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
-      input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0, context);
+      context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
 
   out.CopyFrom<float>(out_gpu, *cpu_place);
 
@@ -71,5 +70,6 @@ TEST(math_function, trans_mul_notrans) {
   EXPECT_EQ(out_ptr[6], 15);
   EXPECT_EQ(out_ptr[7], 22);
   EXPECT_EQ(out_ptr[8], 29);
+  delete gpu_place;
 }
 #endif
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
index 3c01f868bda8cba488b3403df456d63d6b082fa6..ac7136a76933d1f3ead86518c65d589747227631 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -46,10 +46,8 @@ class MulKernel : public framework::OpKernel {
             : *y;
 
     z->mutable_data<T>(context.GetPlace());
-    auto* device_context =
-        const_cast<platform::DeviceContext*>(context.device_context_);
-    math::matmul<Place, T>(x_matrix, false, y_matrix, false, 1, z, 0,
-                           device_context);
+    math::matmul<Place, T>(context.device_context(), x_matrix, false, y_matrix,
+                           false, 1, z, 0);
   }
 };
 
@@ -71,16 +69,14 @@ class MulGradKernel : public framework::OpKernel {
     Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     Tensor* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
 
-    auto* device_context =
-        const_cast<platform::DeviceContext*>(ctx.device_context_);
    if (dx) {
       dx->mutable_data<T>(ctx.GetPlace());
       Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix(
                                                      *dx, x_num_col_dims)
                                                : *dx;
       // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
-      math::matmul<Place, T>(*dout, false, y_matrix, true, 1, &dx_matrix, 0,
-                             device_context);
+      math::matmul<Place, T>(ctx.device_context(), *dout, false, y_matrix, true,
+                             1, &dx_matrix, 0);
     }
     if (dy) {
       dy->mutable_data<T>(ctx.GetPlace());
@@ -88,8 +84,8 @@ class MulGradKernel : public framework::OpKernel {
                                                      *dy, y_num_col_dims)
                                                : *dy;
       // dy = x' * dout. dy K x N, dout : M x N, x : M x K
-      math::matmul<Place, T>(x_matrix, true, *dout, false, 1, &dy_matrix, 0,
-                             device_context);
+      math::matmul<Place, T>(ctx.device_context(), x_matrix, true, *dout, false,
+                             1, &dy_matrix, 0);
     }
   }
 };
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index ad212c5b2c47312743362db4926c80bf056e100d..93b472b41c8a4c3a2bfada9d4fbf0e9e1b0cc736 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -101,19 +101,17 @@ CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
   eigen_stream_.reset(new EigenCudaStreamDevice());
   eigen_stream_->Reinitialize(&stream_, place);
   eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
+  PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
+  PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
+  PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
+  PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
 }
 
 CUDADeviceContext::~CUDADeviceContext() {
   SetDeviceId(place_.device);
   Wait();
-  if (cublas_handle_) {
-    PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
-  }
-
-  if (cudnn_handle_) {
-    PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
-  }
-
+  PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
+  PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
   eigen_stream_.reset();
   eigen_device_.reset();
   PADDLE_ENFORCE(cudaStreamDestroy(stream_));
@@ -129,25 +127,13 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
   return eigen_device_.get();
 }
 
-cublasHandle_t CUDADeviceContext::cublas_handle() {
-  if (!cublas_handle_) {
-    SetDeviceId(place_.device);
-    PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
-    PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
-  }
+cublasHandle_t CUDADeviceContext::cublas_handle() const {
   return cublas_handle_;
 }
 
-cudnnHandle_t CUDADeviceContext::cudnn_handle() {
-  if (!cudnn_handle_) {
-    SetDeviceId(place_.device);
-    PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
-    PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
-  }
-  return cudnn_handle_;
-}
+cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
 
-cudaStream_t CUDADeviceContext::stream() { return stream_; }
+cudaStream_t CUDADeviceContext::stream() const { return stream_; }
 
 #endif  // PADDLE_ONLY_CPU
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 11528e1194e4516891034fa8febdac3ba6eed204..a106592e454e21c46cd2f87f1bbf6694955d6e23 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -67,16 +67,14 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Return eigen device in the device context. */
   Eigen::GpuDevice* eigen_device() const;
 
-  // clang-format off
   /*! \brief  Return cublas handle in the device context. */
-  cublasHandle_t cublas_handle();
+  cublasHandle_t cublas_handle() const;
 
   /*! \brief  Return cudnn handle in the device context. */
-  cudnnHandle_t cudnn_handle();
+  cudnnHandle_t cudnn_handle() const;
 
   /*! \brief  Return cuda stream in the device context. */
-  cudaStream_t stream();
-  // clang-format on
+  cudaStream_t stream() const;
 
  private:
   GPUPlace place_;
@@ -84,11 +82,9 @@ class CUDADeviceContext : public DeviceContext {
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
   std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
 
-  // clang-format off
-  cudaStream_t stream_{nullptr};
-  cudnnHandle_t cudnn_handle_{nullptr};
-  cublasHandle_t cublas_handle_{nullptr};
-  // clang-format on
+  cudaStream_t stream_;
+  cudnnHandle_t cudnn_handle_;
+  cublasHandle_t cublas_handle_;
 };
 
 #endif
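Usage note (not part of the diff): a minimal sketch of the reworked call convention, mirroring the updated math_function_test.cc above. The DeviceContext is now the first argument and is passed by const reference; the wrapper function and tensor arguments below are assumed for illustration only.

#include "paddle/operators/math/math_function.h"
#include "paddle/platform/device_context.h"

// Hypothetical helper: out = a * b' (e.g. a: M x K, b: N x K, out: M x N),
// following the updated TEST(math_function, notrans_mul_trans).
void MatMulExample(const paddle::framework::Tensor& a,
                   const paddle::framework::Tensor& b,
                   paddle::framework::Tensor* out) {
  paddle::platform::GPUPlace gpu_place(0);
  paddle::platform::CUDADeviceContext context(gpu_place);
  // New signature: context first, then operands; alpha = 1, beta = 0.
  paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
      context, a, false, b, true, 1, out, 0);
}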