diff --git a/CMakeLists.txt b/CMakeLists.txt
index 06dd5a1332cb9dda8f942a342693ec74fb6184d9..ad559672ad2f83a3d62cdf332b47c6cf1e730f70 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,7 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
 option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
 option(GLIDE_INSTALL "Download and install go dependencies " ON)
 option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
+option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 209f9078a637ac581d90212a48216eb388c477ed..51c3b918cc4ef4cf6c8052ccc14028a872309fcf 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -28,6 +28,10 @@ if(NOT WITH_TIMER)
     add_definitions(-DPADDLE_DISABLE_TIMER)
 endif(NOT WITH_TIMER)
 
+if(USE_EIGEN_FOR_BLAS)
+    add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
+endif(USE_EIGEN_FOR_BLAS)
+
 if(NOT WITH_PROFILER)
     add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 7dfb6f61c50959f7269725a00dbc4f9c27474bdf..c572a9d433bc16e6733b8fc9367970bef28e699a 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -4,6 +4,10 @@ file(GLOB cpp_files . *Op.cpp)
 list(APPEND h_files Function.h)
 list(APPEND cpp_files Function.cpp)
 list(APPEND cpp_files BufferArg.cpp)
+list(APPEND cpp_files GemmFunctor.cpp)
+if(USE_EIGEN_FOR_BLAS)
+  list(APPEND cpp_files EigenGemm.cpp)
+endif(USE_EIGEN_FOR_BLAS)
 
 if(WITH_GPU)
   file(GLOB cu_files . *OpGpu.cu)
diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp
index 490e8d546cbd460217abe95f6291b13fa207faa9..2f3112fe657cd381891dc53c7179e7520911e8c9 100644
--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "DepthwiseConvOp.h"
 #include "ConvOp.h"
-#include "GemmFunctor.h"
 
 namespace paddle {
 
diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu
index 33463805cbd4746c05548028e0bc4a0e2a90453e..2d722dfcfca0f328edeecf185ea37b8512b91907 100644
--- a/paddle/function/DepthwiseConvOpGpu.cu
+++ b/paddle/function/DepthwiseConvOpGpu.cu
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "DepthwiseConvOp.h"
-#include "GemmFunctor.h"
 #include "paddle/math/BaseMatrix.h"
 
 namespace paddle {
diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..674141ed39b7f5573948348e3ba3bb526ae43c66
--- /dev/null
+++ b/paddle/function/EigenGemm.cpp
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+
+template <class T>
+struct EigenBlasGemm {
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
+                           Eigen::Aligned>
+      Matrix;
+
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+    Eigen::array<int, 2> sizeA;
+    if (transA) {
+      sizeA[0] = K;
+      sizeA[1] = M;
+      CHECK_EQ(M, lda);
+    } else {
+      sizeA[0] = M;
+      sizeA[1] = K;
+      CHECK_EQ(K, lda);
+    }
+    Eigen::array<int, 2> sizeB;
+    if (transB) {
+      sizeB[0] = N;
+      sizeB[1] = K;
+      CHECK_EQ(K, ldb);
+    } else {
+      sizeB[0] = K;
+      sizeB[1] = N;
+      CHECK_EQ(N, ldb);
+    }
+    Eigen::array<int, 2> sizeC;
+    sizeC[0] = M;
+    sizeC[1] = N;
+    CHECK_EQ(N, ldc);
+
+    const Matrix a(const_cast<T*>(A), sizeA);
+    const Matrix b(const_cast<T*>(B), sizeB);
+    Matrix c(C, sizeC);
+
+    typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
+    Eigen::array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    dims[0].first = transA ? 0 : 1;
+    dims[0].second = transB ? 1 : 0;
+
+    Eigen::DefaultDevice device;
+    if (alpha == T(1) && beta == T(0)) {
+      c.device(device) = a.contract(b, dims);
+    } else if (alpha == T(1) && beta == T(1)) {
+      c.device(device) += a.contract(b, dims);
+    } else {
+      c.device(device) = alpha * a.contract(b, dims) + beta * c;
+    }
+  }
+};
+
+#ifdef PADDLE_TYPE_DOUBLE
+template class EigenBlasGemm<double>;
+#else
+template class EigenBlasGemm<float>;
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index 0ada4d70a0c7d13f9b5fb1a42eac07fc4c775a87..f8cf4ebea8d724f0291b981647622b63e3d84495 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -85,7 +85,6 @@ public:
     }
 
     Im2ColFunctor<kCFO, Device, real> im2col;
-    GemmFunctor<Device, real> gemm;
     size_t inputOffset = imShape.getElements();
     size_t outputOffset =
         (outputChannels / groups_) * outputHeight * outputWidth;
@@ -108,19 +107,19 @@ public:
         int M = outputChannels / groups_;
         int N = outputHeight * outputWidth;
         int K = inputChannels / groups_ * filterHeight * filterWidth;
-        gemm(CblasNoTrans,
-             CblasNoTrans,
-             M,
-             N,
-             K,
-             1.0f,
-             filterData + g * filterOffset,
-             K,
-             colData,
-             N,
-             beta,
-             outputData + g * outputOffset,
-             N);
+        BlasGemm<Device, real>::compute(false,
+                                        false,
+                                        M,
+                                        N,
+                                        K,
+                                        1.0f,
+                                        filterData + g * filterOffset,
+                                        K,
+                                        colData,
+                                        N,
+                                        beta,
+                                        outputData + g * outputOffset,
+                                        N);
       }
       inputData += inputChannels * inputHeight * inputWidth;
       outputData += outputChannels * outputHeight * outputWidth;
@@ -188,8 +187,6 @@ public:
     }
 
     Col2ImFunctor<kCFO, Device, real> col2im;
-    GemmFunctor<Device, real> gemm;
-
     size_t inputOffset = imShape.getElements();
     size_t outputOffset =
         (outputChannels / groups_) * outputHeight * outputWidth;
@@ -205,19 +202,19 @@ public:
           colData = inputGrad + g * inputOffset;
           scale = 1.0f;
         }
-        gemm(CblasTrans,
-             CblasNoTrans,
-             M,
-             N,
-             K,
-             1.0f,
-             filterData + g * filterOffset,
-             M,
-             outputGrad + g * outputOffset,
-             N,
-             scale,
-             colData,
-             N);
+        BlasGemm<Device, real>::compute(true,
+                                        false,
+                                        M,
+                                        N,
+                                        K,
+                                        1.0f,
+                                        filterData + g * filterOffset,
+                                        M,
+                                        outputGrad + g * outputOffset,
+                                        N,
+                                        scale,
+                                        colData,
+                                        N);
         if (needIm2col) {
           col2im(inputGrad + g * inputOffset,
                  imShape,
@@ -299,7 +296,6 @@ public:
     }
 
     Im2ColFunctor<kCFO, Device, real> im2col;
-    GemmFunctor<Device, real> gemm;
     size_t inputOffset = imShape.getElements();
     size_t outputOffset =
         (outputChannels / groups_) * outputHeight * outputWidth;
@@ -321,19 +317,19 @@ public:
         int M = outputChannels / groups_;
         int K = outputHeight * outputWidth;
         int N = inputChannels / groups_ * filterHeight * filterWidth;
-        gemm(CblasNoTrans,
-             CblasTrans,
-             M,
-             N,
-             K,
-             1.0f,
-             outputGrad + g * outputOffset,
-             K,
-             colData,
-             K,
-             i == 0 ? beta : 1.0f,
-             filterGrad + g * filterOffset,
-             N);
+        BlasGemm<Device, real>::compute(false,
+                                        true,
+                                        M,
+                                        N,
+                                        K,
+                                        1.0f,
+                                        outputGrad + g * outputOffset,
+                                        K,
+                                        colData,
+                                        K,
+                                        i == 0 ? beta : 1.0f,
+                                        filterGrad + g * filterOffset,
+                                        N);
       }
       inputData += inputChannels * inputHeight * inputWidth;
       outputGrad += outputChannels * outputHeight * outputWidth;
diff --git a/paddle/function/GemmFunctor.cpp b/paddle/function/GemmFunctor.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dc83278d8e3b70101962521b76b902046629c2dd
--- /dev/null
+++ b/paddle/function/GemmFunctor.cpp
@@ -0,0 +1,90 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "GemmFunctor.h"
+#include "paddle/math/MathFunctions.h"
+
+namespace paddle {
+
+template <class T>
+struct BlasGemm<DEVICE_TYPE_CPU, T> {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+#ifdef PADDLE_USE_EIGEN_FOR_BLAS
+    EigenBlasGemm<T>::compute(
+        transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
+#else
+    gemm<T>(transA == false ? CblasNoTrans : CblasTrans,
+            transB == false ? CblasNoTrans : CblasTrans,
+            M,
+            N,
+            K,
+            alpha,
+            A,
+            lda,
+            B,
+            ldb,
+            beta,
+            C,
+            ldc);
+#endif
+  }
+};
+
+template <class T>
+struct BlasGemm<DEVICE_TYPE_GPU, T> {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+    hl_matrix_mul((T*)A,
+                  transA == false ? HPPL_OP_N : HPPL_OP_T,
+                  (T*)B,
+                  transB == false ? HPPL_OP_N : HPPL_OP_T,
+                  C,
+                  M,
+                  N,
+                  K,
+                  alpha,
+                  beta,
+                  lda,
+                  ldb,
+                  ldc);
+  }
+};
+
+template class BlasGemm<DEVICE_TYPE_CPU, real>;
+template class BlasGemm<DEVICE_TYPE_GPU, real>;
+
+}  // namespace paddle
diff --git a/paddle/function/GemmFunctor.h b/paddle/function/GemmFunctor.h
index d5db5cf5e7a855d89b262fe8cf42aa2c55f419f1..0809953b4eb17c25eadcce7f474a3dab0469bba1 100644
--- a/paddle/function/GemmFunctor.h
+++ b/paddle/function/GemmFunctor.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/math/MathFunctions.h"
+#include "TensorType.h"
 
 namespace paddle {
 
@@ -24,73 +24,42 @@ namespace paddle {
 // of MatMulFunction, we need to consider the reconstruction of hl_matrix_mul
 // interface.
 template <DeviceType Device, class T>
-class GemmFunctor {
-public:
-  void operator()(const CBLAS_TRANSPOSE transA,
-                  const CBLAS_TRANSPOSE TransB,
-                  const int M,
-                  const int N,
-                  const int K,
-                  const T alpha,
-                  const T* A,
-                  const int lda,
-                  const T* B,
-                  const int ldb,
-                  const T beta,
-                  T* C,
-                  const int ldc);
+struct BlasGemm {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc);
 };
 
+// TODO(hedaoyuan): Since the definition of the real type in Paddle
+// conflicts with the Eigen library, code that uses Eigen cannot include
+// the Paddle header files, so an EigenBlasGemm template class without
+// the DeviceType parameter is needed. I will fix this problem and merge
+// BlasGemm and EigenBlasGemm into one.
 template <class T>
-class GemmFunctor<DEVICE_TYPE_CPU, T> {
-public:
-  void operator()(const CBLAS_TRANSPOSE transA,
-                  const CBLAS_TRANSPOSE TransB,
-                  const int M,
-                  const int N,
-                  const int K,
-                  const T alpha,
-                  const T* A,
-                  const int lda,
-                  const T* B,
-                  const int ldb,
-                  const T beta,
-                  T* C,
-                  const int ldc) {
-    gemm<T>(transA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
-  }
-};
-
-template <class T>
-class GemmFunctor<DEVICE_TYPE_GPU, T> {
-public:
-  void operator()(const CBLAS_TRANSPOSE transA,
-                  const CBLAS_TRANSPOSE TransB,
-                  const int M,
-                  const int N,
-                  const int K,
-                  const T alpha,
-                  const T* A,
-                  const int lda,
-                  const T* B,
-                  const int ldb,
-                  const T beta,
-                  T* C,
-                  const int ldc) {
-    hl_matrix_mul((T*)A,
-                  transA == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T,
-                  (T*)B,
-                  TransB == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T,
-                  C,
-                  M,
-                  N,
-                  K,
-                  alpha,
-                  beta,
-                  lda,
-                  ldb,
-                  ldc);
-  }
+struct EigenBlasGemm {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc);
 };
 
 }  // namespace paddle
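
Note: the Eigen path is strictly opt-in. Configuring with USE_EIGEN_FOR_BLAS=ON
compiles EigenGemm.cpp and defines PADDLE_USE_EIGEN_FOR_BLAS, which switches
BlasGemm<DEVICE_TYPE_CPU, T>::compute from the cblas-style gemm<T> to
EigenBlasGemm<T>; the DEVICE_TYPE_GPU specialization calls hl_matrix_mul either
way.

The sketch below is a minimal, standalone sanity check of the tensor
contraction that EigenGemm.cpp relies on. It is not part of the patch: the
file name, sizes, and build line are illustrative, and it assumes only that
Eigen's unsupported Tensor module is on the include path.

    // check_eigen_gemm.cpp (hypothetical): verify that the Eigen tensor
    // contraction used by EigenBlasGemm matches a naive row-major GEMM,
    // C = alpha * A * B + beta * C, in the no-transpose case.
    // Build (illustrative): g++ -std=c++11 -I<eigen-root> check_eigen_gemm.cpp
    #include <cmath>
    #include <cstdio>
    #include "unsupported/Eigen/CXX11/Tensor"

    int main() {
      const int M = 2, N = 3, K = 4;
      const float alpha = 2.0f, beta = 0.5f;
      float A[M * K], B[K * N], C[M * N], ref[M * N];
      for (int i = 0; i < M * K; ++i) A[i] = 0.1f * (i + 1);
      for (int i = 0; i < K * N; ++i) B[i] = 0.2f * (i + 1);
      for (int i = 0; i < M * N; ++i) C[i] = ref[i] = 1.0f;

      // Naive reference GEMM: row-major, lda = K, ldb = N, ldc = N.
      for (int m = 0; m < M; ++m) {
        for (int n = 0; n < N; ++n) {
          float sum = 0.0f;
          for (int k = 0; k < K; ++k) sum += A[m * K + k] * B[k * N + n];
          ref[m * N + n] = alpha * sum + beta * ref[m * N + n];
        }
      }

      // The same computation phrased the way EigenGemm.cpp phrases it: map
      // the raw pointers as 2-D row-major tensors and contract dimension 1
      // of A (its columns) with dimension 0 of B (its rows).
      // (Unlike the patch, Eigen::Aligned is omitted here, since these
      // stack arrays are not guaranteed to be suitably aligned.)
      typedef Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, int> >
          Matrix;
      Eigen::array<int, 2> sizeA = {{M, K}};
      Eigen::array<int, 2> sizeB = {{K, N}};
      Eigen::array<int, 2> sizeC = {{M, N}};
      const Matrix a(A, sizeA);
      const Matrix b(B, sizeB);
      Matrix c(C, sizeC);
      typedef Eigen::Tensor<float, 2>::DimensionPair DimPair;
      Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
      c = alpha * a.contract(b, dims) + beta * c;

      for (int i = 0; i < M * N; ++i) {
        if (std::fabs(C[i] - ref[i]) > 1e-4f) {
          std::printf("mismatch at %d: %f vs %f\n", i, C[i], ref[i]);
          return 1;
        }
      }
      std::printf("Eigen contraction matches the naive GEMM\n");
      return 0;
    }

The contraction pair is also how the patch expresses transposes without
copying data: EigenGemm.cpp sets dims[0].first to transA ? 0 : 1 and
dims[0].second to transB ? 1 : 0, so a transposed operand is handled by
re-indexing the same buffer rather than materializing its transpose.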