diff --git a/CMakeLists.txt b/CMakeLists.txt index b174831109372cb014741d63032fa6a470e74042..c7d743e193e7d32dbc0b56f3bcb05b6c61f85f1d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,8 +36,8 @@ include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) -option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND}) -option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND}) +option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." OFF) +option(WITH_MKLML "Compile PaddlePaddle with mklml package." OFF) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index 3e2aeea1da490d524db7b04cce947c57af97d8ef..2cc3c24fb34596ca127585e3a5543378db1efda7 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/math/math_function.h" - namespace paddle { namespace operators { namespace math { @@ -26,6 +25,8 @@ void gemm( platform::DeviceContext* context) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = @@ -44,6 +45,8 @@ void gemm( const int ldc, platform::DeviceContext* context) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. + lda = (transA == CblasNoTrans) ? K : M; + ldb = (transB == CblasNoTrans) ? N : K; cublasOperation_t cuTransA = (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = @@ -118,7 +121,6 @@ void matmul(const framework::Tensor& in1, in1.data(), K, in2.data(), N, beta, out->data(), N, context); } - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h index f068f4a15eec7593c257e9a9e026413c95904398..1ecca604039f6cd8ad9c3f976398b258252d57bb 100644 --- a/paddle/operators/math/math_function.h +++ b/paddle/operators/math/math_function.h @@ -37,6 +37,20 @@ extern "C" { #include #endif +#ifndef LAPACK_FOUND +extern "C" { +#include +int LAPACKE_sgetrf( + int matrix_layout, int m, int n, float* a, int lda, int* ipiv); +int LAPACKE_dgetrf( + int matrix_layout, int m, int n, double* a, int lda, int* ipiv); +int LAPACKE_sgetri( + int matrix_layout, int n, float* a, int lda, const int* ipiv); +int LAPACKE_dgetri( + int matrix_layout, int n, double* a, int lda, const int* ipiv); +} +#endif + #include #include "paddle/framework/tensor.h" #include "paddle/platform/device_context.h" @@ -61,7 +75,7 @@ void gemm(const CBLAS_TRANSPOSE transA, const int ldc, platform::DeviceContext* context); -// matrix multiply with continous memory +// matrix multiply with continuous memory template void matmul(const framework::Tensor& in1, bool in1_T, diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu index 7435b74bd87a392a27c1ae7f4a8d2416526a40d0..346a7e505d123b5e4e831daa39a1f6349b3dcccf 100644 --- a/paddle/operators/mul_op.cu +++ b/paddle/operators/mul_op.cu @@ -15,4 +15,5 @@ #define EIGEN_USE_GPU #include "paddle/operators/mul_op.h" +namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index 2087e98901d69574e36adafe2a030abf2305c97b..98c54f1dfb8ef66e8ec51f9a16415a63b383214b 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -31,9 +31,6 @@ template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - Eigen::array, 1> dim_pair = { - {Eigen::IndexPair(1, 0)}}; - auto input0 = context.Input("X"); auto input1 = context.Input("Y"); auto output = context.Output(0);