From 3fa8bdcde03bd57f0e7100859baf8a3193d03089 Mon Sep 17 00:00:00 2001 From: superjomn Date: Sat, 27 Apr 2019 17:14:45 +0800 Subject: [PATCH] the whole inference works in the new framework with CPU and GPU mixed MUL CUDA kernel is faked, for the cublas not works Some enhancement needs for TypeSystem, unittests --- paddle/fluid/lite/core/kernel.h | 1 - paddle/fluid/lite/core/mir/passes.h | 1 + paddle/fluid/lite/core/optimizer.cc | 1 + paddle/fluid/lite/core/tensor.h | 1 + paddle/fluid/lite/cuda/CMakeLists.txt | 2 +- paddle/fluid/lite/cuda/blas.h | 8 +++++++- paddle/fluid/lite/kernels/cuda/CMakeLists.txt | 2 +- paddle/fluid/lite/kernels/cuda/io_copy_compute.cc | 4 +++- paddle/fluid/lite/kernels/cuda/mul_compute.h | 13 ++++++++----- paddle/fluid/lite/operators/mul_op.cc | 1 + 10 files changed, 24 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/lite/core/kernel.h b/paddle/fluid/lite/core/kernel.h index d8999cef994..c6acf92bb49 100644 --- a/paddle/fluid/lite/core/kernel.h +++ b/paddle/fluid/lite/core/kernel.h @@ -116,7 +116,6 @@ class KernelBase { std::string GenParamTypeKey() const { std::stringstream ss; - LOG(INFO) << "alias : " << alias_; ss << op_type() << "/" << alias_; return ss.str(); } diff --git a/paddle/fluid/lite/core/mir/passes.h b/paddle/fluid/lite/core/mir/passes.h index 35eaeeef29a..d81cdd7d01e 100644 --- a/paddle/fluid/lite/core/mir/passes.h +++ b/paddle/fluid/lite/core/mir/passes.h @@ -28,3 +28,4 @@ USE_MIR_PASS(io_complement_pass); USE_MIR_PASS(generate_program_pass); USE_MIR_PASS(io_copy_kernel_pick_pass); USE_MIR_PASS(argument_type_display_pass); +USE_MIR_PASS(runtime_context_assign_pass); diff --git a/paddle/fluid/lite/core/optimizer.cc b/paddle/fluid/lite/core/optimizer.cc index c3be12d22f5..b9761d10557 100644 --- a/paddle/fluid/lite/core/optimizer.cc +++ b/paddle/fluid/lite/core/optimizer.cc @@ -38,6 +38,7 @@ void Optimizer::RunPasses() { "argument_type_display_pass", // "io_copy_kernel_pick_pass", // "variable_place_inference_pass", // + "runtime_context_assign_pass", // }); for (auto& pass_type : passes) { LOG(INFO) << ".. running pass " << pass_type; diff --git a/paddle/fluid/lite/core/tensor.h b/paddle/fluid/lite/core/tensor.h index f78b9080139..e19948cde12 100644 --- a/paddle/fluid/lite/core/tensor.h +++ b/paddle/fluid/lite/core/tensor.h @@ -71,6 +71,7 @@ class Tensor { template T* mutable_data(TargetType target) { target_ = target; + memory_size_ = product(dims_) * sizeof(T); buffer_->ResetLazy(target, memory_size()); return static_cast(buffer_->data()); } diff --git a/paddle/fluid/lite/cuda/CMakeLists.txt b/paddle/fluid/lite/cuda/CMakeLists.txt index 1ac05bc7cd8..1e3a9a5c8df 100644 --- a/paddle/fluid/lite/cuda/CMakeLists.txt +++ b/paddle/fluid/lite/cuda/CMakeLists.txt @@ -1,2 +1,2 @@ nv_library(target_wrapper_cuda SRCS target_wrapper.cc) -nv_library(cuda_blas SRCS blas.cc) +nv_library(cuda_blas_lite SRCS blas.cc) diff --git a/paddle/fluid/lite/cuda/blas.h b/paddle/fluid/lite/cuda/blas.h index 3a8d2fd9221..ee11512fee4 100644 --- a/paddle/fluid/lite/cuda/blas.h +++ b/paddle/fluid/lite/cuda/blas.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "paddle/fluid/lite/cuda/cuda_utils.h" #include "paddle/fluid/lite/utils/all.h" @@ -31,8 +32,10 @@ namespace cuda { * Some basic methods. */ struct BlasBase { + /* BlasBase() { CUBLAS_CHECK(cublasCreate(&handle_)); } ~BlasBase() { CUBLAS_CHECK(cublasDestroy(handle_)); } + */ void SetStream(cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle_, stream)); @@ -69,7 +72,10 @@ class Blas : public lite::cuda::BlasBase { const T* B, int ldb, // const T* beta, // T* C, int ldc) const { - LITE_UNIMPLEMENTED; + CHECK_EQ(CUBLAS_STATUS_SUCCESS, + cublasSgemm(handle_, // + CUBLAS_OP_N, CUBLAS_OP_N, // + m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)); } }; diff --git a/paddle/fluid/lite/kernels/cuda/CMakeLists.txt b/paddle/fluid/lite/kernels/cuda/CMakeLists.txt index 64ea90b0afe..f2b2006600b 100644 --- a/paddle/fluid/lite/kernels/cuda/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/cuda/CMakeLists.txt @@ -1,4 +1,4 @@ nv_library(mul_compute_cuda SRCS mul_compute.cc DEPS tensor_lite) cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS tensor_lite) -nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda) +nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda cuda_blas_lite) diff --git a/paddle/fluid/lite/kernels/cuda/io_copy_compute.cc b/paddle/fluid/lite/kernels/cuda/io_copy_compute.cc index 5705feae922..01b62374df9 100644 --- a/paddle/fluid/lite/kernels/cuda/io_copy_compute.cc +++ b/paddle/fluid/lite/kernels/cuda/io_copy_compute.cc @@ -48,7 +48,8 @@ class IoCopyHostToCudaCompute auto& param = Param(); CHECK(param.x->target() == TARGET(kHost) || param.x->target() == TARGET(kX86)); - auto* data = param.y->mutable_data(target(), param.x->memory_size()); + LOG(INFO) << "copy size " << param.x->memory_size(); + auto* data = param.y->mutable_data(TARGET(kCUDA), param.x->memory_size()); CopyFromHostSync(data, param.x->data(), param.x->memory_size()); } @@ -82,6 +83,7 @@ class IoCopyCudaToHostCompute auto& param = Param(); CHECK(param.x->target() == TARGET(kCUDA)); auto* data = param.y->mutable_data(TARGET(kHost), param.x->memory_size()); + LOG(INFO) << "copy size " << param.x->memory_size(); CopyToHostSync(data, param.x->data(), param.x->memory_size()); } diff --git a/paddle/fluid/lite/kernels/cuda/mul_compute.h b/paddle/fluid/lite/kernels/cuda/mul_compute.h index ad39e2eae8a..b7419e99ede 100644 --- a/paddle/fluid/lite/kernels/cuda/mul_compute.h +++ b/paddle/fluid/lite/kernels/cuda/mul_compute.h @@ -26,8 +26,8 @@ namespace cuda { template void mul_compute(const lite::cuda::Blas& blas, const T* x, int x_h, int x_w, const T* y, int y_h, int y_w, T* out) { - blas.sgemm(CUBLAS_OP_N, CUBLAS_OP_N, x_w, x_h, y_w, nullptr, x, 0, y, 0, - nullptr, out, 0); + blas.sgemm(CUBLAS_OP_N, CUBLAS_OP_N, x_h, y_w, x_w, nullptr, x, x_w, y, y_w, + nullptr, out, x_h); } class MulCompute : public OpKernel { @@ -38,8 +38,8 @@ class MulCompute : public OpKernel { CHECK(context_) << "running context should be set first"; auto& context = context_->AsCudaContext(); CHECK(context.blas_fp32) << "blas should init first"; + /* auto& blas = *context.blas_fp32; - const auto& param = Param(); CHECK(param.x->target() == TARGET(kCUDA)); auto* x = param.x->data(); int x_h = param.x->dims()[0]; @@ -48,10 +48,13 @@ class MulCompute : public OpKernel { auto* y = param.y->data(); int y_h = param.y->dims()[0]; int y_w = param.y->dims()[1]; + */ - auto* out = param.output->mutable_data(TARGET(kCUDA)); + const auto& param = Param(); + param.output->mutable_data(TARGET(kCUDA)); + LOG(INFO) << "mul output memory size " << param.output->memory_size(); - mul_compute(blas, x, x_h, x_w, y, y_h, y_w, out); + // mul_compute(blas, x, x_h, x_w, y, y_h, y_w, out); } virtual ~MulCompute() = default; diff --git a/paddle/fluid/lite/operators/mul_op.cc b/paddle/fluid/lite/operators/mul_op.cc index 5f1dd0de970..e0fe5837153 100644 --- a/paddle/fluid/lite/operators/mul_op.cc +++ b/paddle/fluid/lite/operators/mul_op.cc @@ -44,6 +44,7 @@ bool MulOpLite::InferShape() const { out_dims[i] = x_dims[i]; } out_dims.back() = y_dims[1]; + param_.output->Resize(out_dims); // share LoD -- GitLab