diff --git a/paddle/fluid/lite/core/kernel.h b/paddle/fluid/lite/core/kernel.h
index d8999cef9945b41f80ce3e83259aea6cedd9ddbc..c6acf92bb49aa2933ec9ebca633aa2fd6b629aad 100644
--- a/paddle/fluid/lite/core/kernel.h
+++ b/paddle/fluid/lite/core/kernel.h
@@ -116,7 +116,6 @@ class KernelBase {
 
   std::string GenParamTypeKey() const {
     std::stringstream ss;
-    LOG(INFO) << "alias : " << alias_;
     ss << op_type() << "/" << alias_;
     return ss.str();
   }
diff --git a/paddle/fluid/lite/core/mir/passes.h b/paddle/fluid/lite/core/mir/passes.h
index 35eaeeef29ad1c078e962a3cf1184c93c2174a4c..d81cdd7d01e9972a51a37d4f6a918451ee03e144 100644
--- a/paddle/fluid/lite/core/mir/passes.h
+++ b/paddle/fluid/lite/core/mir/passes.h
@@ -28,3 +28,4 @@ USE_MIR_PASS(io_complement_pass);
 USE_MIR_PASS(generate_program_pass);
 USE_MIR_PASS(io_copy_kernel_pick_pass);
 USE_MIR_PASS(argument_type_display_pass);
+USE_MIR_PASS(runtime_context_assign_pass);
diff --git a/paddle/fluid/lite/core/optimizer.cc b/paddle/fluid/lite/core/optimizer.cc
index c3be12d22f5bb7e080c4d4500cf65bc0666f5101..b9761d10557c5588e8a9864b0edd37835944613a 100644
--- a/paddle/fluid/lite/core/optimizer.cc
+++ b/paddle/fluid/lite/core/optimizer.cc
@@ -38,6 +38,7 @@ void Optimizer::RunPasses() {
       "argument_type_display_pass",     //
       "io_copy_kernel_pick_pass",       //
       "variable_place_inference_pass",  //
+      "runtime_context_assign_pass",    //
   });
   for (auto& pass_type : passes) {
     LOG(INFO) << ".. running pass " << pass_type;
diff --git a/paddle/fluid/lite/core/tensor.h b/paddle/fluid/lite/core/tensor.h
index f78b90801396848e3f8cadb0695de27530677dd4..e19948cde1263131ecf109d30dcecbde4a6814b1 100644
--- a/paddle/fluid/lite/core/tensor.h
+++ b/paddle/fluid/lite/core/tensor.h
@@ -71,6 +71,7 @@ class Tensor {
   template <typename T>
   T* mutable_data(TargetType target) {
     target_ = target;
+    memory_size_ = product(dims_) * sizeof(T);
     buffer_->ResetLazy(target, memory_size());
     return static_cast<T*>(buffer_->data());
   }
diff --git a/paddle/fluid/lite/cuda/CMakeLists.txt b/paddle/fluid/lite/cuda/CMakeLists.txt
index 1ac05bc7cd8a2b885b5cfe680131f7986744d834..1e3a9a5c8df2ff05d88fcc721001866093fe3fd1 100644
--- a/paddle/fluid/lite/cuda/CMakeLists.txt
+++ b/paddle/fluid/lite/cuda/CMakeLists.txt
@@ -1,2 +1,2 @@
 nv_library(target_wrapper_cuda SRCS target_wrapper.cc)
-nv_library(cuda_blas SRCS blas.cc)
+nv_library(cuda_blas_lite SRCS blas.cc)
diff --git a/paddle/fluid/lite/cuda/blas.h b/paddle/fluid/lite/cuda/blas.h
index 3a8d2fd92216b4fcecc2280dbec7c8641c50e4f9..ee11512fee4548af3bd56f0be13dbacf0f5fa98a 100644
--- a/paddle/fluid/lite/cuda/blas.h
+++ b/paddle/fluid/lite/cuda/blas.h
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include
 #include
 #include "paddle/fluid/lite/cuda/cuda_utils.h"
 #include "paddle/fluid/lite/utils/all.h"
@@ -31,8 +32,10 @@ namespace cuda {
  * Some basic methods.
  */
 struct BlasBase {
+  /*
   BlasBase() { CUBLAS_CHECK(cublasCreate(&handle_)); }
   ~BlasBase() { CUBLAS_CHECK(cublasDestroy(handle_)); }
+  */
 
   void SetStream(cudaStream_t stream) {
     CUBLAS_CHECK(cublasSetStream(handle_, stream));
@@ -69,7 +72,10 @@ class Blas : public lite::cuda::BlasBase {
              const T* B, int ldb,  //
              const T* beta,        //
              T* C, int ldc) const {
-    LITE_UNIMPLEMENTED;
+    CHECK_EQ(CUBLAS_STATUS_SUCCESS,
+             cublasSgemm(handle_,                   //
+                         CUBLAS_OP_N, CUBLAS_OP_N,  //
+                         m, n, k, alpha, A, lda, B, ldb, beta, C, ldc));
   }
 };
 
diff --git a/paddle/fluid/lite/kernels/cuda/CMakeLists.txt b/paddle/fluid/lite/kernels/cuda/CMakeLists.txt
index 64ea90b0afe46866c8bc506c7f18f75dab3efa37..f2b2006600bd9f57623dfb8cdda759382c17f9bb 100644
--- a/paddle/fluid/lite/kernels/cuda/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/cuda/CMakeLists.txt
@@ -1,4 +1,4 @@
 nv_library(mul_compute_cuda SRCS mul_compute.cc DEPS tensor_lite)
 cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS tensor_lite)
 
-nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda)
+nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda cuda_blas_lite)
diff --git a/paddle/fluid/lite/kernels/cuda/io_copy_compute.cc b/paddle/fluid/lite/kernels/cuda/io_copy_compute.cc
index 5705feae92226f959c8c21af0a3581497f9c83df..01b62374df925be1d7317d347c998c03fbffe6f7 100644
--- a/paddle/fluid/lite/kernels/cuda/io_copy_compute.cc
+++ b/paddle/fluid/lite/kernels/cuda/io_copy_compute.cc
@@ -48,7 +48,8 @@ class IoCopyHostToCudaCompute
     auto& param = Param<operators::IoCopyParam>();
     CHECK(param.x->target() == TARGET(kHost) ||
           param.x->target() == TARGET(kX86));
-    auto* data = param.y->mutable_data(target(), param.x->memory_size());
+    LOG(INFO) << "copy size " << param.x->memory_size();
+    auto* data = param.y->mutable_data(TARGET(kCUDA), param.x->memory_size());
     CopyFromHostSync(data, param.x->data(), param.x->memory_size());
   }
 
@@ -82,6 +83,7 @@ class IoCopyCudaToHostCompute
     auto& param = Param<operators::IoCopyParam>();
     CHECK(param.x->target() == TARGET(kCUDA));
     auto* data = param.y->mutable_data(TARGET(kHost), param.x->memory_size());
+    LOG(INFO) << "copy size " << param.x->memory_size();
     CopyToHostSync(data, param.x->data(), param.x->memory_size());
   }
 
diff --git a/paddle/fluid/lite/kernels/cuda/mul_compute.h b/paddle/fluid/lite/kernels/cuda/mul_compute.h
index ad39e2eae8a96807d93b62f46b75858fd952baa1..b7419e99ede808293d966ced6819c67688ad8cce 100644
--- a/paddle/fluid/lite/kernels/cuda/mul_compute.h
+++ b/paddle/fluid/lite/kernels/cuda/mul_compute.h
@@ -26,8 +26,8 @@ namespace cuda {
 template <typename T>
 void mul_compute(const lite::cuda::Blas<float>& blas, const T* x, int x_h,
                  int x_w, const T* y, int y_h, int y_w, T* out) {
-  blas.sgemm(CUBLAS_OP_N, CUBLAS_OP_N, x_w, x_h, y_w, nullptr, x, 0, y, 0,
-             nullptr, out, 0);
+  blas.sgemm(CUBLAS_OP_N, CUBLAS_OP_N, x_h, y_w, x_w, nullptr, x, x_w, y, y_w,
+             nullptr, out, x_h);
 }
 
 class MulCompute : public OpKernel<TARGET(kCUDA), PRECISION(kFloat)> {
@@ -38,8 +38,8 @@ class MulCompute : public OpKernel<TARGET(kCUDA), PRECISION(kFloat)> {
     CHECK(context_) << "running context should be set first";
     auto& context = context_->AsCudaContext();
     CHECK(context.blas_fp32) << "blas should init first";
+    /*
     auto& blas = *context.blas_fp32;
-    const auto& param = Param<operators::MulParam>();
     CHECK(param.x->target() == TARGET(kCUDA));
     auto* x = param.x->data<float>();
     int x_h = param.x->dims()[0];
@@ -48,10 +48,13 @@ class MulCompute : public OpKernel<TARGET(kCUDA), PRECISION(kFloat)> {
     auto* y = param.y->data<float>();
     int y_h = param.y->dims()[0];
     int y_w = param.y->dims()[1];
+    */
 
-    auto* out = param.output->mutable_data<float>(TARGET(kCUDA));
+    const auto& param = Param<operators::MulParam>();
+    param.output->mutable_data<float>(TARGET(kCUDA));
+    LOG(INFO) << "mul output memory size " << param.output->memory_size();
 
-    mul_compute<float>(blas, x, x_h, x_w, y, y_h, y_w, out);
+    // mul_compute<float>(blas, x, x_h, x_w, y, y_h, y_w, out);
   }
 
   virtual ~MulCompute() = default;
diff --git a/paddle/fluid/lite/operators/mul_op.cc b/paddle/fluid/lite/operators/mul_op.cc
index 5f1dd0de970edca22f4c12369ffb1300451a8518..e0fe5837153b3fc41bf77e07c3f60f10faea088e 100644
--- a/paddle/fluid/lite/operators/mul_op.cc
+++ b/paddle/fluid/lite/operators/mul_op.cc
@@ -44,6 +44,7 @@ bool MulOpLite::InferShape() const {
     out_dims[i] = x_dims[i];
   }
   out_dims.back() = y_dims[1];
+  param_.output->Resize(out_dims);
 
   // share LoD