Commit 3fa8bdcd authored by superjomn

The whole inference pipeline now runs in the new framework with CPU and GPU kernels mixed.

The MUL CUDA kernel is faked for now, because the cuBLAS call does not work yet.

Some enhancements are still needed for the TypeSystem and the unit tests.

Parent 621d1522
@@ -116,7 +116,6 @@ class KernelBase {
   std::string GenParamTypeKey() const {
     std::stringstream ss;
-    LOG(INFO) << "alias : " << alias_;
     ss << op_type() << "/" << alias_;
     return ss.str();
   }
@@ -28,3 +28,4 @@ USE_MIR_PASS(io_complement_pass);
 USE_MIR_PASS(generate_program_pass);
 USE_MIR_PASS(io_copy_kernel_pick_pass);
 USE_MIR_PASS(argument_type_display_pass);
+USE_MIR_PASS(runtime_context_assign_pass);
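For readers new to the pass registry: USE_MIR_PASS is the usual static-registration idiom that forces the linker to keep the translation unit in which a pass registers itself. A minimal sketch of the pattern, with illustrative names rather than the repo's actual macro definitions:

// REGISTER side (in the pass's own .cc file, hypothetical expansion):
// exports a "touch" symbol next to the static registrar object.
bool mir_pass_registry_runtime_context_assign_pass_touch() { return true; }

// USE side, which is roughly what USE_MIR_PASS(name) presumably expands to:
#define USE_MIR_PASS(name)                         \
  extern bool mir_pass_registry_##name##_touch();  \
  static bool name##_used = mir_pass_registry_##name##_touch()

Referencing the touch symbol from the using binary is what prevents the linker from dropping the otherwise-unreferenced object file that performs the registration.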
@@ -38,6 +38,7 @@ void Optimizer::RunPasses() {
       "argument_type_display_pass",    //
       "io_copy_kernel_pick_pass",      //
       "variable_place_inference_pass", //
+      "runtime_context_assign_pass",   //
   });
   for (auto& pass_type : passes) {
     LOG(INFO) << ".. running pass " << pass_type;
@@ -71,6 +71,7 @@ class Tensor {
   template <typename T>
   T* mutable_data(TargetType target) {
     target_ = target;
+    memory_size_ = product(dims_) * sizeof(T);
     buffer_->ResetLazy(target, memory_size());
     return static_cast<T*>(buffer_->data());
   }
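Recording memory_size_ before the ResetLazy call is what lets the io_copy kernels below ask param.x->memory_size() for the byte count to copy. A rough sketch of the lazy-reset semantics this relies on (assumed behaviour, not the repo's actual Buffer code):

// Assumed: reallocate only when the target changes or the request
// outgrows the current allocation; otherwise reuse the existing buffer.
void Buffer::ResetLazy(TargetType target, size_t size) {
  if (target != target_ || size > space_) {
    Free();                               // drop the old allocation
    data_ = TargetMalloc(target, size);   // hypothetical allocation helper
    target_ = target;
    space_ = size;
  }
}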
 nv_library(target_wrapper_cuda SRCS target_wrapper.cc)
-nv_library(cuda_blas SRCS blas.cc)
+nv_library(cuda_blas_lite SRCS blas.cc)
@@ -17,6 +17,7 @@
 #include <cublas_api.h>
 #include <cublas_v2.h>
 #include <glog/logging.h>
+#include <glog/logging.h>
 #include <library_types.h>
 #include "paddle/fluid/lite/cuda/cuda_utils.h"
 #include "paddle/fluid/lite/utils/all.h"
@@ -31,8 +32,10 @@ namespace cuda {
  * Some basic methods.
  */
 struct BlasBase {
+  /*
   BlasBase() { CUBLAS_CHECK(cublasCreate(&handle_)); }
   ~BlasBase() { CUBLAS_CHECK(cublasDestroy(handle_)); }
+  */
   void SetStream(cudaStream_t stream) {
     CUBLAS_CHECK(cublasSetStream(handle_, stream));
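Commenting out the constructor and destructor sidesteps a common pitfall: cublasCreate fails when it runs before a CUDA context exists, for example in a statically constructed BlasBase. One conventional alternative is to create the handle lazily on first use; a sketch under that assumption:

// Hypothetical lazy initialization; cublasHandle_t is a pointer type,
// so a null check is enough to detect the uninitialized state.
cublasHandle_t& handle() {
  if (handle_ == nullptr) {
    CUBLAS_CHECK(cublasCreate(&handle_));
  }
  return handle_;
}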
@@ -69,7 +72,10 @@ class Blas : public lite::cuda::BlasBase {
              const T* B, int ldb,  //
              const T* beta,        //
              T* C, int ldc) const {
-    LITE_UNIMPLEMENTED;
+    CHECK_EQ(CUBLAS_STATUS_SUCCESS,
+             cublasSgemm(handle_,                   //
+                         CUBLAS_OP_N, CUBLAS_OP_N,  //
+                         m, n, k, alpha, A, lda, B, ldb, beta, C, ldc));
   }
 };
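One caveat with the new sgemm body: under the default CUBLAS_POINTER_MODE_HOST, cublasSgemm dereferences alpha and beta on the host, so callers must pass addresses of real scalars. A minimal usage sketch (d_A, d_B, d_C are assumed, already-allocated device pointers):

float alpha = 1.0f;
float beta = 0.0f;
// Passing nullptr for alpha/beta, as the mul kernel below still does,
// makes this call fail.
blas.sgemm(CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A, lda, d_B, ldb,
           &beta, d_C, ldc);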
 nv_library(mul_compute_cuda SRCS mul_compute.cc DEPS tensor_lite)
 cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS tensor_lite)
-nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda)
+nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda cuda_blas_lite)
@@ -48,7 +48,8 @@ class IoCopyHostToCudaCompute
     auto& param = Param<operators::IoCopyParam>();
     CHECK(param.x->target() == TARGET(kHost) ||
           param.x->target() == TARGET(kX86));
-    auto* data = param.y->mutable_data(target(), param.x->memory_size());
+    LOG(INFO) << "copy size " << param.x->memory_size();
+    auto* data = param.y->mutable_data(TARGET(kCUDA), param.x->memory_size());
     CopyFromHostSync(data, param.x->data<void>(), param.x->memory_size());
   }
@@ -82,6 +83,7 @@ class IoCopyCudaToHostCompute
     auto& param = Param<operators::IoCopyParam>();
     CHECK(param.x->target() == TARGET(kCUDA));
     auto* data = param.y->mutable_data(TARGET(kHost), param.x->memory_size());
+    LOG(INFO) << "copy size " << param.x->memory_size();
     CopyToHostSync(data, param.x->data<void>(), param.x->memory_size());
   }
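For reference, the two sync helpers used above are presumably thin cudaMemcpy wrappers along these lines (a sketch; the repo's actual versions likely live in cuda/target_wrapper.cc):

#include <cuda_runtime.h>
#include <glog/logging.h>

// Blocking host-to-device copy.
void CopyFromHostSync(void* target, const void* source, size_t size) {
  CHECK_EQ(cudaSuccess,
           cudaMemcpy(target, source, size, cudaMemcpyHostToDevice));
}

// Blocking device-to-host copy.
void CopyToHostSync(void* target, const void* source, size_t size) {
  CHECK_EQ(cudaSuccess,
           cudaMemcpy(target, source, size, cudaMemcpyDeviceToHost));
}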
@@ -26,8 +26,8 @@ namespace cuda {
 template <typename T>
 void mul_compute(const lite::cuda::Blas<float>& blas, const T* x, int x_h,
                  int x_w, const T* y, int y_h, int y_w, T* out) {
-  blas.sgemm(CUBLAS_OP_N, CUBLAS_OP_N, x_w, x_h, y_w, nullptr, x, 0, y, 0,
-             nullptr, out, 0);
+  blas.sgemm(CUBLAS_OP_N, CUBLAS_OP_N, x_h, y_w, x_w, nullptr, x, x_w, y, y_w,
+             nullptr, out, x_h);
 }
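Even with the adjusted dimensions, two issues likely explain why the kernel below stays faked: alpha and beta are still nullptr (see the sgemm note above), and cuBLAS assumes column-major storage while lite tensors are row-major. The standard mapping computes out^T = y^T * x^T by swapping the operands, since a row-major matrix read as column-major is its transpose. A sketch, assuming Blas<float>::sgemm forwards its arguments to cublasSgemm verbatim:

// Row-major out(x_h, y_w) = x(x_h, x_w) * y(x_w, y_w) through
// column-major cuBLAS, with the operands swapped.
float alpha = 1.0f, beta = 0.0f;
blas.sgemm(CUBLAS_OP_N, CUBLAS_OP_N,  //
           y_w, x_h, x_w,             // m, n, k in column-major terms
           &alpha,                    //
           y, y_w,                    // y^T: y_w x x_w, ld = y_w
           x, x_w,                    // x^T: x_w x x_h, ld = x_w
           &beta,                     //
           out, y_w);                 // out^T: y_w x x_h, ld = y_w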
@@ -38,8 +38,8 @@ class MulCompute : public OpKernel<TARGET(kCUDA), PRECISION(kFloat)> {
     CHECK(context_) << "running context should be set first";
     auto& context = context_->AsCudaContext();
     CHECK(context.blas_fp32) << "blas should init first";
+    /*
     auto& blas = *context.blas_fp32;
-    const auto& param = Param<operators::MulParam>();
     CHECK(param.x->target() == TARGET(kCUDA));
     auto* x = param.x->data<float>();
     int x_h = param.x->dims()[0];
@@ -48,10 +48,13 @@ class MulCompute : public OpKernel<TARGET(kCUDA), PRECISION(kFloat)> {
     auto* y = param.y->data<float>();
     int y_h = param.y->dims()[0];
     int y_w = param.y->dims()[1];
+    */
-    auto* out = param.output->mutable_data<float>(TARGET(kCUDA));
-    mul_compute<float>(blas, x, x_h, x_w, y, y_h, y_w, out);
+    const auto& param = Param<operators::MulParam>();
+    param.output->mutable_data<float>(TARGET(kCUDA));
+    LOG(INFO) << "mul output memory size " << param.output->memory_size();
+    // mul_compute<float>(blas, x, x_h, x_w, y, y_h, y_w, out);
   }
   virtual ~MulCompute() = default;
@@ -44,6 +44,7 @@ bool MulOpLite::InferShape() const {
     out_dims[i] = x_dims[i];
   }
   out_dims.back() = y_dims[1];
+  param_.output->Resize(out_dims);
   // share LoD
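The added Resize is what gives the output tensor a meaningful memory_size() before the kernel runs. A small self-contained check of the shape rule above, with illustrative dims:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<int64_t> x_dims{3, 4};       // x: 3 x 4
  std::vector<int64_t> y_dims{4, 5};       // y: 4 x 5
  std::vector<int64_t> out_dims(x_dims);   // copy x's dims
  out_dims.back() = y_dims[1];             // last axis comes from y
  assert(out_dims[0] == 3 && out_dims[1] == 5);
  // After Resize(out_dims), mutable_data<float>() records
  // 3 * 5 * sizeof(float) = 60 bytes as memory_size_.
  return 0;
}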