Commit 3fa8bdcd authored by S superjomn

The whole inference pipeline now runs in the new framework with CPU and GPU kernels mixed.

The MUL CUDA kernel is faked for now, because the cuBLAS path does not work yet.

Some enhancements are still needed for the TypeSystem and the unit tests.
Parent 621d1522
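For orientation before the hunks: the faked MUL kernel below should eventually reduce to a single cuBLAS GEMM. A minimal sketch of that target, not part of this commit, assuming row-major lite tensors and a valid cublasHandle_t (MulFp32Sketch is a hypothetical name):

#include <cublas_v2.h>

// Row-major C(x_h x y_w) = A(x_h x x_w) * B(x_w x y_w).
// cuBLAS is column-major, so request C^T = B^T * A^T instead;
// the column-major result then reads back as row-major C.
void MulFp32Sketch(cublasHandle_t handle, const float* A, int x_h, int x_w,
                   const float* B, int y_w, float* C) {
  const float alpha = 1.f;
  const float beta = 0.f;  // real scalars, unlike the nullptr placeholders in the faked call
  // B goes in the "A" slot (lda = y_w), A in the "B" slot (ldb = x_w).
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
              /*m=*/y_w, /*n=*/x_h, /*k=*/x_w,
              &alpha, B, y_w, A, x_w,
              &beta, C, /*ldc=*/y_w);
}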
......@@ -116,7 +116,6 @@ class KernelBase {
std::string GenParamTypeKey() const {
std::stringstream ss;
LOG(INFO) << "alias : " << alias_;
ss << op_type() << "/" << alias_;
return ss.str();
}
......
......@@ -28,3 +28,4 @@ USE_MIR_PASS(io_complement_pass);
USE_MIR_PASS(generate_program_pass);
USE_MIR_PASS(io_copy_kernel_pick_pass);
USE_MIR_PASS(argument_type_display_pass);
USE_MIR_PASS(runtime_context_assign_pass);
......@@ -38,6 +38,7 @@ void Optimizer::RunPasses() {
"argument_type_display_pass", //
"io_copy_kernel_pick_pass", //
"variable_place_inference_pass", //
"runtime_context_assign_pass", //
});
for (auto& pass_type : passes) {
LOG(INFO) << ".. running pass " << pass_type;
......
......@@ -71,6 +71,7 @@ class Tensor {
template <typename T>
T* mutable_data(TargetType target) {
target_ = target;
memory_size_ = product(dims_) * sizeof(T);
buffer_->ResetLazy(target, memory_size());
return static_cast<T*>(buffer_->data());
}
......
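A minimal usage sketch of the bookkeeping this hunk adds (illustrative dims; assumes DDim accepts a braced list, as elsewhere in lite): once mutable_data records memory_size_, downstream io_copy kernels can size their copies from the source tensor.

lite::Tensor t;
t.Resize({4, 16});                               // dims_ = {4, 16}
float* p = t.mutable_data<float>(TARGET(kX86));  // sets memory_size_ = 4*16*sizeof(float)
CHECK_EQ(t.memory_size(), 4 * 16 * sizeof(float));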
nv_library(target_wrapper_cuda SRCS target_wrapper.cc)
nv_library(cuda_blas SRCS blas.cc)
nv_library(cuda_blas_lite SRCS blas.cc)
......@@ -17,6 +17,7 @@
#include <cublas_api.h>
#include <cublas_v2.h>
#include <glog/logging.h>
#include <library_types.h>
#include "paddle/fluid/lite/cuda/cuda_utils.h"
#include "paddle/fluid/lite/utils/all.h"
......@@ -31,8 +32,10 @@ namespace cuda {
* Some basic methods.
*/
struct BlasBase {
/*
BlasBase() { CUBLAS_CHECK(cublasCreate(&handle_)); }
~BlasBase() { CUBLAS_CHECK(cublasDestroy(handle_)); }
*/
void SetStream(cudaStream_t stream) {
CUBLAS_CHECK(cublasSetStream(handle_, stream));
......@@ -69,7 +72,10 @@ class Blas : public lite::cuda::BlasBase {
const T* B, int ldb, //
const T* beta, //
T* C, int ldc) const {
LITE_UNIMPLEMENTED;
CHECK_EQ(CUBLAS_STATUS_SUCCESS,
cublasSgemm(handle_, //
CUBLAS_OP_N, CUBLAS_OP_N, //
m, n, k, alpha, A, lda, B, ldb, beta, C, ldc));
}
};
......
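The handle setup this hunk comments out is the usual RAII pattern; once cuBLAS initialization works, something like the disabled form should come back. A sketch restating it (the handle_ declaration is assumed, since the diff does not show it):

struct BlasBase {
  BlasBase() { CUBLAS_CHECK(cublasCreate(&handle_)); }   // acquire handle on construction
  ~BlasBase() { CUBLAS_CHECK(cublasDestroy(handle_)); }  // release on destruction
  void SetStream(cudaStream_t stream) {
    CUBLAS_CHECK(cublasSetStream(handle_, stream));      // bind subsequent calls to one stream
  }
  cublasHandle_t handle_{nullptr};
};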
nv_library(mul_compute_cuda SRCS mul_compute.cc DEPS tensor_lite)
cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS tensor_lite)
nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda)
nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda cuda_blas_lite)
......@@ -48,7 +48,8 @@ class IoCopyHostToCudaCompute
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kX86));
auto* data = param.y->mutable_data(target(), param.x->memory_size());
LOG(INFO) << "copy size " << param.x->memory_size();
auto* data = param.y->mutable_data(TARGET(kCUDA), param.x->memory_size());
CopyFromHostSync(data, param.x->data<void>(), param.x->memory_size());
}
......@@ -82,6 +83,7 @@ class IoCopyCudaToHostCompute
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kCUDA));
auto* data = param.y->mutable_data(TARGET(kHost), param.x->memory_size());
LOG(INFO) << "copy size " << param.x->memory_size();
CopyToHostSync(data, param.x->data<void>(), param.x->memory_size());
}
......
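CopyFromHostSync and CopyToHostSync are used above but defined elsewhere; they presumably reduce to synchronous cudaMemcpy calls along these lines (a hedged guess at the wrappers, with hypothetical *Sketch names, not the actual signatures):

#include <cuda_runtime.h>

void CopyFromHostSyncSketch(void* dst, const void* src, size_t bytes) {
  CHECK_EQ(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice), cudaSuccess);
}
void CopyToHostSyncSketch(void* dst, const void* src, size_t bytes) {
  CHECK_EQ(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost), cudaSuccess);
}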
......@@ -26,8 +26,8 @@ namespace cuda {
template <typename T>
void mul_compute(const lite::cuda::Blas<float>& blas, const T* x, int x_h,
int x_w, const T* y, int y_h, int y_w, T* out) {
blas.sgemm(CUBLAS_OP_N, CUBLAS_OP_N, x_w, x_h, y_w, nullptr, x, 0, y, 0,
nullptr, out, 0);
blas.sgemm(CUBLAS_OP_N, CUBLAS_OP_N, x_h, y_w, x_w, nullptr, x, x_w, y, y_w,
nullptr, out, x_h);
}
class MulCompute : public OpKernel<TARGET(kCUDA), PRECISION(kFloat)> {
......@@ -38,8 +38,8 @@ class MulCompute : public OpKernel<TARGET(kCUDA), PRECISION(kFloat)> {
CHECK(context_) << "running context should be set first";
auto& context = context_->AsCudaContext();
CHECK(context.blas_fp32) << "blas should init first";
/*
auto& blas = *context.blas_fp32;
const auto& param = Param<operators::MulParam>();
CHECK(param.x->target() == TARGET(kCUDA));
auto* x = param.x->data<float>();
int x_h = param.x->dims()[0];
......@@ -48,10 +48,13 @@ class MulCompute : public OpKernel<TARGET(kCUDA), PRECISION(kFloat)> {
auto* y = param.y->data<float>();
int y_h = param.y->dims()[0];
int y_w = param.y->dims()[1];
*/
auto* out = param.output->mutable_data<float>(TARGET(kCUDA));
const auto& param = Param<operators::MulParam>();
param.output->mutable_data<float>(TARGET(kCUDA));
LOG(INFO) << "mul output memory size " << param.output->memory_size();
mul_compute<float>(blas, x, x_h, x_w, y, y_h, y_w, out);
// mul_compute<float>(blas, x, x_h, x_w, y, y_h, y_w, out);
}
virtual ~MulCompute() = default;
......
......@@ -44,6 +44,7 @@ bool MulOpLite::InferShape() const {
out_dims[i] = x_dims[i];
}
out_dims.back() = y_dims[1];
param_.output->Resize(out_dims);
// share LoD
......
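For reference, the shape rule the added Resize line now actually applies, restated as a standalone helper (illustrative, not the real method):

#include <cstdint>
#include <vector>

// E.g. x {3, 4} times y {4, 5} yields out {3, 5}.
std::vector<int64_t> InferMulShape(const std::vector<int64_t>& x_dims,
                                   const std::vector<int64_t>& y_dims) {
  std::vector<int64_t> out_dims(x_dims);  // start from x's dims
  out_dims.back() = y_dims[1];            // last dim comes from y's second dim
  return out_dims;  // previously computed but never written back to the output
}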