提交 d9ed6f14 编写于 作者: R Ray Liu 提交者: GitHub

Merge pull request #1114 from codeWorm2015/opencl

update cltensor
...@@ -168,9 +168,8 @@ Print &operator<<(Print &printer, const CLImage &cl_image) { ...@@ -168,9 +168,8 @@ Print &operator<<(Print &printer, const CLImage &cl_image) {
i0 += width * H; i0 += width * H;
} }
if (err != CL_SUCCESS) {
CL_CHECK_ERRORS(err); CL_CHECK_ERRORS(err);
}
for (int i = 0; i < cl_image.numel(); i += stride) { for (int i = 0; i < cl_image.numel(); i += stride) {
printer << data[i] << " "; printer << data[i] << " ";
} }
......
...@@ -28,7 +28,19 @@ namespace framework { ...@@ -28,7 +28,19 @@ namespace framework {
class CLTensor : TensorBase { class CLTensor : TensorBase {
public: public:
explicit CLTensor(cl_context context) : context_(context) {} CLTensor(cl_context context, cl_command_queue command_queue)
: context_(context), command_queue_(command_queue) {}
CLTensor() = default;
/*
 * If the context and command queue were not supplied at construction,
 * they need to be set with this method.
 * */
/*! Stores the OpenCL handles that are forwarded to the buffer placeholder
 *  whenever this tensor allocates memory. */
void SetContextAndCommandQueue(cl_context context,
                               cl_command_queue command_queue) {
  // The two assignments are independent of each other.
  this->command_queue_ = command_queue;
  this->context_ = context;
}
/*! Resize the dimensions of the memory block. */ /*! Resize the dimensions of the memory block. */
inline CLTensor &Resize(const DDim &dims) { inline CLTensor &Resize(const DDim &dims) {
...@@ -39,7 +51,8 @@ class CLTensor : TensorBase { ...@@ -39,7 +51,8 @@ class CLTensor : TensorBase {
template <typename T> template <typename T>
inline T mutable_with_data(void *data) { inline T mutable_with_data(void *data) {
int64_t size = numel() * sizeof(float); int64_t size = numel() * sizeof(float);
holder_.reset(new PlaceholderImpl(size, data, typeid(T), context_)); holder_.reset(
new PlaceholderImpl(size, data, typeid(T), context_, command_queue_));
return reinterpret_cast<T>( return reinterpret_cast<T>(
reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(holder_->ptr()))); reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(holder_->ptr())));
} }
...@@ -51,7 +64,7 @@ class CLTensor : TensorBase { ...@@ -51,7 +64,7 @@ class CLTensor : TensorBase {
PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.") PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.")
int64_t size = numel() * SizeOfType(type); int64_t size = numel() * SizeOfType(type);
if (holder_ == nullptr || holder_->size() < size + offset_) { if (holder_ == nullptr || holder_->size() < size + offset_) {
holder_.reset(new PlaceholderImpl(size, type, context_)); holder_.reset(new PlaceholderImpl(size, type, context_, command_queue_));
offset_ = 0; offset_ = 0;
} }
return reinterpret_cast<void *>( return reinterpret_cast<void *>(
...@@ -85,6 +98,7 @@ class CLTensor : TensorBase { ...@@ -85,6 +98,7 @@ class CLTensor : TensorBase {
private: private:
cl_context context_; cl_context context_;
cl_command_queue command_queue_;
/* /*
* virtual ~Placeholder() = default; * virtual ~Placeholder() = default;
...@@ -99,20 +113,31 @@ class CLTensor : TensorBase { ...@@ -99,20 +113,31 @@ class CLTensor : TensorBase {
* */ * */
struct PlaceholderImpl : public Placeholder { struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(size_t size, void *input, std::type_index type, PlaceholderImpl(size_t size, void *input, std::type_index type,
cl_context context) cl_context context, cl_command_queue command_queue)
: ptr_(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, : ptr_(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
size, reinterpret_cast<void *>(input), NULL)), size, reinterpret_cast<void *>(input), NULL)),
size_(size), size_(size),
type_(type) {} type_(type),
command_queue_(command_queue) {}
PlaceholderImpl(size_t size, std::type_index type, cl_context context) PlaceholderImpl(size_t size, std::type_index type, cl_context context,
cl_command_queue command_queue)
: ptr_(clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, NULL)), : ptr_(clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, NULL)),
size_(size), size_(size),
type_(type) {} type_(type),
command_queue_(command_queue) {}
virtual size_t size() const { return size_; } virtual size_t size() const { return size_; }
virtual void *ptr() const { return static_cast<void *>(ptr_.get()); } virtual void *ptr() const {
if (host_ptr_) {
delete (host_ptr_);
}
char *host_ptr = new char[size_];
clEnqueueReadBuffer(command_queue_, ptr_.get(), CL_TRUE, 0, size_,
host_ptr, 0, NULL, NULL);
return static_cast<void *>(host_ptr);
}
virtual std::type_index type() const { return type_; } virtual std::type_index type() const { return type_; }
...@@ -124,6 +149,17 @@ class CLTensor : TensorBase { ...@@ -124,6 +149,17 @@ class CLTensor : TensorBase {
/* the current type of memory */ /* the current type of memory */
std::type_index type_; std::type_index type_;
cl_command_queue command_queue_;
/*!
 * Releases the host-side staging buffer, if one is held.
 *
 * The host buffer used by ptr() is allocated with `new char[size_]`, so it
 * has to be released with `delete[]` through a `char *`; deleting through a
 * `void *` (or with scalar `delete`) is undefined behaviour. `delete[]` on a
 * null pointer is a no-op, so no explicit null check is required.
 */
~PlaceholderImpl() { delete[] static_cast<char *>(host_ptr_); }

private:
// Host-side copy of the device buffer. Defaults to null so that the
// destructor never frees an indeterminate pointer when no host copy was
// ever stored.
void *host_ptr_ = nullptr;
}; };
}; };
......
...@@ -37,7 +37,7 @@ limitations under the License. */ ...@@ -37,7 +37,7 @@ limitations under the License. */
#include "framework/cl/cl_image.h" #include "framework/cl/cl_image.h"
#endif #endif
int debug_to = 3; int debug_to = 115;
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
...@@ -87,7 +87,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size, ...@@ -87,7 +87,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
for (int i = 0; i < blocks.size(); ++i) { for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<framework::BlockDesc> block_desc = blocks[i]; std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops(); std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
for (int j = 0; j < debug_to; ++j) { for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<framework::OpDesc> op = ops[j]; std::shared_ptr<framework::OpDesc> op = ops[j];
DLOG << "create op: " << j << " " << op->Type(); DLOG << "create op: " << j << " " << op->Type();
auto op_base = framework::OpRegistry<Dtype>::CreateOp( auto op_base = framework::OpRegistry<Dtype>::CreateOp(
...@@ -416,7 +416,7 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict( ...@@ -416,7 +416,7 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
} }
} }
#else #else
for (int i = 0; i < debug_to; i++) { for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
struct timespec ts; struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts); clock_gettime(CLOCK_MONOTONIC, &ts);
......
...@@ -40,6 +40,11 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init( ...@@ -40,6 +40,11 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
const framework::CLImage *scale = param->InputScale(); const framework::CLImage *scale = param->InputScale();
const framework::CLImage *bias = param->InputBias(); const framework::CLImage *bias = param->InputBias();
const float epsilon = param->Epsilon(); const float epsilon = param->Epsilon();
//
// DLOG << " climage mean: " << *mean;
// DLOG << " climage variance: " << *variance;
// DLOG << " climage scale: " << *scale;
// DLOG << " climage bias: " << *bias;
auto mean_ptr = mean->data<float>(); auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>(); auto variance_ptr = variance->data<float>();
...@@ -67,12 +72,20 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init( ...@@ -67,12 +72,20 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
new_scale->InitCLImage(this->cl_helper_.CLContext(), new_scale->InitCLImage(this->cl_helper_.CLContext(),
cl_helper_.CLCommandQueue()); cl_helper_.CLCommandQueue());
DLOG << " climage - y bias: " << *(param->Bias());
DLOG << " climage - new scale: " << *new_scale;
framework::CLImage *new_bias = new framework::CLImage(); framework::CLImage *new_bias = new framework::CLImage();
new_bias->SetTensorData(new_bias_ptr, variance->dims()); new_bias->SetTensorData(new_bias_ptr, variance->dims());
new_bias->InitCLImage(this->cl_helper_.CLContext(), new_bias->InitCLImage(this->cl_helper_.CLContext(),
cl_helper_.CLCommandQueue()); cl_helper_.CLCommandQueue());
DLOG << " climage - new bias: " << *new_bias;
DLOG << " climage - filter: " << *(param->Filter());
param->SetNewScale(new_scale); param->SetNewScale(new_scale);
param->SetNewBias(new_bias); param->SetNewBias(new_bias);
......
...@@ -36,7 +36,8 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) { ...@@ -36,7 +36,8 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
cl_mem cl_image = output->GetCLImage(); cl_mem cl_image = output->GetCLImage();
int height = output->dims()[2]; int height = output->dims()[2];
int width = output->dims()[3]; int width = output->dims()[3];
CLTensor input_cl_tensor(this->cl_helper_.CLContext()); CLTensor input_cl_tensor(this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
input_cl_tensor.Resize(input->dims()); input_cl_tensor.Resize(input->dims());
cl_mem inputBuffer = cl_mem inputBuffer =
input_cl_tensor.mutable_with_data<cl_mem>((void *)input_data); input_cl_tensor.mutable_with_data<cl_mem>((void *)input_data);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册