Commit 56ce4aaf authored by Ray Liu, committed by GitHub

Merge pull request #1160 from codeWorm2015/opencl

 commit event
@@ -30,6 +30,13 @@ struct CLMemDeleter {
}
};
struct CLEventDeleter {
template <class T>
void operator()(T *clEventObj) {
clReleaseEvent(clEventObj);
}
};
struct CLCommQueueDeleter {
template <class T>
void operator()(T *clQueueObj) {
......
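Note: the deleters in cl_deleter.h work because OpenCL handles are reference-counted pointers to opaque structs (cl_event is _cl_event *), so std::unique_ptr can own them once given the matching release call. A minimal sketch of the same pattern outside the patch, with illustrative names:

    #include <CL/cl.h>
    #include <memory>

    // Calls clReleaseEvent exactly once when the owning unique_ptr is destroyed.
    struct EventDeleter {
      void operator()(_cl_event *e) const { clReleaseEvent(e); }
    };
    using UniqueEvent = std::unique_ptr<_cl_event, EventDeleter>;

    // Hypothetical usage: take ownership right after creation so every
    // early-return path still releases the event.
    // cl_int err = CL_SUCCESS;
    // UniqueEvent evt(clCreateUserEvent(context, &err));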
@@ -81,6 +81,12 @@ class CLEngine {
return std::move(program_ptr);
}
std::unique_ptr<_cl_event, CLEventDeleter> CreateEvent(cl_context context) {
cl_event event = clCreateUserEvent(context, status_);
std::unique_ptr<_cl_event, CLEventDeleter> event_ptr(event);
return std::move(event_ptr);
}
bool BuildProgram(cl_program program) {
cl_int status;
status = clBuildProgram(program, 0, 0, "-cl-fast-relaxed-math", 0, 0);
......
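Note: CreateEvent wraps clCreateUserEvent, whose second parameter is an errcode_ret out-pointer. A user event is created in the submitted state and never completes on its own; the host must fire it explicitly, or anything waiting on it stalls. A sketch of the standard call sequence (variable names illustrative):

    cl_int err = CL_SUCCESS;
    cl_event evt = clCreateUserEvent(context, &err);  // err receives CL_SUCCESS or an error code
    // ... enqueue commands whose wait lists include evt ...
    clSetUserEventStatus(evt, CL_COMPLETE);  // host marks the user event complete,
                                             // unblocking everything that waits on it
    clReleaseEvent(evt);                     // drop the host's reference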
@@ -21,6 +21,7 @@ limitations under the License. */
#include "framework/cl/cl_half.h"
#include "framework/cl/cl_tool.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_engine.h"
#include "framework/ddim.h"
#include "framework/tensor.h"
@@ -97,6 +98,8 @@ class CLImage {
InitCLImage(context, command_queue, tensor_data_, tensor_dims_);
}
cl_event_ = CLEngine::Instance()->CreateEvent(context);
// InitCLImage(context, command_queue, nullptr, dim);
initialized_ = true;
@@ -157,6 +160,8 @@ class CLImage {
const ImageType GetImageType() const { return image_type_; }
cl_event GetClEvent() const { return cl_event_.get(); }
private:
ImageType image_type_ = Invalid;
void InitCLImage2C(cl_context context, cl_command_queue command_queue,
@@ -295,6 +300,7 @@ class CLImage {
bool initialized_ = false;
std::unique_ptr<_cl_mem, CLMemDeleter> cl_image_;
std::unique_ptr<_cl_event, CLEventDeleter> cl_event_;
size_t image_width_;
size_t width_of_one_block_;
size_t height_of_one_block_;
......
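Note: GetClEvent() hands out the raw, non-owning handle so callers can synchronize against the kernel that last touched the image. A host-side wait would look like this sketch (image is an illustrative CLImage instance, not code from the patch):

    // Block the host until the event attached to this image is complete.
    cl_event done = image.GetClEvent();         // non-owning; CLImage keeps ownership
    cl_int status = clWaitForEvents(1, &done);  // standard OpenCL host-side wait
    CL_CHECK_ERRORS(status);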
@@ -37,8 +37,6 @@ limitations under the License. */
#include "framework/cl/cl_image.h"
#endif
int debug_to = 32;
namespace paddle_mobile {
namespace framework {
@@ -87,7 +85,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
for (int j = 0; j < debug_to; ++j) {
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<framework::OpDesc> op = ops[j];
DLOG << "create op: " << j << " " << op->Type();
auto op_base = framework::OpRegistry<Dtype>::CreateOp(
@@ -416,7 +414,7 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
}
}
#else
for (int i = 0; i < debug_to; i++) {
for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
@@ -433,8 +431,6 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
DLOG << " predict return nullptr";
return nullptr;
auto last_op = ops.rbegin();
auto output_map = (*last_op)->Outputs();
std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
......
@@ -168,20 +168,20 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
int output_width = param.Output()->WidthOfOneBlock();
int output_height = param.Output()->HeightOfOneBlock();
DLOG << " c block " << c_block;
DLOG << " w " << w;
DLOG << " nh " << nh;
DLOG << " stride " << stride;
DLOG << " offset " << offset;
DLOG << " input_c " << input_c;
DLOG << " dilation " << dilation;
DLOG << " input width " << input_width;
DLOG << " input height " << input_height;
DLOG << " output width " << output_width;
DLOG << " output height " << output_height;
DLOG << " input dim " << param.Input()->dims();
DLOG << " output dim " << param.Output()->dims();
DLOG << " filter dim " << param.Filter()->dims();
// DLOG << " c block " << c_block;
// DLOG << " w " << w;
// DLOG << " nh " << nh;
// DLOG << " stride " << stride;
// DLOG << " offset " << offset;
// DLOG << " input_c " << input_c;
// DLOG << " dilation " << dilation;
// DLOG << " input width " << input_width;
// DLOG << " input height " << input_height;
// DLOG << " output width " << output_width;
// DLOG << " output height " << output_height;
// DLOG << " input dim " << param.Input()->dims();
// DLOG << " output dim " << param.Output()->dims();
// DLOG << " filter dim " << param.Filter()->dims();
cl_int status;
@@ -236,9 +236,12 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
cl_event out_event = param.Output()->GetClEvent();
cl_event wait_event = param.Input()->GetClEvent();
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
default_work_size.data(), NULL, 1, &wait_event, &out_event);
CL_CHECK_ERRORS(status);
}
......
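Note: this enqueue change is the heart of the commit and recurs in every kernel below: the input image's event goes into the wait list (1, &wait_event) and the address of the output event is passed as the final argument, which clEnqueueNDRangeKernel fills with a new event tracking the launch. A self-contained sketch of that dependency chain between two launches (queue, kernel_a/kernel_b, and the global sizes are illustrative):

    // Launch A; the runtime returns an event that tracks its completion.
    cl_event a_done = NULL;
    cl_int status = clEnqueueNDRangeKernel(queue, kernel_a,
                                           3, NULL, global_a, NULL,  // 3-D NDRange, runtime-chosen local size
                                           0, NULL, &a_done);
    CL_CHECK_ERRORS(status);

    // Launch B only after A has finished, by putting a_done in B's wait list.
    status = clEnqueueNDRangeKernel(queue, kernel_b,
                                    3, NULL, global_b, NULL,
                                    1, &a_done, NULL);
    CL_CHECK_ERRORS(status);
    clReleaseEvent(a_done);  // safe once B has been enqueued with the dependency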
@@ -117,9 +117,12 @@ void ConvAddKernel<GPU_CL, float>::Compute(
status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
cl_event out_event = param.Output()->GetClEvent();
cl_event wait_event = param.Input()->GetClEvent();
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
default_work_size.data(), NULL, 1, &wait_event, &out_event);
CL_CHECK_ERRORS(status);
}
......
@@ -62,27 +62,15 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
template <>
void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
DLOG << " Compute helper: " << &cl_helper_;
DLOG << " begin compute ";
auto kernel = this->cl_helper_.KernelAt(0);
DLOG << " get work size ";
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
DLOG << " end work size ";
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
DLOG << " get Input ";
auto filter = param.Filter()->GetCLImage();
DLOG << " get Filter ";
auto output = param.Output()->GetCLImage();
DLOG << " get Output ";
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = param.Input()->CBlock();
@@ -109,56 +97,27 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
DLOG << " output height " << output_height;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
DLOG << " end set kernel arg ";
DLOG << " begin enqueue ";
cl_event out_event = param.Output()->GetClEvent();
cl_event wait_event = param.Input()->GetClEvent();
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
default_work_size.data(), NULL, 1, &wait_event, &out_event);
CL_CHECK_ERRORS(status);
DLOG << " end enqueue ";
}
template class ConvKernel<GPU_CL, float>;
......
@@ -76,9 +76,12 @@ void DepthwiseConvKernel<GPU_CL, float>::Compute(
CL_CHECK_ERRORS(status);
cl_event out_event = param.Output()->GetClEvent();
cl_event wait_event = param.Input()->GetClEvent();
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
default_work_size.data(), NULL, 1, &wait_event, &out_event);
CL_CHECK_ERRORS(status);
}
......
@@ -34,21 +34,13 @@ void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam<GPU_CL> &param) {
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
DLOG << " softmax - output image dim " << output->ImageDims();
DLOG << " softmax - output image tensor dim " << output->dims();
int group = output->ImageWidth();
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &group);
CL_CHECK_ERRORS(status);
// const auto &inputDim = input->dims();
//
@@ -62,7 +54,6 @@ void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam<GPU_CL> &param) {
// clSetKernelArg(kernel, 3, sizeof(int), &dims[1]);
// clSetKernelArg(kernel, 4, sizeof(int), &dims[2]);
// clSetKernelArg(kernel, 5, sizeof(int), &dims[3]);
DLOG << "default_work_size: " << default_work_size;
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
......