提交 6f7369b9 编写于 作者: xiebaiyuan's avatar xiebaiyuan

opencl opt

上级 4782257a
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
bool optimise = true;
template <> template <>
bool ConvAddBNReluKernel<GPU_CL, float>::Init( bool ConvAddBNReluKernel<GPU_CL, float>::Init(
FusionConvAddBNReluParam<GPU_CL> *param) { FusionConvAddBNReluParam<GPU_CL> *param) {
...@@ -139,7 +139,12 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init( ...@@ -139,7 +139,12 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
param->Filter()->InitNImage(cl_helper_.CLContext(), param->Filter()->InitNImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue()); cl_helper_.CLCommandQueue());
if (optimise) {
this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_bn_relu_kernel.cl");
} else {
this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
}
DLOG << " conv add bn relu conv 1x1"; DLOG << " conv add bn relu conv 1x1";
} else if (param->Filter()->dims()[1] == 1 && } else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] && param->Input()->dims()[1] == param->Output()->dims()[1] &&
...@@ -205,10 +210,13 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute( ...@@ -205,10 +210,13 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
cl_int status; cl_int status;
if (optimise) {
if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) {
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &w); int maped_w = maptofactor(w, 4);
status = clSetKernelArg(kernel, 1, sizeof(int), &maped_w);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh); status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
...@@ -256,30 +264,132 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute( ...@@ -256,30 +264,132 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
// cl_event out_event = param.Output()->GetClEvent(); status = clSetKernelArg(kernel, 17, sizeof(int), &w);
// cl_event wait_event = param.Input()->GetClEvent(); CL_CHECK_ERRORS(status);
/* const size_t work_size[3] = {
if (param.Filter()->dims()[2] == 1 && static_cast<const uint32_t>(default_work_size.data()[0]),
param.Filter()->dims()[3] == 1 && static_cast<const uint32_t>(maped_w),
param.Filter()->dims()[0] % 16 == 0) { static_cast<const uint32_t>(default_work_size.data()[2])};
DLOG << " before modifi work size: " << default_work_size;
default_work_size[0] = default_work_size[0] / 4; status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel,
default_work_size.size(), NULL, work_size,
NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} else {
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
DLOG << " modification work size: " << default_work_size; status = clSetKernelArg(kernel, 1, sizeof(int), &w);
DLOG << " input dims " << param.Input()->dims(); CL_CHECK_ERRORS(status);
DLOG << " output dims " << param.Output()->dims();
DLOG << " filter dims: " << param.Filter()->dims();
DLOG << " biase dims : " << param.Bias()->dims();
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} }
*/
} else {
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel( status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
default_work_size.data(), NULL, 0, NULL, NULL); NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
}
} }
template class ConvAddBNReluKernel<GPU_CL, float>; template class ConvAddBNReluKernel<GPU_CL, float>;
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
bool optimise_convadd = true;
template <> template <>
bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) { bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
...@@ -35,8 +36,11 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) { ...@@ -35,8 +36,11 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
param->Filter()->InitNImage(cl_helper_.CLContext(), param->Filter()->InitNImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue()); cl_helper_.CLCommandQueue());
if (optimise_convadd) {
this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_kernel.cl");
} else {
this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl"); this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl");
}
} else if (param->Filter()->dims()[1] == 1 && } else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] && param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == 3) { param->Filter()->dims()[2] == 3) {
...@@ -95,10 +99,13 @@ void ConvAddKernel<GPU_CL, float>::Compute( ...@@ -95,10 +99,13 @@ void ConvAddKernel<GPU_CL, float>::Compute(
cl_int status; cl_int status;
if (optimise_convadd && param.Filter()->dims()[2] == 1 &&
param.Filter()->dims()[3] == 1) {
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &w); int maped_w = maptofactor(w, 4);
status = clSetKernelArg(kernel, 1, sizeof(int), &maped_w);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh); status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
...@@ -140,13 +147,69 @@ void ConvAddKernel<GPU_CL, float>::Compute( ...@@ -140,13 +147,69 @@ void ConvAddKernel<GPU_CL, float>::Compute(
status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
// cl_event out_event = param.Output()->GetClEvent(); status = clSetKernelArg(kernel, 15, sizeof(int), &w);
// cl_event wait_event = param.Input()->GetClEvent(); CL_CHECK_ERRORS(status);
const size_t work_size[3] = {
static_cast<const uint32_t>(default_work_size.data()[0]),
static_cast<const uint32_t>(maped_w),
static_cast<const uint32_t>(default_work_size.data()[2])};
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel,
default_work_size.size(), NULL, work_size,
NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} else {
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel( status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
default_work_size.data(), NULL, 0, NULL, NULL); NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
}
} }
template class ConvAddKernel<GPU_CL, float>; template class ConvAddKernel<GPU_CL, float>;
......
...@@ -36,6 +36,9 @@ class ConvAddBNReluKernel ...@@ -36,6 +36,9 @@ class ConvAddBNReluKernel
public: public:
void Compute(const FusionConvAddBNReluParam<DeviceType> &param); void Compute(const FusionConvAddBNReluParam<DeviceType> &param);
bool Init(FusionConvAddBNReluParam<DeviceType> *param); bool Init(FusionConvAddBNReluParam<DeviceType> *param);
inline int maptofactor(int i, int factor) {
return (i + factor - 1) / factor;
}
}; };
} // namespace operators } // namespace operators
......
...@@ -41,6 +41,9 @@ class ConvAddKernel ...@@ -41,6 +41,9 @@ class ConvAddKernel
public: public:
void Compute(const FusionConvAddParam<DeviceType> &param); void Compute(const FusionConvAddParam<DeviceType> &param);
bool Init(FusionConvAddParam<DeviceType> *param); bool Init(FusionConvAddParam<DeviceType> *param);
inline int maptofactor(int i, int factor) {
return (i + factor - 1) / factor;
}
}; };
} // namespace operators } // namespace operators
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册