提交 6df2ad4a 作者: Ray Liu 提交者: GitHub

Merge pull request #1109 from codeWorm2015/opencl

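Update fusion conv kernels: declare the sampler before its first use, use (int2)(...) vector literals, and add missing semicolons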
...@@ -57,7 +57,9 @@ void OperatorBase<Dtype>::CheckAllInputOutputSet() const {} ...@@ -57,7 +57,9 @@ void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}
template <typename Dtype> template <typename Dtype>
void OperatorBase<Dtype>::Run() { void OperatorBase<Dtype>::Run() {
DLOG << " ----- Begin run impl --- " << type_ << " ----- ";
RunImpl(); RunImpl();
DLOG << " ----- End run impl --- " << type_ << " ----- ";
#ifdef PADDLE_MOBILE_DEBUG #ifdef PADDLE_MOBILE_DEBUG
DLOG << "-------------" << type_ << "----------------------------"; DLOG << "-------------" << type_ << "----------------------------";
vector<string> input_keys = GetInputKeys(); vector<string> input_keys = GetInputKeys();
......
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define BIASE #define BIASE
#define BATCH_NORM #define BATCH_NORM
...@@ -54,21 +53,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -54,21 +53,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
ouput_pos_in_one_block.x = out_w; ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh; ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 in_pos_in_one_block; int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE #ifdef BIASE
half4 output = read_imageh(bias, sampler, int2(out_c, 0)); half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else #else
half4 output = 0.0f; half4 output = 0.0f;
#endif #endif
half4 input[9]; half4 input[9];
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
for (int i = 0; i < input_c; ++i) { for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
...@@ -139,7 +141,7 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -139,7 +141,7 @@ __kernel void conv_3x3(__private const int global_size_dim0,
} }
#ifdef BATCH_NORM #ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0)) output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef RELU #ifdef RELU
...@@ -250,7 +252,7 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, ...@@ -250,7 +252,7 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0,
} }
#ifdef BATCH_NORM #ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)) output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef RELU #ifdef RELU
...@@ -321,7 +323,7 @@ __kernel void conv_1x1(__private const int global_size_dim0, ...@@ -321,7 +323,7 @@ __kernel void conv_1x1(__private const int global_size_dim0,
} }
#ifdef BATCH_NORM #ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)) output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef RELU #ifdef RELU
......
...@@ -56,17 +56,19 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -56,17 +56,19 @@ __kernel void conv_3x3(__private const int global_size_dim0,
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
#ifdef BIASE #ifdef BIASE
half4 output = read_imageh(bias, sampler, int2(out_c, 0)); half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else #else
half4 output = 0.0f; half4 output = 0.0f;
#endif #endif
half4 input[9]; half4 input[9];
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
for (int i = 0; i < input_c; ++i) { for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
...@@ -137,7 +139,7 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -137,7 +139,7 @@ __kernel void conv_3x3(__private const int global_size_dim0,
} }
#ifdef BATCH_NORM #ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0)) output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef RELU #ifdef RELU
...@@ -248,7 +250,7 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, ...@@ -248,7 +250,7 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0,
} }
#ifdef BATCH_NORM #ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)) output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef RELU #ifdef RELU
...@@ -319,7 +321,7 @@ __kernel void conv_1x1(__private const int global_size_dim0, ...@@ -319,7 +321,7 @@ __kernel void conv_1x1(__private const int global_size_dim0,
} }
#ifdef BATCH_NORM #ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)) output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif #endif
#ifdef RELU #ifdef RELU
......
...@@ -117,7 +117,7 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute( ...@@ -117,7 +117,7 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
auto biase = param.Bias()->GetCLImage(); auto biase = param.Bias()->GetCLImage();
auto new_scale = param.NewScale()->GetCLImage(); auto new_scale = param.NewScale()->GetCLImage();
auto new_bias = param.NewBias()->GetCLImage(); auto new_bias = param.NewBias()->GetCLImage();
auto output = param.Output(); auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0]; int stride = param.Strides()[0];
int offset = param.Offset(); int offset = param.Offset();
int input_c = param.Input()->CBlock(); int input_c = param.Input()->CBlock();
......
...@@ -23,7 +23,7 @@ int main() { ...@@ -23,7 +23,7 @@ int main() {
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
// std::string(g_mobilenet_detect) + "/params", true); // std::string(g_mobilenet_detect) + "/params", true);
auto isok = paddle_mobile.Load(g_mobilenet, false); auto isok = paddle_mobile.Load(g_mobilenet, true);
if (isok) { if (isok) {
auto time2 = paddle_mobile::time(); auto time2 = paddle_mobile::time();
std::cout << "load cost :" << paddle_mobile::time_diff(time1, time1) << "ms" std::cout << "load cost :" << paddle_mobile::time_diff(time1, time1) << "ms"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册