提交 6df2ad4a 作者: Ray Liu 提交者: GitHub

Merge pull request #1109 from codeWorm2015/opencl

update fusion conv kernel
......@@ -57,7 +57,9 @@ void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}
template <typename Dtype>
void OperatorBase<Dtype>::Run() {
DLOG << " ----- Begin run impl --- " << type_ << " ----- ";
RunImpl();
DLOG << " ----- End run impl --- " << type_ << " ----- ";
#ifdef PADDLE_MOBILE_DEBUG
DLOG << "-------------" << type_ << "----------------------------";
vector<string> input_keys = GetInputKeys();
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define BIASE
#define BATCH_NORM
......@@ -54,21 +53,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE
half4 output = read_imageh(bias, sampler, int2(out_c, 0));
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
half4 output = 0.0f;
#endif
half4 input[9];
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
......@@ -139,7 +141,7 @@ __kernel void conv_3x3(__private const int global_size_dim0,
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0))
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
......@@ -250,7 +252,7 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0,
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0))
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
......@@ -321,7 +323,7 @@ __kernel void conv_1x1(__private const int global_size_dim0,
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0))
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
......
......@@ -56,17 +56,19 @@ __kernel void conv_3x3(__private const int global_size_dim0,
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
#ifdef BIASE
half4 output = read_imageh(bias, sampler, int2(out_c, 0));
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
half4 output = 0.0f;
#endif
half4 input[9];
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
......@@ -137,7 +139,7 @@ __kernel void conv_3x3(__private const int global_size_dim0,
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0))
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
......@@ -248,7 +250,7 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0,
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0))
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
......@@ -319,7 +321,7 @@ __kernel void conv_1x1(__private const int global_size_dim0,
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0))
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
......
......@@ -117,7 +117,7 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
auto biase = param.Bias()->GetCLImage();
auto new_scale = param.NewScale()->GetCLImage();
auto new_bias = param.NewBias()->GetCLImage();
auto output = param.Output();
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = param.Input()->CBlock();
......
......@@ -23,7 +23,7 @@ int main() {
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
// std::string(g_mobilenet_detect) + "/params", true);
auto isok = paddle_mobile.Load(g_mobilenet, false);
auto isok = paddle_mobile.Load(g_mobilenet, true);
if (isok) {
auto time2 = paddle_mobile::time();
std::cout << "load cost :" << paddle_mobile::time_diff(time1, time1) << "ms"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册