提交 44643650 编写于 作者: R Ray Liu 提交者: GitHub

Merge pull request #1211 from codeWorm2015/opencl

 commit for test
...@@ -26,7 +26,7 @@ if (DEBUGING) ...@@ -26,7 +26,7 @@ if (DEBUGING)
message(STATUS "debug") message(STATUS "debug")
set(CMAKE_BUILD_TYPE Release) set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG") set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
add_definitions(-DPADDLE_MOBILE_DEBUG) # add_definitions(-DPADDLE_MOBILE_DEBUG)
else () else ()
set(CMAKE_BUILD_TYPE Release) set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG") set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void batchnorm(__private const int out_height, __kernel void batchnorm(__private const int out_width,
__private const int out_width,
__read_only image2d_t input, __read_only image2d_t input,
__read_only image2d_t new_scale_image, __read_only image2d_t new_scale_image,
__read_only image2d_t new_bias_image, __read_only image2d_t new_bias_image,
......
...@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void channel_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage,int w) { __kernel void channel_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage,int w) {
int x = get_global_id(0); int x = get_global_id(0);
......
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
inline half4 activation(half4 in inline half4 activation(half4 in
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define BIASE #define BIASE
#define BATCH_NORM #define BATCH_NORM
#define RELU #define RELU
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define BIASE #define BIASE
#include "conv_kernel.inc.cl" #include "conv_kernel.inc.cl"
...@@ -56,7 +56,6 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -56,7 +56,6 @@ __kernel void conv_3x3(__private const int global_size_dim0,
if (out_c >= global_size_dim0 || if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) { out_nh >= global_size_dim2) {
printf(" out of range ");
return; return;
} }
...@@ -134,22 +133,22 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -134,22 +133,22 @@ __kernel void conv_3x3(__private const int global_size_dim0,
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
int2 fuck; int2 pos_of_weight;
fuck.x = i * 3 + j % 3; pos_of_weight.x = i * 3 + j % 3;
fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3; pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, fuck); half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x); output.x += dot(input[j], weight_x);
fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3; pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, fuck); half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y); output.y += dot(input[j], weight_y);
fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3; pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, fuck); half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z); output.z += dot(input[j], weight_z);
fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3; pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, fuck); half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w); output.w += dot(input[j], weight_w);
} }
} }
...@@ -321,6 +320,7 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, ...@@ -321,6 +320,7 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0,
} }
__kernel void conv_1x1(__private const int global_size_dim0, __kernel void conv_1x1(__private const int global_size_dim0,
__private const int global_size_dim1, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2,
...@@ -349,92 +349,179 @@ __kernel void conv_1x1(__private const int global_size_dim0, ...@@ -349,92 +349,179 @@ __kernel void conv_1x1(__private const int global_size_dim0,
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP | CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST; CLK_FILTER_NEAREST;
const uint kernelHXW = 1; const uint kernelHXW = 1;
int2 stride_xy = (int2)(stride, stride); int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
#ifdef BIASE #ifdef BIASE
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else #else
half4 output = 0.0f; half4 output = 0.0f;
#endif #endif
int out_c_p = 0, out_w_p = 0, out_nh_p = 0; for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
half4 input = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
/* /*
if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { output.x = dot(input, weight0);
float4 out = (float4)(output.x, output.y, output.z, output.w); output.y = dot(input, weight1);
printf(" after bias output4 = %v4hlf \n", out); output.z = dot(input, weight2);
output.w = dot(input, weight3);
*/
output = mad(input.x, weight0, output);
output = mad(input.y, weight1, output);
output = mad(input.z, weight2, output);
output = mad(input.w, weight3, output);
} }
*/ #ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
for (int i = 0; i < input_c; ++i) { #ifdef RELU
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); output = activation(output);
half4 input = read_imageh(input_image, sampler, pos_in); #endif
half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0)); int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
output.x += dot(input, weight_x); write_imageh(output_image, output_pos, output);
}
half4 weight_y = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 1));
output.y += dot(input, weight_y);
half4 weight_z = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 2));
output.z += dot(input, weight_z);
half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3));
output.w += dot(input, weight_w);
/* /*
if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
printf("x - %d \n", pos_in.x);
printf("y - %d \n", pos_in.y); __kernel void conv_1x1_4(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#ifdef BIASE
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,
__private const int input_height,
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0) * 4;
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
float4 in = (float4)(input.x, input.y, input.z, input.w); const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
printf("input4 = %v4hlf \n", in); CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
float4 w = (float4)(weight_x.x, weight_x.y, weight_x.z, weight_x.w); int2 stride_xy = (int2)(stride, stride);
printf("weight4 = %v4hlf \n", w); int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
} #ifdef BIASE
*/ half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
} half4 output1 = read_imageh(bias, sampler, (int2)(out_c + 1, 0));
/* half4 output2 = read_imageh(bias, sampler, (int2)(out_c + 2, 0));
if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { half4 output3 = read_imageh(bias, sampler, (int2)(out_c + 3, 0));
float4 out = (float4)(output.x, output.y, output.z, output.w); #else
printf("output4 = %v4hlf \n", out); half4 output0 = 0.0f;
half4 output1 = 0.0f;
half4 output2 = 0.0f;
half4 output3 = 0.0f;
#endif
} for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
half4 input = read_imageh(input_image, sampler, pos_in);
*/ half4 weight0_0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight0_1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight0_2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight0_3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
output0 = mad(input.x, weight0_0, output0);
output0 = mad(input.y, weight0_1, output0);
output0 = mad(input.z, weight0_2, output0);
output0 = mad(input.w, weight0_3, output0);
half4 weight1_0 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 0));
half4 weight1_1 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 1));
half4 weight1_2 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 2));
half4 weight1_3 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 3));
output1 = mad(input.x, weight1_0, output1);
output1 = mad(input.y, weight1_1, output1);
output1 = mad(input.z, weight1_2, output1);
output1 = mad(input.w, weight1_3, output1);
half4 weight2_0 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 0));
half4 weight2_1 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 1));
half4 weight2_2 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 2));
half4 weight2_3 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 3));
output2 = mad(input.x, weight2_0, output2);
output2 = mad(input.y, weight2_1, output2);
output2 = mad(input.z, weight2_2, output2);
output2 = mad(input.w, weight2_3, output2);
half4 weight3_0 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 0));
half4 weight3_1 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 1));
half4 weight3_2 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 2));
half4 weight3_3 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 3));
output3 = mad(input.x, weight3_0, output3);
output3 = mad(input.y, weight3_1, output3);
output3 = mad(input.z, weight3_2, output3);
output3 = mad(input.w, weight3_3, output3);
}
#ifdef BATCH_NORM #ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c + 0, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 0, 0));
#endif
/* output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c + 1, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 1, 0));
if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) {
float4 out = (float4)(output.x, output.y, output.z, output.w);
printf(" after batch output4 = %v4hlf \n", out);
} output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c + 2, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 2, 0));
*/ output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c + 3, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 3, 0));
#endif
#ifdef RELU #ifdef RELU
output = activation(output); output0 = activation(output0);
output1 = activation(output1);
output2 = activation(output2);
output3 = activation(output3);
#endif #endif
/* int2 output_pos0 = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { write_imageh(output_image, output_pos0, output0);
float4 out = (float4)(output.x, output.y, output.z, output.w);
printf(" after relu output4 = %v4hlf \n", out);
}
*/ int2 output_pos1 = (int2)((out_c + 1) * global_size_dim1 + out_w, out_nh);
write_imageh(output_image, output_pos1, output1);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
write_imageh(output_image, output_pos, output); int2 output_pos2 = (int2)((out_c + 2) * global_size_dim1 + out_w, out_nh);
write_imageh(output_image, output_pos2, output2);
int2 output_pos3 = (int2)((out_c + 3) * global_size_dim1 + out_w, out_nh);
write_imageh(output_image, output_pos3, output3);
} }
*/
...@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void elementwise_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) { __kernel void elementwise_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) {
int x = get_global_id(0); int x = get_global_id(0);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void feed(__global float *in, __write_only image2d_t outputImage,int h,int w) __kernel void feed(__global float *in, __write_only image2d_t outputImage,int h,int w)
{ {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void fetch(__private const int in_height, __kernel void fetch(__private const int in_height,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define MIN_VALUE -FLT_MAX #define MIN_VALUE -FLT_MAX
......
...@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void relu(__read_only image2d_t input, __kernel void relu(__read_only image2d_t input,
......
...@@ -33,17 +33,17 @@ __kernel void softmax(__read_only image2d_t input_image, ...@@ -33,17 +33,17 @@ __kernel void softmax(__read_only image2d_t input_image,
maxv = max(maxv, max(temp.x, max(temp.y, max(temp.z, temp.w)))); maxv = max(maxv, max(temp.x, max(temp.y, max(temp.z, temp.w))));
} }
half4 rsum = (half4)(0.0f); half4 rsum = (half4)(0.0f);
for (int i = 0; i < group; ++i) { for (int i = 0; i < group; ++i) {
half4 r = read_imageh(input_image, sampler, (int2)(i, 0)); half4 r = read_imageh(input_image, sampler, (int2)(i, 0));
rsum += convert_half4(exp(convert_float4(r - maxv))); rsum += exp(r - maxv);
} }
float sum = rsum.x + rsum.y + rsum.z + rsum.w; float sum = rsum.x + rsum.y + rsum.z + rsum.w;
half4 rr = read_imageh(input_image, sampler, (int2)(out_w, out_nh)); half4 rr = read_imageh(input_image, sampler, (int2)(out_w, out_nh));
half4 result = convert_half4(exp(convert_float4(rr - maxv)) / sum); half4 result = exp(rr - maxv) / sum;
write_imageh(output_image, (int2)(out_w, out_nh), result); write_imageh(output_image, (int2)(out_w, out_nh), result);
} }
......
...@@ -125,10 +125,21 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init( ...@@ -125,10 +125,21 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
param->SetOffset(offset); param->SetOffset(offset);
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
/*
if (param->Filter()->dims()[2] == 1 &&
param->Filter()->dims()[3] == 1 &&
(param->Filter()->dims()[0] % 16) == 0) {
param->Filter()->InitNImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_1x1_4", "conv_add_bn_relu_kernel.cl");
DLOG << " conv add bn relu conv 1x1 4";
}
*/
if (param->Filter()->dims()[2] == 1 &&
param->Filter()->dims()[3] == 1) {
param->Filter()->InitNImage(cl_helper_.CLContext(), param->Filter()->InitNImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue()); cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
DLOG << " conv add bn relu conv 1x1"; DLOG << " conv add bn relu conv 1x1";
} else if (param->Filter()->dims()[1] == 1 && } else if (param->Filter()->dims()[1] == 1 &&
...@@ -249,6 +260,23 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute( ...@@ -249,6 +260,23 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
// cl_event out_event = param.Output()->GetClEvent(); // cl_event out_event = param.Output()->GetClEvent();
// cl_event wait_event = param.Input()->GetClEvent(); // cl_event wait_event = param.Input()->GetClEvent();
/*
if (param.Filter()->dims()[2] == 1 &&
param.Filter()->dims()[3] == 1 &&
param.Filter()->dims()[0] % 16 == 0) {
DLOG << " before modifi work size: " << default_work_size;
default_work_size[0] = default_work_size[0] / 4;
DLOG << " modification work size: " << default_work_size;
DLOG << " input dims " << param.Input()->dims();
DLOG << " output dims " << param.Output()->dims();
DLOG << " filter dims: " << param.Filter()->dims();
DLOG << " biase dims : " << param.Bias()->dims();
}
*/
status = clEnqueueNDRangeKernel( status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL); default_work_size.data(), NULL, 0, NULL, NULL);
......
...@@ -13,19 +13,33 @@ See the License for the specific language governing permissions and ...@@ -13,19 +13,33 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <string> #include <string>
#include <iostream>
#include "../test_helper.h" #include "../test_helper.h"
#include "framework/loader.h" #include "framework/loader.h"
int main() { int main() {
paddle_mobile::framework::Loader<paddle_mobile::CPU> loader; paddle_mobile::framework::Loader<paddle_mobile::GPU_CL> loader;
// ../../../test/models/googlenet // ../../../test/models/googlenet
// ../../../test/models/mobilenet // ../../../test/models/mobilenet
// auto program = loader.Load(g_googlenet, true);
std::cout << " Begin load mobilenet " << std::endl;
auto program = loader.Load(std::string(g_mobilenet_mul), true);
std::cout << " End load mobilenet " << std::endl;
std::cout << " Begin load YOLO " << std::endl;
auto program1 = loader.Load(std::string(g_yolo_mul), true);
std::cout << " End load YOLO " << std::endl;
// auto program = loader.Load(g_mobilenet_ssd, true); // auto program = loader.Load(g_mobilenet_ssd, true);
auto program = loader.Load(std::string(g_ocr) + "/model", // auto program = loader.Load(std::string(g_ocr) + "/model",
std::string(g_ocr) + "/params", false); // std::string(g_ocr) + "/params", false);
// program.originProgram->Description("program desc: "); // program.originProgram->Description("program desc: ");
return 0; return 0;
} }
...@@ -23,7 +23,7 @@ int main() { ...@@ -23,7 +23,7 @@ int main() {
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
// std::string(g_mobilenet_detect) + "/params", true); // std::string(g_mobilenet_detect) + "/params", true);
auto isok = paddle_mobile.Load(g_mobilenet, true); auto isok = paddle_mobile.Load(std::string(g_mobilenet), true);
if (isok) { if (isok) {
auto time2 = paddle_mobile::time(); auto time2 = paddle_mobile::time();
std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
...@@ -33,24 +33,15 @@ int main() { ...@@ -33,24 +33,15 @@ int main() {
std::vector<int64_t> dims{1, 3, 224, 224}; std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims); GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
std::vector<float> vec_result; std::vector<float> vec_result = paddle_mobile.Predict(input, dims);
// = paddle_mobile.Predict(input, dims);
auto time3 = paddle_mobile::time(); auto time3 = paddle_mobile::time();
int max = 1; int max = 10;
for (int i = 0; i < max; ++i) { for (int i = 0; i < max; ++i) {
vec_result = paddle_mobile.Predict(input, dims); vec_result = paddle_mobile.Predict(input, dims);
} }
auto time4 = paddle_mobile::time(); auto time4 = paddle_mobile::time();
// auto time3 = paddle_mobile::time();
// for (int i = 0; i < 10; ++i) {
// auto vec_result = paddle_mobile.Predict(input, dims);
// }
// auto time4 = paddle_mobile::time();
std::cout << "predict cost :" std::cout << "predict cost :"
<< paddle_mobile::time_diff(time3, time4) / max << "ms" << paddle_mobile::time_diff(time3, time4) / max << "ms"
<< std::endl; << std::endl;
......
...@@ -23,7 +23,7 @@ int main() { ...@@ -23,7 +23,7 @@ int main() {
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
// std::string(g_mobilenet_detect) + "/params", true); // std::string(g_mobilenet_detect) + "/params", true);
auto isok = paddle_mobile.Load(g_yolo_mul, true); auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true);
if (isok) { if (isok) {
auto time2 = paddle_mobile::time(); auto time2 = paddle_mobile::time();
std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册