提交 361c9b2a 编写于 作者: Y yangfei

Merge remote-tracking branch 'upstream/opencl' into opencl1

# Conflicts:
#	src/operators/kernel/fpga/feed-kernel.cpp
...@@ -257,16 +257,21 @@ class CLImage { ...@@ -257,16 +257,21 @@ class CLImage {
float *p = tensor_data; float *p = tensor_data;
size_t i0 = 0; size_t i0 = 0;
for (int n = 0; n < N; n++) { for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) { for (int c = 0; c < c_block_ * 4; c++) {
size_t i1 = i0 + (c / 4) * W; size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) { for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4; size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) { for (int w = 0; w < W; w++) {
// int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + (c if (c < C) {
// % 4); // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
imageData[i2] = Float2Half(*p); // (c % 4);
i2 += 4; imageData[i2] = Float2Half(*p);
p++; i2 += 4;
p++;
} else {
imageData[i2] = 0.0;
i2 += 4;
}
} }
i1 += width; i1 += width;
} }
......
...@@ -37,7 +37,7 @@ limitations under the License. */ ...@@ -37,7 +37,7 @@ limitations under the License. */
#include "framework/cl/cl_image.h" #include "framework/cl/cl_image.h"
#endif #endif
int debug_to = 3; int debug_to = 2;
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
......
...@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "feed_op.h" #include "operators/feed_op.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -22,6 +23,7 @@ void FeedOp<DeviceType, T>::InferShape() const { ...@@ -22,6 +23,7 @@ void FeedOp<DeviceType, T>::InferShape() const {
out_dims[0] = this->param_.BatchSize(); out_dims[0] = this->param_.BatchSize();
this->param_.Out()->Resize(out_dims); this->param_.Out()->Resize(out_dims);
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -65,6 +65,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -65,6 +65,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
printf(" out of range ");
return;
}
int2 stride_xy; int2 stride_xy;
stride_xy.x = stride; stride_xy.x = stride;
stride_xy.y = stride; stride_xy.y = stride;
...@@ -135,24 +143,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -135,24 +143,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
input[8] = select(read_imageh(input_image, sampler, input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)), (int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f), (half4)(0.0f),
(ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
int2 fuck; int2 fuck;
fuck.x = i * 3 + j % 3; fuck.x = i * 3 + j % 3;
fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, fuck); half4 weight_x = read_imageh(filter, sampler, fuck);
output.x += dot(input[j], weight_x); output.x += dot(input[j], weight_x);
fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, fuck); half4 weight_y = read_imageh(filter, sampler, fuck);
output.y += dot(input[j], weight_y); output.y += dot(input[j], weight_y);
fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, fuck); half4 weight_z = read_imageh(filter, sampler, fuck);
output.z += dot(input[j], weight_z); output.z += dot(input[j], weight_z);
fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, fuck); half4 weight_w = read_imageh(filter, sampler, fuck);
output.w += dot(input[j], weight_w); output.w += dot(input[j], weight_w);
} }
......
...@@ -63,6 +63,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -63,6 +63,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
printf(" out of range ");
return;
}
int2 stride_xy; int2 stride_xy;
stride_xy.x = stride; stride_xy.x = stride;
stride_xy.y = stride; stride_xy.y = stride;
...@@ -133,24 +141,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -133,24 +141,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
input[8] = select(read_imageh(input_image, sampler, input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)), (int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f), (half4)(0.0f),
(ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
int2 fuck; int2 fuck;
fuck.x = i * 3 + j % 3; fuck.x = i * 3 + j % 3;
fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, fuck); half4 weight_x = read_imageh(filter, sampler, fuck);
output.x += dot(input[j], weight_x); output.x += dot(input[j], weight_x);
fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, fuck); half4 weight_y = read_imageh(filter, sampler, fuck);
output.y += dot(input[j], weight_y); output.y += dot(input[j], weight_y);
fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, fuck); half4 weight_z = read_imageh(filter, sampler, fuck);
output.z += dot(input[j], weight_z); output.z += dot(input[j], weight_z);
fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, fuck); half4 weight_w = read_imageh(filter, sampler, fuck);
output.w += dot(input[j], weight_w); output.w += dot(input[j], weight_w);
} }
...@@ -169,7 +177,6 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -169,7 +177,6 @@ __kernel void conv_3x3(__private const int global_size_dim0,
__kernel void depth_conv_3x3(__private const int global_size_dim0, __kernel void depth_conv_3x3(__private const int global_size_dim0,
__private const int global_size_dim1, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2,
......
...@@ -44,6 +44,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -44,6 +44,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
printf(" out of range ");
return;
}
int2 stride_xy; int2 stride_xy;
stride_xy.x = stride; stride_xy.x = stride;
stride_xy.y = stride; stride_xy.y = stride;
...@@ -114,24 +122,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -114,24 +122,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
input[8] = select(read_imageh(input_image, sampler, input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)), (int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f), (half4)(0.0f),
(ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
int2 fuck; int2 fuck;
fuck.x = i * 3 + j % 3; fuck.x = i * 3 + j % 3;
fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, fuck); half4 weight_x = read_imageh(filter, sampler, fuck);
output.x += dot(input[j], weight_x); output.x += dot(input[j], weight_x);
fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, fuck); half4 weight_y = read_imageh(filter, sampler, fuck);
output.y += dot(input[j], weight_y); output.y += dot(input[j], weight_y);
fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, fuck); half4 weight_z = read_imageh(filter, sampler, fuck);
output.z += dot(input[j], weight_z); output.z += dot(input[j], weight_z);
fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, fuck); half4 weight_w = read_imageh(filter, sampler, fuck);
output.w += dot(input[j], weight_w); output.w += dot(input[j], weight_w);
} }
...@@ -150,7 +158,6 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -150,7 +158,6 @@ __kernel void conv_3x3(__private const int global_size_dim0,
__kernel void depth_conv_3x3(__private const int global_size_dim0, __kernel void depth_conv_3x3(__private const int global_size_dim0,
__private const int global_size_dim1, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2,
......
...@@ -54,6 +54,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -54,6 +54,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
printf(" out of range ");
return;
}
int2 stride_xy; int2 stride_xy;
stride_xy.x = stride; stride_xy.x = stride;
stride_xy.y = stride; stride_xy.y = stride;
...@@ -124,24 +132,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -124,24 +132,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
input[8] = select(read_imageh(input_image, sampler, input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)), (int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f), (half4)(0.0f),
(ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
int2 fuck; int2 fuck;
fuck.x = i * 3 + j % 3; fuck.x = i * 3 + j % 3;
fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, fuck); half4 weight_x = read_imageh(filter, sampler, fuck);
output.x += dot(input[j], weight_x); output.x += dot(input[j], weight_x);
fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, fuck); half4 weight_y = read_imageh(filter, sampler, fuck);
output.y += dot(input[j], weight_y); output.y += dot(input[j], weight_y);
fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, fuck); half4 weight_z = read_imageh(filter, sampler, fuck);
output.z += dot(input[j], weight_z); output.z += dot(input[j], weight_z);
fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, fuck); half4 weight_w = read_imageh(filter, sampler, fuck);
output.w += dot(input[j], weight_w); output.w += dot(input[j], weight_w);
} }
...@@ -158,9 +166,6 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -158,9 +166,6 @@ __kernel void conv_3x3(__private const int global_size_dim0,
write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
} }
__kernel void depth_conv_3x3(__private const int global_size_dim0, __kernel void depth_conv_3x3(__private const int global_size_dim0,
__private const int global_size_dim1, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2,
......
...@@ -165,6 +165,18 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute( ...@@ -165,6 +165,18 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
int output_width = param.Output()->WidthOfOneBlock(); int output_width = param.Output()->WidthOfOneBlock();
int output_height = param.Output()->HeightOfOneBlock(); int output_height = param.Output()->HeightOfOneBlock();
DLOG << " c block " << c_block;
DLOG << " w " << w;
DLOG << " nh " << nh;
DLOG << " stride " << stride;
DLOG << " offset " << offset;
DLOG << " input_c " << input_c;
DLOG << " dilation " << dilation;
DLOG << " input width " << input_width;
DLOG << " input height " << input_height;
DLOG << " output width " << output_width;
DLOG << " output height " << output_height;
cl_int status; cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册