提交 e8d74b3b 编写于 作者: L liuruilong

update conv kernel

上级 af1c0a52
...@@ -257,16 +257,21 @@ class CLImage { ...@@ -257,16 +257,21 @@ class CLImage {
float *p = tensor_data; float *p = tensor_data;
size_t i0 = 0; size_t i0 = 0;
for (int n = 0; n < N; n++) { for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) { for (int c = 0; c < c_block_ * 4; c++) {
size_t i1 = i0 + (c / 4) * W; size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) { for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4; size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) { for (int w = 0; w < W; w++) {
// int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + (c if (c < C) {
// % 4); // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
imageData[i2] = Float2Half(*p); // (c % 4);
i2 += 4; imageData[i2] = Float2Half(*p);
p++; i2 += 4;
p++;
} else {
imageData[i2] = 0.0;
i2 += 4;
}
} }
i1 += width; i1 += width;
} }
......
...@@ -37,7 +37,7 @@ limitations under the License. */ ...@@ -37,7 +37,7 @@ limitations under the License. */
#include "framework/cl/cl_image.h" #include "framework/cl/cl_image.h"
#endif #endif
int debug_to = 3; int debug_to = 2;
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
......
...@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "feed_op.h" #include "operators/feed_op.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -22,6 +23,7 @@ void FeedOp<DeviceType, T>::InferShape() const { ...@@ -22,6 +23,7 @@ void FeedOp<DeviceType, T>::InferShape() const {
out_dims[0] = this->param_.BatchSize(); out_dims[0] = this->param_.BatchSize();
this->param_.Out()->Resize(out_dims); this->param_.Out()->Resize(out_dims);
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -12,25 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,25 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "operators/kernel/feed_kernel.h" #include "operators/kernel/feed_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
bool FeedKernel<CPU, float>::Init(FeedParam<CPU> *param) { bool FeedKernel<CPU, float>::Init(FeedParam<CPU> *param) {
return true; return true;
} }
template <> template <>
void FeedKernel<CPU, float>::Compute(const FeedParam<CPU> &param) { void FeedKernel<CPU, float>::Compute(const FeedParam<CPU> &param) {
param.Out()->ShareDataWith(*(param.InputX())); param.Out()->ShareDataWith(*(param.InputX()));
param.Out()->set_lod(param.InputX()->lod()); param.Out()->set_lod(param.InputX()->lod());
} }
template class FeedKernel<CPU, float>; template class FeedKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -65,6 +65,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -65,6 +65,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
printf(" out of range ");
return;
}
int2 stride_xy; int2 stride_xy;
stride_xy.x = stride; stride_xy.x = stride;
stride_xy.y = stride; stride_xy.y = stride;
...@@ -135,24 +143,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -135,24 +143,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
input[8] = select(read_imageh(input_image, sampler, input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)), (int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f), (half4)(0.0f),
(ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
int2 fuck; int2 fuck;
fuck.x = i * 3 + j % 3; fuck.x = i * 3 + j % 3;
fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, fuck); half4 weight_x = read_imageh(filter, sampler, fuck);
output.x += dot(input[j], weight_x); output.x += dot(input[j], weight_x);
fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, fuck); half4 weight_y = read_imageh(filter, sampler, fuck);
output.y += dot(input[j], weight_y); output.y += dot(input[j], weight_y);
fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, fuck); half4 weight_z = read_imageh(filter, sampler, fuck);
output.z += dot(input[j], weight_z); output.z += dot(input[j], weight_z);
fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, fuck); half4 weight_w = read_imageh(filter, sampler, fuck);
output.w += dot(input[j], weight_w); output.w += dot(input[j], weight_w);
} }
......
...@@ -63,6 +63,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -63,6 +63,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
printf(" out of range ");
return;
}
int2 stride_xy; int2 stride_xy;
stride_xy.x = stride; stride_xy.x = stride;
stride_xy.y = stride; stride_xy.y = stride;
...@@ -133,24 +141,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -133,24 +141,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
input[8] = select(read_imageh(input_image, sampler, input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)), (int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f), (half4)(0.0f),
(ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
int2 fuck; int2 fuck;
fuck.x = i * 3 + j % 3; fuck.x = i * 3 + j % 3;
fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, fuck); half4 weight_x = read_imageh(filter, sampler, fuck);
output.x += dot(input[j], weight_x); output.x += dot(input[j], weight_x);
fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, fuck); half4 weight_y = read_imageh(filter, sampler, fuck);
output.y += dot(input[j], weight_y); output.y += dot(input[j], weight_y);
fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, fuck); half4 weight_z = read_imageh(filter, sampler, fuck);
output.z += dot(input[j], weight_z); output.z += dot(input[j], weight_z);
fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, fuck); half4 weight_w = read_imageh(filter, sampler, fuck);
output.w += dot(input[j], weight_w); output.w += dot(input[j], weight_w);
} }
...@@ -169,7 +177,6 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -169,7 +177,6 @@ __kernel void conv_3x3(__private const int global_size_dim0,
__kernel void depth_conv_3x3(__private const int global_size_dim0, __kernel void depth_conv_3x3(__private const int global_size_dim0,
__private const int global_size_dim1, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2,
......
...@@ -44,6 +44,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -44,6 +44,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
printf(" out of range ");
return;
}
int2 stride_xy; int2 stride_xy;
stride_xy.x = stride; stride_xy.x = stride;
stride_xy.y = stride; stride_xy.y = stride;
...@@ -114,24 +122,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -114,24 +122,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
input[8] = select(read_imageh(input_image, sampler, input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)), (int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f), (half4)(0.0f),
(ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
int2 fuck; int2 fuck;
fuck.x = i * 3 + j % 3; fuck.x = i * 3 + j % 3;
fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, fuck); half4 weight_x = read_imageh(filter, sampler, fuck);
output.x += dot(input[j], weight_x); output.x += dot(input[j], weight_x);
fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, fuck); half4 weight_y = read_imageh(filter, sampler, fuck);
output.y += dot(input[j], weight_y); output.y += dot(input[j], weight_y);
fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, fuck); half4 weight_z = read_imageh(filter, sampler, fuck);
output.z += dot(input[j], weight_z); output.z += dot(input[j], weight_z);
fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, fuck); half4 weight_w = read_imageh(filter, sampler, fuck);
output.w += dot(input[j], weight_w); output.w += dot(input[j], weight_w);
} }
...@@ -150,7 +158,6 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -150,7 +158,6 @@ __kernel void conv_3x3(__private const int global_size_dim0,
__kernel void depth_conv_3x3(__private const int global_size_dim0, __kernel void depth_conv_3x3(__private const int global_size_dim0,
__private const int global_size_dim1, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2,
......
...@@ -54,6 +54,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -54,6 +54,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
printf(" out of range ");
return;
}
int2 stride_xy; int2 stride_xy;
stride_xy.x = stride; stride_xy.x = stride;
stride_xy.y = stride; stride_xy.y = stride;
...@@ -124,24 +132,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -124,24 +132,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
input[8] = select(read_imageh(input_image, sampler, input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)), (int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f), (half4)(0.0f),
(ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
int2 fuck; int2 fuck;
fuck.x = i * 3 + j % 3; fuck.x = i * 3 + j % 3;
fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, fuck); half4 weight_x = read_imageh(filter, sampler, fuck);
output.x += dot(input[j], weight_x); output.x += dot(input[j], weight_x);
fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, fuck); half4 weight_y = read_imageh(filter, sampler, fuck);
output.y += dot(input[j], weight_y); output.y += dot(input[j], weight_y);
fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, fuck); half4 weight_z = read_imageh(filter, sampler, fuck);
output.z += dot(input[j], weight_z); output.z += dot(input[j], weight_z);
fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, fuck); half4 weight_w = read_imageh(filter, sampler, fuck);
output.w += dot(input[j], weight_w); output.w += dot(input[j], weight_w);
} }
...@@ -158,9 +166,6 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -158,9 +166,6 @@ __kernel void conv_3x3(__private const int global_size_dim0,
write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
} }
__kernel void depth_conv_3x3(__private const int global_size_dim0, __kernel void depth_conv_3x3(__private const int global_size_dim0,
__private const int global_size_dim1, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2,
......
...@@ -165,6 +165,18 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute( ...@@ -165,6 +165,18 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
int output_width = param.Output()->WidthOfOneBlock(); int output_width = param.Output()->WidthOfOneBlock();
int output_height = param.Output()->HeightOfOneBlock(); int output_height = param.Output()->HeightOfOneBlock();
DLOG << " c block " << c_block;
DLOG << " w " << w;
DLOG << " nh " << nh;
DLOG << " stride " << stride;
DLOG << " offset " << offset;
DLOG << " input_c " << input_c;
DLOG << " dilation " << dilation;
DLOG << " input width " << input_width;
DLOG << " input height " << input_height;
DLOG << " output width " << output_width;
DLOG << " output height " << output_height;
cl_int status; cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
......
...@@ -15,41 +15,41 @@ limitations under the License. */ ...@@ -15,41 +15,41 @@ limitations under the License. */
#include "operators/kernel/feed_kernel.h" #include "operators/kernel/feed_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) { bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
Tensor *output = param->Out(); Tensor *output = param->Out();
fpga::format_fp16_ofm(output); fpga::format_fp16_ofm(output);
return true; return true;
} }
template <> template <>
void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) { void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
auto input = reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX())); auto input =
auto input_ptr = input->data<float>(); reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX()));
fpga::format_image(input); auto input_ptr = input->data<float>();
Tensor *output = param.Out(); fpga::format_image(input);
auto output_ptr = output->data<float>(); Tensor *output = param.Out();
auto output_ptr = output->data<float>();
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16; args.input_data_type = fpga::DATA_TYPE_FP32;
args.input_layout_type = fpga::LAYOUT_CHW; args.output_data_type = fpga::DATA_TYPE_FP16;
args.output_layout_type = fpga::LAYOUT_HWC; args.input_layout_type = fpga::LAYOUT_CHW;
args.image.address = reinterpret_cast<void *>(input_ptr); args.output_layout_type = fpga::LAYOUT_HWC;
args.image.channels = (uint32_t)input->dims()[1]; args.image.address = reinterpret_cast<void *>(input_ptr);
args.image.height = (uint32_t)input->dims()[2]; args.image.channels = (uint32_t)input->dims()[1];
args.image.width = (uint32_t)input->dims()[3]; args.image.height = (uint32_t)input->dims()[2];
args.image.pad_height = 0; args.image.width = (uint32_t)input->dims()[3];
args.image.pad_width = 0; args.image.pad_height = 0;
args.output.address = output_ptr; args.image.pad_width = 0;
args.output.scale_address = output->scale; args.output.address = output_ptr;
fpga::PerformBypass(args); args.output.scale_address = output->scale;
} fpga::PerformBypass(args);
template class FeedKernel<FPGA, float>; }
template class FeedKernel<FPGA, float>;
} // namespace operators
} // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -15,23 +15,19 @@ limitations under the License. */ ...@@ -15,23 +15,19 @@ limitations under the License. */
#include "operators/kernel/feed_kernel.h" #include "operators/kernel/feed_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <>
bool FeedKernel<GPU_MALI, float>::Init(FeedParam<GPU_MALI> *param) {
return true;
}
template <> template <>
bool FeedKernel<GPU_MALI, float>::Init( void FeedKernel<GPU_MALI, float>::Compute(const FeedParam<GPU_MALI> &param) {}
FeedParam<GPU_MALI> *param) {
return true;
}
template <> template class FeedKernel<GPU_MALI, float>;
void FeedKernel<GPU_MALI, float>::Compute(
const FeedParam<GPU_MALI> &param) {
}
template class FeedKernel<GPU_MALI, float>; } // namespace operators
} // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册