Commit 176f90c4 authored by xiebaiyuan, committed by GitHub

fix out-of-range access in conv 1x1 for nano yolo, test=develop (#2246)

Parent 05e96136
@@ -126,6 +126,9 @@ class CLImage {
void InitEmptyImage(cl_context context, cl_command_queue command_queue,
const DDim &dim) {
if (image_converter_ != nullptr) {
delete image_converter_;
}
PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
" empty image tensor data shouldn't have value");
@@ -153,7 +156,9 @@ class CLImage {
const DDim &need_dims, const DDim &real_image_dims) {
PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
" empty image tensor data shouldn't have value");
if (image_converter_ != nullptr) {
delete image_converter_;
}
CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
// use real image dims to create mem
real_image_dims_ = real_image_dims;
@@ -178,6 +183,9 @@
*/
void InitWithExistMem(cl_context context, cl_command_queue command_queue,
DDim need_dims, const CLImage &src) {
if (image_converter_ != nullptr) {
delete image_converter_;
}
CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
real_image_dims_ = src.real_image_dims_;
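The hunks above all add the same guard: delete any previously installed image converter before creating a new one, so re-initializing a CLImage does not leak the old converter. A minimal standalone sketch of that pattern (the Converter/Image class names here are illustrative stand-ins, not the real CLImage API):

#include <iostream>

// Illustrative stand-ins for the converter hierarchy; not the real
// CLImageConverter classes.
struct Converter {
  virtual ~Converter() { std::cout << "converter released\n"; }
};
struct NormalConverter : Converter {};

class Image {
 public:
  // Re-initialization installs a fresh converter; without the guarded delete
  // the previously allocated converter would leak on every re-init.
  void Reinit() {
    if (converter_ != nullptr) {
      delete converter_;
    }
    converter_ = new NormalConverter();
  }
  ~Image() { delete converter_; }

 private:
  Converter *converter_ = nullptr;
};

int main() {
  Image img;
  img.Reinit();  // first init: nothing to release yet
  img.Reinit();  // re-init: the first converter is released instead of leaking
  return 0;
}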
@@ -32,6 +32,9 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(elementwise_mul, ops::ElementwiseMulOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(elementwise_mul, ops::ElementwiseMulOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(elementwise_mul, ops::ElementwiseMulOp);
#endif
@@ -212,6 +212,7 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int input_c_origin = param.Input()->dims()[1];
int dilation = param.Dilations()[0];
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
@@ -284,6 +285,9 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
status = clSetKernelArg(kernel, index++, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &input_c_origin);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
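The new input_c_origin argument passes the real channel count (dims()[1]) to the kernel alongside the padded block count from the folder converter. Assuming the converter packs channels into blocks of 4 (so GetCBlock() is roughly ceil(C / 4)), a block-wise 1x1 conv would otherwise read C rounded up to a multiple of 4 channel lanes, running past the real channels of a head like nano YOLO's. A small C++ sketch of that arithmetic (all values illustrative):

#include <iostream>

int main() {
  // Illustrative numbers: a YOLO detection head with 255 output channels.
  int c_origin = 255;                // real channel count, param.Input()->dims()[1]
  int c_block = (c_origin + 3) / 4;  // assumed GetCBlock(): channels padded to blocks of 4
  int lanes_touched = c_block * 4;   // lanes a block-wise 1x1 conv would read
  std::cout << "c_origin=" << c_origin << " c_block=" << c_block
            << " lanes_touched=" << lanes_touched
            << " overrun=" << (lanes_touched - c_origin) << "\n";
  return 0;
}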
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,
                              __write_only image2d_t outputImage) {
  int x = get_global_id(0);
  int y = get_global_id(1);
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  int2 coords;
  coords.x = x;
  coords.y = y;
  half4 in = read_imageh(input, sampler, coords);
  half4 biase = read_imageh(bias, sampler, coords);
  half4 output = in * biase;
  write_imageh(outputImage, coords, output);
}

// Per-channel multiply: the bias is indexed by channel block, recovered as x / w.
__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,
                          __write_only image2d_t outputImage, int w) {
  int x = get_global_id(0);
  int y = get_global_id(1);
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  int2 coords;
  coords.x = x;
  coords.y = y;
  int2 coords_bias;
  coords_bias.x = x / w;
  coords_bias.y = 0;
  half4 in = read_imageh(input, sampler, coords);
  half4 biase = read_imageh(bias, sampler, coords_bias);
  half4 output = in * biase;
  write_imageh(outputImage, coords, output);
}
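For channel_mul, the w argument is the spatial width of the input tensor. Under the assumed CLImage layout (image width = channel_blocks × tensor width, one half4 pixel per 4 channels, and a per-channel bias stored as a 1-pixel-high image), x / w recovers the channel-block column to read from the bias. A short C++ sketch of that index mapping (values are illustrative):

#include <initializer_list>
#include <iostream>

int main() {
  // Assumed layout: input image x = block * tensor_w + spatial_x, so
  // x / tensor_w gives the channel block and x % tensor_w the spatial column.
  const int tensor_w = 13;  // illustrative spatial width (the kernel's `w`)
  for (int x : {0, 12, 13, 27, 131}) {
    std::cout << "image x=" << x << " -> bias block " << x / tensor_w
              << ", spatial col " << x % tensor_w << "\n";
  }
  return 0;
}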
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEMUL_OP

#include "operators/kernel/elementwise_mul_kernel.h"
#include "framework/cl/cl_image.h"

namespace paddle_mobile {
namespace operators {

template <>
bool ElementwiseMulKernel<GPU_CL, float>::Init(
    ElementwiseMulParam<GPU_CL> *param) {
  DLOG << "-----init elementwise mul-----";
  framework::CLImage *bias = reinterpret_cast<framework::CLImage *>(
      const_cast<framework::CLImage *>(param->InputY()));
  // Choose the kernel variant: element-wise multiply when the shapes match,
  // per-channel multiply when the bias is a 4-D tensor broadcast over H x W.
  if (bias->dims() == param->InputX()->dims()) {
    this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl");
  } else if (bias->dims().size() == 4) {
    this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl");
  } else {
    DLOG << "error: bias dims is error";
  }
  return true;
}

template <>
void ElementwiseMulKernel<GPU_CL, float>::Compute(
    const ElementwiseMulParam<GPU_CL> &param) {
  auto input = param.InputX();
  auto bias = param.InputY();
  auto output = param.Out();
  cl_int status;
  auto kernel = this->cl_helper_.KernelAt(0);
  if (bias->dims() == input->dims()) {
    cl_mem input_image = input->GetCLImage();
    cl_mem bias_image = bias->GetCLImage();
    cl_mem output_image = output->GetCLImage();
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
                            reinterpret_cast<void *>(&input_image));
    CL_CHECK_ERRORS(status);
    status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
                            reinterpret_cast<void *>(&bias_image));
    CL_CHECK_ERRORS(status);
    status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
                            reinterpret_cast<void *>(&output_image));
    CL_CHECK_ERRORS(status);
    auto width = input->ImageWidth();
    auto height = input->ImageHeight();
    size_t global_work_size[2] = {width, height};
    status =
        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                               NULL, global_work_size, NULL, 0, NULL, NULL);
    CL_CHECK_ERRORS(status);
  } else if (bias->dims().size() == 4) {
    DLOG << "zp7 444";
    cl_mem input_image = input->GetCLImage();
    cl_mem bias_image = bias->GetCLImage();
    cl_mem output_image = output->GetCLImage();
    // Spatial width of the input tensor; the kernel uses it to map an image
    // column back to a channel block when indexing the bias.
    int tensor_w = input->dims()[input->dims().size() - 1];
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
                            reinterpret_cast<void *>(&input_image));
    CL_CHECK_ERRORS(status);
    status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
                            reinterpret_cast<void *>(&bias_image));
    CL_CHECK_ERRORS(status);
    status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
                            reinterpret_cast<void *>(&output_image));
    CL_CHECK_ERRORS(status);
    status = clSetKernelArg(kernel, 3, sizeof(cl_int),
                            reinterpret_cast<void *>(&tensor_w));
    CL_CHECK_ERRORS(status);
    auto width = input->ImageWidth();
    auto height = input->ImageHeight();
    DLOG << "dede:" << width << "," << height;
    size_t global_work_size[2] = {width, height};
    status =
        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                               NULL, global_work_size, NULL, 0, NULL, NULL);
    CL_CHECK_ERRORS(status);
  } else {
    DLOG << "error: bias dims is error";
  }
}

template class ElementwiseMulKernel<GPU_CL, float>;

}  // namespace operators
}  // namespace paddle_mobile

#endif