提交 7e35c2b3 编写于 作者: Y yangfei

imp mobilelessd

上级 c186adb0
...@@ -13,18 +13,98 @@ See the License for the specific language governing permissions and ...@@ -13,18 +13,98 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "framework/cl/cl_image.h" #include "framework/cl/cl_image.h"
#include "framework/cl/cl_tensor.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
void CLImageToTensor(CLImage *cl_image, Tensor *tensor, void CLImageToTensor(CLImage *cl_image, Tensor *tensor, cl_context context,
cl_command_queue commandQueue) { cl_command_queue commandQueue, cl_kernel kernel) {
// TODO(yangfei): need imp tensor->mutable_data<float>();
const auto &dim = cl_image->dims();
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < dim.size(); ++j) {
new_dims[4 - dim.size() + j] = dim[j];
}
size_t C, in_height, in_width;
C = new_dims[1];
in_height = new_dims[2];
in_width = new_dims[3];
CLTensor out_cl_tensor(context, commandQueue);
out_cl_tensor.Resize(tensor->dims());
cl_mem outBuffer = out_cl_tensor.mutable_data<float>();
auto input_image = cl_image->GetCLImage();
clSetKernelArg(kernel, 0, sizeof(int), &in_height);
clSetKernelArg(kernel, 1, sizeof(int), &in_width);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &input_image);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer);
int size_ch = in_height * in_width;
int size_block = size_ch * 4;
int size_batch = size_ch * C;
clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
clSetKernelArg(kernel, 5, sizeof(int), &size_block);
clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
clSetKernelArg(kernel, 7, sizeof(int), &C);
size_t global_work_size[3] = {(new_dims[1] + 3) / 4, new_dims[3],
new_dims[0] * new_dims[2]};
clEnqueueNDRangeKernel(commandQueue, kernel, 3, NULL, global_work_size, NULL,
0, NULL, NULL);
memcpy(tensor->data<float>(), out_cl_tensor.Data<float>(),
tensor->memory_size());
} }
void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, void TensorToCLImage(Tensor *tensor, CLImage *cl_image, cl_context context,
cl_command_queue commandQueue) { cl_command_queue commandQueue, cl_kernel kernel) {
// TODO(yangfei): need imp const auto &dim = cl_image->dims();
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < dim.size(); ++j) {
new_dims[4 - dim.size() + j] = dim[j];
}
cl_int status;
auto output = cl_image;
const Tensor *input = tensor;
const float *input_data = input->data<float>();
auto output_image = output->GetCLImage();
const int out_C = new_dims[1];
const int out_H = new_dims[2];
const int out_W = new_dims[3];
const int Stride2 = out_C * out_H * out_W;
const int Stride1 = out_H * out_W;
const int Stride0 = out_W;
DLOG << out_C;
DLOG << out_H;
DLOG << out_W;
CLTensor input_cl_tensor(context, commandQueue);
input_cl_tensor.Resize(input->dims());
cl_mem inputBuffer = input_cl_tensor.mutable_with_data<float>(input_data);
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2);
CL_CHECK_ERRORS(status);
size_t global_work_size[3] = {(new_dims[1] + 3) / 4, new_dims[3],
new_dims[0] * new_dims[2]};
status = clEnqueueNDRangeKernel(commandQueue, kernel, 3, NULL,
global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} }
#ifdef PADDLE_MOBILE_DEBUG #ifdef PADDLE_MOBILE_DEBUG
......
...@@ -222,11 +222,11 @@ class CLImage { ...@@ -222,11 +222,11 @@ class CLImage {
CLImageConverterBase *image_converter_ = nullptr; CLImageConverterBase *image_converter_ = nullptr;
}; };
void TensorToCLImage(Tensor *tensor, CLImage *image, void TensorToCLImage(Tensor *tensor, CLImage *image, cl_context context,
cl_command_queue commandQueue); cl_command_queue commandQueue, cl_kernel kernel);
void CLImageToTensor(CLImage *image, Tensor *tensor, void CLImageToTensor(CLImage *image, Tensor *tensor, cl_context context,
cl_command_queue commandQueue); cl_command_queue commandQueue, cl_kernel kernel);
#ifdef PADDLE_MOBILE_DEBUG #ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const CLImage &image); Print &operator<<(Print &printer, const CLImage &image);
......
...@@ -143,10 +143,12 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() { ...@@ -143,10 +143,12 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
int t1 = 1; int t1 = 1;
int t2 = 1; int t2 = 1;
for (int i = 0; i < m * k; ++i) { for (int i = 0; i < m * k; ++i) {
a[i] = t1 + rand() % t2; unsigned int seed = 100;
a[i] = t1 + rand_r(&seed) % t2;
} }
for (int i = 0; i < k * n; ++i) { for (int i = 0; i < k * n; ++i) {
b[i] = t1 + rand() % t2; unsigned int seed = 200;
b[i] = t1 + rand_r(&seed) % t2;
} }
paddle_mobile::operators::math::Gemm gemm; paddle_mobile::operators::math::Gemm gemm;
auto time1 = paddle_mobile::time(); auto time1 = paddle_mobile::time();
...@@ -215,13 +217,13 @@ double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() { ...@@ -215,13 +217,13 @@ double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() {
cl_int status; cl_int status;
cl_uint nPlatform; cl_uint nPlatform;
clGetPlatformIDs(0, NULL, &nPlatform); clGetPlatformIDs(0, NULL, &nPlatform);
cl_platform_id *listPlatform = cl_platform_id *listPlatform = reinterpret_cast<cl_platform_id *>(
(cl_platform_id *)malloc(nPlatform * sizeof(cl_platform_id)); malloc(nPlatform * sizeof(cl_platform_id)));
clGetPlatformIDs(nPlatform, listPlatform, NULL); clGetPlatformIDs(nPlatform, listPlatform, NULL);
cl_uint nDevice = 0; cl_uint nDevice = 0;
clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice); clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice);
cl_device_id *listDevice = cl_device_id *listDevice =
(cl_device_id *)malloc(nDevice * sizeof(cl_device_id)); reinterpret_cast<cl_device_id *>(malloc(nDevice * sizeof(cl_device_id)));
clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, nDevice, listDevice, clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, nDevice, listDevice,
NULL); NULL);
cl_context context = cl_context context =
...@@ -277,41 +279,66 @@ double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() { ...@@ -277,41 +279,66 @@ double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() {
clBuildProgram(program, 0, 0, path1.c_str(), NULL, NULL); clBuildProgram(program, 0, 0, path1.c_str(), NULL, NULL);
cl_kernel kernel = clCreateKernel(program, "feed", &status); cl_kernel kernel = clCreateKernel(program, "feed", &status);
int out_H = 224;
int out_W = 224;
int out_C = 3;
int Stride2 = out_C * out_H * out_W;
int Stride1 = out_H * out_W;
int Stride0 = out_W;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_input_image); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_input_image);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_int), &input_w); status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int), &input_h); status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c); status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
size_t global_work_size[2] = {input_w, input_h}; size_t global_work_size[3] = {1, 224, 224};
// cl_event out_event = param.Out()->GetClEvent(); // cl_event out_event = param.Out()->GetClEvent();
status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size,
NULL, 0, NULL, NULL); NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
out_H = 3;
out_W = 3;
out_C = 3;
Stride2 = out_C * out_H * out_W;
Stride1 = out_H * out_W;
Stride0 = out_W;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &filterBuffer); status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &filterBuffer);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_filter_image); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_filter_image);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_int), &filter_w); status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int), &filter_h); status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c); status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
size_t global_work_size1[2] = {filter_w, filter_h}; size_t global_work_size1[3] = {1, 3, 96};
// cl_event out_event = param.Out()->GetClEvent(); // cl_event out_event = param.Out()->GetClEvent();
status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size1, status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size1,
NULL, 0, NULL, NULL); NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
...@@ -378,13 +405,16 @@ double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() { ...@@ -378,13 +405,16 @@ double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() {
auto time2 = paddle_mobile::time(); auto time2 = paddle_mobile::time();
paddle_mobile::memory::Free(input); paddle_mobile::memory::Free(input);
paddle_mobile::memory::Free(filter); paddle_mobile::memory::Free(filter);
return paddle_mobile::time_diff(time1, time2); if (status == CL_SUCCESS) {
return paddle_mobile::time_diff(time1, time2);
} else {
return -1;
}
} }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
int PaddleMobile<Dtype, P>::readText( int PaddleMobile<Dtype, P>::readText(
const char *kernelPath, const char *kernelPath,
char **pcode) // 读取文本文件放入 pcode,返回字符串长度 char **pcode) { // 读取文本文件放入 pcode,返回字符串长度
{
FILE *fp; FILE *fp;
int size; int size;
// printf("<readText> File: %s\n", kernelPath); // printf("<readText> File: %s\n", kernelPath);
...@@ -402,7 +432,7 @@ int PaddleMobile<Dtype, P>::readText( ...@@ -402,7 +432,7 @@ int PaddleMobile<Dtype, P>::readText(
return -1; return -1;
} }
rewind(fp); rewind(fp);
if ((*pcode = (char *)malloc(size + 1)) == NULL) { if ((*pcode = reinterpret_cast<char *>(malloc(size + 1))) == NULL) {
printf("<readText> Allocate space failed\n"); printf("<readText> Allocate space failed\n");
return -1; return -1;
} }
......
...@@ -20,13 +20,57 @@ namespace paddle_mobile { ...@@ -20,13 +20,57 @@ namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
bool BoxCoderKernel<GPU_CL, float>::Init(BoxCoderParam<GPU_CL> *param) { bool BoxCoderKernel<GPU_CL, float>::Init(BoxCoderParam<GPU_CL>* param) {
if (param->CodeType() == "decode_center_size") {
this->cl_helper_.AddKernel("box_decoder", "box_coder_kernel.cl");
}
return true; return true;
} }
template <> template <>
void BoxCoderKernel<GPU_CL, float>::Compute( void BoxCoderKernel<GPU_CL, float>::Compute(
const BoxCoderParam<GPU_CL> &param) {} const BoxCoderParam<GPU_CL>& param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.OutputBox());
const auto* input_priorbox = param.InputPriorBox();
const auto* input_priorboxvar = param.InputPriorBoxVar();
const auto* input_targetbox = param.InputTargetBox();
const auto& code_type = param.CodeType();
if (code_type == "decode_center_size") {
auto prior_box_image = input_priorbox->GetCLImage();
auto prior_box_var_image = input_priorboxvar->GetCLImage();
auto target_box_image = input_targetbox->GetCLImage();
auto output_image = param.OutputBox()->GetCLImage();
auto& outputDim = param.OutputBox()->dims();
int new_dims[4] = {1, 1, 1, 1};
for (int i = 0; i < outputDim.size(); i++) {
new_dims[4 - outputDim.size() + i] = outputDim[i];
}
int out_C = new_dims[1];
int out_H = new_dims[2];
DLOG << "out_C=" << out_C;
DLOG << "out_H=" << out_H;
DLOG << "default_work_size=" << default_work_size;
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &prior_box_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &prior_box_var_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &target_box_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(int), &out_C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(int), &out_H);
CL_CHECK_ERRORS(status);
size_t global_work_size[2] = {default_work_size[0], default_work_size[2]};
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
}
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void box_decoder(__read_only image2d_t prior_box_image,
__read_only image2d_t prior_box_var_image,
__read_only image2d_t target_box_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H
){
const int out_c = get_global_id(0);
const int out_nh = get_global_id(1);
const int out_h = out_nh%out_H;
const int out_n = 1;
const int prior_box_n = 1;
const int prior_box_c = 0;
const int prior_box_h = out_h;
const int prior_box_var_n = 1;
const int prior_box_var_c = 0;
const int prior_box_var_h = out_h;
const int target_box_n = 1;
const int target_box_c = out_c;
const int target_box_h = out_h;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 prior_box_pos;
int2 prior_box_var_pos;
int2 target_box_pos;
int2 output_pos;
prior_box_pos.x = prior_box_c * 4;
prior_box_pos.y = prior_box_n * prior_box_h;
prior_box_var_pos.x = prior_box_var_c * 4;
prior_box_var_pos.y = prior_box_var_n * prior_box_var_h;
target_box_pos.x = target_box_c * 4;
target_box_pos.y = target_box_n * target_box_h;
output_pos.x = out_c * 4;
output_pos.y = out_n * out_h;
half4 prior_box_input[4];
half4 prior_box_var_input[4];
half4 target_box_input[4];
prior_box_input[0] = read_imageh(prior_box_image, sampler,(int2)(prior_box_pos.x + 0,prior_box_pos.y));
prior_box_input[1] = read_imageh(prior_box_image, sampler,(int2)(prior_box_pos.x + 1,prior_box_pos.y));
prior_box_input[2] = read_imageh(prior_box_image, sampler,(int2)(prior_box_pos.x + 2,prior_box_pos.y));
prior_box_input[3] = read_imageh(prior_box_image, sampler,(int2)(prior_box_pos.x + 3,prior_box_pos.y));
prior_box_var_input[0] = read_imageh(prior_box_var_image, sampler,(int2)(prior_box_var_pos.x + 0,prior_box_var_pos.y));
prior_box_var_input[1] = read_imageh(prior_box_var_image, sampler,(int2)(prior_box_var_pos.x + 1,prior_box_var_pos.y));
prior_box_var_input[2] = read_imageh(prior_box_var_image, sampler,(int2)(prior_box_var_pos.x + 2,prior_box_var_pos.y));
prior_box_var_input[3] = read_imageh(prior_box_var_image, sampler,(int2)(prior_box_var_pos.x + 3,prior_box_var_pos.y));
target_box_input[0] = read_imageh(target_box_image, sampler,(int2)(target_box_pos.x + 0,target_box_pos.y));
target_box_input[1] = read_imageh(target_box_image, sampler,(int2)(target_box_pos.x + 1,target_box_pos.y));
target_box_input[2] = read_imageh(target_box_image, sampler,(int2)(target_box_pos.x + 2,target_box_pos.y));
target_box_input[3] = read_imageh(target_box_image, sampler,(int2)(target_box_pos.x + 3,target_box_pos.y));
half prior_box_width = prior_box_input[2].x - prior_box_input[0].x;
half prior_box_height = prior_box_input[3].x - prior_box_input[1].x;
half prior_box_center_x = (prior_box_input[2].x + prior_box_input[0].x)/(half)2;
half prior_box_center_y = (prior_box_input[3].x + prior_box_input[1].x)/(half)2;
half4 target_box_center_x;
half4 target_box_center_y;
half4 target_box_width;
half4 target_box_height;
half4 output[4];
output[0] = 0.0f;
output[1] = 0.0f;
output[2] = 0.0f;
output[3] = 0.0f;
target_box_center_x.x = prior_box_var_input[0].x * target_box_input[0].x * prior_box_width + prior_box_center_x;
target_box_center_y.x = prior_box_var_input[1].x * target_box_input[1].x * prior_box_height + prior_box_center_y;
target_box_width.x = exp(prior_box_var_input[2].x * target_box_input[2].x) * prior_box_width;
target_box_height.x = exp(prior_box_var_input[3].x * target_box_input[3].x) * prior_box_height;
output[0].x = target_box_center_x.x - target_box_width.x/(half)2;
output[1].x = target_box_center_y.x - target_box_height.x/(half)2;
output[2].x = target_box_center_x.x + target_box_width.x/(half)2;
output[3].x = target_box_center_y.x + target_box_height.x/(half)2;
if(out_C - out_c * 4 >= 2){
target_box_center_x.y = prior_box_var_input[0].x * target_box_input[0].y * prior_box_width + prior_box_center_x;
target_box_center_y.y = prior_box_var_input[1].x * target_box_input[1].y * prior_box_height + prior_box_center_y;
target_box_width.y = exp(prior_box_var_input[2].x * target_box_input[2].y) * prior_box_width;
target_box_height.y = exp(prior_box_var_input[3].x * target_box_input[3].y) * prior_box_height;
output[0].y = target_box_center_x.y - target_box_width.y/(half)2;
output[1].y = target_box_center_y.y - target_box_height.y/(half)2;
output[2].y = target_box_center_x.y + target_box_width.y/(half)2;
output[3].y = target_box_center_y.y + target_box_height.y/(half)2;
}
if(out_C - out_c * 4 >= 3){
target_box_center_x.z = prior_box_var_input[0].x * target_box_input[0].z * prior_box_width + prior_box_center_x;
target_box_center_y.z = prior_box_var_input[1].x * target_box_input[1].z * prior_box_height + prior_box_center_y;
target_box_width.z = exp(prior_box_var_input[2].x * target_box_input[2].z) * prior_box_width;
target_box_height.z = exp(prior_box_var_input[3].x * target_box_input[3].z) * prior_box_height;
output[0].z = target_box_center_x.z - target_box_width.z/(half)2;
output[1].z = target_box_center_y.z - target_box_height.z/(half)2;
output[2].z = target_box_center_x.z + target_box_width.z/(half)2;
output[3].z = target_box_center_y.z + target_box_height.z/(half)2;
}
if(out_C - out_c * 4 >= 4){
target_box_center_x.w = prior_box_var_input[0].x * target_box_input[0].w * prior_box_width + prior_box_center_x;
target_box_center_y.w = prior_box_var_input[1].x * target_box_input[1].w * prior_box_height + prior_box_center_y;
target_box_width.w = exp(prior_box_var_input[2].x * target_box_input[2].w) * prior_box_width;
target_box_height.w = exp(prior_box_var_input[3].x * target_box_input[3].w) * prior_box_height;
output[0].w = target_box_center_x.w - target_box_width.w/(half)2;
output[1].w = target_box_center_y.w - target_box_height.w/(half)2;
output[2].w = target_box_center_x.w + target_box_width.w/(half)2;
output[3].w = target_box_center_y.w + target_box_height.w/(half)2;
}
write_imageh(output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]);
write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]);
write_imageh(output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]);
write_imageh(output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]);
}
\ No newline at end of file
...@@ -13,26 +13,50 @@ See the License for the specific language governing permissions and ...@@ -13,26 +13,50 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void feed(__global float *in, __write_only image2d_t outputImage,int h,int w,int c) __kernel void feed(__global float *in,
{ __write_only image2d_t output_image,
int i = get_global_id(0); __private const int out_H,
int j = get_global_id(1); __private const int out_W,
half4 pixel; __private const int out_C,
pixel.x = convert_half(in[(i * w + j)]); __private const int Stride0,
if(c>=2){ __private const int Stride1,
pixel.y = convert_half(in[h * w + (i * w + j)]); __private const int Stride2){
}else{
pixel.y = 0.0; const int out_c = get_global_id(0);
} const int out_w = get_global_id(1);
if(c>=3){ const int out_nh = get_global_id(2);
pixel.z = convert_half(in[2 * h * w + (i * w + j)]); const int out_n = out_nh/out_H;
}else{ const int out_h = out_nh%out_H;
pixel.z = 0.0;
} const int in_n = out_n;
pixel.w = 0.0; const int in_c0 = out_c * 4 + 0;
int2 coords; const int in_c1 = out_c * 4 + 1;
coords.x = j; const int in_c2 = out_c * 4 + 2;
coords.y = i; const int in_c3 = out_c * 4 + 3;
const int in_h = out_h;
write_imageh(outputImage,coords,pixel); const int in_w = out_w;
int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w;
int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w;
int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w;
int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w;
int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
half4 output = (half4)0.0f;
output.x = convert_half(in[input_pos0]);
if(out_C - 4 * out_c>=2){
output.y = convert_half(in[input_pos1]);
}
if(out_C - 4 * out_c>=3){
output.z = convert_half(in[input_pos2]);
}
if(out_C - 4 * out_c>=4){
output.w = convert_half(in[input_pos3]);
}
write_imageh(output_image, output_pos, output);
} }
...@@ -107,11 +107,13 @@ __kernel void prior_box(__private const int global_size_dim0, ...@@ -107,11 +107,13 @@ __kernel void prior_box(__private const int global_size_dim0,
output[2] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[2]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); output[2] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[2]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
output[3] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[3]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); output[3] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[3]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
} }
/*
if(output_pos.x == 0 && output_pos.y == 1){ if(output_pos.x == 0 && output_pos.y == 1){
float4 out = (float4)(output[0].x, output[1].x, output[2].x, output[3].x); float4 out = (float4)(output[0].x, output[1].x, output[2].x, output[3].x);
printf("output = %v4hlf \n", out); printf("output = %v4hlf \n", out);
} }
*/
write_imageh(output_boxes, (int2)(output_pos.x + 0, output_pos.y), output[0]); write_imageh(output_boxes, (int2)(output_pos.x + 0, output_pos.y), output[0]);
write_imageh(output_boxes, (int2)(output_pos.x + 1, output_pos.y), output[1]); write_imageh(output_boxes, (int2)(output_pos.x + 1, output_pos.y), output[1]);
......
...@@ -16,35 +16,46 @@ limitations under the License. */ ...@@ -16,35 +16,46 @@ limitations under the License. */
__kernel void softmax(__read_only image2d_t input_image, __kernel void softmax(__read_only image2d_t input_image,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int group __private const int out_W
) { ) {
const int out_c = get_global_id(0); // block index const int out_c = get_global_id(0); // block index
const int out_w = get_global_id(1); // index in one block const int out_w = get_global_id(1); // index in one block
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
const int in_c = out_c;
const int in_w = out_w;
const int in_nh = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | int2 input_pos;
CLK_ADDRESS_CLAMP | int2 output_pos;
CLK_FILTER_NEAREST;
half maxv = 0.0f; input_pos.x = in_c * out_W + in_w;
for (int i = 0; i < group; ++i) { input_pos.y = in_nh;
half4 temp = read_imageh(input_image, sampler, (int2)(i, 0));
maxv = max(maxv, max(temp.x, max(temp.y, max(temp.z, temp.w))));
}
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
half4 rsum = (half4)(0.0f); const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
for (int i = 0; i < group; ++i) { CLK_ADDRESS_CLAMP |
half4 r = read_imageh(input_image, sampler, (int2)(i, 0)); CLK_FILTER_NEAREST;
rsum += convert_half4(exp(convert_float4(r - maxv)));
} half4 input_max = 0.0f;
half4 input_tmp;
for(int i=0;i<out_W;i++){
input_tmp = read_imageh(input_image, sampler,(int2)(in_c * out_W + i,in_nh));
input_max = max(input_max,input_tmp);
}
half4 sum = (half4)0.0f;
for(int i=0;i<out_W;i++){
input_tmp = read_imageh(input_image, sampler,(int2)(in_c * out_W + i,in_nh));
sum += exp(input_tmp - input_max);
}
float sum = rsum.x + rsum.y + rsum.z + rsum.w; half4 input = read_imageh(input_image, sampler,input_pos);
half4 output = exp(input - input_max)/sum;
write_imageh(output_image, output_pos, output);
half4 rr = read_imageh(input_image, sampler, (int2)(out_w, out_nh));
half4 result = convert_half4(exp(convert_float4(rr - maxv)) / sum);
write_imageh(output_image, (int2)(out_w, out_nh), result);
} }
/* /*
......
...@@ -101,7 +101,7 @@ __kernel void transpose_4d( __read_only image2d_t input_image, ...@@ -101,7 +101,7 @@ __kernel void transpose_4d( __read_only image2d_t input_image,
if(out_w%4==0){ if(out_w%4==0){
output.z = input2.x; output.z = input2.x;
}else if(out_w%4==1){ }else if(out_w%4==1){
output.z = input1.y; output.z = input2.y;
}else if(out_w%4==2){ }else if(out_w%4==2){
output.z = input2.z; output.z = input2.z;
}else{ }else{
...@@ -126,4 +126,44 @@ __kernel void transpose_4d( __read_only image2d_t input_image, ...@@ -126,4 +126,44 @@ __kernel void transpose_4d( __read_only image2d_t input_image,
output.w = 0.0f; output.w = 0.0f;
} }
write_imageh(output_image, output_pos, output); write_imageh(output_image, output_pos, output);
}
__kernel void transpose( __read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H,
__private const int out_W,
__private const int in_W
){
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = 1;
const int out_h = out_nh%out_H;
const int in_n = 1;
const int in_c = out_c;
const int in_w = out_h;
const int in_h = out_w;
int2 input_pos;
int2 output_pos;
input_pos.x = in_c * in_W + in_w;
input_pos.y = in_n * in_h;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_n * out_h;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input;
half4 output;
input = read_imageh(input_image, sampler,input_pos);
output = input;
write_imageh(output_image, output_pos, output);
} }
\ No newline at end of file
...@@ -27,6 +27,7 @@ bool FeedKernel<GPU_CL, float>::Init(FeedParam<GPU_CL> *param) { ...@@ -27,6 +27,7 @@ bool FeedKernel<GPU_CL, float>::Init(FeedParam<GPU_CL> *param) {
template <> template <>
void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) { void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0); auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
cl_int status; cl_int status;
param.Out()->InitEmptyImage(cl_helper_.CLContext(), param.Out()->InitEmptyImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue(), param.Out()->dims()); cl_helper_.CLCommandQueue(), param.Out()->dims());
...@@ -35,10 +36,13 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) { ...@@ -35,10 +36,13 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
// DLOG << *input; // DLOG << *input;
const float *input_data = input->data<float>(); const float *input_data = input->data<float>();
int numel = input->numel(); int numel = input->numel();
cl_mem cl_image = output->GetCLImage(); cl_mem output_image = output->GetCLImage();
int c = input->dims()[1]; const int out_C = output->dims()[1];
int height = output->dims()[2]; const int out_H = output->dims()[2];
int width = output->dims()[3]; const int out_W = output->dims()[3];
const int Stride2 = out_C * out_H * out_W;
const int Stride1 = out_H * out_W;
const int Stride0 = out_W;
CLTensor input_cl_tensor(this->cl_helper_.CLContext(), CLTensor input_cl_tensor(this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue()); this->cl_helper_.CLCommandQueue());
input_cl_tensor.Resize(input->dims()); input_cl_tensor.Resize(input->dims());
...@@ -46,21 +50,25 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) { ...@@ -46,21 +50,25 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_image); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_int), &width); status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int), &height); status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c); status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
size_t global_work_size[2] = {width, height}; status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
// cl_event out_event = param.Out()->GetClEvent(); default_work_size.data(), NULL, 0, NULL, NULL);
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
} }
......
...@@ -37,6 +37,7 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) { ...@@ -37,6 +37,7 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
auto input = param.InputX()->GetCLImage(); auto input = param.InputX()->GetCLImage();
auto *out = param.Out(); auto *out = param.Out();
out->Resize(param.InputX()->dims());
out->mutable_data<float>(); out->mutable_data<float>();
const auto &dim = param.InputX()->dims(); const auto &dim = param.InputX()->dims();
size_t new_dims[] = {1, 1, 1, 1}; size_t new_dims[] = {1, 1, 1, 1};
......
...@@ -15,19 +15,323 @@ limitations under the License. */ ...@@ -15,19 +15,323 @@ limitations under the License. */
#ifdef MULTICLASSNMS_OP #ifdef MULTICLASSNMS_OP
#include "operators/kernel/multiclass_nms_kernel.h" #include "operators/kernel/multiclass_nms_kernel.h"
#include "operators/math/poly_util.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
bool MultiClassNMSKernel<GPU_CL, float>::Init( bool MultiClassNMSKernel<GPU_CL, float>::Init(
MultiClassNMSParam<GPU_CL> *param) { MultiClassNMSParam<GPU_CL>* param) {
this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
this->cl_helper_.AddKernel("feed", "feed_kernel.cl");
return true; return true;
} }
template <class T>
bool SortScorePairDescend(const std::pair<float, T>& pair1,
const std::pair<float, T>& pair2) {
return pair1.first > pair2.first;
}
template <class T>
static inline void GetMaxScoreIndex(
const std::vector<T>& scores, const T threshold, int top_k,
std::vector<std::pair<T, int>>* sorted_indices) {
for (size_t i = 0; i < scores.size(); ++i) {
if (scores[i] > threshold) {
sorted_indices->push_back(std::make_pair(scores[i], i));
}
}
// Sort the score pair according to the scores in descending order
std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
SortScorePairDescend<int>);
// Keep top_k scores if needed.
if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
sorted_indices->resize(top_k);
}
}
template <class T>
static inline T BBoxArea(const T* box, const bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) {
// If coordinate values are is invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return static_cast<T>(0.);
} else {
const T w = box[2] - box[0];
const T h = box[3] - box[1];
if (normalized) {
return w * h;
} else {
// If coordinate values are not within range [0, 1].
return (w + 1) * (h + 1);
}
}
}
template <class T>
static inline T JaccardOverlap(const T* box1, const T* box2,
const bool normalized) {
if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
box2[3] < box1[1]) {
return static_cast<T>(0.);
} else {
const T inter_xmin = std::max(box1[0], box2[0]);
const T inter_ymin = std::max(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]);
const T inter_w = inter_xmax - inter_xmin;
const T inter_h = inter_ymax - inter_ymin;
const T inter_area = inter_w * inter_h;
const T bbox1_area = BBoxArea<T>(box1, normalized);
const T bbox2_area = BBoxArea<T>(box2, normalized);
return inter_area / (bbox1_area + bbox2_area - inter_area);
}
}
template <class T>
static inline T PolyIoU(const T* box1, const T* box2, const size_t box_size,
const bool normalized) {
T bbox1_area = math::PolyArea<T>(box1, box_size, normalized);
T bbox2_area = math::PolyArea<T>(box2, box_size, normalized);
T inter_area = math::PolyOverlapArea<T>(box1, box2, box_size, normalized);
if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) {
// If coordinate values are is invalid
// if area size <= 0, return 0.
return static_cast<T>(0.);
} else {
return inter_area / (bbox1_area + bbox2_area - inter_area);
}
}
template <typename T>
static inline void NMSFast(const framework::Tensor& bbox,
const framework::Tensor& scores,
const T score_threshold, const T nms_threshold,
const T eta, const int64_t top_k,
std::vector<int>* selected_indices) {
// The total boxes for each instance.
int64_t num_boxes = bbox.dims()[0];
// 4: [xmin ymin xmax ymax]
int64_t box_size = bbox.dims()[1];
std::vector<T> scores_data(num_boxes);
std::copy_n(scores.data<T>(), num_boxes, scores_data.begin());
std::vector<std::pair<T, int>> sorted_indices;
GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
selected_indices->clear();
T adaptive_threshold = nms_threshold;
const T* bbox_data = bbox.data<T>();
while (sorted_indices.size() != 0) {
const int idx = sorted_indices.front().second;
bool keep = true;
for (size_t k = 0; k < selected_indices->size(); ++k) {
if (keep) {
const int kept_idx = (*selected_indices)[k];
T overlap = T(0.);
if (box_size == 4) {
overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, true);
} else {
overlap = PolyIoU<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, box_size, true);
}
keep = overlap <= adaptive_threshold;
} else {
break;
}
}
if (keep) {
selected_indices->push_back(idx);
}
sorted_indices.erase(sorted_indices.begin());
if (keep && eta < 1 && adaptive_threshold > 0.5) {
adaptive_threshold *= eta;
}
}
}
template <typename T>
void MultiClassNMS(const framework::Tensor& scores,
const framework::Tensor& bboxes,
std::map<int, std::vector<int>>* indices, int* num_nmsed_out,
const int& background_label, const int& nms_top_k,
const int& keep_top_k, const T& nms_threshold,
const T& nms_eta, const T& score_threshold) {
int64_t class_num = scores.dims()[0];
int64_t predict_dim = scores.dims()[1];
int num_det = 0;
for (int64_t c = 0; c < class_num; ++c) {
if (c == background_label) continue;
framework::Tensor score = scores.Slice(c, c + 1);
/// [c] is key
NMSFast<float>(bboxes, score, score_threshold, nms_threshold, nms_eta,
nms_top_k, &((*indices)[c]));
num_det += (*indices)[c].size();
}
*num_nmsed_out = num_det;
const T* scores_data = scores.data<T>();
if (keep_top_k > -1 && num_det > keep_top_k) {
std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
for (const auto& it : *indices) {
int label = it.first;
const T* sdata = scores_data + label * predict_dim;
const std::vector<int>& label_indices = it.second;
for (size_t j = 0; j < label_indices.size(); ++j) {
int idx = label_indices[j];
// PADDLE_ENFORCE_LT(idx, predict_dim);
score_index_pairs.push_back(
std::make_pair(sdata[idx], std::make_pair(label, idx)));
}
}
// Keep top k results per image.
std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
SortScorePairDescend<std::pair<int, int>>);
score_index_pairs.resize(keep_top_k);
// Store the new indices.
std::map<int, std::vector<int>> new_indices;
for (size_t j = 0; j < score_index_pairs.size(); ++j) {
int label = score_index_pairs[j].second.first;
int idx = score_index_pairs[j].second.second;
new_indices[label].push_back(idx);
}
new_indices.swap(*indices);
*num_nmsed_out = keep_top_k;
}
}
template <typename T>
void MultiClassOutput(const framework::Tensor& scores,
const framework::Tensor& bboxes,
const std::map<int, std::vector<int>>& selected_indices,
framework::Tensor* outs) {
int predict_dim = scores.dims()[1];
int box_size = bboxes.dims()[1];
int out_dim = bboxes.dims()[1] + 2;
auto* scores_data = scores.data<T>();
auto* bboxes_data = bboxes.data<T>();
auto* odata = outs->data<T>();
int count = 0;
for (const auto& it : selected_indices) {
/// one batch
int label = it.first;
const T* sdata = scores_data + label * predict_dim;
const std::vector<int>& indices = it.second;
for (size_t j = 0; j < indices.size(); ++j) {
int idx = indices[j];
const T* bdata = bboxes_data + idx * box_size;
odata[count * out_dim] = label; // label
odata[count * out_dim + 1] = sdata[idx]; // score
// xmin, ymin, xmax, ymax
std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T));
count++;
}
}
}
template <typename P>
void MultiClassNMSCompute(const MultiClassNMSParam<GPU_CL>& param,
cl_context context, cl_command_queue commandQueue,
cl_kernel kernel0, cl_kernel kernel1) {
auto* input_bboxes_image = param.InputBBoxes();
auto& input_bboxes_dims = input_bboxes_image->dims();
Tensor* input_bboxes = new Tensor();
input_bboxes->Resize(input_bboxes_dims);
input_bboxes->mutable_data<float>();
DLOG << "yangfei20";
framework::CLImageToTensor(input_bboxes_image, input_bboxes, context,
commandQueue, kernel0);
DLOG << "yangfei20";
auto* input_scores_image = param.InputScores();
auto& input_scores_dims = input_scores_image->dims();
Tensor* input_scores = new Tensor();
input_scores->Resize(input_scores_dims);
input_scores->mutable_data<float>();
framework::CLImageToTensor(input_scores_image, input_scores, context,
commandQueue, kernel0);
DLOG << "yangfei20";
auto outs_image = param.Out();
Tensor* outs = new Tensor();
outs->Resize(outs_image->dims());
outs->mutable_data<float>();
DLOG << *input_bboxes;
DLOG << *input_scores;
DLOG << *outs;
auto background_label = param.BackGroundLabel();
auto nms_top_k = param.NMSTopK();
auto keep_top_k = param.KeepTopK();
auto nms_threshold = param.NMSThreshold();
auto nms_eta = param.NMSEta();
auto score_threshold = param.ScoreThreshold();
int64_t batch_size = input_scores_dims[0];
int64_t class_num = input_scores_dims[1];
int64_t predict_dim = input_scores_dims[2];
int64_t box_dim = input_bboxes_dims[2];
std::vector<std::map<int, std::vector<int>>> all_indices;
std::vector<size_t> batch_starts = {0};
for (int64_t i = 0; i < batch_size; ++i) {
framework::Tensor ins_score = input_scores->Slice(i, i + 1);
ins_score.Resize({class_num, predict_dim});
framework::Tensor ins_boxes = input_bboxes->Slice(i, i + 1);
ins_boxes.Resize({predict_dim, box_dim});
std::map<int, std::vector<int>> indices;
int num_nmsed_out = 0;
MultiClassNMS<float>(ins_score, ins_boxes, &indices, &num_nmsed_out,
background_label, nms_top_k, keep_top_k, nms_threshold,
nms_eta, score_threshold);
all_indices.push_back(indices);
batch_starts.push_back(batch_starts.back() + num_nmsed_out);
}
int num_kept = batch_starts.back();
if (num_kept == 0) {
float* od = outs->mutable_data<float>({1});
od[0] = -1;
} else {
int64_t out_dim = box_dim + 2;
outs->mutable_data<float>({num_kept, out_dim});
for (int64_t i = 0; i < batch_size; ++i) {
framework::Tensor ins_score = input_scores->Slice(i, i + 1);
ins_score.Resize({class_num, predict_dim});
framework::Tensor ins_boxes = input_bboxes->Slice(i, i + 1);
ins_boxes.Resize({predict_dim, box_dim});
int64_t s = batch_starts[i];
int64_t e = batch_starts[i + 1];
if (e > s) {
framework::Tensor out = outs->Slice(s, e);
MultiClassOutput<float>(ins_score, ins_boxes, all_indices[i], &out);
}
}
}
DLOG << "yangfei20";
outs_image->InitEmptyImage(context, commandQueue, outs->dims());
framework::TensorToCLImage(outs, outs_image, context, commandQueue, kernel1);
DLOG << *outs;
delete (input_bboxes);
delete (input_scores);
delete (outs);
DLOG << "yangfei20";
}
template <> template <>
void MultiClassNMSKernel<GPU_CL, float>::Compute( void MultiClassNMSKernel<GPU_CL, float>::Compute(
const MultiClassNMSParam<GPU_CL> &param) {} const MultiClassNMSParam<GPU_CL>& param) {
auto kernel0 = this->cl_helper_.KernelAt(0);
auto kernel1 = this->cl_helper_.KernelAt(1);
MultiClassNMSCompute<float>(param, this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue(), kernel0,
kernel1);
}
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -33,31 +33,24 @@ void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam<GPU_CL> &param) { ...@@ -33,31 +33,24 @@ void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam<GPU_CL> &param) {
auto *output = param.Out(); auto *output = param.Out();
auto inputImage = input->GetCLImage(); auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage(); auto outputImage = output->GetCLImage();
const auto &outputDim = output->dims();
int group = output->ImageWidth(); int dims[4] = {1, 1, 1, 1};
for (int i = 0; i < outputDim.size(); i++) {
dims[4 - outputDim.size() + i] = outputDim[i];
}
const int out_W = dims[3];
cl_int status; cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
status = clSetKernelArg(kernel, 2, sizeof(int), &group); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &out_W);
// const auto &inputDim = input->dims(); CL_CHECK_ERRORS(status);
//
// int dims[4] = {1, 1, 1, 1};
//
// for (int i = 0; i < inputDim.size(); i++) {
// dims[4 - inputDim.size() + i] = inputDim[i];
// }
//
// clSetKernelArg(kernel, 2, sizeof(int), &dims);
// clSetKernelArg(kernel, 3, sizeof(int), &dims[1]);
// clSetKernelArg(kernel, 4, sizeof(int), &dims[2]);
// clSetKernelArg(kernel, 5, sizeof(int), &dims[3]);
// cl_event out_event = param.Out()->GetClEvent();
// cl_event wait_event = param.InputX()->GetClEvent();
status = clEnqueueNDRangeKernel( status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL); default_work_size.data(), NULL, 0, NULL, NULL);
......
...@@ -22,6 +22,8 @@ template <> ...@@ -22,6 +22,8 @@ template <>
bool TransposeKernel<GPU_CL, float>::Init(TransposeParam<GPU_CL> *param) { bool TransposeKernel<GPU_CL, float>::Init(TransposeParam<GPU_CL> *param) {
if (param->Out()->dims().size() == 4) { if (param->Out()->dims().size() == 4) {
this->cl_helper_.AddKernel("transpose_4d", "transpose_kernel.cl"); this->cl_helper_.AddKernel("transpose_4d", "transpose_kernel.cl");
} else if (param->Out()->dims().size() < 4) {
this->cl_helper_.AddKernel("transpose", "transpose_kernel.cl");
} }
return true; return true;
} }
...@@ -60,6 +62,69 @@ void TransposeKernel<GPU_CL, float>::Compute( ...@@ -60,6 +62,69 @@ void TransposeKernel<GPU_CL, float>::Compute(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL); NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
} else if (param.Out()->dims().size() == 3) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
int out_C = param.Out()->dims()[0];
int out_H = param.Out()->dims()[1];
int out_W = param.Out()->dims()[2];
int in_W = param.InputX()->dims()[2];
auto output_image = param.Out()->GetCLImage();
auto input_image = param.InputX()->GetCLImage();
DLOG << "out_C=" << out_C;
DLOG << "out_H=" << out_H;
DLOG << "out_W=" << out_W;
DLOG << "in_C=" << in_W;
DLOG << "default_work_size=" << default_work_size;
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &out_C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(int), &out_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(int), &out_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(int), &in_W);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} else if (param.Out()->dims().size() == 2) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
int out_C = 1;
int out_H = param.Out()->dims()[0];
int out_W = param.Out()->dims()[1];
int in_W = param.InputX()->dims()[1];
auto output_image = param.Out()->GetCLImage();
auto input_image = param.InputX()->GetCLImage();
DLOG << "out_C=" << out_C;
DLOG << "out_H=" << out_H;
DLOG << "out_W=" << out_W;
DLOG << "in_C=" << in_W;
DLOG << "default_work_size=" << default_work_size;
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &out_C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(int), &out_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(int), &out_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(int), &in_W);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} }
} }
......
...@@ -1018,9 +1018,9 @@ class MultiClassNMSParam : public OpParam { ...@@ -1018,9 +1018,9 @@ class MultiClassNMSParam : public OpParam {
score_threshold_ = GetAttr<float>("score_threshold", attrs); score_threshold_ = GetAttr<float>("score_threshold", attrs);
} }
const RType *InputBBoxes() const { return input_bboxes_; } RType *InputBBoxes() const { return input_bboxes_; }
const RType *InputScores() const { return input_scores_; } RType *InputScores() const { return input_scores_; }
RType *Out() const { return out_; } RType *Out() const { return out_; }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册