提交 44809658 编写于 作者: Z zhaojiaying01

adjust gpu code structure

上级 93c3df7b
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/cl/cl-kernel-func/conv_func.h"
#include "framework/cl/cl_image_converter.h"
#include "framework/cl/cl_tensor.h"
namespace paddle_mobile {
namespace operators {
template <>
void winograd_transform_weight<4, 3>(framework::CLHelper &cl_helper,
framework::CLImage &weight){};
template <>
void WinogradConv3x3<4, 3>(framework::CLHelper &cl_helper,
const ConvParam<GPU_CL> &param) {}
void ConvAddBnRelu(framework::CLHelper &cl_helper,
const ConvParam<GPU_CL> &param, bool ifRelu,
const CLImage *biase, const CLImage *new_scale,
const CLImage *new_bias) {
auto kernel = cl_helper.KernelAt(0);
auto default_work_size = cl_helper.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
int output_width = param.Output()->dims()[3];
int output_height = param.Output()->dims()[2];
// DLOG << " c block " << c_block;
// DLOG << " w " << w;
// DLOG << " nh " << nh;
// DLOG << " stride " << stride;
// DLOG << " offset " << offset;
// DLOG << " input_c " << input_c;
// DLOG << " dilation " << dilation;
// DLOG << " input width " << input_width;
// DLOG << " input height " << input_height;
// DLOG << " output width " << output_width;
// DLOG << " output height " << output_height;
// DLOG << " input dim " << param.Input()->dims();
// DLOG << " output dim " << param.Output()->dims();
// DLOG << " filter dim " << param.Filter()->dims();
cl_int status;
int index = 0;
if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) {
status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
int maped_w = maptofactor(w, 4);
status = clSetKernelArg(kernel, index++, sizeof(int), &maped_w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
if (biase) {
auto bias_mem = biase->GetCLImage();
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem);
CL_CHECK_ERRORS(status);
}
if (new_scale && new_bias) {
auto new_scale_mem = new_scale->GetCLImage();
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem);
CL_CHECK_ERRORS(status);
auto new_bias_mem = new_bias->GetCLImage();
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem);
CL_CHECK_ERRORS(status);
}
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &w);
CL_CHECK_ERRORS(status);
const size_t work_size[3] = {
static_cast<const uint32_t>(default_work_size.data()[0]),
static_cast<const uint32_t>(maped_w),
static_cast<const uint32_t>(default_work_size.data()[2])};
status = clEnqueueNDRangeKernel(cl_helper.CLCommandQueue(), kernel,
default_work_size.size(), NULL, work_size,
NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} else {
status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
if (biase) {
auto bias_mem = biase->GetCLImage();
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem);
CL_CHECK_ERRORS(status);
}
if (new_scale && new_bias) {
auto new_scale_mem = new_scale->GetCLImage();
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem);
CL_CHECK_ERRORS(status);
auto new_bias_mem = new_bias->GetCLImage();
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem);
CL_CHECK_ERRORS(status);
}
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
cl_helper.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
}
} // namespace operators
} // namespace paddle_mobile
\ No newline at end of file
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#pragma once
#include "framework/cl/cl_helper.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using namespace framework;
inline int maptofactor(int i, int factor) { return (i + factor - 1) / factor; }
template <int tile, int kernel>
void winograd_transform_weight(framework::CLHelper &cl_helper,
framework::CLImage &weight);
template <int tile, int kernel>
void WinogradConv3x3(framework::CLHelper &cl_helper,
const ConvParam<GPU_CL> &param);
void ConvAddBnRelu(framework::CLHelper &cl_helper,
const ConvParam<GPU_CL> &param,
bool ifRelu = false,
const CLImage *biase = nullptr,
const CLImage *new_scale = nullptr,
const CLImage *new_bias = nullptr);
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <cmath>
#include "framework/cl/cl_image.h"
#include "framework/cl/cl_tool.h"
#include "operators/kernel/cl/cl-kernel-func/conv_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -171,225 +172,8 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
template <>
void ConvAddBNReluKernel<GPU_CL, float>::Compute(
const FusionConvAddBNReluParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto biase = param.Bias()->GetCLImage();
auto new_scale = param.NewScale()->GetCLImage();
auto new_bias = param.NewBias()->GetCLImage();
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
int output_width = param.Output()->dims()[3];
int output_height = param.Output()->dims()[2];
// DLOG << " c block " << c_block;
// DLOG << " w " << w;
// DLOG << " nh " << nh;
// DLOG << " stride " << stride;
// DLOG << " offset " << offset;
// DLOG << " input_c " << input_c;
// DLOG << " dilation " << dilation;
// DLOG << " input width " << input_width;
// DLOG << " input height " << input_height;
// DLOG << " output width " << output_width;
// DLOG << " output height " << output_height;
// DLOG << " input dim " << param.Input()->dims();
// DLOG << " output dim " << param.Output()->dims();
// DLOG << " filter dim " << param.Filter()->dims();
cl_int status;
if (optimise) {
if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) {
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
int maped_w = maptofactor(w, 4);
status = clSetKernelArg(kernel, 1, sizeof(int), &maped_w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 17, sizeof(int), &w);
CL_CHECK_ERRORS(status);
const size_t work_size[3] = {
static_cast<const uint32_t>(default_work_size.data()[0]),
static_cast<const uint32_t>(maped_w),
static_cast<const uint32_t>(default_work_size.data()[2])};
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel,
default_work_size.size(), NULL, work_size,
NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} else {
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
} else {
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
ConvAddBnRelu(this->cl_helper_, param, true, param.Bias(), param.NewScale(),
param.NewBias());
}
template class ConvAddBNReluKernel<GPU_CL, float>;
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/conv_add_kernel.h"
#include "operators/kernel/cl/cl-kernel-func/conv_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -73,143 +74,7 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
template <>
void ConvAddKernel<GPU_CL, float>::Compute(
const FusionConvAddParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto biase = param.Bias()->GetCLImage();
param.Output()->InitEmptyImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue(),
param.Output()->dims());
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
int output_width = param.Output()->dims()[3];
int output_height = param.Output()->dims()[2];
cl_int status;
if (optimise_convadd && param.Filter()->dims()[2] == 1 &&
param.Filter()->dims()[3] == 1) {
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
int maped_w = maptofactor(w, 4);
status = clSetKernelArg(kernel, 1, sizeof(int), &maped_w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 15, sizeof(int), &w);
CL_CHECK_ERRORS(status);
const size_t work_size[3] = {
static_cast<const uint32_t>(default_work_size.data()[0]),
static_cast<const uint32_t>(maped_w),
static_cast<const uint32_t>(default_work_size.data()[2])};
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel,
default_work_size.size(), NULL, work_size,
NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} else {
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
ConvAddBnRelu(this->cl_helper_, param, false, param.Bias());
}
template class ConvAddKernel<GPU_CL, float>;
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDRELU_OP
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/kernel/cl/cl-kernel-func/conv_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -72,84 +73,7 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
template <>
void ConvAddReluKernel<GPU_CL, float>::Compute(
const FusionConvAddReluParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
DLOG << "---yangfei30---";
DLOG << *param.Filter();
DLOG << param.Paddings();
auto biase = param.Bias()->GetCLImage();
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
int output_width = param.Output()->dims()[3];
int output_height = param.Output()->dims()[2];
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
// cl_event out_event = param.Output()->GetClEvent();
// cl_event wait_event = param.Input()->GetClEvent();
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
ConvAddBnRelu(this->cl_helper_, param, true, param.Bias());
}
template class ConvAddReluKernel<GPU_CL, float>;
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "operators/kernel/conv_bn_add_relu_kernel.h"
#include <cmath>
#include "operators/kernel/cl/cl-kernel-func/conv_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -130,101 +131,7 @@ bool ConvBNAddReluKernel<GPU_CL, float>::Init(
template <>
void ConvBNAddReluKernel<GPU_CL, float>::Compute(
const FusionConvBNAddReluParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto biase = param.Bias()->GetCLImage();
auto new_scale = param.NewScale()->GetCLImage();
auto new_bias = param.NewBias()->GetCLImage();
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
int output_width = param.Output()->dims()[3];
int output_height = param.Output()->dims()[2];
// DLOG << " c block " << c_block;
// DLOG << " w " << w;
// DLOG << " nh " << nh;
// DLOG << " stride " << stride;
// DLOG << " offset " << offset;
// DLOG << " input_c " << input_c;
// DLOG << " dilation " << dilation;
// DLOG << " input width " << input_width;
// DLOG << " input height " << input_height;
// DLOG << " output width " << output_width;
// DLOG << " output height " << output_height;
// DLOG << " input dim " << *param.Input();
// DLOG << " output dim " <<* param.Output();
// DLOG << " filter dim " << *param.Filter();
// DLOG<<*param.Bias();
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
ConvAddBnRelu(this->cl_helper_, param, true, param.Bias(), param.NewScale(), param.NewBias());
}
template class ConvBNAddReluKernel<GPU_CL, float>;
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "operators/kernel/conv_bn_relu_kernel.h"
#include <cmath>
#include "operators/kernel/cl/cl-kernel-func/conv_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -126,81 +127,7 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
template <>
void ConvBNReluKernel<GPU_CL, float>::Compute(
const FusionConvBNReluParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto new_scale = param.NewScale()->GetCLImage();
auto new_bias = param.NewBias()->GetCLImage();
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
int output_width = param.Output()->dims()[3];
int output_height = param.Output()->dims()[2];
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &new_scale);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_bias);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 14, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 15, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
ConvAddBnRelu(this->cl_helper_, param, true, nullptr, param.NewScale(), param.NewBias());
}
template class ConvBNReluKernel<GPU_CL, float>;
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/conv_kernel.h"
#include "operators/kernel/cl/cl-kernel-func/conv_func.h"
namespace paddle_mobile {
namespace operators {
......@@ -66,64 +67,7 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
template <>
void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
int output_width = param.Output()->dims()[3];
int output_height = param.Output()->dims()[2];
cl_int status;
DLOG << " begin set kernel arg ";
DLOG << " c block " << c_block;
DLOG << " w " << w;
DLOG << " nh " << nh;
DLOG << " stride " << stride;
DLOG << " offset " << offset;
DLOG << " input_c " << input_c;
DLOG << " dilation " << dilation;
DLOG << " input width " << input_width;
DLOG << " input height " << input_height;
DLOG << " output width " << output_width;
DLOG << " output height " << output_height;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
// cl_event out_event = param.Output()->GetClEvent();
// cl_event wait_event = param.Input()->GetClEvent();
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
ConvAddBnRelu(this->cl_helper_, param);
}
template class ConvKernel<GPU_CL, float>;
......
......@@ -36,9 +36,6 @@ class ConvAddBNReluKernel
public:
void Compute(const FusionConvAddBNReluParam<DeviceType> &param);
bool Init(FusionConvAddBNReluParam<DeviceType> *param);
inline int maptofactor(int i, int factor) {
return (i + factor - 1) / factor;
}
};
} // namespace operators
......
......@@ -41,9 +41,6 @@ class ConvAddKernel
public:
void Compute(const FusionConvAddParam<DeviceType> &param);
bool Init(FusionConvAddParam<DeviceType> *param);
inline int maptofactor(int i, int factor) {
return (i + factor - 1) / factor;
}
};
} // namespace operators
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册