Unverified commit cf70102b authored by Jiaying Zhao, committed by GitHub

optimize GPU conv performance and structure (#1595)

* optimize GPU conv performance and structure

* add CL macro for test_conv_gpu

* fix build failure
Parent 619acd23
@@ -103,9 +103,9 @@ class CLEngine {
     return std::move(event_ptr);
   }

-  bool BuildProgram(cl_program program) {
+  bool BuildProgram(cl_program program, const std::string &options = "") {
     cl_int status;
-    std::string path = "-cl-fast-relaxed-math -I " +
+    std::string path = options + " -cl-fast-relaxed-math -I " +
                        CLEngine::Instance()->GetCLPath() + "/cl_kernel";
     status = clBuildProgram(program, 0, 0, path.c_str(), 0, 0);
@@ -149,7 +149,7 @@ class CLEngine {
   cl_int status_;
-  std::string cl_path_;
+  std::string cl_path_ = "/data/local/tmp/bin";
   std::unique_ptr<_cl_program, CLProgramDeleter> program_;
   std::unique_ptr<_cl_context, CLContextDeleter> context_ = nullptr;
...
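The `BuildProgram` change threads caller-supplied macro definitions into the flag string handed to `clBuildProgram`. A minimal sketch of that flag assembly, with a hypothetical free function and example values standing in for the engine state:

```cpp
// Sketch only: MakeBuildFlags is a stand-in for the string concatenation
// inside CLEngine::BuildProgram above, not code from this commit.
#include <iostream>
#include <string>

std::string MakeBuildFlags(const std::string &options,
                           const std::string &cl_path) {
  // Caller macros (e.g. "-DBIASE -DRELU") come first, then the fast-math
  // flag and the include path for the cl_kernel directory.
  return options + " -cl-fast-relaxed-math -I " + cl_path + "/cl_kernel";
}

int main() {
  std::cout << MakeBuildFlags("-DBIASE -DRELU", "/data/local/tmp/bin")
            << std::endl;
  // -DBIASE -DRELU -cl-fast-relaxed-math -I /data/local/tmp/bin/cl_kernel
}
```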
@@ -32,9 +32,10 @@ class CLHelper {
   explicit CLHelper(CLScope *scope) : scope_(scope) {}

-  void AddKernel(const std::string &kernel_name, const std::string &file_name) {
+  void AddKernel(const std::string &kernel_name, const std::string &file_name,
+                 const std::string &options = "") {
     DLOG << " begin add kernel ";
-    auto kernel = scope_->GetKernel(kernel_name, file_name);
+    auto kernel = scope_->GetKernel(kernel_name, file_name, options);
     DLOG << " add kernel ing ";
     kernels.emplace_back(std::move(kernel));
   }
...
@@ -146,6 +146,20 @@ class CLImage {
     DLOG << " end init cl image";
   }

+  void InitEmpty(cl_context context, cl_command_queue command_queue,
+                 const DDim &image_dims) {
+    DLOG << " to get image dims ";
+    image_dims_ = image_dims;
+    DLOG << " end get image dims " << image_dims_;
+    InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
+    command_queue_ = command_queue;
+    cl_event_ = CLEngine::Instance()->CreateEvent(context);
+    initialized_ = true;
+    DLOG << " end init cl image";
+  }
+
   cl_mem GetCLImage() const { return cl_image_.get(); }
   const DDim &ImageDims() const { return image_dims_; }
...
@@ -426,5 +426,25 @@ void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor,
   default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim);
 }

+const DDim &CLImageConverterWinoTransWeight::InitImageDimInfoWith(
+    const DDim &tensor_dim) {
+  PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
+  size_t N, C, H, W;
+  N = tensor_dim[0];
+  C = tensor_dim[1];
+  H = tensor_dim[2];
+  W = tensor_dim[3];
+  size_t width = (C + 3) / 4;
+  size_t height = N * 16;  // N * (wino_blk_size + 2) * (wino_blk_size + 2)
+  return make_ddim({width, height});
+}
+
+void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, half_t *image,
+                                                  const DDim &tensor_dim) {}
+
+void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image, float *tensor,
+                                                  const DDim &image_dim,
+                                                  const DDim &tensor_dim) {}
+
 }  // namespace framework
 }  // namespace paddle_mobile
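The `N * 16` follows from the transform size the comment names: with `wino_blk_size = 2`, each 3x3 filter expands to a (2 + 3 - 1) x (2 + 3 - 1) = 4 x 4 = 16-value tile, and each image texel packs 4 channels, giving `(C + 3) / 4` columns. A small sketch of the sizing rule, with example filter dims chosen only for illustration:

```cpp
// Sketch of CLImageConverterWinoTransWeight::InitImageDimInfoWith's sizing
// rule; the concrete N and C here are illustrative, not from the commit.
#include <cstddef>
#include <iostream>

int main() {
  size_t N = 24, C = 12;       // filter output / input channels
  size_t width = (C + 3) / 4;  // ceil(C / 4): 4 channels per RGBA texel
  size_t height = N * 16;      // 16 transformed values per 3x3 filter
  std::cout << width << " x " << height << std::endl;  // prints "3 x 384"
}
```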
@@ -101,5 +101,13 @@ class CLImageConverterDWBlock : public CLImageConverterBase {
                    const DDim &tensor_dim);
 };

+class CLImageConverterWinoTransWeight : public CLImageConverterBase {
+ public:
+  const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
+  void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
+  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+                   const DDim &tensor_dim);
+};
+
 }  // namespace framework
 }  // namespace paddle_mobile
@@ -37,9 +37,10 @@ class CLScope {
   cl_command_queue CommandQueue() { return command_queue_; }

   std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel(
-      const std::string &kernel_name, const std::string &file_name) {
+      const std::string &kernel_name, const std::string &file_name,
+      const std::string &options) {
     DLOG << " to get program " << file_name;
-    auto program = Program(file_name);
+    auto program = Program(file_name, options);
     DLOG << " end get program ~ ";
     DLOG << " to create kernel: " << kernel_name;
     std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel(
@@ -51,8 +52,12 @@ class CLScope {
   cl_context Context() { return context_; }

-  cl_program Program(const std::string &file_name) {
-    auto it = programs_.find(file_name);
+  cl_program Program(const std::string &file_name, const std::string &options) {
+    std::string program_key = file_name;
+    if (!options.empty()) {
+      program_key += options;
+    }
+    auto it = programs_.find(program_key);
     if (it != programs_.end()) {
       return it->second.get();
     }
@@ -61,13 +66,13 @@ class CLScope {
         context_,
         CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name);

-    DLOG << " --- begin build program -> " << file_name << " --- ";
-    CLEngine::Instance()->BuildProgram(program.get());
-    DLOG << " --- end build program -> " << file_name << " --- ";
+    DLOG << " --- begin build program -> " << program_key << " --- ";
+    CLEngine::Instance()->BuildProgram(program.get(), options);
+    DLOG << " --- end build program -> " << program_key << " --- ";

-    programs_[file_name] = std::move(program);
-    return programs_[file_name].get();
+    programs_[program_key] = std::move(program);
+    return programs_[program_key].get();
   }

  private:
...
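Because one source file can now be built into several distinct program variants, `Program` keys its cache on file name plus options; keying on the file name alone would hand a program built with one macro set to a caller asking for another. A toy sketch of the lookup, with an `int` standing in for the `cl_program` handle:

```cpp
// Sketch only: mirrors the program_key logic in CLScope::Program above.
#include <iostream>
#include <map>
#include <string>

std::map<std::string, int> programs_;  // int stands in for cl_program

int BuildOrReuse(const std::string &file_name, const std::string &options) {
  std::string program_key = file_name + options;  // empty options keep old key
  auto it = programs_.find(program_key);
  if (it != programs_.end()) return it->second;  // cache hit: reuse the build
  static int next_id = 0;
  programs_[program_key] = ++next_id;  // "build" and cache the new variant
  return programs_[program_key];
}

int main() {
  // Two variants of the same file stay distinct; keying on the file name
  // alone would return 1 for both calls.
  std::cout << BuildOrReuse("conv_kernel.cl", "-DBIASE") << " "
            << BuildOrReuse("conv_kernel.cl", "-DBIASE -DRELU") << std::endl;
}
```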
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "operators/kernel/cl/cl-kernel-func/conv_func.h"
+#include <vector>
 #include "framework/cl/cl_image_converter.h"
 #include "framework/cl/cl_tensor.h"
@@ -20,19 +21,23 @@ namespace paddle_mobile {
 namespace operators {

 template <>
-void winograd_transform_weight<4, 3>(framework::CLHelper &cl_helper,
-                                     framework::CLImage &weight){};
+void winograd_transform_weight<4, 3>(framework::CLHelper *cl_helper,
+                                     framework::CLImage *weight) {}

 template <>
-void WinogradConv3x3<4, 3>(framework::CLHelper &cl_helper,
-                           const ConvParam<GPU_CL> &param) {}
+void WinogradConv3x3<4, 3>(framework::CLHelper *cl_helper,
+                           const ConvParam<GPU_CL> &param, bool ifRelu,
+                           const framework::CLImage *biase,
+                           const framework::CLImage *new_scale,
+                           const framework::CLImage *new_bias) {}

-void ConvAddBnRelu(framework::CLHelper &cl_helper,
+void ConvAddBnRelu(framework::CLHelper *cl_helper,
                    const ConvParam<GPU_CL> &param, bool ifRelu,
-                   const CLImage *biase, const CLImage *new_scale,
-                   const CLImage *new_bias) {
-  auto kernel = cl_helper.KernelAt(0);
-  auto default_work_size = cl_helper.DefaultWorkSize(*param.Output());
+                   const framework::CLImage *biase,
+                   const framework::CLImage *new_scale,
+                   const framework::CLImage *new_bias) {
+  auto kernel = cl_helper->KernelAt(0);
+  auto default_work_size = cl_helper->DefaultWorkSize(*param.Output());
   int c_block = default_work_size[0];
   int w = default_work_size[1];
   int nh = default_work_size[2];
@@ -137,7 +142,7 @@ void ConvAddBnRelu(framework::CLHelper &cl_helper,
         static_cast<const uint32_t>(maped_w),
         static_cast<const uint32_t>(default_work_size.data()[2])};

-    status = clEnqueueNDRangeKernel(cl_helper.CLCommandQueue(), kernel,
+    status = clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel,
                                     default_work_size.size(), NULL, work_size,
                                     NULL, 0, NULL, NULL);
     CL_CHECK_ERRORS(status);
@@ -201,7 +206,7 @@ void ConvAddBnRelu(framework::CLHelper &cl_helper,
   CL_CHECK_ERRORS(status);

   status = clEnqueueNDRangeKernel(
-      cl_helper.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
       default_work_size.data(), NULL, 0, NULL, NULL);
   CL_CHECK_ERRORS(status);
 }
...
@@ -22,23 +22,24 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {

-using namespace framework;
-
 inline int maptofactor(int i, int factor) { return (i + factor - 1) / factor; }

 template <int tile, int kernel>
-void winograd_transform_weight(framework::CLHelper &cl_helper,
-                               framework::CLImage &weight);
+void winograd_transform_weight(framework::CLHelper *cl_helper,
+                               framework::CLImage *weight);

 template <int tile, int kernel>
-void WinogradConv3x3(framework::CLHelper &cl_helper,
-                     const ConvParam<GPU_CL> &param);
+void WinogradConv3x3(framework::CLHelper *cl_helper,
+                     const ConvParam<GPU_CL> &param, bool ifRelu = false,
+                     const framework::CLImage *biase = nullptr,
+                     const framework::CLImage *new_scale = nullptr,
+                     const framework::CLImage *new_bias = nullptr);

-void ConvAddBnRelu(framework::CLHelper &cl_helper,
+void ConvAddBnRelu(framework::CLHelper *cl_helper,
                    const ConvParam<GPU_CL> &param, bool ifRelu = false,
-                   const CLImage *biase = nullptr,
-                   const CLImage *new_scale = nullptr,
-                   const CLImage *new_bias = nullptr);
+                   const framework::CLImage *biase = nullptr,
+                   const framework::CLImage *new_scale = nullptr,
+                   const framework::CLImage *new_bias = nullptr);

 }  // namespace operators
 }  // namespace paddle_mobile
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define BIASE
#define BATCH_NORM
#define RELU
#include "conv_kernel.inc.cl"
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define BIASE
#include "conv_kernel.inc.cl"
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define BIASE
#define RELU
#include "conv_kernel.inc.cl"
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define BATCH_NORM
#define BIASE
#define RELU
#include "conv_kernel.inc.cl"
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define BATCH_NORM
#define RELU
#include "conv_kernel.inc.cl"
@@ -126,6 +126,10 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
   param->SetOffset(offset);

+  const std::string conv_kernel_file = "conv_kernel.cl";
+  const std::string wino_kernel_file = "winograd_transform.cl";
+  const std::string build_options = "-DBIASE -DBATCH_NORM -DRELU";
+
   /*
   if (param->Filter()->dims()[2] == 1 &&
       param->Filter()->dims()[3] == 1 &&
@@ -137,26 +141,44 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
   }
   */
   if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT;
     param->Filter()->InitNImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_bn_relu_kernel.cl");
-    DLOG << " conv add bn relu conv 1x1";
+    this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options);
   } else if (param->Filter()->dims()[1] == 1 &&
              param->Input()->dims()[1] == param->Output()->dims()[1] &&
              param->Filter()->dims()[2] == 3) {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
     param->Filter()->InitDWImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
-    DLOG << " conv add bn relu depth_conv_3x3";
+    this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
+                               build_options);
   } else if (param->Filter()->dims()[2] == 3 &&
              param->Filter()->dims()[3] == 3) {
+    // if (param->Strides()[0] == param->Strides()[1] &&
+    //     param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) {
+    //   param->ExecMode() = ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT;
+    //   this->cl_helper_.AddKernel("winograd_filter_transform_2x2",
+    //                              wino_kernel_file, build_options);
+    //   this->cl_helper_.AddKernel("winograd_input_transform_2x2",
+    //                              wino_kernel_file, build_options);
+    //   this->cl_helper_.AddKernel("matmul", "matmul.cl");
+    //   this->cl_helper_.AddKernel("winograd_output_transform_2x2",
+    //                              wino_kernel_file, build_options);
+    //
+    //   winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
+    //
+    // } else {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
     param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
-    DLOG << " conv add bn relu conv_3x3";
+    this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options);
+    // }
   } else {
     PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
   }
@@ -167,8 +189,21 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
 template <>
 void ConvAddBNReluKernel<GPU_CL, float>::Compute(
     const FusionConvAddBNReluParam<GPU_CL> &param) {
-  ConvAddBnRelu(this->cl_helper_, param, true, param.Bias(), param.NewScale(),
-                param.NewBias());
+  switch (param.ExecMode()) {
+    case ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT:
+      WinogradConv3x3<4, 3>(&this->cl_helper_, param, true, param.Bias(),
+                            param.NewScale(), param.NewBias());
+      break;
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+      ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(),
+                    param.NewScale(), param.NewBias());
+      break;
+    default:
+      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
+                                    param.ExecMode());
+  }
 }

 template class ConvAddBNReluKernel<GPU_CL, float>;
...
@@ -33,34 +33,65 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
                    static_cast<int>(param->Paddings()[1]);
   param->SetOffset(offset);

+  const std::string conv_kernel_file = "conv_kernel.cl";
+  const std::string wino_kernel_file = "winograd_transform.cl";
+  const std::string build_options = "-DBIASE";
+
   if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT;
     param->Filter()->InitNImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_kernel.cl");
+    this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options);
   } else if (param->Filter()->dims()[1] == 1 &&
              param->Input()->dims()[1] == param->Output()->dims()[1] &&
              param->Filter()->dims()[2] == 3) {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
     param->Filter()->InitDWImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_kernel.cl");
+    this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
+                               build_options);
   } else if (param->Filter()->dims()[2] == 3 &&
              param->Filter()->dims()[3] == 3) {
+    // if (param->Strides()[0] == param->Strides()[1] &&
+    //     param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) {
+    //   param->ExecMode() = ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT;
+    //   this->cl_helper_.AddKernel("winograd_filter_transform_2x2",
+    //                              wino_kernel_file, build_options);
+    //   this->cl_helper_.AddKernel("winograd_input_transform_2x2",
+    //                              wino_kernel_file, build_options);
+    //   this->cl_helper_.AddKernel("matmul", "matmul.cl");
+    //   this->cl_helper_.AddKernel("winograd_output_transform_2x2",
+    //                              wino_kernel_file, build_options);
+    //
+    //   winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
+    //
+    // } else {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
     param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_3x3", "conv_add_kernel.cl");
+    this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options);
+    // }
   } else if (param->Filter()->dims()[2] == 7 &&
              param->Filter()->dims()[3] == 7) {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT;
     param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_7x7", "conv_add_kernel.cl");
+    this->cl_helper_.AddKernel("conv_7x7", conv_kernel_file, build_options);
   } else if (param->Filter()->dims()[2] == 5 &&
              param->Filter()->dims()[3] == 5) {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT;
     param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_5x5", "conv_add_kernel.cl");
+    this->cl_helper_.AddKernel("conv_5x5", conv_kernel_file, build_options);
   }

   return true;
@@ -69,7 +100,21 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
 template <>
 void ConvAddKernel<GPU_CL, float>::Compute(
     const FusionConvAddParam<GPU_CL> &param) {
-  ConvAddBnRelu(this->cl_helper_, param, false, param.Bias());
+  switch (param.ExecMode()) {
+    case ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT:
+      WinogradConv3x3<4, 3>(&this->cl_helper_, param, false, param.Bias());
+      break;
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+      ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
+      break;
+    default:
+      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
+                                    param.ExecMode());
+  }
 }

 template class ConvAddKernel<GPU_CL, float>;
...
@@ -34,35 +34,66 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
                    static_cast<int>(param->Paddings()[1]);
   param->SetOffset(offset);

+  const std::string conv_kernel_file = "conv_kernel.cl";
+  const std::string wino_kernel_file = "winograd_transform.cl";
+  const std::string build_options = "-DBIASE -DRELU";
+
   if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT;
     param->Filter()->InitNImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_relu_kernel.cl");
+    this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options);
   } else if (param->Filter()->dims()[1] == 1 &&
              param->Input()->dims()[1] == param->Output()->dims()[1] &&
              param->Filter()->dims()[2] == 3) {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
     param->Filter()->InitDWImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_relu_kernel.cl");
+    this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
+                               build_options);
   } else if (param->Filter()->dims()[2] == 3 &&
              param->Filter()->dims()[3] == 3) {
+    // if (param->Strides()[0] == param->Strides()[1] &&
+    //     param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) {
+    //   param->ExecMode() = ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT;
+    //   this->cl_helper_.AddKernel("winograd_filter_transform_2x2",
+    //                              wino_kernel_file, build_options);
+    //   this->cl_helper_.AddKernel("winograd_input_transform_2x2",
+    //                              wino_kernel_file, build_options);
+    //   this->cl_helper_.AddKernel("matmul", "matmul.cl");
+    //   this->cl_helper_.AddKernel("winograd_output_transform_2x2",
+    //                              wino_kernel_file, build_options);
+    //
+    //   winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
+    //
+    // } else {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
     param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_3x3", "conv_add_relu_kernel.cl");
+    this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options);
+    // }
   } else if (param->Filter()->dims()[2] == 7 &&
              param->Filter()->dims()[3] == 7) {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT;
     param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_7x7", "conv_add_relu_kernel.cl");
+    this->cl_helper_.AddKernel("conv_7x7", conv_kernel_file, build_options);
   } else if (param->Filter()->dims()[2] == 5 &&
              param->Filter()->dims()[3] == 5) {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT;
     param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_5x5", "conv_add_relu_kernel.cl");
+    this->cl_helper_.AddKernel("conv_5x5", conv_kernel_file, build_options);
   } else {
     PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
   }
@@ -73,7 +104,21 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
 template <>
 void ConvAddReluKernel<GPU_CL, float>::Compute(
     const FusionConvAddReluParam<GPU_CL> &param) {
-  ConvAddBnRelu(this->cl_helper_, param, true, param.Bias());
+  switch (param.ExecMode()) {
+    case ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT:
+      WinogradConv3x3<4, 3>(&this->cl_helper_, param, true, param.Bias());
+      break;
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+      ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias());
+      break;
+    default:
+      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
+                                    param.ExecMode());
+  }
 }

 template class ConvAddReluKernel<GPU_CL, float>;
...
@@ -100,28 +100,51 @@ bool ConvBNAddReluKernel<GPU_CL, float>::Init(
   param->SetOffset(offset);

+  const std::string conv_kernel_file = "conv_kernel.cl";
+  const std::string wino_kernel_file = "winograd_transform.cl";
+  const std::string build_options = "-DBIASE -DBATCH_NORM -DRELU";
+
   if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT;
     param->Filter()->InitNImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("convBNAdd_1x1_spl",
-                               "conv_bn_add_relu_kernel.cl");
-    DLOG << " conv bn add relu conv 1x1";
+    this->cl_helper_.AddKernel("convBNAdd_1x1_spl", conv_kernel_file,
+                               build_options);
   } else if (param->Filter()->dims()[1] == 1 &&
              param->Input()->dims()[1] == param->Output()->dims()[1] &&
              param->Filter()->dims()[2] == 3) {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
     param->Filter()->InitDWImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("depth_convBNAdd_3x3",
-                               "conv_bn_add_relu_kernel.cl");
-    DLOG << " conv bn add relu depth_conv_3x3";
+    this->cl_helper_.AddKernel("depth_convBNAdd_3x3", conv_kernel_file,
+                               build_options);
   } else if (param->Filter()->dims()[2] == 3 &&
              param->Filter()->dims()[3] == 3) {
+    // if (param->Strides()[0] == param->Strides()[1] &&
+    //     param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) {
+    //   param->ExecMode() = ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT;
+    //   this->cl_helper_.AddKernel("winograd_filter_transform_2x2",
+    //                              wino_kernel_file, build_options);
+    //   this->cl_helper_.AddKernel("winograd_input_transform_2x2",
+    //                              wino_kernel_file, build_options);
+    //   this->cl_helper_.AddKernel("matmul", "matmul.cl");
+    //   this->cl_helper_.AddKernel("winograd_output_transform_2x2_bn_add",
+    //                              wino_kernel_file, build_options);
+    //
+    //   winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
+    //
+    // } else {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
     param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("convBNAdd_3x3", "conv_bn_add_relu_kernel.cl");
-    DLOG << " conv bn add relu conv_3x3";
+    this->cl_helper_.AddKernel("convBNAdd_3x3", conv_kernel_file,
+                               build_options);
+    // }
   } else {
     PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
   }
@@ -132,8 +155,21 @@ bool ConvBNAddReluKernel<GPU_CL, float>::Init(
 template <>
 void ConvBNAddReluKernel<GPU_CL, float>::Compute(
     const FusionConvBNAddReluParam<GPU_CL> &param) {
-  ConvAddBnRelu(this->cl_helper_, param, true, param.Bias(), param.NewScale(),
-                param.NewBias());
+  switch (param.ExecMode()) {
+    case ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT:
+      WinogradConv3x3<4, 3>(&this->cl_helper_, param, true, param.Bias(),
+                            param.NewScale(), param.NewBias());
+      break;
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+      ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(),
+                    param.NewScale(), param.NewBias());
+      break;
+    default:
+      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
+                                    param.ExecMode());
+  }
 }

 template class ConvBNAddReluKernel<GPU_CL, float>;
...
@@ -98,26 +98,49 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
   param->SetOffset(offset);

+  const std::string conv_kernel_file = "conv_kernel.cl";
+  const std::string wino_kernel_file = "winograd_transform.cl";
+  const std::string build_options = "-DBATCH_NORM -DRELU";
+
   if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT;
     param->Filter()->InitNImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_1x1_spl", "conv_bn_relu_kernel.cl");
-    DLOG << " conv bn relu conv 1x1";
+    this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options);
   } else if (param->Filter()->dims()[1] == 1 &&
              param->Input()->dims()[1] == param->Output()->dims()[1] &&
              param->Filter()->dims()[2] == 3) {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
     param->Filter()->InitDWImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_bn_relu_kernel.cl");
-    DLOG << " conv bn relu depth_conv_3x3";
+    this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
+                               build_options);
   } else if (param->Filter()->dims()[2] == 3 &&
              param->Filter()->dims()[3] == 3) {
+    // if (param->Strides()[0] == param->Strides()[1] &&
+    //     param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) {
+    //   param->ExecMode() = ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT;
+    //   this->cl_helper_.AddKernel("winograd_filter_transform_2x2",
+    //                              wino_kernel_file, build_options);
+    //   this->cl_helper_.AddKernel("winograd_input_transform_2x2",
+    //                              wino_kernel_file, build_options);
+    //   this->cl_helper_.AddKernel("matmul", "matmul.cl");
+    //   this->cl_helper_.AddKernel("winograd_output_transform_2x2",
+    //                              wino_kernel_file, build_options);
+    //
+    //   winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
+    //
+    // } else {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
     param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_3x3", "conv_bn_relu_kernel.cl");
-    DLOG << " conv bn relu conv_3x3";
+    this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options);
+    // }
   } else {
     PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
   }
@@ -127,8 +150,21 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
 template <>
 void ConvBNReluKernel<GPU_CL, float>::Compute(
     const FusionConvBNReluParam<GPU_CL> &param) {
-  ConvAddBnRelu(this->cl_helper_, param, true, nullptr, param.NewScale(),
-                param.NewBias());
+  switch (param.ExecMode()) {
+    case ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT:
+      WinogradConv3x3<4, 3>(&this->cl_helper_, param, true, nullptr,
+                            param.NewScale(), param.NewBias());
+      break;
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+      ConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(),
+                    param.NewBias());
+      break;
+    default:
+      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
+                                    param.ExecMode());
+  }
 }

 template class ConvBNReluKernel<GPU_CL, float>;
...
@@ -37,25 +37,49 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
   DLOG << " height of one block: " << param->Filter()->dims()[2];
   DLOG << " filter dims: " << param->Filter()->dims();

+  const std::string conv_kernel_file = "conv_kernel.cl";
+  const std::string wino_kernel_file = "winograd_transform.cl";
+
   if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT;
     param->Filter()->InitNImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_1x1_spl", "conv_kernel.cl");
+    this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file);
     DLOG << "conv 1x1";
   } else if (param->Filter()->dims()[1] == 1 &&
              param->Input()->dims()[1] == param->Output()->dims()[1] &&
              param->Filter()->dims()[2] == 3) {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
     param->Filter()->InitDWImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("depth_conv_3x3", "depthwise_conv_kernel.cl");
+    this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file);
     DLOG << "depth_conv 3x3";
   } else if (param->Filter()->dims()[2] == 3 &&
              param->Filter()->dims()[3] == 3) {
+    // if (param->Strides()[0] == param->Strides()[1] &&
+    //     param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) {
+    //   param->ExecMode() = ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT;
+    //   this->cl_helper_.AddKernel("winograd_filter_transform_2x2",
+    //                              wino_kernel_file);
+    //   this->cl_helper_.AddKernel("winograd_input_transform_2x2",
+    //                              wino_kernel_file);
+    //   this->cl_helper_.AddKernel("matmul", "matmul.cl");
+    //   this->cl_helper_.AddKernel("winograd_output_transform_2x2",
+    //                              wino_kernel_file);
+    //
+    //   winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
+    //
+    // } else {
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
     param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl");
+    this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file);
+    // }
     DLOG << "conv 3x3";
   } else {
@@ -67,7 +91,19 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
 template <>
 void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
-  ConvAddBnRelu(this->cl_helper_, param);
+  switch (param.ExecMode()) {
+    case ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT:
+      WinogradConv3x3<4, 3>(&this->cl_helper_, param);
+      break;
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+      ConvAddBnRelu(&this->cl_helper_, param);
+      break;
+    default:
+      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
+                                    param.ExecMode());
+  }
 }

 template class ConvKernel<GPU_CL, float>;
...
@@ -479,6 +479,11 @@ class ConvParam : public OpParam {
     EXEC_DEPTHWISE5x5_INT8,
     EXEC_SLIDINGWINDOW3x3S1_FLOAT,
     EXEC_SLIDINGWINDOW3x3S2_FLOAT,
+    EXEC_DEPTHWISE3x3_FLOAT,
+    EXEC_SLIDINGWINDOW1x1_FLOAT,
+    EXEC_SLIDINGWINDOW3x3_FLOAT,
+    EXEC_SLIDINGWINDOW5x5_FLOAT,
+    EXEC_SLIDINGWINDOW7x7_FLOAT,
   };

   ExecMode &ExecMode() const { return exec_mode_; }
...
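These values let each kernel's `Init()` (above) record the dispatch decision once, so `Compute()` can switch on it instead of calling `ConvAddBnRelu` unconditionally. A condensed sketch of that split, with a trimmed enum and only the 1x1 selection rule shown:

```cpp
// Sketch only: the real selection in Init() also checks depthwise, 5x5,
// and 7x7 shapes, and the Winograd branch is still commented out.
#include <iostream>

enum ExecMode {
  EXEC_SLIDINGWINDOW1x1_FLOAT,
  EXEC_SLIDINGWINDOW3x3_FLOAT,
  EXEC_WINOGRAD3X3_FLOAT,
};

ExecMode ChooseMode(int kernel_h, int kernel_w) {
  if (kernel_h == 1 && kernel_w == 1) return EXEC_SLIDINGWINDOW1x1_FLOAT;
  return EXEC_SLIDINGWINDOW3x3_FLOAT;
}

int main() {
  switch (ChooseMode(3, 3)) {  // Init() stores this; Compute() switches on it
    case EXEC_SLIDINGWINDOW1x1_FLOAT:
    case EXEC_SLIDINGWINDOW3x3_FLOAT:
      std::cout << "ConvAddBnRelu path" << std::endl;
      break;
    case EXEC_WINOGRAD3X3_FLOAT:
      std::cout << "WinogradConv3x3 path" << std::endl;
      break;
  }
}
```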
@@ -494,4 +494,7 @@ if (NOT FOUND_MATCH)
     ADD_EXECUTABLE(test-dwconv-bn-relu-op operators/test_dwconv_bn_relu_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-dwconv-bn-relu-op paddle-mobile)

+    ADD_EXECUTABLE(test-conv-gpu operators/test_conv_gpu.cpp test_helper.h test_include.h)
+    target_link_libraries(test-conv-gpu paddle-mobile)
+
 endif ()
@@ -21,7 +21,10 @@ int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
 #endif

-#ifdef PADDLE_MOBILE_CPU
+#ifdef PADDLE_MOBILE_CL
+  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile;
+  paddle_mobile.SetCLPath("/data/local/tmp/bin");
+#else
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
 #endif
   paddle_mobile.SetThreadNum(4);
@@ -38,13 +41,13 @@ int main() {
                            input_tensor.data<float>() + input_tensor.numel());
 #ifndef PADDLE_MOBILE_FPGA
   // warm up ten times
-  for (int i = 0; i < 10; ++i) {
-    paddle_mobile.Predict(input, dims);
-  }
+  //  for (int i = 0; i < 10; ++i) {
+  //    paddle_mobile.Predict(input, dims);
+  //  }
   auto time3 = time();
-  for (int i = 0; i < 10; ++i) {
+  //  for (int i = 0; i < 10; ++i) {
     paddle_mobile.Predict(input, dims);
-  }
+  //  }
   auto time4 = time();
   std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
             << std::endl;
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CL
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
#include "common/common.h"
#include "framework/cl/cl_helper.h"
#include "framework/cl/cl_image.h"
#include "operators/conv_op.h"
#include "operators/kernel/cl/cl-kernel-func/conv_func.h"
namespace paddle_mobile {
template <typename Itype, typename Otype, int Kernel, int Pad, int Stride>
int TestConvOp(int in_channels, int in_height, int in_width, int out_channels,
int groups) {
int kernel_h = Kernel;
int kernel_w = Kernel;
int pad_h = Pad;
int pad_w = Pad;
int stride_h = Stride;
int stride_w = Stride;
int dilation_h = 1;
int dilation_w = 1;
int batch_size = 1;
int input_c = in_channels;
int input_h = in_height;
int input_w = in_width;
int output_c = out_channels;
framework::DDim input_shape =
framework::make_ddim({batch_size, input_c, input_h, input_w});
framework::DDim filter_shape =
framework::make_ddim({output_c, input_c / groups, kernel_h, kernel_w});
// std::cerr << " init " << std::endl;
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<framework::Scope>();
inputs["Input"] = std::vector<std::string>({"input"});
inputs["Filter"] = std::vector<std::string>({"filter"});
outputs["Output"] = std::vector<std::string>({"output"});
cl_context context = scope->GetCLScpoe()->Context();
cl_command_queue command_queue = scope->GetCLScpoe()->CommandQueue();
// std::cerr << " input " << std::endl;
auto input_var = scope.get()->Var("input");
auto input = input_var->template GetMutable<framework::CLImage>();
const int in_numel = framework::product(input_shape);
float *in_data = new float[in_numel];
for (int i = 0; i < in_numel; ++i) {
in_data[i] = (i % 36 / 6) + 1;
}
input->SetTensorData(in_data, input_shape);
input->InitNormalCLImage(context, command_queue);
DLOG << "input image \n" << *input;
// std::cerr << " filter " << std::endl;
auto filter_var = scope.get()->Var("filter");
auto filter = filter_var->template GetMutable<framework::CLImage>();
const int filter_numel = product(filter_shape);
float *filter_data = new float[filter_numel];
for (int i = 0; i < filter_numel; ++i) {
filter_data[i] = i % 9;
}
filter->SetTensorData(filter_data, filter_shape);
// std::cerr << " attrs " << std::endl;
framework::AttributeMap attrs;
attrs["strides"].Set<vector<int>>(std::vector<int>({stride_h, stride_w}));
attrs["paddings"].Set<vector<int>>(std::vector<int>({pad_h, pad_w}));
attrs["dilations"].Set<vector<int>>(
std::vector<int>({dilation_h, dilation_w}));
attrs["groups"].Set<int>(groups);
std::cerr << " output " << std::endl;
auto output_var = scope.get()->Var("output");
auto output = output_var->template GetMutable<framework::CLImage>();
auto *op = new operators::ConvOp<GPU_CL, float>("conv2d", inputs, outputs,
attrs, scope.get());
op->InferShape();
framework::DDim ddim = output->dims();
DLOG << "output dims = " << ddim;
output->InitEmptyImage(context, command_queue, ddim);
// std::cerr << " op->init " << std::endl;
op->Init();
auto time1 = time();
op->Run();
auto time2 = time();
std::cerr << "time cost : " << time_diff(time1, time2) << std::endl;
delete op;
return 0;
}
} // namespace paddle_mobile
int TestAll(const int in_channels, const int in_height, const int in_width,
const int out_channels, const int groups) {
std::cerr << "in_channels=" << in_channels << ", in_height=" << in_height
<< ", in_width=" << in_width << ", out_channels=" << out_channels
<< ", groups=" << groups << std::endl;
std::cerr << "float, kernel=3, pad=1, stride=1" << std::endl;
paddle_mobile::TestConvOp<float, float, 3, 1, 1>(
in_channels, in_height, in_width, out_channels, groups);
return 0;
}
#endif
int main() {
// TestAll(4, 6, 6, 4, 1);
// TestAll(6, 32, 32, 24, 1);
// TestAll(12, 32, 32, 24, 1);
// TestAll(24, 32, 32, 24, 1);
// TestAll(36, 32, 32, 24, 1);
// TestAll(48, 32, 32, 24, 1);
// TestAll(60, 32, 32, 24, 1);
// TestAll(72, 32, 32, 24, 1);
// TestAll(116, 32, 32, 24, 1);
// TestAll(232, 32, 32, 24, 1);
// TestAll(464, 32, 32, 24, 1);
//
// TestAll(6, 64, 64, 24, 1);
// TestAll(12, 64, 64, 24, 1);
// TestAll(24, 64, 64, 24, 1);
// TestAll(36, 64, 64, 24, 1);
// TestAll(48, 64, 64, 24, 1);
// TestAll(60, 64, 64, 24, 1);
// TestAll(72, 64, 64, 24, 1);
// TestAll(116, 64, 64, 24, 1);
// TestAll(232, 64, 64, 24, 1);
// TestAll(464, 64, 64, 24, 1);
//
// TestAll(6, 128, 128, 24, 1);
// TestAll(12, 128, 128, 24, 1);
// TestAll(24, 128, 128, 24, 1);
// TestAll(36, 128, 128, 24, 1);
// TestAll(48, 128, 128, 24, 1);
// TestAll(60, 128, 128, 24, 1);
// TestAll(72, 128, 128, 24, 1);
// TestAll(116, 128, 128, 24, 1);
// TestAll(232, 128, 128, 24, 1);
// TestAll(464, 128, 128, 24, 1);
//
//
// TestAll(6, 32, 32, 6, 1);
// TestAll(12, 32, 32, 12, 1);
// TestAll(24, 32, 32, 24, 1);
// TestAll(36, 32, 32, 36, 1);
// TestAll(48, 32, 32, 48, 1);
// TestAll(60, 32, 32, 60, 1);
// TestAll(72, 32, 32, 72, 1);
// TestAll(116, 32, 32, 116, 1);
// TestAll(232, 32, 32, 232, 1);
// TestAll(464, 32, 32, 464, 1);
//
// TestAll(6, 64, 64, 6, 1);
// TestAll(12, 64, 64, 12, 1);
// TestAll(24, 64, 64, 24, 1);
// TestAll(36, 64, 64, 36, 1);
// TestAll(48, 64, 64, 48, 1);
// TestAll(60, 64, 64, 60, 1);
// TestAll(72, 64, 64, 72, 1);
// TestAll(116, 64, 64, 116, 1);
// TestAll(232, 64, 64, 232, 1);
// TestAll(464, 64, 64, 464, 1);
//
// TestAll(6, 128, 128, 6, 1);
// TestAll(12, 128, 128, 12, 1);
// TestAll(24, 128, 128, 24, 1);
// TestAll(36, 128, 128, 36, 1);
// TestAll(48, 128, 128, 48, 1);
// TestAll(60, 128, 128, 60, 1);
// TestAll(72, 128, 128, 72, 1);
// TestAll(116, 128, 128, 116, 1);
// TestAll(232, 128, 128, 232, 1);
// TestAll(464, 128, 128, 464, 1);
return 0;
}