提交 13ab061f 编写于 作者: L liuruilong

update conv kernel

上级 27c4c43c
...@@ -16,7 +16,6 @@ file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm) ...@@ -16,7 +16,6 @@ file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h) file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
include_directories(src/) include_directories(src/)
if(IS_IOS) if(IS_IOS)
set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
else() else()
......
...@@ -16,6 +16,9 @@ limitations under the License. */ ...@@ -16,6 +16,9 @@ limitations under the License. */
#include "framework/cl/cl_half.h" #include "framework/cl/cl_half.h"
namespace paddle_mobile {
namespace framework {
static const uint32_t mantissatable[2048] = { static const uint32_t mantissatable[2048] = {
0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000, 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000,
0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000,
...@@ -510,3 +513,6 @@ void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) { ...@@ -510,3 +513,6 @@ void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) {
f_array[i] = Half2Float(h_array[i]); f_array[i] = Half2Float(h_array[i]);
} }
} }
} // namespace framework
} // namespace paddle_mobile
...@@ -15,6 +15,9 @@ limitations under the License. */ ...@@ -15,6 +15,9 @@ limitations under the License. */
#pragma once #pragma once
#include <cstdint> #include <cstdint>
namespace paddle_mobile {
namespace framework {
typedef uint16_t half_t; typedef uint16_t half_t;
half_t Float2Half(float f); half_t Float2Half(float f);
...@@ -24,3 +27,6 @@ float Half2Float(half_t h); ...@@ -24,3 +27,6 @@ float Half2Float(half_t h);
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count); void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count); void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
} // namespace framework
} // namespace paddle_mobile
...@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "cl_image.h" #include "framework/cl/cl_image.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
void CLImageToTensor(CLImage *cl_image, Tensor *tensor, void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
...@@ -63,7 +64,7 @@ void CLImageToTensor(CLImage *cl_image, Tensor *tensor, ...@@ -63,7 +64,7 @@ void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
} }
if (err != CL_SUCCESS) { if (err != CL_SUCCESS) {
// TODO: error handling CL_CHECK_ERRORS(err);
} }
} }
void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
...@@ -97,7 +98,7 @@ void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, ...@@ -97,7 +98,7 @@ void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0, err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
imageData, 0, NULL, NULL); imageData, 0, NULL, NULL);
if (err != CL_SUCCESS) { if (err != CL_SUCCESS) {
// TODO: error handling CL_CHECK_ERRORS(err);
} }
size_t i0 = 0; size_t i0 = 0;
for (int n = 0; n < N; n++) { for (int n = 0; n < N; n++) {
...@@ -117,7 +118,7 @@ void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, ...@@ -117,7 +118,7 @@ void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
} }
} }
#ifdef PADDLE_MOBILE_DEBUG #ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const CLImage &cl_image){ Print &operator<<(Print &printer, const CLImage &cl_image) {
printer << " dims: " << cl_image.dims() << "\n"; printer << " dims: " << cl_image.dims() << "\n";
int stride = cl_image.numel() / 20; int stride = cl_image.numel() / 20;
stride = stride > 0 ? stride : 1; stride = stride > 0 ? stride : 1;
...@@ -148,8 +149,8 @@ Print &operator<<(Print &printer, const CLImage &cl_image){ ...@@ -148,8 +149,8 @@ Print &operator<<(Print &printer, const CLImage &cl_image){
cl_mem image = cl_image.GetCLImage(); cl_mem image = cl_image.GetCLImage();
size_t origin[3] = {0, 0, 0}; size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1}; size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin, region, 0, 0, err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
imageData, 0, NULL, NULL); region, 0, 0, imageData, 0, NULL, NULL);
size_t i0 = 0; size_t i0 = 0;
for (int n = 0; n < N; n++) { for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) { for (int c = 0; c < C; c++) {
...@@ -168,13 +169,13 @@ Print &operator<<(Print &printer, const CLImage &cl_image){ ...@@ -168,13 +169,13 @@ Print &operator<<(Print &printer, const CLImage &cl_image){
} }
if (err != CL_SUCCESS) { if (err != CL_SUCCESS) {
// TODO: error handling CL_CHECK_ERRORS(err);
} }
for (int i = 0; i < cl_image.numel(); i += stride) { for (int i = 0; i < cl_image.numel(); i += stride) {
printer << data[i] << " "; printer << data[i] << " ";
} }
return printer; return printer;
} }
#endif #endif
} // namespace framework } // namespace framework
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -46,27 +46,28 @@ class CLImage { ...@@ -46,27 +46,28 @@ class CLImage {
/* /*
* need call SetTensorData first * need call SetTensorData first
* */ * */
void InitCLImage(cl_context context,cl_command_queue command_queue) { void InitCLImage(cl_context context, cl_command_queue command_queue) {
if (tensor_data_ == nullptr) { if (tensor_data_ == nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first"); PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
} }
if (tensor_dims_.size() <= 2) { if (tensor_dims_.size() <= 2) {
InitCLImage2C(context, command_queue,tensor_data_, tensor_dims_); InitCLImage2C(context, command_queue, tensor_data_, tensor_dims_);
} else { } else {
InitCLImage(context, command_queue,tensor_data_, tensor_dims_); InitCLImage(context, command_queue, tensor_data_, tensor_dims_);
} }
delete[](tensor_data_); delete[](tensor_data_);
tensor_data_ = nullptr; tensor_data_ = nullptr;
initialized_ = true; initialized_ = true;
} }
void InitEmptyImage(cl_context context, cl_command_queue command_queue,const DDim &dim) { void InitEmptyImage(cl_context context, cl_command_queue command_queue,
const DDim &dim) {
if (tensor_data_ != nullptr) { if (tensor_data_ != nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION( PADDLE_MOBILE_THROW_EXCEPTION(
" empty image tensor data shouldn't have value"); " empty image tensor data shouldn't have value");
} }
DLOG << " init empty image "; DLOG << " init empty image ";
InitCLImage(context, command_queue,nullptr, dim); InitCLImage(context, command_queue, nullptr, dim);
initialized_ = true; initialized_ = true;
} }
...@@ -93,7 +94,7 @@ class CLImage { ...@@ -93,7 +94,7 @@ class CLImage {
* */ * */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; } inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
inline cl_command_queue CommandQueue() const{ return command_queue_;} inline cl_command_queue CommandQueue() const { return command_queue_; }
/* /*
* resize original tensor dim * resize original tensor dim
...@@ -124,7 +125,8 @@ class CLImage { ...@@ -124,7 +125,8 @@ class CLImage {
const DDim &dims() const { return tensor_dims_; } const DDim &dims() const { return tensor_dims_; }
private: private:
void InitCLImage2C(cl_context context, cl_command_queue command_queue,float *tensor_data, const DDim &dim) { void InitCLImage2C(cl_context context, cl_command_queue command_queue,
float *tensor_data, const DDim &dim) {
command_queue_ = command_queue; command_queue_ = command_queue;
assert(dim.size() <= 2); assert(dim.size() <= 2);
int tdim[2] = {1, 1}; int tdim[2] = {1, 1};
...@@ -141,14 +143,15 @@ class CLImage { ...@@ -141,14 +143,15 @@ class CLImage {
imageData.reset(new half_t[width * height * 4]); imageData.reset(new half_t[width * height * 4]);
for (int h = 0; h < tdim[0]; h++) { for (int h = 0; h < tdim[0]; h++) {
for (int w = 0; w < tdim[1]; w++) { for (int w = 0; w < tdim[1]; w++) {
imageData[(h * width + w / 4) * 4 + (w % 4)] = Float2Half(tensor_data[h * tdim[1] + w]); imageData[(h * width + w / 4) * 4 + (w % 4)] =
Float2Half(tensor_data[h * tdim[1] + w]);
} }
} }
} }
InitCLImage(context, width, height, imageData.get()); InitCLImage(context, width, height, imageData.get());
} }
void InitCLImage(cl_context context,int width, int height, void *data) { void InitCLImage(cl_context context, int width, int height, void *data) {
cl_image_format cf = {.image_channel_order = CL_RGBA, cl_image_format cf = {.image_channel_order = CL_RGBA,
.image_channel_data_type = CL_HALF_FLOAT}; .image_channel_data_type = CL_HALF_FLOAT};
cl_image_desc cid = { cl_image_desc cid = {
...@@ -170,14 +173,14 @@ class CLImage { ...@@ -170,14 +173,14 @@ class CLImage {
&cf, // const cl_image_format *image_format &cf, // const cl_image_format *image_format
&cid, // const cl_image_desc *image_desc &cid, // const cl_image_desc *image_desc
data, // void *host_ptr data, // void *host_ptr
&err &err);
);
if (err != CL_SUCCESS) { if (err != CL_SUCCESS) {
CL_CHECK_ERRORS(err); CL_CHECK_ERRORS(err);
PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error "); PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
} }
} }
void InitCLImage(cl_context context, cl_command_queue command_queue,float *tensor_data, const DDim &dim) { void InitCLImage(cl_context context, cl_command_queue command_queue,
float *tensor_data, const DDim &dim) {
DLOG << " tensor dim: " << dim; DLOG << " tensor dim: " << dim;
// NCHW -> [W * (C+3)/4, H * N] // NCHW -> [W * (C+3)/4, H * N]
tensor_dims_ = dim; tensor_dims_ = dim;
...@@ -207,6 +210,7 @@ class CLImage { ...@@ -207,6 +210,7 @@ class CLImage {
image_width_ = width; image_width_ = width;
image_height_ = height; image_height_ = height;
image_dims_ = make_ddim({image_width_, image_height_}); image_dims_ = make_ddim({image_width_, image_height_});
c_block_ = W / width;
std::unique_ptr<half_t[]> imageData{}; std::unique_ptr<half_t[]> imageData{};
int count = 0; int count = 0;
......
...@@ -37,7 +37,7 @@ limitations under the License. */ ...@@ -37,7 +37,7 @@ limitations under the License. */
#include "framework/cl/cl_image.h" #include "framework/cl/cl_image.h"
#endif #endif
int debug_to = 115; int debug_to = 3;
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
...@@ -953,13 +953,14 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() { ...@@ -953,13 +953,14 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
auto cl_image = var->template GetMutable<framework::CLImage>(); auto cl_image = var->template GetMutable<framework::CLImage>();
cl_context context = program_.scope->GetCLScpoe()->Context(); cl_context context = program_.scope->GetCLScpoe()->Context();
cl_command_queue command_queue = program_.scope->GetCLScpoe()->CommandQueue(); cl_command_queue command_queue =
program_.scope->GetCLScpoe()->CommandQueue();
const framework::TensorDesc &desc = var_desc->Tensor_desc(); const framework::TensorDesc &desc = var_desc->Tensor_desc();
// framework::DDim ddim = framework::make_ddim(desc.Dims()); // framework::DDim ddim = framework::make_ddim(desc.Dims());
framework::DDim ddim = cl_image->dims(); framework::DDim ddim = cl_image->dims();
DLOG << var_desc->Name(); DLOG << var_desc->Name();
cl_image->InitEmptyImage(context,command_queue, ddim); cl_image->InitEmptyImage(context, command_queue, ddim);
} }
} }
} }
...@@ -1011,11 +1012,12 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() { ...@@ -1011,11 +1012,12 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
} else { } else {
auto cl_image = var->template GetMutable<framework::CLImage>(); auto cl_image = var->template GetMutable<framework::CLImage>();
cl_context context = program_.scope->GetCLScpoe()->Context(); cl_context context = program_.scope->GetCLScpoe()->Context();
cl_command_queue command_queue = program_.scope->GetCLScpoe()->CommandQueue(); cl_command_queue command_queue =
program_.scope->GetCLScpoe()->CommandQueue();
const framework::TensorDesc &desc = var_desc->Tensor_desc(); const framework::TensorDesc &desc = var_desc->Tensor_desc();
framework::DDim ddim = cl_image->dims(); framework::DDim ddim = cl_image->dims();
// framework::DDim ddim = framework::make_ddim(desc.Dims()); // framework::DDim ddim = framework::make_ddim(desc.Dims());
cl_image->InitEmptyImage(context, command_queue,ddim); cl_image->InitEmptyImage(context, command_queue, ddim);
} }
} }
} }
......
...@@ -73,7 +73,7 @@ void OperatorBase<Dtype>::Run() { ...@@ -73,7 +73,7 @@ void OperatorBase<Dtype>::Run() {
} else { } else {
CLImage *cl_image = vari->template GetMutable<framework::CLImage>(); CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
if (cl_image) { if (cl_image) {
DLOG << type_ << " input- " << key << "="<<*cl_image; DLOG << type_ << " input- " << key << "=" << *cl_image;
} }
} }
...@@ -98,7 +98,7 @@ void OperatorBase<Dtype>::Run() { ...@@ -98,7 +98,7 @@ void OperatorBase<Dtype>::Run() {
} else { } else {
CLImage *cl_image = vari->template GetMutable<framework::CLImage>(); CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
if (cl_image) { if (cl_image) {
DLOG << type_ << " output- " << key << "="<<*cl_image ; DLOG << type_ << " output- " << key << "=" << *cl_image;
} }
} }
......
...@@ -49,11 +49,13 @@ bool BatchNormKernel<GPU_CL, float>::Init(BatchNormParam<GPU_CL> *param) { ...@@ -49,11 +49,13 @@ bool BatchNormKernel<GPU_CL, float>::Init(BatchNormParam<GPU_CL> *param) {
framework::CLImage *new_scale = new framework::CLImage(); framework::CLImage *new_scale = new framework::CLImage();
new_scale->SetTensorData(new_scale_ptr, variance->dims()); new_scale->SetTensorData(new_scale_ptr, variance->dims());
new_scale->InitCLImage(this->cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue()); new_scale->InitCLImage(this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
framework::CLImage *new_bias = new framework::CLImage(); framework::CLImage *new_bias = new framework::CLImage();
new_bias->SetTensorData(new_bias_ptr, variance->dims()); new_bias->SetTensorData(new_bias_ptr, variance->dims());
new_bias->InitCLImage(this->cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue()); new_bias->InitCLImage(this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
param->SetNewScale(new_scale); param->SetNewScale(new_scale);
param->SetNewBias(new_bias); param->SetNewBias(new_bias);
......
...@@ -19,6 +19,7 @@ __kernel void conv_3x3(__private const int global_size_dim0, ...@@ -19,6 +19,7 @@ __kernel void conv_3x3(__private const int global_size_dim0,
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input_image, __read_only image2d_t input_image,
__read_only image2d_t filter, __read_only image2d_t filter,
#ifdef BIASE #ifdef BIASE
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
......
...@@ -29,8 +29,10 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init( ...@@ -29,8 +29,10 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
param->Paddings()[0] == param->Paddings()[1], param->Paddings()[0] == param->Paddings()[1],
"need equal"); "need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext(),cl_helper_.CLCommandQueue()); param->Filter()->InitCLImage(cl_helper_.CLContext(),
param->Bias()->InitCLImage(cl_helper_.CLContext(),cl_helper_.CLCommandQueue()); cl_helper_.CLCommandQueue());
param->Bias()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
// const CL *mean = param->InputMean(); // const CL *mean = param->InputMean();
const framework::CLImage *mean = param->InputMean(); const framework::CLImage *mean = param->InputMean();
...@@ -62,12 +64,14 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init( ...@@ -62,12 +64,14 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
framework::CLImage *new_scale = new framework::CLImage(); framework::CLImage *new_scale = new framework::CLImage();
new_scale->SetTensorData(new_scale_ptr, variance->dims()); new_scale->SetTensorData(new_scale_ptr, variance->dims());
new_scale->InitCLImage(this->cl_helper_.CLContext(),cl_helper_.CLCommandQueue()); new_scale->InitCLImage(this->cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
framework::CLImage *new_bias = new framework::CLImage(); framework::CLImage *new_bias = new framework::CLImage();
new_bias->SetTensorData(new_bias_ptr, variance->dims()); new_bias->SetTensorData(new_bias_ptr, variance->dims());
new_bias->InitCLImage(this->cl_helper_.CLContext(),cl_helper_.CLCommandQueue()); new_bias->InitCLImage(this->cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
param->SetNewScale(new_scale); param->SetNewScale(new_scale);
param->SetNewBias(new_bias); param->SetNewBias(new_bias);
......
...@@ -25,8 +25,10 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) { ...@@ -25,8 +25,10 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
param->Filter()->dims()[2] == param->Filter()->dims()[3] && param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1], param->Paddings()[0] == param->Paddings()[1],
"need equal"); "need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue()); param->Filter()->InitCLImage(cl_helper_.CLContext(),
param->Bias()->InitCLImage(cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue()); this->cl_helper_.CLCommandQueue());
param->Bias()->InitCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 - int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]); static_cast<int>(param->Paddings()[1]);
......
...@@ -26,7 +26,8 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) { ...@@ -26,7 +26,8 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
param->Paddings()[0] == param->Paddings()[1], param->Paddings()[0] == param->Paddings()[1],
"need equal"); "need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue()); param->Filter()->InitCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 - int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]); static_cast<int>(param->Paddings()[1]);
...@@ -95,6 +96,17 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) { ...@@ -95,6 +96,17 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
cl_int status; cl_int status;
DLOG << " begin set kernel arg "; DLOG << " begin set kernel arg ";
DLOG << " c block " << c_block;
DLOG << " w " << w;
DLOG << " nh " << nh;
DLOG << " stride " << stride;
DLOG << " offset " << offset;
DLOG << " input_c " << input_c;
DLOG << " dilation " << dilation;
DLOG << " input width " << input_width;
DLOG << " input height " << input_height;
DLOG << " output width " << output_width;
DLOG << " output height " << output_height;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
......
...@@ -27,7 +27,8 @@ bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) { ...@@ -27,7 +27,8 @@ bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
param->Filter()->dims()[2] == param->Filter()->dims()[3] && param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1], param->Paddings()[0] == param->Paddings()[1],
"need equal"); "need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue()); param->Filter()->InitCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 - int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]); static_cast<int>(param->Paddings()[1]);
param->SetOffset(offset); param->SetOffset(offset);
......
...@@ -30,7 +30,7 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) { ...@@ -30,7 +30,7 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
cl_int status; cl_int status;
auto output = param.Out(); auto output = param.Out();
const Tensor *input = param.InputX(); const Tensor *input = param.InputX();
DLOG<<*input; DLOG << *input;
const float *input_data = input->data<float>(); const float *input_data = input->data<float>();
int numel = input->numel(); int numel = input->numel();
cl_mem cl_image = output->GetCLImage(); cl_mem cl_image = output->GetCLImage();
......
...@@ -19,44 +19,45 @@ namespace operators { ...@@ -19,44 +19,45 @@ namespace operators {
template <> template <>
bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) { bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
// this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); // this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
return true; return true;
} }
template <> template <>
void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) { void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
// auto kernel = this->cl_helper_.KernelAt(0); // auto kernel = this->cl_helper_.KernelAt(0);
// auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX()); // auto default_work_size =
// // this->cl_helper_.DefaultWorkSize(*param.InputX());
// auto input = param.InputX()->GetCLImage(); //
// auto *out = param.Out(); // auto input = param.InputX()->GetCLImage();
// // auto *out = param.Out();
// const auto &dims = param.InputX()->dims(); //
// const int N = dims[0]; // const auto &dims = param.InputX()->dims();
// const int C = dims[1]; // const int N = dims[0];
// const int in_height = dims[2]; // const int C = dims[1];
// const int in_width = dims[3]; // const int in_height = dims[2];
// // const int in_width = dims[3];
// int size_ch = in_height * in_width; //
// int size_block = size_ch * 4; // int size_ch = in_height * in_width;
// int size_batch = size_ch * C; // int size_block = size_ch * 4;
// // int size_batch = size_ch * C;
// // need create outputBuffer //
// cl_image_format imageFormat; // // need create outputBuffer
// imageFormat.image_channel_order = CL_RGBA; // cl_image_format imageFormat;
// imageFormat.image_channel_data_type = CL_FLOAT; // imageFormat.image_channel_order = CL_RGBA;
// cl_mem outputBuffer; // imageFormat.image_channel_data_type = CL_FLOAT;
// // cl_mem outputBuffer;
// clSetKernelArg(kernel, 0, sizeof(int), &in_height); //
// clSetKernelArg(kernel, 1, sizeof(int), &in_width); // clSetKernelArg(kernel, 0, sizeof(int), &in_height);
// clSetKernelArg(kernel, 2, sizeof(int), &size_ch); // clSetKernelArg(kernel, 1, sizeof(int), &in_width);
// clSetKernelArg(kernel, 3, sizeof(int), &size_block); // clSetKernelArg(kernel, 2, sizeof(int), &size_ch);
// clSetKernelArg(kernel, 4, sizeof(int), &size_batch); // clSetKernelArg(kernel, 3, sizeof(int), &size_block);
// clSetKernelArg(kernel, 5, sizeof(cl_mem), &input); // clSetKernelArg(kernel, 4, sizeof(int), &size_batch);
// clSetKernelArg(kernel, 6, sizeof(cl_mem), &outputBuffer); // clSetKernelArg(kernel, 5, sizeof(cl_mem), &input);
// // clSetKernelArg(kernel, 6, sizeof(cl_mem), &outputBuffer);
// clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, //
// default_work_size.data(), NULL, 0, NULL, NULL); // clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
// default_work_size.data(), NULL, 0, NULL, NULL);
} }
template class FetchKernel<GPU_CL, float>; template class FetchKernel<GPU_CL, float>;
......
...@@ -37,10 +37,10 @@ void ReshapeKernel<GPU_CL, float>::Compute(const ReshapeParam<GPU_CL> &param) { ...@@ -37,10 +37,10 @@ void ReshapeKernel<GPU_CL, float>::Compute(const ReshapeParam<GPU_CL> &param) {
int dims[4] = {1, 1, 1, 1}; int dims[4] = {1, 1, 1, 1};
int odims[4] = {1, 1, 1, 1}; int odims[4] = {1, 1, 1, 1};
for (int i = 0; i < inputDim.size(); i++) { for (int i = 0; i < inputDim.size(); i++) {
dims[4-inputDim.size()+i] = inputDim[i]; dims[4 - inputDim.size() + i] = inputDim[i];
} }
for (int i = 0; i < outputDim.size(); i++) { for (int i = 0; i < outputDim.size(); i++) {
odims[4-outputDim.size()+i] = outputDim[i]; odims[4 - outputDim.size() + i] = outputDim[i];
} }
clSetKernelArg(kernel, 2, sizeof(cl_int), &dims); clSetKernelArg(kernel, 2, sizeof(cl_int), &dims);
clSetKernelArg(kernel, 3, sizeof(cl_int), &dims[1]); clSetKernelArg(kernel, 3, sizeof(cl_int), &dims[1]);
......
...@@ -17,7 +17,7 @@ shift ...@@ -17,7 +17,7 @@ shift
perl -i -pe 's|^\s+#pragma\s+omp|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> #pragma omp|' "$@" perl -i -pe 's|^\s+#pragma\s+omp|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> #pragma omp|' "$@"
( (
# remove clang format ios_io folder # remove clang format ios_io folder
flist=$(echo "$@" | perl -pe 's|src/ios_io/[^ ]*||') flist=$(echo "$@" | perl -pe 's|src/io/ios_io/[^ ]*||')
clang-format -i $flist clang-format -i $flist
) )
perl -i -pe 's|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> ||' "$@" perl -i -pe 's|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> ||' "$@"
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册