Commit 84e1d25f — authored by Jiaying Zhao, committed by GitHub

[MOBILE][OPENCL]Add elementwise_sub op; Fix instancenorm op; (#2622)

Parent commit: a39f92ea
......@@ -134,6 +134,8 @@ const char *G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE =
"fill_constant_batch_size_like";
const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU = "fusion_instancenorm_relu";
const char *G_OP_TYPE_PIXEL_SHUFFLE = "pixel_shuffle";
const char *G_OP_TYPE_EXPAND = "expand";
const char *G_OP_TYPE_GRID_SAMPLER = "grid_sampler";
std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
......@@ -156,7 +158,7 @@ std::unordered_map<
{G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
{G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}},
{G_OP_TYPE_INSTANCENORM, {{"X"}, {"Out"}}},
{G_OP_TYPE_INSTANCENORM, {{"X"}, {"Y"}}},
{G_OP_TYPE_FUSION_INSTANCENORM_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_LRN, {{"X"}, {"Out"}}},
{G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}},
......@@ -258,5 +260,7 @@ std::unordered_map<
{{"Ids", "Scores"}, {"SentenceIds", "SentenceScores"}}},
{G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE, {{"Input"}, {"Out"}}},
{G_OP_TYPE_PAD2D, {{"X"}, {"Out"}}},
{G_OP_TYPE_PIXEL_SHUFFLE, {{"X"}, {"Out"}}}};
{G_OP_TYPE_PIXEL_SHUFFLE, {{"X"}, {"Out"}}},
{G_OP_TYPE_EXPAND, {{"X"}, {"Out"}}},
{G_OP_TYPE_GRID_SAMPLER, {{"X", "Grid"}, {"Output"}}}};
} // namespace paddle_mobile
......@@ -265,6 +265,8 @@ extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN;
extern const char *G_OP_TYPE_FUSION_DECONV_BN_RELU;
extern const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU;
extern const char *G_OP_TYPE_PIXEL_SHUFFLE;
extern const char *G_OP_TYPE_EXPAND;
extern const char *G_OP_TYPE_GRID_SAMPLER;
extern std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
......
......@@ -246,7 +246,7 @@ LOAD_OP2(fusion_conv_bn, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_bn);
#endif
#ifdef ELEMENTWISESUB_OP
LOAD_OP1(elementwise_sub, CPU)
LOAD_OP2(elementwise_sub, CPU, GPU_CL)
#endif
#ifdef TOP_K_OP
LOAD_OP1(top_k, CPU)
......@@ -380,3 +380,9 @@ LOAD_OP1(reduce_prod, CPU);
#ifdef PIXEL_SHUFFLE_OP
LOAD_OP1(pixel_shuffle, GPU_CL);
#endif
#ifdef EXPAND_OP
LOAD_OP1(expand, GPU_CL);
#endif
#ifdef GRID_SAMPLER_OP
LOAD_OP1(grid_sampler, GPU_CL);
#endif
......@@ -32,6 +32,9 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(elementwise_sub, ops::ElementwiseSubOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(elementwise_sub, ops::ElementwiseSubOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......
......@@ -45,7 +45,7 @@ class FusionInstanceNormReluMatcher : public framework::FusionOpMatcher {
template <typename DeviceType, typename T>
class FusionInstanceNormReluOp
: public framework::OperatorWithKernel<
DeviceType, InstanceNormParam<DeviceType>,
DeviceType, FusionInstanceNormReluParam<DeviceType>,
operators::InstanceNormReluKernel<DeviceType, T>> {
public:
FusionInstanceNormReluOp(const string &type, const VariableNameMap &inputs,
......@@ -53,7 +53,7 @@ class FusionInstanceNormReluOp
const framework::AttributeMap &attrs,
framework::Scope *scope)
: framework::OperatorWithKernel<
DeviceType, InstanceNormParam<DeviceType>,
DeviceType, FusionInstanceNormReluParam<DeviceType>,
operators::InstanceNormReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
......
......@@ -24,7 +24,7 @@ namespace operators {
template <typename Dtype, typename T>
void InstanceNormOp<Dtype, T>::InferShape() const {
  // Instance normalization is element-wise over each (N, C) slice, so the
  // output tensor Y has exactly the same dimensions as the input X.
  auto x_dims = this->param_.InputX()->dims();
  // Fix: a stale duplicate line calling the removed accessor
  // `this->param_.Out()` was left behind; InstanceNormParam now exposes the
  // output only via OutputY() (matching the {"X"} -> {"Y"} op signature).
  this->param_.OutputY()->Resize(x_dims);
}
} // namespace operators
......
......@@ -17,17 +17,17 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
void InstanceNorm(framework::CLHelper *cl_helper,
const InstanceNormParam<GPU_CL> &param) {
const framework::CLImage *input, framework::CLImage *output,
float epsilon) {
auto kernel = cl_helper->KernelAt(0);
auto &dims = param.Out()->dims();
auto &dims = output->dims();
const int n = dims[0];
const int c_group = (dims[1] + 3) / 4;
const int h = dims[2];
const int w = dims[3];
auto epsilon = param.Epsilon();
auto input = param.InputX()->GetCLImage();
auto out = param.Out()->GetCLImage();
auto input_image = input->GetCLImage();
auto out_image = output->GetCLImage();
// DLOG << "Epsilon: " << epsilon;
......@@ -66,9 +66,9 @@ void InstanceNorm(framework::CLHelper *cl_helper,
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 5, sizeof(cl_float), &epsilon);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 6, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 6, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 7, sizeof(cl_mem), &out);
clSetKernelArg(kernel, 7, sizeof(cl_mem), &out_image);
CL_CHECK_ERRORS(status);
clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, 3, NULL,
work_size, local_work_size, 0, NULL, NULL);
......
......@@ -21,7 +21,8 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
void InstanceNorm(framework::CLHelper *cl_helper,
const InstanceNormParam<GPU_CL> &param);
const framework::CLImage *input, framework::CLImage *output,
float epsilon);
}
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// Element-wise subtraction of two half-precision images, one work-item per
// texel: outputImage[x, y] = inputImage[x, y] - bias[x, y] (per RGBA lane).
//
// Fixes vs. original:
//  - image2d_t kernel arguments take access qualifiers (__read_only /
//    __write_only), not the __global address-space qualifier.
//  - Reading with integer int2 coordinates requires an unnormalized-coords
//    sampler; CLK_NORMALIZED_COORDS_TRUE with int coords is undefined
//    behavior per the OpenCL C specification.
__kernel void elementwise_sub(__read_only image2d_t inputImage,
                              __read_only image2d_t bias,
                              __write_only image2d_t outputImage) {
  int x = get_global_id(0);
  int y = get_global_id(1);
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  int2 coords;
  coords.x = x;
  coords.y = y;
  half4 input = read_imageh(inputImage, sampler, coords);
  half4 biase = read_imageh(bias, sampler, coords);
  half4 output = input - biase;
  write_imageh(outputImage, coords, output);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISESUB_OP
#include "operators/kernel/elementwise_sub_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ElementwiseSubKernel<GPU_CL, float>::Init(
    ElementwiseSubParam<GPU_CL> *param) {
  // The Y operand is the per-texel "bias" image subtracted from X.
  framework::CLImage *bias = reinterpret_cast<framework::CLImage *>(
      const_cast<framework::CLImage *>(param->InputY()));
  if (bias->dims().size() != 4) {
    // Fix: the original logged this error but still returned true without
    // ever calling AddKernel, so Compute's KernelAt(0) would later access a
    // kernel that was never created. Report the failure to the framework.
    DLOG << "error:bias dims not support";
    return false;
  }
  if (!bias->isInit()) {
    // Lazily upload the bias tensor into a normal CL image layout.
    bias->InitNormalCLImage(cl_helper_.CLContext(),
                            this->cl_helper_.CLCommandQueue());
  }
  DLOG << " bias: " << *bias;
  this->cl_helper_.AddKernel("elementwise_sub", "elementwise_sub_kernel.cl");
  return true;
}
template <>
void ElementwiseSubKernel<GPU_CL, float>::Compute(
    const ElementwiseSubParam<GPU_CL> &param) {
  // Launches the elementwise_sub kernel with one work-item per texel of the
  // packed input image: Out = X - Y. Only 4-D (NCHW) Y tensors are handled.
  auto *x = param.InputX();
  auto *y = param.InputY();
  auto *out = param.Out();
  cl_int status;
  auto kernel = this->cl_helper_.KernelAt(0);
  if (y->dims().size() != 4) {
    DLOG << "error:bias dims not support";
    return;
  }
  cl_mem x_img = x->GetCLImage();
  cl_mem y_img = y->GetCLImage();
  cl_mem out_img = out->GetCLImage();
  // Bind the three image arguments in kernel-signature order.
  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &x_img);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &y_img);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &out_img);
  CL_CHECK_ERRORS(status);
  // Global NDRange covers the whole 2-D image; let the driver pick the
  // local work-group size (local_work_size == NULL).
  size_t global_work_size[2] = {x->ImageWidth(), x->ImageHeight()};
  status =
      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                             NULL, global_work_size, NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);
}
template class ElementwiseSubKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -23,7 +23,7 @@ namespace operators {
template <>
bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) {
auto &dims = param->Out()->dims();
auto &dims = param->OutputY()->dims();
const int h = dims[2];
std::string build_options = "";
if (h == 128) {
......@@ -41,7 +41,8 @@ bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) {
template <>
void InstanceNormKernel<GPU_CL, float>::Compute(
const InstanceNormParam<GPU_CL> &param) {
InstanceNorm(&this->cl_helper_, param);
InstanceNorm(&this->cl_helper_, param.InputX(), param.OutputY(),
param.Epsilon());
}
template class InstanceNormKernel<GPU_CL, float>;
......
......@@ -23,7 +23,7 @@ namespace operators {
template <>
bool InstanceNormReluKernel<GPU_CL, float>::Init(
InstanceNormParam<GPU_CL> *param) {
FusionInstanceNormReluParam<GPU_CL> *param) {
auto &dims = param->Out()->dims();
const int h = dims[2];
std::string build_options = "-DRELU";
......@@ -41,8 +41,8 @@ bool InstanceNormReluKernel<GPU_CL, float>::Init(
template <>
void InstanceNormReluKernel<GPU_CL, float>::Compute(
const InstanceNormParam<GPU_CL> &param) {
InstanceNorm(&this->cl_helper_, param);
const FusionInstanceNormReluParam<GPU_CL> &param) {
InstanceNorm(&this->cl_helper_, param.InputX(), param.Out(), param.Epsilon());
}
template class InstanceNormReluKernel<GPU_CL, float>;
......
......@@ -30,10 +30,10 @@ using framework::OpKernelBase;
template <typename DeviceType, typename T>
class InstanceNormReluKernel
: public OpKernelBase<DeviceType, InstanceNormParam<DeviceType>> {
: public OpKernelBase<DeviceType, FusionInstanceNormReluParam<DeviceType>> {
public:
void Compute(const InstanceNormParam<DeviceType> &param);
bool Init(InstanceNormParam<DeviceType> *param);
void Compute(const FusionInstanceNormReluParam<DeviceType> &param);
bool Init(FusionInstanceNormReluParam<DeviceType> *param);
};
} // namespace operators
......
......@@ -927,6 +927,35 @@ class InstanceNormParam : public OpParam {
Scope *scope)
: OpParam(inputs, outputs, attrs, scope) {
input_x_ = InputXFrom<GType>(inputs, *scope);
output_y_ = OutputYFrom<GType>(outputs, *scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
}
const GType *InputX() const { return input_x_; }
GType *OutputY() const { return output_y_; }
const float &Epsilon() const { return epsilon_; }
private:
GType *input_x_;
GType *output_y_;
float epsilon_;
};
#endif
#ifdef FUSION_INSTANCENORM_RELU_OP
template <typename Dtype>
class FusionInstanceNormReluParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionInstanceNormReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, Scope *scope)
: OpParam(inputs, outputs, attrs, scope) {
input_x_ = InputXFrom<GType>(inputs, *scope);
out_ = OutFrom<GType>(outputs, *scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
}
......@@ -3658,5 +3687,56 @@ class PixelShuffleParam : public OpParam {
};
#endif
#ifdef EXPAND_OP
// Parameters for the expand op: tiles InputX along each dimension by the
// corresponding factor from the "expand_times" attribute, writing to Out.
template <typename Dtype>
class ExpandParam : public OpParam {
  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
  typedef typename DtypeTensorTrait<Dtype>::rtype RType;

 public:
  ExpandParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
              const AttributeMap &attrs, Scope *scope)
      : OpParam(inputs, outputs, attrs, scope) {
    input_x_ = InputXFrom<GType>(inputs, *scope);
    out_ = OutFrom<GType>(outputs, *scope);
    expand_times_ = GetAttr<std::vector<int>>("expand_times", attrs);
  }

  const GType *InputX() const { return input_x_; }

  GType *Out() const { return out_; }

  // Fix: expand_times_ was stored but had no accessor, leaving kernels with
  // no way to read the tiling factors. Backward-compatible addition.
  const std::vector<int> &ExpandTimes() const { return expand_times_; }

 private:
  GType *input_x_;
  GType *out_;
  std::vector<int> expand_times_;
};
#endif
#ifdef GRID_SAMPLER_OP
// Parameters for the grid_sampler op: captures the input tensor X and the
// output tensor from the op's variable maps.
// NOTE(review): the op registry declares inputs {"X", "Grid"}, but this
// param captures only X — the Grid input appears to be missing here; verify
// against the GPU_CL grid_sampler kernel before relying on this class.
template <typename Dtype>
class GridSamplerParam : public OpParam {
  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
 public:
  GridSamplerParam(const VariableNameMap &inputs,
                   const VariableNameMap &outputs, const AttributeMap &attrs,
                   Scope *scope)
      : OpParam(inputs, outputs, attrs, scope) {
    // Resolve the input/output variables from the enclosing scope.
    input_x_ = InputXFrom<GType>(inputs, *scope);
    output_ = OutputFrom<GType>(outputs, *scope);
  }
  // Read-only access to the sampled input tensor.
  const GType *InputX() const { return input_x_; }
  // Mutable output tensor written by the kernel.
  GType *Output() const { return output_; }
 private:
  GType *input_x_;
  GType *output_;
};
#endif
} // namespace operators
} // namespace paddle_mobile
......@@ -379,6 +379,8 @@ if(NOT FOUND_MATCH)
set(REDUCE_PROD_OP ON)
set(FUSION_INSTANCENORM_RELU_OP ON)
set(PIXEL_SHUFFLE_OP ON)
set(EXPAND_OP ON)
set(GRID_SAMPLER_OP ON)
endif()
# option(BATCHNORM_OP "" ON)
......@@ -755,3 +757,10 @@ endif()
if (PIXEL_SHUFFLE_OP)
add_definitions(-DPIXEL_SHUFFLE_OP)
endif()
if (EXPAND_OP)
add_definitions(-DEXPAND_OP)
endif()
if (GRID_SAMPLER_OP)
add_definitions(-DGRID_SAMPLER_OP)
endif()
Markdown is supported.
0% loaded.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.