Commit 438e456c authored by Alexander Alekhin

Merge pull request #10113 from wzw-intel:fusion

......@@ -1233,12 +1233,13 @@ struct Net::Impl
}
}
// For now, OpenCL target only supports fusion with activation of ReLU/ChannelsPReLU
// For now, OpenCL target only supports fusion with activation of ReLU/ChannelsPReLU/Power
if ( preferableTarget != DNN_TARGET_OPENCL ||
(preferableTarget == DNN_TARGET_OPENCL &&
nextData &&
(!nextData->type.compare("ReLU") ||
!nextData->type.compare("ChannelsPReLU"))) )
!nextData->type.compare("ChannelsPReLU") ||
!nextData->type.compare("Power"))) )
{
Ptr<ActivationLayer> nextActivLayer;
......@@ -1253,6 +1254,78 @@ struct Net::Impl
printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
activData->skipFlags[DNN_BACKEND_DEFAULT] = true;
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
if ( preferableTarget == DNN_TARGET_OPENCL )
{
nextData = &layers[activData->consumers[0].lid];
lpNext = LayerPin(activData->consumers[0].lid, 0);
}
}
}
// fuse convolution layer followed by eltwise + relu
if ( preferableTarget == DNN_TARGET_OPENCL )
{
Ptr<EltwiseLayer> nextEltwiseLayer;
if( nextData )
nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
if( !nextEltwiseLayer.empty() && pinsToKeep.count(lpNext) == 0 )
{
LayerData *eltwiseData = nextData;
// go down from the second input and find the first non-skipped layer.
LayerData *downLayerData = &layers[eltwiseData->inputBlobsId[1].lid];
while (downLayerData->skipFlags[DNN_BACKEND_DEFAULT])
{
downLayerData = &layers[downLayerData->inputBlobsId[0].lid];
}
// second input layer is current layer.
if ( ld.id == downLayerData->id )
{
// go down from the first input and find the first non-skipped layer
downLayerData = &layers[eltwiseData->inputBlobsId[0].lid];
while (downLayerData->skipFlags[DNN_BACKEND_DEFAULT])
{
if ( !downLayerData->type.compare("Eltwise") )
downLayerData = &layers[downLayerData->inputBlobsId[1].lid];
else
downLayerData = &layers[downLayerData->inputBlobsId[0].lid];
}
Ptr<ConvolutionLayer> convLayer;
if( downLayerData )
convLayer = downLayerData->layerInstance.dynamicCast<ConvolutionLayer>();
// first input layer is convolution layer
if( !convLayer.empty() )
{
// fuse eltwise + activation layer
LayerData *firstConvLayerData = downLayerData;
{
nextData = &layers[eltwiseData->consumers[0].lid];
lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
Ptr<ActivationLayer> nextActivLayer;
if( nextData )
nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0 &&
(!nextData->type.compare("ReLU") ||
!nextData->type.compare("ChannelsPReLU") ||
!nextData->type.compare("Power")) &&
currLayer->setActivation(nextActivLayer) )
{
CV_Assert(firstConvLayerData->outputBlobs.size() == 1 && ld.inputBlobs.size() == 1);
ld.inputBlobs.push_back(&firstConvLayerData->outputBlobs[0]);
printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
eltwiseData->skipFlags[DNN_BACKEND_DEFAULT] = true;
nextData->skipFlags[DNN_BACKEND_DEFAULT] = true;
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
}
}
}
}
}
}
}
......
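The first hunk extends the fusion pass in `Net::Impl`: once a convolution's activation has been fused, the walker follows the graph to an Eltwise consumer, checks that the eltwise's other input eventually comes from another convolution, and then tries to fold the eltwise add plus the following ReLU/ChannelsPReLU/Power into the current convolution (the ResNet-style shortcut pattern). A minimal standalone sketch of what the fused path computes per element for the ReLU case; the function name and shapes are illustrative, not OpenCV API:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Hedged sketch: add the shortcut input to the convolution result,
// then clamp at zero (eltwise add followed by ReLU).
std::vector<float> fusedEltwiseRelu(const std::vector<float>& conv_out,
                                    const std::vector<float>& shortcut)
{
    std::vector<float> out(conv_out.size());
    for (std::size_t i = 0; i < conv_out.size(); ++i)
        out[i] = std::max(conv_out[i] + shortcut[i], 0.f); // eltwise add, then ReLU
    return out;
}
```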
......@@ -142,6 +142,9 @@ public:
}
};
#define IS_POWER_LAYER(layer) \
(!layer.empty() && !layer->type.compare("Power"))
//TODO: simultaneous convolution and bias addition for cache optimization
class ConvolutionLayerImpl : public BaseConvolutionLayerImpl
{
......@@ -161,6 +164,7 @@ public:
bool newWeightAndBias;
bool newActiv;
ocl4dnnFusedActiv_t activType;
float power;
#endif
ConvolutionLayerImpl()
{
......@@ -169,6 +173,7 @@ public:
newWeightAndBias = false;
newActiv = false;
activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
power = 0.f;
#endif
}
......@@ -225,6 +230,22 @@ public:
#ifdef HAVE_OPENCL
newActiv = true;
activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
if (preferableTarget == DNN_TARGET_OPENCL)
{
Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
if (!activ_power.empty())
{
if (activ_power->scale != 1.f || activ_power->shift != 0.f)
newWeightAndBias = true;
if (activ_power->scale != 1.f)
weightsMat.release();
power = activ_power->power;
activType = OCL4DNN_CONV_FUSED_ACTIV_POWER;
}
}
#endif
return !activ.empty();
}
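In `setActivation`, a Power activation computes y = (scale·x + shift)^power. The hunk above folds the linear part into the convolution itself (hence `newWeightAndBias = true` and the `weightsMat.release()` that forces the weights to be rebuilt), so the OpenCL kernel only has to apply `pow(·, power)`. A hedged numeric check of that split, using made-up parameter values:

```cpp
#include <cmath>
#include <cstdio>

int main()
{
    // Hypothetical Power parameters; not values from the patch.
    float scale = 2.f, shift = 1.f, power = 0.5f;
    float x = 3.f;                        // one convolution output value

    // Reference: run the Power layer after the convolution.
    float reference = std::pow(scale * x + shift, power);

    // Fused view: scale/shift are folded into the rebuilt weights/bias,
    // so the OpenCL kernel only applies pow(., power).
    float linear_part = scale * x + shift;
    float fused = std::pow(linear_part, power);

    std::printf("reference=%f fused=%f\n", reference, fused); // identical
    return 0;
}
```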
......@@ -727,11 +748,12 @@ public:
biasvec[k] = biasMat.at<float>(k);
}
if( !bnorm.empty() || !scaleLayer.empty() )
if( !bnorm.empty() || !scaleLayer.empty() || IS_POWER_LAYER(activ))
{
Mat scale, shift, scale2, shift2;
const float *scaleptr = 0, *shiftptr = 0;
const float *scaleptr2 = 0, *shiftptr2 = 0;
float a = 1.f, b = 0.f;
if( !bnorm.empty() )
{
......@@ -758,7 +780,14 @@ public:
}
}
if (shiftptr || shiftptr2)
if( IS_POWER_LAYER(activ) )
{
Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
a = activ_power->scale;
b = activ_power->shift;
}
if (shiftptr || shiftptr2 || b != 0.f)
fusedBias = true;
for( int i = 0; i < outCn; i++ )
......@@ -771,9 +800,9 @@ public:
int j, wcols = weightsMat.cols;
for( j = 0; j < wcols; j++ )
w_i[j] *= (s1*s2);
w_i[j] *= (s1*s2*a);
biasvec[i] = biasvec[i]*(s1*s2) + (delta1*s2 + delta2);
biasvec[i] = biasvec[i]*(s1*s2*a) + (delta1*s2*a + delta2*a + b);
}
}
biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
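The folding above chains three affine transforms into the convolution weights and bias: batch norm (s1, delta1), the Scale layer (s2, delta2), and the linear part of Power (a = scale, b = shift). For one output channel this gives w' = w·s1·s2·a and bias' = bias·s1·s2·a + delta1·s2·a + delta2·a + b, exactly the expressions in the hunk. A small self-contained check with arbitrary numbers:

```cpp
#include <cstdio>

int main()
{
    // Hypothetical per-channel factors, for illustration only.
    float w = 0.5f, bias = 0.2f;       // original conv weight (one tap) and bias
    float s1 = 1.5f, delta1 = -0.1f;   // batch-norm scale and shift
    float s2 = 0.8f, delta2 = 0.3f;    // Scale layer factors
    float a = 2.f, b = 1.f;            // linear part of Power (scale, shift)
    float x = 4.f;                     // one input value

    // Sequential: conv -> batch norm -> Scale -> linear part of Power.
    float y = w * x + bias;
    y = s1 * y + delta1;
    y = s2 * y + delta2;
    y = a * y + b;

    // Folded, as in the hunk: w' = w*s1*s2*a,
    // bias' = bias*s1*s2*a + delta1*s2*a + delta2*a + b.
    float w_f = w * (s1 * s2 * a);
    float b_f = bias * (s1 * s2 * a) + (delta1 * s2 * a + delta2 * a + b);
    float y_f = w_f * x + b_f;

    std::printf("sequential=%f folded=%f\n", y, y_f); // the two agree
    return 0;
}
```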
......@@ -827,10 +856,15 @@ public:
CV_Assert(!reluslope.empty());
convolutionOp->setActivPReLU(true, reluslope);
}
else if ( activType == OCL4DNN_CONV_FUSED_ACTIV_POWER)
{
convolutionOp->setActivPower(true, power);
}
else
{
convolutionOp->setActivReLU(false, 0);
convolutionOp->setActivPReLU(false, reluslope);
convolutionOp->setActivPower(false, 1.f);
}
newActiv = false;
}
......@@ -840,6 +874,7 @@ public:
int batch_size = inpMat.size[0];
return convolutionOp->Forward(inpMat,
inputs.size() == 2 ? inputs[1] : UMat(),
umat_blobs[0],
(hasBias() || fusedBias) ? umat_blobs[1] : UMat(),
outMat,
......
......@@ -77,6 +77,7 @@ typedef enum {
OCL4DNN_CONV_FUSED_ACTIV_NONE = 0,
OCL4DNN_CONV_FUSED_ACTIV_RELU = 1,
OCL4DNN_CONV_FUSED_ACTIV_PRELU = 2,
OCL4DNN_CONV_FUSED_ACTIV_POWER = 3
} ocl4dnnFusedActiv_t;
template<typename Dtype>
......@@ -86,11 +87,13 @@ class OCL4DNNConvSpatial
explicit OCL4DNNConvSpatial(OCL4DNNConvConfig config);
~OCL4DNNConvSpatial();
bool Forward(const UMat& bottom_data,
const UMat& bottom_data2,
const UMat& weight,
const UMat& bias,
UMat& top_data, int32_t batch_size);
void setActivReLU(bool fuse_activ, float slope);
void setActivPReLU(bool fuse_activ, std::vector<float> &slope);
void setActivPower(bool fuse_activ, float power);
void setBias(bool bias_term);
private:
......@@ -252,8 +255,8 @@ class OCL4DNNConvSpatial
int lx, int ly, int lz,
bool swizzle, bool nullLocal);
void generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems);
void setFusionDefine(ocl4dnnFusedActiv_t fused_activ);
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, ocl::Kernel &kernel, cl_uint &argIdx);
void setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise);
void setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx);
int32_t group_;
bool bias_term_;
......@@ -305,6 +308,8 @@ class OCL4DNNConvSpatial
float negative_slope_;
UMat negative_slope_umat_;
ocl4dnnFusedActiv_t fused_activ_;
float power_;
bool fused_eltwise_;
};
typedef enum {
......
......@@ -79,6 +79,8 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
group_ = config.group;
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
fused_eltwise_ = false;
power_ = 1.f;
negative_slope_ = 0;
prev_kernel_type_ = -1;
tuned_ = false;
......@@ -141,8 +143,11 @@ OCL4DNNConvSpatial<Dtype>::~OCL4DNNConvSpatial()
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ)
void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise)
{
if (fused_eltwise)
addDef("FUSED_CONV_ELTWISE", 1);
switch (fused_activ) {
case OCL4DNN_CONV_FUSED_ACTIV_RELU:
addDef("FUSED_CONV_RELU", 1);
......@@ -150,6 +155,9 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ)
case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
addDef("FUSED_CONV_PRELU", 1);
break;
case OCL4DNN_CONV_FUSED_ACTIV_POWER:
addDef("FUSED_CONV_POWER", 1);
break;
default:
;
}
......@@ -157,8 +165,11 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ)
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, ocl::Kernel &kernel, cl_uint &argIdx)
void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bool fused_eltwise, ocl::Kernel &kernel, cl_uint &argIdx)
{
if (fused_eltwise)
kernel.set(argIdx++, (cl_mem)bottom_data2_.handle(ACCESS_READ));
switch (fused_activ) {
case OCL4DNN_CONV_FUSED_ACTIV_RELU:
kernel.set(argIdx++, (float)negative_slope_);
......@@ -166,6 +177,9 @@ void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, oc
case OCL4DNN_CONV_FUSED_ACTIV_PRELU:
kernel.set(argIdx++, (cl_mem)negative_slope_umat_.handle(ACCESS_READ));
break;
case OCL4DNN_CONV_FUSED_ACTIV_POWER:
kernel.set(argIdx++, (float)power_);
break;
default:
;
}
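`setFusionDefine` and `setFusionArg` stay symmetric: whatever extra kernel parameters the defines introduce (`FUSED_CONV_ELTWISE` adds the second input buffer, the activation define adds its parameter) are set first, in the same order, before the regular convolution arguments. A hedged sketch of that ordering as a standalone helper; the names are illustrative, not the ocl4dnn API:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Hedged sketch of the argument layout produced by setFusionArg: the optional
// eltwise buffer comes first, then the activation parameter (negative slope,
// PReLU slope buffer, or Power exponent), and only then the regular
// convolution arguments.
enum FusedActiv { NONE, RELU, PRELU, POWER };

static std::vector<std::string> kernelArgOrder(FusedActiv activ, bool fused_eltwise)
{
    std::vector<std::string> args;
    if (fused_eltwise)
        args.push_back("eltwise_data");        // second bottom blob
    switch (activ) {
    case RELU:  args.push_back("negative_slope");   break;
    case PRELU: args.push_back("negative_slope[]"); break;
    case POWER: args.push_back("power");            break;
    default: break;
    }
    args.push_back("image_data, ...");         // regular conv arguments follow
    return args;
}

int main()
{
    for (const std::string& a : kernelArgOrder(POWER, true))
        std::printf("%s\n", a.c_str());        // eltwise_data, power, image_data, ...
    return 0;
}
```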
......@@ -255,7 +269,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size));
addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height));
addDef("APPLY_BIAS", bias_term_);
setFusionDefine(fused_activ_);
setFusionDefine(fused_activ_, fused_eltwise_);
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
}
......@@ -277,7 +291,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
addDef("APPLY_BIAS", bias_term_);
addDef("OUTPUT_Z", M_);
addDef("ZPAR", 1);
setFusionDefine(fused_activ_);
setFusionDefine(fused_activ_, fused_eltwise_);
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
}
......@@ -314,7 +328,7 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
addDef("TILE_N_LAST", M_ % 32);
addDef("TILE_N_LAST_DIV8", (M_ % 32) / 8);
addDef("APPLY_BIAS", bias_term_);
setFusionDefine(fused_activ_);
setFusionDefine(fused_activ_, fused_eltwise_);
src_ = ocl::dnn::conv_layer_spatial_oclsrc;
}
}
......@@ -370,14 +384,37 @@ void OCL4DNNConvSpatial<Dtype>::setActivPReLU(bool fuse_activ, std::vector<float
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
}
template<typename Dtype>
void OCL4DNNConvSpatial<Dtype>::setActivPower(bool fuse_activ, float power)
{
if ( fuse_activ )
{
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_POWER;
power_ = power;
}
else
fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
}
template<typename Dtype>
bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
const UMat& bottom2,
const UMat& weight,
const UMat& bias,
UMat& top,
int32_t numImages)
{
num_ = numImages;
if (!bottom2.empty())
{
fused_eltwise_ = true;
bottom_data2_ = bottom2;
}
else
{
fused_eltwise_ = false;
}
prepareKernel(bottom, top, weight, bias, numImages);
if (bestKernelConfig.empty())
return false;
......@@ -428,7 +465,8 @@ void OCL4DNNConvSpatial<Dtype>::generateKey()
<< "p" << pad_w_ << "x" << pad_h_ << "_"
<< "num" << num_ << "_"
<< "M" << M_ << "_"
<< "activ" << fused_activ_;
<< "activ" << fused_activ_ << "_"
<< "eltwise" << fused_eltwise_;
key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str();
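Because compiled kernels are cached per configuration, the key now also encodes whether eltwise fusion is active, so a fused and a non-fused variant of an otherwise identical convolution never share a cache entry. A minimal sketch of the extra suffix, with a hypothetical configuration:

```cpp
#include <cstdio>
#include <sstream>
#include <string>

int main()
{
    // Hypothetical configuration: Power activation fused together with eltwise.
    int fused_activ = 3;        // OCL4DNN_CONV_FUSED_ACTIV_POWER
    bool fused_eltwise = true;

    std::ostringstream keyBuilder;
    keyBuilder << "activ" << fused_activ << "_"
               << "eltwise" << fused_eltwise;

    // Prints "activ3_eltwise1"; a non-fused run would produce "activ3_eltwise0",
    // so the two kernels cache separately.
    std::printf("%s\n", keyBuilder.str().c_str());
    return 0;
}
```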
......@@ -678,7 +716,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, kernel, argIdx);
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
UMat img_buffer;
if (image_offset)
......@@ -771,7 +809,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, kernel, argIdx);
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
UMat img_buffer;
if (image_offset)
......@@ -888,7 +926,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
return false;
cl_uint argIdx = 0;
setFusionArg(fused_activ_, kernel, argIdx);
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
......@@ -1491,6 +1529,7 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
if (loadCachedConfig()) // check in-memory cache
return;
if (loadTunedConfig()) // check external storage
return;
......
......@@ -52,12 +52,21 @@
#elif defined(FUSED_CONV_PRELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope[c])))
#define NEGATIVE_SLOPE_ARG __global const Dtype *negative_slope,
#elif defined(FUSED_CONV_POWER)
#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, power)
#define NEGATIVE_SLOPE_ARG Dtype power,
#else
#define ACTIVATION_RELU_FUNCTION(x, c) (x)
#define NEGATIVE_SLOPE_ARG
#endif
#ifdef FUSED_CONV_ELTWISE
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(eltwise_data[(_offset_)] + (_data_), _channel_);} while(0)
#define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
#else
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_data_, _channel_);} while(0)
#define ELTWISE_DATA_ARG
#endif
#define __CAT(x, y) x##y
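On the kernel side the macros compose the same way: `ELTWISE_DATA_ARG` optionally prepends the second input buffer to the argument list, and `ACTIVATION_FUNCTION` first adds `eltwise_data[offset]` to the convolution result (when `FUSED_CONV_ELTWISE` is defined) and then applies whichever activation `FUSED_CONV_RELU`/`PRELU`/`POWER` selected. A host-side C++ rendering of the fully fused store, with made-up values:

```cpp
#include <cmath>
#include <cstdio>

// Hedged host-side rendering of what ACTIVATION_FUNCTION stores when both
// FUSED_CONV_ELTWISE and FUSED_CONV_POWER are defined:
//   dst[offset] = pow(eltwise_data[offset] + data, power)
int main()
{
    float eltwise_data[1] = { 2.f };   // residual input element (hypothetical)
    float dst[1];
    float data = 7.f;                  // convolution result for the same element
    float power = 0.5f;
    int offset = 0;

    dst[offset] = std::pow(eltwise_data[offset] + data, power);
    std::printf("%f\n", dst[offset]);  // 3.000000
    return 0;
}
```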
......@@ -99,6 +108,7 @@
#ifdef KERNEL_BASIC
__kernel void ConvolveBasic(
ELTWISE_DATA_ARG
NEGATIVE_SLOPE_ARG
__global Dtype* image_data,
int image_offset,
......@@ -193,6 +203,7 @@ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
#endif
__kernel void
convolve_simd(
ELTWISE_DATA_ARG
NEGATIVE_SLOPE_ARG
__global Dtype* inputs_base,
filter_qualifier Dtype* weights_base,
......@@ -413,6 +424,7 @@ typedef struct float0 { float s0; } float0; //never used but makes compiler happ
#define ROW_PITCH input_width
#define GEMM_LIKE_KERNEL_ARGS \
ELTWISE_DATA_ARG \
NEGATIVE_SLOPE_ARG \
const __global Dtype *src0, \
const __global Dtype *src1, \
......