提交 db32d125 编写于 作者: X xiaoli.liu@intel.com

enable restructure for first stage

上级 2d10ea34
...@@ -132,6 +132,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -132,6 +132,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::shared_ptr<mkldnn::memory> user_src_memory_p; std::shared_ptr<mkldnn::memory> user_src_memory_p;
std::shared_ptr<mkldnn::memory> dst_memory_p; std::shared_ptr<mkldnn::memory> dst_memory_p;
std::vector<primitive> pipeline; std::vector<primitive> pipeline;
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
std::shared_ptr<platform::ConvMKLDNNHandler> handler;
auto prim_key = key + "@conv_p"; auto prim_key = key + "@conv_p";
auto dst_key = key + "@dst_mem_p"; auto dst_key = key + "@dst_mem_p";
...@@ -139,144 +141,62 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -139,144 +141,62 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto user_src_key = key + "@user_src_mem_p"; auto user_src_key = key + "@user_src_mem_p";
auto src_reorder_key = key + "@src_mem_p" + "reorder_p"; auto src_reorder_key = key + "@src_mem_p" + "reorder_p";
conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(dev_ctx.GetBlob(prim_key)); conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(dev_ctx.GetBlob(prim_key));
auto src_memory_reorder_p = std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(src_reorder_key)); if(conv_p == nullptr){
src_memory_p = std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(src_key)); if(is_INT8){
if(src_memory_reorder_p){ CreateINT8Primitive(ctx, is_test, dev_ctx, mkldnn_engine, input, filter,
user_src_memory_p = std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(user_src_key)); bias, output,
user_src_memory_p->set_data_handle(to_void_cast<T>(input_data)); strides, paddings,
} else if(src_memory_p){ dilations, fuse_relu,
src_memory_p->set_data_handle(to_void_cast<T>(input_data)); fuse_residual_conn,// input_data,
} filter_data, src_tz,
weights_tz, g,
dst_memory_p = std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(dst_key)); dst_tz, key,
dst_memory_p,
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd; pipeline,
conv_pd = std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(dev_ctx.GetBlob(key_conv_pd)); key_conv_pd,
std::shared_ptr<platform::ConvMKLDNNHandler> handler; src_memory_p,
if(conv_pd){ user_src_memory_p,
handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key)); conv_p,
} conv_pd,
if (!is_INT8 && dst_memory_p){ handler,
if (fuse_residual_conn) { force_fp32_output);
auto residual_param = ctx.Input<Tensor>("ResidualData"); }else{
auto residual_param_data = residual_param->data<T>(); CreateFP32Primitive(ctx, is_test, dev_ctx, mkldnn_engine, input, filter,
if (residual_param->format() != handler->GetDstFormat()) { bias, output,
auto output_data = strides, paddings,
output->mutable_data<T>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); dilations, fuse_relu,
auto residual_data_tz = fuse_residual_conn, //input_data,
paddle::framework::vectorize2int(residual_param->dims()); filter_data, src_tz,
auto residual_data_type = weights_tz, g,
paddle::framework::ToMKLDNNDataType(residual_param->type()); dst_tz, key,
dst_memory_p,
auto user_residual_md = platform::MKLDNNMemDesc( pipeline,
residual_data_tz, residual_data_type, residual_param->format()); key_conv_pd,
auto user_residual_memory_p = handler->AcquireResidualDataMemory( src_memory_p,
user_residual_md, to_void_cast<T>(residual_param_data)); user_src_memory_p,
dst_memory_p = handler->AcquireDstMemoryFromResidualDataMemory( conv_p,
user_residual_memory_p, to_void_cast<T>(output_data), pipeline); conv_pd,
} else { handler);
output->ShareDataWith(*residual_param);
auto output_data = output->mutable_data<T>(ctx.GetPlace());
dst_memory_p->set_data_handle(to_void_cast<T>(output_data));
}
} else {
auto output_data =
output->mutable_data<T>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
dst_memory_p->set_data_handle(to_void_cast<T>(output_data));
} }
} else if(is_INT8 && dst_memory_p){ } else {
if(fuse_residual_conn) { auto src_memory_reorder_p = std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(src_reorder_key));
auto residual_param = ctx.Input<Tensor>("ResidualData"); src_memory_p = std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(src_key));
auto residual_dt = paddle::framework::ToMKLDNNDataType(residual_param->type()); if(src_memory_reorder_p){
output->ShareDataWith(*residual_param); user_src_memory_p = std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(user_src_key));
if(residual_dt == mkldnn::memory::data_type::u8){ user_src_memory_p->set_data_handle(to_void_cast<T>(input_data));
uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace()); } else if(src_memory_p){
dst_memory_p->set_data_handle(to_void_cast<uint8_t>(output_data)); src_memory_p->set_data_handle(to_void_cast<T>(input_data));
} else{
int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace());
dst_memory_p->set_data_handle(to_void_cast<int8_t>(output_data));
}
} else if(!force_fp32_output){
if(fuse_relu){
uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
dst_memory_p->set_data_handle(to_void_cast<uint8_t>(output_data));
} else{
int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
dst_memory_p->set_data_handle(to_void_cast<int8_t>(output_data));
}
} else {
float* output_data = output->mutable_data<float>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
dst_memory_p->set_data_handle(to_void_cast<float>(output_data));
} }
}
dst_memory_p = std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(dst_key));
if(!is_INT8){ conv_pd = std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(dev_ctx.GetBlob(key_conv_pd));
if(conv_p == nullptr){ if(conv_pd){
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(),
(g == 1) ? mkldnn::memory::format::oihw : mkldnn::memory::format::goihw);
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
* the memory format preferred for best performance
*/
std::string data_format = ctx.Attr<std::string>("data_format");
auto chosen_memory_format =
platform::data_format_to_memory_format(data_format);
auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc(
weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
std::vector<int> bias_tz; // TODO(mgallus): avoid empty vector creation.
// Currently used whenever bias is != nullptr.
auto dst_md = platform::MKLDNNMemDesc(
dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
// create a conv primitive descriptor and save it for usage in backward
if (bias) {
bias_tz = paddle::framework::vectorize2int(bias->dims());
auto bias_md = platform::MKLDNNMemDesc(
bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
strides, paddings, mkldnn_engine,
fuse_relu, fuse_residual_conn, is_test);
} else {
conv_pd =
ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
mkldnn_engine, fuse_relu, fuse_residual_conn, is_test);
}
// Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd);
handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key)); handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key));
}
// create mkldnn memory from input tensors (data/weights) if (!is_INT8){
user_src_memory_p =
handler->AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
auto user_weights_memory_p = handler->AcquireWeightsMemory(
user_weights_md, to_void_cast<float>(filter_data));
// create reorder primitive if the input format is not the preferred one
src_memory_p =
handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
auto weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive(
user_weights_memory_p, pipeline, is_test);
if (fuse_residual_conn) { if (fuse_residual_conn) {
auto residual_param = ctx.Input<Tensor>("ResidualData"); auto residual_param = ctx.Input<Tensor>("ResidualData");
auto residual_param_data = residual_param->data<T>(); auto residual_param_data = residual_param->data<T>();
PADDLE_ENFORCE(
residual_param_data != nullptr,
"Provide data if you want MKLDNN conv+elementwise_add fusion");
PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
"Output and elementwise parameter need to have the "
"same dimension sizes");
if (residual_param->format() != handler->GetDstFormat()) { if (residual_param->format() != handler->GetDstFormat()) {
auto output_data = auto output_data =
output->mutable_data<T>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); output->mutable_data<T>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
...@@ -284,7 +204,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -284,7 +204,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
paddle::framework::vectorize2int(residual_param->dims()); paddle::framework::vectorize2int(residual_param->dims());
auto residual_data_type = auto residual_data_type =
paddle::framework::ToMKLDNNDataType(residual_param->type()); paddle::framework::ToMKLDNNDataType(residual_param->type());
auto user_residual_md = platform::MKLDNNMemDesc( auto user_residual_md = platform::MKLDNNMemDesc(
residual_data_tz, residual_data_type, residual_param->format()); residual_data_tz, residual_data_type, residual_param->format());
auto user_residual_memory_p = handler->AcquireResidualDataMemory( auto user_residual_memory_p = handler->AcquireResidualDataMemory(
...@@ -294,255 +214,395 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -294,255 +214,395 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
} else { } else {
output->ShareDataWith(*residual_param); output->ShareDataWith(*residual_param);
auto output_data = output->mutable_data<T>(ctx.GetPlace()); auto output_data = output->mutable_data<T>(ctx.GetPlace());
dst_memory_p = dst_memory_p->set_data_handle(to_void_cast<T>(output_data));
handler->AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
} }
} else { } else {
auto output_data = auto output_data =
output->mutable_data<T>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); output->mutable_data<T>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
dst_memory_p = dst_memory_p->set_data_handle(to_void_cast<T>(output_data));
handler->AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
}
// create convolution op primitive
if (bias) {
const T* bias_data = bias->data<T>();
auto user_bias_md = platform::MKLDNNMemDesc(
{bias_tz}, platform::MKLDNNGetDataType<T>(), memory::format::x);
auto user_bias_memory_p =
handler->AcquireBiasMemory(user_bias_md, to_void_cast<T>(bias_data));
auto bias_memory_p =
handler->AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline, is_test);
conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p,
bias_memory_p, dst_memory_p);
} else {
conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p,
dst_memory_p);
}
// push primitive to stream and wait until it's executed
pipeline.push_back(*conv_p);
stream(stream::kind::eager).submit(pipeline).wait();
output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetMKLDNNFormat(*dst_memory_p));
} else {
if(src_memory_reorder_p){
pipeline.push_back(*src_memory_reorder_p);
}
pipeline.push_back(*conv_p);
stream(stream::kind::eager).submit(pipeline).wait();
output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetMKLDNNFormat(*dst_memory_p));
}
} else{
if(conv_p == nullptr){
auto* scale_in = ctx.HasInput("Scale_in") ? ctx.Input<Tensor>("Scale_in") : nullptr;
auto* scale_in_eltwise = ctx.HasInput("Scale_in_eltwise")? ctx.Input<Tensor>("Scale_in_eltwise") : nullptr;
auto* scale_weights = ctx.HasInput("Scale_weights")? ctx.Input<Tensor>("Scale_weights") : nullptr;
auto* scale_out = ctx.HasInput("Scale_out")? ctx.Input<Tensor>("Scale_out") : nullptr;
bool is_multi_channel = (scale_weights->memory_size() > 1) ? true : false;
auto scale_in_key = key + "@scale_in";
auto scale_weights_key = key + "@scale_weights";
auto scale_out_key = key + "@scale_out";
auto output_shift_scale_key = key + "@output_shift_scale";
auto sum_scale_key = key + "@sum_scale";
auto scale_in_eltwise_key = key + "@scale_in_eltwise";
std::vector<float> scale_in_data;
std::vector<float> scale_out_data = {1.0f};
std::vector<float> scale_weights_data;
std::vector<float> scale_in_eltwise_data;
std::vector<float> output_shift_scale;
std::vector<float> sum_scale = {1.0f};
std::vector<float> none_scale = {0};
int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1;
scale_in_data = {*(scale_in->data<float>())};
scale_weights_data.resize(count);
#pragma omp parallel for if (count > 1)
for(int i=0; i<count; i++){
scale_weights_data[i] =*(scale_weights->data<float>() + i);
}
if(!force_fp32_output)
scale_out_data = {*(scale_out->data<float>())};
output_shift_scale.resize(count);
#pragma omp parallel for if (count > 1)
for(int i=0; i<count; i++){
if(scale_weights_data[i] == 0.0)
output_shift_scale[i] = scale_out_data[0];
else
output_shift_scale[i] = scale_out_data[0] / (scale_in_data[0] * scale_weights_data[i]);
}
if(fuse_residual_conn){
scale_in_eltwise_data = {*(scale_in_eltwise->data<float>())};
sum_scale[0] = scale_out_data[0] / scale_in_eltwise_data[0];
} }
} else if(is_INT8){
std::vector<primitive> pipeline;
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, paddle::framework::ToMKLDNNDataType(input->type()), input->format());
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<float>(),
(g == 1) ? mkldnn::memory::format::oihw : mkldnn::memory::format::goihw);
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
* the memory format preferred for best performance
*/
std::string data_format = ctx.Attr<std::string>("data_format");
auto chosen_memory_format =
platform::data_format_to_memory_format(data_format);
auto bias_tz = paddle::framework::vectorize2int(bias->dims());
auto src_md = platform::MKLDNNMemDesc(
src_tz, memory::data_type::u8, chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc(
weights_tz, memory::data_type::s8, chosen_memory_format);
auto dst_dt = fuse_relu?
paddle::framework::ToMKLDNNDataType(std::type_index(typeid(unsigned char)))
: paddle::framework::ToMKLDNNDataType(std::type_index(typeid(signed char)));
if(force_fp32_output){
dst_dt = paddle::framework::ToMKLDNNDataType(std::type_index(typeid(float)));
}
if(fuse_residual_conn){
auto residual = ctx.Input<Tensor>("ResidualData");
auto residual_dt = paddle::framework::ToMKLDNNDataType(residual->type());
if(dst_dt != residual_dt)
dst_dt = residual_dt;
}
auto dst_md = platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format);
// create a conv primitive descriptor and save it for usage in backward
if (bias) {
auto bias_md = platform::MKLDNNMemDesc(
bias_tz, memory::data_type::s32, memory::format::x);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
strides, paddings, mkldnn_engine,
fuse_relu, fuse_residual_conn,
output_shift_scale, sum_scale[0], is_test);
} else {
conv_pd =
ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
mkldnn_engine, fuse_relu, fuse_residual_conn,
output_shift_scale, sum_scale[0], is_test);
}
// Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd);
handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key));
// create mkldnn memory from input tensors (data/weights)
user_src_memory_p =
handler->AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
auto user_weights_memory_p = handler->AcquireWeightsMemory(
user_weights_md, to_void_cast<float>(filter_data));
// create reorder primitive if the input format is not the preferred one
src_memory_p =
handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
std::shared_ptr<mkldnn::memory> weights_memory_p;
int mask_reorder = is_multi_channel? ((g!= 1) ? (1<<1)+(1<<0) : 1<<0) : 0;
weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive(
user_weights_memory_p, pipeline, is_test, is_INT8, scale_weights_data, mask_reorder);
if(fuse_residual_conn) { if(fuse_residual_conn) {
auto residual_param = ctx.Input<Tensor>("ResidualData"); auto residual_param = ctx.Input<Tensor>("ResidualData");
PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
"Output and elementwise parameter need to have the "
"same dimension sizes");
auto residual_dt = paddle::framework::ToMKLDNNDataType(residual_param->type()); auto residual_dt = paddle::framework::ToMKLDNNDataType(residual_param->type());
PADDLE_ENFORCE_EQ(residual_param->format(), handler->GetDstFormat(),
"Conv input dimension and filter dimension should be the same.");
output->ShareDataWith(*residual_param); output->ShareDataWith(*residual_param);
if(residual_dt == mkldnn::memory::data_type::u8){ if(residual_dt == mkldnn::memory::data_type::u8){
uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace()); uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace());
dst_memory_p = dst_memory_p->set_data_handle(to_void_cast<uint8_t>(output_data));
handler->AcquireDstMemoryFromPrimitive(to_void_cast<uint8_t>(output_data));
} else{ } else{
int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace()); int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace());
dst_memory_p = dst_memory_p->set_data_handle(to_void_cast<int8_t>(output_data));
handler->AcquireDstMemoryFromPrimitive(to_void_cast<int8_t>(output_data));
} }
} else if(!force_fp32_output){ } else if(!force_fp32_output){
if(fuse_relu){ if(fuse_relu){
uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
dst_memory_p = dst_memory_p->set_data_handle(to_void_cast<uint8_t>(output_data));
handler->AcquireDstMemoryFromPrimitive(to_void_cast<uint8_t>(output_data));
} else{ } else{
int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
dst_memory_p = dst_memory_p->set_data_handle(to_void_cast<int8_t>(output_data));
handler->AcquireDstMemoryFromPrimitive(to_void_cast<int8_t>(output_data));
} }
} else { } else {
float* output_data = output->mutable_data<float>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize()); float* output_data = output->mutable_data<float>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
dst_memory_p = dst_memory_p->set_data_handle(to_void_cast<float>(output_data));
handler->AcquireDstMemoryFromPrimitive(to_void_cast<float>(output_data));
} }
}
// create convolution op primitive if(src_memory_reorder_p){
std::vector<float> scale_bias_data; pipeline.push_back(*src_memory_reorder_p);
auto scale_bias_key = key + "@scale_bias"; }
if (bias) { pipeline.push_back(*conv_p);
const float* bias_data = bias->data<float>(); }
auto user_bias_md = platform::MKLDNNMemDesc(
{bias_tz}, platform::MKLDNNGetDataType<float>(), memory::format::x); // push primitive to stream and wait until it's executed
auto user_bias_memory_p = //pipeline.push_back(*conv_p);
handler->AcquireBiasMemory(user_bias_md, to_void_cast<float>(bias_data)); stream(stream::kind::eager).submit(pipeline).wait();
std::shared_ptr<mkldnn::memory> bias_memory_p;
int mask_reorder = is_multi_channel? 1<<0 : 1; if (need_s8_to_u8) {
int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1; output->mutable_data<uint8_t>(ctx.GetPlace());
scale_bias_data.resize(count); }
#pragma omp parallel for if (count > 1)
for(int i=0; i<count; i++){ output->set_layout(DataLayout::kMKLDNN);
scale_bias_data[i] = scale_in_data[0] * scale_weights_data[i]; output->set_format(GetMKLDNNFormat(*dst_memory_p));
} };
bias_memory_p =
handler->AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline, is_test, is_INT8, scale_bias_data, mask_reorder); private:
conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, void CreateFP32Primitive(
bias_memory_p, dst_memory_p); paddle::framework::ExecutionContext ctx, bool is_test,
const paddle::platform::MKLDNNDeviceContext& dev_ctx,
const mkldnn::engine& mkldnn_engine,
const paddle::framework::Tensor* input, const paddle::framework::Tensor* filter,
const paddle::framework::Tensor* bias, paddle::framework::Tensor* output,
std::vector<int> strides, std::vector<int> paddings,
std::vector<int> dilations, bool fuse_relu,
bool fuse_residual_conn, //const T* input_data,
const float* filter_data, std::vector<int> src_tz,
std::vector<int> weights_tz, int g,
std::vector<int> dst_tz, const std::string key,
std::shared_ptr<mkldnn::memory> &dst_memory_p,
std::vector<primitive>& pipeline,
const std::string &key_conv_pd,
std::shared_ptr<mkldnn::memory> src_memory_p,
std::shared_ptr<mkldnn::memory> user_src_memory_p,
std::shared_ptr<mkldnn::convolution_forward> conv_p,
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
std::shared_ptr<platform::ConvMKLDNNHandler> handler) const{
const T* input_data = input->data<T>();
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(),
(g == 1) ? mkldnn::memory::format::oihw : mkldnn::memory::format::goihw);
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
* the memory format preferred for best performance
*/
std::string data_format = ctx.Attr<std::string>("data_format");
auto chosen_memory_format =
platform::data_format_to_memory_format(data_format);
auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc(
weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
std::vector<int> bias_tz; // TODO(mgallus): avoid empty vector creation.
// Currently used whenever bias is != nullptr.
auto dst_md = platform::MKLDNNMemDesc(
dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
// create a conv primitive descriptor and save it for usage in backward
if (bias) {
bias_tz = paddle::framework::vectorize2int(bias->dims());
auto bias_md = platform::MKLDNNMemDesc(
bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
strides, paddings, mkldnn_engine,
fuse_relu, fuse_residual_conn, is_test);
} else {
conv_pd =
ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
mkldnn_engine, fuse_relu, fuse_residual_conn, is_test);
}
// Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd);
handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key));
// create mkldnn memory from input tensors (data/weights)
user_src_memory_p =
handler->AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
auto user_weights_memory_p = handler->AcquireWeightsMemory(
user_weights_md, to_void_cast<float>(filter_data));
// create reorder primitive if the input format is not the preferred one
src_memory_p =
handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
auto weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive(
user_weights_memory_p, pipeline, is_test);
if (fuse_residual_conn) {
auto residual_param = ctx.Input<Tensor>("ResidualData");
auto residual_param_data = residual_param->data<T>();
PADDLE_ENFORCE(
residual_param_data != nullptr,
"Provide data if you want MKLDNN conv+elementwise_add fusion");
PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
"Output and elementwise parameter need to have the "
"same dimension sizes");
if (residual_param->format() != handler->GetDstFormat()) {
auto output_data =
output->mutable_data<T>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
auto residual_data_tz =
paddle::framework::vectorize2int(residual_param->dims());
auto residual_data_type =
paddle::framework::ToMKLDNNDataType(residual_param->type());
auto user_residual_md = platform::MKLDNNMemDesc(
residual_data_tz, residual_data_type, residual_param->format());
auto user_residual_memory_p = handler->AcquireResidualDataMemory(
user_residual_md, to_void_cast<T>(residual_param_data));
dst_memory_p = handler->AcquireDstMemoryFromResidualDataMemory(
user_residual_memory_p, to_void_cast<T>(output_data), pipeline);
} else { } else {
conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, output->ShareDataWith(*residual_param);
dst_memory_p); auto output_data = output->mutable_data<T>(ctx.GetPlace());
dst_memory_p =
handler->AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
} }
} else {
auto output_data =
output->mutable_data<T>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
dst_memory_p =
handler->AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
}
// create convolution op primitive
if (bias) {
const T* bias_data = bias->data<T>();
auto user_bias_md = platform::MKLDNNMemDesc(
{bias_tz}, platform::MKLDNNGetDataType<T>(), memory::format::x);
auto user_bias_memory_p =
handler->AcquireBiasMemory(user_bias_md, to_void_cast<T>(bias_data));
auto bias_memory_p =
handler->AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline, is_test);
conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p,
bias_memory_p, dst_memory_p);
} else {
conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p,
dst_memory_p);
}
// push primitive to stream and wait until it's executed
pipeline.push_back(*conv_p);
};
void CreateINT8Primitive(
const paddle::framework::ExecutionContext& ctx, bool is_test,
const paddle::platform::MKLDNNDeviceContext & dev_ctx,
const mkldnn::engine & mkldnn_engine,
const paddle::framework::Tensor* input, const paddle::framework::Tensor* filter,
const paddle::framework::Tensor* bias, paddle::framework::Tensor* output,
std::vector<int> strides, std::vector<int> paddings,
std::vector<int> dilations, bool fuse_relu,
bool fuse_residual_conn,// const T* input_data,
const float* filter_data, std::vector<int> src_tz,
std::vector<int> weights_tz, int g,
std::vector<int> dst_tz, const std::string key,
std::shared_ptr<mkldnn::memory>& dst_memory_p,
std::vector<primitive>& pipeline,
const std::string &key_conv_pd,
std::shared_ptr<mkldnn::memory> src_memory_p,
std::shared_ptr<mkldnn::memory> user_src_memory_p,
std::shared_ptr<mkldnn::convolution_forward> conv_p,
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
std::shared_ptr<platform::ConvMKLDNNHandler> handler,
bool force_fp32_output) const {
const T* input_data = input->data<T>();
bool is_INT8 = true;
auto* scale_in = ctx.HasInput("Scale_in") ? ctx.Input<Tensor>("Scale_in") : nullptr;
auto* scale_in_eltwise = ctx.HasInput("Scale_in_eltwise")? ctx.Input<Tensor>("Scale_in_eltwise") : nullptr;
auto* scale_weights = ctx.HasInput("Scale_weights")? ctx.Input<Tensor>("Scale_weights") : nullptr;
auto* scale_out = ctx.HasInput("Scale_out")? ctx.Input<Tensor>("Scale_out") : nullptr;
bool is_multi_channel = (scale_weights->memory_size() > 1) ? true : false;
auto scale_in_key = key + "@scale_in";
auto scale_weights_key = key + "@scale_weights";
auto scale_out_key = key + "@scale_out";
auto output_shift_scale_key = key + "@output_shift_scale";
auto sum_scale_key = key + "@sum_scale";
auto scale_in_eltwise_key = key + "@scale_in_eltwise";
std::vector<float> scale_in_data;
std::vector<float> scale_out_data = {1.0f};
std::vector<float> scale_weights_data;
std::vector<float> scale_in_eltwise_data;
std::vector<float> output_shift_scale;
std::vector<float> sum_scale = {1.0f};
std::vector<float> none_scale = {0};
int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1;
scale_in_data = {*(scale_in->data<float>())};
scale_weights_data.resize(count);
#pragma omp parallel for if (count > 1)
for(int i=0; i<count; i++){
scale_weights_data[i] =*(scale_weights->data<float>() + i);
}
if(!force_fp32_output)
scale_out_data = {*(scale_out->data<float>())};
output_shift_scale.resize(count);
#pragma omp parallel for if (count > 1)
for(int i=0; i<count; i++){
if(scale_weights_data[i] == 0.0)
output_shift_scale[i] = scale_out_data[0];
else
output_shift_scale[i] = scale_out_data[0] / (scale_in_data[0] * scale_weights_data[i]);
}
if(fuse_residual_conn){
scale_in_eltwise_data = {*(scale_in_eltwise->data<float>())};
sum_scale[0] = scale_out_data[0] / scale_in_eltwise_data[0];
}
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, paddle::framework::ToMKLDNNDataType(input->type()), input->format());
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<float>(),
(g == 1) ? mkldnn::memory::format::oihw : mkldnn::memory::format::goihw);
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
* the memory format preferred for best performance
*/
std::string data_format = ctx.Attr<std::string>("data_format");
auto chosen_memory_format =
platform::data_format_to_memory_format(data_format);
auto bias_tz = paddle::framework::vectorize2int(bias->dims());
auto src_md = platform::MKLDNNMemDesc(
src_tz, memory::data_type::u8, chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc(
weights_tz, memory::data_type::s8, chosen_memory_format);
// push primitive to stream and wait until it's executed auto dst_dt = fuse_relu?
pipeline.push_back(*conv_p); paddle::framework::ToMKLDNNDataType(std::type_index(typeid(unsigned char)))
stream(stream::kind::eager).submit(pipeline).wait(); : paddle::framework::ToMKLDNNDataType(std::type_index(typeid(signed char)));
if(need_s8_to_u8){ if(force_fp32_output){
output->mutable_data<uint8_t>(ctx.GetPlace()); dst_dt = paddle::framework::ToMKLDNNDataType(std::type_index(typeid(float)));
} }
output->set_layout(DataLayout::kMKLDNN); if(fuse_residual_conn){
output->set_format(GetMKLDNNFormat(*dst_memory_p)); auto residual = ctx.Input<Tensor>("ResidualData");
auto residual_dt = paddle::framework::ToMKLDNNDataType(residual->type());
if(dst_dt != residual_dt)
dst_dt = residual_dt;
}
auto dst_md = platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format);
// create a conv primitive descriptor and save it for usage in backward
if (bias) {
auto bias_md = platform::MKLDNNMemDesc(
bias_tz, memory::data_type::s32, memory::format::x);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
strides, paddings, mkldnn_engine,
fuse_relu, fuse_residual_conn,
output_shift_scale, sum_scale[0], is_test);
} else { } else {
if(src_memory_reorder_p){ conv_pd =
pipeline.push_back(*src_memory_reorder_p); ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
mkldnn_engine, fuse_relu, fuse_residual_conn,
output_shift_scale, sum_scale[0], is_test);
}
// Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd);
handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key));
// create mkldnn memory from input tensors (data/weights)
user_src_memory_p =
handler->AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
auto user_weights_memory_p = handler->AcquireWeightsMemory(
user_weights_md, to_void_cast<float>(filter_data));
// create reorder primitive if the input format is not the preferred one
src_memory_p =
handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
std::shared_ptr<mkldnn::memory> weights_memory_p;
int mask_reorder = is_multi_channel? ((g!= 1) ? (1<<1)+(1<<0) : 1<<0) : 0;
weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive(
user_weights_memory_p, pipeline, is_test, is_INT8, scale_weights_data, mask_reorder);
if(fuse_residual_conn) {
auto residual_param = ctx.Input<Tensor>("ResidualData");
PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
"Output and elementwise parameter need to have the "
"same dimension sizes");
auto residual_dt = paddle::framework::ToMKLDNNDataType(residual_param->type());
PADDLE_ENFORCE_EQ(residual_param->format(), handler->GetDstFormat(),
"Conv input dimension and filter dimension should be the same.");
output->ShareDataWith(*residual_param);
if(residual_dt == mkldnn::memory::data_type::u8){
uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace());
dst_memory_p =
handler->AcquireDstMemoryFromPrimitive(to_void_cast<uint8_t>(output_data));
} else{
int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace());
dst_memory_p =
handler->AcquireDstMemoryFromPrimitive(to_void_cast<int8_t>(output_data));
} }
pipeline.push_back(*conv_p); } else if(!force_fp32_output){
stream(stream::kind::eager).submit(pipeline).wait(); if(fuse_relu){
uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
if (need_s8_to_u8) { dst_memory_p =
output->mutable_data<uint8_t>(ctx.GetPlace()); handler->AcquireDstMemoryFromPrimitive(to_void_cast<uint8_t>(output_data));
} else{
int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
dst_memory_p =
handler->AcquireDstMemoryFromPrimitive(to_void_cast<int8_t>(output_data));
} }
} else {
float* output_data = output->mutable_data<float>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler->GetDstMemorySize());
dst_memory_p =
handler->AcquireDstMemoryFromPrimitive(to_void_cast<float>(output_data));
}
output->set_layout(DataLayout::kMKLDNN); // create convolution op primitive
output->set_format(GetMKLDNNFormat(*dst_memory_p)); std::vector<float> scale_bias_data;
auto scale_bias_key = key + "@scale_bias";
if (bias) {
const float* bias_data = bias->data<float>();
auto user_bias_md = platform::MKLDNNMemDesc(
{bias_tz}, platform::MKLDNNGetDataType<float>(), memory::format::x);
auto user_bias_memory_p =
handler->AcquireBiasMemory(user_bias_md, to_void_cast<float>(bias_data));
std::shared_ptr<mkldnn::memory> bias_memory_p;
int mask_reorder = is_multi_channel? 1<<0 : 1;
int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1;
scale_bias_data.resize(count);
#pragma omp parallel for if (count > 1)
for(int i=0; i<count; i++){
scale_bias_data[i] = scale_in_data[0] * scale_weights_data[i];
}
bias_memory_p =
handler->AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline, is_test, is_INT8, scale_bias_data, mask_reorder);
conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p,
bias_memory_p, dst_memory_p);
} else {
conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p,
dst_memory_p);
} }
}
}
private:
// push primitive to stream and wait until it's executed
pipeline.push_back(*conv_p);
};
void AppendKey(std::string& key, mkldnn::memory::dims& input_dims, // NOLINT void AppendKey(std::string& key, mkldnn::memory::dims& input_dims, // NOLINT
mkldnn::memory::dims& weights_dims, // NOLINT mkldnn::memory::dims& weights_dims, // NOLINT
std::vector<int>& strides, // NOLINT std::vector<int>& strides, // NOLINT
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册