Commit 1f8afa6f authored by: xiaolil1

enable initialization for INT8

Parent 9dead9a2
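The change below caches the INT8 convolution primitive and its source/destination memories in the device context under string keys (key + "@conv_p", "@src_mem_p", "@dst_mem_p"). On later calls the kernel only refreshes the input data handle and re-submits the cached primitive instead of rebuilding scales, reorders, and the primitive descriptor. The following is a minimal sketch of that create-once / reuse-by-key flow; it is not the PaddlePaddle API. A plain std::unordered_map stands in for the framework's dev_ctx.GetBlob()/SetBlob() blob store, and ConvPrimitive, BlobMap, and RunConv are hypothetical stand-ins introduced only for illustration.

#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

// Hypothetical stand-in for mkldnn::convolution_forward; only the cached-reuse
// flow matters here, not the real primitive.
struct ConvPrimitive {
  const float* src = nullptr;  // data handle, refreshed on every call
  void Execute() const { std::cout << "run conv on " << src << "\n"; }
};

// Stand-in for the device context's string-keyed blob map (dev_ctx.GetBlob/SetBlob).
using BlobMap = std::unordered_map<std::string, std::shared_ptr<void>>;

void RunConv(BlobMap& dev_ctx, const std::string& key, const float* input) {
  auto prim_key = key + "@conv_p";
  std::shared_ptr<ConvPrimitive> conv_p;
  auto it = dev_ctx.find(prim_key);
  if (it != dev_ctx.end()) {
    conv_p = std::static_pointer_cast<ConvPrimitive>(it->second);
  }
  if (conv_p == nullptr) {
    // First run for this key: pay the full initialization cost once
    // (in the real kernel: scales, reorders, primitive descriptor, memories).
    conv_p = std::make_shared<ConvPrimitive>();
    dev_ctx[prim_key] = conv_p;
  }
  // Every run: only point the cached primitive at the new input data.
  conv_p->src = input;
  conv_p->Execute();
}

int main() {
  BlobMap dev_ctx;
  float batch0[8] = {};
  float batch1[8] = {};
  RunConv(dev_ctx, "conv1", batch0);  // builds and caches the primitive
  RunConv(dev_ctx, "conv1", batch1);  // reuses it; only the handle changes
  return 0;
}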
...
@@ -496,222 +496,249 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       output->set_format(GetMKLDNNFormat(*dst_memory_p));
     } else{
+      bool need_s8_to_u8 = false;
+      if (fuse_residual_conn && fuse_relu) {
+        need_s8_to_u8 = true;
+      }
+
+      std::shared_ptr<mkldnn::convolution_forward> conv_p;
+      std::shared_ptr<mkldnn::memory> src_memory_p;
+      std::shared_ptr<mkldnn::memory> dst_memory_p;
+      std::vector<primitive> pipeline;
+
+      auto prim_key = key + "@conv_p";
+      auto dst_key = key + "@dst_mem_p";
+      auto src_key = key + "@src_mem_p";
+      conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(dev_ctx.GetBlob(prim_key));
+      src_memory_p = std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(src_key));
+      dst_memory_p = std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(dst_key));
+
+      if (src_memory_p) {
+        src_memory_p->set_data_handle(to_void_cast<T>(input_data));
+      }
+
+      if(conv_p == nullptr){
       auto* scale_in = ctx.HasInput("Scale_in") ? ctx.Input<Tensor>("Scale_in") : nullptr;
       auto* scale_in_eltwise = ctx.HasInput("Scale_in_eltwise") ? ctx.Input<Tensor>("Scale_in_eltwise") : nullptr;
       auto* scale_weights = ctx.HasInput("Scale_weights") ? ctx.Input<Tensor>("Scale_weights") : nullptr;
       auto* scale_out = ctx.HasInput("Scale_out") ? ctx.Input<Tensor>("Scale_out") : nullptr;
 
       bool is_multi_channel = (scale_weights->memory_size() > 1) ? true : false;
       static std::unordered_map<std::string, std::vector<float>> scale_map;
       bool scale_reuse = true;
       auto scale_in_key = key + "@scale_in";
       auto scale_weights_key = key + "@scale_weights";
       auto scale_out_key = key + "@scale_out";
       auto output_shift_scale_key = key + "@output_shift_scale";
       auto sum_scale_key = key + "@sum_scale";
       auto scale_in_eltwise_key = key + "@scale_in_eltwise";
       std::vector<float> scale_in_data;
       std::vector<float> scale_out_data;
       std::vector<float> scale_weights_data;
       std::vector<float> scale_in_eltwise_data;
       std::vector<float> output_shift_scale;
       std::vector<float> sum_scale = {1.0f};
       std::vector<float> none_scale = {0};
       if (GetScaleMap(scale_map, scale_in_key) == none_scale){
         scale_reuse = false;
       }
 
       if(!scale_reuse){
         int count = is_multi_channel ? (g > 1 ? weights_tz[1] * weights_tz[0] : weights_tz[0]) : 1;
         scale_in_data = {*(scale_in->data<float>())};
         scale_weights_data.resize(count);
         #pragma omp parallel for if (count > 1)
         for(int i = 0; i < count; i++){
           scale_weights_data[i] = *(scale_weights->data<float>() + i);
         }
         scale_out_data = {*(scale_out->data<float>())};
         output_shift_scale.resize(count);
         #pragma omp parallel for if (count > 1)
         for(int i = 0; i < count; i++){
           if(scale_weights_data[i] == 0.0)
             output_shift_scale[i] = scale_out_data[0];
           else
             output_shift_scale[i] = scale_out_data[0] / (scale_in_data[0] * scale_weights_data[i]);
         }
         if(fuse_residual_conn){
           scale_in_eltwise_data = {*(scale_in_eltwise->data<float>())};
           sum_scale[0] = scale_out_data[0] / scale_in_eltwise_data[0];
           SetScaleMap(scale_map, scale_in_eltwise_key, scale_in_eltwise_data);
         }
 
         // scale reuse
         SetScaleMap(scale_map, scale_in_key, scale_in_data);
         SetScaleMap(scale_map, scale_weights_key, scale_weights_data);
         SetScaleMap(scale_map, scale_out_key, scale_out_data);
         SetScaleMap(scale_map, output_shift_scale_key, output_shift_scale);
         SetScaleMap(scale_map, sum_scale_key, sum_scale);
       } else{
         scale_in_data = GetScaleMap(scale_map, scale_in_key);
         scale_out_data = GetScaleMap(scale_map, scale_out_key);
         scale_weights_data = GetScaleMap(scale_map, scale_weights_key);
         if(fuse_residual_conn){
           scale_in_eltwise_data = GetScaleMap(scale_map, scale_in_eltwise_key);
         }
         output_shift_scale = GetScaleMap(scale_map, output_shift_scale_key);
         sum_scale = GetScaleMap(scale_map, sum_scale_key);
       }
 
       std::vector<primitive> pipeline;
       auto user_src_md = platform::MKLDNNMemDesc(
           {src_tz}, paddle::framework::ToMKLDNNDataType(input->type()), input->format());
       auto user_weights_md = platform::MKLDNNMemDesc(
           {weights_tz}, platform::MKLDNNGetDataType<float>(),
           (g == 1) ? mkldnn::memory::format::oihw : mkldnn::memory::format::goihw);
 
       /* create memory descriptor for convolution without specified format
        * ('any') which lets a primitive (convolution in this case) choose
        * the memory format preferred for best performance
        */
       std::string data_format = ctx.Attr<std::string>("data_format");
       auto chosen_memory_format =
           platform::data_format_to_memory_format(data_format);
 
       auto bias_tz = paddle::framework::vectorize2int(bias->dims());
 
       auto src_md = platform::MKLDNNMemDesc(
           src_tz, memory::data_type::u8, chosen_memory_format);
       auto weights_md = platform::MKLDNNMemDesc(
           weights_tz, memory::data_type::s8, chosen_memory_format);
 
       auto dst_dt = fuse_relu ?
           paddle::framework::ToMKLDNNDataType(std::type_index(typeid(unsigned char)))
           : paddle::framework::ToMKLDNNDataType(std::type_index(typeid(signed char)));
 
       if(fuse_residual_conn){
         auto residual = ctx.Input<Tensor>("ResidualData");
         auto residual_dt = paddle::framework::ToMKLDNNDataType(residual->type());
         if(dst_dt != residual_dt)
           dst_dt = residual_dt;
       }
       auto dst_md = platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format);
 
       // create a conv primitive descriptor and save it for usage in backward
       std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
 
       if (bias) {
         auto bias_md = platform::MKLDNNMemDesc(
             bias_tz, memory::data_type::s32, memory::format::x);
         conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
                                        strides, paddings, mkldnn_engine,
                                        fuse_relu, fuse_residual_conn,
                                        output_shift_scale, sum_scale[0], is_test);
       } else {
         conv_pd =
             ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
                                  mkldnn_engine, fuse_relu, fuse_residual_conn,
                                  output_shift_scale, sum_scale[0], is_test);
       }
       // Save conv_pd/src_memory/weights_memory for backward pass
       dev_ctx.SetBlob(key_conv_pd, conv_pd);
 
       ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
 
       // create mkldnn memory from input tensors (data/weights)
       auto user_src_memory_p =
           handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
       auto user_weights_memory_p = handler.AcquireWeightsMemory(
           user_weights_md, to_void_cast<float>(filter_data));
 
       // create reorder primitive if the input format is not the preferred one
-      auto src_memory_p =
+      src_memory_p =
           handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
 
       std::shared_ptr<mkldnn::memory> weights_memory_p;
       int mask_reorder = is_multi_channel ? ((g != 1) ? (1<<1) + (1<<0) : 1<<0) : 0;
       weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
           user_weights_memory_p, pipeline, is_test, is_INT8, scale_weights_data, mask_reorder);
 
-      std::shared_ptr<mkldnn::memory> dst_memory_p;
-      bool need_s8_to_u8 = false;
       if(fuse_residual_conn) {
         auto residual_param = ctx.Input<Tensor>("ResidualData");
         PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
                           "Output and elementwise parameter need to have the "
                           "same dimension sizes");
         auto residual_dt = paddle::framework::ToMKLDNNDataType(residual_param->type());
         PADDLE_ENFORCE_EQ(residual_param->format(), handler.GetDstFormat(),
                           "Conv input dimension and filter dimension should be the same.");
         output->ShareDataWith(*residual_param);
         if(residual_dt == mkldnn::memory::data_type::u8){
           uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace());
           dst_memory_p =
               handler.AcquireDstMemoryFromPrimitive(to_void_cast<uint8_t>(output_data));
         } else{
           int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace());
           dst_memory_p =
               handler.AcquireDstMemoryFromPrimitive(to_void_cast<int8_t>(output_data));
-          if(fuse_relu)
-            need_s8_to_u8 = true;
         }
       } else {
         if(fuse_relu){
           uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler.GetDstMemorySize());
           dst_memory_p =
               handler.AcquireDstMemoryFromPrimitive(to_void_cast<uint8_t>(output_data));
         } else{
           int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, handler.GetDstMemorySize());
           dst_memory_p =
               handler.AcquireDstMemoryFromPrimitive(to_void_cast<int8_t>(output_data));
         }
       }
 
       // create convolution op primitive
-      std::shared_ptr<mkldnn::convolution_forward> conv_p;
       std::vector<float> scale_bias_data;
       auto scale_bias_key = key + "@scale_bias";
       if (bias) {
         const float* bias_data = bias->data<float>();
         auto user_bias_md = platform::MKLDNNMemDesc(
             {bias_tz}, platform::MKLDNNGetDataType<float>(), memory::format::x);
         auto user_bias_memory_p =
             handler.AcquireBiasMemory(user_bias_md, to_void_cast<float>(bias_data));
         std::shared_ptr<mkldnn::memory> bias_memory_p;
         int mask_reorder = is_multi_channel ? 1<<0 : 1;
         if(!scale_reuse){
           int count = is_multi_channel ? (g > 1 ? weights_tz[1] * weights_tz[0] : weights_tz[0]) : 1;
           scale_bias_data.resize(count);
           #pragma omp parallel for if (count > 1)
           for(int i = 0; i < count; i++){
             scale_bias_data[i] = scale_in_data[0] * scale_weights_data[i];
           }
           SetScaleMap(scale_map, scale_bias_key, scale_bias_data);
         } else{
           scale_bias_data = GetScaleMap(scale_map, scale_bias_key);
         }
         bias_memory_p =
             handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline, is_test, is_INT8, scale_bias_data, mask_reorder);
         conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
                                             bias_memory_p, dst_memory_p);
       } else {
         conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
                                             dst_memory_p);
       }
 
       // push primitive to stream and wait until it's executed
       pipeline.push_back(*conv_p);
       stream(stream::kind::eager).submit(pipeline).wait();
 
       if(need_s8_to_u8){
         output->mutable_data<uint8_t>(ctx.GetPlace());
       }
 
       output->set_layout(DataLayout::kMKLDNN);
       output->set_format(GetMKLDNNFormat(*dst_memory_p));
+      } else {
+        pipeline.push_back(*conv_p);
+        stream(stream::kind::eager).submit(pipeline).wait();
+
+        if (need_s8_to_u8) {
+          output->mutable_data<uint8_t>(ctx.GetPlace());
+        }
+
+        output->set_layout(DataLayout::kMKLDNN);
+        output->set_format(GetMKLDNNFormat(*dst_memory_p));
+      }
     }
   }
...