Commit e214530d, authored by xiaolil1

modify for int8 running

Parent 5ac1c95c
@@ -14,6 +14,7 @@
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/framework/data_layout_transform.h"

namespace paddle {
namespace operators {
@@ -274,7 +275,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                   "It must use CPUPlace.");
    std::cout<<"this is conv kernel op....................."<<std::endl;
    const bool is_test = ctx.Attr<bool>("is_test");

    auto& dev_ctx =
@@ -286,11 +287,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
    auto* output = ctx.Output<Tensor>("Output");
    auto* scale_in = ctx.HasInput("Scale_in") ? ctx.Input<Tensor>("Scale_in") : nullptr;
    auto* scale_in_eltwise = ctx.HasInput("Scale_in_eltwise")? ctx.Input<Tensor>("Scale_in_eltwise") : nullptr;
    auto* scale_weights = ctx.HasInput("Scale_weights")? ctx.Input<Tensor>("Scale_weights") : nullptr;
    auto* scale_out = ctx.HasInput("Scale_out")? ctx.Input<Tensor>("Scale_out") : nullptr;
    bool is_INT8 = ctx.HasInput("Scale_in")? true : false;
    bool is_multi_channel = (is_INT8 && scale_weights->memory_size() > 1) ? true : false;
    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
@@ -318,13 +320,15 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
    int groups = ctx.Attr<int>("groups");
    std::cout<<"fuse_relu = "<<fuse_relu<<" fuse_residual_conn = "<<fuse_residual_conn<<std::endl;

    // TODO(tpatejko): add support for dilation
    PADDLE_ENFORCE(
        dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
        "dilation in convolution is not implemented yet");

    const T* input_data = input->data<T>();
    const float* filter_data = filter->data<float>();

    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
    std::vector<int> weights_tz =
@@ -344,17 +348,17 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    }
    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());

    std::vector<float> output_shift_scale;
    float sum_scale = 1.0f;
    if(is_INT8){
      std::cout<<"this is conv int8 op .............."<<std::endl;
      int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1;
      float scale_in_data = *(scale_in->data<float>());
      std::vector<float> scale_weights_data(count);
      for(int i=0; i<count; i++){
        scale_weights_data[i] = *(scale_weights->data<float>() + i);
      }
      float scale_out_data = *(scale_out->data<float>());

      output_shift_scale.resize(count);
      for(int i=0; i<count; i++){
@@ -363,9 +367,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
        else
          output_shift_scale[i] = scale_out_data / (scale_in_data * scale_weights_data[i]);
      }
      if(fuse_residual_conn){
        float scale_in_eltwise_data = *(scale_in_eltwise->data<float>());
        sum_scale = scale_out_data / scale_in_eltwise_data;
      }
    }
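Note: the requantization arithmetic in the block above is self-contained; the standalone sketch below (not part of this commit, with made-up scale values) shows what output_shift_scale and sum_scale compute for the single-scale case.

// Standalone sketch of the scale arithmetic above (illustrative only; the
// concrete scale values are made-up examples, not taken from the commit).
#include <iostream>

int main() {
  float scale_in_data = 64.0f;          // quantization scale of the input (example)
  float scale_weights_data = 127.0f;    // quantization scale of the weights (example)
  float scale_out_data = 32.0f;         // requested scale of the INT8 output (example)
  float scale_in_eltwise_data = 48.0f;  // scale of the fused residual input (example)

  // Requantization factor: converts the s32 accumulator, which carries
  // scale_in * scale_weights, into the requested output scale.
  float output_shift_scale = scale_out_data / (scale_in_data * scale_weights_data);

  // With a fused residual connection the existing output has to be brought
  // from its own scale to the output scale before the sum post-op adds it.
  float sum_scale = scale_out_data / scale_in_eltwise_data;

  std::cout << output_shift_scale << " " << sum_scale << std::endl;
  return 0;
}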
    // Get unique name for storing MKLDNN primitives
    const std::string key = ConvMKLDNNHandler::GetHash(
@@ -373,13 +379,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
        ctx.op().Output("Output"));
    const std::string key_conv_pd = key + "@conv_pd";
    std::cout<<key_conv_pd<<std::endl;

    std::vector<primitive> pipeline;

    auto user_src_md = platform::MKLDNNMemDesc(
        {src_tz}, paddle::framework::ToMKLDNNDataType(input->type()), input->format());
    auto user_weights_md = platform::MKLDNNMemDesc(
        {weights_tz}, platform::MKLDNNGetDataType<float>(),
        (g == 1) ? mkldnn::memory::format::oihw : mkldnn::memory::format::goihw);

    /* create memory descriptor for convolution without specified format
     * ('any') which lets a primitive (convolution in this case) choose
@@ -390,21 +397,37 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
        platform::data_format_to_memory_format(data_format);

    auto src_md = platform::MKLDNNMemDesc(
        src_tz, platform::MKLDNNGetDataType<float>(), chosen_memory_format);
    auto weights_md = platform::MKLDNNMemDesc(
        weights_tz, platform::MKLDNNGetDataType<float>(),
        (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw);
    auto dst_md = platform::MKLDNNMemDesc(
        dst_tz, platform::MKLDNNGetDataType<float>(), chosen_memory_format);
    std::vector<int> bias_tz;

    if(is_INT8){
      src_md = platform::MKLDNNMemDesc(
          src_tz, memory::data_type::u8, chosen_memory_format);
      weights_md = platform::MKLDNNMemDesc(
          weights_tz, memory::data_type::s8,
          (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw);
      dst_md = platform::MKLDNNMemDesc(
          dst_tz,
          fuse_relu? paddle::framework::ToMKLDNNDataType(std::type_index(typeid(unsigned char))) :
                     paddle::framework::ToMKLDNNDataType(std::type_index(typeid(char))),
          chosen_memory_format);
    }
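Note: the INT8 branch above, together with the s32 bias descriptor chosen a few lines below, amounts to the following data-type choice; this is an illustrative sketch with a plain enum standing in for mkldnn::memory::data_type, not code from the commit.

// Illustrative restatement (not part of the commit) of the INT8 type selection.
enum class DType { f32, s32, s8, u8 };

struct ConvInt8Types {
  DType src, weights, bias, dst;
};

ConvInt8Types ChooseInt8Types(bool fuse_relu) {
  // Activations are quantized to unsigned 8-bit, weights to signed 8-bit, and
  // the bias is widened to s32 so it can be added to the s32 accumulator.
  // The destination is unsigned only when a fused ReLU guarantees the output
  // is non-negative; otherwise it stays signed.
  return {DType::u8, DType::s8, DType::s32, fuse_relu ? DType::u8 : DType::s8};
}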
    // create a conv primitive descriptor and save it for usage in backward
    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
    if (bias) {
      bias_tz = paddle::framework::vectorize2int(bias->dims());
      auto bias_md = platform::MKLDNNMemDesc(
          bias_tz, platform::MKLDNNGetDataType<float>(), memory::format::x);
      if(is_INT8){
        bias_md = platform::MKLDNNMemDesc(
            bias_tz, memory::data_type::s32, memory::format::x);
      }
      if(is_INT8){
        conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
                                       strides, paddings, mkldnn_engine,
@@ -436,10 +459,60 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    auto user_src_memory_p =
        handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
    auto user_weights_memory_p = handler.AcquireWeightsMemory(
        user_weights_md, to_void_cast<float>(filter_data));

    // create reorder primitive if the input format is not the preferred one
    auto src_memory_p =
        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
    std::shared_ptr<mkldnn::memory> weights_memory_p;// = handler.AcquireWeightsMemoryFromPrimitive(
        //user_weights_memory_p, pipeline, is_test);
    if(is_INT8){
      int mask_reorder = is_multi_channel? 0 : ((g!= 1) ? (1<<1)+(1<<0) : 1<<0);
      int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1;
      std::vector<float> scale_weights_data(count);
      for(int i=0; i<count; i++){
        scale_weights_data[i] = *(scale_weights->data<float>() + i);
      }
      weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
          user_weights_memory_p, pipeline, is_test, is_INT8, scale_weights_data, mask_reorder);
    } else{
      weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
          user_weights_memory_p, pipeline, is_test);
    }
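Note: the count expression used for the weight scales can be read on its own; the small restatement below is illustrative only and not part of the commit.

// Illustrative restatement of the `count` logic above.
#include <vector>

int WeightScaleCount(bool is_multi_channel, int groups,
                     const std::vector<int>& weights_tz) {
  if (!is_multi_channel) return 1;  // one common scale for the whole tensor
  // For grouped weights the layout is goihw, i.e. weights_tz = {g, oc/g, ic/g,
  // kh, kw}, so weights_tz[1] * weights_tz[0] is the total number of output
  // channels; otherwise weights_tz[0] already is the output-channel count.
  return groups > 1 ? weights_tz[1] * weights_tz[0] : weights_tz[0];
}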
    T* output_data = nullptr;
    std::shared_ptr<mkldnn::memory> dst_memory_p;
    if(is_INT8){
      if (fuse_residual_conn) {
        auto residual_param = ctx.Input<Tensor>("ResidualData");
        PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
                          "Output and elementwise parameter need to have the "
                          "same dimension sizes");
        output->ShareDataWith(*residual_param);
        if(fuse_relu){
          uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace());
          dst_memory_p =
              handler.AcquireDstMemoryFromPrimitive(to_void_cast<uint8_t>(output_data));
        } else{
          int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace());
          dst_memory_p =
              handler.AcquireDstMemoryFromPrimitive(to_void_cast<int8_t>(output_data));
        }
      } else {
        if(fuse_relu){
          uint8_t* output_data = output->mutable_data<uint8_t>(ctx.GetPlace(), handler.GetDstMemorySize());
          dst_memory_p =
              handler.AcquireDstMemoryFromPrimitive(to_void_cast<uint8_t>(output_data));
        } else{
          int8_t* output_data = output->mutable_data<int8_t>(ctx.GetPlace(), handler.GetDstMemorySize());
          dst_memory_p =
              handler.AcquireDstMemoryFromPrimitive(to_void_cast<int8_t>(output_data));
        }
      }
      std::cout<<"input fmt = "<<input->format()<<" output fmt = "<<output->format()<<" dst fmt = "<<dst_memory_p->get_primitive_desc().desc().data.format<<std::endl;
    } else{
      T* output_data = nullptr;
      if (fuse_residual_conn) {
        auto residual_param = ctx.Input<Tensor>("ResidualData");
        auto residual_param_data = residual_param->data<T>();
@@ -457,44 +530,32 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
        output_data =
            output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
      }
      dst_memory_p =
          handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
    }
    // create convolution op primitive
    std::shared_ptr<mkldnn::convolution_forward> conv_p;
    if (bias) {
      const float* bias_data = bias->data<float>();
      auto user_bias_md = platform::MKLDNNMemDesc(
          {bias_tz}, platform::MKLDNNGetDataType<float>(), memory::format::x);
      auto user_bias_memory_p =
          handler.AcquireBiasMemory(user_bias_md, to_void_cast<float>(bias_data));
      std::shared_ptr<mkldnn::memory> bias_memory_p;// =
          //handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline);
      if(is_INT8){
        int mask_reorder = is_multi_channel? 0 : 1<<0;
        int count = is_multi_channel? (g>1? weights_tz[1]*weights_tz[0] : weights_tz[0]) : 1;
        std::vector<float> scale_bias_data(count);
        for(int i=0; i<count; i++){
          scale_bias_data[i] = (*scale_in->data<float>()) * (*(scale_weights->data<float>() + i));
        }
        bias_memory_p =
            handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline, is_INT8, scale_bias_data, mask_reorder);
      } else{
        bias_memory_p =
            handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline);
      }
      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
                                          bias_memory_p, dst_memory_p);
@@ -503,17 +564,21 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                                          dst_memory_p);
    }

    // push primitive to stream and wait until it's executed
    pipeline.push_back(*conv_p);
    stream(stream::kind::eager).submit(pipeline).wait();

    output->set_layout(DataLayout::kMKLDNN);
    output->set_format(GetMKLDNNFormat(*dst_memory_p));
    std::cout<<"input fmt = "<<input->format()<<" output fmt = "<<output->format()<<" dst fmt = "<<dst_memory_p->get_primitive_desc().desc().data.format<<" output dt = "<<paddle::framework::ToMKLDNNDataType(output->type())<<" dst dt = "<<dst_memory_p->get_primitive_desc().desc().data.data_type<<std::endl;
    std::cout<<"this is conv end!!!!!!!!!!!!!!!!!!!!"<<std::endl;
  }
 private:
  mkldnn::primitive_attr CreatePostOps(bool fuse_relu, bool fuse_residual_conn,
                                       const std::vector<float> output_shift_scale, float sum_scale) const {
    mkldnn::primitive_attr conv_attr;
    mkldnn::post_ops post_operations;
    // Fusion with Elementwise layer relies on adding a sum post-operation with
@@ -568,7 +633,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                     const std::vector<int>& paddings,
                     const mkldnn::engine& engine, const bool fuse_relu,
                     const bool fuse_residual_conn,
                     const std::vector<float> output_shift_scale, const float sum_scale) const {
    memory::dims stride_dims = {strides[0], strides[1]};
    memory::dims padding_dims = {paddings[0], paddings[1]};
@@ -617,7 +682,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                     const std::vector<int>& paddings,
                     const mkldnn::engine& engine, const bool fuse_relu,
                     const bool fuse_residual_conn,
                     const std::vector<float> output_shift_scale, const float sum_scale) const {
    memory::dims stride_dims = {strides[0], strides[1]};
    memory::dims padding_dims = {paddings[0], paddings[1]};
@@ -841,7 +906,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
namespace ops = paddle::operators;

REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::ConvMKLDNNOpKernel<float>,
                   ops::ConvMKLDNNOpKernel<uint8_t>);

REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::ConvMKLDNNGradOpKernel<float>);
@@ -94,10 +94,10 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
  auto input_data_type =
      framework::ToDataType(ctx.Input<Tensor>("Input")->type());
  //auto filter_data_type =
  //    framework::ToDataType(ctx.Input<Tensor>("Filter")->type());
  //PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
  //                  "input and filter data type should be consistent");
  if (input_data_type == framework::proto::VarType::FP16) {
    PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
...
@@ -40,15 +40,15 @@ class DeQuantOpKernel : public framework::OpKernel<T> {
    auto* input = ctx.Input<Tensor>("Input");
    auto* scale = ctx.Input<Tensor>("Scale");
    auto* output = ctx.Output<Tensor>("Output");
    std::cout<<"this is dequant op ***********"<<std::endl;
    auto& dev_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const auto& engine = dev_ctx.GetEngine();

    const T* input_data = input->data<T>();
    float* output_data = output->mutable_data<float>(ctx.GetPlace());
    //T scale_data = *(scale->data<T>());
    std::vector<float> scale_data = {*(scale->data<float>())};
    std::vector<primitive> pipeline;

    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
@@ -69,7 +69,7 @@ class DeQuantOpKernel : public framework::OpKernel<T> {
    auto dst_md = platform::MKLDNNMemDesc(
        {dst_tz}, memory::data_type::f32, memory::format::nchw);
    auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
    auto dst_memory = mkldnn::memory(dst_pd, to_void_cast<float>(output_data));

    auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
        new reorder::primitive_desc(dst_pd, src_pd, attri));
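Note: the reorder built above is what actually performs the dequantization; numerically it behaves roughly like the sketch below. This is illustrative only, and treating the value derived from the Scale input as a plain multiplier is an assumption, not something shown in this kernel.

// Standalone sketch of int8 -> float dequantization (not this operator's code).
#include <cstdint>
#include <vector>

std::vector<float> Dequantize(const std::vector<int8_t>& quantized,
                              float multiplier) {
  // Each quantized value is mapped back to float by a per-tensor multiplier
  // derived from the "Scale" input of the dequantize op (assumed here).
  std::vector<float> result(quantized.size());
  for (size_t i = 0; i < quantized.size(); ++i) {
    result[i] = static_cast<float>(quantized[i]) * multiplier;
  }
  return result;
}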
@@ -112,5 +112,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(dequantize, ops::DeQuantOp, ops::DeQuantOpMaker, paddle::framework::DefaultGradOpDescMaker<true>);

REGISTER_OP_KERNEL(dequantize, MKLDNN, ::paddle::platform::CPUPlace, ops::DeQuantOpKernel<uint8_t>);
@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/pool_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/framework/data_layout_transform.h"

namespace paddle {
namespace operators {
@@ -71,7 +72,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                   "It must use CPUPlace.");
    std::cout<<"this is pool op"<<std::endl;
    auto& dev_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();
@@ -129,14 +130,19 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
      CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides,
                        padding_right_bottom);
    }
    mkldnn::memory::data_type dt = paddle::framework::ToMKLDNNDataType(input->type());
    std::cout<<"input type = "<<dt<<std::endl;

    auto src_md = platform::MKLDNNMemDesc(
        src_tz, dt, input_format);

    /* create memory descriptor for pooling without specified format
     * ('any') which lets a primitive (pooling in this case) choose
     * the memory format preferred for best performance
     */
    auto dst_md = platform::MKLDNNMemDesc(dst_tz, dt,
                                          mkldnn::memory::format::any);

    std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
@@ -399,6 +405,9 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
namespace ops = paddle::operators;

REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::PoolMKLDNNOpKernel<float>,
                   ops::PoolMKLDNNOpKernel<int8_t>,
                   ops::PoolMKLDNNOpKernel<uint8_t>);

REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::PoolMKLDNNGradOpKernel<float>);
@@ -37,7 +37,7 @@ class QuantOpKernel : public framework::OpKernel<T> {
    auto* input = ctx.Input<Tensor>("Input");
    auto* scale = ctx.Input<Tensor>("Scale");
    auto* output = ctx.Output<Tensor>("Output");
    std::cout<<"this is quantize op!!!!!!!!!!!!!!"<<std::endl;
    auto& dev_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const auto& engine = dev_ctx.GetEngine();
@@ -68,7 +68,12 @@ class QuantOpKernel : public framework::OpKernel<T> {
    auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
        new reorder::primitive_desc(dst_pd, src_pd, attri));
    auto reorder_p = std::shared_ptr<reorder>(new reorder(*reorder_pd, *src_memory_p, dst_memory));

    pipeline.push_back(*reorder_p);
    stream(stream::kind::eager).submit(pipeline).wait();

    output->set_layout(DataLayout::kMKLDNN);
    output->set_format(GetMKLDNNFormat(dst_memory));
  }
};
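Note: the reorder submitted above performs the quantization itself; the rough standalone sketch below is illustrative only, and the round-and-saturate behaviour is assumed here rather than taken from this kernel.

// Standalone sketch of float -> uint8 quantization (not this operator's code).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<uint8_t> Quantize(const std::vector<float>& input, float scale) {
  // Scale each value, round it, and saturate it into the unsigned 8-bit range.
  std::vector<uint8_t> result(input.size());
  for (size_t i = 0; i < input.size(); ++i) {
    float v = std::round(input[i] * scale);
    result[i] = static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, v)));
  }
  return result;
}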
...
@@ -70,6 +70,7 @@ inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector<int>& dims,
                                          mkldnn::memory::data_type data_type,
                                          mkldnn::memory::format format) {
  mkldnn::memory::dims tz = dims;
  std::cout<<"this is MKLDNNMemDesc"<<" data_type"<<data_type<<" format"<<format<<std::endl;
  return mkldnn::memory::desc({tz}, data_type, format);
}
@@ -153,6 +154,7 @@ class MKLDNNHandler {
        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
                   "Fail to find mem primitive in device context");
    //mem_p = nullptr;
    if (mem_p == nullptr) {
      mem_p = std::make_shared<mkldnn::memory>(mdp, ptr);
      dev_ctx_.SetBlob(local_key, mem_p);
...