Commit 61921084 authored by Michał Gallus, committed by Tao Luo

[DNNL] 3D Fully-Connected (#21746)

Parent c1fea3e3
@@ -92,14 +92,15 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
   // This is to add padding for dimension 128 on concern of MKL performance
   auto* scope = param_scope();
   auto* weight = scope->FindVar(w->Name())->GetMutable<LoDTensor>();
-  auto place = weight->place();
-  bool use_gpu = Get<bool>("use_gpu");
   auto* weight_data = weight->data<float>();
   auto weight_dims = weight->dims();
   int weight_num = product(weight_dims);
   int w_h = weight_dims[0];
   int w_w = weight_dims[1];
-  if (!use_gpu) {
+  bool use_gpu = Has("use_gpu") ? Get<bool>("use_gpu") : false;
+  bool use_fc_padding =
+      Has("use_fc_padding") ? Get<bool>("use_fc_padding") : true;
+  if (!use_gpu && use_fc_padding) {
     if (w_h % 128 == 0 && w_w % 128 == 0) {
       auto* weight_data_tmp = new float[weight_num];
       for (int i = 0; i < w_h; i++) {
......
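For context on the guard above: when both weight dimensions are multiples of 128, FCFusePass copies the weights into a padded buffer, since power-of-two leading dimensions are reported to hurt MKL GEMM performance. A minimal standalone sketch of the idea (the padding width of 4 floats is an assumption for illustration, not taken from this diff):

#include <cstring>

// Copy a row-major [w_h x w_w] weight matrix into a buffer whose rows are
// padded with `pad` zero floats, so the leading dimension stops being a
// multiple of 128.
void PadWeightRows(const float* src, float* dst, int w_h, int w_w, int pad) {
  for (int i = 0; i < w_h; ++i) {
    std::memcpy(dst + i * (w_w + pad), src + i * w_w, w_w * sizeof(float));
    std::memset(dst + i * (w_w + pad) + w_w, 0, pad * sizeof(float));
  }
}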
@@ -158,11 +158,47 @@ void IRPassManager::CreatePasses(Argument *argument,
   }
 }
 
+bool IRPassManager::HasPass(const std::string &pass_type) {
+  if (passes_.empty()) return false;
+  auto it = std::find_if(
+      passes_.begin(), passes_.end(),
+      [&](std::unique_ptr<Pass> &pass) { return pass->Type() == pass_type; });
+  return it != passes_.end();
+}
+
+std::unique_ptr<Pass> &IRPassManager::GetPass(const std::string &pass_type) {
+  PADDLE_ENFORCE_EQ(passes_.empty(), false,
+                    platform::errors::PreconditionNotMet(
+                        "The list of passes cannot be empty."));
+  auto it = std::find_if(passes_.begin(), passes_.end(),
+                         [&](const std::unique_ptr<Pass> &pass) {
+                           return pass->Type() == pass_type;
+                         });
+  PADDLE_ENFORCE_NE(it, passes_.end(),
+                    platform::errors::PermissionDenied(
+                        "You cannot get a pass that was not added earlier."));
+  return *it;
+}
+
+// Some passes depend on each other. This method serves for exchanging
+// information between them.
+void IRPassManager::UpdatePasses() {
+  // Update padding settings for fc_fuse_pass. Skip adding padding for
+  // MKL-DNN-based FC.
+  bool use_fc_padding = !HasPass("fc_mkldnn_pass");
+  if (HasPass("fc_fuse_pass")) {
+    auto &fc_fuse_pass = GetPass("fc_fuse_pass");
+    fc_fuse_pass->Set<bool>("use_fc_padding", new bool(use_fc_padding));
+  }
+}
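Both sides of this hand-off use only the pass-attribute API already visible in this diff; a condensed sketch of the round trip:

// Producer (IRPassManager::UpdatePasses): disable padding whenever the
// MKL-DNN FC pass will run, since DNNL FC manages layouts itself.
fc_fuse_pass->Set<bool>("use_fc_padding", new bool(!HasPass("fc_mkldnn_pass")));

// Consumer (FCFusePass::ApplyFCPattern, first hunk above): fall back to
// padding when the attribute was never set.
bool use_fc_padding =
    Has("use_fc_padding") ? Get<bool>("use_fc_padding") : true;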
 std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
   if (passes_.empty()) {
     return graph;
   }
-  PADDLE_ENFORCE(graph.get());
+  PADDLE_ENFORCE_NOT_NULL(graph.get(), platform::errors::PreconditionNotMet(
+                                           "Graph cannot be NULL."));
+  UpdatePasses();
   // Apply all the passes
   for (const auto &pass : passes_) {
     if (pass->Type() != "graph_viz_pass" && !disable_logs_) {
......
@@ -39,6 +39,7 @@ namespace inference {
 namespace analysis {
 
 using framework::ProgramDesc;
 using framework::ir::Graph;
+using framework::ir::Pass;
 
 class IRPassManager final {
  public:
@@ -53,9 +54,12 @@ class IRPassManager final {
  private:
   void CreatePasses(Argument *argument, const std::vector<std::string> &passes);
+  bool HasPass(const std::string &pass_type);
+  std::unique_ptr<Pass> &GetPass(const std::string &pass_type);
+  void UpdatePasses();
 
   std::unique_ptr<Graph> graph_;
-  std::vector<std::unique_ptr<framework::ir::Pass>> passes_;
+  std::vector<std::unique_ptr<Pass>> passes_;
   bool disable_logs_{false};
 };
......
@@ -52,26 +52,56 @@ class FCPrimitiveFactory {
       UpdateDataPointers(ctx, output, input);
       this->Execute();
       return;
-    }
+    }  // Otherwise, create a new one.
 
-    auto src_desc = CreateMemDescriptor<T_in>(input, input->format());
-    input_ = CreateMemory<T_in>(src_desc, input);
-
-    // Since MKL-DNN doesn't support 4D column-major data formats in
-    // inner_product primitive, transpose the weights to be in
-    // row-major format
+    // Transform weights to default MKL-DNN format
     weights_ = TransposeWeights(weights);
-    if (src_desc.data.ndims == 4) {
-      weights_ = CreateFourDimWeightsMemory(input, weights);
+
+    // Since MKL-DNN has a lot of limitations on what the input/weights/output
+    // dimensions should be, to simplify the code, the creation of primitive
+    // descriptor has been divided into separate cases, based on the number
+    // of input dimensions.
+    size_t input_dim_num = input->dims().size();
+    boost::optional<mkldnn::inner_product_forward::primitive_desc> fc_prim_desc;
+    memory::desc usr_weights_desc = {};
+    switch (input_dim_num) {
+      case 2:
+        fc_prim_desc =
+            Create2DFcPrimDescriptor(input, weights, bias, output, ctx);
+        usr_weights_desc = Create2DUserWeightsDesc();
+        break;
+      case 3:
+        fc_prim_desc =
+            Create3DFcPrimDescriptor(input, weights, bias, output, ctx);
+        usr_weights_desc = Create3DUserWeightsDesc(weights);
+        break;
+      case 4:
+        fc_prim_desc =
+            Create4DFcPrimDescriptor(input, weights, bias, output, ctx);
+        usr_weights_desc = Create4DUserWeightsDesc(input, weights);
+        break;
+      default:
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "DNNL FC doesn't support input dims other than 2, 3 and 4."));
+        break;
     }
 
-    // If int8 data type is desired, weights are quantized to signed int8
-    QuantizeWeights(ctx);
+    input_ = CreateMemory<T_in>(fc_prim_desc->src_desc(), input);
+    // Update weights format inside of its memory
+    weights_ = Reorder(usr_weights_desc, usr_weights_desc,
+                       weights_->get_data_handle());
 
-    // Choose MKLDNNMemoryFormat::any so that MKL-DNN can determine itself what
-    // is the best format for output during the creation of inner product
-    // primitive descriptor
-    auto dst_desc = CreateMemDescriptor<T_out>(output, MKLDNNMemoryFormat::any);
+    // Quantize weights and reorder to format chosen by FC primitive descriptor.
+    QuantizeWeights(ctx, fc_prim_desc->weights_desc());
 
-    fc_ = CreateFcPrimitive(*input_, *weights_, dst_desc, bias, output, ctx);
+    bias_ = CreateMemory<float>(fc_prim_desc->bias_desc(), bias);
+    // If int8 is desired, quantize bias into 32-bit signed int
+    QuantizeBias(*fc_prim_desc, ctx);
+
+    // Based on format determined by inner_product, create output in desired
+    // memory format
+    output_ = CreateDstMemory(*fc_prim_desc, ctx, output);
+
+    // Return MKL-DNN primitive ready to be fed into pipeline and executed
+    fc_ = inner_product_forward(*fc_prim_desc);
 
     this->Execute();
   }
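To make the dispatch concrete, a few illustrative input shapes (values assumed) and the case each one selects:

// {batch, in_features}           e.g. {32, 768}        -> case 2
// {batch, seq_len, hidden}       e.g. {8, 16, 768}     -> case 3
// {batch, channels, h, w}        e.g. {1, 3, 224, 224} -> case 4
// any other rank (1D, 5D, ...)   -> PADDLE_THROW(Unimplemented)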
@@ -99,26 +129,99 @@ class FCPrimitiveFactory {
     // variable, update its format to what has been determined in first
     // call to CreateFcPrimitive method.
     if (out->format() == MKLDNNMemoryFormat::undef) {
-      auto output_format = platform::GetMKLDNNFormat(*output_);
-      out->set_format((MKLDNNMemoryFormat)output_format);
+      MKLDNNMemoryFormat format;
+      auto data_type = input_->get_desc().data.data_type;
+      if (data_type == mkldnn_f32)
+        format = MKLDNNMemoryFormat::nchw;
+      else
+        format = MKLDNNMemoryFormat::nhwc;
+
+      MKLDNNMemoryFormat selected = platform::MKLDNNFormatForSize(
+          framework::vectorize<int>(out->dims()).size(), format);
+
+      out->set_format(selected);
     }
   }
-  // Choose weight memory format based on input memory format
-  MKLDNNMemoryFormat MatchWeightFormat(MKLDNNMemoryFormat fmt) {
-    using format = MKLDNNMemoryFormat;
-    switch (fmt) {
-      case format::nChw16c:
-        return format::aBcd16b;
-      case format::nChw8c:
-        return format::aBcd8b;
-      case format::nchw:
-        return format::oihw;
-      case format::nhwc:
-        return format::hwio;
-      default:
-        return format::undef;
-    }
-  }
+  mkldnn::inner_product_forward::primitive_desc Create2DFcPrimDescriptor(
+      const LoDTensor* input, const Tensor* weights, const Tensor* bias,
+      LoDTensor* output, const ExecutionContext& ctx) {
+    auto src_desc = CreateMemDescriptor<T_in>(input, input->format());
+    auto weight_dims = Get2DWeightDimsForDNNL(weights);
+    auto weights_desc =
+        CreateMemDescriptor<T_w>(weight_dims, MKLDNNMemoryFormat::any);
+    auto bias_desc = CreateMemDescriptor<float>(bias, MKLDNNMemoryFormat::x);
+    auto dst_desc = CreateMemDescriptor<T_out>(output, MKLDNNMemoryFormat::any);
+    const auto attrs = CreatePostOps(ctx);
+    return CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc, attrs);
+  }
+
+  std::vector<int64_t> Get2DWeightDimsForDNNL(const Tensor* weights) {
+    auto dims = framework::vectorize(weights->dims());
+    std::swap(dims[0], dims[1]);  // swap input dim with output dim
+    return dims;
+  }
+
+  memory::desc Create2DUserWeightsDesc() { return weights_->get_desc(); }
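A one-line worked example of the swap above, with assumed values: Paddle stores FC weights as {input_dim, output_dim}, while DNNL's inner_product expects {output_dim, input_dim}:

std::vector<int64_t> dims = {768, 3072};  // Paddle order: {in, out}
std::swap(dims[0], dims[1]);              // DNNL order:   {3072, 768}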
+  mkldnn::inner_product_forward::primitive_desc Create3DFcPrimDescriptor(
+      const LoDTensor* input, const Tensor* weights, const Tensor* bias,
+      LoDTensor* output, const ExecutionContext& ctx) {
+    auto input_dims = framework::vectorize(input->dims());
+    std::vector<int64_t> new_input_dims = {input_dims[0] * input_dims[1], 1,
+                                           input_dims[2]};
+    auto src_desc = CreateMemDescriptor<T_in>(new_input_dims, input->format());
+
+    auto weight_dims = Get3DWeightDimsForDNNL(weights);
+    auto weights_desc =
+        CreateMemDescriptor<T_w>(weight_dims, MKLDNNMemoryFormat::any);
+
+    auto bias_desc = CreateMemDescriptor<float>(bias, MKLDNNMemoryFormat::x);
+
+    auto dst_dims = {input_dims[0] * input_dims[1], weight_dims[0]};
+    auto dst_desc =
+        CreateMemDescriptor<T_out>(dst_dims, MKLDNNMemoryFormat::any);
+    const auto attrs = CreatePostOps(ctx);
+    return CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc, attrs);
+  }
+
+  std::vector<int64_t> Get3DWeightDimsForDNNL(const Tensor* weights) {
+    auto paddle_w_dims = framework::vectorize(weights->dims());
+    return {paddle_w_dims[1], 1, paddle_w_dims[0]};
+  }
+
+  memory::desc Create3DUserWeightsDesc(const Tensor* weights) {
+    auto dims = Get3DWeightDimsForDNNL(weights);
+    return CreateMemDescriptor<float>(dims, MKLDNNMemoryFormat::oiw);
+  }
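A worked example of the 3D lowering performed by the helpers above, assuming a batch of 8 sequences of length 16, hidden size 768 and 3072 output units:

std::vector<int64_t> input_dims = {8, 16, 768};  // {batch, seq, hidden}
// Every token becomes one inner_product row: {8 * 16, 1, 768}
std::vector<int64_t> new_input_dims = {input_dims[0] * input_dims[1], 1,
                                       input_dims[2]};
std::vector<int64_t> paddle_w_dims = {768, 3072};  // {in, out}
// Weights move to DNNL's oiw layout: {3072, 1, 768}
std::vector<int64_t> dnnl_w_dims = {paddle_w_dims[1], 1, paddle_w_dims[0]};
// Output has one row per token: {128, 3072}
std::vector<int64_t> dst_dims = {new_input_dims[0], dnnl_w_dims[0]};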
+  mkldnn::inner_product_forward::primitive_desc Create4DFcPrimDescriptor(
+      const LoDTensor* input, const Tensor* weights, const Tensor* bias,
+      LoDTensor* output, const ExecutionContext& ctx) {
+    auto src_desc = CreateMemDescriptor<T_in>(input, input->format());
+    // Since MKL-DNN doesn't support 4D column-major data formats in
+    // inner_product primitive, transpose the weights to be in
+    // row-major format
+    auto dims = Get4DWeightDimsForDNNL(input, weights);
+    auto weights_desc = CreateMemDescriptor<T_w>(dims, MKLDNNMemoryFormat::any);
+    auto bias_desc = CreateMemDescriptor<float>(bias, MKLDNNMemoryFormat::x);
+    auto dst_desc = CreateMemDescriptor<T_out>(output, MKLDNNMemoryFormat::any);
+    const auto attrs = CreatePostOps(ctx);
+    return CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc, attrs);
+  }
+
+  std::vector<int64_t> Get4DWeightDimsForDNNL(const LoDTensor* input,
+                                              const Tensor* weights) {
+    auto old_w_dims = framework::vectorize(weights->dims());
+    auto old_in_dims = framework::vectorize(input->dims());
+    auto dims = {old_w_dims[1], old_in_dims[1], old_in_dims[2], old_in_dims[3]};
+    return dims;
+  }
+
+  memory::desc Create4DUserWeightsDesc(const LoDTensor* input,
+                                       const Tensor* weights) {
+    auto dims = Get4DWeightDimsForDNNL(input, weights);
+    return CreateMemDescriptor<float>(dims, MKLDNNMemoryFormat::oihw);
+  }
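Analogously, a worked example of Get4DWeightDimsForDNNL with assumed values: for an input of {N, C, H, W} = {1, 3, 32, 32}, Paddle stores the FC weights as {C*H*W, O}; with O = 64 the DNNL weight dims become {O, C, H, W}:

std::vector<int64_t> old_in_dims = {1, 3, 32, 32};    // {N, C, H, W}
std::vector<int64_t> old_w_dims = {3 * 32 * 32, 64};  // {C*H*W, O}
std::vector<int64_t> dims = {old_w_dims[1], old_in_dims[1], old_in_dims[2],
                             old_in_dims[3]};         // {64, 3, 32, 32}, oihw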
   // Convert data from one data format to another
......
@@ -247,12 +350,9 @@ class FCPrimitiveFactory {
     return is_multi_channel_quantizied ? 1 << slice_dimension : 0;
   }
 
-  void QuantizeWeights(const ExecutionContext& ctx) {
-    auto quantized_desc = weights_->get_desc();
-    quantized_desc.data.data_type =
-        (mkldnn_data_type_t)platform::MKLDNNGetDataType<T_w>();
-    weights_ = Reorder(*weights_, quantized_desc,
-                       ctx.Attr<std::vector<float>>("Scale_weights"));
+  void QuantizeWeights(const ExecutionContext& ctx, memory::desc dst) {
+    weights_ =
+        Reorder(*weights_, dst, ctx.Attr<std::vector<float>>("Scale_weights"));
   }
 
   void QuantizeBias(const inner_product_forward::primitive_desc& fc_prim_desc,
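The slimmed-down QuantizeWeights leans on the fact that a DNNL reorder can change data type and layout while applying scales in one shot. A minimal standalone sketch of that mechanism, assuming the stock DNNL 1.x C++ API rather than Paddle's Reorder helper; src_desc, dst_desc, weights_f32 and scale_weights are placeholders:

mkldnn::engine eng(mkldnn::engine::kind::cpu, 0);
mkldnn::stream strm(eng);

mkldnn::memory src_mem(src_desc, eng, weights_f32);  // user f32 weights
mkldnn::memory dst_mem(dst_desc, eng);  // e.g. s8, format picked by the fc pd

mkldnn::primitive_attr attr;
// Mask 0 applies one common scale; mask 1 << 0 applies one scale per entry
// of dimension 0 (per output channel), matching `1 << slice_dimension` above.
attr.set_output_scales(1 << 0, scale_weights);

auto reorder_pd = mkldnn::reorder::primitive_desc(src_mem, dst_mem, attr);
mkldnn::reorder(reorder_pd).execute(strm, src_mem, dst_mem);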
@@ -282,43 +382,6 @@ class FCPrimitiveFactory {
     return attributes;
   }
 
-  inner_product_forward CreateFcPrimitive(const memory& src_memory,
-                                          const memory& weights_memory,
-                                          const memory::desc& dst_desc,
-                                          const Tensor* bias, Tensor* output,
-                                          const ExecutionContext& ctx) {
-    // Acquire descriptors needed for creation of inner_product primitive
-    // descriptor
-    const auto weights_desc = weights_memory.get_desc();
-    const auto src_desc = src_memory.get_desc();
-    // Based on provided attributes, create attributes used by MKL-DNN to
-    // enable fused post-op activations such as 'relu'
-    const auto attrs = CreatePostOps(ctx);
-    // If bias exists, create inner_product primitive with or without bias
-    if (bias) {
-      auto bias_desc = CreateMemDescriptor<float>(bias, bias->format());
-      bias_ = CreateMemory<float>(bias_desc, bias);
-      // Create inner_product descriptor. At this point the format of output
-      // is determined.
-      auto fc_prim_desc =
-          CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc, attrs);
-      // If int8 is desired, quantize bias into 32-bit signed int
-      QuantizeBias(fc_prim_desc, ctx);
-      // Based on format determined by inner_product, create output in desired
-      // memory format
-      output_ = CreateDstMemory(fc_prim_desc, ctx, output);
-      // Return MKL-DNN primitive ready to be fed into pipeline and executed
-      return inner_product_forward(fc_prim_desc);
-    } else {
-      auto fc_prim_desc =
-          CreateFcPrimDesc(src_desc, weights_desc, dst_desc, attrs);
-      output_ = CreateDstMemory(fc_prim_desc, ctx, output);
-      return inner_product_forward(fc_prim_desc);
-    }
-  }
-
   mkldnn::inner_product_forward::primitive_desc CreateFcPrimDesc(
       const mkldnn::memory::desc& input_desc,
       const mkldnn::memory::desc& weights_desc,
......
@@ -332,43 +395,6 @@ class FCPrimitiveFactory {
     return inner_product_forward::primitive_desc(fc_desc, attrs, engine_);
   }
 
-  mkldnn::inner_product_forward::primitive_desc CreateFcPrimDesc(
-      const mkldnn::memory::desc& input_desc,
-      const mkldnn::memory::desc& weights_desc,
-      const mkldnn::memory::desc& dst_desc,
-      const mkldnn::primitive_attr& attrs) {
-    auto fc_desc = inner_product_forward::desc(prop_kind::forward, input_desc,
-                                               weights_desc, dst_desc);
-    return inner_product_forward::primitive_desc(fc_desc, attrs, engine_);
-  }
-
-  // Since MKL-DNN requires the number of input dimensions to be
-  // equal to the number of weight dimensions, we have to convert
-  // weights to 4D memory if input is 4D. It also requires that
-  // all dimensions of weights and inputs agree, with an exception
-  // for the batch size and number of output channels (the first dim).
-  // In order to perform that we have to prepare the memory descriptor
-  // by hand, as MKL-DNN's reorder does not support conversion
-  // from one dimensionality to another. Hence, we set
-  // the first dimension of weights to resemble the number of outputs
-  // and then we use the sizes of the number of input channels as well
-  // as image width and height for the latter dimensions. Then we create
-  // memories, find a format corresponding with input format and
-  // perform a conversion.
-  mkldnn::memory CreateFourDimWeightsMemory(const Tensor* input,
-                                            const Tensor* weights) {
-    auto input_dims = framework::vectorize(input->dims());
-    auto weight_dims = framework::vectorize(weights->dims());
-    auto dims = {weight_dims[1], input_dims[1], input_dims[2], input_dims[3]};
-
-    auto dst_format = MatchWeightFormat(input->format());
-    auto src_desc = CreateMemDescriptor<float>(dims, MKLDNNMemoryFormat::oihw);
-    auto dst_desc = CreateMemDescriptor<float>(dims, dst_format);
-
-    return Reorder(src_desc, dst_desc, weights_->get_data_handle());
-  }
-
   // Create output memory based on output tensor and inner_product
   // primitive descriptor format chosen for output
   mkldnn::memory CreateDstMemory(
@@ -379,7 +405,18 @@ class FCPrimitiveFactory {
     T_out* output_data =
         output->mutable_data<T_out>(ctx.GetPlace(), buffer_size);
     memory dst_mem(dst_desc, engine_, to_void_cast<T_out>(output_data));
-    output->set_format(platform::GetMKLDNNFormat(dst_mem));
+
+    MKLDNNMemoryFormat format;
+    auto data_type = input_->get_desc().data.data_type;
+    if (data_type == mkldnn_f32)
+      format = MKLDNNMemoryFormat::nchw;
+    else
+      format = MKLDNNMemoryFormat::nhwc;
+
+    MKLDNNMemoryFormat selected = platform::MKLDNNFormatForSize(
+        framework::vectorize<int>(output->dims()).size(), format);
+
+    output->set_format(selected);
     return dst_mem;
   }
......
@@ -19,14 +19,8 @@ import numpy as np
 from paddle.fluid.tests.unittests.op_test import OpTest
 
 
-def fully_connected_naive(input, weights, bias_data=None):
-    result = None
-
-    if not bias_data:
-        result = np.dot(input, weights)
-    else:
-        result = np.dot(input, weights) + bias_data
-
+def fully_connected_naive(input, weights, bias_data):
+    result = np.dot(input, weights) + bias_data
     return result
 
......
@@ -39,18 +33,24 @@ class MatrixGenerate:
 class TestFCMKLDNNOp(OpTest):
     def create_data(self):
         self.matrix = MatrixGenerate(1, 10, 15, 3, 3)
+        self.bias = np.random.random(15).astype("float32")
 
     def setUp(self):
         self.op_type = "fc"
         self._cpu_only = True
         self.use_mkldnn = True
         self.create_data()
-        self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights}
+        self.inputs = {
+            'Input': self.matrix.input,
+            'W': self.matrix.weights,
+            'Bias': self.bias
+        }
 
-        self.attrs = {'use_mkldnn': self.use_mkldnn, }
+        self.attrs = {'use_mkldnn': self.use_mkldnn}
 
         self.outputs = {
-            'Out': fully_connected_naive(self.matrix.input, self.matrix.weights)
+            'Out': fully_connected_naive(self.matrix.input,
+                                         self.matrix.weights, self.bias)
         }
 
     def test_check_output(self):
......
@@ -67,6 +67,7 @@ class TestFCMKLDNNOp(OpTest):
 class TestFCMKLDNNOp1(TestFCMKLDNNOp):
     def create_data(self):
         self.matrix = MatrixGenerate(2, 15, 48, 2, 2)
+        self.bias = np.random.random(48).astype("float32")
 
 
 if __name__ == "__main__":
......