提交 5d7d5482 编写于 作者: M Michał Gallus 提交者: Tao Luo

INT8 Fully-connected (#17641)

* Implement Int8 FC

* Integrate FC into INT8v2

test=develop

* int8 FC: transpose weights before computing scales

test=develop

* Add support for activation_type string in FC

test=develop

* Disable MKL-DNN's FC in VGG16 and 19

test=develop

* Disable FC quantization when mkldnn FC is disabled

test=develop

* Solve PADDLE_ENFORCES in FC int8

* Fix Paddle enforces and remove const cast

test=develop

* Fix style changes

test=develop

* Fix quantizer_tester test and add fc quantization

test=develop

* Fix FC test fail on CUDA

* Remove unnecessary log from quantize placement pass

test=develop

* Add Thread ID to FC hash key

test=develop

* Add comments to MKL-DNN FC Kernel

test=develop

* Refactor quantizer

test=develop

* Fix linter issues

test=develop

* Fix crash in slim googlenet

test=develop

* Fix PADDLE_ENFORCE messages

test=develop
上级 b639a882
......@@ -190,6 +190,10 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, FP32);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, S8);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, U8);\n")
elseif(${MKLDNN_FILE} STREQUAL "fc_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, FP32);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, S8);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, U8);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
......
......@@ -905,15 +905,17 @@ PDNode *patterns::FCMKLDNN::operator()(paddle::framework::ir::PDNode *x,
auto *fc_op = pattern->NewNode(fc_repr())->assert_is_op("fc");
// Create variables
// Input
auto *input_var = pattern->NewNode(input_repr())
->AsInput()
->assert_is_op_input("fc", "Input");
// Filter
auto *fc_weight_var = pattern->NewNode(weights_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("fc", "W");
// Bias
auto *fc_bias_var = pattern->NewNode(bias_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("fc", "Bias");
// Output
auto *fc_out_var = pattern->NewNode(output_repr())
......@@ -921,7 +923,8 @@ PDNode *patterns::FCMKLDNN::operator()(paddle::framework::ir::PDNode *x,
->assert_is_op_output("fc", "Out")
->assert_is_only_output_of_op("fc");
fc_op->LinksFrom({x, fc_weight_var, fc_bias_var}).LinksTo({fc_out_var});
fc_op->LinksFrom({input_var, fc_weight_var, fc_bias_var})
.LinksTo({fc_out_var});
return fc_out_var;
}
......
......@@ -517,6 +517,7 @@ struct FCMKLDNN : public PatternBase {
// declare operator node's name
PATTERN_DECL_NODE(fc);
// declare variable node's name
PATTERN_DECL_NODE(input);
PATTERN_DECL_NODE(weights);
PATTERN_DECL_NODE(bias);
PATTERN_DECL_NODE(output);
......
......@@ -17,6 +17,7 @@
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
......@@ -43,6 +44,13 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input,
std::string input_name, double scale_to_one,
bool is_unsigned,
std::string scale_attr_name) const {
auto inputs = op->Op()->InputNames();
bool name_found =
std::find(inputs.begin(), inputs.end(), input_name) != inputs.end();
PADDLE_ENFORCE_EQ(
name_found, true,
platform::errors::InvalidArgument("%s isn't the input of the %s operator",
input_name, op->Op()->Type()));
unsigned max = is_unsigned ? U8_MAX : S8_MAX;
float scale = scale_to_one * max;
......@@ -122,6 +130,13 @@ void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output,
std::string output_name,
double scale_to_one, bool is_unsigned,
std::string scale_attr_name) const {
auto outputs = op->Op()->OutputNames();
bool name_found =
std::find(outputs.begin(), outputs.end(), output_name) != outputs.end();
PADDLE_ENFORCE_EQ(name_found, true,
platform::errors::InvalidArgument(
"%s isn't the output of the %s operator", output_name,
op->Op()->Type()));
unsigned max = is_unsigned ? U8_MAX : S8_MAX;
float scale = scale_to_one * max;
......@@ -228,6 +243,66 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
PrettyLogDetail(msg_ss.str().c_str());
}
void CPUQuantizePass::QuantizeFc(Graph* graph) const {
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
patterns::FCMKLDNN fc_pattern{pattern, name_scope_};
auto* fc_input = gpd.mutable_pattern()
->NewNode("fc_quantizer/input")
->AsInput()
->assert_is_op_input("fc", "Input");
fc_pattern(fc_input, false);
int quantize_fc_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "Quantize fc op";
GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fc_pattern);
auto* fc_op_desc = fc->Op();
// skip if should not be quantized
if (fc_op_desc->GetAttrIfExists<bool>("use_quantizer") != true ||
fc_op_desc->GetAttrIfExists<bool>("use_mkldnn") != true)
return;
GET_IR_NODE_FROM_SUBGRAPH(weights, weights, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(input, input, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(output, output, fc_pattern);
// get scales calculated after warmup, they scale variables to MAX=1.0
auto scales = Get<VarQuantScale>("quant_var_scales");
auto input_scale = scales[input->Name()].second.data<double>()[0];
bool is_input_unsigned = scales[input->Name()].first;
QuantizeInput(g, fc, input, "Input", input_scale, is_input_unsigned,
"Scale_in");
auto weight_scale_tensor = scales[weights->Name()].second;
EigenVectorArrayMap eigen_tensor{weight_scale_tensor.data<double>(),
weight_scale_tensor.numel(), 1};
eigen_tensor *= static_cast<double>(S8_MAX);
std::vector<float> filter_scale{
weight_scale_tensor.data<double>(),
weight_scale_tensor.data<double>() + weight_scale_tensor.numel()};
fc->Op()->SetAttr("Scale_weights", filter_scale);
auto output_scale = scales[output->Name()].second.data<double>()[0];
bool is_output_unsigned = scales[output->Name()].first;
DequantizeOutput(g, fc, output, "Out", output_scale, is_output_unsigned,
"Scale_out");
++quantize_fc_count;
};
gpd(graph, handler);
AddStatis(quantize_fc_count);
std::stringstream msg_ss;
msg_ss << "--- quantized " << quantize_fc_count << " fc ops";
PrettyLogDetail(msg_ss.str().c_str());
}
void CPUQuantizePass::QuantizePool(Graph* graph) const {
GraphPatternDetector gpd;
auto pattern = gpd.mutable_pattern();
......@@ -418,6 +493,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
QuantizeConcat(graph);
QuantizePriorBox(graph);
QuantizeTranspose(graph);
QuantizeFc(graph);
}
} // namespace ir
......
......@@ -46,6 +46,8 @@ class CPUQuantizePass : public FusePassBase {
void QuantizeConv(Graph* graph, bool with_residual_data = false) const;
void QuantizeFc(Graph* graph) const;
void QuantizePool(Graph* graph) const;
void QuantizeConcat(Graph* graph) const;
......
......@@ -62,6 +62,10 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
if (inputs.size() > 1) op->SetInput("W", {inputs[1]});
if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]});
op->SetOutput("Out", {outputs[0]});
op->SetAttr("use_quantizer", use_quantizer);
op->SetAttr("Scale_in", 1.0f);
op->SetAttr("Scale_out", 1.0f);
op->SetAttr("Scale_weights", std::vector<float>{1.0f});
} else if (type == "concat") {
op->SetInput("X", inputs);
op->SetOutput("Out", outputs);
......@@ -71,13 +75,13 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
namespace {
static const std::initializer_list<std::string> variable_names{
"a", "w1", "c", "d", "w2", "e", "f", "g",
"h", "w3", "b1", "i", "j", "w4", "b2"};
"a", "w1", "c", "d", "w2", "e", "f", "g", "h",
"w3", "b1", "i", "j", "w4", "b2", "w5", "b3"};
// (a,w1)->Conv1->c and c->Pool1->d
//
// (d,w2)->Conv2->e and e->Pool2->f
//
// d->Dropout1->g and g->Fc1->h and (h,w3,b1,i)->Conv3->j
// d->Dropout1->g and (g, w5, b3)->Fc1->h and (h,w3,b1,i)->Conv3->j
//
// (d,w4, b2)->Conv4->i
ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
......@@ -98,7 +102,8 @@ ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, use_quantizer);
SetOp(&prog, "dropout", "Dropout1", {"d"}, {"g"}, use_mkldnn);
SetOp(&prog, "fc", "Fc1", {"g"}, {"h"}, use_mkldnn);
SetOp(&prog, "fc", "Fc1", {"g", "w5", "b3"}, {"h"}, use_mkldnn,
use_quantizer);
SetOp(&prog, "conv2d", "Conv3", {"h", "w3", "b1", "i"}, {"j"}, use_mkldnn,
use_quantizer);
......@@ -194,13 +199,13 @@ TEST(CpuQuantizePass, quantize) {
// (d->QUANT3->IN3,w2)->Conv2->OUT3->DEQUANT3->e and
// e->QUANT4->IN4->Pool2->OUT4->DEQUANT4->f
//
// d->Dropout1->g and g->Fc1->h and
// d->Dropout1->g and (g->QUANT8->IN8,w5,b3)->Fc1->OUT7->DEQUANT7->h and
// (h->QUANT5->IN5,w3,b1,i->QUANT6->IN6)->Conv3->OUT5->DEQUANT5->j
//
// (d->QUANT7->IN7,w4, b2)->Conv4->DEQUANT6->OUT6->i
// Insert nodes: 7 Quant + 7 IN + 6 OUT + 6 DEQUANT
int added_nodes = 7 + 7 + 6 + 6;
MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 7, 6, added_nodes,
// Insert nodes: 8 Quant + 8 IN + 7 OUT + 7 DEQUANT
int added_nodes = 8 + 8 + 7 + 7;
MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 8, 7, added_nodes,
2.0f * 127);
}
......
......@@ -26,12 +26,11 @@ namespace framework {
namespace ir {
void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE(graph);
PADDLE_ENFORCE_NOT_NULL(graph,
platform::errors::InvalidArgument(
"Pointer to graph argument should not be NULL."));
Init("fc_mkldnn_pass", graph);
auto* scope = param_scope();
PADDLE_ENFORCE(scope);
GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern()
->NewNode("fc_mkldnn_pass/x")
......@@ -49,18 +48,25 @@ void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const {
return;
}
GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(input, input, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(weights, weights, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(bias, bias, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(output, output, fc_pattern);
OpDesc* desc = fc->Op();
auto in_size = fc->inputs[0]->Var()->GetShape().size();
if (in_size != 2 && in_size != 4) {
auto dims = fc->inputs[0]->Var()->GetShape();
auto dim_num = dims.size();
bool are_dims_supported = dim_num == 2 || dim_num == 4;
constexpr size_t height_axis = 2;
constexpr size_t width_axis = 3;
bool is_size_supported =
dim_num == 4 ? (dims[width_axis] == 1 && dims[height_axis] == 1) : true;
if (!are_dims_supported || !is_size_supported) {
VLOG(3) << "Do not enable FC MKL-DNN for dimensions different than 2 & 4";
VLOG(3) << "Or when width and height are different than one";
return;
}
desc->SetAttr("use_mkldnn", true);
PADDLE_ENFORCE(subgraph.count(x));
found_fc_count++;
};
......
......@@ -276,7 +276,7 @@ class MkldnnQuantizerTest : public testing::Test {
std::pair<bool, framework::LoDTensor> GetMaxChScalingFactor(
const framework::LoDTensor& var_tensor, bool is_unsigned) const {
return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned);
return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned, 0);
}
std::pair<bool, framework::LoDTensor> GetKLScalingFactor(
......
......@@ -37,6 +37,11 @@ using framework::LoDTensor;
using framework::ir::Graph;
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
using EigenMatrixDoubleArray =
Eigen::Array<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
using EigenMatrixArray =
Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
using ConstEigenMatrixArrayMap = Eigen::Map<const EigenMatrixArray>;
using string::PrettyLogH1;
static LoDTensor CreateScaleTensor(int64_t channels_num = 1);
......@@ -66,7 +71,7 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
bool is_unsigned = false;
bool compute_scale = true;
if (is_output) {
if (op->Type() == "conv2d") {
if (op->Type() == "conv2d" || op->Type() == "fc") {
// output of conv2d with relu must be unsigned
std::string fuse_activation =
op->GetAttrIfExists<std::string>("fuse_activation");
......@@ -138,7 +143,12 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale(
scales_[var_name] = GetMaxScalingFactor(var_tensor, is_unsigned);
break;
case ScaleAlgo::MAX_CH:
scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned);
scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned,
/*is_transposed*/ false);
break;
case ScaleAlgo::MAX_CH_T:
scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned,
/*is_transposed*/ true);
break;
case ScaleAlgo::KL:
scales_[var_name] = GetKLScalingFactor(var_tensor, is_unsigned);
......@@ -319,7 +329,7 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor(
std::pair<bool, LoDTensor>
AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor(
const LoDTensor& var_tensor, bool is_unsigned) const {
const LoDTensor& var_tensor, bool is_unsigned, bool is_transposed) const {
PADDLE_ENFORCE(var_tensor.dims().size() > 0, "Tensor dimension is empty.");
ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
......@@ -331,18 +341,23 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor(
"Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
min_val);
int channels = var_tensor.dims()[0];
LoDTensor scale_tensor = CreateScaleTensor(channels);
auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
for (int i = 0; i < channels; ++i) {
const auto tensor = var_tensor.Slice(i, i + 1);
auto dims = var_tensor.dims();
constexpr int num_col_dims = 1;
auto flattened_dims = framework::flatten_to_2d(dims, num_col_dims);
ConstEigenMatrixArrayMap eigen_tensor_mat{
var_tensor.data<float>(), flattened_dims[0], flattened_dims[1]};
ConstEigenVectorArrayMap eigen_tensor{tensor.data<float>(), tensor.numel(),
1};
float max_abs = eigen_tensor.abs().maxCoeff();
scale_ptr[i] = 1.0 / max_abs;
EigenMatrixDoubleArray scales;
if (is_transposed) {
scales = 1.0 / eigen_tensor_mat.cast<double>().abs().colwise().maxCoeff();
} else {
scales = 1.0 / eigen_tensor_mat.cast<double>().abs().rowwise().maxCoeff();
}
int output_channel_axis = is_transposed;
int channels = dims[output_channel_axis];
LoDTensor scale_tensor = CreateScaleTensor(channels);
auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
std::copy(scales.data(), scales.data() + scales.size(), scale_ptr);
return std::make_pair(is_unsigned, scale_tensor);
}
......
......@@ -79,7 +79,8 @@ class AnalysisPredictor::MkldnnQuantizer {
const framework::LoDTensor& var_tensor, bool is_unsigned) const;
std::pair<bool, framework::LoDTensor> GetMaxChScalingFactor(
const framework::LoDTensor& var_tensor, bool is_unsigned) const;
const framework::LoDTensor& var_tensor, bool is_unsigned,
bool is_transposed) const;
std::pair<bool, framework::LoDTensor> GetMaxScalingFactor(
const framework::LoDTensor& var_tensor, bool is_unsigned) const;
......
......@@ -37,6 +37,11 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
rules_["transpose2"]["X"] = ScaleAlgo::KL;
rules_["transpose2"]["Out"] = ScaleAlgo::NONE;
rules_["fc"]["Input"] = ScaleAlgo::KL;
rules_["fc"]["W"] = ScaleAlgo::MAX_CH_T;
rules_["fc"]["Bias"] = ScaleAlgo::NONE;
rules_["fc"]["Out"] = ScaleAlgo::KL;
}
ScaleAlgo MkldnnQuantizerConfig::scale_algo(
......
......@@ -27,8 +27,10 @@ namespace paddle {
// Algorithms for finding scale of quantized Tensors.
enum class ScaleAlgo {
NONE, // Do not compute scale
MAX, // Find scale based on the maximum absolute value
MAX_CH, // Find scale based on the maximum absolute value per channel
MAX, // Find scale based on the max absolute value
MAX_CH, // Find scale based on the max absolute value per output channel
MAX_CH_T, // Find scale based on the max absolute value per output channel
// of a transposed tensor
KL, // Find scale based on KL Divergence
};
......
......@@ -93,13 +93,21 @@ class FCOp : public framework::OperatorWithKernel {
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
int customized_type_value =
framework::OpKernelType::kDefaultCustomizedTypeValue;
auto input_data_type =
OperatorWithKernel::IndicateVarDataType(ctx, "Input");
if (ctx.Attr<bool>("use_mkldnn")) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
using framework::proto::VarType;
customized_type_value = (input_data_type == VarType::INT8 ||
input_data_type == VarType::UINT8)
? kFCMKLDNNINT8
: kFCMKLDNNFP32;
}
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace(),
layout, library);
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library, customized_type_value);
}
};
......@@ -132,6 +140,27 @@ class FCOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
"Skip calling InferShape() function in the runtime.")
.SetDefault(true);
/* int8 parameters */
AddAttr<bool>("use_quantizer",
"(bool, default false) "
"Set to true for operators that should be quantized and use "
"int8 kernel. "
"Only used on CPU.")
.SetDefault(false);
AddAttr<float>("Scale_in",
"(float, default 1.0f), The quantize scale of input data")
.SetDefault(1.0f);
AddAttr<std::vector<float>>("Scale_weights",
"(std::vector<float>, default {1.0f}), The "
"quantize scale of weights data")
.SetDefault({1.0f});
AddAttr<float>("Scale_out",
"(float, default 1.0f), The quantize scale of output data")
.SetDefault(1.0f);
AddAttr<bool>("force_fp32_output",
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8")
.SetDefault(false);
AddComment(R"DOC(
Fully Connected Operator.
......
......@@ -21,6 +21,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
enum { kFCMKLDNNFP32 = 1, kFCMKLDNNINT8 = 2 };
using Tensor = framework::Tensor;
......
......@@ -78,7 +78,6 @@ class DeQuantOpKernel : public framework::OpKernel<T> {
auto dst_md = platform::MKLDNNMemDesc(
{dst_tz}, memory::data_type::f32,
platform::MKLDNNFormatForSize(dst_tz.size(), memory::format::nchw));
auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
dst_memory = std::make_shared<mkldnn::memory>(
dst_pd, to_void_cast<float>(output_data));
......
......@@ -42,6 +42,7 @@ class TestFCMKLDNNOp(OpTest):
def setUp(self):
self.op_type = "fc"
self._cpu_only = True
self.use_mkldnn = True
self.create_data()
self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册