diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 4cd29486a8e846dc04fbc4e467f2c40782408dfa..5e0be5d445eae9d6d857ab0d6c5816807b4af523 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -37,18 +37,24 @@ endif(WIN32)
 
 add_subdirectory(api)
 
+if(WITH_MKLDNN)
+  set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/api/mkldnn_quantizer.cc)
+  set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
+endif()
+
 set(STATIC_INFERENCE_APIS paddle_fluid_api paddle_inference_api analysis_predictor)
 set(SHARED_INFERENCE_SRCS
     io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
+    ${mkldnn_quantizer_src}
     ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc)
 
 if(WIN32)
   sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS}
               zero_copy_tensor reset_tensor_array
-              analysis_config paddle_pass_builder)
+              analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
 else(WIN32)
   cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS}
-             zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder)
+             zero_copy_tensor reset_tensor_array analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
 endif(WIN32)
 
 if(NOT APPLE)
@@ -61,11 +67,11 @@ endif()
 if(WIN32)
   sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
               DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
-              analysis_config paddle_pass_builder)
+              analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
 else(WIN32)
   cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
              DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
-             analysis_config paddle_pass_builder)
+             analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
 endif()
 get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
 target_link_libraries(paddle_fluid_shared ${os_dependency_modules})
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 90f09505c023c656a0b4ffaf9e3ef52152c0f0e7..882bb3468388e794e975d87de73537ac41f17cf7 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -33,13 +33,19 @@ endif()
 
 add_subdirectory(details)
 
-cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
+if(WITH_MKLDNN)
+  set(mkldnn_quantizer_src mkldnn_quantizer.cc)
+  set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
+  cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder)
+endif()
+
+cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor
+cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS paddle_inference_api zero_copy_tensor
            reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps})
 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc
     DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
-         analysis_config paddle_pass_builder zero_copy_tensor
+         paddle_pass_builder zero_copy_tensor
          reset_tensor_array)
 
 cc_test(test_paddle_inference_api
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index
7bfdada49664544c829b1f4fc886292b29717c32..aee94e12340597e981ac385a01335d2ffa069191 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -108,6 +108,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // MKLDNN related. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); + // Quantization related. + CP_MEMBER(use_mkldnn_quantizer_); + CP_MEMBER(mkldnn_quantizer_config_); CP_MEMBER(use_anakin_); CP_MEMBER(anakin_max_batchsize_); @@ -148,6 +151,26 @@ void AnalysisConfig::EnableMKLDNN() { Update(); } +void AnalysisConfig::EnableMkldnnQuantizer() { +#ifdef PADDLE_WITH_MKLDNN + if (!mkldnn_quantizer_config_) + mkldnn_quantizer_config_.reset(new MkldnnQuantizerConfig()); + use_mkldnn_quantizer_ = true; +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer"; + use_mkldnn_quantizer_ = false; +#endif + + Update(); +} + +std::shared_ptr AnalysisConfig::mkldnn_quantizer_config() + const { + PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_, + "MkldnnQuantizer was not enabled yet."); + return mkldnn_quantizer_config_; +} + void AnalysisConfig::EnableTensorRtEngine( int workspace_size, int max_batch_size, int min_subgraph_size, AnalysisConfig::Precision precision_mode, bool use_static) { @@ -224,15 +247,27 @@ void AnalysisConfig::Update() { #endif } - if (enable_memory_optim_) { - auto analysis_passes = pass_builder()->AnalysisPasses(); - auto memory_opti_pass_name = "memory_optimize_pass"; - bool already_exists = - std::find(analysis_passes.begin(), analysis_passes.end(), - memory_opti_pass_name) != analysis_passes.end(); - if (!already_exists) { - pass_builder()->AppendAnalysisPass(memory_opti_pass_name); + // Quantization passes must come after all other optimization passes + if (use_mkldnn_quantizer_) { + if (!enable_ir_optim_) { + LOG(ERROR) << "EnableMkldnnQuantizer() only works when IR optimization " + "is enabled."; } +#ifdef PADDLE_WITH_MKLDNN + pass_builder()->EnableMkldnnQuantizer(); +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer"; + use_mkldnn_quantizer_ = false; +#endif + } + +#ifdef PADDLE_WITH_MKLDNN + // Do not optimize before quantization + if (enable_memory_optim_ && !use_mkldnn_quantizer_) { +#else + if (enable_memory_optim_) { +#endif + pass_builder()->AppendAnalysisPass("memory_optimize_pass"); } if (use_anakin_) { @@ -277,6 +312,7 @@ std::string AnalysisConfig::SerializeInfoCache() { for (auto &item : mkldnn_enabled_op_types_) ss << item; ss << ";"; + ss << use_mkldnn_quantizer_; ss << model_from_memory_; ss << enable_ir_optim_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 001e8e66d5560f631dab8dd7c13bbaaef1e6195a..f7260561547bb0bd7aea1590239e38090953f6fc 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" @@ -35,8 +36,13 @@ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/inference/api/mkldnn_quantizer.h" +#endif + #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include 
"paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" @@ -341,10 +347,7 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, return true; } -// NOTE All the members in AnalysisConfig should be copied to Argument. -void AnalysisPredictor::OptimizeInferenceProgram() { - status_program_optimized_ = true; - +void AnalysisPredictor::PrepareArgument() { argument_.SetUseGPU(config_.use_gpu()); argument_.SetGPUDeviceId(config_.gpu_device_id()); argument_.SetEnableMemoryOptim(config_.enable_memory_optim()); @@ -390,6 +393,16 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); } +#ifdef PADDLE_WITH_MKLDNN + if (config_.mkldnn_quantizer_enabled()) { + LOG(INFO) << "Quantization is enabled"; + argument_.SetQuantizeEnabledOpTypes( + config_.mkldnn_quantizer_config()->enabled_op_types()); + argument_.SetQuantizeExcludedOpIds( + config_.mkldnn_quantizer_config()->excluded_op_ids()); + } +#endif + auto passes = config_.pass_builder()->AllPasses(); if (!config_.ir_optim()) { passes.clear(); @@ -398,6 +411,13 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetIrAnalysisPasses(passes); argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses()); argument_.SetScopeNotOwned(scope_.get()); +} + +// NOTE All the members in AnalysisConfig should be copied to Argument. +void AnalysisPredictor::OptimizeInferenceProgram() { + status_program_optimized_ = true; + + PrepareArgument(); Analyzer().Run(&argument_); PADDLE_ENFORCE(argument_.scope_valid()); @@ -439,12 +459,31 @@ std::unique_ptr CreatePaddlePredictor< } std::unique_ptr predictor(new AnalysisPredictor(config)); - if (!dynamic_cast(predictor.get())->Init(nullptr)) { + auto predictor_p = dynamic_cast(predictor.get()); + + if (!predictor_p->Init(nullptr)) { + return nullptr; + } + + if (config.mkldnn_quantizer_enabled() && !predictor_p->MkldnnQuantize()) { return nullptr; } + return predictor; } +bool AnalysisPredictor::MkldnnQuantize() { +#if PADDLE_WITH_MKLDNN + if (!mkldnn_quantizer_) + mkldnn_quantizer_ = new AnalysisPredictor::MkldnnQuantizer( + *this, config_.mkldnn_quantizer_config()); + return mkldnn_quantizer_->Quantize(); +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer"; + return false; +#endif +} + void AnalysisPredictor::PrepareFeedFetch() { PADDLE_ENFORCE_NOT_NULL(sub_scope_); CreateFeedFetchVar(sub_scope_); @@ -703,6 +742,13 @@ AnalysisPredictor::~AnalysisPredictor() { scope_->DeleteScope(sub_scope_); } +#if PADDLE_WITH_MKLDNN + if (mkldnn_quantizer_) { + delete mkldnn_quantizer_; + mkldnn_quantizer_ = nullptr; + } +#endif + // TODO(Superjomn) deduce the directory path. 
std::string out_path = inference::analysis::GetMemoryCachePath( config_.model_dir(), config_.prog_file()); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 087bfbd002658da28e3097a10933d5be2312ab4f..e4c537f426650f16ced32d3cb61b944a78c35b43 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -70,6 +70,7 @@ class AnalysisPredictor : public PaddlePredictor { void CreateFeedFetchVar(framework::Scope *scope); void PrepareFeedFetch(); + void PrepareArgument(); void OptimizeInferenceProgram(); Argument &analysis_argument() { return argument_; } @@ -83,6 +84,8 @@ class AnalysisPredictor : public PaddlePredictor { std::string GetSerializedProgram() const override; + bool MkldnnQuantize(); + protected: // For memory optimization. bool need_collect_var_shapes_for_memory_optim(); @@ -143,6 +146,16 @@ class AnalysisPredictor : public PaddlePredictor { std::vector fetches_; std::map idx2fetches_; +#if PADDLE_WITH_MKLDNN + // Helper class to perform quantization + class MkldnnQuantizer; + MkldnnQuantizer *mkldnn_quantizer_{nullptr}; + +#if PADDLE_WITH_TESTING + friend class MkldnnQuantizerTest; +#endif +#endif + // Memory buffer for feed inputs. The temporary LoDTensor will cause serious // concurrency problems, wrong results and memory leak, so cache them. std::vector feed_tensors_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 6696839b53fb21c274843afd86b5d8b5c2042c51..0429a287c74f9db5257181151d90b77da86c694c 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -17,9 +17,13 @@ #include #include // NOLINT #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/inference/api/mkldnn_quantizer.h" +#endif DEFINE_string(dirname, "", "dirname to tests."); @@ -243,4 +247,241 @@ TEST(AnalysisPredictor, memory_optim) { inference::CompareResult(output, output1); } +#ifdef PADDLE_WITH_MKLDNN +class MkldnnQuantizerTest : public testing::Test { + public: + MkldnnQuantizerTest() { + AnalysisConfig config(FLAGS_dirname); + + predictor.reset(new AnalysisPredictor(config)); + auto* predictor_p = static_cast(predictor.get()); + + auto qconfig = std::make_shared(); + + mkldnn_quantizer.reset( + new AnalysisPredictor::MkldnnQuantizer(*predictor_p, qconfig)); + } + + std::pair, float> Histogram( + const framework::LoDTensor& var_tensor, float min_val, float max_val, + int num_bins) const { + return mkldnn_quantizer->Histogram(var_tensor, min_val, max_val, num_bins); + } + + std::pair GetMaxScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const { + return mkldnn_quantizer->GetMaxScalingFactor(var_tensor, is_unsigned); + } + + std::pair GetMaxChScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const { + return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned); + } + + std::pair GetKLScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const { + return mkldnn_quantizer->GetKLScalingFactor(var_tensor, is_unsigned); + } + + protected: + std::unique_ptr predictor; + std::unique_ptr mkldnn_quantizer; + 
float abs_error = 1e-6; + static const std::array non_negative_values; + static const std::array positive_and_negative_values; +}; + +const std::array MkldnnQuantizerTest::non_negative_values = { + 0.0158671, 0.026459, 0.0280772, 0.00962479, 0.0131628, + 0.016704, 0.00118407, 0.00765726, 0.0123213, 0.00944741}; +const std::array MkldnnQuantizerTest::positive_and_negative_values = + {-0.0482659, -0.0102493, -0.00794221, -0.00387115, -0.00674586, + -0.0495346, 0.0629528, -0.00531285, -0.0230353, 0.0269089}; + +TEST_F(MkldnnQuantizerTest, histogram_inverted_min_max) { + const auto& values = non_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + ASSERT_THROW(Histogram(var_tensor, max_val, min_val, 3), + platform::EnforceNotMet); +} + +TEST_F(MkldnnQuantizerTest, histogram_non_negative_to_3) { + // all non-negative values + const auto& values = non_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + std::vector histogram; + float bin_width; + + std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3); + + ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.f, abs_error) + << "Improperly calculated bin_width."; + + ASSERT_EQ(histogram[0], 4); + ASSERT_EQ(histogram[1], 4); + ASSERT_EQ(histogram[2], 2); +} + +TEST_F(MkldnnQuantizerTest, histogram_positive_and_negative_to_3) { + const auto& values = positive_and_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + std::vector histogram; + float bin_width; + + std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3); + + ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.0f, abs_error) + << "Improperly calculated bin_width."; + + ASSERT_EQ(histogram[0], 3); + ASSERT_EQ(histogram[1], 5); + ASSERT_EQ(histogram[2], 2); +} + +TEST_F(MkldnnQuantizerTest, histogram_zero_bins) { + const auto& values = non_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + ASSERT_THROW(Histogram(var_tensor, min_val, max_val, 0), + platform::EnforceNotMet); +} + +TEST_F(MkldnnQuantizerTest, histogram_empty) { + // empty tensor + ASSERT_THROW(Histogram({}, -1, 1, 1), platform::EnforceNotMet); + + // zero tensor + framework::LoDTensor var_tensor; + var_tensor.Resize({0}); + ASSERT_TRUE(var_tensor.mutable_data(platform::CPUPlace())); + + ASSERT_THROW(Histogram(var_tensor, -1, 1, 1), platform::EnforceNotMet); +} + +TEST_F(MkldnnQuantizerTest, kl_scaling_factor_signed) { + const auto& values = 
positive_and_negative_values; + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, false); + + ASSERT_EQ(is_unsigned, false); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data()[0], 1.0 / 0.0899106152344, abs_error); +} + +TEST_F(MkldnnQuantizerTest, max_scaling_factor_signed) { + const auto& values = positive_and_negative_values; + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, false); + + ASSERT_EQ(is_unsigned, false); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data()[0], 1.0 / max_val, abs_error); +} + +TEST_F(MkldnnQuantizerTest, max_scaling_factor_unsigned) { + const auto& values = non_negative_values; + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, true); + + ASSERT_EQ(is_unsigned, true); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data()[0], 1.0 / max_val, abs_error); +} + +TEST_F(MkldnnQuantizerTest, max_scaling_factor_chwise_unsigned) { + const auto& values = non_negative_values; + auto max_val = *std::max_element(values.begin(), values.end()); + int channels = 3; + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(channels, 1, 1, values.size())); + for (int i = 0; i < channels; i++) + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace()) + + i * values.size()); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetMaxChScalingFactor(var_tensor, true); + + ASSERT_EQ(is_unsigned, true); + ASSERT_EQ(lod_tensor.numel(), channels); + for (int i = 0; i < channels; i++) { + ASSERT_NEAR(lod_tensor.data()[i], 1.0 / max_val, abs_error); + } +} + +TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) { + const auto& values = non_negative_values; + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, true); + + ASSERT_EQ(is_unsigned, true); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data()[0], 1.0 / 0.0252845321362, abs_error); +} +#endif + } // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc new file mode 100644 index 0000000000000000000000000000000000000000..de75e884f53143d9026636ad8663d89a36a30f69 --- /dev/null +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -0,0 +1,437 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/mkldnn_quantizer.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { + +using platform::CPUPlace; +using framework::LoDTensor; +using framework::ir::Graph; +using ConstEigenVectorArrayMap = + Eigen::Map>; +using string::PrettyLogH1; + +bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() { + PrettyLogH1("--- Calculating scales for quantization"); + using VariableNameMap = std::map>; + std::map> gathered_data; + for (const auto* op : predictor_.inference_program_->Block(0).AllOps()) { + if (op->HasAttr("use_quantizer") && + boost::get(op->GetAttr("use_quantizer"))) { + const VariableNameMap& connections_in = op->Inputs(); + const VariableNameMap& connections_out = op->Outputs(); + + auto glambda = [&](const VariableNameMap& connections, bool is_output) { + for (auto const& conn : connections) { + if (conn.second.size() == 0) continue; + auto& var_name = conn.second[0]; + + // skip if scale already computed + if (scales_.find(var_name) != scales_.end()) return; + + auto* var = predictor_.sub_scope_->FindVar(var_name); + PADDLE_ENFORCE(var, "%s is not in the scope", var_name); + PADDLE_ENFORCE(var->IsType(), + "Only support lod tensor now."); + LoDTensor* var_tensor = var->GetMutable(); + + // force unsigned type if already know it + bool is_unsigned = false; + if (is_output && op->Type() == "conv2d") { + // output of conv2d with relu must be unsigned + is_unsigned = op->HasAttr("fuse_relu") && + boost::get(op->GetAttr("fuse_relu")); + } else if (is_output && op->Type() == "pool2d") { + // output of pool2d with unsigned input must be unsigned + auto input_var_name = op->Input("X")[0]; + if (scales_.find(input_var_name) != scales_.end()) { + is_unsigned = scales_[input_var_name].first; + } + } + + CalculateSingleScale(op->Type(), conn.first, var_name, *var_tensor, + is_unsigned); + } + }; + + // handle outputs first so unsigned outputs could be inferred + glambda(connections_out, true /* is_output */); + glambda(connections_in, false /* is_output */); + } + } + + return true; +} + +void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale( + const std::string& op_type_name, const std::string& conn_name, + const std::string& var_name, const LoDTensor& var_tensor, + bool is_unsigned) { + auto rule = qconfig_->scale_algo(op_type_name, conn_name); + if (rule == ScaleAlgo::NONE) return; + + PADDLE_ENFORCE( + var_tensor.numel() > 0, + "MkldnnQuantizer: LoDTensor of variable %s for quantization of 
op " + "%s of connection %s should not be empty.", + var_name, op_type_name, conn_name); + + switch (rule) { + case ScaleAlgo::MAX: + scales_[var_name] = GetMaxScalingFactor(var_tensor, is_unsigned); + break; + case ScaleAlgo::MAX_CH: + scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned); + break; + case ScaleAlgo::KL: + scales_[var_name] = GetKLScalingFactor(var_tensor, is_unsigned); + break; + default: + throw std::runtime_error( + "MkldnnQuantizer: Unexpected ScaleAlgo specified."); + } +} + +std::vector AnalysisPredictor::MkldnnQuantizer::ExpandQuantizedBins( + std::vector quantized_bins, std::vector reference_bins) const { + std::vector expanded_quantized_bins(reference_bins.size(), 0); + int num_merged_bins = reference_bins.size() / quantized_bins.size(); + int j_start = 0; + int j_end = num_merged_bins; + for (size_t idx = 0; idx < quantized_bins.size(); idx++) { + int zero_count = + std::count(&reference_bins[j_start], &reference_bins[j_end], 0); + num_merged_bins = j_end - j_start; + int avg_bin_ele; + if (zero_count == num_merged_bins) { + avg_bin_ele = 0; + } else { + avg_bin_ele = quantized_bins[idx] / (num_merged_bins - zero_count + 0.0); + } + for (int idx1 = j_start; idx1 < j_end; idx1++) { + expanded_quantized_bins[idx1] = + (reference_bins[idx1] == 0) ? 0 : avg_bin_ele; + } + j_start += num_merged_bins; + j_end += num_merged_bins; + if ((idx + 1) == quantized_bins.size() - 1) { + j_end = reference_bins.size(); + } + } + return expanded_quantized_bins; +} + +std::pair +AnalysisPredictor::MkldnnQuantizer::GetKLScalingFactor( + const LoDTensor& var_tensor, bool is_unsigned) const { + ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), + var_tensor.numel(), 1}; + int precision_hist_num_bins = 2048; + float max_val = eigen_tensor.maxCoeff(); + float min_val = eigen_tensor.minCoeff(); + bool is_positive = min_val >= 0.0f; + if (is_unsigned) + PADDLE_ENFORCE( + is_positive, + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val); + + int num_quantized_bins = 255; + + std::vector hist; + float bin_width; + int starting_iter; + int ending_iter = precision_hist_num_bins - 1; + if (is_positive) { + std::tie(hist, bin_width) = + Histogram(var_tensor, min_val, max_val, precision_hist_num_bins); + starting_iter = static_cast(ending_iter * 0.7); + } else { + float th = std::max(std::abs(max_val), std::abs(min_val)); + std::tie(hist, bin_width) = + Histogram(var_tensor, -th, th, precision_hist_num_bins); + starting_iter = 0; + if (std::abs(max_val) > std::abs(min_val)) { + while (starting_iter < ending_iter) { + if (hist[starting_iter] == 0) { + ++starting_iter; + continue; + } else { + break; + } + } + starting_iter += static_cast((ending_iter - starting_iter) * 0.6); + } else { + while (ending_iter > 0) { + if (hist[ending_iter] == 0) { + --ending_iter; + continue; + } else { + break; + } + } + starting_iter = static_cast(0.6 * ending_iter); + } + } + auto P_sum = eigen_tensor.size(); + int min_kl_divergence = 0; + int min_kl_index = 0; + bool kl_inited = false; + for (int i = starting_iter; i <= ending_iter; i++) { + std::vector reference_distr_P(&hist[0], &hist[i]); + auto outliers_count = + std::accumulate(&hist[i], &hist[precision_hist_num_bins], 0); + if (reference_distr_P[i - 1] == 0) { + continue; + } + reference_distr_P[i - 1] += outliers_count; + auto reference_distr_bins = reference_distr_P; + std::vector candidate_distr_Q(&hist[0], &hist[i]); + int num_merged_bins = i / num_quantized_bins; + std::vector 
candidate_distr_Q_quantized(num_quantized_bins, 0); + int j_start = 0; + int j_end = num_merged_bins; + for (int idx = 0; idx < num_quantized_bins; idx++) { + candidate_distr_Q_quantized[idx] = std::accumulate( + &candidate_distr_Q[j_start], &candidate_distr_Q[j_end], 0); + j_start += num_merged_bins; + j_end += num_merged_bins; + if ((idx + 1) == num_quantized_bins - 1) { + j_end = i; + } + } + candidate_distr_Q = + ExpandQuantizedBins(candidate_distr_Q_quantized, reference_distr_bins); + int Q_sum = + std::accumulate(candidate_distr_Q.begin(), candidate_distr_Q.end(), 0); + auto kl_divergence = + SafeEntropy(reference_distr_P, P_sum, candidate_distr_Q, Q_sum); + if (!kl_inited) { + min_kl_divergence = kl_divergence; + min_kl_index = i; + kl_inited = true; + } else if (kl_divergence < min_kl_divergence) { + min_kl_divergence = kl_divergence; + min_kl_index = i; + } else { + } + } + if (min_kl_index == 0) { + while (starting_iter > 0) { + if (hist[starting_iter] == 0) { + starting_iter -= 1; + continue; + } else { + break; + } + } + min_kl_index = starting_iter; + } + + LoDTensor scale_tensor; + scale_tensor.Resize({1}); + auto* scale_ptr = scale_tensor.mutable_data(CPUPlace()); + + scale_ptr[0] = 1.0 / ((min_kl_index + 0.5) * bin_width); + + return std::make_pair(is_unsigned, scale_tensor); +} + +std::pair +AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor( + const LoDTensor& var_tensor, bool is_unsigned) const { + ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), + var_tensor.numel(), 1}; + float max_abs = eigen_tensor.abs().maxCoeff(); + float min_val = eigen_tensor.minCoeff(); + if (is_unsigned) + PADDLE_ENFORCE( + min_val >= 0.0f, + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val); + + LoDTensor scale_tensor; + scale_tensor.Resize({1}); + auto* scale_ptr = scale_tensor.mutable_data(CPUPlace()); + scale_ptr[0] = 1.0 / max_abs; + + return std::make_pair(is_unsigned, scale_tensor); +} + +std::pair +AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor( + const LoDTensor& var_tensor, bool is_unsigned) const { + PADDLE_ENFORCE(var_tensor.dims().size() > 0, "Tensor dimension is empty."); + + ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), + var_tensor.numel(), 1}; + float min_val = eigen_tensor.minCoeff(); + if (is_unsigned) + PADDLE_ENFORCE( + min_val >= 0.0f, + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val); + + int channels = var_tensor.dims()[0]; + LoDTensor scale_tensor; + scale_tensor.Resize({channels}); + auto* scale_ptr = scale_tensor.mutable_data(CPUPlace()); + + for (int i = 0; i < channels; ++i) { + const auto tensor = var_tensor.Slice(i, i + 1); + + ConstEigenVectorArrayMap eigen_tensor{tensor.data(), tensor.numel(), + 1}; + float max_abs = eigen_tensor.abs().maxCoeff(); + scale_ptr[i] = 1.0 / max_abs; + } + + return std::make_pair(is_unsigned, scale_tensor); +} + +std::pair, float> +AnalysisPredictor::MkldnnQuantizer::Histogram( + const framework::LoDTensor& var_tensor, float min_val, float max_val, + size_t num_bins) const { + PADDLE_ENFORCE_GT(num_bins, 0, + "MkldnnQuantizer: To calculate Histogram, num_bins (" + + std::to_string(num_bins) + ") must be positive."); + PADDLE_ENFORCE_GT( + var_tensor.numel(), 0, + "MkldnnQuantizer: To calculate Histogram, the tensor must not be empty."); + PADDLE_ENFORCE(max_val >= min_val, + "MkldnnQuantizer: To calculate Histogram, max_val (" + + std::to_string(max_val) + + ") must be greater or equal" + "to min_val (" + + std::to_string(min_val) + 
")."); + ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), + var_tensor.numel(), 1}; + auto bin_width = std::abs(max_val - min_val) / num_bins; + std::vector hist(num_bins); + + for (int i = 0; i < eigen_tensor.size(); i++) { + int bin = std::min( + num_bins - 1, + static_cast(floor((eigen_tensor[i] - min_val) / bin_width))); + ++hist[bin]; + } + + return std::make_pair(std::move(hist), std::move(bin_width)); +} + +void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { + auto& arg = predictor_.argument_; + if (!arg.scope_valid()) arg.SetScope(new framework::Scope); + arg.SetMainProgramNotOwned(predictor_.inference_program_.get()); + auto graph = std::unique_ptr(new Graph(arg.main_program())); + arg.SetMainGraph(graph.release()); + arg.main_graph().Set(framework::ir::kParamScopeAttr, + new framework::Scope*(arg.scope_ptr())); + + auto* builder = predictor_.config_.pass_builder(); + builder->SetPasses({ + "infer_clean_graph_pass", "cpu_quantize_pass", "cpu_quantize_squash_pass", + }); + if (predictor_.config_.ir_debug_) builder->TurnOnDebug(); + auto passes = builder->AllPasses(); + predictor_.argument_.SetIrAnalysisPasses(passes); + predictor_.argument_.SetAnalysisPasses( + {"ir_analysis_pass", "memory_optimize_pass", "ir_graph_to_program_pass"}); + predictor_.argument_.SetQuantVarScales(scales_); +} + +bool AnalysisPredictor::MkldnnQuantizer::Quantize() { + if (!RunWarmup()) return false; + if (!CalculateScales()) return false; + predictor_.PrepareScope(predictor_.scope_); + predictor_.CreateExecutor(); + if (!RunQuantizePasses()) return false; + predictor_.PrepareExecutor(); + predictor_.PrepareFeedFetch(); + return true; +} + +bool AnalysisPredictor::MkldnnQuantizer::RunQuantizePasses() const { + predictor_.executor_->CreateVariables(*predictor_.inference_program_, 0, true, + predictor_.sub_scope_); + PrepareArgument(); + auto& arg = predictor_.argument_; + Analyzer().Run(&arg); + PADDLE_ENFORCE(arg.scope_valid()); + VLOG(5) << "to prepare executor"; + ARGUMENT_CHECK_FIELD((&arg), ir_analyzed_program); + predictor_.inference_program_.reset( + new framework::ProgramDesc(arg.ir_analyzed_program())); + LOG(INFO) << "== optimize 2 end =="; + predictor_.executor_->CreateVariables(*predictor_.inference_program_, 0, + false, predictor_.sub_scope_); + return true; +} + +bool AnalysisPredictor::MkldnnQuantizer::RunWarmup() const { + VLOG(3) << "Predictor: run a quantization warmup iteration"; + auto warmup_data = qconfig_->warmup_data(); + PADDLE_ENFORCE_NOT_NULL(warmup_data, + "Warmup data cannot be NULL in the config."); + PrettyLogH1("--- Running warmup iteration for quantization"); + + // Run the inference program + std::vector output_slots; + predictor_.Run(*warmup_data, &output_slots, qconfig_->warmup_batch_size()); + + return true; +} + +float AnalysisPredictor::MkldnnQuantizer::SafeEntropy( + std::vector reference_distr_P, int P_sum, + std::vector candidate_distr_Q, int Q_sum) const { + PADDLE_ENFORCE_EQ(reference_distr_P.size(), candidate_distr_Q.size()); + float tmp_sum1 = 0; + float tmp_sum2 = 0; + for (size_t idx = 0; idx < reference_distr_P.size(); idx++) { + int p_idx = reference_distr_P[idx]; + int q_idx = candidate_distr_Q[idx]; + if (p_idx == 0) { + tmp_sum1 += 0; + tmp_sum2 += 0; + } else { + PADDLE_ENFORCE(q_idx != 0, "MkldnnQuantizer: Fatal error!, idx = " + + std::to_string(idx) + + " qindex = 0! 
p_idx = " + + std::to_string(p_idx)); + } + tmp_sum1 += p_idx * (log(Q_sum * p_idx)); + tmp_sum2 += p_idx * (log(P_sum * q_idx)); + } + return (tmp_sum1 - tmp_sum2) / P_sum; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.h b/paddle/fluid/inference/api/mkldnn_quantizer.h new file mode 100644 index 0000000000000000000000000000000000000000..f4b0df5d742ed12f856fc7982d955e89288a1888 --- /dev/null +++ b/paddle/fluid/inference/api/mkldnn_quantizer.h @@ -0,0 +1,104 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/string/printf.h" +#ifdef PADDLE_WITH_TESTING +#include +#include +#endif + +namespace paddle { + +/* + * Map variable name to tensor of scaling factors scaling it to MAX=1.0. + * bool denotes whether quantization of the variable should be done to unsigned + * type. + */ +using VarQuantScale = + std::unordered_map>; + +class AnalysisPredictor::MkldnnQuantizer { + public: + explicit MkldnnQuantizer( + AnalysisPredictor& predictor, // NOLINT + const std::shared_ptr& qconfig) + : predictor_(predictor), qconfig_(qconfig) {} + + // Execute full quantization procedure. + bool Quantize(); + +#if PADDLE_WITH_TESTING + friend class MkldnnQuantizerTest; +#endif + + private: + // Run single warmup iteration + bool RunWarmup() const; + // Gather data from variables and calculate scales for them. + bool CalculateScales(); + // Calculate a scale for tensor based on ScaleAlgo rules. + void CalculateSingleScale(const std::string& op_name, + const std::string& conn_name, + const std::string& var_name, + const framework::LoDTensor& var_tensor, + bool is_unsigned); + void PrepareArgument() const; + bool RunQuantizePasses() const; + + std::vector ExpandQuantizedBins(std::vector quantized_bins, + std::vector reference_bins) const; + + // Using the KL-divergence method get the most precise scaling factor. + std::pair GetKLScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const; + + std::pair GetMaxChScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const; + + std::pair GetMaxScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const; + + // Returns histogram and bin width + std::pair, float> Histogram( + const framework::LoDTensor& var_tensor, float min_val, float max_val, + size_t num_bins = 2048) const; + + // Calculate the entropy. 
+ float SafeEntropy(std::vector reference_distr_P, int P_sum, + std::vector candidate_distr_Q, int Q_sum) const; + + private: + AnalysisPredictor& predictor_; + const std::shared_ptr qconfig_; + + // A map: variable name -> scale + VarQuantScale scales_; +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc new file mode 100644 index 0000000000000000000000000000000000000000..f9ff542d86d2a7a3ac2e7f004e11eddfea3598d5 --- /dev/null +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h" + +namespace paddle { + +MkldnnQuantizerConfig::MkldnnQuantizerConfig() { + // The default configuration of scale computing algorightms + rules_["conv2d"]["Input"] = ScaleAlgo::KL; + rules_["conv2d"]["Filter"] = ScaleAlgo::MAX_CH; + rules_["conv2d"]["Bias"] = ScaleAlgo::NONE; // do not compute scale + rules_["conv2d"]["ResidualData"] = ScaleAlgo::KL; + rules_["conv2d"]["Output"] = ScaleAlgo::KL; // do not compute scale + + rules_["pool2d"]["X"] = ScaleAlgo::KL; + rules_["pool2d"]["Out"] = ScaleAlgo::KL; // do not compute scale +} + +ScaleAlgo MkldnnQuantizerConfig::scale_algo( + const std::string& op_type_name, const std::string& conn_name) const { + if (rules_.find(op_type_name) != rules_.end()) { + auto op_rule = rules_.at(op_type_name); + if (op_rule.find(conn_name) != op_rule.end()) return op_rule.at(conn_name); + } + return default_scale_algo_; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 23df507aa609c018e54524159494b08b7e42c75c..2ad4add2945d65037829e0bb453372e38a04421c 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -27,10 +27,14 @@ // the abstract path of this header file will be changed. #include "paddle_api.h" // NOLINT #include "paddle_pass_builder.h" // NOLINT +#ifdef PADDLE_WITH_MKLDNN +#include "paddle_mkldnn_quantizer_config.h" // NOLINT +#endif namespace paddle { class AnalysisPredictor; +struct MkldnnQuantizerConfig; // NOTE WIP, not stable yet. struct AnalysisConfig { @@ -186,6 +190,16 @@ struct AnalysisConfig { mkldnn_enabled_op_types_ = op_list; } + /** Turn on quantization. + */ + void EnableMkldnnQuantizer(); + + /** A boolean state telling whether the quantization is enabled. + */ + bool mkldnn_quantizer_enabled() const { return use_mkldnn_quantizer_; } + + std::shared_ptr mkldnn_quantizer_config() const; + /** Specify the memory buffer of program and parameter * @param prog_buffer the memory buffer of program. * @param prog_buffer_size the size of the data. 
@@ -271,10 +285,14 @@ struct AnalysisConfig { std::string serialized_info_cache_; mutable std::unique_ptr pass_builder_; + bool use_anakin_{false}; int anakin_max_batchsize_; std::map> anakin_max_input_shape_; std::map engine_opt_info_; + + bool use_mkldnn_quantizer_{false}; + std::shared_ptr mkldnn_quantizer_config_; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h new file mode 100644 index 0000000000000000000000000000000000000000..d46f842de7a2277ee5d00672386b12af7ba28deb --- /dev/null +++ b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle_api.h" // NOLINT + +namespace paddle { + +// Algorithms for finding scale of quantized Tensors. +enum class ScaleAlgo { + NONE, // Do not compute scale + MAX, // Find scale based on the maximum absolute value + MAX_CH, // Find scale based on the maximum absolute value per channel + KL, // Find scale based on KL Divergence +}; + +struct MkldnnQuantizerConfig { + MkldnnQuantizerConfig(); + + /** Specify a quantization algorithm for a connection (input/output) of the + * operator type. + * @param op_type_name the operator's name. + * @param conn_name name of the connection (input/output) of the operator. + * @param algo the algorithm for computing scale. + */ + void SetScaleAlgo(std::string op_type_name, std::string conn_name, + ScaleAlgo algo) { + rules_[op_type_name][conn_name] = algo; + } + + /** Get the quantization algorithm for a connection (input/output) of the + * operator type. + * @param op_type_name the operator's name. + * @param conn_name name of the connection (input/output) of the operator. + * @return the algorithm for computing scale. + */ + ScaleAlgo scale_algo(const std::string& op_type_name, + const std::string& conn_name) const; + + /** Set the batch of data to be used for warm-up iteration. + * @param data batch of data. + */ + void SetWarmupData(std::shared_ptr> data) { + warmup_data_ = data; + } + + /** Get the batch of data used for warm-up iteration. + * @return batch of data. 
+ */ + std::shared_ptr> warmup_data() const { + return warmup_data_; + } + + void SetWarmupBatchSize(int batch_size) { warmup_bs_ = batch_size; } + + int warmup_batch_size() const { return warmup_bs_; } + + void SetEnabledOpTypes(std::unordered_set op_list) { + enabled_op_types_ = op_list; + } + + const std::unordered_set& enabled_op_types() const { + return enabled_op_types_; + } + + void SetExcludedOpIds(std::unordered_set op_ids_list) { + excluded_op_ids_ = op_ids_list; + } + + const std::unordered_set& excluded_op_ids() const { + return excluded_op_ids_; + } + + void SetDefaultScaleAlgo(ScaleAlgo algo) { default_scale_algo_ = algo; } + + ScaleAlgo default_scale_algo() const { return default_scale_algo_; } + + protected: + std::map> rules_; + std::unordered_set enabled_op_types_; + std::unordered_set excluded_op_ids_; + std::shared_ptr> warmup_data_; + int warmup_bs_{1}; + ScaleAlgo default_scale_algo_{ScaleAlgo::MAX}; +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 35dd1176718a3d7e4f3867ce048216ea45e5ba7f..8ec32b3a0b7fe459518e269fc72b182bc168435f 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -107,8 +107,8 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { use_gpu_ = true; } -void GpuPassStrategy::EnableQuantizer() { - LOG(ERROR) << "GPU not support quantization yet"; +void GpuPassStrategy::EnableMkldnnQuantizer() { + LOG(ERROR) << "GPU not support MKL-DNN quantization"; } void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 65403e790e93b1a4f99803472d77b0e7a02a4fec..de60185eb30687b65c384f9ab4443faee89965c8 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -30,6 +30,10 @@ class PaddlePassBuilder { explicit PaddlePassBuilder(const std::vector &passes) : passes_(passes) {} + void SetPasses(std::initializer_list passes) { + passes_ = passes; + } + /** Append a pass to the end of the passes. */ void AppendPass(const std::string &pass_type); @@ -85,9 +89,9 @@ class PassStrategy : public PaddlePassBuilder { */ virtual void EnableMKLDNN() {} - /** Enable quantize optimization + /** Enable MKLDNN quantize optimization */ - virtual void EnableQuantizer() {} + virtual void EnableMkldnnQuantizer() {} bool use_gpu() const { return use_gpu_; } @@ -130,15 +134,19 @@ class CpuPassStrategy : public PassStrategy { #endif } - void EnableQuantizer() override { - if (!use_quantizer_) { + void EnableMkldnnQuantizer() override { +#ifdef PADDLE_WITH_MKLDNN + if (!use_mkldnn_quantizer_) { passes_.push_back("cpu_quantize_placement_pass"); } - use_quantizer_ = true; + use_mkldnn_quantizer_ = true; +#else + use_mkldnn_quantizer_ = false; +#endif } protected: - bool use_quantizer_{false}; + bool use_mkldnn_quantizer_{false}; }; /** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode. @@ -153,7 +161,7 @@ class GpuPassStrategy : public PassStrategy { } void EnableMKLDNN() override; - void EnableQuantizer() override; + void EnableMkldnnQuantizer() override; virtual ~GpuPassStrategy() = default; };
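For context, the snippet below is a minimal, illustrative usage sketch of the quantization API added by this patch; it is not part of the diff. It enables MKL-DNN and the quantizer on an AnalysisConfig, supplies a warmup batch through MkldnnQuantizerConfig, and lets CreatePaddlePredictor run the warmup, compute the scales, and apply the cpu_quantize passes. The model directory, input name, tensor shape, and warmup values are assumed placeholders.

// mkldnn_quantizer_demo.cc -- illustrative sketch only, not part of this patch.
#include <algorithm>
#include <memory>
#include <vector>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

namespace {

// Build a single warmup batch; the name, shape, and values are placeholders.
std::shared_ptr<std::vector<paddle::PaddleTensor>> CreateWarmupData() {
  paddle::PaddleTensor input;
  input.name = "data";
  input.shape = {1, 3, 224, 224};
  input.dtype = paddle::PaddleDType::FLOAT32;
  size_t numel = 1 * 3 * 224 * 224;
  input.data.Resize(numel * sizeof(float));
  std::fill_n(static_cast<float*>(input.data.data()), numel, 0.5f);

  auto warmup_data = std::make_shared<std::vector<paddle::PaddleTensor>>();
  warmup_data->push_back(input);
  return warmup_data;
}

}  // namespace

int main() {
  paddle::AnalysisConfig config("/path/to/model");  // placeholder model dir
  config.EnableMKLDNN();
  config.EnableMkldnnQuantizer();

  auto qconfig = config.mkldnn_quantizer_config();
  qconfig->SetWarmupData(CreateWarmupData());
  qconfig->SetWarmupBatchSize(1);
  // Optional: restrict quantization to chosen op types and override the
  // scale algorithm for a particular connection.
  qconfig->SetEnabledOpTypes({"conv2d", "pool2d"});
  qconfig->SetScaleAlgo("conv2d", "Input", paddle::ScaleAlgo::KL);

  // CreatePaddlePredictor runs the warmup batch, calculates the scales, and
  // applies the quantization passes before returning the predictor.
  auto predictor =
      paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(config);
  return predictor != nullptr ? 0 : 1;
}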