Unverified commit 09dfc7a2, authored by Wojciech Uss, committed by GitHub

C-API quantization core 2 (#16396)

* C-API quantization core

test=develop
Co-authored-by: Sylwester Fraczek <sylwester.fraczek@intel.com>

* Decouple Quantizer from AnalysisPredictor

test=develop

* fixes after review

test=develop

* renamed mkldnn quantize stuff

test=develop

* remove ifdef from header file

test=develop
Parent commit: e41d5813
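Taken together, the commit adds a user-facing INT8 quantization flow to the C-API: AnalysisConfig::EnableMkldnnQuantizer() plus a MkldnnQuantizerConfig carrying warm-up data, after which CreatePaddlePredictor runs a warm-up iteration, computes scales, and applies the cpu_quantize passes before returning. A minimal usage sketch (illustrative only; the model directory and batch handling are placeholders, not part of this commit):

#include <memory>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  // Hypothetical model directory with a CPU model.
  paddle::AnalysisConfig cfg("./mobilenet_model");
  cfg.EnableMKLDNN();
  cfg.EnableMkldnnQuantizer();

  // One representative batch of inputs drives scale calculation.
  auto warmup = std::make_shared<std::vector<paddle::PaddleTensor>>();
  // ... fill *warmup with one batch of PaddleTensor inputs ...
  cfg.mkldnn_quantizer_config()->SetWarmupData(warmup);
  cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(1);

  // Init(), the warm-up run, scale calculation and the quantization
  // passes all happen inside CreatePaddlePredictor for this config.
  auto predictor = paddle::CreatePaddlePredictor(cfg);

  std::vector<paddle::PaddleTensor> inputs, outputs;
  // ... fill inputs ...
  predictor->Run(inputs, &outputs);
  return 0;
}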
@@ -37,18 +37,24 @@ endif(WIN32)
 add_subdirectory(api)
+if(WITH_MKLDNN)
+  set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/api/mkldnn_quantizer.cc)
+  set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
+endif()
 set(STATIC_INFERENCE_APIS paddle_fluid_api paddle_inference_api analysis_predictor)
 set(SHARED_INFERENCE_SRCS
     io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
+    ${mkldnn_quantizer_src}
     ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc)
 if(WIN32)
   sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
-              analysis_config paddle_pass_builder)
+              analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
 else(WIN32)
   cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS}
-             zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder)
+             zero_copy_tensor reset_tensor_array analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
 endif(WIN32)
 if(NOT APPLE)
@@ -61,11 +67,11 @@ endif()
 if(WIN32)
   sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
               DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
-              analysis_config paddle_pass_builder)
+              analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
 else(WIN32)
   cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
              DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array
-             analysis_config paddle_pass_builder)
+             analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
 endif()
 get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
 target_link_libraries(paddle_fluid_shared ${os_dependency_modules})
......
@@ -33,13 +33,19 @@ endif()
 add_subdirectory(details)
-cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
+if(WITH_MKLDNN)
+  set(mkldnn_quantizer_src mkldnn_quantizer.cc)
+  set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
+  cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder)
+endif()
+
+cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor
+cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS paddle_inference_api zero_copy_tensor
            reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps})
 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
   lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
-  analysis_config paddle_pass_builder zero_copy_tensor
+  paddle_pass_builder zero_copy_tensor
   reset_tensor_array)
 cc_test(test_paddle_inference_api
......
@@ -108,6 +108,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   // MKLDNN related.
   CP_MEMBER(use_mkldnn_);
   CP_MEMBER(mkldnn_enabled_op_types_);
+  // Quantization related.
+  CP_MEMBER(use_mkldnn_quantizer_);
+  CP_MEMBER(mkldnn_quantizer_config_);
   CP_MEMBER(use_anakin_);
   CP_MEMBER(anakin_max_batchsize_);
@@ -148,6 +151,26 @@ void AnalysisConfig::EnableMKLDNN() {
   Update();
 }
+
+void AnalysisConfig::EnableMkldnnQuantizer() {
+#ifdef PADDLE_WITH_MKLDNN
+  if (!mkldnn_quantizer_config_)
+    mkldnn_quantizer_config_.reset(new MkldnnQuantizerConfig());
+  use_mkldnn_quantizer_ = true;
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer";
+  use_mkldnn_quantizer_ = false;
+#endif
+  Update();
+}
+
+std::shared_ptr<MkldnnQuantizerConfig> AnalysisConfig::mkldnn_quantizer_config()
+    const {
+  PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_,
+                          "MkldnnQuantizer was not enabled yet.");
+  return mkldnn_quantizer_config_;
+}
+
 void AnalysisConfig::EnableTensorRtEngine(
     int workspace_size, int max_batch_size, int min_subgraph_size,
     AnalysisConfig::Precision precision_mode, bool use_static) {
@@ -224,15 +247,27 @@ void AnalysisConfig::Update() {
 #endif
   }
-  if (enable_memory_optim_) {
-    auto analysis_passes = pass_builder()->AnalysisPasses();
-    auto memory_opti_pass_name = "memory_optimize_pass";
-    bool already_exists =
-        std::find(analysis_passes.begin(), analysis_passes.end(),
-                  memory_opti_pass_name) != analysis_passes.end();
-    if (!already_exists) {
-      pass_builder()->AppendAnalysisPass(memory_opti_pass_name);
-    }
-  }
+  // Quantization passes must come after all other optimization passes
+  if (use_mkldnn_quantizer_) {
+    if (!enable_ir_optim_) {
+      LOG(ERROR) << "EnableMkldnnQuantizer() only works when IR optimization "
+                    "is enabled.";
+    }
+#ifdef PADDLE_WITH_MKLDNN
+    pass_builder()->EnableMkldnnQuantizer();
+#else
+    LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer";
+    use_mkldnn_quantizer_ = false;
+#endif
+  }
+
+#ifdef PADDLE_WITH_MKLDNN
+  // Do not optimize before quantization
+  if (enable_memory_optim_ && !use_mkldnn_quantizer_) {
+#else
+  if (enable_memory_optim_) {
+#endif
+    pass_builder()->AppendAnalysisPass("memory_optimize_pass");
+  }
   if (use_anakin_) {
@@ -277,6 +312,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
   for (auto &item : mkldnn_enabled_op_types_) ss << item;
   ss << ";";
+  ss << use_mkldnn_quantizer_;
   ss << model_from_memory_;
   ss << enable_ir_optim_;
......
@@ -18,6 +18,7 @@
 #include <fstream>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
@@ -35,8 +36,13 @@
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
+#endif
 #if PADDLE_WITH_TENSORRT
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
@@ -341,10 +347,7 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
   return true;
 }
-// NOTE All the members in AnalysisConfig should be copied to Argument.
-void AnalysisPredictor::OptimizeInferenceProgram() {
-  status_program_optimized_ = true;
+void AnalysisPredictor::PrepareArgument() {
   argument_.SetUseGPU(config_.use_gpu());
   argument_.SetGPUDeviceId(config_.gpu_device_id());
   argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
@@ -390,6 +393,16 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
     argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
   }
+#ifdef PADDLE_WITH_MKLDNN
+  if (config_.mkldnn_quantizer_enabled()) {
+    LOG(INFO) << "Quantization is enabled";
+    argument_.SetQuantizeEnabledOpTypes(
+        config_.mkldnn_quantizer_config()->enabled_op_types());
+    argument_.SetQuantizeExcludedOpIds(
+        config_.mkldnn_quantizer_config()->excluded_op_ids());
+  }
+#endif
+
   auto passes = config_.pass_builder()->AllPasses();
   if (!config_.ir_optim()) {
     passes.clear();
@@ -398,6 +411,13 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   argument_.SetIrAnalysisPasses(passes);
   argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
   argument_.SetScopeNotOwned(scope_.get());
+}
+
+// NOTE All the members in AnalysisConfig should be copied to Argument.
+void AnalysisPredictor::OptimizeInferenceProgram() {
+  status_program_optimized_ = true;
+  PrepareArgument();
   Analyzer().Run(&argument_);
   PADDLE_ENFORCE(argument_.scope_valid());
@@ -439,12 +459,31 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   }
   std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config));
-  if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) {
+  auto predictor_p = dynamic_cast<AnalysisPredictor *>(predictor.get());
+
+  if (!predictor_p->Init(nullptr)) {
+    return nullptr;
+  }
+
+  if (config.mkldnn_quantizer_enabled() && !predictor_p->MkldnnQuantize()) {
     return nullptr;
   }
+
   return predictor;
 }
+
+bool AnalysisPredictor::MkldnnQuantize() {
+#if PADDLE_WITH_MKLDNN
+  if (!mkldnn_quantizer_)
+    mkldnn_quantizer_ = new AnalysisPredictor::MkldnnQuantizer(
+        *this, config_.mkldnn_quantizer_config());
+  return mkldnn_quantizer_->Quantize();
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer";
+  return false;
+#endif
+}
+
 void AnalysisPredictor::PrepareFeedFetch() {
   PADDLE_ENFORCE_NOT_NULL(sub_scope_);
   CreateFeedFetchVar(sub_scope_);
@@ -703,6 +742,13 @@ AnalysisPredictor::~AnalysisPredictor() {
     scope_->DeleteScope(sub_scope_);
   }
+#if PADDLE_WITH_MKLDNN
+  if (mkldnn_quantizer_) {
+    delete mkldnn_quantizer_;
+    mkldnn_quantizer_ = nullptr;
+  }
+#endif
+
   // TODO(Superjomn) deduce the directory path.
   std::string out_path = inference::analysis::GetMemoryCachePath(
       config_.model_dir(), config_.prog_file());
......
@@ -70,6 +70,7 @@ class AnalysisPredictor : public PaddlePredictor {
   void CreateFeedFetchVar(framework::Scope *scope);
   void PrepareFeedFetch();
+  void PrepareArgument();
   void OptimizeInferenceProgram();
   Argument &analysis_argument() { return argument_; }
@@ -83,6 +84,8 @@ class AnalysisPredictor : public PaddlePredictor {
   std::string GetSerializedProgram() const override;
+  bool MkldnnQuantize();
+
 protected:
   // For memory optimization.
   bool need_collect_var_shapes_for_memory_optim();
@@ -143,6 +146,16 @@ class AnalysisPredictor : public PaddlePredictor {
   std::vector<framework::OpDesc *> fetches_;
   std::map<size_t, std::string> idx2fetches_;
+#if PADDLE_WITH_MKLDNN
+  // Helper class to perform quantization
+  class MkldnnQuantizer;
+  MkldnnQuantizer *mkldnn_quantizer_{nullptr};
+
+#if PADDLE_WITH_TESTING
+  friend class MkldnnQuantizerTest;
+#endif
+#endif
+
   // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
   // concurrency problems, wrong results and memory leak, so cache them.
   std::vector<framework::LoDTensor> feed_tensors_;
......
@@ -17,9 +17,13 @@
 #include <gtest/gtest.h>
 #include <thread>  // NOLINT
 #include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
+#endif
 DEFINE_string(dirname, "", "dirname to tests.");
@@ -243,4 +247,241 @@ TEST(AnalysisPredictor, memory_optim) {
   inference::CompareResult(output, output1);
 }
#ifdef PADDLE_WITH_MKLDNN
class MkldnnQuantizerTest : public testing::Test {
public:
MkldnnQuantizerTest() {
AnalysisConfig config(FLAGS_dirname);
predictor.reset(new AnalysisPredictor(config));
auto* predictor_p = static_cast<AnalysisPredictor*>(predictor.get());
auto qconfig = std::make_shared<MkldnnQuantizerConfig>();
mkldnn_quantizer.reset(
new AnalysisPredictor::MkldnnQuantizer(*predictor_p, qconfig));
}
std::pair<std::vector<int>, float> Histogram(
const framework::LoDTensor& var_tensor, float min_val, float max_val,
int num_bins) const {
return mkldnn_quantizer->Histogram(var_tensor, min_val, max_val, num_bins);
}
std::pair<bool, framework::LoDTensor> GetMaxScalingFactor(
const framework::LoDTensor& var_tensor, bool is_unsigned) const {
return mkldnn_quantizer->GetMaxScalingFactor(var_tensor, is_unsigned);
}
std::pair<bool, framework::LoDTensor> GetMaxChScalingFactor(
const framework::LoDTensor& var_tensor, bool is_unsigned) const {
return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned);
}
std::pair<bool, framework::LoDTensor> GetKLScalingFactor(
const framework::LoDTensor& var_tensor, bool is_unsigned) const {
return mkldnn_quantizer->GetKLScalingFactor(var_tensor, is_unsigned);
}
protected:
std::unique_ptr<PaddlePredictor> predictor;
std::unique_ptr<AnalysisPredictor::MkldnnQuantizer> mkldnn_quantizer;
float abs_error = 1e-6;
static const std::array<float, 10> non_negative_values;
static const std::array<float, 10> positive_and_negative_values;
};
const std::array<float, 10> MkldnnQuantizerTest::non_negative_values = {
0.0158671, 0.026459, 0.0280772, 0.00962479, 0.0131628,
0.016704, 0.00118407, 0.00765726, 0.0123213, 0.00944741};
const std::array<float, 10> MkldnnQuantizerTest::positive_and_negative_values =
{-0.0482659, -0.0102493, -0.00794221, -0.00387115, -0.00674586,
-0.0495346, 0.0629528, -0.00531285, -0.0230353, 0.0269089};
TEST_F(MkldnnQuantizerTest, histogram_inverted_min_max) {
const auto& values = non_negative_values;
auto min_val = *std::min_element(values.begin(), values.end());
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
ASSERT_THROW(Histogram(var_tensor, max_val, min_val, 3),
platform::EnforceNotMet);
}
TEST_F(MkldnnQuantizerTest, histogram_non_negative_to_3) {
// all non-negative values
const auto& values = non_negative_values;
auto min_val = *std::min_element(values.begin(), values.end());
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
std::vector<int> histogram;
float bin_width;
std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3);
ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.f, abs_error)
<< "Improperly calculated bin_width.";
ASSERT_EQ(histogram[0], 4);
ASSERT_EQ(histogram[1], 4);
ASSERT_EQ(histogram[2], 2);
}
TEST_F(MkldnnQuantizerTest, histogram_positive_and_negative_to_3) {
const auto& values = positive_and_negative_values;
auto min_val = *std::min_element(values.begin(), values.end());
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
std::vector<int> histogram;
float bin_width;
std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3);
ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.0f, abs_error)
<< "Improperly calculated bin_width.";
ASSERT_EQ(histogram[0], 3);
ASSERT_EQ(histogram[1], 5);
ASSERT_EQ(histogram[2], 2);
}
TEST_F(MkldnnQuantizerTest, histogram_zero_bins) {
const auto& values = non_negative_values;
auto min_val = *std::min_element(values.begin(), values.end());
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
ASSERT_THROW(Histogram(var_tensor, min_val, max_val, 0),
platform::EnforceNotMet);
}
TEST_F(MkldnnQuantizerTest, histogram_empty) {
// empty tensor
ASSERT_THROW(Histogram({}, -1, 1, 1), platform::EnforceNotMet);
// zero tensor
framework::LoDTensor var_tensor;
var_tensor.Resize({0});
ASSERT_TRUE(var_tensor.mutable_data<double>(platform::CPUPlace()));
ASSERT_THROW(Histogram(var_tensor, -1, 1, 1), platform::EnforceNotMet);
}
TEST_F(MkldnnQuantizerTest, kl_scaling_factor_signed) {
const auto& values = positive_and_negative_values;
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
bool is_unsigned;
framework::LoDTensor lod_tensor;
std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, false);
ASSERT_EQ(is_unsigned, false);
ASSERT_EQ(lod_tensor.numel(), 1);
ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / 0.0899106152344, abs_error);
}
TEST_F(MkldnnQuantizerTest, max_scaling_factor_signed) {
const auto& values = positive_and_negative_values;
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
bool is_unsigned;
framework::LoDTensor lod_tensor;
std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, false);
ASSERT_EQ(is_unsigned, false);
ASSERT_EQ(lod_tensor.numel(), 1);
ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / max_val, abs_error);
}
TEST_F(MkldnnQuantizerTest, max_scaling_factor_unsigned) {
const auto& values = non_negative_values;
auto max_val = *std::max_element(values.begin(), values.end());
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
bool is_unsigned;
framework::LoDTensor lod_tensor;
std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, true);
ASSERT_EQ(is_unsigned, true);
ASSERT_EQ(lod_tensor.numel(), 1);
ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / max_val, abs_error);
}
TEST_F(MkldnnQuantizerTest, max_scaling_factor_chwise_unsigned) {
const auto& values = non_negative_values;
auto max_val = *std::max_element(values.begin(), values.end());
int channels = 3;
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(channels, 1, 1, values.size()));
for (int i = 0; i < channels; i++)
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()) +
i * values.size());
bool is_unsigned;
framework::LoDTensor lod_tensor;
std::tie(is_unsigned, lod_tensor) = GetMaxChScalingFactor(var_tensor, true);
ASSERT_EQ(is_unsigned, true);
ASSERT_EQ(lod_tensor.numel(), channels);
for (int i = 0; i < channels; i++) {
ASSERT_NEAR(lod_tensor.data<double>()[i], 1.0 / max_val, abs_error);
}
}
TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) {
const auto& values = non_negative_values;
framework::LoDTensor var_tensor;
var_tensor.Resize(framework::make_dim(values.size()));
std::copy(begin(values), end(values),
var_tensor.mutable_data<float>(platform::CPUPlace()));
bool is_unsigned;
framework::LoDTensor lod_tensor;
std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, true);
ASSERT_EQ(is_unsigned, true);
ASSERT_EQ(lod_tensor.numel(), 1);
ASSERT_NEAR(lod_tensor.data<double>()[0], 1.0 / 0.0252845321362, abs_error);
}
#endif
}  // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/mkldnn_quantizer.h"
#include <algorithm>
#include <map>
#include <numeric>
#include <unordered_map>
#include <utility>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
using platform::CPUPlace;
using framework::LoDTensor;
using framework::ir::Graph;
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
using string::PrettyLogH1;
bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
PrettyLogH1("--- Calculating scales for quantization");
using VariableNameMap = std::map<std::string, std::vector<std::string>>;
std::map<std::string, std::map<std::string, LoDTensor>> gathered_data;
for (const auto* op : predictor_.inference_program_->Block(0).AllOps()) {
if (op->HasAttr("use_quantizer") &&
boost::get<bool>(op->GetAttr("use_quantizer"))) {
const VariableNameMap& connections_in = op->Inputs();
const VariableNameMap& connections_out = op->Outputs();
auto glambda = [&](const VariableNameMap& connections, bool is_output) {
for (auto const& conn : connections) {
if (conn.second.size() == 0) continue;
auto& var_name = conn.second[0];
// skip if scale already computed
if (scales_.find(var_name) != scales_.end()) return;
auto* var = predictor_.sub_scope_->FindVar(var_name);
PADDLE_ENFORCE(var, "%s is not in the scope", var_name);
PADDLE_ENFORCE(var->IsType<LoDTensor>(),
"Only support lod tensor now.");
LoDTensor* var_tensor = var->GetMutable<LoDTensor>();
// force unsigned type if already know it
bool is_unsigned = false;
if (is_output && op->Type() == "conv2d") {
// output of conv2d with relu must be unsigned
is_unsigned = op->HasAttr("fuse_relu") &&
boost::get<bool>(op->GetAttr("fuse_relu"));
} else if (is_output && op->Type() == "pool2d") {
// output of pool2d with unsigned input must be unsigned
auto input_var_name = op->Input("X")[0];
if (scales_.find(input_var_name) != scales_.end()) {
is_unsigned = scales_[input_var_name].first;
}
}
CalculateSingleScale(op->Type(), conn.first, var_name, *var_tensor,
is_unsigned);
}
};
// handle outputs first so unsigned outputs could be inferred
glambda(connections_out, true /* is_output */);
glambda(connections_in, false /* is_output */);
}
}
return true;
}
void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale(
const std::string& op_type_name, const std::string& conn_name,
const std::string& var_name, const LoDTensor& var_tensor,
bool is_unsigned) {
auto rule = qconfig_->scale_algo(op_type_name, conn_name);
if (rule == ScaleAlgo::NONE) return;
PADDLE_ENFORCE(
var_tensor.numel() > 0,
"MkldnnQuantizer: LoDTensor of variable %s for quantization of op "
"%s of connection %s should not be empty.",
var_name, op_type_name, conn_name);
switch (rule) {
case ScaleAlgo::MAX:
scales_[var_name] = GetMaxScalingFactor(var_tensor, is_unsigned);
break;
case ScaleAlgo::MAX_CH:
scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned);
break;
case ScaleAlgo::KL:
scales_[var_name] = GetKLScalingFactor(var_tensor, is_unsigned);
break;
default:
throw std::runtime_error(
"MkldnnQuantizer: Unexpected ScaleAlgo specified.");
}
}
std::vector<int> AnalysisPredictor::MkldnnQuantizer::ExpandQuantizedBins(
std::vector<int> quantized_bins, std::vector<int> reference_bins) const {
std::vector<int> expanded_quantized_bins(reference_bins.size(), 0);
int num_merged_bins = reference_bins.size() / quantized_bins.size();
int j_start = 0;
int j_end = num_merged_bins;
for (size_t idx = 0; idx < quantized_bins.size(); idx++) {
int zero_count =
std::count(&reference_bins[j_start], &reference_bins[j_end], 0);
num_merged_bins = j_end - j_start;
int avg_bin_ele;
if (zero_count == num_merged_bins) {
avg_bin_ele = 0;
} else {
avg_bin_ele = quantized_bins[idx] / (num_merged_bins - zero_count + 0.0);
}
for (int idx1 = j_start; idx1 < j_end; idx1++) {
expanded_quantized_bins[idx1] =
(reference_bins[idx1] == 0) ? 0 : avg_bin_ele;
}
j_start += num_merged_bins;
j_end += num_merged_bins;
if ((idx + 1) == quantized_bins.size() - 1) {
j_end = reference_bins.size();
}
}
return expanded_quantized_bins;
}
std::pair<bool, LoDTensor>
AnalysisPredictor::MkldnnQuantizer::GetKLScalingFactor(
const LoDTensor& var_tensor, bool is_unsigned) const {
ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
var_tensor.numel(), 1};
int precision_hist_num_bins = 2048;
float max_val = eigen_tensor.maxCoeff();
float min_val = eigen_tensor.minCoeff();
bool is_positive = min_val >= 0.0f;
if (is_unsigned)
PADDLE_ENFORCE(
is_positive,
"Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
min_val);
int num_quantized_bins = 255;
std::vector<int> hist;
float bin_width;
int starting_iter;
int ending_iter = precision_hist_num_bins - 1;
if (is_positive) {
std::tie(hist, bin_width) =
Histogram(var_tensor, min_val, max_val, precision_hist_num_bins);
starting_iter = static_cast<int>(ending_iter * 0.7);
} else {
float th = std::max(std::abs(max_val), std::abs(min_val));
std::tie(hist, bin_width) =
Histogram(var_tensor, -th, th, precision_hist_num_bins);
starting_iter = 0;
if (std::abs(max_val) > std::abs(min_val)) {
while (starting_iter < ending_iter) {
if (hist[starting_iter] == 0) {
++starting_iter;
continue;
} else {
break;
}
}
starting_iter += static_cast<int>((ending_iter - starting_iter) * 0.6);
} else {
while (ending_iter > 0) {
if (hist[ending_iter] == 0) {
--ending_iter;
continue;
} else {
break;
}
}
starting_iter = static_cast<int>(0.6 * ending_iter);
}
}
auto P_sum = eigen_tensor.size();
int min_kl_divergence = 0;
int min_kl_index = 0;
bool kl_inited = false;
for (int i = starting_iter; i <= ending_iter; i++) {
std::vector<int> reference_distr_P(&hist[0], &hist[i]);
auto outliers_count =
std::accumulate(&hist[i], &hist[precision_hist_num_bins], 0);
if (reference_distr_P[i - 1] == 0) {
continue;
}
reference_distr_P[i - 1] += outliers_count;
auto reference_distr_bins = reference_distr_P;
std::vector<int> candidate_distr_Q(&hist[0], &hist[i]);
int num_merged_bins = i / num_quantized_bins;
std::vector<int> candidate_distr_Q_quantized(num_quantized_bins, 0);
int j_start = 0;
int j_end = num_merged_bins;
for (int idx = 0; idx < num_quantized_bins; idx++) {
candidate_distr_Q_quantized[idx] = std::accumulate(
&candidate_distr_Q[j_start], &candidate_distr_Q[j_end], 0);
j_start += num_merged_bins;
j_end += num_merged_bins;
if ((idx + 1) == num_quantized_bins - 1) {
j_end = i;
}
}
candidate_distr_Q =
ExpandQuantizedBins(candidate_distr_Q_quantized, reference_distr_bins);
int Q_sum =
std::accumulate(candidate_distr_Q.begin(), candidate_distr_Q.end(), 0);
auto kl_divergence =
SafeEntropy(reference_distr_P, P_sum, candidate_distr_Q, Q_sum);
if (!kl_inited) {
min_kl_divergence = kl_divergence;
min_kl_index = i;
kl_inited = true;
} else if (kl_divergence < min_kl_divergence) {
min_kl_divergence = kl_divergence;
min_kl_index = i;
} else {
}
}
if (min_kl_index == 0) {
while (starting_iter > 0) {
if (hist[starting_iter] == 0) {
starting_iter -= 1;
continue;
} else {
break;
}
}
min_kl_index = starting_iter;
}
LoDTensor scale_tensor;
scale_tensor.Resize({1});
auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
scale_ptr[0] = 1.0 / ((min_kl_index + 0.5) * bin_width);
return std::make_pair(is_unsigned, scale_tensor);
}
std::pair<bool, LoDTensor>
AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor(
const LoDTensor& var_tensor, bool is_unsigned) const {
ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
var_tensor.numel(), 1};
float max_abs = eigen_tensor.abs().maxCoeff();
float min_val = eigen_tensor.minCoeff();
if (is_unsigned)
PADDLE_ENFORCE(
min_val >= 0.0f,
"Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
min_val);
LoDTensor scale_tensor;
scale_tensor.Resize({1});
auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
scale_ptr[0] = 1.0 / max_abs;
return std::make_pair(is_unsigned, scale_tensor);
}
std::pair<bool, LoDTensor>
AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor(
const LoDTensor& var_tensor, bool is_unsigned) const {
PADDLE_ENFORCE(var_tensor.dims().size() > 0, "Tensor dimension is empty.");
ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
var_tensor.numel(), 1};
float min_val = eigen_tensor.minCoeff();
if (is_unsigned)
PADDLE_ENFORCE(
min_val >= 0.0f,
"Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
min_val);
int channels = var_tensor.dims()[0];
LoDTensor scale_tensor;
scale_tensor.Resize({channels});
auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
for (int i = 0; i < channels; ++i) {
const auto tensor = var_tensor.Slice(i, i + 1);
ConstEigenVectorArrayMap eigen_tensor{tensor.data<float>(), tensor.numel(),
1};
float max_abs = eigen_tensor.abs().maxCoeff();
scale_ptr[i] = 1.0 / max_abs;
}
return std::make_pair(is_unsigned, scale_tensor);
}
std::pair<std::vector<int>, float>
AnalysisPredictor::MkldnnQuantizer::Histogram(
const framework::LoDTensor& var_tensor, float min_val, float max_val,
size_t num_bins) const {
PADDLE_ENFORCE_GT(num_bins, 0,
"MkldnnQuantizer: To calculate Histogram, num_bins (" +
std::to_string(num_bins) + ") must be positive.");
PADDLE_ENFORCE_GT(
var_tensor.numel(), 0,
"MkldnnQuantizer: To calculate Histogram, the tensor must not be empty.");
PADDLE_ENFORCE(max_val >= min_val,
"MkldnnQuantizer: To calculate Histogram, max_val (" +
std::to_string(max_val) +
") must be greater or equal"
"to min_val (" +
std::to_string(min_val) + ").");
ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
var_tensor.numel(), 1};
auto bin_width = std::abs(max_val - min_val) / num_bins;
std::vector<int> hist(num_bins);
for (int i = 0; i < eigen_tensor.size(); i++) {
int bin = std::min(
num_bins - 1,
static_cast<size_t>(floor((eigen_tensor[i] - min_val) / bin_width)));
++hist[bin];
}
return std::make_pair(std::move(hist), std::move(bin_width));
}
void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const {
auto& arg = predictor_.argument_;
if (!arg.scope_valid()) arg.SetScope(new framework::Scope);
arg.SetMainProgramNotOwned(predictor_.inference_program_.get());
auto graph = std::unique_ptr<Graph>(new Graph(arg.main_program()));
arg.SetMainGraph(graph.release());
arg.main_graph().Set(framework::ir::kParamScopeAttr,
new framework::Scope*(arg.scope_ptr()));
auto* builder = predictor_.config_.pass_builder();
builder->SetPasses({
"infer_clean_graph_pass", "cpu_quantize_pass", "cpu_quantize_squash_pass",
});
if (predictor_.config_.ir_debug_) builder->TurnOnDebug();
auto passes = builder->AllPasses();
predictor_.argument_.SetIrAnalysisPasses(passes);
predictor_.argument_.SetAnalysisPasses(
{"ir_analysis_pass", "memory_optimize_pass", "ir_graph_to_program_pass"});
predictor_.argument_.SetQuantVarScales(scales_);
}
bool AnalysisPredictor::MkldnnQuantizer::Quantize() {
if (!RunWarmup()) return false;
if (!CalculateScales()) return false;
predictor_.PrepareScope(predictor_.scope_);
predictor_.CreateExecutor();
if (!RunQuantizePasses()) return false;
predictor_.PrepareExecutor();
predictor_.PrepareFeedFetch();
return true;
}
bool AnalysisPredictor::MkldnnQuantizer::RunQuantizePasses() const {
predictor_.executor_->CreateVariables(*predictor_.inference_program_, 0, true,
predictor_.sub_scope_);
PrepareArgument();
auto& arg = predictor_.argument_;
Analyzer().Run(&arg);
PADDLE_ENFORCE(arg.scope_valid());
VLOG(5) << "to prepare executor";
ARGUMENT_CHECK_FIELD((&arg), ir_analyzed_program);
predictor_.inference_program_.reset(
new framework::ProgramDesc(arg.ir_analyzed_program()));
LOG(INFO) << "== optimize 2 end ==";
predictor_.executor_->CreateVariables(*predictor_.inference_program_, 0,
false, predictor_.sub_scope_);
return true;
}
bool AnalysisPredictor::MkldnnQuantizer::RunWarmup() const {
VLOG(3) << "Predictor: run a quantization warmup iteration";
auto warmup_data = qconfig_->warmup_data();
PADDLE_ENFORCE_NOT_NULL(warmup_data,
"Warmup data cannot be NULL in the config.");
PrettyLogH1("--- Running warmup iteration for quantization");
// Run the inference program
std::vector<PaddleTensor> output_slots;
predictor_.Run(*warmup_data, &output_slots, qconfig_->warmup_batch_size());
return true;
}
float AnalysisPredictor::MkldnnQuantizer::SafeEntropy(
std::vector<int> reference_distr_P, int P_sum,
std::vector<int> candidate_distr_Q, int Q_sum) const {
PADDLE_ENFORCE_EQ(reference_distr_P.size(), candidate_distr_Q.size());
float tmp_sum1 = 0;
float tmp_sum2 = 0;
for (size_t idx = 0; idx < reference_distr_P.size(); idx++) {
int p_idx = reference_distr_P[idx];
int q_idx = candidate_distr_Q[idx];
if (p_idx == 0) {
tmp_sum1 += 0;
tmp_sum2 += 0;
} else {
PADDLE_ENFORCE(q_idx != 0, "MkldnnQuantizer: Fatal error!, idx = " +
std::to_string(idx) +
" qindex = 0! p_idx = " +
std::to_string(p_idx));
}
tmp_sum1 += p_idx * (log(Q_sum * p_idx));
tmp_sum2 += p_idx * (log(P_sum * q_idx));
}
return (tmp_sum1 - tmp_sum2) / P_sum;
}
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h"
#ifdef PADDLE_WITH_TESTING
#include <gtest/gtest.h>
#include <gtest/gtest_prod.h>
#endif
namespace paddle {
/*
* Map variable name to tensor of scaling factors scaling it to MAX=1.0.
* bool denotes whether quantization of the variable should be done to unsigned
* type.
*/
using VarQuantScale =
std::unordered_map<std::string, std::pair<bool, framework::LoDTensor>>;
class AnalysisPredictor::MkldnnQuantizer {
public:
explicit MkldnnQuantizer(
AnalysisPredictor& predictor, // NOLINT
const std::shared_ptr<MkldnnQuantizerConfig>& qconfig)
: predictor_(predictor), qconfig_(qconfig) {}
// Execute full quantization procedure.
bool Quantize();
#if PADDLE_WITH_TESTING
friend class MkldnnQuantizerTest;
#endif
private:
// Run single warmup iteration
bool RunWarmup() const;
// Gather data from variables and calculate scales for them.
bool CalculateScales();
// Calculate a scale for tensor based on ScaleAlgo rules.
void CalculateSingleScale(const std::string& op_name,
const std::string& conn_name,
const std::string& var_name,
const framework::LoDTensor& var_tensor,
bool is_unsigned);
void PrepareArgument() const;
bool RunQuantizePasses() const;
std::vector<int> ExpandQuantizedBins(std::vector<int> quantized_bins,
std::vector<int> reference_bins) const;
// Using the KL-divergence method get the most precise scaling factor.
std::pair<bool, framework::LoDTensor> GetKLScalingFactor(
const framework::LoDTensor& var_tensor, bool is_unsigned) const;
std::pair<bool, framework::LoDTensor> GetMaxChScalingFactor(
const framework::LoDTensor& var_tensor, bool is_unsigned) const;
std::pair<bool, framework::LoDTensor> GetMaxScalingFactor(
const framework::LoDTensor& var_tensor, bool is_unsigned) const;
// Returns histogram and bin width
std::pair<std::vector<int>, float> Histogram(
const framework::LoDTensor& var_tensor, float min_val, float max_val,
size_t num_bins = 2048) const;
// Calculate the entropy.
float SafeEntropy(std::vector<int> reference_distr_P, int P_sum,
std::vector<int> candidate_distr_Q, int Q_sum) const;
private:
AnalysisPredictor& predictor_;
const std::shared_ptr<MkldnnQuantizerConfig> qconfig_;
// A map: variable name -> scale
VarQuantScale scales_;
};
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h"
namespace paddle {
MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
// The default configuration of scale computing algorithms
rules_["conv2d"]["Input"] = ScaleAlgo::KL;
rules_["conv2d"]["Filter"] = ScaleAlgo::MAX_CH;
rules_["conv2d"]["Bias"] = ScaleAlgo::NONE;  // do not compute scale
rules_["conv2d"]["ResidualData"] = ScaleAlgo::KL;
rules_["conv2d"]["Output"] = ScaleAlgo::KL;
rules_["pool2d"]["X"] = ScaleAlgo::KL;
rules_["pool2d"]["Out"] = ScaleAlgo::KL;
}
ScaleAlgo MkldnnQuantizerConfig::scale_algo(
const std::string& op_type_name, const std::string& conn_name) const {
if (rules_.find(op_type_name) != rules_.end()) {
auto op_rule = rules_.at(op_type_name);
if (op_rule.find(conn_name) != op_rule.end()) return op_rule.at(conn_name);
}
return default_scale_algo_;
}
} // namespace paddle
@@ -27,10 +27,14 @@
 // the abstract path of this header file will be changed.
 #include "paddle_api.h"           // NOLINT
 #include "paddle_pass_builder.h"  // NOLINT
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle_mkldnn_quantizer_config.h"  // NOLINT
+#endif
 namespace paddle {
 class AnalysisPredictor;
+struct MkldnnQuantizerConfig;
 // NOTE WIP, not stable yet.
 struct AnalysisConfig {
@@ -186,6 +190,16 @@ struct AnalysisConfig {
     mkldnn_enabled_op_types_ = op_list;
   }
+  /** Turn on quantization.
+   */
+  void EnableMkldnnQuantizer();
+
+  /** A boolean state telling whether the quantization is enabled.
+   */
+  bool mkldnn_quantizer_enabled() const { return use_mkldnn_quantizer_; }
+
+  std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config() const;
+
   /** Specify the memory buffer of program and parameter
    * @param prog_buffer the memory buffer of program.
    * @param prog_buffer_size the size of the data.
@@ -271,10 +285,14 @@ struct AnalysisConfig {
   std::string serialized_info_cache_;
   mutable std::unique_ptr<PassStrategy> pass_builder_;
   bool use_anakin_{false};
   int anakin_max_batchsize_;
   std::map<std::string, std::vector<int>> anakin_max_input_shape_;
   std::map<std::string, std::string> engine_opt_info_;
+
+  bool use_mkldnn_quantizer_{false};
+  std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_;
 };
 }  // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle_api.h" // NOLINT
namespace paddle {
// Algorithms for finding scale of quantized Tensors.
enum class ScaleAlgo {
NONE, // Do not compute scale
MAX, // Find scale based on the maximum absolute value
MAX_CH, // Find scale based on the maximum absolute value per channel
KL, // Find scale based on KL Divergence
};
struct MkldnnQuantizerConfig {
MkldnnQuantizerConfig();
/** Specify a quantization algorithm for a connection (input/output) of the
* operator type.
* @param op_type_name the operator's name.
* @param conn_name name of the connection (input/output) of the operator.
* @param algo the algorithm for computing scale.
*/
void SetScaleAlgo(std::string op_type_name, std::string conn_name,
ScaleAlgo algo) {
rules_[op_type_name][conn_name] = algo;
}
/** Get the quantization algorithm for a connection (input/output) of the
* operator type.
* @param op_type_name the operator's name.
* @param conn_name name of the connection (input/output) of the operator.
* @return the algorithm for computing scale.
*/
ScaleAlgo scale_algo(const std::string& op_type_name,
const std::string& conn_name) const;
/** Set the batch of data to be used for warm-up iteration.
* @param data batch of data.
*/
void SetWarmupData(std::shared_ptr<std::vector<PaddleTensor>> data) {
warmup_data_ = data;
}
/** Get the batch of data used for warm-up iteration.
* @return batch of data.
*/
std::shared_ptr<std::vector<PaddleTensor>> warmup_data() const {
return warmup_data_;
}
void SetWarmupBatchSize(int batch_size) { warmup_bs_ = batch_size; }
int warmup_batch_size() const { return warmup_bs_; }
void SetEnabledOpTypes(std::unordered_set<std::string> op_list) {
enabled_op_types_ = op_list;
}
const std::unordered_set<std::string>& enabled_op_types() const {
return enabled_op_types_;
}
void SetExcludedOpIds(std::unordered_set<int> op_ids_list) {
excluded_op_ids_ = op_ids_list;
}
const std::unordered_set<int>& excluded_op_ids() const {
return excluded_op_ids_;
}
void SetDefaultScaleAlgo(ScaleAlgo algo) { default_scale_algo_ = algo; }
ScaleAlgo default_scale_algo() const { return default_scale_algo_; }
protected:
std::map<std::string, std::map<std::string, ScaleAlgo>> rules_;
std::unordered_set<std::string> enabled_op_types_;
std::unordered_set<int> excluded_op_ids_;
std::shared_ptr<std::vector<PaddleTensor>> warmup_data_;
int warmup_bs_{1};
ScaleAlgo default_scale_algo_{ScaleAlgo::MAX};
};
} // namespace paddle
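The defaults registered in MkldnnQuantizerConfig() above (KL for conv2d and pool2d data, MAX_CH for filters) can be overridden per operator connection through this header. A small illustrative sketch, assuming an AnalysisConfig on which EnableMkldnnQuantizer() is called; the helper name, chosen algorithms, and warm-up batch are hypothetical, not part of the diff:

#include <memory>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void ConfigureQuantizer(
    paddle::AnalysisConfig* cfg,
    std::shared_ptr<std::vector<paddle::PaddleTensor>> warmup) {
  cfg->EnableMkldnnQuantizer();
  auto qcfg = cfg->mkldnn_quantizer_config();

  // Use plain MAX instead of KL for conv2d inputs; keep per-channel MAX
  // for filters.
  qcfg->SetScaleAlgo("conv2d", "Input", paddle::ScaleAlgo::MAX);
  qcfg->SetScaleAlgo("conv2d", "Filter", paddle::ScaleAlgo::MAX_CH);

  // Fall back to MAX for any connection without an explicit rule.
  qcfg->SetDefaultScaleAlgo(paddle::ScaleAlgo::MAX);

  // One representative batch is used for the warm-up iteration.
  qcfg->SetWarmupData(warmup);
  qcfg->SetWarmupBatchSize(1);
}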
@@ -107,8 +107,8 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
   use_gpu_ = true;
 }
-void GpuPassStrategy::EnableQuantizer() {
-  LOG(ERROR) << "GPU not support quantization yet";
+void GpuPassStrategy::EnableMkldnnQuantizer() {
+  LOG(ERROR) << "GPU not support MKL-DNN quantization";
 }
 void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
......
@@ -30,6 +30,10 @@ class PaddlePassBuilder {
   explicit PaddlePassBuilder(const std::vector<std::string> &passes)
       : passes_(passes) {}
+  void SetPasses(std::initializer_list<std::string> passes) {
+    passes_ = passes;
+  }
+
   /** Append a pass to the end of the passes. */
   void AppendPass(const std::string &pass_type);
@@ -85,9 +89,9 @@ class PassStrategy : public PaddlePassBuilder {
    */
   virtual void EnableMKLDNN() {}
-  /** Enable quantize optimization
+  /** Enable MKLDNN quantize optimization
    */
-  virtual void EnableQuantizer() {}
+  virtual void EnableMkldnnQuantizer() {}
   bool use_gpu() const { return use_gpu_; }
@@ -130,15 +134,19 @@ class CpuPassStrategy : public PassStrategy {
 #endif
   }
-  void EnableQuantizer() override {
-    if (!use_quantizer_) {
+  void EnableMkldnnQuantizer() override {
+#ifdef PADDLE_WITH_MKLDNN
+    if (!use_mkldnn_quantizer_) {
       passes_.push_back("cpu_quantize_placement_pass");
     }
-    use_quantizer_ = true;
+    use_mkldnn_quantizer_ = true;
+#else
+    use_mkldnn_quantizer_ = false;
+#endif
   }
 protected:
-  bool use_quantizer_{false};
+  bool use_mkldnn_quantizer_{false};
 };
 /** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
@@ -153,7 +161,7 @@ class GpuPassStrategy : public PassStrategy {
   }
   void EnableMKLDNN() override;
-  void EnableQuantizer() override;
+  void EnableMkldnnQuantizer() override;
   virtual ~GpuPassStrategy() = default;
 };
......