Commit ddc1a005 authored by 李寅

Merge branch 'gemmlowp' into 'master'

Add quantized fully connected

See merge request !756
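Before the diff: this merge request adds a uint8 FullyConnected kernel that maps the layer onto a gemmlowp GEMM of the weight matrix (output_size x input_size, row-major) against the input batch (input_size x batch, column-major), and reuses the output pipeline (bias addition, fixed-point requantization, saturating cast) that previously lived only in the quantized Conv2d functor. The following is a minimal scalar sketch of the arithmetic that GEMM-plus-pipeline performs, not code from this merge request; the helper name QuantizedFCReference, the float-based rescaling, and the std::lround rounding are illustrative assumptions — the real kernel uses an integer fixed-point multiplier and shift instead.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative scalar reference of a quantized fully connected layer
// (single input vector): accumulate in int32 with zero points removed,
// add the int32 bias, rescale to the output scale, then saturate to uint8.
inline void QuantizedFCReference(
    const uint8_t *input,    // [input_size]
    const uint8_t *weight,   // [output_size, input_size], row-major
    const int32_t *bias,     // [output_size]
    uint8_t *output,         // [output_size]
    int input_size, int output_size,
    float input_scale, int32_t input_zero_point,
    float weight_scale, int32_t weight_zero_point,
    float output_scale, int32_t output_zero_point) {
  for (int o = 0; o < output_size; ++o) {
    int32_t acc = bias[o];
    for (int i = 0; i < input_size; ++i) {
      acc += (static_cast<int32_t>(weight[o * input_size + i]) - weight_zero_point) *
             (static_cast<int32_t>(input[i]) - input_zero_point);
    }
    // Requantize: the real multiplier is input_scale * weight_scale / output_scale;
    // the gemmlowp pipeline applies it as a fixed-point multiplier plus right shift.
    const float real = acc * input_scale * weight_scale / output_scale;
    const int32_t q = static_cast<int32_t>(std::lround(real)) + output_zero_point;
    output[o] = static_cast<uint8_t>(std::min(255, std::max(0, q)));
  }
}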
......@@ -822,34 +822,6 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
}
}
typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col>
ColVectorMap;
typedef std::tuple<
gemmlowp::OutputStageBiasAddition<ColVectorMap>,
gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
gemmlowp::OutputStageSaturatingCastToUint8> Pipeline;
inline Pipeline MakeOutputPipeline(
const int32_t* bias_data, const index_t channels, const float lhs_scale,
const float rhs_scale, const float output_scale,
const int32_t output_zero_point) {
ColVectorMap bias_vector(bias_data, channels);
gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
bias_addition_stage.bias_vector = bias_vector;
int32_t quantized_multiplier;
int32_t right_shift;
GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
&quantized_multiplier, &right_shift);
gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
quantize_down_stage;
quantize_down_stage.result_offset_after_shift = output_zero_point;
quantize_down_stage.result_fixedpoint_multiplier = quantized_multiplier;
quantize_down_stage.result_shift = right_shift;
gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
return std::make_tuple(bias_addition_stage, quantize_down_stage,
saturating_cast_stage);
}
MaceStatus operator()(const Tensor *input, // NHWC
const Tensor *filter, // OHWI
const Tensor *bias,
......@@ -959,7 +931,7 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor>
output_matrix(output_data, gemm_output_rows, gemm_output_cols);
- const auto &output_pipeline = MakeOutputPipeline(
+ const auto &output_pipeline = GemmlowpOutputPipeline::Make(
bias_data, channels, filter->scale(), input->scale(), output->scale(),
output->zero_point());
......
......@@ -22,6 +22,7 @@
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/gemm.h"
#include "mace/kernels/gemmlowp_util.h"
namespace mace {
namespace kernels {
......@@ -46,10 +47,10 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
: FullyConnectedBase(activation, relux_max_limit) {}
MaceStatus operator()(const Tensor *input,
const Tensor *weight,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1};
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
......@@ -83,6 +84,67 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
}
};
template <>
struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
FullyConnectedFunctor(const ActivationType activation,
const float relux_max_limit)
: FullyConnectedBase(activation, relux_max_limit) {}
MaceStatus operator()(const Tensor *input,
const Tensor *weight,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
const int N = static_cast<int>(output->dim(0));
const int input_size =
static_cast<int>(weight->dim(1) * weight->dim(2) * weight->dim(3));
const int output_size = static_cast<int>(weight->dim(0));
Tensor::MappingGuard guard_input(input);
Tensor::MappingGuard guard_weight(weight);
Tensor::MappingGuard guard_output(output);
auto input_ptr = input->data<uint8_t>();
auto weight_ptr = weight->data<uint8_t>();
auto output_ptr = output->mutable_data<uint8_t>();
std::vector<index_t> bias_shape{output_size};
std::unique_ptr<Tensor> zero_bias;
const int32_t *bias_ptr = nullptr;
if (bias == nullptr) {
zero_bias.reset(
new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32));
zero_bias->Resize(bias_shape);
zero_bias->Clear();
bias_ptr = zero_bias->data<int32_t>();
} else {
bias_ptr = bias->data<int32_t>();
}
gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor>
weight_matrix(weight_ptr, output_size, input_size);
gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor>
input_matrix(input_ptr, input_size, N);
gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor>
output_matrix(output_ptr, output_size, N);
const auto &output_pipeline = GemmlowpOutputPipeline::Make(
bias_ptr, output_size, weight->scale(), input->scale(), output->scale(),
output->zero_point());
using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
&gemm_context, weight_matrix, input_matrix, &output_matrix,
-weight->zero_point(), -input->zero_point(), output_pipeline);
return MACE_SUCCESS;
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
......
......@@ -15,12 +15,44 @@
#ifndef MACE_KERNELS_GEMMLOWP_UTIL_H_
#define MACE_KERNELS_GEMMLOWP_UTIL_H_
#include <tuple>
#include "public/gemmlowp.h"
#include "mace/kernels/quantize.h"
namespace mace {
gemmlowp::GemmContext& GetGemmlowpContext();
struct GemmlowpOutputPipeline {
typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col>
ColVectorMap;
typedef std::tuple<
gemmlowp::OutputStageBiasAddition<ColVectorMap>,
gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
gemmlowp::OutputStageSaturatingCastToUint8> Pipeline;
static Pipeline Make(
const int32_t *bias_data, const index_t channels, const float lhs_scale,
const float rhs_scale, const float output_scale,
const int32_t output_zero_point) {
ColVectorMap bias_vector(bias_data, channels);
gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
bias_addition_stage.bias_vector = bias_vector;
int32_t quantized_multiplier;
int32_t right_shift;
kernels::GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
&quantized_multiplier, &right_shift);
gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
quantize_down_stage;
quantize_down_stage.result_offset_after_shift = output_zero_point;
quantize_down_stage.result_fixedpoint_multiplier = quantized_multiplier;
quantize_down_stage.result_shift = right_shift;
gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
return std::make_tuple(bias_addition_stage, quantize_down_stage,
saturating_cast_stage);
}
};
} // namespace mace
#endif // MACE_KERNELS_GEMMLOWP_UTIL_H_
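The helper kernels::GetOutputMultiplierAndShift used above is declared in mace/kernels/quantize.h and its body is not part of this diff. The sketch below shows the decomposition it is assumed to perform, so read it as an assumed contract rather than the actual MACE implementation: the real multiplier lhs_scale * rhs_scale / output_scale (normally < 1) is expressed as a normalized Q0.31 fixed-point multiplier plus a right shift, which is the form that gemmlowp's OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint consumes.

#include <cmath>
#include <cstdint>

// Assumed contract (hypothetical sketch, not the MACE implementation):
// find quantized_multiplier in [2^30, 2^31) and right_shift >= 0 such that
//   lhs_scale * rhs_scale / output_scale
//     ~= quantized_multiplier * 2^-31 * 2^-right_shift.
inline void GetOutputMultiplierAndShiftSketch(
    float lhs_scale, float rhs_scale, float output_scale,
    int32_t *quantized_multiplier, int32_t *right_shift) {
  const double real_multiplier =
      static_cast<double>(lhs_scale) * rhs_scale / output_scale;
  int exponent = 0;
  // frexp returns a fraction in [0.5, 1) with real_multiplier = fraction * 2^exponent.
  const double fraction = std::frexp(real_multiplier, &exponent);
  int64_t q = static_cast<int64_t>(std::round(fraction * (1ll << 31)));
  if (q == (1ll << 31)) {  // rounding can push the fraction up to exactly 2^31
    q /= 2;
    ++exponent;
  }
  *quantized_multiplier = static_cast<int32_t>(q);
  *right_shift = -exponent;  // exponent <= 0 whenever the real multiplier is < 1
}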
......@@ -24,6 +24,11 @@ void Register_FullyConnected(OperatorRegistryBase *op_registry) {
.Build(),
FullyConnectedOp<DeviceType::CPU, float>);
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
.Device(DeviceType::CPU)
.TypeConstraint<uint8_t>("T")
.Build(),
FullyConnectedOp<DeviceType::CPU, uint8_t>);
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
.Device(DeviceType::GPU)
......
......@@ -80,6 +80,43 @@ void FCBenchmark(
}
net.Sync();
}
template <>
void FCBenchmark<CPU, uint8_t>(
int iters, int batch, int height, int width, int channel, int out_channel) {
mace::testing::StopTiming();
OpsTestNet net;
// Add input data
net.AddRandomInput<CPU, uint8_t>("Input", {batch, height, width, channel});
net.GetTensor("Input")->SetScale(0.1);
net.AddRandomInput<CPU, uint8_t>("Weight",
{out_channel, height, width, channel});
net.GetTensor("Weight")->SetScale(0.1);
net.AddRandomInput<CPU, uint8_t>("Bias", {out_channel});
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("Input")
.Input("Weight")
.Input("Bias")
.Output("Output")
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.Setup(CPU);
net.GetTensor("Output")->SetScale(0.1);
// Warm-up
for (int i = 0; i < 2; ++i) {
net.Run();
}
mace::testing::StartTiming();
while (iters--) {
net.Run();
}
}
} // namespace
#define MACE_BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \
......@@ -98,7 +135,8 @@ void FCBenchmark(
#define MACE_BM_FC(N, H, W, C, OC) \
MACE_BM_FC_MACRO(N, H, W, C, OC, float, CPU); \
MACE_BM_FC_MACRO(N, H, W, C, OC, float, GPU); \
MACE_BM_FC_MACRO(N, H, W, C, OC, half, GPU);
MACE_BM_FC_MACRO(N, H, W, C, OC, half, GPU); \
MACE_BM_FC_MACRO(N, H, W, C, OC, uint8_t, CPU);
MACE_BM_FC(1, 16, 16, 32, 32);
MACE_BM_FC(1, 8, 8, 32, 1000);
......
......@@ -15,6 +15,7 @@
#include <fstream>
#include "mace/core/operator.h"
#include "mace/kernels/quantize.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
......@@ -216,6 +217,107 @@ TEST_F(FullyConnectedOpTest, ComplexHalfWidthFormatAligned) {
Random<half>(1, 14, 14, 13, 23);
}
namespace {
void QuantRandom(const index_t batch,
const index_t height,
const index_t width,
const index_t channels,
const index_t out_channel) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<CPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<CPU, float>(
"Weight", {out_channel, height, width, channels});
net.AddRandomInput<CPU, float>("Bias", {out_channel});
net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
net.TransformDataFormat<CPU, float>("Weight", OHWI, "WeightOIHW", OIHW);
OpDefBuilder("FullyConnected", "FullyConnectedTest")
.Input("InputNCHW")
.Input("WeightOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntArg("T", DT_FLOAT)
.Finalize(net.NewOperatorDef());
net.RunOp();
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
OpDefBuilder("Quantize", "QuantizeWeight")
.Input("Weight")
.Output("QuantizedWeight")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.AddIntArg("non_zero", true)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("Quantize", "QuantizeInput")
.Input("Input")
.Output("QuantizedInput")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.AddIntArg("non_zero", true)
.Finalize(net.NewOperatorDef());
net.RunOp();
OpDefBuilder("Quantize", "QuantizeOutput")
.Input("Output")
.Output("ExpectedQuantizedOutput")
.OutputType({DT_UINT8})
.AddIntArg("T", DT_UINT8)
.AddIntArg("non_zero", true)
.Finalize(net.NewOperatorDef());
net.RunOp();
Tensor *q_weight = net.GetTensor("QuantizedWeight");
Tensor *q_input = net.GetTensor("QuantizedInput");
Tensor *bias = net.GetTensor("Bias");
auto bias_data = bias->data<float>();
std::vector<int32_t> q_bias(bias->size());
kernels::QuantizeWithScaleAndZeropoint(
bias_data, bias->size(), q_input->scale() * q_weight->scale(), 0,
q_bias.data());
net.AddInputFromArray<DeviceType::CPU, int32_t>("QuantizedBias",
{out_channel}, q_bias);
OpDefBuilder("FullyConnected", "QuantizeFullyConnectedTest")
.Input("QuantizedInput")
.Input("QuantizedWeight")
.Input("QuantizedBias")
.Output("QuantizedOutput")
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.Setup(DeviceType::CPU);
Tensor *eq_output = net.GetTensor("ExpectedQuantizedOutput");
Tensor *q_output = net.GetTensor("QuantizedOutput");
q_output->SetScale(eq_output->scale());
q_output->SetZeroPoint(eq_output->zero_point());
net.Run();
OpDefBuilder("Dequantize", "DeQuantizeTest")
.Input("QuantizedOutput")
.Output("DequantizedOutput")
.OutputType({DT_FLOAT})
.AddIntArg("T", DT_UINT8)
.Finalize(net.NewOperatorDef());
net.RunOp();
// Check
ExpectTensorSimilar<float>(*net.GetOutput("Output"),
*net.GetTensor("DequantizedOutput"), 0.01);
}
} // namespace
TEST_F(FullyConnectedOpTest, Quant) {
QuantRandom(1, 16, 16, 32, 16);
QuantRandom(1, 7, 7, 32, 16);
QuantRandom(1, 7, 7, 512, 128);
QuantRandom(1, 1, 1, 2048, 1024);
}
} // namespace test
} // namespace ops
} // namespace mace
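One detail of the test above worth calling out: the float bias is quantized with scale input->scale() * weight->scale() and zero point 0, which puts the int32 bias on the same grid as the raw int32 accumulator of the uint8 GEMM, so the bias-addition output stage can add it directly before requantization. kernels::QuantizeWithScaleAndZeropoint is not shown in this diff; the snippet below is a minimal sketch assuming a q = round(x / scale) + zero_point contract, with the helper name QuantizeBiasSketch invented for illustration.

#include <cmath>
#include <cstdint>

// Hedged sketch of the bias quantization performed in the test above
// (assumed contract of kernels::QuantizeWithScaleAndZeropoint, zero point fixed at 0).
inline void QuantizeBiasSketch(const float *bias, int64_t size,
                               float input_scale, float weight_scale,
                               int32_t *quantized_bias) {
  const float bias_scale = input_scale * weight_scale;
  for (int64_t i = 0; i < size; ++i) {
    quantized_bias[i] = static_cast<int32_t>(std::lround(bias[i] / bias_scale));
  }
}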