diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index 53ccb310fdf184f6d590b3c9712051bd9bf640ec..282472bca3829cd112a0be0d62a5409fdb3bbfc5 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -822,34 +822,6 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
     }
   }
 
-  typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col>
-      ColVectorMap;
-  typedef std::tuple<
-      gemmlowp::OutputStageBiasAddition<ColVectorMap>,
-      gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
-      gemmlowp::OutputStageSaturatingCastToUint8> Pipeline;
-  inline Pipeline MakeOutputPipeline(
-      const int32_t* bias_data, const index_t channels, const float lhs_scale,
-      const float rhs_scale, const float output_scale,
-      const int32_t output_zero_point) {
-    ColVectorMap bias_vector(bias_data, channels);
-    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
-    bias_addition_stage.bias_vector = bias_vector;
-    int32_t quantized_multiplier;
-    int32_t right_shift;
-    GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
-                                &quantized_multiplier, &right_shift);
-    gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
-        quantize_down_stage;
-    quantize_down_stage.result_offset_after_shift = output_zero_point;
-    quantize_down_stage.result_fixedpoint_multiplier = quantized_multiplier;
-    quantize_down_stage.result_shift = right_shift;
-
-    gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
-    return std::make_tuple(bias_addition_stage, quantize_down_stage,
-                           saturating_cast_stage);
-  }
-
   MaceStatus operator()(const Tensor *input,   // NHWC
                         const Tensor *filter,  // OHWI
                         const Tensor *bias,
@@ -959,7 +931,7 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
     gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor>
        output_matrix(output_data, gemm_output_rows, gemm_output_cols);
 
-    const auto &output_pipeline = MakeOutputPipeline(
+    const auto &output_pipeline = GemmlowpOutputPipeline::Make(
         bias_data, channels, filter->scale(), input->scale(), output->scale(),
         output->zero_point());
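
Note: the hunks above are a pure extraction; the pipeline construction that used to live on Conv2dFunctor moves into the shared GemmlowpOutputPipeline helper (added in mace/kernels/gemmlowp_util.h below), and only the call site changes. Both quantized kernels in this patch drive gemmlowp the same way. A sketch of that shared pattern, assembled from the call sites in this patch (the function name, parameter names, and shapes are illustrative, not MACE API):

    #include <cstdint>
    #include "public/gemmlowp.h"
    #include "mace/kernels/gemmlowp_util.h"

    // Sketch: build the int32 -> uint8 output pipeline from the tensors'
    // quantization params, then run the low-precision GEMM with negated
    // zero points as offsets.
    void RunQuantizedGemm(gemmlowp::GemmContext *context,
                          const uint8_t *lhs, const uint8_t *rhs, uint8_t *out,
                          int rows, int depth, int cols,
                          int32_t lhs_zero, int32_t rhs_zero,
                          const int32_t *bias, float lhs_scale, float rhs_scale,
                          float out_scale, int32_t out_zero) {
      gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor>
          lhs_matrix(lhs, rows, depth);
      gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor>
          rhs_matrix(rhs, depth, cols);
      gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor>
          out_matrix(out, rows, cols);
      const auto &pipeline = mace::GemmlowpOutputPipeline::Make(
          bias, rows, lhs_scale, rhs_scale, out_scale, out_zero);
      gemmlowp::GemmWithOutputPipeline<
          uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
          context, lhs_matrix, rhs_matrix, &out_matrix,
          -lhs_zero, -rhs_zero, pipeline);
    }
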
diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h
index 82b0ba1e4f3d408fa5ddf2e919e7d177d7b0c29a..e5172920a2a3c08862948257debff0387da10a0c 100644
--- a/mace/kernels/fully_connected.h
+++ b/mace/kernels/fully_connected.h
@@ -22,6 +22,7 @@
 #include "mace/core/tensor.h"
 #include "mace/kernels/activation.h"
 #include "mace/kernels/gemm.h"
+#include "mace/kernels/gemmlowp_util.h"
 
 namespace mace {
 namespace kernels {
@@ -46,10 +47,10 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
       : FullyConnectedBase(activation, relux_max_limit) {}
 
   MaceStatus operator()(const Tensor *input,
-      const Tensor *weight,
-      const Tensor *bias,
-      Tensor *output,
-      StatsFuture *future) {
+                        const Tensor *weight,
+                        const Tensor *bias,
+                        Tensor *output,
+                        StatsFuture *future) {
     MACE_UNUSED(future);
     std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1};
     MACE_RETURN_IF_ERROR(output->Resize(output_shape));
@@ -83,6 +84,67 @@
   }
 };
 
+template <>
+struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
+  FullyConnectedFunctor(const ActivationType activation,
+                        const float relux_max_limit)
+      : FullyConnectedBase(activation, relux_max_limit) {}
+
+  MaceStatus operator()(const Tensor *input,
+                        const Tensor *weight,
+                        const Tensor *bias,
+                        Tensor *output,
+                        StatsFuture *future) {
+    MACE_UNUSED(future);
+    gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+
+    std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+    const int N = static_cast<int>(output->dim(0));
+    const int input_size =
+        static_cast<int>(weight->dim(1) * weight->dim(2) * weight->dim(3));
+    const int output_size = static_cast<int>(weight->dim(0));
+
+    Tensor::MappingGuard guard_input(input);
+    Tensor::MappingGuard guard_weight(weight);
+    Tensor::MappingGuard guard_output(output);
+    auto input_ptr = input->data<uint8_t>();
+    auto weight_ptr = weight->data<uint8_t>();
+    auto output_ptr = output->mutable_data<uint8_t>();
+
+    std::vector<index_t> bias_shape{output_size};
+    std::unique_ptr<Tensor> zero_bias;
+    const int32_t *bias_ptr = nullptr;
+    if (bias == nullptr) {
+      zero_bias.reset(
+          new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32));
+      zero_bias->Resize(bias_shape);
+      zero_bias->Clear();
+      bias_ptr = zero_bias->data<int32_t>();
+    } else {
+      bias_ptr = bias->data<int32_t>();
+    }
+
+    gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor>
+        weight_matrix(weight_ptr, output_size, input_size);
+    gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor>
+        input_matrix(input_ptr, input_size, N);
+    gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor>
+        output_matrix(output_ptr, output_size, N);
+
+    const auto &output_pipeline = GemmlowpOutputPipeline::Make(
+        bias_ptr, output_size, weight->scale(), input->scale(), output->scale(),
+        output->zero_point());
+
+    using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
+    gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
+        &gemm_context, weight_matrix, input_matrix, &output_matrix,
+        -weight->zero_point(), -input->zero_point(), output_pipeline);
+
+    return MACE_SUCCESS;
+  }
+};
+
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
 struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
diff --git a/mace/kernels/gemmlowp_util.h b/mace/kernels/gemmlowp_util.h
index 9b8e400be9d71e3a5c70aa7b2abef3458f448e37..8ce20d38030a6e981c6ae2cdb037bee088a0eea6 100644
--- a/mace/kernels/gemmlowp_util.h
+++ b/mace/kernels/gemmlowp_util.h
@@ -15,12 +15,44 @@
 #ifndef MACE_KERNELS_GEMMLOWP_UTIL_H_
 #define MACE_KERNELS_GEMMLOWP_UTIL_H_
 
+#include <tuple>
+
 #include "public/gemmlowp.h"
+#include "mace/kernels/quantize.h"
 
 namespace mace {
 
 gemmlowp::GemmContext& GetGemmlowpContext();
 
+struct GemmlowpOutputPipeline {
+  typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col>
+      ColVectorMap;
+  typedef std::tuple<
+      gemmlowp::OutputStageBiasAddition<ColVectorMap>,
+      gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
+      gemmlowp::OutputStageSaturatingCastToUint8> Pipeline;
+  static Pipeline Make(
+      const int32_t *bias_data, const index_t channels, const float lhs_scale,
+      const float rhs_scale, const float output_scale,
+      const int32_t output_zero_point) {
+    ColVectorMap bias_vector(bias_data, channels);
+    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
+    bias_addition_stage.bias_vector = bias_vector;
+    int32_t quantized_multiplier;
+    int32_t right_shift;
+    kernels::GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
+                                         &quantized_multiplier, &right_shift);
+    gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
+        quantize_down_stage;
+    quantize_down_stage.result_offset_after_shift = output_zero_point;
+    quantize_down_stage.result_fixedpoint_multiplier = quantized_multiplier;
+    quantize_down_stage.result_shift = right_shift;
+
+    gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
+    return std::make_tuple(bias_addition_stage, quantize_down_stage,
+                           saturating_cast_stage);
+  }
+};
 }  // namespace mace
 
 #endif  // MACE_KERNELS_GEMMLOWP_UTIL_H_
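
Note: GemmlowpOutputPipeline::Make rescales the raw int32 GEMM accumulator into the output's uint8 range. The real rescale factor is lhs_scale * rhs_scale / output_scale, and GetOutputMultiplierAndShift (declared in mace/kernels/quantize.h, not shown in this patch) is expected to decompose it into the Q31 fixed-point multiplier and rounding right shift that gemmlowp's quantize-down stage consumes. A minimal sketch of that decomposition, assuming the usual gemmlowp/TFLite convention (this is not MACE's exact implementation):

    #include <cmath>
    #include <cstdint>

    // real_multiplier == q * 2^exponent with q in [0.5, 1), so the int32
    // accumulator can be scaled by a Q31 fixed-point multiply followed by
    // a rounding right shift.
    void MultiplierAndShift(float lhs_scale, float rhs_scale, float out_scale,
                            int32_t *quantized_multiplier,
                            int32_t *right_shift) {
      const double real =
          static_cast<double>(lhs_scale) * rhs_scale / out_scale;
      int exponent = 0;
      const double q = std::frexp(real, &exponent);
      *right_shift = -exponent;  // expected >= 0 while real < 1
      int64_t q_fixed = std::llround(q * (1ll << 31));  // q in Q31
      if (q_fixed == (1ll << 31)) {  // q rounded up to 1.0: renormalize
        q_fixed /= 2;
        --*right_shift;
      }
      *quantized_multiplier = static_cast<int32_t>(q_fixed);
    }
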
diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc
index 5ad8c4664a40fbbe6a9b7fe8cde20f1705a78d41..31f3bf869c07c0bf1ebbf14deeb234365624a9b2 100644
--- a/mace/ops/fully_connected.cc
+++ b/mace/ops/fully_connected.cc
@@ -24,6 +24,11 @@ void Register_FullyConnected(OperatorRegistryBase *op_registry) {
                          .Build(),
                     FullyConnectedOp<DeviceType::CPU, float>);
 
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
+                             .Device(DeviceType::CPU)
+                             .TypeConstraint<uint8_t>("T")
+                             .Build(),
+                         FullyConnectedOp<DeviceType::CPU, uint8_t>);
 #ifdef MACE_ENABLE_OPENCL
   MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
                              .Device(DeviceType::GPU)
diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc
index 021b6396a7786ea322936f53617e7b12f4fae1b5..9f0fe549af4b903d89d478aecd5d70857ea94e4e 100644
--- a/mace/ops/fully_connected_benchmark.cc
+++ b/mace/ops/fully_connected_benchmark.cc
@@ -80,6 +80,43 @@ void FCBenchmark(
   }
   net.Sync();
 }
+
+template <>
+void FCBenchmark<CPU, uint8_t>(
+    int iters, int batch, int height, int width, int channel, int out_channel) {
+  mace::testing::StopTiming();
+
+  OpsTestNet net;
+
+  // Add input data
+  net.AddRandomInput<CPU, uint8_t>("Input", {batch, height, width, channel});
+  net.GetTensor("Input")->SetScale(0.1);
+  net.AddRandomInput<CPU, uint8_t>("Weight",
+                                   {out_channel, height, width, channel});
+  net.GetTensor("Weight")->SetScale(0.1);
+  net.AddRandomInput<CPU, int32_t>("Bias", {out_channel});
+
+  OpDefBuilder("FullyConnected", "FullyConnectedTest")
+      .Input("Input")
+      .Input("Weight")
+      .Input("Bias")
+      .Output("Output")
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+
+  net.Setup(CPU);
+  net.GetTensor("Output")->SetScale(0.1);
+
+  // Warm-up
+  for (int i = 0; i < 2; ++i) {
+    net.Run();
+  }
+
+  mace::testing::StartTiming();
+  while (iters--) {
+    net.Run();
+  }
+}
 }  // namespace
 
 #define MACE_BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \
@@ -98,7 +135,8 @@ void FCBenchmark(
 #define MACE_BM_FC(N, H, W, C, OC)                 \
   MACE_BM_FC_MACRO(N, H, W, C, OC, float, CPU);    \
   MACE_BM_FC_MACRO(N, H, W, C, OC, float, GPU);    \
-  MACE_BM_FC_MACRO(N, H, W, C, OC, half, GPU);
+  MACE_BM_FC_MACRO(N, H, W, C, OC, half, GPU);     \
+  MACE_BM_FC_MACRO(N, H, W, C, OC, uint8_t, CPU);
 
 MACE_BM_FC(1, 16, 16, 32, 32);
 MACE_BM_FC(1, 8, 8, 32, 1000);
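
Note: the benchmark sets the scales on the input, weight, and output tensors by hand, since the quantized op reads its quantization parameters from the tensors rather than computing them. The negated zero points passed to GemmWithOutputPipeline in the kernels follow gemmlowp's offset convention: gemmlowp adds the given offset to every matrix entry before multiplying, so an offset of -zero_point turns a raw uint8 code back into its centered value. Per output element, the quantized FC therefore computes, in scalar form (an illustrative model, not the vectorized gemmlowp path):

    #include <cstdint>

    // Scalar model of one quantized FC output element.
    int32_t QuantizedDot(const uint8_t *w_row, const uint8_t *x_col, int k,
                         int32_t w_zero, int32_t x_zero, int32_t bias) {
      int32_t acc = bias;  // bias pre-quantized with scale w_scale * x_scale
      for (int i = 0; i < k; ++i) {
        // gemmlowp's (lhs + lhs_offset) * (rhs + rhs_offset), with
        // lhs_offset = -w_zero and rhs_offset = -x_zero as in the kernels.
        acc += (static_cast<int32_t>(w_row[i]) - w_zero) *
               (static_cast<int32_t>(x_col[i]) - x_zero);
      }
      return acc;  // the output pipeline then rescales this to uint8
    }
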
diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc
index d84f050010227751ff7c59af364ee22372ed40a5..8b30096da8475217c48f05b85dde702c3e754edc 100644
--- a/mace/ops/fully_connected_test.cc
+++ b/mace/ops/fully_connected_test.cc
@@ -15,6 +15,7 @@
 #include <functional>
 
 #include "mace/core/operator.h"
+#include "mace/kernels/quantize.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {
@@ -216,6 +217,107 @@ TEST_F(FullyConnectedOpTest, ComplexHalfWidthFormatAligned) {
   Random(1, 14, 14, 13, 23);
 }
 
+namespace {
+void QuantRandom(const index_t batch,
+                 const index_t height,
+                 const index_t width,
+                 const index_t channels,
+                 const index_t out_channel) {
+  // Construct graph
+  OpsTestNet net;
+
+  // Add input data
+  net.AddRandomInput<DeviceType::CPU, float>(
+      "Input", {batch, height, width, channels});
+  net.AddRandomInput<DeviceType::CPU, float>(
+      "Weight", {out_channel, height, width, channels});
+  net.AddRandomInput<DeviceType::CPU, float>("Bias", {out_channel});
+  net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", NCHW);
+  net.TransformDataFormat<DeviceType::CPU, float>("Weight", OHWI, "WeightOIHW", OIHW);
+
+  OpDefBuilder("FullyConnected", "FullyConnectedTest")
+      .Input("InputNCHW")
+      .Input("WeightOIHW")
+      .Input("Bias")
+      .Output("OutputNCHW")
+      .AddIntArg("T", DT_FLOAT)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+  net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
+
+  OpDefBuilder("Quantize", "QuantizeWeight")
+      .Input("Weight")
+      .Output("QuantizedWeight")
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .AddIntArg("non_zero", true)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  OpDefBuilder("Quantize", "QuantizeInput")
+      .Input("Input")
+      .Output("QuantizedInput")
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .AddIntArg("non_zero", true)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  OpDefBuilder("Quantize", "QuantizeOutput")
+      .Input("Output")
+      .Output("ExpectedQuantizedOutput")
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .AddIntArg("non_zero", true)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  Tensor *q_weight = net.GetTensor("QuantizedWeight");
+  Tensor *q_input = net.GetTensor("QuantizedInput");
+  Tensor *bias = net.GetTensor("Bias");
+  auto bias_data = bias->data<float>();
+  std::vector<int32_t> q_bias(bias->size());
+  kernels::QuantizeWithScaleAndZeropoint(
+      bias_data, bias->size(), q_input->scale() * q_weight->scale(), 0,
+      q_bias.data());
+  net.AddInputFromArray<DeviceType::CPU, int32_t>("QuantizedBias",
+                                                  {out_channel}, q_bias);
+
+  OpDefBuilder("FullyConnected", "QuantizeFullyConnectedTest")
+      .Input("QuantizedInput")
+      .Input("QuantizedWeight")
+      .Input("QuantizedBias")
+      .Output("QuantizedOutput")
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.Setup(DeviceType::CPU);
+  Tensor *eq_output = net.GetTensor("ExpectedQuantizedOutput");
+  Tensor *q_output = net.GetTensor("QuantizedOutput");
+  q_output->SetScale(eq_output->scale());
+  q_output->SetZeroPoint(eq_output->zero_point());
+  net.Run();
+
+  OpDefBuilder("Dequantize", "DeQuantizeTest")
+      .Input("QuantizedOutput")
+      .Output("DequantizedOutput")
+      .OutputType({DT_FLOAT})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  // Check
+  ExpectTensorSimilar<float>(*net.GetOutput("Output"),
+                             *net.GetTensor("DequantizedOutput"), 0.01);
+}
+}  // namespace
+
+TEST_F(FullyConnectedOpTest, Quant) {
+  QuantRandom(1, 16, 16, 32, 16);
+  QuantRandom(1, 7, 7, 32, 16);
+  QuantRandom(1, 7, 7, 512, 128);
+  QuantRandom(1, 1, 1, 2048, 1024);
+}
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
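
Note: the test computes a float reference, quantizes the inputs and the reference output, runs the quantized op with the reference output's scale and zero point, then dequantizes and compares against the float reference with ExpectTensorSimilar at 1% tolerance. The bias is quantized with scale input_scale * weight_scale and zero point 0 so it adds directly into the int32 accumulator. The affine mapping this all relies on, sketched under the assumption that QuantizeWithScaleAndZeropoint follows the standard q = round(f / scale) + zero_point form (function names here are illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t QuantizeOne(float v, float scale, int32_t zero_point) {
      const int32_t q =
          zero_point + static_cast<int32_t>(std::lround(v / scale));
      return static_cast<uint8_t>(
          std::min<int32_t>(255, std::max<int32_t>(0, q)));  // saturate
    }

    float DequantizeOne(uint8_t q, float scale, int32_t zero_point) {
      return scale * (static_cast<int32_t>(q) - zero_point);
    }
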