From 286d1e2118fda11fc0a3f8012ad9947f8d5baeb5 Mon Sep 17 00:00:00 2001 From: liuqi Date: Sun, 7 Jan 2018 21:51:25 +0800 Subject: [PATCH] Add folded batch norm to combine batchnorm and relu. --- mace/core/operator.cc | 2 + mace/kernels/batch_norm.h | 41 ++++++++++----- mace/kernels/opencl/batch_norm_opencl.cc | 22 ++++++-- mace/kernels/opencl/cl/batch_norm.cl | 18 +++++-- mace/ops/batch_norm.h | 16 +++--- mace/ops/folded_batch_norm.cc | 37 ++++++++++++++ mace/ops/folded_batch_norm.h | 52 +++++++++++++++++++ mace/python/tools/tf_converter_lib.py | 64 ++++++++++++++++++++---- tools/validate_gcn.sh | 2 +- 9 files changed, 216 insertions(+), 38 deletions(-) create mode 100644 mace/ops/folded_batch_norm.cc create mode 100644 mace/ops/folded_batch_norm.h diff --git a/mace/core/operator.cc b/mace/core/operator.cc index e759d89d..eca09f3b 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -76,6 +76,7 @@ extern void Register_Relu(OperatorRegistry *op_registry); extern void Register_ResizeBilinear(OperatorRegistry *op_registry); extern void Register_SpaceToBatchND(OperatorRegistry *op_registry); extern void Register_Softmax(OperatorRegistry *op_registry); +extern void Register_FoldedBatchNorm(OperatorRegistry *op_registry); OperatorRegistry::OperatorRegistry() { Register_AddN(this); @@ -95,6 +96,7 @@ OperatorRegistry::OperatorRegistry() { Register_ResizeBilinear(this); Register_SpaceToBatchND(this); Register_Softmax(this); + Register_FoldedBatchNorm(this); } } // namespace mace diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index a8cbe58a..41305f9a 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -14,13 +14,14 @@ namespace kernels { template struct BatchNormFunctor { - float epsilon_; - void operator()(const Tensor *input, const Tensor *scale, const Tensor *offset, const Tensor *mean, const Tensor *var, + const float epsilon, + const bool folded_constant, + const bool fused_relu, Tensor *output, StatsFuture *future) 
{ // Batch normalization in the paper https://arxiv.org/abs/1502.03167 . @@ -39,8 +40,6 @@ struct BatchNormFunctor { Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard scale_mapper(scale); Tensor::MappingGuard offset_mapper(offset); - Tensor::MappingGuard mean_mapper(mean); - Tensor::MappingGuard var_mapper(var); Tensor::MappingGuard output_mapper(output); const T *input_ptr = input->data(); @@ -50,13 +49,18 @@ struct BatchNormFunctor { const T *var_ptr = var->data(); T *output_ptr = output->mutable_data(); - vector new_scale(channels); - vector new_offset(channels); - + vector new_scale; + vector new_offset; + if (!folded_constant) { + new_scale.resize(channels); + new_offset.resize(channels); + Tensor::MappingGuard mean_mapper(mean); + Tensor::MappingGuard var_mapper(var); #pragma omp parallel for - for (index_t c = 0; c < channels; ++c) { - new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon_); - new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c]; + for (index_t c = 0; c < channels; ++c) { + new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon); + new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c]; + } } index_t pos = 0; @@ -66,7 +70,14 @@ struct BatchNormFunctor { for (index_t h = 0; h < height; ++h) { for (index_t w = 0; w < width; ++w) { for (index_t c = 0; c < channels; ++c) { - output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c]; + if (folded_constant) { + output_ptr[pos] = scale_ptr[c] * input_ptr[pos] + offset_ptr[c]; + } else { + output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c]; + } + if (fused_relu) { + output_ptr[pos] = std::max(output_ptr[pos], static_cast(0)); + } ++pos; } } @@ -82,18 +93,22 @@ void BatchNormFunctor::operator()( const Tensor *offset, const Tensor *mean, const Tensor *var, + const float epsilon, + const bool folded_constant, + const bool fused_relu, Tensor *output, StatsFuture *future); template struct BatchNormFunctor { - float epsilon_; - void operator()(const 
Tensor *input, const Tensor *scale, const Tensor *offset, const Tensor *mean, const Tensor *var, + const float epsilon, + const bool folded_constant, + const bool fused_relu, Tensor *output, StatsFuture *future); }; diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 6cd46e76..068061ec 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -19,8 +19,13 @@ void BatchNormFunctor::operator()( const Tensor *offset, const Tensor *mean, const Tensor *var, + const float epsilon, + const bool folded_constant, + const bool fused_relu, Tensor *output, StatsFuture *future) { + MACE_CHECK(folded_constant || (mean != nullptr && var != nullptr)); + const index_t batch = input->dim(0); const index_t height = input->dim(1); const index_t width = input->dim(2); @@ -33,15 +38,23 @@ void BatchNormFunctor::operator()( auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + if (folded_constant) { + built_options.emplace("-DFOLDED_CONSTANT"); + } + if (fused_relu) { + built_options.emplace("-DFUSED_RELU"); + } auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options); uint32_t idx = 0; bm_kernel.setArg(idx++, *(static_cast(input->buffer()))); bm_kernel.setArg(idx++, *(static_cast(scale->buffer()))); bm_kernel.setArg(idx++, *(static_cast(offset->buffer()))); - bm_kernel.setArg(idx++, *(static_cast(mean->buffer()))); - bm_kernel.setArg(idx++, *(static_cast(var->buffer()))); - bm_kernel.setArg(idx++, epsilon_); + if (!folded_constant) { + bm_kernel.setArg(idx++, *(static_cast(mean->buffer()))); + bm_kernel.setArg(idx++, *(static_cast(var->buffer()))); + bm_kernel.setArg(idx++, epsilon); + } bm_kernel.setArg(idx++, *(static_cast(output->buffer()))); const uint32_t gws[3] = {static_cast(channel_blocks), @@ -89,7 +102,8 @@ void BatchNormFunctor::operator()( << 
output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" - << output->dim(3); + << output->dim(3) << "_" + << folded_constant; OpenCLProfilingTimer timer(&event); Tuner::Get()->template TuneOrRun(ss.str(), lws, diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index 027b678b..3eec516e 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -3,27 +3,39 @@ __kernel void batch_norm(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset, +#ifndef FOLDED_CONSTANT __read_only image2d_t mean, __read_only image2d_t var, __private const float epsilon, +#endif __write_only image2d_t output) { const int ch_blk = get_global_id(0); const int w = get_global_id(1); const int hb = get_global_id(2); const int width = get_global_size(1); +#ifdef FOLDED_CONSTANT + DATA_TYPE4 bn_scale = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0)); + DATA_TYPE4 bn_offset = READ_IMAGET(offset, SAMPLER, (int2)(ch_blk, 0)); +#else DATA_TYPE4 scale_value = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0)); DATA_TYPE4 offset_value = READ_IMAGET(offset, SAMPLER, (int2)(ch_blk, 0)); DATA_TYPE4 mean_value = READ_IMAGET(mean, SAMPLER, (int2)(ch_blk, 0)); DATA_TYPE4 var_value = READ_IMAGET(var, SAMPLER, (int2)(ch_blk, 0)); // native_rsqrt seems not faster than rsqrt - DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)epsilon); - DATA_TYPE4 new_offset = mad(0 - mean_value, new_scale, offset_value); + DATA_TYPE4 bn_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)epsilon); + DATA_TYPE4 bn_offset = mad(0 - mean_value, bn_scale, offset_value); +#endif const int pos = mad24(ch_blk, width, w); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); - DATA_TYPE4 out = mad(in, new_scale, new_offset); + DATA_TYPE4 out = mad(in, bn_scale, bn_offset); +
+#ifdef FUSED_RELU + out = fmax(out, 0); +#endif + WRITE_IMAGET(output, (int2)(pos, hb), out); } diff --git 
a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h index 96c4a1fc..2f84509f 100644 --- a/mace/ops/batch_norm.h +++ b/mace/ops/batch_norm.h @@ -2,8 +2,8 @@ // Copyright (c) 2017 XiaoMi All rights reserved. // -#ifndef MACE_BATCH_NORM_H_ -#define MACE_BATCH_NORM_H_ +#ifndef MACE_OPS_BATCH_NORM_H_ +#define MACE_OPS_BATCH_NORM_H_ #include "mace/core/operator.h" #include "mace/kernels/batch_norm.h" @@ -14,9 +14,9 @@ template class BatchNormOp : public Operator { public: BatchNormOp(const OperatorDef &operator_def, Workspace *ws) - : Operator(operator_def, ws), functor_() { - functor_.epsilon_ = - OperatorBase::GetSingleArgument("epsilon", static_cast(1e-4)); + : Operator(operator_def, ws) { + epsilon_ = + OperatorBase::GetSingleArgument("epsilon", static_cast(1e-4)); } bool Run(StatsFuture *future) override { @@ -40,11 +40,13 @@ class BatchNormOp : public Operator { Tensor *output = this->Output(OUTPUT); output->ResizeLike(input); - functor_(input, scale, offset, mean, var, output, future); + functor_(input, scale, offset, mean, var, epsilon_, + false, false, output, future); return true; } private: + float epsilon_; kernels::BatchNormFunctor functor_; protected: @@ -54,4 +56,4 @@ class BatchNormOp : public Operator { } // namespace mace -#endif // MACE_BATCH_NORM_H_ +#endif // MACE_OPS_BATCH_NORM_H_ diff --git a/mace/ops/folded_batch_norm.cc b/mace/ops/folded_batch_norm.cc new file mode 100644 index 00000000..5a04c48d --- /dev/null +++ b/mace/ops/folded_batch_norm.cc @@ -0,0 +1,37 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
+// + +#include "mace/ops/folded_batch_norm.h" + +namespace mace { + +void Register_FoldedBatchNorm(OperatorRegistry *op_registry) { + REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + FoldedBatchNormOp); + +#if MACE_ENABLE_NEON + REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") + .Device(DeviceType::NEON) + .TypeConstraint("T") + .Build(), + FoldedBatchNormOp); +#endif // MACE_ENABLE_NEON + + REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") + .Device(DeviceType::OPENCL) + .TypeConstraint("T") + .Build(), + FoldedBatchNormOp); + + REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") + .Device(DeviceType::OPENCL) + .TypeConstraint("T") + .Build(), + FoldedBatchNormOp); +} + +} // namespace mace diff --git a/mace/ops/folded_batch_norm.h b/mace/ops/folded_batch_norm.h new file mode 100644 index 00000000..5cb67ccd --- /dev/null +++ b/mace/ops/folded_batch_norm.h @@ -0,0 +1,52 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_OPS_FOLDED_BATCH_NORM_H_ +#define MACE_OPS_FOLDED_BATCH_NORM_H_ + +#include "mace/core/operator.h" +#include "mace/kernels/batch_norm.h" + +namespace mace { + +template +class FoldedBatchNormOp : public Operator { + public: + FoldedBatchNormOp(const OperatorDef &operator_def, Workspace *ws) + : Operator(operator_def, ws) { + fused_relu_ = OperatorBase::GetSingleArgument("fused_relu", false); + } + + bool Run(StatsFuture *future) override { + const Tensor *input = this->Input(INPUT); + const Tensor *scale = this->Input(SCALE); + const Tensor *offset = this->Input(OFFSET); + + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ", + input->dim_size()); + MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ", + scale->dim_size()); + MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. 
", + offset->dim_size()); + + Tensor *output = this->Output(OUTPUT); + output->ResizeLike(input); + + functor_(input, scale, offset, nullptr, nullptr, 0, + true, fused_relu_, output, future); + return true; + } + + private: + bool fused_relu_; + kernels::BatchNormFunctor functor_; + + protected: + OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); + OP_OUTPUT_TAGS(OUTPUT); +}; + +} // namespace mace + +#endif // MACE_OPS_FOLDED_BATCH_NORM_H_ diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py index 006d41fe..22a31359 100644 --- a/mace/python/tools/tf_converter_lib.py +++ b/mace/python/tools/tf_converter_lib.py @@ -1,6 +1,7 @@ from mace.proto import mace_pb2 import tensorflow as tf import numpy as np +import math from mace.python.tools import memory_optimizer # TODO: support NCHW formt, now only support NHWC. @@ -136,6 +137,22 @@ class TFConverter(object): output_shapes.append(output_shape) op.output_shape.extend(output_shapes) + def add_tensor(self, name, shape, tf_dt, value): + tensor = self.net_def.tensors.add() + tensor.name = name + + shape = list(shape) + tensor.dims.extend(shape) + + if tf_dt == tf.float32: + tensor.data_type = mace_pb2.DT_FLOAT + tensor.float_data.extend(value.flat) + elif tf_dt == tf.int32: + tensor.data_type = mace_pb2.DT_INT32 + tensor.int32_data.extend(value.flat) + else: + raise Exception("Not supported tensor type: " + tf_dt.name) + def convert_tensor(self, op): if op.outputs[0].name not in self.unused_tensor: tensor = self.net_def.tensors.add() @@ -212,25 +229,52 @@ class TFConverter(object): arg.name = 'T' arg.i = self.dt op_def.name = op.name - op_def.type = 'BatchNorm' + op_def.type = 'FoldedBatchNorm' + gamma_tensor = get_input_tensor(op, 1) + gamma_value = gamma_tensor.eval().astype(np.float32) + beta_value = get_input_tensor(op, 2).eval().astype(np.float32) + mean_value = get_input_tensor(op, 3).eval().astype(np.float32) + var_value = get_input_tensor(op, 4).eval().astype(np.float32) + 
epsilon_value = op.get_attr('epsilon') + + scale_value = ( + (1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) * + gamma_value) + offset_value = (-mean_value * scale_value) + beta_value + name_prefix = op.inputs[1].name + idx = name_prefix.rfind('/') + name_prefix = op.inputs[1].name[:idx] + '/' + input_names = [name_prefix+'scale:0', name_prefix+'offset:0'] + self.add_tensor(input_names[0], gamma_value.shape, + gamma_tensor.dtype, scale_value) + self.add_tensor(input_names[1], gamma_value.shape, + gamma_tensor.dtype, offset_value) + if self.device == 'gpu': op_def.input.extend([op.inputs[0].name]) - for i in range(1, len(op.inputs)): - output_name = self.add_buffer_to_image(op.inputs[i].name, "ARGUMENT") + for name in input_names: + output_name = self.add_buffer_to_image(name, "ARGUMENT") op_def.input.extend([output_name]) else: - op_def.input.extend([input.name for input in op.inputs]) - op_def.output.extend([op.outputs[0].name]) + op_def.input.extend([op.inputs[0].name] + input_names) - self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + + final_op = op + if len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'Relu': + relu_op = self.tf_graph[op.name][0] + final_op = relu_op + fused_relu_arg = op_def.arg.add() + fused_relu_arg.name = 'fused_relu' + fused_relu_arg.i = 1 + self.resolved_ops[relu_op.name] = 1 + + op_def.output.extend([final_op.outputs[0].name]) + self.add_output_shape(final_op.outputs, op_def) - epsilon_arg = op_def.arg.add() - epsilon_arg.name = 'epsilon' - epsilon_arg.f = op.get_attr('epsilon') data_format_arg = op_def.arg.add() data_format_arg.name = 'data_format' data_format_arg.s = 'NHWC' - self.resolved_ops[op.name] = 1 self.net_def.op.extend([op_def]) def convert_batchnorm(self, op): diff --git a/tools/validate_gcn.sh b/tools/validate_gcn.sh index 7cbfc4d5..8d01b110 100755 --- a/tools/validate_gcn.sh +++ b/tools/validate_gcn.sh @@ -96,7 +96,7 @@ bazel-bin/mace/python/tools/tf_converter
--input=${TF_MODEL_FILE_PATH} \ --output_type=source \ --template=${MACE_SOURCE_DIR}/mace/python/tools/model.template \ --model_tag=${MODEL_TAG} \ - --confuse=False || exit -1 + --confuse=True || exit -1 echo "Step 3: Generate version source" rm -rf ${VERSION_SOURCE_PATH} -- GitLab