Commit 286d1e21 authored by liuqi

Add folded batch norm to combine batch norm and ReLU.
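Folded batch norm exploits the fact that, at inference time, batch normalization is a constant per-channel affine transform: scale' = gamma / sqrt(var + epsilon) and offset' = beta - mean * scale' can be precomputed, reducing the op to a single multiply-add that can also absorb a following ReLU. A minimal numpy sketch of the identity (illustrative only, not code from this commit):

```python
import numpy as np

def fold_batch_norm(gamma, beta, mean, var, epsilon=1e-4):
    # Precompute the constant per-channel affine parameters.
    new_scale = gamma / np.sqrt(var + epsilon)
    new_offset = beta - mean * new_scale
    return new_scale, new_offset

# Batch norm followed by ReLU collapses to one fused multiply-add:
#   relu(bn(x)) == max(x * new_scale + new_offset, 0)
x = np.random.randn(2, 3, 3, 4).astype(np.float32)  # NHWC input
gamma, beta = np.ones(4), np.zeros(4)
mean, var = np.full(4, 0.5), np.full(4, 2.0)
scale, offset = fold_batch_norm(gamma, beta, mean, var)
bn = (x - mean) / np.sqrt(var + 1e-4) * gamma + beta
assert np.allclose(np.maximum(bn, 0.0), np.maximum(x * scale + offset, 0.0))
```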

Parent 2d50eea4
......@@ -76,6 +76,7 @@ extern void Register_Relu(OperatorRegistry *op_registry);
extern void Register_ResizeBilinear(OperatorRegistry *op_registry);
extern void Register_SpaceToBatchND(OperatorRegistry *op_registry);
extern void Register_Softmax(OperatorRegistry *op_registry);
extern void Register_FoldedBatchNorm(OperatorRegistry *op_registry);
OperatorRegistry::OperatorRegistry() {
Register_AddN(this);
......@@ -95,6 +96,7 @@ OperatorRegistry::OperatorRegistry() {
Register_ResizeBilinear(this);
Register_SpaceToBatchND(this);
Register_Softmax(this);
Register_FoldedBatchNorm(this);
}
} // namespace mace
......@@ -14,13 +14,14 @@ namespace kernels {
template <DeviceType D, typename T>
struct BatchNormFunctor {
float epsilon_;
void operator()(const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
const bool folded_constant,
const bool fused_relu,
Tensor *output,
StatsFuture *future) {
// Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
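// At inference time this collapses to a constant per-channel affine
// transform:
//   new_scale = scale / sqrt(var + epsilon)
//   new_offset = offset - mean * new_scale
//   output = input * new_scale + new_offset
// When folded_constant is set, the scale/offset inputs already hold the
// folded values, so mean/var are never read.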
......@@ -39,8 +40,6 @@ struct BatchNormFunctor {
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard scale_mapper(scale);
Tensor::MappingGuard offset_mapper(offset);
Tensor::MappingGuard mean_mapper(mean);
Tensor::MappingGuard var_mapper(var);
Tensor::MappingGuard output_mapper(output);
const T *input_ptr = input->data<T>();
......@@ -50,13 +49,18 @@ struct BatchNormFunctor {
const T *var_ptr = var->data<T>();
T *output_ptr = output->mutable_data<T>();
vector<T> new_scale(channels);
vector<T> new_offset(channels);
vector<T> new_scale;
vector<T> new_offset;
if (!folded_constant) {
new_scale.resize(channels);
new_offset.resize(channels);
Tensor::MappingGuard mean_mapper(mean);
Tensor::MappingGuard var_mapper(var);
#pragma omp parallel for
for (index_t c = 0; c < channels; ++c) {
new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon_);
new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c];
for (index_t c = 0; c < channels; ++c) {
new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon);
new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c];
}
}
index_t pos = 0;
......@@ -66,7 +70,14 @@ struct BatchNormFunctor {
for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) {
for (index_t c = 0; c < channels; ++c) {
output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c];
if (folded_constant) {
output_ptr[pos] = scale_ptr[c] * input_ptr[pos] + offset_ptr[c];
} else {
output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c];
}
if (fused_relu) {
output_ptr[pos] = std::max(output_ptr[pos], static_cast<T>(0));
}
++pos;
}
}
......@@ -82,18 +93,22 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
const bool folded_constant,
const bool fused_relu,
Tensor *output,
StatsFuture *future);
template <typename T>
struct BatchNormFunctor<DeviceType::OPENCL, T> {
float epsilon_;
void operator()(const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
const bool folded_constant,
const bool fused_relu,
Tensor *output,
StatsFuture *future);
};
......
......@@ -19,8 +19,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
const bool folded_constant,
const bool fused_relu,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(folded_constant || (mean != nullptr && var != nullptr));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
......@@ -33,15 +38,23 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (folded_constant) {
built_options.emplace("-DFOLDED_CONSTANT");
}
if (fused_relu) {
built_options.emplace("-DFUSED_RELU");
}
auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options);
uint32_t idx = 0;
bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(scale->buffer())));
bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(offset->buffer())));
bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(mean->buffer())));
bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(var->buffer())));
bm_kernel.setArg(idx++, epsilon_);
if (!folded_constant) {
bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(mean->buffer())));
bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(var->buffer())));
bm_kernel.setArg(idx++, epsilon);
}
bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......@@ -89,7 +102,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
<< output->dim(3) << "_"
<< folded_constant;
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
......
......@@ -3,27 +3,39 @@
__kernel void batch_norm(__read_only image2d_t input,
__read_only image2d_t scale,
__read_only image2d_t offset,
#ifndef FOLDED_CONSTANT
__read_only image2d_t mean,
__read_only image2d_t var,
__private const float epsilon,
#endif
__write_only image2d_t output) {
const int ch_blk = get_global_id(0);
const int w = get_global_id(1);
const int hb = get_global_id(2);
const int width = get_global_size(1);
#ifdef FOLDED_CONSTANT
DATA_TYPE4 bn_scale = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0));
DATA_TYPE4 bn_offset = READ_IMAGET(offset, SAMPLER, (int2)(ch_blk, 0));
#else
DATA_TYPE4 scale_value = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0));
DATA_TYPE4 offset_value = READ_IMAGET(offset, SAMPLER, (int2)(ch_blk, 0));
DATA_TYPE4 mean_value = READ_IMAGET(mean, SAMPLER, (int2)(ch_blk, 0));
DATA_TYPE4 var_value = READ_IMAGET(var, SAMPLER, (int2)(ch_blk, 0));
// native_rsqrt does not seem faster than rsqrt
DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)epsilon);
DATA_TYPE4 new_offset = mad(0 - mean_value, new_scale, offset_value);
DATA_TYPE4 bn_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)epsilon);
DATA_TYPE4 bn_offset = mad(0 - mean_value, bn_scale, offset_value);
#endif
const int pos = mad24(ch_blk, width, w);
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
DATA_TYPE4 out = mad(in, new_scale, new_offset);
DATA_TYPE4 out = mad(in, bn_scale, bn_offset);
#ifdef FUSED_RELU
out = fmax(out, 0);
#endif
WRITE_IMAGET(output, (int2)(pos, hb), out);
}
......@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_BATCH_NORM_H_
#define MACE_BATCH_NORM_H_
#ifndef MACE_OPS_BATCH_NORM_H_
#define MACE_OPS_BATCH_NORM_H_
#include "mace/core/operator.h"
#include "mace/kernels/batch_norm.h"
......@@ -14,9 +14,9 @@ template <DeviceType D, class T>
class BatchNormOp : public Operator<D, T> {
public:
BatchNormOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws), functor_() {
functor_.epsilon_ =
OperatorBase::GetSingleArgument<float>("epsilon", static_cast<float>(1e-4));
: Operator<D, T>(operator_def, ws) {
epsilon_ =
OperatorBase::GetSingleArgument<float>("epsilon", static_cast<float>(1e-4));
}
bool Run(StatsFuture *future) override {
......@@ -40,11 +40,13 @@ class BatchNormOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
output->ResizeLike(input);
functor_(input, scale, offset, mean, var, output, future);
functor_(input, scale, offset, mean, var, epsilon_,
false, false, output, future);
return true;
}
private:
float epsilon_;
kernels::BatchNormFunctor<D, T> functor_;
protected:
......@@ -54,4 +56,4 @@ class BatchNormOp : public Operator<D, T> {
} // namespace mace
#endif // MACE_BATCH_NORM_H_
#endif // MACE_OPS_BATCH_NORM_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/folded_batch_norm.h"
namespace mace {
void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
FoldedBatchNormOp<DeviceType::CPU, float>);
#if MACE_ENABLE_NEON
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
FoldedBatchNormOp<DeviceType::NEON, float>);
#endif // MACE_ENABLE_NEON
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
.Build(),
FoldedBatchNormOp<DeviceType::OPENCL, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
FoldedBatchNormOp<DeviceType::OPENCL, half>);
}
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_FOLDED_BATCH_NORM_H_
#define MACE_OPS_FOLDED_BATCH_NORM_H_
#include "mace/core/operator.h"
#include "mace/kernels/batch_norm.h"
namespace mace {
template <DeviceType D, class T>
class FoldedBatchNormOp : public Operator<D, T> {
public:
FoldedBatchNormOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {
fused_relu_ = OperatorBase::GetSingleArgument<bool>("fused_relu", false);
}
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *scale = this->Input(SCALE);
const Tensor *offset = this->Input(OFFSET);
MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ",
input->dim_size());
MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ",
scale->dim_size());
MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. ",
offset->dim_size());
Tensor *output = this->Output(OUTPUT);
output->ResizeLike(input);
functor_(input, scale, offset, nullptr, nullptr, 0,
true, fused_relu_, output, future);
return true;
}
private:
bool fused_relu_;
kernels::BatchNormFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_FOLDED_BATCH_NORM_H_
from mace.proto import mace_pb2
import tensorflow as tf
import numpy as np
import math
from mace.python.tools import memory_optimizer
# TODO: support NCHW format; currently only NHWC is supported.
......@@ -136,6 +137,22 @@ class TFConverter(object):
output_shapes.append(output_shape)
op.output_shape.extend(output_shapes)
def add_tensor(self, name, shape, tf_dt, value):
tensor = self.net_def.tensors.add()
tensor.name = name
shape = list(shape)
tensor.dims.extend(shape)
if tf_dt == tf.float32:
tensor.data_type = mace_pb2.DT_FLOAT
tensor.float_data.extend(value.flat)
elif tf_dt == tf.int32:
tensor.data_type = mace_pb2.DT_INT32
tensor.int32_data.extend(value.flat)
else:
raise Exception("Unsupported tensor type: " + tf_dt.name)
def convert_tensor(self, op):
if op.outputs[0].name not in self.unused_tensor:
tensor = self.net_def.tensors.add()
......@@ -212,25 +229,52 @@ class TFConverter(object):
arg.name = 'T'
arg.i = self.dt
op_def.name = op.name
op_def.type = 'BatchNorm'
op_def.type = 'FoldedBatchNorm'
gamma_tensor = get_input_tensor(op, 1)
gamma_value = gamma_tensor.eval().astype(np.float32)
beta_value = get_input_tensor(op, 2).eval().astype(np.float32)
mean_value = get_input_tensor(op, 3).eval().astype(np.float32)
var_value = get_input_tensor(op, 4).eval().astype(np.float32)
epsilon_value = op.get_attr('epsilon')
scale_value = (
(1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) *
gamma_value)
offset_value = (-mean_value * scale_value) + beta_value
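# Name the folded tensors after the gamma input's scope so they sit next
# to the weights they replace, e.g. (hypothetical names)
# 'conv1/BatchNorm/gamma:0' -> 'conv1/BatchNorm/scale:0'.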
name_prefix = op.inputs[1].name
idx = name_prefix.rfind('/')
name_prefix = op.inputs[1].name[:idx] + '/'
input_names = [name_prefix+'scale:0', name_prefix+'offset:0']
self.add_tensor(input_names[0], gamma_value.shape,
gamma_tensor.dtype, scale_value)
self.add_tensor(input_names[1], gamma_value.shape,
gamma_tensor.dtype, offset_value)
if self.device == 'gpu':
op_def.input.extend([op.inputs[0].name])
for i in range(1, len(op.inputs)):
output_name = self.add_buffer_to_image(op.inputs[i].name, "ARGUMENT")
for name in input_names:
output_name = self.add_buffer_to_image(name, "ARGUMENT")
op_def.input.extend([output_name])
else:
op_def.input.extend([input.name for input in op.inputs])
op_def.output.extend([op.outputs[0].name])
op_def.input.extend([op.inputs[0].name])
op_def.input.extend(input_names)
self.add_output_shape(op.outputs, op_def)
self.resolved_ops[op.name] = 1
final_op = op
if len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'Relu':
relu_op = self.tf_graph[op.name][0]
final_op = relu_op
fused_relu_arg = op_def.arg.add()
fused_relu_arg.name = 'fused_relu'
fused_relu_arg.i = 1
self.resolved_ops[relu_op.name] = 1
op_def.output.extend([final_op.outputs[0].name])
self.add_output_shape(final_op.outputs, op_def)
epsilon_arg = op_def.arg.add()
epsilon_arg.name = 'epsilon'
epsilon_arg.f = op.get_attr('epsilon')
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC'
self.resolved_ops[op.name] = 1
self.net_def.op.extend([op_def])
def convert_batchnorm(self, op):
......
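The hunk above fuses a trailing Relu into the folded op only when the batch norm output has a single consumer. A distilled sketch of that rule (try_fuse_relu is an illustrative helper, not a function in this codebase; it assumes a tf_graph dict mapping op names to their consumer ops, as TFConverter keeps):

```python
def try_fuse_relu(op, tf_graph, op_def, resolved_ops):
    consumers = tf_graph[op.name]
    # Fuse only when the BN output feeds exactly one op and it is a Relu;
    # otherwise the intermediate result is still needed elsewhere.
    if len(consumers) == 1 and consumers[0].type == 'Relu':
        relu_op = consumers[0]
        fused_relu_arg = op_def.arg.add()
        fused_relu_arg.name = 'fused_relu'
        fused_relu_arg.i = 1
        resolved_ops[relu_op.name] = 1
        return relu_op  # the fused op takes over this op's outputs
    return op
```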
......@@ -96,7 +96,7 @@ bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \
--output_type=source \
--template=${MACE_SOURCE_DIR}/mace/python/tools/model.template \
--model_tag=${MODEL_TAG} \
--confuse=False || exit -1
--confuse=True || exit -1
echo "Step 3: Generate version source"
rm -rf ${VERSION_SOURCE_PATH}
......