提交 bc576436 编写于 作者: L liuqi

Fix bugs of batch norm and graph transformation.

上级 286d1e21
......@@ -12,16 +12,26 @@
namespace mace {
namespace kernels {
struct BatchNormFunctorBase {
BatchNormFunctorBase(bool folded_constant, bool fused_relu) :
folded_constant_(folded_constant),
fused_relu_(fused_relu){}
const bool folded_constant_;
const bool fused_relu_;
};
template <DeviceType D, typename T>
struct BatchNormFunctor {
struct BatchNormFunctor : BatchNormFunctorBase{
BatchNormFunctor(const bool folded_constant, const bool fused_relu) :
BatchNormFunctorBase(folded_constant, fused_relu) {}
void operator()(const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
const bool folded_constant,
const bool fused_relu,
Tensor *output,
StatsFuture *future) {
// Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
......@@ -45,17 +55,17 @@ struct BatchNormFunctor {
const T *input_ptr = input->data<T>();
const T *scale_ptr = scale->data<T>();
const T *offset_ptr = offset->data<T>();
const T *mean_ptr = mean->data<T>();
const T *var_ptr = var->data<T>();
T *output_ptr = output->mutable_data<T>();
vector<T> new_scale;
vector<T> new_offset;
if (!folded_constant) {
if (!folded_constant_) {
new_scale.resize(channels);
new_offset.resize(channels);
Tensor::MappingGuard mean_mapper(mean);
Tensor::MappingGuard var_mapper(var);
const T *mean_ptr = mean->data<T>();
const T *var_ptr = var->data<T>();
#pragma omp parallel for
for (index_t c = 0; c < channels; ++c) {
new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon);
......@@ -70,12 +80,12 @@ struct BatchNormFunctor {
for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) {
for (index_t c = 0; c < channels; ++c) {
if (folded_constant) {
if (folded_constant_) {
output_ptr[pos] = scale_ptr[c] * input_ptr[pos] + offset_ptr[c];
} else {
output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c];
}
if (fused_relu) {
if (fused_relu_) {
output_ptr[pos] = std::max(output_ptr[pos], static_cast<T>(0));
}
++pos;
......@@ -94,21 +104,19 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
const Tensor *mean,
const Tensor *var,
const float epsilon,
const bool folded_constant,
const bool fused_relu,
Tensor *output,
StatsFuture *future);
template <typename T>
struct BatchNormFunctor<DeviceType::OPENCL, T> {
struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant, const bool fused_relu) :
BatchNormFunctorBase(folded_constant, fused_relu) {}
void operator()(const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
const bool folded_constant,
const bool fused_relu,
Tensor *output,
StatsFuture *future);
};
......
......@@ -20,11 +20,9 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *mean,
const Tensor *var,
const float epsilon,
const bool folded_constant,
const bool fused_relu,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(folded_constant || (mean != nullptr && var != nullptr));
MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr));
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
......@@ -38,10 +36,10 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (folded_constant) {
if (folded_constant_) {
built_options.emplace("-DFOLDED_CONSTANT");
}
if (fused_relu) {
if (fused_relu_) {
built_options.emplace("-DFUSED_RELU");
}
auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options);
......@@ -50,7 +48,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(scale->buffer())));
bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(offset->buffer())));
if (!folded_constant) {
if (!folded_constant_) {
bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(mean->buffer())));
bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(var->buffer())));
bm_kernel.setArg(idx++, epsilon);
......@@ -103,7 +101,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3) << "_"
<< folded_constant;
<< folded_constant_;
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
......
......@@ -25,7 +25,7 @@ __kernel void batch_norm(__read_only image2d_t input,
// native_rsqrt seems not faster than rsqrt
DATA_TYPE4 bn_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)epsilon);
DATA_TYPE4 bn_offset = mad(0 - mean_value, new_scale, offset_value);
DATA_TYPE4 bn_offset = mad(0 - mean_value, bn_scale, offset_value);
#endif
const int pos = mad24(ch_blk, width, w);
......
......@@ -14,7 +14,7 @@ template <DeviceType D, class T>
class BatchNormOp : public Operator<D, T> {
public:
BatchNormOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {
: Operator<D, T>(operator_def, ws), functor_(false, false) {
epsilon_ =
OperatorBase::GetSingleArgument<float>("epsilon", static_cast<float>(1e-4));
}
......@@ -40,8 +40,7 @@ class BatchNormOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
output->ResizeLike(input);
functor_(input, scale, offset, mean, var, epsilon_,
false, false, output, future);
functor_(input, scale, offset, mean, var, epsilon_, output, future);
return true;
}
......
......@@ -14,8 +14,8 @@ template <DeviceType D, class T>
class FoldedBatchNormOp : public Operator<D, T> {
public:
FoldedBatchNormOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {
fused_relu_ = OperatorBase::GetSingleArgument<bool>("fused_relu", false);
: Operator<D, T>(operator_def, ws),
functor_(true, OperatorBase::GetSingleArgument<bool>("fused_relu", false)) {
}
bool Run(StatsFuture *future) override {
......@@ -33,13 +33,11 @@ class FoldedBatchNormOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
output->ResizeLike(input);
functor_(input, scale, offset, nullptr, nullptr, 0,
true, fused_relu_, output, future);
functor_(input, scale, offset, nullptr, nullptr, 0, output, future);
return true;
}
private:
bool fused_relu_;
kernels::BatchNormFunctor<D, T> functor_;
protected:
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
class FoldedBatchNormOpTest : public OpsTestBase {};
void CalculateScaleOffset(const std::vector<float> &gamma,
const std::vector<float> &beta,
const std::vector<float> &mean,
const std::vector<float> &var,
const float epsilon,
std::vector<float> &scale,
std::vector<float> &offset) {
size_t size = gamma.size();
for (int i = 0 ; i < size; ++i) {
scale[i] = gamma[i] / std::sqrt(var[i] + epsilon);
offset[i] = offset[i] - mean[i] * scale[i];
}
}
template <DeviceType D>
void Simple() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>("Input", {1, 6, 2, 1},
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
std::vector<float> scale(1);
std::vector<float> offset(1);
CalculateScaleOffset({4.0f}, {2.0}, {10}, {11.67f}, 1e-3, scale, offset);
net.AddInputFromArray<D, float>("Scale", {1}, scale);
net.AddInputFromArray<D, float>("Offset", {1}, offset);
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
BufferToImage<D, float>(net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(net, "OutputImage", "Output",
kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
auto expected =
CreateTensor<float>({1, 6, 2, 1}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
}
TEST_F(FoldedBatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
/*
TEST_F(FoldedBatchNormOpTest, SimpleNEON) {
Simple<DeviceType::NEON>();
}
*/
TEST_F(FoldedBatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::OPENCL>(); }
/*
TEST_F(FoldedBatchNormOpTest, SimpleRandomNeon) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
index_t height = 64;
index_t width = 64;
// Construct graph
OpsTestNet net;
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>("Input", {batch, channels, height,
width});
net.AddRandomInput<DeviceType::CPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Var", {channels}, true);
net.AddInputFromArray<DeviceType::CPU, float>("Epsilon", {}, {1e-3});
// run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run NEON
net.RunOp(DeviceType::NEON);
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
}
TEST_F(FoldedBatchNormOpTest, ComplexRandomNeon) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
index_t height = 103;
index_t width = 113;
// Construct graph
OpsTestNet net;
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>("Input", {batch, channels, height,
width});
net.AddRandomInput<DeviceType::CPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Var", {channels}, true);
net.AddInputFromArray<DeviceType::CPU, float>("Epsilon", {}, {1e-3});
// run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run NEON
net.RunOp(DeviceType::NEON);
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
}
*/
TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
index_t height = 64;
index_t width = 64;
// Construct graph
OpsTestNet net;
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
// run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
}
TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
index_t height = 64;
index_t width = 64;
// Construct graph
OpsTestNet net;
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
// run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, half>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL, half>(net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
net.Sync();
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
}
TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
index_t height = 103;
index_t width = 113;
// Construct graph
OpsTestNet net;
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
// run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, float>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL, float>(net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, float>(net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
}
TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
index_t height = 103;
index_t width = 113;
// Construct graph
OpsTestNet net;
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
// run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run on opencl
BufferToImage<DeviceType::OPENCL, half>(net, "Input", "InputImage",
kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL, half>(net, "Scale", "ScaleImage",
kernels::BufferType::ARGUMENT);
BufferToImage<DeviceType::OPENCL, half>(net, "Offset", "OffsetImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
}
}
......@@ -39,6 +39,13 @@ def main(unused_args):
f.write(str(output_graph_def))
print("Model conversion is completed.")
def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
def parse_args():
"""Parses command line arguments."""
......@@ -91,7 +98,9 @@ def parse_args():
help="template path")
parser.add_argument(
"--confuse",
type=bool,
type=str2bool,
nargs='?',
const=False,
default=False,
help="confuse model names")
parser.add_argument(
......
......@@ -228,10 +228,19 @@ class TFConverter(object):
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC'
op_def.name = op.name
op_def.type = 'FoldedBatchNorm'
gamma_tensor = get_input_tensor(op, 1)
gamma_value = gamma_tensor.eval().astype(np.float32)
for i in range(1, 5):
input_tensor = get_input_tensor(op, i)
assert input_tensor.shape == gamma_tensor.shape
self.unused_tensor.add(input_tensor.name)
gamma_value = get_input_tensor(op, 1).eval().astype(np.float32)
beta_value = get_input_tensor(op, 2).eval().astype(np.float32)
mean_value = get_input_tensor(op, 3).eval().astype(np.float32)
var_value = get_input_tensor(op, 4).eval().astype(np.float32)
......@@ -241,9 +250,8 @@ class TFConverter(object):
(1.0 / np.vectorize(math.sqrt)(var_value + epsilon_value)) *
gamma_value)
offset_value = (-mean_value * scale_value) + beta_value
name_prefix = op.inputs[1].name
idx = name_prefix.rfind('/')
name_prefix = op.inputs[1].name[:idx] + '/'
idx = gamma_tensor.name.rfind('/')
name_prefix = gamma_tensor.name[:idx] + '/'
input_names = [name_prefix+'scale:0', name_prefix+'offset:0']
self.add_tensor(input_names[0], gamma_value.shape,
gamma_tensor.dtype, scale_value)
......@@ -259,8 +267,8 @@ class TFConverter(object):
op_def.input.extend([input.name for input in input_names])
self.resolved_ops[op.name] = 1
final_op = op
if len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'Relu':
relu_op = self.tf_graph[op.name][0]
final_op = relu_op
......@@ -272,9 +280,6 @@ class TFConverter(object):
op_def.output.extend([final_op.outputs[0].name])
self.add_output_shape(final_op.outputs, op_def)
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC'
self.net_def.op.extend([op_def])
def convert_batchnorm(self, op):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册