提交 46baf92a 编写于 作者: 李寅

Refactor eltwise

上级 7bdc8a4d
此差异已折叠。
......@@ -42,48 +42,79 @@ struct SoftmaxFunctor<DeviceType::CPU, float> {
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
const index_t batch = input->dim(0);
const index_t class_count = input->dim(1);
const index_t class_size = input->dim(2) * input->dim(3);
const index_t batch_size = class_count * class_size;
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
for (index_t b = 0; b < batch; ++b) {
// softmax over the channel axis (dim 1) at each spatial position of an NCHW tensor
if (input->dim_size() == 4) {
const index_t batch = input->dim(0);
const index_t class_count = input->dim(1);
const index_t class_size = input->dim(2) * input->dim(3);
const index_t batch_size = class_count * class_size;
for (index_t b = 0; b < batch; ++b) {
#pragma omp parallel for
for (index_t k = 0; k < class_size; ++k) {
const float *input_ptr = input_data + b * batch_size + k;
float *output_ptr = output_data + b * batch_size + k;
float max_val = std::numeric_limits<float>::lowest();
index_t channel_offset = 0;
for (index_t c = 0; c < class_count; ++c) {
float data = input_ptr[channel_offset];
if (data > max_val) {
max_val = data;
}
channel_offset += class_size;
}
channel_offset = 0;
float sum = 0;
for (index_t c = 0; c < class_count; ++c) {
float exp_value = ::exp(input_ptr[channel_offset] - max_val);
sum += exp_value;
output_ptr[channel_offset] = exp_value;
channel_offset += class_size;
}
sum = std::max(sum, std::numeric_limits<float>::min());
channel_offset = 0;
for (index_t c = 0; c < class_count; ++c) {
output_ptr[channel_offset] /= sum;
channel_offset += class_size;
}
} // k
} // b
} else if (input->dim_size() == 2) { // normal 2d softmax
const index_t class_size = input->dim(0);
const index_t class_count = input->dim(1);
#pragma omp parallel for
for (index_t k = 0; k < class_size; ++k) {
const float *input_ptr = input_data + b * batch_size + k;
float *output_ptr = output_data + b * batch_size + k;
const float *input_ptr = input_data + k * class_count;
float *output_ptr = output_data + k * class_count;
float max_val = std::numeric_limits<float>::lowest();
index_t channel_offset = 0;
for (index_t c = 0; c < class_count; ++c) {
float data = input_ptr[channel_offset];
if (data > max_val) {
max_val = data;
}
channel_offset += class_size;
max_val = std::max(max_val, input_ptr[c]);
}
channel_offset = 0;
float sum = 0;
for (index_t c = 0; c < class_count; ++c) {
float exp_value = ::exp(input_ptr[channel_offset] - max_val);
float exp_value = ::exp(input_ptr[c] - max_val);
sum += exp_value;
output_ptr[channel_offset] = exp_value;
channel_offset += class_size;
output_ptr[c] = exp_value;
}
channel_offset = 0;
sum = std::max(sum, std::numeric_limits<float>::min());
for (index_t c = 0; c < class_count; ++c) {
output_ptr[channel_offset] /= sum;
channel_offset += class_size;
output_ptr[c] /= sum;
}
} // k
} // b
}
} else {
MACE_NOT_IMPLEMENTED;
}
return MACE_SUCCESS;
}
......
......@@ -30,7 +30,9 @@ class EltwiseOp : public Operator<D, T> {
static_cast<kernels::EltwiseType>(OperatorBase::GetOptionalArg<int>(
"type", static_cast<int>(kernels::EltwiseType::NONE))),
OperatorBase::GetRepeatedArgs<float>("coeff"),
OperatorBase::GetOptionalArg<float>("value", 1.0)) {}
OperatorBase::GetOptionalArg<float>("value", 1.0),
static_cast<DataFormat>(OperatorBase::GetOptionalArg<int>(
"data_format", 0))) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input0 = this->Input(0);
......
......@@ -41,6 +41,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
.Input("TInput")
.AddIntArg("type", static_cast<int>(type))
.AddFloatArg("value", x)
.AddIntArg("data_format", DataFormat::NCHW)
.Output("TOutput")
.Finalize(net.NewOperatorDef());
// Run
......@@ -84,15 +85,24 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
net.AddInputFromArray<D, float>("Input1", shape1, input1);
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input0", NHWC, "TInput0", NCHW);
net.TransformDataFormat<D, float>("Input1", NHWC, "TInput1", NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput0")
.Input("TInput1")
auto op_builder = OpDefBuilder("Eltwise", "EltwiseTest")
.AddIntArg("type", static_cast<int>(type))
.AddFloatsArg("coeff", coeff)
.Output("TOutput")
.Finalize(net.NewOperatorDef());
.AddIntArg("data_format", DataFormat::NCHW)
.Output("TOutput");
if (shape0.size() > 1) {
net.TransformDataFormat<D, float>("Input0", NHWC, "TInput0", NCHW);
op_builder.Input("TInput0");
} else {
op_builder.Input("Input0");
}
if (shape1.size() > 1) {
net.TransformDataFormat<D, float>("Input1", NHWC, "TInput1", NCHW);
op_builder.Input("TInput1");
} else {
op_builder.Input("Input1");
}
op_builder.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -214,6 +224,35 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
kernels::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {3},
{1, 2, 3}, {2, 4, 6, 5, 7, 9});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SUB, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::PROD, {3}, {1, 2, 3}, {1, 2, 1, 3},
{1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::DIV, {5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
{1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::MIN, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SQR_DIFF, {5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
}
TEST_F(EltwiseOpTest, GPUSimpleTensorVector) {
......@@ -322,6 +361,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
.Input("TInput")
.AddIntArg("type", static_cast<int>(type))
.AddFloatArg("value", 0.1)
.AddIntArg("data_format", DataFormat::NCHW)
.Output("TOutput")
.Finalize(net.NewOperatorDef());
// Run
......@@ -375,6 +415,7 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
.Input("TInput1")
.AddIntArg("type", static_cast<int>(type))
.AddFloatsArg("coeff", coeff)
.AddIntArg("data_format", DataFormat::NCHW)
.Output("TOutput")
.Finalize(net.NewOperatorDef());
......
......@@ -29,8 +29,12 @@ void Simple() {
// Add input data
net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4},
{1, 1, 1, 1, 1, 2, 3, 4});
auto expected = CreateTensor<float>(
{1, 1, 2, 4},
{0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
if (D == DeviceType::CPU) {
// test 4d softmax
net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputNCHW")
......@@ -40,6 +44,21 @@ void Simple() {
// Run
net.RunOp(D);
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
// test 2d softmax: same values as a {2, 4} matrix, output reshaped to compare with `expected`
net.AddInputFromArray<D, float>("Input2d", {2, 4},
{1, 1, 1, 1, 1, 2, 3, 4});
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("Input2d")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.GetOutput("Output")->Reshape({1, 1, 2, 4});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -55,15 +74,11 @@ void Simple() {
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} else {
MACE_NOT_IMPLEMENTED;
}
auto expected = CreateTensor<float>(
{1, 1, 2, 4},
{0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册