Commit 46baf92a authored by 李寅

Refactor eltwise

Parent 7bdc8a4d
@@ -17,8 +17,8 @@
#include <algorithm>
#include <memory>
#include <vector>
#include <utility>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
@@ -44,70 +44,253 @@ enum EltwiseType {
NONE = 10,
};
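Only the tail of the enum is visible in this hunk. Judging from the switch statements below, the full operation set is SUM, SUB, PROD, DIV, MIN, MAX, NEG, ABS, SQR_DIFF and POW, with NONE as the sentinel. A plausible reconstruction (only NONE = 10 is confirmed by the diff):

enum EltwiseType {
  SUM = 0,
  SUB = 1,
  PROD = 2,
  DIV = 3,
  MIN = 4,
  MAX = 5,
  NEG = 6,
  ABS = 7,
  SQR_DIFF = 8,
  POW = 9,
  NONE = 10,
};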
inline void TensorScalar(const EltwiseType type,
const float *input0,
const float value,
const index_t size,
float *output) {
inline void TensorBroadcastEltwise(const EltwiseType type,
const float *input0,
const float *input1,
const std::vector<float> &coeff,
const index_t diff_size,
const index_t common_size,
const bool swapped,
float *output) {
switch (type) {
case SUM:
if (coeff.empty()) {
#pragma omp parallel for collapse(2)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
input0[i + d * common_size] + input1[i];
}
}
} else {
std::vector<float> coeff_copy = coeff;
if (swapped) {
std::swap(coeff_copy[0], coeff_copy[1]);
}
#pragma omp parallel for collapse(2)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
input0[i + d * common_size] * coeff_copy[0] +
input1[i] * coeff_copy[1];
}
}
}
break;
case SUB:
if (!swapped) {
#pragma omp parallel for collapse(2)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
input0[i + d * common_size] - input1[i];
}
}
} else {
#pragma omp parallel for collapse(2)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
input1[i] - input0[i + d * common_size];
}
}
}
break;
case PROD:
#pragma omp parallel for collapse(2)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] = input0[i + d * common_size] * input1[i];
}
}
break;
case DIV:
if (!swapped) {
#pragma omp parallel for collapse(2)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
input0[i + d * common_size] / input1[i];
}
}
} else {
#pragma omp parallel for collapse(2)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
input1[i] / input0[i + d * common_size];
}
}
}
break;
case MIN:
#pragma omp parallel for collapse(2)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
std::min(input0[i + d * common_size], input1[i]);
}
}
break;
case MAX:
#pragma omp parallel for collapse(2)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
std::max(input0[i + d * common_size], input1[i]);
}
}
break;
case SQR_DIFF:
#pragma omp parallel for collapse(2)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
std::pow(input0[i + d * common_size] - input1[i], 2.f);
}
}
break;
case POW:
if (!swapped) {
#pragma omp parallel for collapse(2)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
std::pow(input0[i + d * common_size], input1[i]);
}
}
} else {
#pragma omp parallel for collapse(2)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
std::pow(input1[i], input0[i + d * common_size]);
}
}
}
break;
case NEG:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] + value;
for (index_t i = 0; i < diff_size * common_size; ++i) {
output[i] = -input0[i];
}
break;
case ABS:
#pragma omp parallel for
for (index_t i = 0; i < diff_size * common_size; ++i) {
output[i] = std::fabs(input0[i]);
}
break;
default:
LOG(FATAL) << "Eltwise op does not support type " << type;
}
}
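A minimal standalone sketch of the indexing scheme used by TensorBroadcastEltwise, with hypothetical shapes and values: input0 is treated as (diff_size x common_size), and input1, of length common_size, is re-used for every slice along the leading dimension.

#include <cstdio>

int main() {
  const int diff_size = 2, common_size = 3;
  const float input0[] = {1, 2, 3, 4, 5, 6};  // logically 2 x 3
  const float input1[] = {10, 20, 30};        // broadcast over both rows
  float output[6];
  for (int d = 0; d < diff_size; ++d) {
    for (int i = 0; i < common_size; ++i) {
      output[i + d * common_size] = input0[i + d * common_size] + input1[i];
    }
  }
  for (float v : output) std::printf("%g ", v);  // prints: 11 22 33 14 25 36
  return 0;
}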
// Multiplication is costly, so we specialize the following case.
inline void TensorEltwise(const EltwiseType type,
const float *input0,
const float *input1,
const std::vector<float> &coeff,
const index_t size,
const bool swapped,
float *output) {
switch (type) {
case SUM:
if (coeff.empty()) {
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] + input1[i];
}
} else {
std::vector<float> coeff_copy = coeff;
if (swapped) {
std::swap(coeff_copy[0], coeff_copy[1]);
}
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1];
}
}
break;
case SUB:
if (!swapped) {
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] - value;
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] - input1[i];
}
} else {
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input1[i] - input0[i];
}
}
break;
case PROD:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] * value;
output[i] = input0[i] * input1[i];
}
break;
case DIV:
if (!swapped) {
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] / value;
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] / input1[i];
}
} else {
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input1[i] / input0[i];
}
}
break;
case MIN:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = std::min<float>(input0[i], value);
output[i] = std::min(input0[i], input1[i]);
}
break;
case MAX:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = std::max<float>(input0[i], value);
output[i] = std::max(input0[i], input1[i]);
}
break;
case NEG:
case SQR_DIFF:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = -input0[i];
output[i] = std::pow(input0[i] - input1[i], 2.f);
}
break;
case ABS:
case POW:
if (!swapped) {
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = std::abs(input0[i]);
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i], input1[i]);
}
} else {
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input1[i], input0[i]);
}
}
break;
case SQR_DIFF:
case NEG:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i] - value, 2.f);
output[i] = -input0[i];
}
break;
case POW:
case ABS:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i], value);
output[i] = std::fabs(input0[i]);
}
break;
default:
@@ -115,328 +298,304 @@ inline void TensorScalar(const EltwiseType type,
}
}
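The swapped flag exists because the functor below swaps its inputs so that the larger tensor always comes first; commutative ops are unaffected, but SUB, DIV and POW must reverse their operands to preserve the original order, and for SUM the two coefficients are swapped instead. A minimal sketch of the idea, assuming hypothetical scalar values:

#include <cstdio>

// Hypothetical helper mirroring the SUB branches above: when the caller has
// swapped a and b, compute b - a so the mathematical result is unchanged.
float sub(float a, float b, bool swapped) {
  return swapped ? b - a : a - b;
}

int main() {
  // Original expression 2 - 5: after the operands are swapped we have
  // a = 5, b = 2, swapped = true, and the result is still -3.
  std::printf("%g\n", sub(5.f, 2.f, true));
  return 0;
}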
inline void TensorBatchVector(const EltwiseType type,
const float *input0,
const float *input1,
const index_t batch,
const index_t channel,
const index_t hw,
const bool swapped,
float *output) {
// Multiplication is costly, so we specialize the following case.
inline void TensorScalarEltwise(const EltwiseType type,
const float *input0,
const float input1,
const std::vector<float> &coeff,
const index_t size,
const bool swapped,
float *output) {
switch (type) {
case SUM:
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = b * channel + c;
output[idx0] = input0[idx0] + input1[idx1];
}
if (coeff.empty()) {
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] + input1;
}
} else {
std::vector<float> coeff_copy = coeff;
if (swapped) {
std::swap(coeff_copy[0], coeff_copy[1]);
}
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1];
}
}
break;
case SUB:
if (swapped) {
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = b * channel + c;
output[idx0] = input1[idx1] - input0[idx0];
}
}
if (!swapped) {
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] - input1;
}
} else {
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = b * channel + c;
output[idx0] = input0[idx0] - input1[idx1];
}
}
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input1 - input0[i];
}
}
break;
case PROD:
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = b * channel + c;
output[idx0] = input0[idx0] * input1[idx1];
}
}
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] * input1;
}
break;
case DIV:
if (swapped) {
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = b * channel + c;
output[idx0] = input1[idx1] / input0[idx0];
}
}
if (!swapped) {
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] / input1;
}
} else {
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = b * channel + c;
output[idx0] = input0[idx0] / input1[idx1];
}
}
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input1 / input0[i];
}
}
break;
case MIN:
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = b * channel + c;
output[idx0] = std::min<float>(input0[idx0], input1[idx1]);
}
}
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = std::min(input0[i], input1);
}
break;
case MAX:
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = b * channel + c;
output[idx0] = std::max<float>(input0[idx0], input1[idx1]);
}
}
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = std::max(input0[i], input1);
}
break;
case SQR_DIFF:
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = b * channel + c;
output[idx0] = std::pow(input0[idx0] - input1[idx1], 2.f);
}
}
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i] - input1, 2.f);
}
break;
case POW:
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = b * channel + c;
output[idx0] = std::pow(input0[idx0], input1[idx1]);
}
if (!swapped) {
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i], input1);
}
} else {
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input1, input0[i]);
}
}
break;
case NEG:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = -input0[i];
}
break;
case ABS:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = std::fabs(input0[i]);
}
break;
default:
LOG(FATAL) << "Eltwise op does not support type " << type;
}
}
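For SUM with coefficients, the scalar path computes output[i] = coeff[0] * input0[i] + coeff[1] * input1, again swapping the coefficients first if the operands were swapped. A worked example under assumed values: with coeff = {0.5, 2}, input0[i] = 4 and input1 = 3, the result is 0.5 * 4 + 2 * 3 = 8.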
inline void TensorVector(const EltwiseType type,
const float *input0,
const float *input1,
const index_t batch,
const index_t channel,
const index_t hw,
const bool swapped,
float *output) {
inline void TensorEltwisePerChannel(const EltwiseType type,
const float *input0,
const float *input1,
const std::vector<float> &coeff,
const index_t batch0,
const index_t batch1,
const index_t channel,
const index_t image_size,
const bool swapped,
float *output) {
switch (type) {
case SUM:
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = c;
output[idx0] = input0[idx0] + input1[idx1];
if (coeff.empty()) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = in0_ptr[i] + in1_ptr[c];
}
}
}
} else {
std::vector<float> coeff_copy = coeff;
if (swapped) {
std::swap(coeff_copy[0], coeff_copy[1]);
}
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] =
in0_ptr[i] * coeff_copy[0] + in1_ptr[c] * coeff_copy[1];
}
}
}
}
break;
case SUB:
if (swapped) {
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
if (!swapped) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = c;
output[idx0] = input1[idx1] - input0[idx0];
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = in0_ptr[i] - in1_ptr[c];
}
}
}
} else {
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = c;
output[idx0] = input0[idx0] - input1[idx1];
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = in1_ptr[c] - in0_ptr[i];
}
}
}
}
break;
case PROD:
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = c;
output[idx0] = input0[idx0] * input1[idx1];
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = in0_ptr[i] * in1_ptr[c];
}
}
}
break;
case DIV:
if (swapped) {
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
if (!swapped) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = c;
output[idx0] = input1[idx1] / input0[idx0];
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = in0_ptr[i] / in1_ptr[c];
}
}
}
} else {
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = c;
output[idx0] = input0[idx0] / input1[idx1];
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = in1_ptr[c] / in0_ptr[i];
}
}
}
}
break;
case MIN:
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = c;
output[idx0] = std::min<float>(input0[idx0], input1[idx1]);
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = std::min(in0_ptr[i], in1_ptr[c]);
}
}
}
break;
case MAX:
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = c;
output[idx0] = std::max<float>(input0[idx0], input1[idx1]);
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = std::max(in0_ptr[i], in1_ptr[c]);
}
}
}
break;
case SQR_DIFF:
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = c;
output[idx0] = std::pow(input0[idx0] - input1[idx1], 2.f);
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = std::pow(in0_ptr[i] - in1_ptr[c], 2.f);
}
}
}
break;
case POW:
#pragma omp parallel for collapse(3)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channel; ++c) {
for (index_t i = 0; i < hw; ++i) {
const index_t idx0 = (b * channel + c) * hw + i;
const index_t idx1 = c;
output[idx0] = std::pow(input0[idx0], input1[idx1]);
if (!swapped) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = std::pow(in0_ptr[i], in1_ptr[c]);
}
}
}
} else {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
float *out_ptr = output + ((b * channel) + c) * image_size;
for (index_t i = 0; i < image_size; ++i) {
out_ptr[i] = std::pow(in1_ptr[c], in0_ptr[i]);
}
}
}
}
break;
default:
LOG(FATAL) << "Eltwise op does not support type " << type;
}
}
inline void TensorEltwise(const EltwiseType type,
const float *input0,
const float *input1,
const index_t size,
float *output) {
switch (type) {
case SUM:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] + input1[i];
}
break;
case SUB:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] - input1[i];
}
break;
case PROD:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] * input1[i];
}
break;
case DIV:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] / input1[i];
}
break;
case MIN:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = std::min<float>(input0[i], input1[i]);
}
break;
case MAX:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = std::max<float>(input0[i], input1[i]);
}
break;
case SQR_DIFF:
case NEG:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i] - input1[i], 2.f);
for (index_t i = 0; i < batch0 * channel * image_size; ++i) {
output[i] = -input0[i];
}
break;
case POW:
case ABS:
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i], input1[i]);
for (index_t i = 0; i < batch0 * channel * image_size; ++i) {
output[i] = std::fabs(input0[i]);
}
break;
default:
@@ -444,95 +603,109 @@ inline void TensorEltwise(const EltwiseType type,
}
}
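TensorEltwisePerChannel handles the NCHW case where input1 is a per-channel vector (shape {channel}, or {batch, channel, 1, 1} when batch1 > 1): each channel scalar in1_ptr[c] is applied across the whole H*W image plane. A minimal standalone sketch of the PROD case, with hypothetical shapes:

#include <cstdio>

int main() {
  const int batch0 = 1, channel = 2, image_size = 3;
  const float input0[] = {1, 2, 3, 4, 5, 6};  // NCHW: 1 x 2 x (3 pixels)
  const float input1[] = {10, 100};           // one scalar per channel
  float output[6];
  for (int b = 0; b < batch0; ++b) {
    for (int c = 0; c < channel; ++c) {
      const float *in0_ptr = input0 + (b * channel + c) * image_size;
      float *out_ptr = output + (b * channel + c) * image_size;
      for (int i = 0; i < image_size; ++i) {
        out_ptr[i] = in0_ptr[i] * input1[c];  // batch1 == 1, so no batch offset
      }
    }
  }
  for (float v : output) std::printf("%g ", v);  // prints: 10 20 30 400 500 600
  return 0;
}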
struct EltwiseFunctorBase {
EltwiseFunctorBase(const EltwiseType type,
const std::vector<float> &coeff,
const float value)
: type_(type), coeff_(coeff), value_(value) {}
const float value,
const DataFormat data_format)
: type_(type), coeff_(coeff), value_(value), data_format_(data_format) {}
EltwiseType type_;
std::vector<float> coeff_;
float value_;
DataFormat data_format_;
};
template <DeviceType D, typename T>
struct EltwiseFunctor;
template <>
struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {
struct EltwiseFunctor<DeviceType::CPU, float> : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type,
const std::vector<float> &coeff,
const float value)
: EltwiseFunctorBase(type, coeff, value) {}
const float value,
const DataFormat data_format)
: EltwiseFunctorBase(type, coeff, value, data_format) {}
MaceStatus operator()(const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future) {
const Tensor *input1,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
if (input1 == nullptr) {
scalar_tensor_.Resize({});
Tensor::MappingGuard guard(&scalar_tensor_);
auto scalar_data = scalar_tensor_.mutable_data<float>();
scalar_data[0] = value_;
input1 = &scalar_tensor_;
}
bool swapped = false;
if (input1 != nullptr) {
MACE_CHECK(input0->dim_size() == input1->dim_size()
|| input0->dim_size() == 1
|| input1->dim_size() == 1)
<< "Inputs of Eltwise op must be same shape";
if (input0->size() != input1->size()) {
if (input0->size() < input1->size()) {
std::swap(input0, input1);
swapped = true;
}
if (input1->dim_size() == 1) {
MACE_CHECK(input0->dim(1) == input1->dim(0))
<< "Element-Wise op only support channel dimension broadcast";
} else {
MACE_CHECK((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1)
&& input0->dim(1) == input1->dim(1)
&& input1->dim(2) == 1
&& input1->dim(3) == 1)
<< "Element-Wise op only support channel dimension broadcast";
if (input0->size() < input1->size()) {
std::swap(input0, input1);
swapped = true;
}
// check if we can broadcast tensor
uint32_t rank_diff =
static_cast<uint32_t>(input0->dim_size() - input1->dim_size());
if (data_format_ == NCHW) {
MACE_CHECK(
input0->dim_size() == 4 &&
(input1->dim_size() == 0 ||
input1->dim_size() == 4 && input1->dim(1) == input0->dim(1) &&
(input1->dim(0) == input0->dim(0) || input1->dim(0) == 1) ||
input1->dim_size() == 1 && input1->dim(0) == input0->dim(1)),
"only support broadcast channel dimension");
} else {
if (rank_diff > 0 && rank_diff < input0->dim_size()) {
for (uint32_t i = 0; i < input1->dim_size(); ++i) {
MACE_CHECK(input0->dim(rank_diff + i) == input1->dim(i),
"Element-Wise op only support tail dimensions broadcast");
}
}
}
index_t common_size = input1->size();
index_t diff_size = input0->size() / common_size;
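    // For example (hypothetical shapes): input0 {2, 3, 4, 5} (size 120) and
    // input1 {4, 5} (size 20) give rank_diff = 2, matching tail dimensions,
    // common_size = 20 and diff_size = 120 / 20 = 6.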
MACE_RETURN_IF_ERROR(output->ResizeLike(input0));
Tensor::MappingGuard input0_guard(input0);
Tensor::MappingGuard input1_guard(input1);
Tensor::MappingGuard output_guard(output);
const float *input0_ptr = input0->data<float>();
const float *input1_ptr = input1->data<float>();
float *output_ptr = output->mutable_data<float>();
const index_t size = input0->size();
if (input1 == nullptr) {
TensorScalar(type_, input0_ptr, value_, size, output_ptr);
if (data_format_ == NCHW && input1->dim_size() > 0 &&
input1->size() < input0->size()) {
TensorEltwisePerChannel(
type_, input0_ptr, input1_ptr, coeff_, input0->dim(0),
input1->dim_size() == 1 ? 1 : input1->dim(0), input0->dim(1),
input0->dim(2) * input0->dim(3), swapped, output_ptr);
} else {
Tensor::MappingGuard input1_guard(input1);
const float *input1_ptr = input1->data<float>();
if (input1->size() != input0->size()) {
const index_t batch = input0->dim(0);
const index_t channel = input0->dim(1);
const index_t hw = input0->dim(2) * input0->dim(3);
if (input1->dim(0) == 1 || input1->dim_size() == 1)
TensorVector(type_, input0_ptr, input1_ptr,
batch, channel, hw, swapped, output_ptr);
else
TensorBatchVector(type_, input0_ptr, input1_ptr,
batch, channel, hw, swapped, output_ptr);
} else {
if (!coeff_.empty() && type_ == SUM) {
#pragma omp parallel for
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = coeff_[0] * input0_ptr[i] +
coeff_[1] * input1_ptr[i];
}
if (input1->size() == input0->size()) {
TensorEltwise(type_, input0_ptr, input1_ptr, coeff_, input0->size(),
swapped, output_ptr);
} else if (input1->size() < input0->size()) {
if (input1->size() > 1) {
TensorBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_,
diff_size, common_size, swapped, output_ptr);
} else {
TensorEltwise(type_, input0_ptr, input1_ptr, size, output_ptr);
TensorScalarEltwise(type_, input0_ptr, input1_ptr[0], coeff_,
input0->size(), swapped, output_ptr);
}
}
}
return MACE_SUCCESS;
}
Tensor scalar_tensor_;
};
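In summary, the rewritten CPU functor dispatches on the relative sizes of the two inputs: equal sizes go to TensorEltwise, a single-element input1 goes to TensorScalarEltwise (as does the implicit scalar value_ when input1 is null), an NCHW per-channel vector goes to TensorEltwisePerChannel, and any other tail-dimension broadcast goes to TensorBroadcastEltwise.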
#ifdef MACE_ENABLE_OPENCL
@@ -540,13 +713,14 @@ template <typename T>
struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type,
const std::vector<float> &coeff,
const float value)
: EltwiseFunctorBase(type, coeff, value) {}
const float value,
const DataFormat data_format)
: EltwiseFunctorBase(type, coeff, value, data_format) {}
MaceStatus operator()(const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future);
const Tensor *input1,
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
......
@@ -42,48 +42,79 @@ struct SoftmaxFunctor<DeviceType::CPU, float> {
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
const index_t batch = input->dim(0);
const index_t class_count = input->dim(1);
const index_t class_size = input->dim(2) * input->dim(3);
const index_t batch_size = class_count * class_size;
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
for (index_t b = 0; b < batch; ++b) {
// softmax for nchw image
if (input->dim_size() == 4) {
const index_t batch = input->dim(0);
const index_t class_count = input->dim(1);
const index_t class_size = input->dim(2) * input->dim(3);
const index_t batch_size = class_count * class_size;
for (index_t b = 0; b < batch; ++b) {
#pragma omp parallel for
for (index_t k = 0; k < class_size; ++k) {
const float *input_ptr = input_data + b * batch_size + k;
float *output_ptr = output_data + b * batch_size + k;
float max_val = std::numeric_limits<float>::lowest();
index_t channel_offset = 0;
for (index_t c = 0; c < class_count; ++c) {
float data = input_ptr[channel_offset];
if (data > max_val) {
max_val = data;
}
channel_offset += class_size;
}
channel_offset = 0;
float sum = 0;
for (index_t c = 0; c < class_count; ++c) {
float exp_value = ::exp(input_ptr[channel_offset] - max_val);
sum += exp_value;
output_ptr[channel_offset] = exp_value;
channel_offset += class_size;
}
sum = std::max(sum, std::numeric_limits<float>::min());
channel_offset = 0;
for (index_t c = 0; c < class_count; ++c) {
output_ptr[channel_offset] /= sum;
channel_offset += class_size;
}
} // k
} // b
} else if (input->dim_size() == 2) { // normal 2d softmax
const index_t class_size = input->dim(0);
const index_t class_count = input->dim(1);
#pragma omp parallel for
for (index_t k = 0; k < class_size; ++k) {
const float *input_ptr = input_data + b * batch_size + k;
float *output_ptr = output_data + b * batch_size + k;
const float *input_ptr = input_data + k * class_count;
float *output_ptr = output_data + k * class_count;
float max_val = std::numeric_limits<float>::lowest();
index_t channel_offset = 0;
for (index_t c = 0; c < class_count; ++c) {
float data = input_ptr[channel_offset];
if (data > max_val) {
max_val = data;
}
channel_offset += class_size;
max_val = std::max(max_val, input_ptr[c]);
}
channel_offset = 0;
float sum = 0;
for (index_t c = 0; c < class_count; ++c) {
float exp_value = ::exp(input_ptr[channel_offset] - max_val);
float exp_value = ::exp(input_ptr[c] - max_val);
sum += exp_value;
output_ptr[channel_offset] = exp_value;
channel_offset += class_size;
output_ptr[c] = exp_value;
}
channel_offset = 0;
sum = std::max(sum, std::numeric_limits<float>::min());
for (index_t c = 0; c < class_count; ++c) {
output_ptr[channel_offset] /= sum;
channel_offset += class_size;
output_ptr[c] /= sum;
}
} // k
} // b
}
} else {
MACE_NOT_IMPLEMENTED;
}
return MACE_SUCCESS;
}
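Both softmax branches use the standard numerically stable formulation: subtract the per-sample maximum before exponentiating so that exp cannot overflow, then normalize by the sum (clamped away from zero). A minimal standalone sketch of the 2-D path, assuming one row of hypothetical data:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>

int main() {
  const int class_count = 4;
  const float input[class_count] = {1, 2, 3, 4};
  float output[class_count];
  float max_val = std::numeric_limits<float>::lowest();
  for (int c = 0; c < class_count; ++c) max_val = std::max(max_val, input[c]);
  float sum = 0;
  for (int c = 0; c < class_count; ++c) {
    output[c] = std::exp(input[c] - max_val);  // exp(x - max) <= 1: no overflow
    sum += output[c];
  }
  sum = std::max(sum, std::numeric_limits<float>::min());  // avoid divide by zero
  for (int c = 0; c < class_count; ++c) output[c] /= sum;
  for (int c = 0; c < class_count; ++c) std::printf("%g ", output[c]);
  // prints approximately: 0.0320586 0.0871443 0.236883 0.643914
  return 0;
}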
......
@@ -30,7 +30,9 @@ class EltwiseOp : public Operator<D, T> {
static_cast<kernels::EltwiseType>(OperatorBase::GetOptionalArg<int>(
"type", static_cast<int>(kernels::EltwiseType::NONE))),
OperatorBase::GetRepeatedArgs<float>("coeff"),
OperatorBase::GetOptionalArg<float>("value", 1.0)) {}
OperatorBase::GetOptionalArg<float>("value", 1.0),
static_cast<DataFormat>(OperatorBase::GetOptionalArg<int>(
"data_format", 0))) {}
MaceStatus Run(StatsFuture *future) override {
const Tensor *input0 = this->Input(0);
......
@@ -41,6 +41,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
.Input("TInput")
.AddIntArg("type", static_cast<int>(type))
.AddFloatArg("value", x)
.AddIntArg("data_format", DataFormat::NCHW)
.Output("TOutput")
.Finalize(net.NewOperatorDef());
// Run
@@ -84,15 +85,24 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
net.AddInputFromArray<D, float>("Input1", shape1, input1);
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input0", NHWC, "TInput0", NCHW);
net.TransformDataFormat<D, float>("Input1", NHWC, "TInput1", NCHW);
OpDefBuilder("Eltwise", "EltwiseTest")
.Input("TInput0")
.Input("TInput1")
auto op_builder = OpDefBuilder("Eltwise", "EltwiseTest")
.AddIntArg("type", static_cast<int>(type))
.AddFloatsArg("coeff", coeff)
.Output("TOutput")
.Finalize(net.NewOperatorDef());
.AddIntArg("data_format", DataFormat::NCHW)
.Output("TOutput");
if (shape0.size() > 1) {
net.TransformDataFormat<D, float>("Input0", NHWC, "TInput0", NCHW);
op_builder.Input("TInput0");
} else {
op_builder.Input("Input0");
}
if (shape1.size() > 1) {
net.TransformDataFormat<D, float>("Input1", NHWC, "TInput1", NCHW);
op_builder.Input("TInput1");
} else {
op_builder.Input("Input1");
}
op_builder.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
@@ -214,6 +224,35 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
kernels::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {3},
{1, 2, 3}, {2, 4, 6, 5, 7, 9});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SUB, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::PROD, {3}, {1, 2, 3}, {1, 2, 1, 3},
{1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
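  // For the PROD case above, {1, 2, 3} is broadcast against each length-3
  // row of {1, 2, 3, 4, 5, 6}: {1*1, 2*2, 3*3, 1*4, 2*5, 3*6}.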
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::DIV, {5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
{1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::MIN, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
SimpleTensorEltwise<DeviceType::CPU, float>(
kernels::EltwiseType::SQR_DIFF, {5}, {1, 2, 3, 4, 5},
{1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
{0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
}
TEST_F(EltwiseOpTest, GPUSimpleTensorVector) {
@@ -322,6 +361,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
.Input("TInput")
.AddIntArg("type", static_cast<int>(type))
.AddFloatArg("value", 0.1)
.AddIntArg("data_format", DataFormat::NCHW)
.Output("TOutput")
.Finalize(net.NewOperatorDef());
// Run
@@ -375,6 +415,7 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
.Input("TInput1")
.AddIntArg("type", static_cast<int>(type))
.AddFloatsArg("coeff", coeff)
.AddIntArg("data_format", DataFormat::NCHW)
.Output("TOutput")
.Finalize(net.NewOperatorDef());
......
@@ -29,8 +29,12 @@ void Simple() {
// Add input data
net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4},
{1, 1, 1, 1, 1, 2, 3, 4});
auto expected = CreateTensor<float>(
{1, 1, 2, 4},
{0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
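  // Sanity check: the row {1, 1, 1, 1} yields 0.25 per class, and {1, 2, 3, 4}
  // yields exp values of roughly {2.718, 7.389, 20.086, 54.598} with sum
  // 84.791, hence {0.0321, 0.0871, 0.2369, 0.6439}.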
if (D == DeviceType::CPU) {
// test 4d softmax
net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputNCHW")
@@ -40,6 +44,21 @@ void Simple() {
// Run
net.RunOp(D);
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
// check 2d softmax
net.AddInputFromArray<D, float>("Input2d", {2, 4},
{1, 1, 1, 1, 1, 2, 3, 4});
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("Input2d")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.GetOutput("Output")->Reshape({1, 1, 2, 4});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} else if (D == DeviceType::GPU) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
@@ -55,15 +74,11 @@ void Simple() {
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} else {
MACE_NOT_IMPLEMENTED;
}
auto expected = CreateTensor<float>(
{1, 1, 2, 4},
{0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace
......