Commit 46baf92a authored by 李寅

Refactor eltwise

Parent 7bdc8a4d
@@ -17,8 +17,8 @@
#include <algorithm>
#include <memory>
#include <utility>
#include <vector>

#include "mace/core/future.h"
#include "mace/core/tensor.h"

@@ -44,70 +44,253 @@ enum EltwiseType {
  NONE = 10,
};
inline void TensorBroadcastEltwise(const EltwiseType type,
                                   const float *input0,
                                   const float *input1,
                                   const std::vector<float> &coeff,
                                   const index_t diff_size,
                                   const index_t common_size,
                                   const bool swapped,
                                   float *output) {
  switch (type) {
    case SUM:
      if (coeff.empty()) {
#pragma omp parallel for collapse(2)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                input0[i + d * common_size] + input1[i];
          }
        }
      } else {
        std::vector<float> coeff_copy = coeff;
        if (swapped) {
          std::swap(coeff_copy[0], coeff_copy[1]);
        }
#pragma omp parallel for collapse(2)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                input0[i + d * common_size] * coeff_copy[0] +
                input1[i] * coeff_copy[1];
          }
        }
      }
      break;
    case SUB:
      if (!swapped) {
#pragma omp parallel for collapse(2)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                input0[i + d * common_size] - input1[i];
          }
        }
      } else {
#pragma omp parallel for collapse(2)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                input1[i] - input0[i + d * common_size];
          }
        }
      }
      break;
    case PROD:
#pragma omp parallel for collapse(2)
      for (index_t d = 0; d < diff_size; ++d) {
        for (index_t i = 0; i < common_size; ++i) {
          output[i + d * common_size] = input0[i + d * common_size] * input1[i];
        }
      }
      break;
    case DIV:
      if (!swapped) {
#pragma omp parallel for collapse(2)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                input0[i + d * common_size] / input1[i];
          }
        }
      } else {
#pragma omp parallel for collapse(2)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                input1[i] / input0[i + d * common_size];
          }
        }
      }
      break;
    case MIN:
#pragma omp parallel for collapse(2)
      for (index_t d = 0; d < diff_size; ++d) {
        for (index_t i = 0; i < common_size; ++i) {
          output[i + d * common_size] =
              std::min(input0[i + d * common_size], input1[i]);
        }
      }
      break;
    case MAX:
#pragma omp parallel for collapse(2)
      for (index_t d = 0; d < diff_size; ++d) {
        for (index_t i = 0; i < common_size; ++i) {
          output[i + d * common_size] =
              std::max(input0[i + d * common_size], input1[i]);
        }
      }
      break;
    case SQR_DIFF:
#pragma omp parallel for collapse(2)
      for (index_t d = 0; d < diff_size; ++d) {
        for (index_t i = 0; i < common_size; ++i) {
          output[i + d * common_size] =
              std::pow(input0[i + d * common_size] - input1[i], 2.f);
        }
      }
      break;
    case POW:
      if (!swapped) {
#pragma omp parallel for collapse(2)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                std::pow(input0[i + d * common_size], input1[i]);
          }
        }
      } else {
#pragma omp parallel for collapse(2)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
                std::pow(input1[i], input0[i + d * common_size]);
          }
        }
      }
      break;
    case NEG:
#pragma omp parallel for
      for (index_t i = 0; i < diff_size * common_size; ++i) {
        output[i] = -input0[i];
      }
      break;
    case ABS:
#pragma omp parallel for
      for (index_t i = 0; i < diff_size * common_size; ++i) {
        output[i] = std::fabs(input0[i]);
      }
      break;
    default:
      LOG(FATAL) << "Eltwise op not support type " << type;
  }
}
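// Example (hypothetical values, for illustration only): input1 of length
// common_size is broadcast across the diff_size leading slices of input0.
//   float a[6] = {1, 2, 3, 4, 5, 6};
//   float b[3] = {10, 20, 30};
//   std::vector<float> out(6);
//   TensorBroadcastEltwise(SUM, a, b, {}, /*diff_size=*/2,
//                          /*common_size=*/3, /*swapped=*/false, out.data());
//   // out == {11, 22, 33, 14, 25, 36}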
// Multiplication is costly, so we specialize the following case.
inline void TensorEltwise(const EltwiseType type,
                          const float *input0,
                          const float *input1,
                          const std::vector<float> &coeff,
                          const index_t size,
                          const bool swapped,
                          float *output) {
  switch (type) {
    case SUM:
      if (coeff.empty()) {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] + input1[i];
        }
      } else {
        std::vector<float> coeff_copy = coeff;
        if (swapped) {
          std::swap(coeff_copy[0], coeff_copy[1]);
        }
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1];
        }
      }
      break;
    case SUB:
      if (!swapped) {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] - input1[i];
        }
      } else {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input1[i] - input0[i];
        }
      }
      break;
    case PROD:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = input0[i] * input1[i];
      }
      break;
    case DIV:
      if (!swapped) {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] / input1[i];
        }
      } else {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input1[i] / input0[i];
        }
      }
      break;
    case MIN:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::min(input0[i], input1[i]);
      }
      break;
    case MAX:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::max(input0[i], input1[i]);
      }
      break;
    case SQR_DIFF:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::pow(input0[i] - input1[i], 2.f);
      }
      break;
    case POW:
      if (!swapped) {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = std::pow(input0[i], input1[i]);
        }
      } else {
        for (index_t i = 0; i < size; ++i) {
          output[i] = std::pow(input1[i], input0[i]);
        }
      }
      break;
    case NEG:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = -input0[i];
      }
      break;
    case ABS:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::fabs(input0[i]);
      }
      break;
    default:
@@ -115,328 +298,304 @@ inline void TensorScalar(const EltwiseType type,
  }
}
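// Example (hypothetical values, for illustration only): equal-sized SUM with
// coefficients computes output[i] = coeff[0] * input0[i] + coeff[1] * input1[i];
// when the caller swapped the operands, the coefficients are swapped back.
//   float a[3] = {1, 2, 3};
//   float b[3] = {10, 20, 30};
//   std::vector<float> out(3);
//   TensorEltwise(SUM, a, b, {0.5f, 2.f}, 3, /*swapped=*/false, out.data());
//   // out == {20.5, 41, 61.5}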
// Multiplication is costly, so we specialize the following case.
inline void TensorScalarEltwise(const EltwiseType type,
                                const float *input0,
                                const float input1,
                                const std::vector<float> &coeff,
                                const index_t size,
                                const bool swapped,
                                float *output) {
  switch (type) {
    case SUM:
      if (coeff.empty()) {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] + input1;
        }
      } else {
        std::vector<float> coeff_copy = coeff;
        if (swapped) {
          std::swap(coeff_copy[0], coeff_copy[1]);
        }
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1];
        }
      }
      break;
    case SUB:
      if (!swapped) {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] - input1;
        }
      } else {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input1 - input0[i];
        }
      }
      break;
    case PROD:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = input0[i] * input1;
      }
      break;
    case DIV:
      if (!swapped) {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] / input1;
        }
      } else {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = input1 / input0[i];
        }
      }
      break;
    case MIN:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::min(input0[i], input1);
      }
      break;
    case MAX:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::max(input0[i], input1);
      }
      break;
    case SQR_DIFF:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::pow(input0[i] - input1, 2.f);
      }
      break;
    case POW:
      if (!swapped) {
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output[i] = std::pow(input0[i], input1);
        }
      } else {
        for (index_t i = 0; i < size; ++i) {
          output[i] = std::pow(input1, input0[i]);
        }
      }
      break;
    case NEG:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = -input0[i];
      }
      break;
    case ABS:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::fabs(input0[i]);
      }
      break;
    default:
      LOG(FATAL) << "Eltwise op not support type " << type;
  }
}
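// Example (hypothetical values, for illustration only): `swapped` flips the
// operand order of the asymmetric ops, so a swapped DIV computes
// input1 / input0[i].
//   float a[2] = {2, 4};
//   std::vector<float> out(2);
//   TensorScalarEltwise(DIV, a, 8.f, {}, 2, /*swapped=*/true, out.data());
//   // out == {4, 2}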
inline void TensorEltwisePerChannel(const EltwiseType type,
                                    const float *input0,
                                    const float *input1,
                                    const std::vector<float> &coeff,
                                    const index_t batch0,
                                    const index_t batch1,
                                    const index_t channel,
                                    const index_t image_size,
                                    const bool swapped,
                                    float *output) {
  switch (type) {
    case SUM:
      if (coeff.empty()) {
#pragma omp parallel for collapse(2)
        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
            float *out_ptr = output + ((b * channel) + c) * image_size;
            for (index_t i = 0; i < image_size; ++i) {
              out_ptr[i] = in0_ptr[i] + in1_ptr[c];
            }
          }
        }
      } else {
        std::vector<float> coeff_copy = coeff;
        if (swapped) {
          std::swap(coeff_copy[0], coeff_copy[1]);
        }
#pragma omp parallel for collapse(2)
        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
            float *out_ptr = output + ((b * channel) + c) * image_size;
            for (index_t i = 0; i < image_size; ++i) {
              out_ptr[i] =
                  in0_ptr[i] * coeff_copy[0] + in1_ptr[c] * coeff_copy[1];
            }
          }
        }
      }
      break;
    case SUB:
      if (!swapped) {
#pragma omp parallel for collapse(2)
        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
            float *out_ptr = output + ((b * channel) + c) * image_size;
            for (index_t i = 0; i < image_size; ++i) {
              out_ptr[i] = in0_ptr[i] - in1_ptr[c];
            }
          }
        }
      } else {
#pragma omp parallel for collapse(2)
        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
            float *out_ptr = output + ((b * channel) + c) * image_size;
            for (index_t i = 0; i < image_size; ++i) {
              out_ptr[i] = in1_ptr[c] - in0_ptr[i];
            }
          }
        }
      }
      break;
    case PROD:
#pragma omp parallel for collapse(2)
      for (index_t b = 0; b < batch0; ++b) {
        for (index_t c = 0; c < channel; ++c) {
          const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
          const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
          float *out_ptr = output + ((b * channel) + c) * image_size;
          for (index_t i = 0; i < image_size; ++i) {
            out_ptr[i] = in0_ptr[i] * in1_ptr[c];
          }
        }
      }
      break;
    case DIV:
      if (!swapped) {
#pragma omp parallel for collapse(2)
        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
            float *out_ptr = output + ((b * channel) + c) * image_size;
            for (index_t i = 0; i < image_size; ++i) {
              out_ptr[i] = in0_ptr[i] / in1_ptr[c];
            }
          }
        }
      } else {
#pragma omp parallel for collapse(2)
        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
            float *out_ptr = output + ((b * channel) + c) * image_size;
            for (index_t i = 0; i < image_size; ++i) {
              out_ptr[i] = in1_ptr[c] / in0_ptr[i];
            }
          }
        }
      }
      break;
    case MIN:
#pragma omp parallel for collapse(2)
      for (index_t b = 0; b < batch0; ++b) {
        for (index_t c = 0; c < channel; ++c) {
          const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
          const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
          float *out_ptr = output + ((b * channel) + c) * image_size;
          for (index_t i = 0; i < image_size; ++i) {
            out_ptr[i] = std::min(in0_ptr[i], in1_ptr[c]);
          }
        }
      }
      break;
    case MAX:
#pragma omp parallel for collapse(2)
      for (index_t b = 0; b < batch0; ++b) {
        for (index_t c = 0; c < channel; ++c) {
          const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
          const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
          float *out_ptr = output + ((b * channel) + c) * image_size;
          for (index_t i = 0; i < image_size; ++i) {
            out_ptr[i] = std::max(in0_ptr[i], in1_ptr[c]);
          }
        }
      }
      break;
    case SQR_DIFF:
#pragma omp parallel for collapse(2)
      for (index_t b = 0; b < batch0; ++b) {
        for (index_t c = 0; c < channel; ++c) {
          const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
          const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
          float *out_ptr = output + ((b * channel) + c) * image_size;
          for (index_t i = 0; i < image_size; ++i) {
            out_ptr[i] = std::pow(in0_ptr[i] - in1_ptr[c], 2.f);
          }
        }
      }
      break;
    case POW:
      if (!swapped) {
#pragma omp parallel for collapse(2)
        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
            float *out_ptr = output + ((b * channel) + c) * image_size;
            for (index_t i = 0; i < image_size; ++i) {
              out_ptr[i] = std::pow(in0_ptr[i], in1_ptr[c]);
            }
          }
        }
      } else {
#pragma omp parallel for collapse(2)
        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
            float *out_ptr = output + ((b * channel) + c) * image_size;
            for (index_t i = 0; i < image_size; ++i) {
              out_ptr[i] = std::pow(in1_ptr[c], in0_ptr[i]);
            }
          }
        }
      }
      break;
    case NEG:
#pragma omp parallel for
      for (index_t i = 0; i < batch0 * channel * image_size; ++i) {
        output[i] = -input0[i];
      }
      break;
    case ABS:
#pragma omp parallel for
      for (index_t i = 0; i < batch0 * channel * image_size; ++i) {
        output[i] = std::fabs(input0[i]);
      }
      break;
    default:
@@ -444,95 +603,109 @@ inline void TensorEltwise(const EltwiseType type,
  }
}
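// Example (hypothetical values, for illustration only): per-channel PROD on
// an NCHW tensor of shape {1, 2, 1, 2} against a 2-element channel vector;
// when batch1 > 1, input1 instead supplies a separate vector per batch.
//   float a[4] = {1, 2, 3, 4};    // batch 1, 2 channels, image_size 2
//   float scale[2] = {10, 100};   // one multiplier per channel
//   std::vector<float> out(4);
//   TensorEltwisePerChannel(PROD, a, scale, {}, /*batch0=*/1, /*batch1=*/1,
//                           /*channel=*/2, /*image_size=*/2,
//                           /*swapped=*/false, out.data());
//   // out == {10, 20, 300, 400}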
struct EltwiseFunctorBase {
  EltwiseFunctorBase(const EltwiseType type,
                     const std::vector<float> &coeff,
                     const float value,
                     const DataFormat data_format)
      : type_(type), coeff_(coeff), value_(value), data_format_(data_format) {}

  EltwiseType type_;
  std::vector<float> coeff_;
  float value_;
  DataFormat data_format_;
};

template <DeviceType D, typename T>
struct EltwiseFunctor;

template <>
struct EltwiseFunctor<DeviceType::CPU, float> : EltwiseFunctorBase {
  EltwiseFunctor(const EltwiseType type,
                 const std::vector<float> &coeff,
                 const float value,
                 const DataFormat data_format)
      : EltwiseFunctorBase(type, coeff, value, data_format) {}

  MaceStatus operator()(const Tensor *input0,
                        const Tensor *input1,
                        Tensor *output,
                        StatsFuture *future) {
    MACE_UNUSED(future);

    if (input1 == nullptr) {
      scalar_tensor_.Resize({});
      Tensor::MappingGuard guard(&scalar_tensor_);
      auto scalar_data = scalar_tensor_.mutable_data<float>();
      scalar_data[0] = value_;
      input1 = &scalar_tensor_;
    }

    bool swapped = false;
    if (input0->size() < input1->size()) {
      std::swap(input0, input1);
      swapped = true;
    }

    // check if we can broadcast tensor
    uint32_t rank_diff =
        static_cast<uint32_t>(input0->dim_size() - input1->dim_size());
    if (data_format_ == NCHW) {
      MACE_CHECK(
          input0->dim_size() == 4 &&
              (input1->dim_size() == 0 ||
               input1->dim_size() == 4 && input1->dim(1) == input0->dim(1) &&
                   (input1->dim(0) == input0->dim(0) ||
                    input1->dim(0) == 1) ||
               input1->dim_size() == 1 && input1->dim(0) == input0->dim(1)),
          "only support broadcast channel dimension");
    } else {
      if (rank_diff > 0 && rank_diff < input0->dim_size()) {
        for (uint32_t i = 0; i < input1->dim_size(); ++i) {
          MACE_CHECK(input0->dim(rank_diff + i) == input1->dim(i),
                     "Element-Wise op only support tail dimensions broadcast");
        }
      }
    }

    index_t common_size = input1->size();
    index_t diff_size = input0->size() / common_size;

    MACE_RETURN_IF_ERROR(output->ResizeLike(input0));

    Tensor::MappingGuard input0_guard(input0);
    Tensor::MappingGuard input1_guard(input1);
    Tensor::MappingGuard output_guard(output);

    const float *input0_ptr = input0->data<float>();
    const float *input1_ptr = input1->data<float>();
    float *output_ptr = output->mutable_data<float>();

    if (data_format_ == NCHW && input1->dim_size() > 0 &&
        input1->size() < input0->size()) {
      TensorEltwisePerChannel(
          type_, input0_ptr, input1_ptr, coeff_, input0->dim(0),
          input1->dim_size() == 1 ? 1 : input1->dim(0), input0->dim(1),
          input0->dim(2) * input0->dim(3), swapped, output_ptr);
    } else {
      if (input1->size() == input0->size()) {
        TensorEltwise(type_, input0_ptr, input1_ptr, coeff_, input0->size(),
                      swapped, output_ptr);
      } else if (input1->size() < input0->size()) {
        if (input1->size() > 1) {
          TensorBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_,
                                 diff_size, common_size, swapped, output_ptr);
        } else {
          TensorScalarEltwise(type_, input0_ptr, input1_ptr[0], coeff_,
                              input0->size(), swapped, output_ptr);
        }
      }
    }

    return MACE_SUCCESS;
  }

  Tensor scalar_tensor_;
};
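// Dispatch summary for the functor above, as implemented: the larger tensor
// always becomes input0 (with `swapped` recorded so SUB/DIV/POW can restore
// operand order). An NCHW input1 with a non-scalar shape takes the
// per-channel path; otherwise equal sizes use TensorEltwise, a one-element
// input1 uses TensorScalarEltwise, and the remaining tail-dimension
// broadcasts use TensorBroadcastEltwise.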
#ifdef MACE_ENABLE_OPENCL
@@ -540,13 +713,14 @@ template <typename T>
struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
  EltwiseFunctor(const EltwiseType type,
                 const std::vector<float> &coeff,
                 const float value,
                 const DataFormat data_format)
      : EltwiseFunctorBase(type, coeff, value, data_format) {}

  MaceStatus operator()(const Tensor *input0,
                        const Tensor *input1,
                        Tensor *output,
                        StatsFuture *future);

  cl::Kernel kernel_;
  uint32_t kwg_size_;
...
@@ -42,48 +42,79 @@ struct SoftmaxFunctor<DeviceType::CPU, float> {
                        Tensor *output,
                        StatsFuture *future) {
    MACE_UNUSED(future);

    Tensor::MappingGuard input_guard(input);
    Tensor::MappingGuard output_guard(output);
    const float *input_data = input->data<float>();
    float *output_data = output->mutable_data<float>();

    // softmax for nchw image
    if (input->dim_size() == 4) {
      const index_t batch = input->dim(0);
      const index_t class_count = input->dim(1);
      const index_t class_size = input->dim(2) * input->dim(3);
      const index_t batch_size = class_count * class_size;

      for (index_t b = 0; b < batch; ++b) {
#pragma omp parallel for
        for (index_t k = 0; k < class_size; ++k) {
          const float *input_ptr = input_data + b * batch_size + k;
          float *output_ptr = output_data + b * batch_size + k;

          float max_val = std::numeric_limits<float>::lowest();
          index_t channel_offset = 0;
          for (index_t c = 0; c < class_count; ++c) {
            float data = input_ptr[channel_offset];
            if (data > max_val) {
              max_val = data;
            }
            channel_offset += class_size;
          }

          channel_offset = 0;
          float sum = 0;
          for (index_t c = 0; c < class_count; ++c) {
            float exp_value = ::exp(input_ptr[channel_offset] - max_val);
            sum += exp_value;
            output_ptr[channel_offset] = exp_value;
            channel_offset += class_size;
          }

          sum = std::max(sum, std::numeric_limits<float>::min());
          channel_offset = 0;
          for (index_t c = 0; c < class_count; ++c) {
            output_ptr[channel_offset] /= sum;
            channel_offset += class_size;
          }
        }  // k
      }    // b
    } else if (input->dim_size() == 2) {  // normal 2d softmax
      const index_t class_size = input->dim(0);
      const index_t class_count = input->dim(1);
#pragma omp parallel for
      for (index_t k = 0; k < class_size; ++k) {
        const float *input_ptr = input_data + k * class_count;
        float *output_ptr = output_data + k * class_count;

        float max_val = std::numeric_limits<float>::lowest();
        for (index_t c = 0; c < class_count; ++c) {
          max_val = std::max(max_val, input_ptr[c]);
        }

        float sum = 0;
        for (index_t c = 0; c < class_count; ++c) {
          float exp_value = ::exp(input_ptr[c] - max_val);
          sum += exp_value;
          output_ptr[c] = exp_value;
        }

        sum = std::max(sum, std::numeric_limits<float>::min());
        for (index_t c = 0; c < class_count; ++c) {
          output_ptr[c] /= sum;
        }
      }
    } else {
      MACE_NOT_IMPLEMENTED;
    }

    return MACE_SUCCESS;
  }
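  // Both branches above compute, for each class vector x:
  //   softmax(x)_c = exp(x_c - max(x)) / sum_j exp(x_j - max(x)),
  // where subtracting max(x) guards against overflow and the denominator is
  // clamped to std::numeric_limits<float>::min() to avoid division by zero.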
...
@@ -30,7 +30,9 @@ class EltwiseOp : public Operator<D, T> {
            static_cast<kernels::EltwiseType>(OperatorBase::GetOptionalArg<int>(
                "type", static_cast<int>(kernels::EltwiseType::NONE))),
            OperatorBase::GetRepeatedArgs<float>("coeff"),
            OperatorBase::GetOptionalArg<float>("value", 1.0),
            static_cast<DataFormat>(OperatorBase::GetOptionalArg<int>(
                "data_format", 0))) {}

  MaceStatus Run(StatsFuture *future) override {
    const Tensor *input0 = this->Input(0);
...
@@ -41,6 +41,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
        .Input("TInput")
        .AddIntArg("type", static_cast<int>(type))
        .AddFloatArg("value", x)
        .AddIntArg("data_format", DataFormat::NCHW)
        .Output("TOutput")
        .Finalize(net.NewOperatorDef());

  // Run
@@ -84,15 +85,24 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
  net.AddInputFromArray<D, float>("Input1", shape1, input1);

  if (D == DeviceType::CPU) {
    auto op_builder = OpDefBuilder("Eltwise", "EltwiseTest")
                          .AddIntArg("type", static_cast<int>(type))
                          .AddFloatsArg("coeff", coeff)
                          .AddIntArg("data_format", DataFormat::NCHW)
                          .Output("TOutput");
    if (shape0.size() > 1) {
      net.TransformDataFormat<D, float>("Input0", NHWC, "TInput0", NCHW);
      op_builder.Input("TInput0");
    } else {
      op_builder.Input("Input0");
    }
    if (shape1.size() > 1) {
      net.TransformDataFormat<D, float>("Input1", NHWC, "TInput1", NCHW);
      op_builder.Input("TInput1");
    } else {
      op_builder.Input("Input1");
    }
    op_builder.Finalize(net.NewOperatorDef());

    // Run
    net.RunOp(D);
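    // Note on the conditional inputs above: rank-1 tensors are now fed to the
    // op directly, while only multi-dimensional inputs go through the
    // NHWC -> NCHW transform before the op runs.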
@@ -214,6 +224,35 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
      kernels::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {3},
      {1, 2, 3}, {2, 4, 6, 5, 7, 9});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::SUB, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::PROD, {3}, {1, 2, 3}, {1, 2, 1, 3},
      {1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::DIV, {5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
      {1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::MIN, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
      {5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
  SimpleTensorEltwise<DeviceType::CPU, float>(
      kernels::EltwiseType::SQR_DIFF, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
}

TEST_F(EltwiseOpTest, GPUSimpleTensorVector) {
@@ -322,6 +361,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
        .Input("TInput")
        .AddIntArg("type", static_cast<int>(type))
        .AddFloatArg("value", 0.1)
        .AddIntArg("data_format", DataFormat::NCHW)
        .Output("TOutput")
        .Finalize(net.NewOperatorDef());

  // Run
@@ -375,6 +415,7 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
        .Input("TInput1")
        .AddIntArg("type", static_cast<int>(type))
        .AddFloatsArg("coeff", coeff)
        .AddIntArg("data_format", DataFormat::NCHW)
        .Output("TOutput")
        .Finalize(net.NewOperatorDef());
...
@@ -29,8 +29,12 @@ void Simple() {
  // Add input data
  net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4},
                                  {1, 1, 1, 1, 1, 2, 3, 4});
  auto expected = CreateTensor<float>(
      {1, 1, 2, 4},
      {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});

  if (D == DeviceType::CPU) {
    // test 4d softmax
    net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
    OpDefBuilder("Softmax", "SoftmaxTest")
        .Input("InputNCHW")
@@ -40,6 +44,21 @@ void Simple() {
    // Run
    net.RunOp(D);
    net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);

    // check 2d softmax
    net.AddInputFromArray<D, float>("Input2d", {2, 4},
                                    {1, 1, 1, 1, 1, 2, 3, 4});
    OpDefBuilder("Softmax", "SoftmaxTest")
        .Input("Input2d")
        .Output("Output")
        .Finalize(net.NewOperatorDef());

    // Run
    net.RunOp(D);
    net.GetOutput("Output")->Reshape({1, 1, 2, 4});
    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
  } else if (D == DeviceType::GPU) {
    BufferToImage<D, float>(&net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);
@@ -55,15 +74,11 @@ void Simple() {
    // Transfer output
    ImageToBuffer<D, float>(&net, "OutputImage", "Output",
                            kernels::BufferType::IN_OUT_CHANNEL);
    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
  } else {
    MACE_NOT_IMPLEMENTED;
  }
}
}  // namespace
...