From 46baf92a9d243347fa5dc511db79e0e71f756cd0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?=
Date: Tue, 5 Jun 2018 15:28:05 +0800
Subject: [PATCH] Refactor eltwise

---
 mace/kernels/eltwise.h   | 780 ++++++++++++++++++++++++---------------
 mace/kernels/softmax.h   |  77 ++--
 mace/ops/eltwise.h       |   4 +-
 mace/ops/eltwise_test.cc |  55 ++-
 mace/ops/softmax_test.cc |  27 +-
 5 files changed, 603 insertions(+), 340 deletions(-)

diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h
index 44a60e38..82418618 100644
--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -17,8 +17,8 @@
 
 #include <algorithm>
 #include <memory>
-#include <vector>
 #include <utility>
+#include <vector>
 
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
@@ -44,70 +44,253 @@ enum EltwiseType {
   NONE = 10,
 };
 
-inline void TensorScalar(const EltwiseType type,
-                         const float *input0,
-                         const float value,
-                         const index_t size,
-                         float *output) {
+inline void TensorBroadcastEltwise(const EltwiseType type,
+                                   const float *input0,
+                                   const float *input1,
+                                   const std::vector<float> &coeff,
+                                   const index_t diff_size,
+                                   const index_t common_size,
+                                   const bool swapped,
+                                   float *output) {
   switch (type) {
     case SUM:
+      if (coeff.empty()) {
+#pragma omp parallel for collapse(2)
+        for (index_t d = 0; d < diff_size; ++d) {
+          for (index_t i = 0; i < common_size; ++i) {
+            output[i + d * common_size] =
+                input0[i + d * common_size] + input1[i];
+          }
+        }
+      } else {
+        std::vector<float> coeff_copy = coeff;
+        if (swapped) {
+          std::swap(coeff_copy[0], coeff_copy[1]);
+        }
+#pragma omp parallel for collapse(2)
+        for (index_t d = 0; d < diff_size; ++d) {
+          for (index_t i = 0; i < common_size; ++i) {
+            output[i + d * common_size] =
+                input0[i + d * common_size] * coeff_copy[0] +
+                input1[i] * coeff_copy[1];
+          }
+        }
+      }
+      break;
+    case SUB:
+      if (!swapped) {
+#pragma omp parallel for collapse(2)
+        for (index_t d = 0; d < diff_size; ++d) {
+          for (index_t i = 0; i < common_size; ++i) {
+            output[i + d * common_size] =
+                input0[i + d * common_size] - input1[i];
+          }
+        }
+      } else {
+#pragma omp parallel for collapse(2)
+        for (index_t d = 0; d < diff_size; ++d) {
+          for (index_t i = 0; i < common_size; ++i) {
+            output[i + d * common_size] =
+                input1[i] - input0[i + d * common_size];
+          }
+        }
+      }
+      break;
+    case PROD:
+#pragma omp parallel for collapse(2)
+      for (index_t d = 0; d < diff_size; ++d) {
+        for (index_t i = 0; i < common_size; ++i) {
+          output[i + d * common_size] = input0[i + d * common_size] * input1[i];
+        }
+      }
+      break;
+    case DIV:
+      if (!swapped) {
+#pragma omp parallel for collapse(2)
+        for (index_t d = 0; d < diff_size; ++d) {
+          for (index_t i = 0; i < common_size; ++i) {
+            output[i + d * common_size] =
+                input0[i + d * common_size] / input1[i];
+          }
+        }
+      } else {
+#pragma omp parallel for collapse(2)
+        for (index_t d = 0; d < diff_size; ++d) {
+          for (index_t i = 0; i < common_size; ++i) {
+            output[i + d * common_size] =
+                input1[i] / input0[i + d * common_size];
+          }
+        }
+      }
+      break;
+    case MIN:
+#pragma omp parallel for collapse(2)
+      for (index_t d = 0; d < diff_size; ++d) {
+        for (index_t i = 0; i < common_size; ++i) {
+          output[i + d * common_size] =
+              std::min(input0[i + d * common_size], input1[i]);
+        }
+      }
+      break;
+    case MAX:
+#pragma omp parallel for collapse(2)
+      for (index_t d = 0; d < diff_size; ++d) {
+        for (index_t i = 0; i < common_size; ++i) {
+          output[i + d * common_size] =
+              std::max(input0[i + d * common_size], input1[i]);
+        }
+      }
+      break;
+    case SQR_DIFF:
+#pragma omp parallel for collapse(2)
+      for (index_t d = 0; d < diff_size; ++d) {
+        for (index_t i = 0; i < common_size; ++i) {
+          output[i + d * common_size] =
+              std::pow(input0[i + d * common_size] - input1[i], 2.f);
+        }
+      }
+      break;
+    case POW:
+      if (!swapped) {
+#pragma omp parallel for collapse(2)
+        for (index_t d = 0; d < diff_size; ++d) {
+          for (index_t i = 0; i < common_size; ++i) {
+            output[i + d * common_size] =
+                std::pow(input0[i + d * common_size], input1[i]);
+          }
+        }
+      } else {
+#pragma omp parallel for collapse(2)
+        for (index_t d = 0; d < diff_size; ++d) {
+          for (index_t i = 0; i < common_size; ++i) {
+            output[i + d * common_size] =
+                std::pow(input1[i], input0[i + d * common_size]);
+          }
+        }
+      }
+      break;
+    case NEG:
 #pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = input0[i] + value;
+      for (index_t i = 0; i < diff_size * common_size; ++i) {
+        output[i] = -input0[i];
+      }
+      break;
+    case ABS:
+#pragma omp parallel for
+      for (index_t i = 0; i < diff_size * common_size; ++i) {
+        output[i] = std::fabs(input0[i]);
+      }
+      break;
+    default:
+      LOG(FATAL) << "Eltwise op not support type " << type;
+  }
+}
+
+// Multiplication is costly, so we specialize the following case.
+inline void TensorEltwise(const EltwiseType type,
+                          const float *input0,
+                          const float *input1,
+                          const std::vector<float> &coeff,
+                          const index_t size,
+                          const bool swapped,
+                          float *output) {
+  switch (type) {
+    case SUM:
+      if (coeff.empty()) {
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input0[i] + input1[i];
+        }
+
+      } else {
+        std::vector<float> coeff_copy = coeff;
+        if (swapped) {
+          std::swap(coeff_copy[0], coeff_copy[1]);
+        }
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1];
+        }
       }
       break;
     case SUB:
+      if (!swapped) {
 #pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = input0[i] - value;
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input0[i] - input1[i];
+        }
+
+      } else {
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input1[i] - input0[i];
+        }
       }
       break;
     case PROD:
 #pragma omp parallel for
       for (index_t i = 0; i < size; ++i) {
-        output[i] = input0[i] * value;
+        output[i] = input0[i] * input1[i];
       }
+
       break;
     case DIV:
+      if (!swapped) {
 #pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = input0[i] / value;
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input0[i] / input1[i];
+        }
+
+      } else {
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input1[i] / input0[i];
+        }
       }
       break;
     case MIN:
 #pragma omp parallel for
       for (index_t i = 0; i < size; ++i) {
-        output[i] = std::min(input0[i], value);
+        output[i] = std::min(input0[i], input1[i]);
       }
+
       break;
     case MAX:
 #pragma omp parallel for
       for (index_t i = 0; i < size; ++i) {
-        output[i] = std::max(input0[i], value);
+        output[i] = std::max(input0[i], input1[i]);
      }
+
       break;
-    case NEG:
+    case SQR_DIFF:
 #pragma omp parallel for
       for (index_t i = 0; i < size; ++i) {
-        output[i] = -input0[i];
+        output[i] = std::pow(input0[i] - input1[i], 2.f);
       }
+
       break;
-    case ABS:
+    case POW:
+      if (!swapped) {
 #pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = std::abs(input0[i]);
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = std::pow(input0[i], input1[i]);
+        }
+      } else {
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = std::pow(input1[i], input0[i]);
+        }
       }
       break;
-    case SQR_DIFF:
+    case NEG:
 #pragma omp parallel for
       for (index_t i = 0; i < size; ++i) {
-        output[i] = std::pow(input0[i] - value, 2.f);
+        output[i] = -input0[i];
       }
       break;
-    case POW:
+    case ABS:
 #pragma omp parallel for
       for (index_t i = 0; i < size; ++i) {
-        output[i] = std::pow(input0[i], value);
+        output[i] = std::fabs(input0[i]);
       }
       break;
     default:
@@ -115,328 +298,304 @@ inline void TensorScalar(const EltwiseType type,
   }
 }
 
-inline void TensorBatchVector(const EltwiseType type,
-                              const float *input0,
-                              const float *input1,
-                              const index_t batch,
-                              const index_t channel,
-                              const index_t hw,
-                              const bool swapped,
-                              float *output) {
+// Multiplication is costly, so we specialize the following case.
+inline void TensorScalarEltwise(const EltwiseType type,
+                                const float *input0,
+                                const float input1,
+                                const std::vector<float> &coeff,
+                                const index_t size,
+                                const bool swapped,
+                                float *output) {
   switch (type) {
     case SUM:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
-        for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = b * channel + c;
-            output[idx0] = input0[idx0] + input1[idx1];
-          }
+      if (coeff.empty()) {
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input0[i] + input1;
+        }
+
+      } else {
+        std::vector<float> coeff_copy = coeff;
+        if (swapped) {
+          std::swap(coeff_copy[0], coeff_copy[1]);
+        }
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1];
         }
       }
       break;
     case SUB:
-      if (swapped) {
-#pragma omp parallel for collapse(3)
-        for (index_t b = 0; b < batch; ++b) {
-          for (index_t c = 0; c < channel; ++c) {
-            for (index_t i = 0; i < hw; ++i) {
-              const index_t idx0 = (b * channel + c) * hw + i;
-              const index_t idx1 = b * channel + c;
-              output[idx0] = input1[idx1] - input0[idx0];
-            }
-          }
+      if (!swapped) {
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input0[i] - input1;
         }
+
       } else {
-#pragma omp parallel for collapse(3)
-        for (index_t b = 0; b < batch; ++b) {
-          for (index_t c = 0; c < channel; ++c) {
-            for (index_t i = 0; i < hw; ++i) {
-              const index_t idx0 = (b * channel + c) * hw + i;
-              const index_t idx1 = b * channel + c;
-              output[idx0] = input0[idx0] - input1[idx1];
-            }
-          }
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input1 - input0[i];
         }
       }
       break;
     case PROD:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
-        for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = b * channel + c;
-            output[idx0] = input0[idx0] * input1[idx1];
-          }
-        }
+#pragma omp parallel for
+      for (index_t i = 0; i < size; ++i) {
+        output[i] = input0[i] * input1;
       }
+
       break;
     case DIV:
-      if (swapped) {
-#pragma omp parallel for collapse(3)
-        for (index_t b = 0; b < batch; ++b) {
-          for (index_t c = 0; c < channel; ++c) {
-            for (index_t i = 0; i < hw; ++i) {
-              const index_t idx0 = (b * channel + c) * hw + i;
-              const index_t idx1 = b * channel + c;
-              output[idx0] = input1[idx1] / input0[idx0];
-            }
-          }
+      if (!swapped) {
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input0[i] / input1;
        }
+
       } else {
-#pragma omp parallel for collapse(3)
-        for (index_t b = 0; b < batch; ++b) {
-          for (index_t c = 0; c < channel; ++c) {
-            for (index_t i = 0; i < hw; ++i) {
-              const index_t idx0 = (b * channel + c) * hw + i;
-              const index_t idx1 = b * channel + c;
-              output[idx0] = input0[idx0] / input1[idx1];
-            }
-          }
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = input1 / input0[i];
         }
       }
       break;
     case MIN:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
-        for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = b * channel + c;
-            output[idx0] = std::min(input0[idx0], input1[idx1]);
-          }
-        }
+#pragma omp parallel for
+      for (index_t i = 0; i < size; ++i) {
+        output[i] = std::min(input0[i], input1);
       }
+
       break;
     case MAX:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
-        for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = b * channel + c;
-            output[idx0] = std::max(input0[idx0], input1[idx1]);
-          }
-        }
+#pragma omp parallel for
+      for (index_t i = 0; i < size; ++i) {
+        output[i] = std::max(input0[i], input1);
       }
+
       break;
     case SQR_DIFF:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
-        for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = b * channel + c;
-            output[idx0] = std::pow(input0[idx0] - input1[idx1], 2.f);
-          }
-        }
+#pragma omp parallel for
+      for (index_t i = 0; i < size; ++i) {
+        output[i] = std::pow(input0[i] - input1, 2.f);
       }
+
       break;
     case POW:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
-        for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = b * channel + c;
-            output[idx0] = std::pow(input0[idx0], input1[idx1]);
-          }
+      if (!swapped) {
+#pragma omp parallel for
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = std::pow(input0[i], input1);
         }
+      } else {
+        for (index_t i = 0; i < size; ++i) {
+          output[i] = std::pow(input1, input0[i]);
+        }
+      }
+      break;
+    case NEG:
+#pragma omp parallel for
+      for (index_t i = 0; i < size; ++i) {
+        output[i] = -input0[i];
+      }
+      break;
+    case ABS:
+#pragma omp parallel for
+      for (index_t i = 0; i < size; ++i) {
+        output[i] = std::fabs(input0[i]);
       }
       break;
     default:
       LOG(FATAL) << "Eltwise op not support type " << type;
   }
 }
-inline void TensorVector(const EltwiseType type,
-                         const float *input0,
-                         const float *input1,
-                         const index_t batch,
-                         const index_t channel,
-                         const index_t hw,
-                         const bool swapped,
-                         float *output) {
+
+inline void TensorEltwisePerChannel(const EltwiseType type,
+                                    const float *input0,
+                                    const float *input1,
+                                    const std::vector<float> &coeff,
+                                    const index_t batch0,
+                                    const index_t batch1,
+                                    const index_t channel,
+                                    const index_t image_size,
+                                    const bool swapped,
+                                    float *output) {
   switch (type) {
     case SUM:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
-        for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = c;
-            output[idx0] = input0[idx0] + input1[idx1];
+      if (coeff.empty()) {
+#pragma omp parallel for collapse(2)
+        for (index_t b = 0; b < batch0; ++b) {
+          for (index_t c = 0; c < channel; ++c) {
+            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+            float *out_ptr = output + ((b * channel) + c) * image_size;
+            for (index_t i = 0; i < image_size; ++i) {
+              out_ptr[i] = in0_ptr[i] + in1_ptr[c];
+            }
+          }
+        }
+      } else {
+        std::vector<float> coeff_copy = coeff;
+        if (swapped) {
+          std::swap(coeff_copy[0], coeff_copy[1]);
+        }
+#pragma omp parallel for collapse(2)
+        for (index_t b = 0; b < batch0; ++b) {
+          for (index_t c = 0; c < channel; ++c) {
+            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+            float *out_ptr = output + ((b * channel) + c) * image_size;
+            for (index_t i = 0; i < image_size; ++i) {
+              out_ptr[i] =
+                  in0_ptr[i] * coeff_copy[0] + in1_ptr[c] * coeff_copy[1];
+            }
           }
         }
       }
       break;
     case SUB:
-      if (swapped) {
-#pragma omp parallel for collapse(3)
-        for (index_t b = 0; b < batch; ++b) {
+      if (!swapped) {
+#pragma omp parallel for collapse(2)
+        for (index_t b = 0; b < batch0; ++b) {
           for (index_t c = 0; c < channel; ++c) {
-            for (index_t i = 0; i < hw; ++i) {
-              const index_t idx0 = (b * channel + c) * hw + i;
-              const index_t idx1 = c;
-              output[idx0] = input1[idx1] - input0[idx0];
+            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+            float *out_ptr = output + ((b * channel) + c) * image_size;
+            for (index_t i = 0; i < image_size; ++i) {
+              out_ptr[i] = in0_ptr[i] - in1_ptr[c];
             }
           }
         }
       } else {
-#pragma omp parallel for collapse(3)
-        for (index_t b = 0; b < batch; ++b) {
+#pragma omp parallel for collapse(2)
+        for (index_t b = 0; b < batch0; ++b) {
           for (index_t c = 0; c < channel; ++c) {
-            for (index_t i = 0; i < hw; ++i) {
-              const index_t idx0 = (b * channel + c) * hw + i;
-              const index_t idx1 = c;
-              output[idx0] = input0[idx0] - input1[idx1];
+            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+            float *out_ptr = output + ((b * channel) + c) * image_size;
+            for (index_t i = 0; i < image_size; ++i) {
+              out_ptr[i] = in1_ptr[c] - in0_ptr[i];
             }
           }
         }
       }
       break;
     case PROD:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
+#pragma omp parallel for collapse(2)
+      for (index_t b = 0; b < batch0; ++b) {
         for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = c;
-            output[idx0] = input0[idx0] * input1[idx1];
+          const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+          const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+          float *out_ptr = output + ((b * channel) + c) * image_size;
+          for (index_t i = 0; i < image_size; ++i) {
+            out_ptr[i] = in0_ptr[i] * in1_ptr[c];
           }
         }
       }
       break;
     case DIV:
-      if (swapped) {
-#pragma omp parallel for collapse(3)
-        for (index_t b = 0; b < batch; ++b) {
+      if (!swapped) {
+#pragma omp parallel for collapse(2)
+        for (index_t b = 0; b < batch0; ++b) {
          for (index_t c = 0; c < channel; ++c) {
-            for (index_t i = 0; i < hw; ++i) {
-              const index_t idx0 = (b * channel + c) * hw + i;
-              const index_t idx1 = c;
-              output[idx0] = input1[idx1] / input0[idx0];
+            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+            float *out_ptr = output + ((b * channel) + c) * image_size;
+            for (index_t i = 0; i < image_size; ++i) {
+              out_ptr[i] = in0_ptr[i] / in1_ptr[c];
             }
           }
        }
      } else {
-#pragma omp parallel for collapse(3)
-        for (index_t b = 0; b < batch; ++b) {
+#pragma omp parallel for collapse(2)
+        for (index_t b = 0; b < batch0; ++b) {
           for (index_t c = 0; c < channel; ++c) {
-            for (index_t i = 0; i < hw; ++i) {
-              const index_t idx0 = (b * channel + c) * hw + i;
-              const index_t idx1 = c;
-              output[idx0] = input0[idx0] / input1[idx1];
+            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+            float *out_ptr = output + ((b * channel) + c) * image_size;
+            for (index_t i = 0; i < image_size; ++i) {
+              out_ptr[i] = in1_ptr[c] / in0_ptr[i];
            }
          }
         }
       }
       break;
     case MIN:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
+#pragma omp parallel for collapse(2)
+      for (index_t b = 0; b < batch0; ++b) {
         for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = c;
-            output[idx0] = std::min(input0[idx0], input1[idx1]);
+          const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+          const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+          float *out_ptr = output + ((b * channel) + c) * image_size;
+          for (index_t i = 0; i < image_size; ++i) {
+            out_ptr[i] = std::min(in0_ptr[i], in1_ptr[c]);
           }
         }
       }
       break;
     case MAX:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
+#pragma omp parallel for collapse(2)
+      for (index_t b = 0; b < batch0; ++b) {
         for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = c;
-            output[idx0] = std::max(input0[idx0], input1[idx1]);
+          const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+          const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+          float *out_ptr = output + ((b * channel) + c) * image_size;
+          for (index_t i = 0; i < image_size; ++i) {
+            out_ptr[i] = std::max(in0_ptr[i], in1_ptr[c]);
           }
         }
       }
       break;
     case SQR_DIFF:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
+#pragma omp parallel for collapse(2)
+      for (index_t b = 0; b < batch0; ++b) {
         for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = c;
-            output[idx0] = std::pow(input0[idx0] - input1[idx1], 2.f);
+          const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+          const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+          float *out_ptr = output + ((b * channel) + c) * image_size;
+          for (index_t i = 0; i < image_size; ++i) {
+            out_ptr[i] = std::pow(in0_ptr[i] - in1_ptr[c], 2.f);
           }
         }
       }
       break;
     case POW:
-#pragma omp parallel for collapse(3)
-      for (index_t b = 0; b < batch; ++b) {
-        for (index_t c = 0; c < channel; ++c) {
-          for (index_t i = 0; i < hw; ++i) {
-            const index_t idx0 = (b * channel + c) * hw + i;
-            const index_t idx1 = c;
-            output[idx0] = std::pow(input0[idx0], input1[idx1]);
+      if (!swapped) {
+#pragma omp parallel for collapse(2)
+        for (index_t b = 0; b < batch0; ++b) {
+          for (index_t c = 0; c < channel; ++c) {
+            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+            float *out_ptr = output + ((b * channel) + c) * image_size;
+            for (index_t i = 0; i < image_size; ++i) {
+              out_ptr[i] = std::pow(in0_ptr[i], in1_ptr[c]);
+            }
+          }
+        }
+      } else {
+#pragma omp parallel for collapse(2)
+        for (index_t b = 0; b < batch0; ++b) {
+          for (index_t c = 0; c < channel; ++c) {
+            const float *in0_ptr = input0 + ((b * channel) + c) * image_size;
+            const float *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0);
+            float *out_ptr = output + ((b * channel) + c) * image_size;
+            for (index_t i = 0; i < image_size; ++i) {
+              out_ptr[i] = std::pow(in1_ptr[c], in0_ptr[i]);
+            }
           }
         }
       }
       break;
-    default:
-      LOG(FATAL) << "Eltwise op not support type " << type;
-  }
-}
-
-inline void TensorEltwise(const EltwiseType type,
-                          const float *input0,
-                          const float *input1,
-                          const index_t size,
-                          float *output) {
-  switch (type) {
-    case SUM:
-#pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = input0[i] + input1[i];
-      }
-      break;
-    case SUB:
-#pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = input0[i] - input1[i];
-      }
-      break;
-    case PROD:
-#pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = input0[i] * input1[i];
-      }
-      break;
-    case DIV:
-#pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = input0[i] / input1[i];
-      }
-      break;
-    case MIN:
-#pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = std::min(input0[i], input1[i]);
-      }
-      break;
-    case MAX:
-#pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = std::max(input0[i], input1[i]);
-      }
-      break;
-    case SQR_DIFF:
+    case NEG:
 #pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = std::pow(input0[i] - input1[i], 2.f);
+      for (index_t i = 0; i < batch0 * channel * image_size; ++i) {
+        output[i] = -input0[i];
       }
       break;
-    case POW:
+    case ABS:
 #pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        output[i] = std::pow(input0[i], input1[i]);
+      for (index_t i = 0; i < batch0 * channel * image_size; ++i) {
+        output[i] = std::fabs(input0[i]);
       }
       break;
     default:
@@ -444,95 +603,109 @@ inline void TensorEltwise(const EltwiseType type,
   }
 }
 
-
 struct EltwiseFunctorBase {
   EltwiseFunctorBase(const EltwiseType type,
                      const std::vector<float> &coeff,
-                     const float value)
-      : type_(type), coeff_(coeff), value_(value) {}
+                     const float value,
+                     const DataFormat data_format)
+      : type_(type), coeff_(coeff), value_(value), data_format_(data_format) {}
 
   EltwiseType type_;
   std::vector<float> coeff_;
   float value_;
+  DataFormat data_format_;
 };
 
 template <DeviceType D, typename T>
 struct EltwiseFunctor;
 
 template <>
-struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {
+struct EltwiseFunctor<DeviceType::CPU, float> : EltwiseFunctorBase {
   EltwiseFunctor(const EltwiseType type,
                  const std::vector<float> &coeff,
-                 const float value)
-      : EltwiseFunctorBase(type, coeff, value) {}
+                 const float value,
+                 const DataFormat data_format)
+      : EltwiseFunctorBase(type, coeff, value, data_format) {}
 
   MaceStatus operator()(const Tensor *input0,
-                    const Tensor *input1,
-                    Tensor *output,
-                    StatsFuture *future) {
+                        const Tensor *input1,
+                        Tensor *output,
+                        StatsFuture *future) {
     MACE_UNUSED(future);
+
+    if (input1 == nullptr) {
+      scalar_tensor_.Resize({});
+      Tensor::MappingGuard guard(&scalar_tensor_);
+      auto scalar_data = scalar_tensor_.mutable_data<float>();
+      scalar_data[0] = value_;
+      input1 = &scalar_tensor_;
+    }
+
     bool swapped = false;
-    if (input1 != nullptr) {
-      MACE_CHECK(input0->dim_size() == input1->dim_size()
-                 || input0->dim_size() == 1
-                 || input1->dim_size() == 1)
-          << "Inputs of Eltwise op must be same shape";
-      if (input0->size() != input1->size()) {
-        if (input0->size() < input1->size()) {
-          std::swap(input0, input1);
-          swapped = true;
-        }
-        if (input1->dim_size() == 1) {
-          MACE_CHECK(input0->dim(1) == input1->dim(0))
-              << "Element-Wise op only support channel dimension broadcast";
-        } else {
-          MACE_CHECK((input0->dim(0) == input1->dim(0) || input1->dim(0) == 1)
-                     && input0->dim(1) == input1->dim(1)
-                     && input1->dim(2) == 1
-                     && input1->dim(3) == 1)
-              << "Element-Wise op only support channel dimension broadcast";
+    if (input0->size() < input1->size()) {
+      std::swap(input0, input1);
+      swapped = true;
+    }
+
+    // check if we can broadcast tensor
+    uint32_t rank_diff =
+        static_cast<uint32_t>(input0->dim_size() - input1->dim_size());
+    if (data_format_ == NCHW) {
+      MACE_CHECK(
+          input0->dim_size() == 4 &&
+              (input1->dim_size() == 0 ||
+               input1->dim_size() == 4 && input1->dim(1) == input0->dim(1) &&
+                   (input1->dim(0) == input0->dim(0) || input1->dim(0) == 1) ||
+               input1->dim_size() == 1 && input1->dim(0) == input0->dim(1)),
+          "only support broadcast channel dimension");
+    } else {
+      if (rank_diff > 0 && rank_diff < input0->dim_size()) {
+        for (uint32_t i = 0; i < input1->dim_size(); ++i) {
+          MACE_CHECK(input0->dim(rank_diff + i) == input1->dim(i),
+                     "Element-Wise op only support tail dimensions broadcast");
+        }
       }
     }
+
+    index_t common_size = input1->size();
+    index_t diff_size = input0->size() / common_size;
+
     MACE_RETURN_IF_ERROR(output->ResizeLike(input0));
 
     Tensor::MappingGuard input0_guard(input0);
+    Tensor::MappingGuard input1_guard(input1);
     Tensor::MappingGuard output_guard(output);
 
     const float *input0_ptr = input0->data<float>();
+    const float *input1_ptr = input1->data<float>();
     float *output_ptr = output->mutable_data<float>();
-    const index_t size = input0->size();
-    if (input1 == nullptr) {
-      TensorScalar(type_, input0_ptr, value_, size, output_ptr);
+
+    if (data_format_ == NCHW && input1->dim_size() > 0 &&
+        input1->size() < input0->size()) {
+      TensorEltwisePerChannel(
+          type_, input0_ptr, input1_ptr, coeff_, input0->dim(0),
+          input1->dim_size() == 1 ? 1 : input1->dim(0), input0->dim(1),
+          input0->dim(2) * input0->dim(3), swapped, output_ptr);
+
     } else {
-      Tensor::MappingGuard input1_guard(input1);
-
-      const float *input1_ptr = input1->data<float>();
-      if (input1->size() != input0->size()) {
-        const index_t batch = input0->dim(0);
-        const index_t channel = input0->dim(1);
-        const index_t hw = input0->dim(2) * input0->dim(3);
-        if (input1->dim(0) == 1 || input1->dim_size() == 1)
-          TensorVector(type_, input0_ptr, input1_ptr,
-                       batch, channel, hw, swapped, output_ptr);
-        else
-          TensorBatchVector(type_, input0_ptr, input1_ptr,
-                            batch, channel, hw, swapped, output_ptr);
-      } else {
-        if (!coeff_.empty() && type_ == SUM) {
-#pragma omp parallel for
-          for (index_t i = 0; i < size; ++i) {
-            output_ptr[i] = coeff_[0] * input0_ptr[i] +
-                coeff_[1] * input1_ptr[i];
-          }
+      if (input1->size() == input0->size()) {
+        TensorEltwise(type_, input0_ptr, input1_ptr, coeff_, input0->size(),
+                      swapped, output_ptr);
+      } else if (input1->size() < input0->size()) {
+        if (input1->size() > 1) {
+          TensorBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_,
+                                 diff_size, common_size, swapped, output_ptr);
        } else {
-          TensorEltwise(type_, input0_ptr, input1_ptr, size, output_ptr);
+          TensorScalarEltwise(type_, input0_ptr, input1_ptr[0], coeff_,
+                              input0->size(), swapped, output_ptr);
         }
       }
     }
 
     return MACE_SUCCESS;
   }
+
+  Tensor scalar_tensor_;
 };
 
 #ifdef MACE_ENABLE_OPENCL
@@ -540,13 +713,14 @@ template <typename T>
 struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
   EltwiseFunctor(const EltwiseType type,
                  const std::vector<float> &coeff,
-                 const float value)
-      : EltwiseFunctorBase(type, coeff, value) {}
+                 const float value,
+                 const DataFormat data_format)
+      : EltwiseFunctorBase(type, coeff, value, data_format) {}
 
   MaceStatus operator()(const Tensor *input0,
-                    const Tensor *input1,
-                    Tensor *output,
-                    StatsFuture *future);
+                        const Tensor *input1,
+                        Tensor *output,
+                        StatsFuture *future);
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h
index ebcb7b40..406f87af 100644
--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -42,48 +42,79 @@ struct SoftmaxFunctor {
                         Tensor *output,
                         StatsFuture *future) {
     MACE_UNUSED(future);
-    const index_t batch = input->dim(0);
-    const index_t class_count = input->dim(1);
-    const index_t class_size = input->dim(2) * input->dim(3);
-    const index_t batch_size = class_count * class_size;
-
     Tensor::MappingGuard input_guard(input);
     Tensor::MappingGuard output_guard(output);
     const float *input_data = input->data<float>();
     float *output_data = output->mutable_data<float>();
 
-    for (index_t b = 0; b < batch; ++b) {
+    // softmax for nchw image
+    if (input->dim_size() == 4) {
+      const index_t batch = input->dim(0);
+      const index_t class_count = input->dim(1);
+      const index_t class_size = input->dim(2) * input->dim(3);
+      const index_t batch_size = class_count * class_size;
+
+      for (index_t b = 0; b < batch; ++b) {
+#pragma omp parallel for
+        for (index_t k = 0; k < class_size; ++k) {
+          const float *input_ptr = input_data + b * batch_size + k;
+          float *output_ptr = output_data + b * batch_size + k;
+
+          float max_val = std::numeric_limits<float>::lowest();
+          index_t channel_offset = 0;
+          for (index_t c = 0; c < class_count; ++c) {
+            float data = input_ptr[channel_offset];
+            if (data > max_val) {
+              max_val = data;
+            }
+            channel_offset += class_size;
+          }
+
+          channel_offset = 0;
+          float sum = 0;
+          for (index_t c = 0; c < class_count; ++c) {
+            float exp_value = ::exp(input_ptr[channel_offset] - max_val);
+            sum += exp_value;
+            output_ptr[channel_offset] = exp_value;
+            channel_offset += class_size;
+          }
+
+          sum = std::max(sum, std::numeric_limits<float>::min());
+          channel_offset = 0;
+          for (index_t c = 0; c < class_count; ++c) {
+            output_ptr[channel_offset] /= sum;
+            channel_offset += class_size;
+          }
+        }  // k
+      }  // b
+    } else if (input->dim_size() == 2) {  // normal 2d softmax
+      const index_t class_size = input->dim(0);
+      const index_t class_count = input->dim(1);
 #pragma omp parallel for
       for (index_t k = 0; k < class_size; ++k) {
-        const float *input_ptr = input_data + b * batch_size + k;
-        float *output_ptr = output_data + b * batch_size + k;
+        const float *input_ptr = input_data + k * class_count;
+        float *output_ptr = output_data + k * class_count;
 
         float max_val = std::numeric_limits<float>::lowest();
-        index_t channel_offset = 0;
         for (index_t c = 0; c < class_count; ++c) {
-          float data = input_ptr[channel_offset];
-          if (data > max_val) {
-            max_val = data;
-          }
-          channel_offset += class_size;
+          max_val = std::max(max_val, input_ptr[c]);
         }
 
-        channel_offset = 0;
         float sum = 0;
         for (index_t c = 0; c < class_count; ++c) {
-          float exp_value = ::exp(input_ptr[channel_offset] - max_val);
+          float exp_value = ::exp(input_ptr[c] - max_val);
           sum += exp_value;
-          output_ptr[channel_offset] = exp_value;
-          channel_offset += class_size;
+          output_ptr[c] = exp_value;
         }
 
-        channel_offset = 0;
+        sum = std::max(sum, std::numeric_limits<float>::min());
         for (index_t c = 0; c < class_count; ++c) {
-          output_ptr[channel_offset] /= sum;
-          channel_offset += class_size;
+          output_ptr[c] /= sum;
         }
-      }  // k
-    }  // b
+      }
+    } else {
+      MACE_NOT_IMPLEMENTED;
+    }
 
     return MACE_SUCCESS;
   }
diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h
index 9cc800bf..8c88e9a2 100644
--- a/mace/ops/eltwise.h
+++ b/mace/ops/eltwise.h
@@ -30,7 +30,9 @@ class EltwiseOp : public Operator<D, T> {
             static_cast<kernels::EltwiseType>(OperatorBase::GetOptionalArg<int>(
                 "type", static_cast<int>(kernels::EltwiseType::NONE))),
             OperatorBase::GetRepeatedArgs<float>("coeff"),
-            OperatorBase::GetOptionalArg<float>("value", 1.0)) {}
+            OperatorBase::GetOptionalArg<float>("value", 1.0),
+            static_cast<DataFormat>(OperatorBase::GetOptionalArg<int>(
+                "data_format", 0))) {}
 
   MaceStatus Run(StatsFuture *future) override {
     const Tensor *input0 = this->Input(0);
diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc
index e8fd81cf..37666b33 100644
--- a/mace/ops/eltwise_test.cc
+++ b/mace/ops/eltwise_test.cc
@@ -41,6 +41,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
         .Input("TInput")
         .AddIntArg("type", static_cast<int>(type))
         .AddFloatArg("value", x)
+        .AddIntArg("data_format", DataFormat::NCHW)
         .Output("TOutput")
         .Finalize(net.NewOperatorDef());
     // Run
@@ -84,15 +85,24 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
   net.AddInputFromArray<D, float>("Input1", shape1, input1);
 
   if (D == DeviceType::CPU) {
-    net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, "TInput0", NCHW);
-    net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC, "TInput1", NCHW);
-    OpDefBuilder("Eltwise", "EltwiseTest")
-        .Input("TInput0")
-        .Input("TInput1")
+    auto op_builder = OpDefBuilder("Eltwise", "EltwiseTest")
         .AddIntArg("type", static_cast<int>(type))
         .AddFloatsArg("coeff", coeff)
-        .Output("TOutput")
-        .Finalize(net.NewOperatorDef());
+        .AddIntArg("data_format", DataFormat::NCHW)
+        .Output("TOutput");
+    if (shape0.size() > 1) {
+      net.TransformDataFormat<DeviceType::CPU, float>("Input0", NHWC, "TInput0", NCHW);
+      op_builder.Input("TInput0");
+    } else {
+      op_builder.Input("Input0");
+    }
+    if (shape1.size() > 1) {
+      net.TransformDataFormat<DeviceType::CPU, float>("Input1", NHWC, "TInput1", NCHW);
+      op_builder.Input("TInput1");
+    } else {
+      op_builder.Input("Input1");
+    }
+    op_builder.Finalize(net.NewOperatorDef());
 
     // Run
     net.RunOp(D);
@@ -214,6 +224,35 @@ TEST_F(EltwiseOpTest, CPUSimpleTensorVector) {
       kernels::EltwiseType::SQR_DIFF, {1, 1, 1, 5}, {1, 2, 3, 4, 5},
       {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
       {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
+
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::SUM, {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {3},
+      {1, 2, 3}, {2, 4, 6, 5, 7, 9});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::SUB, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      {5}, {1, 2, 3, 4, 5}, {0, 0, 0, 0, 0, 5, 5, 5, 5, 5});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::SUB, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {0, 0, 0, 0, 0, -5, -5, -5, -5, -5});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::PROD, {3}, {1, 2, 3}, {1, 2, 1, 3},
+      {1, 2, 3, 4, 5, 6}, {1, 4, 9, 4, 10, 18});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::DIV, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      {5}, {1, 1, 1, 1, 5}, {1, 2, 3, 4, 1, 6, 7, 8, 9, 2});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::DIV, {5}, {1, 1, 1, 2, 4}, {1, 2, 1, 5},
+      {1, 1, 1, 2, 2, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 2, 1, 1, 1, 2, 4});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::MIN, {5}, {1, 2, 3, 4, 5}, {1, 2, 1, 5},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::MAX, {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      {5}, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+  SimpleTensorEltwise<DeviceType::CPU, float>(
+      kernels::EltwiseType::SQR_DIFF, {5}, {1, 2, 3, 4, 5},
+      {1, 2, 1, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+      {0, 0, 0, 0, 0, 25, 25, 25, 25, 25});
 }
 
 TEST_F(EltwiseOpTest, GPUSimpleTensorVector) {
@@ -322,6 +361,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
       .Input("TInput")
       .AddIntArg("type", static_cast<int>(type))
      .AddFloatArg("value", 0.1)
+      .AddIntArg("data_format", DataFormat::NCHW)
      .Output("TOutput")
       .Finalize(net.NewOperatorDef());
   // Run
@@ -375,6 +415,7 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
       .Input("TInput1")
       .AddIntArg("type", static_cast<int>(type))
       .AddFloatsArg("coeff", coeff)
+      .AddIntArg("data_format", DataFormat::NCHW)
       .Output("TOutput")
       .Finalize(net.NewOperatorDef());
 
diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc
index 9997ee10..5468ca24 100644
--- a/mace/ops/softmax_test.cc
+++ b/mace/ops/softmax_test.cc
@@ -29,8 +29,12 @@ void Simple() {
   // Add input data
   net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4},
                                   {1, 1, 1, 1, 1, 2, 3, 4});
+  auto expected = CreateTensor<float>(
+      {1, 1, 2, 4},
+      {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
 
   if (D == DeviceType::CPU) {
+    // test 4d softmax
     net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW", NCHW);
     OpDefBuilder("Softmax", "SoftmaxTest")
         .Input("InputNCHW")
@@ -40,6 +44,21 @@ void Simple() {
     // Run
     net.RunOp(D);
     net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
+
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+
+    // check 2d softmax
+    net.AddInputFromArray<D, float>("Input2d", {2, 4},
+                                    {1, 1, 1, 1, 1, 2, 3, 4});
+    OpDefBuilder("Softmax", "SoftmaxTest")
+        .Input("Input2d")
+        .Output("Output")
+        .Finalize(net.NewOperatorDef());
+
+    // Run
+    net.RunOp(D);
+    net.GetOutput("Output")->Reshape({1, 1, 2, 4});
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
   } else if (D == DeviceType::GPU) {
     BufferToImage<D, float>(&net, "Input", "InputImage",
                             kernels::BufferType::IN_OUT_CHANNEL);
@@ -55,15 +74,11 @@ void Simple() {
     // Transfer output
     ImageToBuffer<D, float>(&net, "OutputImage", "Output",
                             kernels::BufferType::IN_OUT_CHANNEL);
+
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
   } else {
     MACE_NOT_IMPLEMENTED;
  }
-
-  auto expected = CreateTensor<float>(
-      {1, 1, 2, 4},
-      {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
-
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
 }  // namespace
-- 
GitLab
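
Editor's note on the broadcasting scheme (not part of the patch itself): the refactored CPU path dispatches every eltwise call to one of three kernels -- TensorEltwise for equal-sized inputs, TensorScalarEltwise when the smaller input is a single value, and TensorBroadcastEltwise when the smaller input matches the trailing dimensions of the larger one, which the functor decomposes into diff_size repetitions of a common_size span. Below is a minimal standalone sketch of that decomposition; it is plain C++ with illustrative names (BroadcastSum), no OpenMP and no MACE types, not the MACE implementation.

  #include <cstdio>
  #include <vector>

  // input0 is a flattened tensor of shape [diff..., common...]; input1 covers
  // only the trailing "common" dimensions, so the same input1 values are
  // reused for every leading index d -- the indexing TensorBroadcastEltwise
  // uses above.
  void BroadcastSum(const std::vector<float> &input0,
                    const std::vector<float> &input1,
                    std::vector<float> *output) {
    const size_t common_size = input1.size();
    const size_t diff_size = input0.size() / common_size;
    output->resize(input0.size());
    for (size_t d = 0; d < diff_size; ++d) {
      for (size_t i = 0; i < common_size; ++i) {
        (*output)[i + d * common_size] =
            input0[i + d * common_size] + input1[i];
      }
    }
  }

  int main() {
    // shape0 = {2, 3}, shape1 = {3}: input1 broadcasts over the first axis.
    std::vector<float> a = {1, 2, 3, 4, 5, 6};
    std::vector<float> b = {10, 20, 30};
    std::vector<float> out;
    BroadcastSum(a, b, &out);
    for (float v : out) std::printf("%g ", v);  // prints: 11 22 33 14 25 36
    std::printf("\n");
    return 0;
  }

The swapped flag in the patch exists because the functor may exchange input0 and input1 so that input0 is always the larger tensor; for the non-commutative ops (SUB, DIV, POW) the kernels then evaluate the operands in reverse order.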
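A second note, on the new 2-D branch of SoftmaxFunctor: it applies the usual max-subtracted softmax independently to each row, and the expected tensor in softmax_test.cc follows directly from that definition. A hedged sketch for checking the test constants (plain C++, not the MACE code):

  #include <algorithm>
  #include <cmath>
  #include <cstdio>
  #include <vector>

  // Row-wise softmax with the maximum subtracted for numerical stability,
  // mirroring the 2d branch added to SoftmaxFunctor in this patch.
  std::vector<float> Softmax(const std::vector<float> &row) {
    const float max_val = *std::max_element(row.begin(), row.end());
    std::vector<float> out(row.size());
    float sum = 0.f;
    for (size_t i = 0; i < row.size(); ++i) {
      out[i] = std::exp(row[i] - max_val);
      sum += out[i];
    }
    for (float &v : out) v /= sum;
    return out;
  }

  int main() {
    // Second row of the test input, {1, 2, 3, 4}; the first row {1, 1, 1, 1}
    // trivially yields 0.25 everywhere.
    for (float v : Softmax({1, 2, 3, 4})) std::printf("%.8f ", v);
    // prints approximately: 0.03205860 0.08714432 0.23688282 0.64391426
    std::printf("\n");
    return 0;
  }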