Commit 9fe6761a authored by Bin Li

feat: Add bf16 kernels for MobileNet

Parent 62d8ba37
......@@ -518,7 +518,8 @@ Use ``-h`` to get detailed help.
Reduce Library Size
-------------------
* Build for your own usage purpose.
* Build for your own usage purpose. Some configuration variables in tools/bazel_build_standalone_lib.sh
are set to ``true`` by default; you can change them to ``false`` to reduce the library size.
* **dynamic library**
- If the models don't need to run on device ``dsp``, change the build option ``--define hexagon=true``
......
......@@ -62,27 +62,27 @@ class BFloat16 {
}
template<typename T>
BFloat16 operator+(T value) const {
return BFloat16(Sphinx(
static_cast<uint32_t>(data_ << 16)).f + static_cast<float>(value));
float operator+(T value) const {
return Sphinx(static_cast<uint32_t>(data_ << 16)).f
+ static_cast<float>(value);
}
template<typename T>
BFloat16 operator-(T value) const {
return BFloat16(Sphinx(
static_cast<uint32_t>(data_ << 16)).f - static_cast<float>(value));
float operator-(T value) const {
return Sphinx(static_cast<uint32_t>(data_ << 16)).f
- static_cast<float>(value);
}
template<typename T>
BFloat16 operator*(T value) const {
return BFloat16(Sphinx(
static_cast<uint32_t>(data_ << 16)).f * static_cast<float>(value));
float operator*(T value) const {
return Sphinx(static_cast<uint32_t>(data_ << 16)).f
* static_cast<float>(value);
}
template<typename T>
BFloat16 operator/(T value) const {
return BFloat16(Sphinx(
static_cast<uint32_t>(data_ << 16)).f / static_cast<float>(value));
float operator/(T value) const {
return Sphinx(static_cast<uint32_t>(data_ << 16)).f
/ static_cast<float>(value);
}
template<typename T>
......@@ -223,7 +223,6 @@ inline ostream &operator<<(ostream &ss, // NOLINT
} // namespace std
inline float operator+(const float &a, const mace::BFloat16 &value) {
return a + static_cast<float>(value);
}
......@@ -256,6 +255,38 @@ inline void operator/=(float &a, const mace::BFloat16 &value) { // NOLINT
a /= static_cast<float>(value);
}
inline double operator+(const double &a, const mace::BFloat16 &value) {
return a + static_cast<double>(value);
}
inline double operator-(const double &a, const mace::BFloat16 &value) {
return a - static_cast<double>(value);
}
inline double operator*(const double &a, const mace::BFloat16 &value) {
return a * static_cast<double>(value);
}
inline double operator/(const double &a, const mace::BFloat16 &value) {
return a / static_cast<double>(value);
}
inline void operator+=(double &a, const mace::BFloat16 &value) { // NOLINT
a += static_cast<double>(value);
}
inline void operator-=(double &a, const mace::BFloat16 &value) { // NOLINT
a -= static_cast<double>(value);
}
inline void operator*=(double &a, const mace::BFloat16 &value) { // NOLINT
a *= static_cast<double>(value);
}
inline void operator/=(double &a, const mace::BFloat16 &value) { // NOLINT
a /= static_cast<double>(value);
}
#endif // MACE_ENABLE_BFLOAT16
#endif // MACE_CORE_BFLOAT16_H_
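
A minimal sketch (not the MACE API) of what the Sphinx-union arithmetic above relies on: a bfloat16 value is just the upper 16 bits of an IEEE-754 binary32, so widening is a left shift by 16 and narrowing is plain truncation. The operator changes in this file make BFloat16-with-scalar arithmetic return float, so narrowing back to bf16 only happens on assignment or store.

#include <cstdint>
#include <cstring>

// Sketch only: round-trip between bf16 bits and float, mirroring
// Sphinx(static_cast<uint32_t>(data_ << 16)).f in the operators above.
inline float bf16_to_float(uint16_t b) {
  uint32_t u = static_cast<uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &u, sizeof(f));
  return f;
}

inline uint16_t float_to_bf16(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  return static_cast<uint16_t>(u >> 16);  // truncating narrow, as in the bf16 stores
}
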
......@@ -104,15 +104,13 @@ cc_library(
"arm/fp32/*.cc",
"arm/fp16/gemv.h",
],
exclude = [
"arm/fp32/*_test.cc",
],
) + if_quantize_enabled(glob(
[
"arm/q8/*.cc",
],
exclude = [
"arm/q8/*_test.cc",
)) + if_bfloat16_enabled(glob(
[
"arm/bf16/*.cc",
],
)),
hdrs = glob(
......@@ -124,6 +122,10 @@ cc_library(
[
"arm/q8/*.h",
],
)) + if_bfloat16_enabled(glob(
[
"arm/bf16/*.h",
],
)),
copts = [
"-Werror",
......
......@@ -11,6 +11,9 @@ file(GLOB OPS_ARM_NEON_BASE_KERNELS_SRCS
file(GLOB OPS_ARM_NEON_FP32_KERNELS_SRCS
arm/fp32/*.cc
)
file(GLOB OPS_ARM_NEON_BF16_KERNELS_SRCS
arm/bf16/*.cc
)
file(GLOB OPS_ARM_NEON_Q8_KERNELS_SRCS
arm/q8/*.cc
)
......@@ -39,6 +42,9 @@ if(MACE_ENABLE_NEON)
if(MACE_ENABLE_QUANTIZE)
set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_Q8_KERNELS_SRCS})
endif(MACE_ENABLE_QUANTIZE)
if(MACE_ENABLE_BFLOAT16)
set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_BF16_KERNELS_SRCS})
endif(MACE_ENABLE_BFLOAT16)
endif(MACE_ENABLE_NEON)
if(MACE_ENABLE_OPENCL)
......
......@@ -14,10 +14,25 @@
#include "mace/ops/arm/base/activation.h"
#include <algorithm>
#include "mace/ops/arm/base/common_neon.h"
namespace mace {
namespace ops {
namespace arm {
extern template void Activation<uint8_t>::ActivateRelu(
utils::ThreadPool *, const Tensor *, Tensor *);
extern template void Activation<uint8_t>::ActivateRelux(
utils::ThreadPool *, const Tensor *, Tensor *);
extern template void Activation<uint8_t>::ActivateLeakyRelu(
utils::ThreadPool *, const Tensor *, Tensor *);
extern template void Activation<uint8_t>::ActivateTanh(
utils::ThreadPool *, const Tensor *, Tensor *);
extern template void Activation<uint8_t>::ActivateSigmoid(
utils::ThreadPool *, const Tensor *, Tensor *);
template<typename T>
MaceStatus Activation<T>::Compute(const OpContext *context,
const Tensor *input, Tensor *output) {
......@@ -76,15 +91,156 @@ void Activation<T>::DoActivation(const OpContext *context,
}
}
template<typename T>
void Activation<T>::ActivateRelu(utils::ThreadPool *thread_pool,
const Tensor *input,
Tensor *output) {
const auto input_data = input->data<T>();
auto output_data = output->mutable_data<T>();
const index_t input_size = input->size();
const float32x4_t vzero = vdupq_n_f32(0.f);
const index_t block_count = input_size / 4;
thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) {
const T *input_ptr = input_data + start * 4;
T *output_ptr = output_data + start * 4;
for (index_t i = start; i < end; i += step) {
float32x4_t v = vld1q(input_ptr);
v = vmaxq_f32(v, vzero);
vst1q(output_ptr, v);
input_ptr += 4;
output_ptr += 4;
}
},
0, block_count, 1);
// remain
for (index_t i = block_count * 4; i < input_size; ++i) {
output_data[i] = std::max(0.f, input_data[i]);
}
}
template<typename T>
void Activation<T>::ActivateRelux(utils::ThreadPool *thread_pool,
const Tensor *input,
Tensor *output) {
const auto input_data = input->data<T>();
auto output_data = output->mutable_data<T>();
const index_t input_size = input->size();
const float32x4_t vzero = vdupq_n_f32(0.f);
const float32x4_t vlimit = vdupq_n_f32(limit_);
const index_t block_count = input_size / 4;
thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) {
auto input_ptr = input_data + start * 4;
auto output_ptr = output_data + start * 4;
for (index_t i = start; i < end; i += step) {
float32x4_t v = vld1q(input_ptr);
v = vmaxq_f32(v, vzero);
v = vminq_f32(v, vlimit);
vst1q(output_ptr, v);
input_ptr += 4;
output_ptr += 4;
}
},
0, block_count, 1);
// remain
for (index_t i = block_count * 4; i < input_size; ++i) {
output_data[i] = std::max(0.f, std::min(limit_, input_data[i]));
}
}
template<typename T>
void Activation<T>::ActivateLeakyRelu(utils::ThreadPool *thread_pool,
const Tensor *input,
Tensor *output) {
const auto input_data = input->data<T>();
auto output_data = output->mutable_data<T>();
const index_t input_size = input->size();
const float32x4_t vzero = vdupq_n_f32(0.f);
const float32x4_t valpha = vdupq_n_f32(leakyrelu_coefficient_);
const index_t block_count = input_size / 4;
thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) {
auto input_ptr = input_data + start * 4;
auto output_ptr = output_data + start * 4;
for (index_t i = start; i < end; i += step) {
float32x4_t v = vld1q(input_ptr);
float32x4_t u = vminq_f32(v, vzero);
v = vmaxq_f32(v, vzero);
v = vmlaq_f32(v, valpha, u);
vst1q(output_ptr, v);
input_ptr += 4;
output_ptr += 4;
}
},
0, block_count, 1);
// remain
for (index_t i = block_count * 4; i < input_size; ++i) {
output_data[i] = std::max(input_data[i], 0.f) +
std::min(input_data[i], 0.f) * leakyrelu_coefficient_;
}
}
template<typename T>
void Activation<T>::ActivateTanh(utils::ThreadPool *thread_pool,
const Tensor *input,
Tensor *output) {
const auto input_data = input->data<T>();
auto output_data = output->mutable_data<T>();
const index_t input_size = input->size();
thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) {
for (index_t i = start; i < end; i += step) {
output_data[i] = std::tanh(input_data[i]);
}
},
0, input_size, 1);
}
template<typename T>
void Activation<T>::ActivateSigmoid(utils::ThreadPool *thread_pool,
const Tensor *input,
Tensor *output) {
const auto input_data = input->data<T>();
auto output_data = output->mutable_data<T>();
const index_t input_size = input->size();
thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) {
for (index_t i = start; i < end; i += step) {
output_data[i] = 1 / (1 + std::exp(-(input_data[i])));
}
},
0, input_size, 1);
}
void RegisterActivationDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Activation<float>, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::NEON));
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_DELEGATOR(
registry, Activation<uint8_t>, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, uint8_t, ImplType::NEON));
#endif // MACE_ENABLE_QUANTIZE
MACE_REGISTER_BF16_DELEGATOR(
registry, Activation<BFloat16>, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, BFloat16,
ImplType::NEON));
}
} // namespace arm
......
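
A self-contained sketch of the pattern the templated kernels above follow: vectorize in blocks of four lanes, then handle the 0-3 remaining elements with scalar code. Written here for plain float and without the ThreadPool/delegator machinery.

#include <arm_neon.h>
#include <algorithm>
#include <cstdint>

// Sketch only: same structure as ActivateRelu above, minus threading.
void relu_f32(const float *in, float *out, int64_t n) {
  const float32x4_t vzero = vdupq_n_f32(0.f);
  int64_t i = 0;
  for (; i + 4 <= n; i += 4) {
    vst1q_f32(out + i, vmaxq_f32(vld1q_f32(in + i), vzero));
  }
  for (; i < n; ++i) {  // remainder, mirrors the "// remain" loops above
    out[i] = std::max(0.f, in[i]);
  }
}

For T = BFloat16 the same body works in the kernels above because the vld1q/vst1q overloads in common_neon.h widen and narrow on the fly.
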
......@@ -14,10 +14,24 @@
#include "mace/ops/arm/base/bias_add.h"
#include <functional>
#include <vector>
#include "mace/ops/arm/base/common_neon.h"
namespace mace {
namespace ops {
namespace arm {
extern template void BiasAdd<uint8_t>::AddBiasNCHW<1>(
utils::ThreadPool *, const Tensor *, const Tensor *, Tensor *);
extern template void BiasAdd<uint8_t>::AddBiasNCHW<2>(
utils::ThreadPool *, const Tensor *, const Tensor *, Tensor *);
extern template void BiasAdd<uint8_t>::AddBiasNHWC<1>(
utils::ThreadPool *, const Tensor *, const Tensor *, Tensor *);
extern template void BiasAdd<uint8_t>::AddBiasNHWC<2>(
utils::ThreadPool *, const Tensor *, const Tensor *, Tensor *);
template<typename T>
MaceStatus BiasAdd<T>::Compute(const OpContext *context,
const Tensor *input,
......@@ -69,15 +83,101 @@ void BiasAdd<T>::AddBias(const OpContext *context,
}
}
template <typename T>
template <int Dim>
void BiasAdd<T>::AddBiasNCHW(utils::ThreadPool *thread_pool,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
const auto input_data = input->data<T>();
const auto bias_data = bias->data<T>();
auto output_data = output->mutable_data<T>();
const index_t batch = input->dim(0);
const index_t channels = input->dim(1);
const index_t image_size = input->dim(2) * input->dim(3);
const index_t block_count = image_size / 4;
const index_t remain = image_size % 4;
thread_pool->Compute2D(
[=](index_t start0, index_t end0, index_t step0, index_t start1,
index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) {
const index_t b_offset = b * channels;
for (index_t c = start1; c < end1; c += step1) {
const index_t offset = (b_offset + c) * image_size;
auto input_ptr = input_data + offset;
auto output_ptr = output_data + offset;
const float bias = bias_data[bias_index<Dim>(b_offset, c)];
float32x4_t vbias = vdupq_n_f32(bias);
for (index_t i = 0; i < block_count; ++i) {
float32x4_t v = vld1q(input_ptr);
v = vaddq_f32(v, vbias);
vst1q(output_ptr, v);
input_ptr += 4;
output_ptr += 4;
}
for (index_t i = 0; i < remain; ++i) {
(*output_ptr++) = (*input_ptr++) + bias;
}
}
}
},
0, batch, 1, 0, channels, 1);
}
template <typename T>
template <int Dim>
void BiasAdd<T>::AddBiasNHWC(utils::ThreadPool *thread_pool,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
const auto input_ptr = input->data<T>();
const auto bias_ptr = bias->data<T>();
auto output_ptr = output->mutable_data<T>();
const std::vector<index_t> &shape = input->shape();
const index_t channels = *shape.rbegin();
const auto batch = shape[0];
if (Dim == 2) {
MACE_CHECK(batch == bias->shape()[0]);
}
const index_t fused_hw = std::accumulate(shape.begin() + 1, shape.end() - 1,
1, std::multiplies<index_t>());
thread_pool->Compute2D(
[=](index_t start0, index_t end0, index_t step0, index_t start1,
index_t end1, index_t step1) {
for (index_t i = start0; i < end0; i += step0) {
auto offset = i * fused_hw;
auto bias_offset = i * channels;
for (index_t j = start1; j < end1; j += step1) {
index_t pos = (offset + j) * channels;
for (index_t c = 0; c < channels; ++c, ++pos) {
output_ptr[pos] =
input_ptr[pos] + bias_ptr[bias_index<Dim>(bias_offset, c)];
}
}
}
},
0, batch, 1, 0, fused_hw, 1);
}
void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, BiasAdd<float>, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::NEON));
#ifdef MACE_ENABLE_QUANTIZE
MACE_REGISTER_DELEGATOR(
registry, BiasAdd<uint8_t>, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, uint8_t, ImplType::NEON));
#endif // MACE_ENABLE_QUANTIZE
MACE_REGISTER_BF16_DELEGATOR(
registry, BiasAdd<BFloat16>, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, BFloat16, ImplType::NEON));
}
} // namespace arm
......
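
A plain-C++ sketch of the addressing AddBiasNCHW uses, for the Dim==1 case where the bias is indexed by channel only (bias_index<Dim> is not shown in this diff; it is assumed to select bias[c] for Dim==1 and bias[b * channels + c] for Dim==2).

#include <cstdint>

// Sketch only: per-channel bias broadcast over NCHW data, no NEON/threads.
void bias_add_nchw(const float *in, const float *bias, float *out,
                   int64_t batch, int64_t channels, int64_t image_size) {
  for (int64_t b = 0; b < batch; ++b) {
    for (int64_t c = 0; c < channels; ++c) {
      const int64_t offset = (b * channels + c) * image_size;
      for (int64_t i = 0; i < image_size; ++i) {
        out[offset + i] = in[offset + i] + bias[c];
      }
    }
  }
}
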
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_COMMON_NEON_H_
#define MACE_OPS_ARM_BASE_COMMON_NEON_H_
#include <arm_neon.h>
#include "mace/core/bfloat16.h"
namespace mace {
namespace ops {
namespace arm {
typedef struct float32x8_t {
float32x4_t val[2];
} float32x8_t;
#if !defined(__aarch64__)
inline float vaddvq_f32(float32x4_t v) {
float32x2_t _sum = vadd_f32(vget_low_f32(v), vget_high_f32(v));
_sum = vpadd_f32(_sum, _sum);
return vget_lane_f32(_sum, 0);
}
#endif
inline float32x4_t neon_vfma_lane_0(float32x4_t a,
float32x4_t b,
float32x4_t c) {
#ifdef __aarch64__
return vfmaq_laneq_f32(a, b, c, 0);
#else
return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
#endif
}
inline float32x4_t neon_vfma_lane_1(float32x4_t a,
float32x4_t b,
float32x4_t c) {
#ifdef __aarch64__
return vfmaq_laneq_f32(a, b, c, 1);
#else
return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
#endif
}
inline float32x4_t neon_vfma_lane_2(float32x4_t a,
float32x4_t b,
float32x4_t c) {
#ifdef __aarch64__
return vfmaq_laneq_f32(a, b, c, 2);
#else
return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
#endif
}
inline float32x4_t neon_vfma_lane_3(float32x4_t a,
float32x4_t b,
float32x4_t c) {
#ifdef __aarch64__
return vfmaq_laneq_f32(a, b, c, 3);
#else
return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
#endif
}
inline void neon_vec_left_shift_1(const float32x4_t &src,
float32x4_t *dst) {
(*dst)[0] = src[1];
(*dst)[1] = src[2];
(*dst)[2] = src[3];
}
inline void neon_vec_left_shift_2(const float32x4_t &src,
float32x4_t *dst) {
(*dst)[0] = src[2];
(*dst)[1] = src[3];
}
inline void neon_vec_left_shift_3(const float32x4_t &src,
float32x4_t *dst) {
(*dst)[0] = src[3];
}
inline void neon_vec_right_shift_1(const float32x4_t &src,
float32x4_t *dst) {
(*dst)[1] = src[0];
(*dst)[2] = src[1];
(*dst)[3] = src[2];
}
inline void neon_vec_right_shift_2(const float32x4_t &src,
float32x4_t *dst) {
(*dst)[2] = src[0];
(*dst)[3] = src[1];
}
inline void neon_vec_right_shift_3(const float32x4_t &src,
float32x4_t *dst) {
(*dst)[3] = src[0];
}
inline float32x2_t vld1(const float *ptr) {
return vld1_f32(ptr);
}
inline void vst1(float *ptr, float32x2_t v) {
vst1_f32(ptr, v);
}
inline float32x4_t vld1q(const float *ptr) {
return vld1q_f32(ptr);
}
inline float32x4x2_t vld2q(const float *ptr) {
return vld2q_f32(ptr);
}
inline float32x4x3_t vld3q(const float *ptr) {
return vld3q_f32(ptr);
}
inline void vst1q(float *ptr, float32x4_t v) {
vst1q_f32(ptr, v);
}
inline void vst2q(float *ptr, float32x4x2_t v) {
vst2q_f32(ptr, v);
}
inline void vst3q(float *ptr, float32x4x3_t v) {
vst3q_f32(ptr, v);
}
inline float32x8_t vld1o(float *ptr) {
return {vld1q_f32(ptr), vld1q_f32(ptr + 4)};
}
inline void vst1o(float *ptr, float32x8_t v) {
vst1q_f32(ptr, v.val[0]);
vst1q_f32(ptr + 4, v.val[1]);
}
#if defined(MACE_ENABLE_BFLOAT16)
// load of 2D vector
inline float32x2_t vld1_bf16(const BFloat16 *ptr) {
return (float32x2_t){ptr[0], ptr[1]}; // NOLINT(readability/braces)
}
inline float32x2_t vld1_bf16(const uint16_t *ptr) {
return vld1_bf16(reinterpret_cast<const BFloat16 *>(ptr));
}
inline float32x2_t vld1(const BFloat16 *ptr) {
return vld1_bf16(ptr);
}
inline float32x2_t vld1(const uint16_t *ptr) {
return vld1_bf16(reinterpret_cast<const BFloat16 *>(ptr));
}
// store of 2D vector
inline void vst1_bf16(BFloat16 *ptr, float32x2_t v) {
ptr[0] = v[0];
ptr[1] = v[1];
}
inline void vst1_bf16(uint16_t *ptr, float32x2_t v) {
vst1_bf16(reinterpret_cast<BFloat16 *>(ptr), v);
}
inline void vst1(BFloat16 *ptr, float32x2_t v) {
vst1_bf16(ptr, v);
}
inline void vst1(uint16_t *ptr, float32x2_t v) {
vst1_bf16(reinterpret_cast<BFloat16 *>(ptr), v);
}
// load of 4D vector
inline float32x4_t vld1q_bf16(const uint16_t *ptr) {
return vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
}
inline float32x4_t vld1q_bf16(const BFloat16 *ptr) {
return vld1q_bf16(reinterpret_cast<const uint16_t *>(ptr));
}
inline float32x4_t vld1q(const uint16_t *ptr) {
return vld1q_bf16(ptr);
}
inline float32x4_t vld1q(const BFloat16 *ptr) {
return vld1q_bf16(reinterpret_cast<const uint16_t *>(ptr));
}
// load of 2 4D vectors and perform de-interleaving
inline float32x4x2_t vld2q_bf16(const uint16_t *ptr) {
uint16x4x2_t u = vld2_u16(ptr);
return {vreinterpretq_f32_u32(vshll_n_u16(u.val[0], 16)),
vreinterpretq_f32_u32(vshll_n_u16(u.val[1], 16))};
}
inline float32x4x2_t vld2q_bf16(const BFloat16 *ptr) {
return vld2q_bf16(reinterpret_cast<const uint16_t *>(ptr));
}
inline float32x4x2_t vld2q(const uint16_t *ptr) {
return vld2q_bf16(ptr);
}
inline float32x4x2_t vld2q(const BFloat16 *ptr) {
return vld2q_bf16(reinterpret_cast<const uint16_t *>(ptr));
}
// load of 3 4D vectors and perform de-interleaving
inline float32x4x3_t vld3q_bf16(const uint16_t *ptr) {
uint16x4x3_t u = vld3_u16(ptr);
return {vreinterpretq_f32_u32(vshll_n_u16(u.val[0], 16)),
vreinterpretq_f32_u32(vshll_n_u16(u.val[1], 16)),
vreinterpretq_f32_u32(vshll_n_u16(u.val[2], 16))};
}
inline float32x4x3_t vld3q_bf16(const BFloat16 *ptr) {
return vld3q_bf16(reinterpret_cast<const uint16_t *>(ptr));
}
inline float32x4x3_t vld3q(const uint16_t *ptr) {
return vld3q_bf16(ptr);
}
inline float32x4x3_t vld3q(const BFloat16 *ptr) {
return vld3q_bf16(reinterpret_cast<const uint16_t *>(ptr));
}
// store of 4D vector
inline void vst1q_bf16(uint16_t *ptr, const float32x4_t v) {
vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(v), 16));
}
inline void vst1q_bf16(BFloat16 *ptr, const float32x4_t v) {
vst1q_bf16(reinterpret_cast<uint16_t *>(ptr), v);
}
inline void vst1q(uint16_t *ptr, const float32x4_t v) {
vst1q_bf16(ptr, v);
}
inline void vst1q(BFloat16 *ptr, const float32x4_t v) {
vst1q_bf16(reinterpret_cast<uint16_t *>(ptr), v);
}
// store of 2 4D vectors and perform interleaving
inline void vst2q_bf16(uint16_t *ptr, const float32x4x2_t v) {
uint16x4x2_t u = {vshrn_n_u32(vreinterpretq_u32_f32(v.val[0]), 16),
vshrn_n_u32(vreinterpretq_u32_f32(v.val[1]), 16)};
vst2_u16(ptr, u);
}
inline void vst2q_bf16(BFloat16 *ptr, const float32x4x2_t v) {
vst2q_bf16(reinterpret_cast<uint16_t *>(ptr), v);
}
inline void vst2q(uint16_t *ptr, const float32x4x2_t v) {
vst2q_bf16(ptr, v);
}
inline void vst2q(BFloat16 *ptr, const float32x4x2_t v) {
vst2q_bf16(reinterpret_cast<uint16_t *>(ptr), v);
}
// store of 3 4D vectors and perform interleaving
inline void vst3q_bf16(uint16_t *ptr, const float32x4x3_t v) {
uint16x4x3_t u = {vshrn_n_u32(vreinterpretq_u32_f32(v.val[0]), 16),
vshrn_n_u32(vreinterpretq_u32_f32(v.val[1]), 16),
vshrn_n_u32(vreinterpretq_u32_f32(v.val[2]), 16)};
vst3_u16(ptr, u);
}
inline void vst3q_bf16(BFloat16 *ptr, const float32x4x3_t v) {
vst3q_bf16(reinterpret_cast<uint16_t *>(ptr), v);
}
inline void vst3q(uint16_t *ptr, const float32x4x3_t v) {
vst3q_bf16(ptr, v);
}
inline void vst3q(BFloat16 *ptr, const float32x4x3_t v) {
vst3q_bf16(reinterpret_cast<uint16_t *>(ptr), v);
}
// load of 8D vector
inline float32x8_t vld1o_bf16(const uint16_t *ptr) {
uint16x8_t u = vld1q_u16(ptr);
return {vreinterpretq_f32_u32(vshll_n_u16(vget_low_u16(u), 16)),
vreinterpretq_f32_u32(vshll_n_u16(vget_high_u16(u), 16))};
}
inline float32x8_t vld1o_bf16(const BFloat16 *ptr) {
return vld1o_bf16(reinterpret_cast<const uint16_t *>(ptr));
}
inline float32x8_t vld1o(const uint16_t *ptr) {
return vld1o_bf16(ptr);
}
inline float32x8_t vld1o(const BFloat16 *ptr) {
return vld1o_bf16(reinterpret_cast<const uint16_t *>(ptr));
}
// store of 8D vector
inline void vst1o_bf16(uint16_t *ptr, const float32x8_t v) {
vst1q_u16(ptr, vcombine_u16(
vshrn_n_u32(vreinterpretq_u32_f32(v.val[0]), 16),
vshrn_n_u32(vreinterpretq_u32_f32(v.val[1]), 16)));
}
inline void vst1o_bf16(BFloat16 *ptr, const float32x8_t v) {
vst1o_bf16(reinterpret_cast<uint16_t *>(ptr), v);
}
inline void vst1o(uint16_t *ptr, const float32x8_t v) {
vst1o_bf16(ptr, v);
}
inline void vst1o(BFloat16 *ptr, const float32x8_t v) {
vst1o_bf16(reinterpret_cast<uint16_t *>(ptr), v);
}
#endif // MACE_ENABLE_BFLOAT16
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_COMMON_NEON_H_
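
A usage sketch for the bf16 overloads defined above: four bf16 values are widened to float32x4_t with a 16-bit left shift, the math runs in fp32, and the result is narrowed back (truncating) with a 16-bit right shift.

#include <arm_neon.h>
#include <cstdint>

// Sketch only: scale four bf16 values, mirroring vld1q_bf16/vst1q_bf16.
void scale4_bf16(const uint16_t *src, uint16_t *dst, float scale) {
  float32x4_t v = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(src), 16));  // load + widen
  v = vmulq_n_f32(v, scale);                                              // fp32 math
  vst1_u16(dst, vshrn_n_u32(vreinterpretq_u32_f32(v), 16));               // narrow + store
}
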
......@@ -96,6 +96,11 @@ void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry) {
registry, Conv2dK1x1<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K1x1));
MACE_REGISTER_BF16_DELEGATOR(
registry, Conv2dK1x1<BFloat16>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
BFloat16, ImplType::NEON, K1x1));
}
} // namespace arm
......
......@@ -27,6 +27,15 @@ void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
registry, Conv2dK3x3S2<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
MACE_REGISTER_BF16_DELEGATOR(
registry, Conv2dK3x3S1<BFloat16>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
BFloat16, ImplType::NEON, K3x3S1));
MACE_REGISTER_BF16_DELEGATOR(
registry, Conv2dK3x3S2<BFloat16>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
BFloat16, ImplType::NEON, K3x3S2));
}
} // namespace arm
......
......@@ -12,10 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/fp32/conv_2d_3x3_winograd.h"
#include "mace/ops/arm/base/conv_2d_3x3_winograd.h"
#include <algorithm>
#include "mace/ops/arm/base/common_neon.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/conv_2d.h"
#include "mace/utils/math.h"
......@@ -24,12 +25,12 @@
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
template<typename T>
MaceStatus Conv2dK3x3Winograd<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
const index_t batch = input->dim(0);
const index_t in_channels = input->dim(1);
const index_t in_height = input->dim(2);
......@@ -84,17 +85,17 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
// pad input and transform input
auto scratch_buffer = context->device()->scratch_buffer();
const index_t padded_in_size = is_in_padded ? PadAlignSize(
sizeof(float) * batch * in_channels * padded_in_height
sizeof(T) * batch * in_channels * padded_in_height
* padded_in_width) : 0;
const index_t padded_out_size = is_out_padded ? PadAlignSize(
sizeof(float) * batch * out_channels * padded_out_height
sizeof(T) * batch * out_channels * padded_out_height
* padded_out_width) : 0;
const index_t transformed_in_size = PadAlignSize(
sizeof(float) * batch * in_tile_area * in_channels * tile_count);
sizeof(T) * batch * in_tile_area * in_channels * tile_count);
const index_t transformed_out_size = PadAlignSize(
sizeof(float) * batch * in_tile_area * out_channels * tile_count);
sizeof(T) * batch * in_tile_area * out_channels * tile_count);
const index_t transformed_filter_size =
PadAlignSize(sizeof(float) * in_tile_area * out_channels * in_channels);
PadAlignSize(sizeof(T) * in_tile_area * out_channels * in_channels);
const index_t gemm_pack_size =
transformed_in_size + transformed_filter_size + transformed_filter_size;
......@@ -104,8 +105,8 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
+ transformed_out_size + gemm_pack_size);
const Tensor *padded_in = input;
Tensor tmp_padded_in
(scratch_buffer->Scratch(padded_in_size), DataType::DT_FLOAT);
Tensor tmp_padded_in(scratch_buffer->Scratch(padded_in_size),
DataTypeToEnum<T>::value);
if (is_in_padded) {
tmp_padded_in.Resize({batch, in_channels, padded_in_height,
padded_in_width});
......@@ -115,8 +116,8 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
}
Tensor *padded_out = output;
Tensor tmp_padded_out
(scratch_buffer->Scratch(padded_out_size), DataType::DT_FLOAT);
Tensor tmp_padded_out(scratch_buffer->Scratch(padded_out_size),
DataTypeToEnum<T>::value);
if (is_out_padded) {
padded_out = &tmp_padded_out;
padded_out->Resize({batch, out_channels, padded_out_height,
......@@ -125,17 +126,17 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
auto transformed_in = scratch_buffer->Scratch(transformed_in_size);
auto transformed_out = scratch_buffer->Scratch(transformed_out_size);
auto padded_in_data = padded_in->data<float>();
auto padded_out_data = padded_out->mutable_data<float>();
auto transformed_in_data = transformed_in.mutable_data<float>();
auto transformed_out_data = transformed_out.mutable_data<float>();
auto filter_data = filter->data<float>();
auto padded_in_data = padded_in->data<T>();
auto padded_out_data = padded_out->mutable_data<T>();
auto transformed_in_data = transformed_in.mutable_data<T>();
auto transformed_out_data = transformed_out.mutable_data<T>();
auto filter_data = filter->data<T>();
if (!filter->is_weight() || out_tile_size != out_tile_size_) {
out_tile_size_ = out_tile_size;
transformed_filter_.reset(new Tensor);
transformed_filter_->Resize({in_tile_area, out_channels, in_channels});
auto transformed_filter_data = transformed_filter_->mutable_data<float>();
auto transformed_filter_data = transformed_filter_->mutable_data<T>();
switch (out_tile_size) {
case 2:
TransformFilter4x4(context,
......@@ -181,9 +182,9 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
const index_t scratch_buffer_offset = scratch_buffer->offset();
const index_t transformed_in_size_per_batch =
in_tile_area * in_channels * tile_count * sizeof(float);
in_tile_area * in_channels * tile_count * sizeof(T);
const index_t transformed_out_size_per_batch =
in_tile_area * out_channels * tile_count * sizeof(float);
in_tile_area * out_channels * tile_count * sizeof(T);
for (index_t b = 0; b < batch; ++b) {
scratch_buffer->Rewind(scratch_buffer_offset);
......@@ -194,10 +195,11 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
b * transformed_out_size_per_batch,
transformed_out_size_per_batch);
Tensor transformed_in_this_batch(transformed_in_slice, DataType::DT_FLOAT);
Tensor transformed_in_this_batch(transformed_in_slice,
DataTypeToEnum<T>::value);
transformed_in_this_batch.Resize({in_tile_area, in_channels, tile_count});
Tensor
transformed_out_this_batch(transformed_out_slice, DataType::DT_FLOAT);
Tensor transformed_out_this_batch(transformed_out_slice,
DataTypeToEnum<T>::value);
transformed_out_this_batch.Resize({in_tile_area, out_channels, tile_count});
gemm_.Compute(context,
......@@ -246,11 +248,12 @@ MaceStatus Conv2dK3x3Winograd::Compute(const OpContext *context,
}
// OCHW => TOC
void Conv2dK3x3Winograd::TransformFilter4x4(const OpContext *context,
const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output) {
template<typename T>
void Conv2dK3x3Winograd<T>::TransformFilter4x4(const OpContext *context,
const T *filter,
const index_t in_channels,
const index_t out_channels,
T *output) {
const index_t stride = out_channels * in_channels;
utils::ThreadPool
......@@ -339,11 +342,12 @@ void Conv2dK3x3Winograd::TransformFilter4x4(const OpContext *context,
⎢ ⎥
⎣ 0 0 1 ⎦
*/
void Conv2dK3x3Winograd::TransformFilter8x8(const OpContext *context,
const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output) {
template<typename T>
void Conv2dK3x3Winograd<T>::TransformFilter8x8(const OpContext *context,
const T *filter,
const index_t in_channels,
const index_t out_channels,
T *output) {
const index_t stride = out_channels * in_channels;
const float G[8][3] = {{1.0f, 0.0f, 0.0f},
......@@ -396,14 +400,15 @@ void Conv2dK3x3Winograd::TransformFilter8x8(const OpContext *context,
}
// NCHW => NTCB (T: in tile pixels, B: tile indices)
void Conv2dK3x3Winograd::TransformInput4x4(const OpContext *context,
const float *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t tile_count,
float *output) {
template<typename T>
void Conv2dK3x3Winograd<T>::TransformInput4x4(const OpContext *context,
const T *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t tile_count,
T *output) {
const index_t stride = in_channels * tile_count;
const index_t in_height_width = in_height * in_width;
const index_t input_batch_size = in_height_width * in_channels;
......@@ -420,14 +425,12 @@ void Conv2dK3x3Winograd::TransformInput4x4(const OpContext *context,
for (index_t h = 0; h < in_height - 2; h += 2) {
for (index_t w = 0; w < in_width - 2; w += 2) {
float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13,
d14,
d15;
d14, d15;
float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
s14,
s15;
s14, s15;
// load tile data
const float *input_ptr = input + n * input_batch_size +
const T *input_ptr = input + n * input_batch_size +
c * in_height_width + h * in_width + w;
d0 = input_ptr[0];
d1 = input_ptr[1];
......@@ -468,7 +471,7 @@ void Conv2dK3x3Winograd::TransformInput4x4(const OpContext *context,
s15 = (d5 - d13) - (d7 - d15);
// store output
float *output_ptr =
T *output_ptr =
output + n * output_batch_size + c * tile_count + tile_index;
output_ptr[0] = s0;
output_ptr[1 * stride] = s1;
......@@ -517,14 +520,15 @@ void Conv2dK3x3Winograd::TransformInput4x4(const OpContext *context,
⎢ ⎥
⎣0 -1 0 21/4 0 -21/4 0 1⎦
*/
void Conv2dK3x3Winograd::TransformInput8x8(const OpContext *context,
const float *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t tile_count,
float *output) {
template<typename T>
void Conv2dK3x3Winograd<T>::TransformInput8x8(const OpContext *context,
const T *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t tile_count,
T *output) {
const index_t stride = in_channels * tile_count;
const index_t in_height_width = in_height * in_width;
const index_t input_batch_size = in_height_width * in_channels;
......@@ -540,7 +544,7 @@ void Conv2dK3x3Winograd::TransformInput8x8(const OpContext *context,
float s[8][8];
for (index_t h = 0; h < in_height - 2; h += 6) {
for (index_t w = 0; w < in_width - 2; w += 6) {
const float *input_ptr = input + n * input_batch_size +
const T *input_ptr = input + n * input_batch_size +
c * in_height_width + h * in_width + w;
for (int i = 0; i < 8; ++i) {
......@@ -575,7 +579,7 @@ void Conv2dK3x3Winograd::TransformInput8x8(const OpContext *context,
input_ptr += in_width;
}
float *output_ptr =
T *output_ptr =
output + n * output_batch_size + c * tile_count + tile_index;
for (int i = 0; i < 8; ++i) {
float d0, d1, d2, d3, d4, d5, d6, d7;
......@@ -616,14 +620,15 @@ void Conv2dK3x3Winograd::TransformInput8x8(const OpContext *context,
}
// NTOB => NToOB => NOHoWo
void Conv2dK3x3Winograd::TransformOutput4x4(const OpContext *context,
const float *input,
index_t batch,
index_t out_height,
index_t out_width,
index_t out_channels,
index_t tile_count,
float *output) {
template<typename T>
void Conv2dK3x3Winograd<T>::TransformOutput4x4(const OpContext *context,
const T *input,
index_t batch,
index_t out_height,
index_t out_width,
index_t out_channels,
index_t tile_count,
T *output) {
const index_t stride = out_channels * tile_count;
const index_t input_batch_size = 16 * stride;
const index_t out_image_size = out_height * out_width;
......@@ -644,7 +649,7 @@ void Conv2dK3x3Winograd::TransformOutput4x4(const OpContext *context,
float s0, s1, s2, s3, s4, s5, s6, s7;
float v0, v1, v2, v3;
const float *input_ptr =
const T *input_ptr =
input + n * input_batch_size + m * tile_count + tile_offset;
d0 = input_ptr[0];
d1 = input_ptr[1 * stride];
......@@ -680,7 +685,7 @@ void Conv2dK3x3Winograd::TransformOutput4x4(const OpContext *context,
v2 = s2 - s4 - s6;
v3 = s3 - s5 - s7;
float *output_ptr = output + n * output_batch_size +
T *output_ptr = output + n * output_batch_size +
m * out_image_size + h * out_width + w;
output_ptr[0] = v0;
output_ptr[1] = v1;
......@@ -710,14 +715,15 @@ void Conv2dK3x3Winograd::TransformOutput4x4(const OpContext *context,
⎢ ⎥
⎣0 1 -1 32 -32 1 -1 1⎦
*/
void Conv2dK3x3Winograd::TransformOutput8x8(const OpContext *context,
const float *input,
index_t batch,
index_t out_height,
index_t out_width,
index_t out_channels,
index_t tile_count,
float *output) {
template<typename T>
void Conv2dK3x3Winograd<T>::TransformOutput8x8(const OpContext *context,
const T *input,
index_t batch,
index_t out_height,
index_t out_width,
index_t out_channels,
index_t tile_count,
T *output) {
const index_t stride = out_channels * tile_count;
const index_t input_batch_size = 64 * stride;
const index_t out_image_size = out_height * out_width;
......@@ -733,7 +739,7 @@ void Conv2dK3x3Winograd::TransformOutput8x8(const OpContext *context,
float s[8][6];
for (index_t h = 0; h < out_height; h += 6) {
for (index_t w = 0; w < out_width; w += 6) {
const float *input_ptr =
const T *input_ptr =
input + n * input_batch_size + m * tile_count + tile_offset;
for (int i = 0; i < 8; ++i) {
float d0, d1, d2, d3, d4, d5, d6, d7;
......@@ -764,7 +770,7 @@ void Conv2dK3x3Winograd::TransformOutput8x8(const OpContext *context,
input_ptr += 8 * stride;
}
float *output_ptr = output + n * output_batch_size +
T *output_ptr = output + n * output_batch_size +
m * out_image_size + h * out_width + w;
for (int i = 0; i < 6; ++i) {
......@@ -803,12 +809,16 @@ void Conv2dK3x3Winograd::TransformOutput8x8(const OpContext *context,
void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK3x3Winograd, delegator::Conv2dParam,
registry, Conv2dK3x3Winograd<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3Winograd));
MACE_REGISTER_BF16_DELEGATOR(
registry, Conv2dK3x3Winograd<BFloat16>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
BFloat16, ImplType::NEON, K3x3Winograd));
}
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_
#define MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_
#ifndef MACE_OPS_ARM_BASE_CONV_2D_3X3_WINOGRAD_H_
#define MACE_OPS_ARM_BASE_CONV_2D_3X3_WINOGRAD_H_
#include <vector>
#include <memory>
......@@ -27,12 +27,12 @@
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {
template<typename T>
class Conv2dK3x3Winograd : public Conv2dBase {
public:
explicit Conv2dK3x3Winograd(const delegator::Conv2dParam &param)
: Conv2dBase(param, sizeof(float)),
: Conv2dBase(param, sizeof(T)),
gemm_(delegator::GemmParam()),
transformed_filter_(nullptr),
out_tile_size_(0) {}
......@@ -47,61 +47,60 @@ class Conv2dK3x3Winograd : public Conv2dBase {
private:
void TransformFilter4x4(const OpContext *context,
const float *filter,
const T *filter,
const index_t in_channels,
const index_t out_channels,
float *output);
T *output);
void TransformFilter8x8(const OpContext *context,
const float *filter,
const T *filter,
const index_t in_channels,
const index_t out_channels,
float *output);
T *output);
void TransformInput4x4(const OpContext *context,
const float *input,
const T *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t tile_count,
float *output);
T *output);
void TransformInput8x8(const OpContext *context,
const float *input,
const T *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t tile_count,
float *output);
T *output);
void TransformOutput4x4(const OpContext *context,
const float *input,
const T *input,
index_t batch,
index_t out_height,
index_t out_width,
index_t out_channels,
index_t tile_count,
float *output);
T *output);
void TransformOutput8x8(const OpContext *context,
const float *input,
const T *input,
index_t batch,
index_t out_height,
index_t out_width,
index_t out_channels,
index_t tile_count,
float *output);
T *output);
Gemm<float> gemm_;
Gemm<T> gemm_;
std::unique_ptr<Tensor> transformed_filter_;
index_t out_tile_size_;
};
} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_FP32_CONV_2D_3X3_WINOGRAD_H_
#endif // MACE_OPS_ARM_BASE_CONV_2D_3X3_WINOGRAD_H_
......@@ -16,6 +16,8 @@
#include <memory>
#include "mace/ops/arm/base/common_neon.h"
namespace mace {
namespace ops {
namespace arm {
......@@ -57,10 +59,155 @@ MaceStatus Conv2dGeneral<T>::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS;
}
template<typename T>
MaceStatus Conv2dGeneral<T>::DoCompute(
const ConvComputeParam &p, const T *filter_data,
const T *input_data, T *output_data,
const std::vector<index_t> &filter_shape) {
const index_t filter_height = filter_shape[2];
const index_t filter_width = filter_shape[3];
const index_t filter_size = filter_height * filter_width;
p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) {
const int stride_h = strides_[0];
const int stride_w = strides_[1];
const int dilation_h = dilations_[0];
const int dilation_w = dilations_[1];
if (m + 3 < p.out_channels) {
T *out_ptr0_base =
output_data + b * p.out_batch_size + m * p.out_image_size;
T *out_ptr1_base = out_ptr0_base + p.out_image_size;
T *out_ptr2_base = out_ptr1_base + p.out_image_size;
T *out_ptr3_base = out_ptr2_base + p.out_image_size;
for (index_t h = 0; h < p.out_height; ++h) {
const index_t ih = h * stride_h;
for (index_t w = 0; w + 3 < p.out_width; w += 4) {
const index_t iw = w * stride_w;
index_t out_offset = h * p.out_width + w;
float32x4_t vo0 = vdupq_n_f32(0.f);
float32x4_t vo1 = vdupq_n_f32(0.f);
float32x4_t vo2 = vdupq_n_f32(0.f);
float32x4_t vo3 = vdupq_n_f32(0.f);
const T *in_ptr_base = input_data + b * p.in_batch_size;
const T *filter_ptr0 =
filter_data + m * p.in_channels * filter_size;
const T *filter_ptr1 = filter_ptr0 + p.in_channels * filter_size;
const T *filter_ptr2 = filter_ptr1 + p.in_channels * filter_size;
const T *filter_ptr3 = filter_ptr2 + p.in_channels * filter_size;
for (index_t c = 0; c < p.in_channels; ++c) {
index_t in_offset = ih * p.in_width + iw;
// calc by row
for (index_t kh = 0; kh < filter_height; ++kh) {
for (index_t kw = 0; kw < filter_width; ++kw) {
const T i0 = in_ptr_base[in_offset + kw * dilation_w];
const T i1 =
in_ptr_base[in_offset + stride_w + kw * dilation_w];
const T i2 =
in_ptr_base[in_offset + 2 * stride_w + kw * dilation_w];
const T i3 =
in_ptr_base[in_offset + 3 * stride_w + kw * dilation_w];
const T f0 = filter_ptr0[kw];
const T f1 = filter_ptr1[kw];
const T f2 = filter_ptr2[kw];
const T f3 = filter_ptr3[kw];
// outch 0
vo0[0] += i0 * f0;
vo0[1] += i1 * f0;
vo0[2] += i2 * f0;
vo0[3] += i3 * f0;
// outch 1
vo1[0] += i0 * f1;
vo1[1] += i1 * f1;
vo1[2] += i2 * f1;
vo1[3] += i3 * f1;
// outch 2
vo2[0] += i0 * f2;
vo2[1] += i1 * f2;
vo2[2] += i2 * f2;
vo2[3] += i3 * f2;
// outch 3
vo3[0] += i0 * f3;
vo3[1] += i1 * f3;
vo3[2] += i2 * f3;
vo3[3] += i3 * f3;
} // kw
in_offset += dilation_h * p.in_width;
filter_ptr0 += filter_width;
filter_ptr1 += filter_width;
filter_ptr2 += filter_width;
filter_ptr3 += filter_width;
} // kh
in_ptr_base += p.in_image_size;
} // c
vst1q(out_ptr0_base + out_offset, vo0);
vst1q(out_ptr1_base + out_offset, vo1);
vst1q(out_ptr2_base + out_offset, vo2);
vst1q(out_ptr3_base + out_offset, vo3);
} // w
} // h
} else {
for (index_t mm = m; mm < p.out_channels; ++mm) {
T *out_ptr0_base =
output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset
const index_t ih = h * stride_h;
const index_t iw = w * stride_w;
// output offset
const index_t out_offset = h * p.out_width + w;
// output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0 = vdupq_n_f32(0.f);
const T *in_ptr_base = input_data + b * p.in_batch_size;
const T *filter_ptr0 =
filter_data + mm * p.in_channels * filter_size;
for (index_t c = 0; c < p.in_channels; ++c) {
index_t in_offset = ih * p.in_width + iw;
for (index_t kh = 0; kh < filter_height; ++kh) {
for (index_t kw = 0; kw < filter_width; ++kw) {
T i0 = in_ptr_base[in_offset + kw * dilation_w];
T i1 = in_ptr_base[in_offset + stride_w +
kw * dilation_w];
T i2 = in_ptr_base[in_offset + 2 * stride_w +
kw * dilation_w];
T i3 = in_ptr_base[in_offset + 3 * stride_w +
kw * dilation_w];
T f0 = filter_ptr0[kw];
// outch 0
vo0[0] += i0 * f0;
vo0[1] += i1 * f0;
vo0[2] += i2 * f0;
vo0[3] += i3 * f0;
} // kw
in_offset += dilation_h * p.in_width;
filter_ptr0 += filter_width;
} // kh
in_ptr_base += p.in_image_size;
} // c
vst1q(out_ptr0_base + out_offset, vo0);
} // w
} // h
} // mm
} // if
} // m
} // b
}, 0, p.batch, 1, 0, p.out_channels, 4);
return MaceStatus::MACE_SUCCESS;
}
void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dGeneral<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::NEON));
MACE_REGISTER_BF16_DELEGATOR(
registry, Conv2dGeneral<BFloat16>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, BFloat16, ImplType::NEON));
}
} // namespace arm
......
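
A note on the per-lane accumulation in Conv2dGeneral::DoCompute above: the four inputs that share one filter tap sit at w, w + stride_w, w + 2 * stride_w and w + 3 * stride_w, so for arbitrary stride and dilation they are gathered as scalars. Only in the stride-1 case are they contiguous; a hedged sketch of that special case:

#include <arm_neon.h>

// Sketch only (stride_w == 1): four adjacent outputs share filter tap f and
// their inputs are contiguous, so one vmlaq_n_f32 replaces the four per-lane
// updates in the general kernel above.
inline float32x4_t accumulate_tap_s1(float32x4_t acc, const float *in, float f) {
  return vmlaq_n_f32(acc, vld1q_f32(in), f);  // acc[i] += in[i] * f
}
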
......@@ -14,10 +14,436 @@
#include "mace/ops/arm/base/depthwise_conv_2d_3x3.h"
#include "mace/ops/arm/base/common_neon.h"
namespace mace {
namespace ops {
namespace arm {
namespace {
template<typename T>
void DepthwiseConv2d3x3Pixel(const T *in_base,
const T *filter,
const index_t out_h,
const index_t out_w,
const index_t in_h_start,
const index_t in_w_start,
const index_t out_width,
const index_t in_height,
const index_t in_width,
T *out_base) {
const index_t filter_width = 3;
float sum = 0.0f;
index_t in_h = in_h_start;
const T *in = in_base + in_h * in_width;
const T *filter_ptr = filter;
if (in_h >= 0 && in_h < in_height) {
index_t in_w = in_w_start;
if (in_w >= 0 && in_w < in_width) {
sum += in[in_w] * filter_ptr[0];
}
in_w++;
if (in_w >= 0 && in_w < in_width) {
sum += in[in_w] * filter_ptr[1];
}
in_w++;
if (in_w >= 0 && in_w < in_width) {
sum += in[in_w] * filter_ptr[2];
}
}
in_h++;
in += in_width;
filter_ptr += filter_width;
if (in_h >= 0 && in_h < in_height) {
index_t in_w = in_w_start;
if (in_w >= 0 && in_w < in_width) {
sum += in[in_w] * filter_ptr[0];
}
in_w++;
if (in_w >= 0 && in_w < in_width) {
sum += in[in_w] * filter_ptr[1];
}
in_w++;
if (in_w >= 0 && in_w < in_width) {
sum += in[in_w] * filter_ptr[2];
}
}
in_h++;
in += in_width;
filter_ptr += filter_width;
if (in_h >= 0 && in_h < in_height) {
index_t in_w = in_w_start;
if (in_w >= 0 && in_w < in_width) {
sum += in[in_w] * filter_ptr[0];
}
in_w++;
if (in_w >= 0 && in_w < in_width) {
sum += in[in_w] * filter_ptr[1];
}
in_w++;
if (in_w >= 0 && in_w < in_width) {
sum += in[in_w] * filter_ptr[2];
}
}
out_base[out_h * out_width + out_w] = sum;
}
} // namespace
template<typename T>
MaceStatus DepthwiseConv2dK3x3S1<T>::DoCompute(
const DepthwiseConvComputeParam &p, const T *filter_data,
const T *input_data, T *output_data) {
p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) {
const index_t c = m / p.multiplier;
const index_t multi_index = m % p.multiplier;
auto filter_ptr = filter_data + multi_index * p.in_channels * 9 + c * 9;
auto in_base = input_data + b * p.in_batch_size + c * p.in_image_size;
auto out_base = output_data + b * p.out_batch_size +
m * p.out_image_size;
index_t h, w;
// top
for (h = 0; h < p.valid_h_start; ++h) {
for (w = 0; w < p.out_width; ++w) {
DepthwiseConv2d3x3Pixel(in_base,
filter_ptr,
h,
w,
h - p.pad_top,
w - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
out_base);
}
}
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x4_t vf00, vf01, vf02;
vf00 = vld1q(filter_ptr);
vf01 = vld1q(filter_ptr + 3);
vf02 = vld1q(filter_ptr + 5);
for (h = p.valid_h_start; h + 1 < p.valid_h_stop; h += 2) {
// left
for (w = 0; w < p.valid_w_start; ++w) {
DepthwiseConv2d3x3Pixel(in_base,
filter_ptr,
h,
w,
h - p.pad_top,
w - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
out_base);
DepthwiseConv2d3x3Pixel(in_base,
filter_ptr,
h + 1,
w,
h + 1 - p.pad_top,
w - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
out_base);
}
for (w = p.valid_w_start; w + 3 < p.valid_w_stop; w += 4) {
// input (4 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02, vi0n;
float32x4_t vi10, vi11, vi12, vi1n;
float32x4_t vi20, vi21, vi22, vi2n;
float32x4_t vi30, vi31, vi32, vi3n;
// output (1 outch x 2 height x 4 width): vo_outch_height
float32x4_t vo00, vo01;
// load input
index_t in_h = h - p.pad_top;
index_t in_w = w - p.pad_left;
index_t in_offset = in_h * p.in_width + in_w;
vi00 = vld1q(in_base + in_offset);
vi0n = vld1q(in_base + in_offset + 4);
vi10 = vld1q(in_base + in_offset + p.in_width);
vi1n = vld1q(in_base + in_offset + p.in_width + 4);
vi20 = vld1q(in_base + in_offset + 2 * p.in_width);
vi2n = vld1q(in_base + in_offset + 2 * p.in_width + 4);
vi30 = vld1q(in_base + in_offset + 3 * p.in_width);
vi3n = vld1q(in_base + in_offset + 3 * p.in_width + 4);
vi01 = vextq_f32(vi00, vi0n, 1);
vi02 = vextq_f32(vi00, vi0n, 2);
vi11 = vextq_f32(vi10, vi1n, 1);
vi12 = vextq_f32(vi10, vi1n, 2);
vi21 = vextq_f32(vi20, vi2n, 1);
vi22 = vextq_f32(vi20, vi2n, 2);
vi31 = vextq_f32(vi30, vi3n, 1);
vi32 = vextq_f32(vi30, vi3n, 2);
// load output
index_t out_offset = h * p.out_width + w;
vo00 = vld1q(out_base + out_offset);
vo01 = vld1q(out_base + out_offset + p.out_width);
#if defined(__aarch64__)
// outch 0, height 0
vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0);
vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1);
vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2);
vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0);
vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1);
vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2);
vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 1);
vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 2);
vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 3);
// outch 0, height 1
vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0);
vo01 = vfmaq_laneq_f32(vo01, vi11, vf00, 1);
vo01 = vfmaq_laneq_f32(vo01, vi12, vf00, 2);
vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0);
vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1);
vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2);
vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 1);
vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 2);
vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3);
#else
// outch 0, height 0
vo00 = vmlaq_lane_f32(vo00, vi00, vget_low_f32(vf00), 0);
vo00 = vmlaq_lane_f32(vo00, vi01, vget_low_f32(vf00), 1);
vo00 = vmlaq_lane_f32(vo00, vi02, vget_high_f32(vf00), 0);
vo00 = vmlaq_lane_f32(vo00, vi10, vget_low_f32(vf01), 0);
vo00 = vmlaq_lane_f32(vo00, vi11, vget_low_f32(vf01), 1);
vo00 = vmlaq_lane_f32(vo00, vi12, vget_high_f32(vf01), 0);
vo00 = vmlaq_lane_f32(vo00, vi20, vget_low_f32(vf02), 1);
vo00 = vmlaq_lane_f32(vo00, vi21, vget_high_f32(vf02), 0);
vo00 = vmlaq_lane_f32(vo00, vi22, vget_high_f32(vf02), 1);
// outch 0, height 1
vo01 = vmlaq_lane_f32(vo01, vi10, vget_low_f32(vf00), 0);
vo01 = vmlaq_lane_f32(vo01, vi11, vget_low_f32(vf00), 1);
vo01 = vmlaq_lane_f32(vo01, vi12, vget_high_f32(vf00), 0);
vo01 = vmlaq_lane_f32(vo01, vi20, vget_low_f32(vf01), 0);
vo01 = vmlaq_lane_f32(vo01, vi21, vget_low_f32(vf01), 1);
vo01 = vmlaq_lane_f32(vo01, vi22, vget_high_f32(vf01), 0);
vo01 = vmlaq_lane_f32(vo01, vi30, vget_low_f32(vf02), 1);
vo01 = vmlaq_lane_f32(vo01, vi31, vget_high_f32(vf02), 0);
vo01 = vmlaq_lane_f32(vo01, vi32, vget_high_f32(vf02), 1);
#endif
vst1q(out_base + out_offset, vo00);
vst1q(out_base + out_offset + p.out_width, vo01);
} // w
// right
for (; w < p.out_width; ++w) {
DepthwiseConv2d3x3Pixel(in_base,
filter_ptr,
h,
w,
h - p.pad_top,
w - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
out_base);
DepthwiseConv2d3x3Pixel(in_base,
filter_ptr,
h + 1,
w,
h + 1 - p.pad_top,
w - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
out_base);
}
} // h
// bottom
for (; h < p.out_height; ++h) {
for (w = 0; w < p.out_width; ++w) {
DepthwiseConv2d3x3Pixel(in_base,
filter_ptr,
h,
w,
h - p.pad_top,
w - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
out_base);
}
}
} // m
} // b
}, 0, p.batch, 1, 0, p.out_channels, 1); // threadpool
return MaceStatus::MACE_SUCCESS;
}
template<typename T>
MaceStatus DepthwiseConv2dK3x3S2<T>::DoCompute(
const DepthwiseConvComputeParam &p, const T *filter_data,
const T *input_data, T *output_data) {
p.thread_pool.Compute2D(
[=](index_t start0, index_t end0, index_t step0, index_t start1,
index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) {
index_t c = m / p.multiplier;
index_t multi_index = m % p.multiplier;
auto filter_ptr = filter_data + multi_index * p.in_channels * 9 +
c * 9;
auto in_base = input_data + b * p.in_batch_size +
c * p.in_image_size;
auto out_base = output_data + b * p.out_batch_size +
m * p.out_image_size;
index_t h, w;
// top
for (h = 0; h < p.valid_h_start; ++h) {
for (w = 0; w < p.out_width; ++w) {
DepthwiseConv2d3x3Pixel(in_base,
filter_ptr,
h,
w,
h * 2 - p.pad_top,
w * 2 - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
out_base);
}
}
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x4_t vf00, vf01, vf02;
vf00 = vld1q(filter_ptr);
vf01 = vld1q(filter_ptr + 3);
vf02 = vld1q(filter_ptr + 5);
for (h = p.valid_h_start; h < p.valid_h_stop; ++h) {
// left
for (w = 0; w < p.valid_w_start; ++w) {
DepthwiseConv2d3x3Pixel(in_base,
filter_ptr,
h,
w,
h * 2 - p.pad_top,
w * 2 - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
out_base);
}
for (w = p.valid_w_start; w + 3 < p.valid_w_stop; w += 4) {
float32x4x2_t vi0, vi1, vi2;
float32x4_t vi0n, vi1n, vi2n;
// input (3 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02;
float32x4_t vi10, vi11, vi12;
float32x4_t vi20, vi21, vi22;
// output (1 outch x 1 height x 4 width): vo
float32x4_t vo;
// load input
index_t in_h = h * 2 - p.pad_top;
index_t in_w = w * 2 - p.pad_left;
index_t in_offset = in_h * p.in_width + in_w;
vi0 = vld2q(in_base + in_offset); // [0.2.4.6, 1.3.5.7]
vi1 = vld2q(in_base + in_offset + p.in_width);
vi2 = vld2q(in_base + in_offset + 2 * p.in_width);
vi0n = vld1q(in_base + in_offset + 8); // [8.9.10.11]
vi1n = vld1q(in_base + in_offset + p.in_width + 8);
vi2n = vld1q(in_base + in_offset + 2 * p.in_width + 8);
// load output
index_t out_offset = h * p.out_width + w;
vo = vld1q(out_base + out_offset);
vi00 = vi0.val[0]; // [0.2.4.6]
vi01 = vi0.val[1]; // [1.3.5.7]
vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8]
vi10 = vi1.val[0];
vi11 = vi1.val[1];
vi12 = vextq_f32(vi10, vi1n, 1);
vi20 = vi2.val[0];
vi21 = vi2.val[1];
vi22 = vextq_f32(vi20, vi2n, 1);
#if defined(__aarch64__)
// outch 0, height 0
vo = vfmaq_laneq_f32(vo, vi00, vf00, 0);
vo = vfmaq_laneq_f32(vo, vi01, vf00, 1);
vo = vfmaq_laneq_f32(vo, vi02, vf00, 2);
vo = vfmaq_laneq_f32(vo, vi10, vf01, 0);
vo = vfmaq_laneq_f32(vo, vi11, vf01, 1);
vo = vfmaq_laneq_f32(vo, vi12, vf01, 2);
vo = vfmaq_laneq_f32(vo, vi20, vf02, 1);
vo = vfmaq_laneq_f32(vo, vi21, vf02, 2);
vo = vfmaq_laneq_f32(vo, vi22, vf02, 3);
#else
// outch 0, height 0
vo = vmlaq_lane_f32(vo, vi00, vget_low_f32(vf00), 0);
vo = vmlaq_lane_f32(vo, vi01, vget_low_f32(vf00), 1);
vo = vmlaq_lane_f32(vo, vi02, vget_high_f32(vf00), 0);
vo = vmlaq_lane_f32(vo, vi10, vget_low_f32(vf01), 0);
vo = vmlaq_lane_f32(vo, vi11, vget_low_f32(vf01), 1);
vo = vmlaq_lane_f32(vo, vi12, vget_high_f32(vf01), 0);
vo = vmlaq_lane_f32(vo, vi20, vget_low_f32(vf02), 1);
vo = vmlaq_lane_f32(vo, vi21, vget_high_f32(vf02), 0);
vo = vmlaq_lane_f32(vo, vi22, vget_high_f32(vf02), 1);
#endif
vst1q(out_base + out_offset, vo);
} // w
// right
for (; w < p.out_width; ++w) {
DepthwiseConv2d3x3Pixel(in_base,
filter_ptr,
h,
w,
h * 2 - p.pad_top,
w * 2 - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
out_base);
}
} // h
// bottom
for (; h < p.out_height; ++h) {
for (w = 0; w < p.out_width; ++w) {
DepthwiseConv2d3x3Pixel(in_base,
filter_ptr,
h,
w,
h * 2 - p.pad_top,
w * 2 - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
out_base);
}
}
} // m
} // b
},
0, p.batch, 1, 0, p.out_channels, 1);
return MaceStatus::MACE_SUCCESS;
}
void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S1<float>, delegator::DepthwiseConv2dParam,
......@@ -27,6 +453,17 @@ void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
registry, DepthwiseConv2dK3x3S2<float>, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
MACE_REGISTER_BF16_DELEGATOR(
registry, DepthwiseConv2dK3x3S1<BFloat16>,
delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
BFloat16, ImplType::NEON, K3x3S1));
MACE_REGISTER_BF16_DELEGATOR(
registry, DepthwiseConv2dK3x3S2<BFloat16>,
delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
BFloat16, ImplType::NEON, K3x3S2));
}
} // namespace arm
......
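
A sketch of the stride-2 windowing trick used in DepthwiseConv2dK3x3S2 above: vld2q de-interleaves eight consecutive inputs into even lanes [0,2,4,6] and odd lanes [1,3,5,7], and one vext against the next block supplies [2,4,6,8], giving the three taps of a 3-wide window for four stride-2 outputs at once.

#include <arm_neon.h>

// Sketch only: build the three stride-2 tap vectors for one input row.
void stride2_row_taps(const float *in, float32x4_t taps[3]) {
  float32x4x2_t v = vld2q_f32(in);         // {[0,2,4,6], [1,3,5,7]}
  float32x4_t next = vld1q_f32(in + 8);    // [8,9,10,11]
  taps[0] = v.val[0];                      // x0, x2, x4, x6
  taps[1] = v.val[1];                      // x1, x3, x5, x7
  taps[2] = vextq_f32(v.val[0], next, 1);  // x2, x4, x6, x8
}
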
This diff is collapsed.
......@@ -110,46 +110,16 @@ class Gemm : public delegator::Gemm {
void UnpackOutput(const T *packed_output,
MatrixMap<T> *output);
template<int RowBlockSize, int ColBlockSize>
void Unpack(const T *packed_output,
MatrixMap<T> *output) {
const index_t rows = output->rows();
const index_t cols = output->cols();
for (index_t r = 0; r < rows; ++r) {
for (index_t c = 0; c < cols; ++c) {
*output->data(r, c) = packed_output[r * ColBlockSize + c];
}
}
}
template<int WidthBlockSize, int DepthBlockSize>
void Pack(const MatrixMap<const T> &matrix,
MatrixMajor dst_major,
T *packed_matrix) {
const index_t rows = matrix.rows();
const index_t cols = matrix.cols();
index_t depth = cols;
if (dst_major == RowMajor) {
// rhs
depth = rows;
}
const index_t depth_padded = RoundUp(depth, static_cast<index_t>(4));
memset(static_cast<void *>(packed_matrix), 0,
sizeof(T) * WidthBlockSize * depth_padded);
if (dst_major == ColMajor) {
for (index_t c = 0; c < cols; ++c) {
for (index_t r = 0; r < rows; ++r) {
packed_matrix[c * WidthBlockSize + r] = matrix(r, c);
}
}
} else {
for (index_t r = 0; r < rows; ++r) {
for (index_t c = 0; c < cols; ++c) {
packed_matrix[r * WidthBlockSize + c] = matrix(r, c);
}
}
}
}
void Unpack4x8(const T *packed_output, MatrixMap<T> *output);
void Unpack8x8(const T *packed_output, MatrixMap<T> *output);
void Pack4x4(const MatrixMap<const T> &matrix,
MatrixMajor dst_major,
T *packed_matrix);
void Pack8x4(const MatrixMap<const T> &matrix,
MatrixMajor dst_major,
T *packed_matrix);
private:
Buffer pack_cache_;
......
This diff is collapsed.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/gemm.h"
#include <arm_neon.h>
#include <algorithm>
#include <utility>
#include "mace/port/env.h"
namespace mace {
namespace ops {
namespace arm {
template <>
void Gemm<BFloat16>::ComputeBlock(const BFloat16 *packed_lhs_data,
const BFloat16 *packed_rhs_data,
const index_t depth_padded,
BFloat16 *packed_output_data) {
const BFloat16 *lhs_ptr = packed_lhs_data;
const BFloat16 *rhs_ptr = packed_rhs_data;
const index_t depth_block_count = depth_padded / 4;
#ifdef __aarch64__
// Register layout: (8x4) x (4x8)
//
// +--------+--------+
// | v8 ... | v9 ... |
// Rhs +--------+--------+
// | v10... | v11... |
// +--------+--------+
// | v12... | v13... |
// +--------+--------+
// | v14... | v15... |
// +--------+--------+
//
// Lhs
//
// +----+----+----+----+ - - +--------+--------+
// | v0 | v2 | v4 | v6 | | v16... | v17... |
// | . | | | | | v18... | v19... |
// | . | | | | | v20... | v21... |
// | . | | | | | v22... | v23... |
// +----+----+----+----+      +--------+--------+
// | v1 | v3 | v5 | v7 | | v24... | v25... |
// | . | | | | | v26... | v27... |
// | . | | | | | v28... | v29... |
// | . | | | | | v30... | v31... |
// +----+----+----+----+      +--------+--------+
//
// Accumulator
//
if (depth_block_count > 0) {
index_t r_depth_block_count = depth_block_count;
// just make compiler happy
MACE_UNUSED(r_depth_block_count);
asm volatile(
"dup v16.4s, wzr \n"
"dup v17.4s, wzr \n"
"dup v18.4s, wzr \n"
"dup v19.4s, wzr \n"
"dup v20.4s, wzr \n"
"dup v21.4s, wzr \n"
"dup v22.4s, wzr \n"
"dup v23.4s, wzr \n"
"dup v24.4s, wzr \n"
"dup v25.4s, wzr \n"
"dup v26.4s, wzr \n"
"dup v27.4s, wzr \n"
"dup v28.4s, wzr \n"
"dup v29.4s, wzr \n"
"dup v30.4s, wzr \n"
"dup v31.4s, wzr \n"
// prologue
"ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [%[lhs_ptr]], #32 \n"
"shll v0.4s, v0.4h, #16 \n"
"shll v1.4s, v1.4h, #16 \n"
"shll v2.4s, v2.4h, #16 \n"
"shll v3.4s, v3.4h, #16 \n"
"ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [%[lhs_ptr]], #32 \n"
"shll v4.4s, v4.4h, #16 \n"
"shll v5.4s, v5.4h, #16 \n"
"shll v6.4s, v6.4h, #16 \n"
"shll v7.4s, v7.4h, #16 \n"
"ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [%[rhs_ptr]], #32 \n"
"shll v8.4s, v8.4h, #16 \n"
"shll v9.4s, v9.4h, #16 \n"
"shll v10.4s, v10.4h, #16 \n"
"shll v11.4s, v11.4h, #16 \n"
"ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [%[rhs_ptr]], #32 \n"
"shll v12.4s, v12.4h, #16 \n"
"shll v13.4s, v13.4h, #16 \n"
"shll v14.4s, v14.4h, #16 \n"
"shll v15.4s, v15.4h, #16 \n"
"subs %[r_depth_block_count], %[r_depth_block_count], #1 \n"
"beq 1f\n"
"0: \n"
"fmla v16.4s, v8.4s, v0.s[0] \n"
"fmla v17.4s, v9.4s, v0.s[0] \n"
"fmla v18.4s, v8.4s, v0.s[1] \n"
"fmla v19.4s, v9.4s, v0.s[1] \n"
"fmla v20.4s, v8.4s, v0.s[2] \n"
"fmla v21.4s, v9.4s, v0.s[2] \n"
"fmla v22.4s, v8.4s, v0.s[3] \n"
"fmla v23.4s, v9.4s, v0.s[3] \n"
"ld1 {v0.4h}, [%[lhs_ptr]], #8 \n"
"shll v0.4s, v0.4h, #16 \n"
"fmla v24.4s, v8.4s, v1.s[0] \n"
"fmla v25.4s, v9.4s, v1.s[0] \n"
"fmla v26.4s, v8.4s, v1.s[1] \n"
"fmla v27.4s, v9.4s, v1.s[1] \n"
"fmla v28.4s, v8.4s, v1.s[2] \n"
"fmla v29.4s, v9.4s, v1.s[2] \n"
"fmla v30.4s, v8.4s, v1.s[3] \n"
"fmla v31.4s, v9.4s, v1.s[3] \n"
"ld1 {v1.4h}, [%[lhs_ptr]], #8 \n"
"shll v1.4s, v1.4h, #16 \n"
"ld1 {v8.4h, v9.4h}, [%[rhs_ptr]], #16 \n"
"shll v8.4s, v8.4h, #16 \n"
"shll v9.4s, v9.4h, #16 \n"
"fmla v16.4s, v10.4s, v2.s[0] \n"
"fmla v17.4s, v11.4s, v2.s[0] \n"
"fmla v18.4s, v10.4s, v2.s[1] \n"
"fmla v19.4s, v11.4s, v2.s[1] \n"
"fmla v20.4s, v10.4s, v2.s[2] \n"
"fmla v21.4s, v11.4s, v2.s[2] \n"
"fmla v22.4s, v10.4s, v2.s[3] \n"
"fmla v23.4s, v11.4s, v2.s[3] \n"
"ld1 {v2.4h}, [%[lhs_ptr]], #8 \n"
"shll v2.4s, v2.4h, #16 \n"
"fmla v24.4s, v10.4s, v3.s[0] \n"
"fmla v25.4s, v11.4s, v3.s[0] \n"
"fmla v26.4s, v10.4s, v3.s[1] \n"
"fmla v27.4s, v11.4s, v3.s[1] \n"
"fmla v28.4s, v10.4s, v3.s[2] \n"
"fmla v29.4s, v11.4s, v3.s[2] \n"
"fmla v30.4s, v10.4s, v3.s[3] \n"
"fmla v31.4s, v11.4s, v3.s[3] \n"
"ld1 {v3.4h}, [%[lhs_ptr]], #8 \n"
"shll v3.4s, v3.4h, #16 \n"
"ld1 {v10.4h, v11.4h}, [%[rhs_ptr]], #16 \n"
"shll v10.4s, v10.4h, #16 \n"
"shll v11.4s, v11.4h, #16 \n"
"fmla v16.4s, v12.4s, v4.s[0] \n"
"fmla v17.4s, v13.4s, v4.s[0] \n"
"fmla v18.4s, v12.4s, v4.s[1] \n"
"fmla v19.4s, v13.4s, v4.s[1] \n"
"fmla v20.4s, v12.4s, v4.s[2] \n"
"fmla v21.4s, v13.4s, v4.s[2] \n"
"fmla v22.4s, v12.4s, v4.s[3] \n"
"fmla v23.4s, v13.4s, v4.s[3] \n"
"ld1 {v4.4h}, [%[lhs_ptr]], #8 \n"
"shll v4.4s, v4.4h, #16 \n"
"fmla v24.4s, v12.4s, v5.s[0] \n"
"fmla v25.4s, v13.4s, v5.s[0] \n"
"fmla v26.4s, v12.4s, v5.s[1] \n"
"fmla v27.4s, v13.4s, v5.s[1] \n"
"fmla v28.4s, v12.4s, v5.s[2] \n"
"fmla v29.4s, v13.4s, v5.s[2] \n"
"fmla v30.4s, v12.4s, v5.s[3] \n"
"fmla v31.4s, v13.4s, v5.s[3] \n"
"ld1 {v5.4h}, [%[lhs_ptr]], #8 \n"
"shll v5.4s, v5.4h, #16 \n"
"ld1 {v12.4h, v13.4h}, [%[rhs_ptr]], #16 \n"
"shll v12.4s, v12.4h, #16 \n"
"shll v13.4s, v13.4h, #16 \n"
"fmla v16.4s, v14.4s, v6.s[0] \n"
"fmla v17.4s, v15.4s, v6.s[0] \n"
"fmla v18.4s, v14.4s, v6.s[1] \n"
"fmla v19.4s, v15.4s, v6.s[1] \n"
"fmla v20.4s, v14.4s, v6.s[2] \n"
"fmla v21.4s, v15.4s, v6.s[2] \n"
"fmla v22.4s, v14.4s, v6.s[3] \n"
"fmla v23.4s, v15.4s, v6.s[3] \n"
"ld1 {v6.4h}, [%[lhs_ptr]], #8 \n"
"shll v6.4s, v6.4h, #16 \n"
"subs %[r_depth_block_count], %[r_depth_block_count], #1 \n"
"fmla v24.4s, v14.4s, v7.s[0] \n"
"fmla v25.4s, v15.4s, v7.s[0] \n"
"fmla v26.4s, v14.4s, v7.s[1] \n"
"fmla v27.4s, v15.4s, v7.s[1] \n"
"fmla v28.4s, v14.4s, v7.s[2] \n"
"fmla v29.4s, v15.4s, v7.s[2] \n"
"fmla v30.4s, v14.4s, v7.s[3] \n"
"fmla v31.4s, v15.4s, v7.s[3] \n"
"ld1 {v7.4h}, [%[lhs_ptr]], #8 \n"
"shll v7.4s, v7.4h, #16 \n"
"ld1 {v14.4h, v15.4h}, [%[rhs_ptr]], #16 \n"
"shll v14.4s, v14.4h, #16 \n"
"shll v15.4s, v15.4h, #16 \n"
"bne 0b \n"
// epilogue
"1:\n"
"fmla v16.4s, v8.4s, v0.s[0] \n"
"fmla v17.4s, v9.4s, v0.s[0] \n"
"fmla v18.4s, v8.4s, v0.s[1] \n"
"fmla v19.4s, v9.4s, v0.s[1] \n"
"fmla v20.4s, v8.4s, v0.s[2] \n"
"fmla v21.4s, v9.4s, v0.s[2] \n"
"fmla v22.4s, v8.4s, v0.s[3] \n"
"fmla v23.4s, v9.4s, v0.s[3] \n"
"fmla v24.4s, v8.4s, v1.s[0] \n"
"fmla v25.4s, v9.4s, v1.s[0] \n"
"fmla v26.4s, v8.4s, v1.s[1] \n"
"fmla v27.4s, v9.4s, v1.s[1] \n"
"fmla v28.4s, v8.4s, v1.s[2] \n"
"fmla v29.4s, v9.4s, v1.s[2] \n"
"fmla v30.4s, v8.4s, v1.s[3] \n"
"fmla v31.4s, v9.4s, v1.s[3] \n"
"fmla v16.4s, v10.4s, v2.s[0] \n"
"fmla v17.4s, v11.4s, v2.s[0] \n"
"fmla v18.4s, v10.4s, v2.s[1] \n"
"fmla v19.4s, v11.4s, v2.s[1] \n"
"fmla v20.4s, v10.4s, v2.s[2] \n"
"fmla v21.4s, v11.4s, v2.s[2] \n"
"fmla v22.4s, v10.4s, v2.s[3] \n"
"fmla v23.4s, v11.4s, v2.s[3] \n"
"fmla v24.4s, v10.4s, v3.s[0] \n"
"fmla v25.4s, v11.4s, v3.s[0] \n"
"fmla v26.4s, v10.4s, v3.s[1] \n"
"fmla v27.4s, v11.4s, v3.s[1] \n"
"fmla v28.4s, v10.4s, v3.s[2] \n"
"fmla v29.4s, v11.4s, v3.s[2] \n"
"fmla v30.4s, v10.4s, v3.s[3] \n"
"fmla v31.4s, v11.4s, v3.s[3] \n"
"fmla v16.4s, v12.4s, v4.s[0] \n"
"fmla v17.4s, v13.4s, v4.s[0] \n"
"fmla v18.4s, v12.4s, v4.s[1] \n"
"fmla v19.4s, v13.4s, v4.s[1] \n"
"fmla v20.4s, v12.4s, v4.s[2] \n"
"fmla v21.4s, v13.4s, v4.s[2] \n"
"fmla v22.4s, v12.4s, v4.s[3] \n"
"fmla v23.4s, v13.4s, v4.s[3] \n"
"fmla v24.4s, v12.4s, v5.s[0] \n"
"fmla v25.4s, v13.4s, v5.s[0] \n"
"fmla v26.4s, v12.4s, v5.s[1] \n"
"fmla v27.4s, v13.4s, v5.s[1] \n"
"fmla v28.4s, v12.4s, v5.s[2] \n"
"fmla v29.4s, v13.4s, v5.s[2] \n"
"fmla v30.4s, v12.4s, v5.s[3] \n"
"fmla v31.4s, v13.4s, v5.s[3] \n"
"fmla v16.4s, v14.4s, v6.s[0] \n"
"fmla v17.4s, v15.4s, v6.s[0] \n"
"fmla v18.4s, v14.4s, v6.s[1] \n"
"fmla v19.4s, v15.4s, v6.s[1] \n"
"fmla v20.4s, v14.4s, v6.s[2] \n"
"fmla v21.4s, v15.4s, v6.s[2] \n"
"fmla v22.4s, v14.4s, v6.s[3] \n"
"fmla v23.4s, v15.4s, v6.s[3] \n"
"fmla v24.4s, v14.4s, v7.s[0] \n"
"fmla v25.4s, v15.4s, v7.s[0] \n"
"fmla v26.4s, v14.4s, v7.s[1] \n"
"fmla v27.4s, v15.4s, v7.s[1] \n"
"fmla v28.4s, v14.4s, v7.s[2] \n"
"fmla v29.4s, v15.4s, v7.s[2] \n"
"fmla v30.4s, v14.4s, v7.s[3] \n"
"fmla v31.4s, v15.4s, v7.s[3] \n"
"shrn v16.4h, v16.4s, #16 \n"
"shrn v17.4h, v17.4s, #16 \n"
"shrn v18.4h, v18.4s, #16 \n"
"shrn v19.4h, v19.4s, #16 \n"
"st1 {v16.4h, v17.4h, v18.4h, v19.4h}, [%[packed_output_data]], #32 \n"
"shrn v20.4h, v20.4s, #16 \n"
"shrn v21.4h, v21.4s, #16 \n"
"shrn v22.4h, v22.4s, #16 \n"
"shrn v23.4h, v23.4s, #16 \n"
"st1 {v20.4h, v21.4h, v22.4h, v23.4h}, [%[packed_output_data]], #32 \n"
"shrn v24.4h, v24.4s, #16 \n"
"shrn v25.4h, v25.4s, #16 \n"
"shrn v26.4h, v26.4s, #16 \n"
"shrn v27.4h, v27.4s, #16 \n"
"st1 {v24.4h, v25.4h, v26.4h, v27.4h}, [%[packed_output_data]], #32 \n"
"shrn v28.4h, v28.4s, #16 \n"
"shrn v29.4h, v29.4s, #16 \n"
"shrn v30.4h, v30.4s, #16 \n"
"shrn v31.4h, v31.4s, #16 \n"
"st1 {v28.4h, v29.4h, v30.4h, v31.4h}, [%[packed_output_data]], #32 \n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[packed_output_data] "+r"(packed_output_data),
[r_depth_block_count] "+r"(r_depth_block_count)
: // inputs
: // clobbers
"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
"v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
"v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28",
"v29", "v30", "v31");
}
#else // armeabi-v7a
// Register layout: (4x4) x (4x8)
//
// +--------+--------+
// | q4 ... | q5 ... |
// Rhs +--------+--------+
// | q6 ... | q7 ... |
// +--------+--------+
// | q4 ... | q5 ... |
// +--------+--------+
// | q6 ... | q7 ... |
// +--------+--------+
//
// Lhs
//
// +----+----+----+----+ - - +--------+--------+
// | q0 | q1 | q2 | q3 | | q8... | q9... |
// | . | | | | | q10... | q11... |
// | . | | | | | q12... | q13... |
// | . | | | | | q14... | q15... |
// +----+----+----+----+ +--------+--------+
//
// Accumulator
//
if (depth_block_count > 0) {
index_t r_depth_block_count = depth_block_count;
// just make compiler happy
MACE_UNUSED(r_depth_block_count);
asm volatile(
"mov r0, #0\n"
"vdup.f32 q8, r0 \n"
"vdup.f32 q9, r0 \n"
"vdup.f32 q10, r0 \n"
"vdup.f32 q11, r0 \n"
"vdup.f32 q12, r0 \n"
"vdup.f32 q13, r0 \n"
"vdup.f32 q14, r0 \n"
"vdup.f32 q15, r0 \n"
// prologue
"vld1.u16 {d0-d3}, [%[lhs_ptr]]! \n"
"vshll.u16 q3, d3, #16 \n"
"vshll.u16 q2, d2, #16 \n"
"vshll.u16 q1, d1, #16 \n"
"vshll.u16 q0, d0, #16 \n"
"vld1.u16 {d8-d11}, [%[rhs_ptr]]! \n"
"vshll.u16 q7, d11, #16 \n"
"vshll.u16 q6, d10, #16 \n"
"vshll.u16 q5, d9, #16 \n"
"vshll.u16 q4, d8, #16 \n"
"subs %[r_depth_block_count], %[r_depth_block_count], #1 \n"
"beq 1f\n"
"0: \n"
"vmla.f32 q8, q4, d0[0] \n"
"vmla.f32 q9, q5, d0[0] \n"
"vmla.f32 q10, q4, d0[1] \n"
"vmla.f32 q11, q5, d0[1] \n"
"vmla.f32 q12, q4, d1[0] \n"
"vmla.f32 q13, q5, d1[0] \n"
"vmla.f32 q14, q4, d1[1] \n"
"vmla.f32 q15, q5, d1[1] \n"
"vld1.u16 {d0}, [%[lhs_ptr]]! \n"
"vld1.u16 {d8-d9}, [%[rhs_ptr]]! \n"
"vshll.u16 q0, d0, #16 \n"
"vshll.u16 q5, d9, #16 \n"
"vshll.u16 q4, d8, #16 \n"
"vmla.f32 q8, q6, d2[0] \n"
"vmla.f32 q9, q7, d2[0] \n"
"vmla.f32 q10, q6, d2[1] \n"
"vmla.f32 q11, q7, d2[1] \n"
"vmla.f32 q12, q6, d3[0] \n"
"vmla.f32 q13, q7, d3[0] \n"
"vmla.f32 q14, q6, d3[1] \n"
"vmla.f32 q15, q7, d3[1] \n"
"vld1.u16 {d2}, [%[lhs_ptr]]! \n"
"vld1.u16 {d12-d13}, [%[rhs_ptr]]! \n"
"vshll.u16 q1, d2, #16 \n"
"vshll.u16 q7, d13, #16 \n"
"vshll.u16 q6, d12, #16 \n"
"vmla.f32 q8, q4, d4[0] \n"
"vmla.f32 q9, q5, d4[0] \n"
"vmla.f32 q10, q4, d4[1] \n"
"vmla.f32 q11, q5, d4[1] \n"
"vmla.f32 q12, q4, d5[0] \n"
"vmla.f32 q13, q5, d5[0] \n"
"vmla.f32 q14, q4, d5[1] \n"
"vmla.f32 q15, q5, d5[1] \n"
"vld1.u16 {d4}, [%[lhs_ptr]]! \n"
"vld1.u16 {d8-d9}, [%[rhs_ptr]]! \n"
"vshll.u16 q2, d4, #16 \n"
"vshll.u16 q5, d9, #16 \n"
"vshll.u16 q4, d8, #16 \n"
"subs %[r_depth_block_count], %[r_depth_block_count], #1 \n"
"vmla.f32 q8, q6, d6[0] \n"
"vmla.f32 q9, q7, d6[0] \n"
"vmla.f32 q10, q6, d6[1] \n"
"vmla.f32 q11, q7, d6[1] \n"
"vmla.f32 q12, q6, d7[0] \n"
"vmla.f32 q13, q7, d7[0] \n"
"vmla.f32 q14, q6, d7[1] \n"
"vmla.f32 q15, q7, d7[1] \n"
"vld1.u16 {d6}, [%[lhs_ptr]]! \n"
"vld1.u16 {d12-d13}, [%[rhs_ptr]]! \n"
"vshll.u16 q3, d6, #16 \n"
"vshll.u16 q7, d13, #16 \n"
"vshll.u16 q6, d12, #16 \n"
"bne 0b \n"
// epilogue
"1:\n"
"vmla.f32 q8, q4, d0[0] \n"
"vmla.f32 q9, q5, d0[0] \n"
"vmla.f32 q10, q4, d0[1] \n"
"vmla.f32 q11, q5, d0[1] \n"
"vmla.f32 q12, q4, d1[0] \n"
"vmla.f32 q13, q5, d1[0] \n"
"vmla.f32 q14, q4, d1[1] \n"
"vmla.f32 q15, q5, d1[1] \n"
"vld1.u16 {d8-d9}, [%[rhs_ptr]]! \n"
"vshll.u16 q5, d9, #16 \n"
"vshll.u16 q4, d8, #16 \n"
"vmla.f32 q8, q6, d2[0] \n"
"vmla.f32 q9, q7, d2[0] \n"
"vmla.f32 q10, q6, d2[1] \n"
"vmla.f32 q11, q7, d2[1] \n"
"vmla.f32 q12, q6, d3[0] \n"
"vmla.f32 q13, q7, d3[0] \n"
"vmla.f32 q14, q6, d3[1] \n"
"vmla.f32 q15, q7, d3[1] \n"
"vld1.u16 {d12-d13}, [%[rhs_ptr]]! \n"
"vshll.u16 q7, d13, #16 \n"
"vshll.u16 q6, d12, #16 \n"
"vmla.f32 q8, q4, d4[0] \n"
"vmla.f32 q9, q5, d4[0] \n"
"vmla.f32 q10, q4, d4[1] \n"
"vmla.f32 q11, q5, d4[1] \n"
"vmla.f32 q12, q4, d5[0] \n"
"vmla.f32 q13, q5, d5[0] \n"
"vmla.f32 q14, q4, d5[1] \n"
"vmla.f32 q15, q5, d5[1] \n"
"vmla.f32 q8, q6, d6[0] \n"
"vmla.f32 q9, q7, d6[0] \n"
"vmla.f32 q10, q6, d6[1] \n"
"vmla.f32 q11, q7, d6[1] \n"
"vmla.f32 q12, q6, d7[0] \n"
"vmla.f32 q13, q7, d7[0] \n"
"vmla.f32 q14, q6, d7[1] \n"
"vmla.f32 q15, q7, d7[1] \n"
"vshrn.u32 d16, q8, #16 \n"
"vshrn.u32 d17, q9, #16 \n"
"vst1.u16 {d16-d17}, [%[packed_output_data]]! \n"
"vshrn.u32 d20, q10, #16 \n"
"vshrn.u32 d21, q11, #16 \n"
"vst1.u16 {d20-d21}, [%[packed_output_data]]! \n"
"vshrn.u32 d24, q12, #16 \n"
"vshrn.u32 d25, q13, #16 \n"
"vst1.u16 {d24-d25}, [%[packed_output_data]]! \n"
"vshrn.u32 d28, q14, #16 \n"
"vshrn.u32 d29, q15, #16 \n"
"vst1.u16 {d28-d29}, [%[packed_output_data]]! \n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
[packed_output_data] "+r"(packed_output_data),
[r_depth_block_count] "+r"(r_depth_block_count)
: // inputs
: // clobbers
"cc", "memory", "r0", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
#endif
}
} // namespace arm
} // namespace ops
} // namespace mace
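The SHLL #16 / SHRN #16 pairs above are the entire bf16 trick: a bfloat16 value is the upper 16 bits of an IEEE-754 float32, so widening each 16-bit lane by 16 bits reproduces the float, and narrowing the float32 accumulators by 16 bits truncates them back to bfloat16. A minimal scalar sketch of the same conversions (the helper names are illustrative, not MACE API):

#include <cstdint>
#include <cstring>

// Widen: the scalar equivalent of "shll v.4s, v.4h, #16" / "vshll.u16 ... #16".
static inline float bf16_to_f32(uint16_t b) {
  uint32_t bits = static_cast<uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

// Narrow: the scalar equivalent of "shrn v.4h, v.4s, #16" / "vshrn.u32 ... #16"
// (the low mantissa bits are simply dropped; no rounding is applied).
static inline uint16_t f32_to_bf16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);
}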
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <arm_neon.h>
#include <algorithm>
#include "mace/ops/arm/base/activation.h"
namespace mace {
namespace ops {
namespace arm {
template<>
void Activation<float>::ActivateRelu(utils::ThreadPool *thread_pool,
const Tensor *input,
Tensor *output) {
const auto input_data = input->data<float>();
auto output_data = output->mutable_data<float>();
const index_t input_size = input->size();
const float32x4_t vzero = vdupq_n_f32(0.f);
const index_t block_count = input_size / 4;
thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) {
auto input_ptr = input_data + start * 4;
auto output_ptr = output_data + start * 4;
for (index_t i = start; i < end; i += step) {
float32x4_t v = vld1q_f32(input_ptr);
v = vmaxq_f32(v, vzero);
vst1q_f32(output_ptr, v);
input_ptr += 4;
output_ptr += 4;
}
},
0, block_count, 1);
// remain
for (index_t i = block_count * 4; i < input_size; ++i) {
output_data[i] = std::max(0.f, input_data[i]);
}
}
template<>
void Activation<float>::ActivateRelux(utils::ThreadPool *thread_pool,
const Tensor *input,
Tensor *output) {
const auto input_data = input->data<float>();
auto output_data = output->mutable_data<float>();
const index_t input_size = input->size();
const float32x4_t vzero = vdupq_n_f32(0.f);
const float32x4_t vlimit = vdupq_n_f32(limit_);
const index_t block_count = input_size / 4;
thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) {
auto input_ptr = input_data + start * 4;
auto output_ptr = output_data + start * 4;
for (index_t i = start; i < end; i += step) {
float32x4_t v = vld1q_f32(input_ptr);
v = vmaxq_f32(v, vzero);
v = vminq_f32(v, vlimit);
vst1q_f32(output_ptr, v);
input_ptr += 4;
output_ptr += 4;
}
},
0, block_count, 1);
// remain
for (index_t i = block_count * 4; i < input_size; ++i) {
output_data[i] = std::max(0.f, std::min(limit_, input_data[i]));
}
}
template<>
void Activation<float>::ActivateLeakyRelu(utils::ThreadPool *thread_pool,
const Tensor *input,
Tensor *output) {
const auto input_data = input->data<float>();
auto output_data = output->mutable_data<float>();
const index_t input_size = input->size();
const float32x4_t vzero = vdupq_n_f32(0.f);
const float32x4_t valpha = vdupq_n_f32(leakyrelu_coefficient_);
const index_t block_count = input_size / 4;
thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) {
auto input_ptr = input_data + start * 4;
auto output_ptr = output_data + start * 4;
for (index_t i = start; i < end; i += step) {
float32x4_t v = vld1q_f32(input_ptr);
float32x4_t u = vminq_f32(v, vzero);
v = vmaxq_f32(v, vzero);
v = vmlaq_f32(v, valpha, u);
vst1q_f32(output_ptr, v);
input_ptr += 4;
output_ptr += 4;
}
},
0, block_count, 1);
// remain
for (index_t i = block_count * 4; i < input_size; ++i) {
output_data[i] = std::max(input_data[i], 0.f) +
std::min(input_data[i], 0.f) * leakyrelu_coefficient_;
}
}
template<>
void Activation<float>::ActivateTanh(utils::ThreadPool *thread_pool,
const Tensor *input,
Tensor *output) {
const auto input_data = input->data<float>();
auto output_data = output->mutable_data<float>();
const index_t input_size = input->size();
thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) {
for (index_t i = start; i < end; i += step) {
output_data[i] = std::tanh(input_data[i]);
}
},
0, input_size, 1);
}
template<>
void Activation<float>::ActivateSigmoid(utils::ThreadPool *thread_pool,
const Tensor *input,
Tensor *output) {
const auto input_data = input->data<float>();
auto output_data = output->mutable_data<float>();
const index_t input_size = input->size();
thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) {
for (index_t i = start; i < end; i += step) {
output_data[i] = 1 / (1 + std::exp(-(input_data[i])));
}
},
0, input_size, 1);
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/bias_add.h"
#include <arm_neon.h>
namespace mace {
namespace ops {
namespace arm {
template <>
template <int Dim>
void BiasAdd<float>::AddBiasNCHW(utils::ThreadPool *thread_pool,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
auto input_data = input->data<float>();
auto bias_data = bias->data<float>();
auto output_data = output->mutable_data<float>();
const index_t batch = input->dim(0);
const index_t channels = input->dim(1);
const index_t image_size = input->dim(2) * input->dim(3);
const index_t block_count = image_size / 4;
const index_t remain = image_size % 4;
thread_pool->Compute2D(
[=](index_t start0, index_t end0, index_t step0, index_t start1,
index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) {
const index_t b_offset = b * channels;
for (index_t c = start1; c < end1; c += step1) {
const index_t offset = (b_offset + c) * image_size;
auto input_ptr = input_data + offset;
auto output_ptr = output_data + offset;
const float bias = bias_data[bias_index<Dim>(b_offset, c)];
float32x4_t vbias = vdupq_n_f32(bias);
for (index_t i = 0; i < block_count; ++i) {
float32x4_t v = vld1q_f32(input_ptr);
v = vaddq_f32(v, vbias);
vst1q_f32(output_ptr, v);
input_ptr += 4;
output_ptr += 4;
}
for (index_t i = 0; i < remain; ++i) {
(*output_ptr++) = (*input_ptr++) + bias;
}
}
}
},
0, batch, 1, 0, channels, 1);
}
template <>
template <int Dim>
void BiasAdd<float>::AddBiasNHWC(utils::ThreadPool *thread_pool,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
const float *input_ptr = input->data<float>();
const float *bias_ptr = bias->data<float>();
float *output_ptr = output->mutable_data<float>();
const std::vector<index_t> &shape = input->shape();
const index_t channels = *shape.rbegin();
const auto batch = shape[0];
if (Dim == 2) {
MACE_CHECK(batch == bias->shape()[0]);
}
const index_t fused_hw = std::accumulate(shape.begin() + 1, shape.end() - 1,
1, std::multiplies<index_t>());
thread_pool->Compute2D(
[=](index_t start0, index_t end0, index_t step0, index_t start1,
index_t end1, index_t step1) {
for (index_t i = start0; i < end0; i += step0) {
auto offset = i * fused_hw;
auto bias_offset = i * channels;
for (index_t j = start1; j < end1; j += step1) {
index_t pos = (offset + j) * channels;
for (index_t c = 0; c < channels; ++c, ++pos) {
output_ptr[pos] =
input_ptr[pos] + bias_ptr[bias_index<Dim>(bias_offset, c)];
}
}
}
},
0, batch, 1, 0, fused_hw, 1);
}
template void BiasAdd<float>::AddBiasNCHW<1>(utils::ThreadPool *thread_pool,
const Tensor *input,
const Tensor *bias,
Tensor *output);
template void BiasAdd<float>::AddBiasNCHW<2>(utils::ThreadPool *thread_pool,
const Tensor *input,
const Tensor *bias,
Tensor *output);
template void BiasAdd<float>::AddBiasNHWC<1>(utils::ThreadPool *thread_pool,
const Tensor *input,
const Tensor *bias,
Tensor *output);
template void BiasAdd<float>::AddBiasNHWC<2>(utils::ThreadPool *thread_pool,
const Tensor *input,
const Tensor *bias,
Tensor *output);
} // namespace arm
} // namespace ops
} // namespace mace
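The bias_index<Dim> helper called above is defined elsewhere and is not part of this excerpt; a minimal sketch consistent with the call sites, where Dim == 1 selects a per-channel bias and Dim == 2 a per-batch, per-channel bias (index_t is MACE's index type), might look like:

// Illustrative only; the real helper belongs to the BiasAdd delegator headers.
template <int Dim>
inline index_t bias_index(index_t batch_offset, index_t channel) {
  return Dim == 1 ? channel : batch_offset + channel;
}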
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP32_COMMON_NEON_H_
#define MACE_OPS_ARM_FP32_COMMON_NEON_H_
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
namespace mace {
namespace ops {
namespace arm {
inline float32x4_t neon_vfma_lane_0(float32x4_t a,
float32x4_t b,
float32x4_t c) {
#ifdef __aarch64__
return vfmaq_laneq_f32(a, b, c, 0);
#else
return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
#endif
}
inline float32x4_t neon_vfma_lane_1(float32x4_t a,
float32x4_t b,
float32x4_t c) {
#ifdef __aarch64__
return vfmaq_laneq_f32(a, b, c, 1);
#else
return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
#endif
}
inline float32x4_t neon_vfma_lane_2(float32x4_t a,
float32x4_t b,
float32x4_t c) {
#ifdef __aarch64__
return vfmaq_laneq_f32(a, b, c, 2);
#else
return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
#endif
}
inline float32x4_t neon_vfma_lane_3(float32x4_t a,
float32x4_t b,
float32x4_t c) {
#ifdef __aarch64__
return vfmaq_laneq_f32(a, b, c, 3);
#else
return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
#endif
}
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_ENABLE_NEON
#endif // MACE_OPS_ARM_FP32_COMMON_NEON_H_
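These wrappers paper over an ISA difference: vfmaq_laneq_f32 (fused multiply-add by a lane of a 128-bit vector) exists only on aarch64, while armeabi-v7a falls back to the non-fused vmlaq_lane_f32 on a 64-bit half. A sketch of typical usage for one 3-tap filter row (names are illustrative; the caller must guarantee four readable floats at every pointer):

#include <arm_neon.h>

// Accumulate one 3-tap filter row into four adjacent output pixels.
inline void AccumulateRow3(const float *in, const float *filter_row,
                           float *out) {
  float32x4_t acc  = vld1q_f32(out);
  float32x4_t in0  = vld1q_f32(in);
  float32x4_t in1  = vld1q_f32(in + 1);
  float32x4_t in2  = vld1q_f32(in + 2);
  float32x4_t filt = vld1q_f32(filter_row);  // taps 0..2 plus one padding lane
  acc = mace::ops::arm::neon_vfma_lane_0(acc, in0, filt);
  acc = mace::ops::arm::neon_vfma_lane_1(acc, in1, filt);
  acc = mace::ops::arm::neon_vfma_lane_2(acc, in2, filt);
  vst1q_f32(out, acc);
}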
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include "mace/ops/arm/base/conv_2d_general.h"
#include "mace/ops/delegator/conv_2d.h"
namespace mace {
namespace ops {
namespace arm {
template<>
MaceStatus Conv2dGeneral<float>::DoCompute(
const ConvComputeParam &p, const float *filter_data,
const float *input_data, float *output_data,
const std::vector<index_t> &filter_shape) {
const index_t filter_height = filter_shape[2];
const index_t filter_width = filter_shape[3];
const index_t filter_size = filter_height * filter_width;
p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) {
const int stride_h = strides_[0];
const int stride_w = strides_[1];
const int dilation_h = dilations_[0];
const int dilation_w = dilations_[1];
if (m + 3 < p.out_channels) {
float *out_ptr0_base =
output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = out_ptr0_base + p.out_image_size;
float *out_ptr2_base = out_ptr1_base + p.out_image_size;
float *out_ptr3_base = out_ptr2_base + p.out_image_size;
for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base =
input_data + b * p.in_batch_size + c * p.in_image_size;
const float *filter_ptr0 =
filter_data + m * p.in_channels * filter_size + c * filter_size;
const float *filter_ptr1 =
filter_ptr0 + p.in_channels * filter_size;
const float *filter_ptr2 =
filter_ptr1 + p.in_channels * filter_size;
const float *filter_ptr3 =
filter_ptr2 + p.in_channels * filter_size;
for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset
index_t ih = h * stride_h;
index_t iw = w * stride_w;
index_t in_offset = ih * p.in_width + iw;
// output (4 outch x 1 height x 4 width): vo_outch_height
float vo0[4], vo1[4], vo2[4], vo3[4];
// load output
index_t out_offset = h * p.out_width + w;
for (index_t ow = 0; ow < 4; ++ow) {
vo0[ow] = out_ptr0_base[out_offset + ow];
vo1[ow] = out_ptr1_base[out_offset + ow];
vo2[ow] = out_ptr2_base[out_offset + ow];
vo3[ow] = out_ptr3_base[out_offset + ow];
}
// calc by row
for (index_t kh = 0; kh < filter_height; ++kh) {
for (index_t kw = 0; kw < filter_width; ++kw) {
// outch 0
vo0[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr0[kw];
vo0[1] += in_ptr_base[in_offset + stride_w
+ kw * dilation_w] * filter_ptr0[kw];
vo0[2] += in_ptr_base[in_offset + 2 * stride_w
+ kw * dilation_w] * filter_ptr0[kw];
vo0[3] += in_ptr_base[in_offset + 3 * stride_w
+ kw * dilation_w] * filter_ptr0[kw];
// outch 1
vo1[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr1[kw];
vo1[1] += in_ptr_base[in_offset + stride_w
+ kw * dilation_w] * filter_ptr1[kw];
vo1[2] += in_ptr_base[in_offset + 2 * stride_w
+ kw * dilation_w] * filter_ptr1[kw];
vo1[3] += in_ptr_base[in_offset + 3 * stride_w
+ kw * dilation_w] * filter_ptr1[kw];
// outch 2
vo2[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr2[kw];
vo2[1] += in_ptr_base[in_offset + stride_w
+ kw * dilation_w] * filter_ptr2[kw];
vo2[2] += in_ptr_base[in_offset + 2 * stride_w
+ kw * dilation_w] * filter_ptr2[kw];
vo2[3] += in_ptr_base[in_offset + 3 * stride_w
+ kw * dilation_w] * filter_ptr2[kw];
// outch 3
vo3[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr3[kw];
vo3[1] += in_ptr_base[in_offset + stride_w
+ kw * dilation_w] * filter_ptr3[kw];
vo3[2] += in_ptr_base[in_offset + 2 * stride_w
+ kw * dilation_w] * filter_ptr3[kw];
vo3[3] += in_ptr_base[in_offset + 3 * stride_w
+ kw * dilation_w] * filter_ptr3[kw];
} // kw
in_offset += dilation_h * p.in_width;
filter_ptr0 += filter_width;
filter_ptr1 += filter_width;
filter_ptr2 += filter_width;
filter_ptr3 += filter_width;
} // kh
for (index_t ow = 0; ow < 4; ++ow) {
out_ptr0_base[out_offset + ow] = vo0[ow];
out_ptr1_base[out_offset + ow] = vo1[ow];
out_ptr2_base[out_offset + ow] = vo2[ow];
out_ptr3_base[out_offset + ow] = vo3[ow];
}
filter_ptr0 -= filter_size;
filter_ptr1 -= filter_size;
filter_ptr2 -= filter_size;
filter_ptr3 -= filter_size;
} // w
} // h
} // c
} else {
for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base =
output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base =
input_data + b * p.in_batch_size + c * p.in_image_size;
const float *filter_ptr0 =
filter_data + mm * p.in_channels * filter_size
+ c * filter_size;
for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset
index_t ih = h * stride_h;
index_t iw = w * stride_w;
index_t in_offset = ih * p.in_width + iw;
// output (1 outch x 1 height x 4 width): vo_outch_height
float vo0[4];
// load output
index_t out_offset = h * p.out_width + w;
for (index_t ow = 0; ow < 4; ++ow) {
vo0[ow] = out_ptr0_base[out_offset + ow];
}
// calc by row
for (index_t kh = 0; kh < filter_height; ++kh) {
for (index_t kw = 0; kw < filter_width; ++kw) {
// outch 0
vo0[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr0[kw];
vo0[1] += in_ptr_base[in_offset + stride_w
+ kw * dilation_w] * filter_ptr0[kw];
vo0[2] += in_ptr_base[in_offset + 2 * stride_w
+ kw * dilation_w] * filter_ptr0[kw];
vo0[3] += in_ptr_base[in_offset + 3 * stride_w
+ kw * dilation_w] * filter_ptr0[kw];
} // kw
in_offset += dilation_h * p.in_width;
filter_ptr0 += filter_width;
} // kh
for (index_t ow = 0; ow < 4; ++ow) {
out_ptr0_base[out_offset + ow] = vo0[ow];
}
filter_ptr0 -= filter_size;
} // w
} // h
} // c
} // mm
} // if
} // m
} // b
}, 0, p.batch, 1, 0, p.out_channels, 4);
return MaceStatus::MACE_SUCCESS;
}
} // namespace arm
} // namespace ops
} // namespace mace
......@@ -14,8 +14,8 @@
#include <arm_neon.h>
#include "mace/ops/arm/base/common_neon.h"
#include "mace/ops/arm/base/deconv_2d_2x2.h"
#include "mace/ops/arm/fp32/common_neon.h"
namespace mace {
namespace ops {
......
......@@ -14,8 +14,8 @@
#include <arm_neon.h>
#include "mace/ops/arm/base/common_neon.h"
#include "mace/ops/arm/base/deconv_2d_3x3.h"
#include "mace/ops/arm/fp32/common_neon.h"
namespace mace {
namespace ops {
......
......@@ -14,8 +14,8 @@
#include <arm_neon.h>
#include "mace/ops/arm/base/common_neon.h"
#include "mace/ops/arm/base/deconv_2d_4x4.h"
#include "mace/ops/arm/fp32/common_neon.h"
namespace mace {
namespace ops {
......
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <arm_neon.h>
#include "mace/ops/arm/base/depthwise_conv_2d_3x3.h"
namespace mace {
namespace ops {
namespace arm {
namespace {
void DepthwiseConv2dPixel(const float *in_base,
const float *filter,
const index_t out_h,
const index_t out_w,
const index_t in_h_start,
const index_t in_w_start,
const index_t out_width,
const index_t in_height,
const index_t in_width,
int filter_height,
int filter_width,
float *out_base) {
float sum = 0;
for (int i = 0; i < filter_height; ++i) {
for (int j = 0; j < filter_width; ++j) {
index_t in_h = in_h_start + i;
index_t in_w = in_w_start + j;
if (in_h >= 0 && in_h < in_height && in_w >= 0 && in_w < in_width) {
sum += in_base[in_h * in_width + in_w] * filter[i * filter_width + j];
}
}
}
out_base[out_h * out_width + out_w] = sum;
}
} // namespace
template<>
MaceStatus DepthwiseConv2dK3x3S1<float>::DoCompute(
const DepthwiseConvComputeParam &p, const float *filter_data,
const float *input_data, float *output_data) {
p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) {
const index_t c = m / p.multiplier;
const index_t multi_index = m % p.multiplier;
const float
*in_base = input_data + b * p.in_batch_size + c * p.in_image_size;
const float
*filter_ptr = filter_data + multi_index * p.in_channels * 9 + c * 9;
float *out_base =
output_data + b * p.out_batch_size + m * p.out_image_size;
index_t h, w;
// top
for (h = 0; h < p.valid_h_start; ++h) {
for (w = 0; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h - p.pad_top,
w - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
3,
3,
out_base);
}
}
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x4_t vf00, vf01, vf02;
vf00 = vld1q_f32(filter_ptr);
vf01 = vld1q_f32(filter_ptr + 3);
vf02 = vld1q_f32(filter_ptr + 5);
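// Note: vf02 is loaded from filter_ptr + 5 rather than + 6 so that the
// 4-float load stays inside the 9-element filter; taps 6..8 are then read
// from lanes 1..3 of vf02 in the multiply-accumulates below.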
for (h = p.valid_h_start; h + 1 < p.valid_h_stop; h += 2) {
// left
for (w = 0; w < p.valid_w_start; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h - p.pad_top,
w - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base,
filter_ptr,
h + 1,
w,
h + 1 - p.pad_top,
w - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
3,
3,
out_base);
}
for (w = p.valid_w_start; w + 3 < p.valid_w_stop; w += 4) {
// input (4 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02, vi0n;
float32x4_t vi10, vi11, vi12, vi1n;
float32x4_t vi20, vi21, vi22, vi2n;
float32x4_t vi30, vi31, vi32, vi3n;
// output (1 outch x 2 height x 4 width): vo_outch_height
float32x4_t vo00, vo01;
// load input
index_t in_h = h - p.pad_top;
index_t in_w = w - p.pad_left;
index_t in_offset = in_h * p.in_width + in_w;
vi00 = vld1q_f32(in_base + in_offset);
vi0n = vld1q_f32(in_base + in_offset + 4);
vi10 = vld1q_f32(in_base + in_offset + p.in_width);
vi1n = vld1q_f32(in_base + in_offset + p.in_width + 4);
vi20 = vld1q_f32(in_base + in_offset + 2 * p.in_width);
vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 4);
vi30 = vld1q_f32(in_base + in_offset + 3 * p.in_width);
vi3n = vld1q_f32(in_base + in_offset + 3 * p.in_width + 4);
vi01 = vextq_f32(vi00, vi0n, 1);
vi02 = vextq_f32(vi00, vi0n, 2);
vi11 = vextq_f32(vi10, vi1n, 1);
vi12 = vextq_f32(vi10, vi1n, 2);
vi21 = vextq_f32(vi20, vi2n, 1);
vi22 = vextq_f32(vi20, vi2n, 2);
vi31 = vextq_f32(vi30, vi3n, 1);
vi32 = vextq_f32(vi30, vi3n, 2);
// load output
index_t out_offset = h * p.out_width + w;
vo00 = vld1q_f32(out_base + out_offset);
vo01 = vld1q_f32(out_base + out_offset + p.out_width);
#if defined(__aarch64__)
// outch 0, height 0
vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0);
vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1);
vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2);
vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0);
vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1);
vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2);
vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 1);
vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 2);
vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 3);
// outch 0, height 1
vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0);
vo01 = vfmaq_laneq_f32(vo01, vi11, vf00, 1);
vo01 = vfmaq_laneq_f32(vo01, vi12, vf00, 2);
vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0);
vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1);
vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2);
vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 1);
vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 2);
vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3);
#else
// outch 0, height 0
vo00 = vmlaq_lane_f32(vo00, vi00, vget_low_f32(vf00), 0);
vo00 = vmlaq_lane_f32(vo00, vi01, vget_low_f32(vf00), 1);
vo00 = vmlaq_lane_f32(vo00, vi02, vget_high_f32(vf00), 0);
vo00 = vmlaq_lane_f32(vo00, vi10, vget_low_f32(vf01), 0);
vo00 = vmlaq_lane_f32(vo00, vi11, vget_low_f32(vf01), 1);
vo00 = vmlaq_lane_f32(vo00, vi12, vget_high_f32(vf01), 0);
vo00 = vmlaq_lane_f32(vo00, vi20, vget_low_f32(vf02), 1);
vo00 = vmlaq_lane_f32(vo00, vi21, vget_high_f32(vf02), 0);
vo00 = vmlaq_lane_f32(vo00, vi22, vget_high_f32(vf02), 1);
// outch 0, height 1
vo01 = vmlaq_lane_f32(vo01, vi10, vget_low_f32(vf00), 0);
vo01 = vmlaq_lane_f32(vo01, vi11, vget_low_f32(vf00), 1);
vo01 = vmlaq_lane_f32(vo01, vi12, vget_high_f32(vf00), 0);
vo01 = vmlaq_lane_f32(vo01, vi20, vget_low_f32(vf01), 0);
vo01 = vmlaq_lane_f32(vo01, vi21, vget_low_f32(vf01), 1);
vo01 = vmlaq_lane_f32(vo01, vi22, vget_high_f32(vf01), 0);
vo01 = vmlaq_lane_f32(vo01, vi30, vget_low_f32(vf02), 1);
vo01 = vmlaq_lane_f32(vo01, vi31, vget_high_f32(vf02), 0);
vo01 = vmlaq_lane_f32(vo01, vi32, vget_high_f32(vf02), 1);
#endif
vst1q_f32(out_base + out_offset, vo00);
vst1q_f32(out_base + out_offset + p.out_width, vo01);
} // w
// right
for (; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h - p.pad_top,
w - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base,
filter_ptr,
h + 1,
w,
h + 1 - p.pad_top,
w - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
3,
3,
out_base);
}
} // h
// bottom
for (; h < p.out_height; ++h) {
for (w = 0; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h - p.pad_top,
w - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
3,
3,
out_base);
}
}
} // m
} // b
}, 0, p.batch, 1, 0, p.out_channels, 1); // threadpool
return MaceStatus::MACE_SUCCESS;
}
template<>
MaceStatus DepthwiseConv2dK3x3S2<float>::DoCompute(
const DepthwiseConvComputeParam &p, const float *filter_data,
const float *input_data, float *output_data) {
p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) {
index_t c = m / p.multiplier;
index_t multi_index = m % p.multiplier;
const float
*in_base = input_data + b * p.in_batch_size + c * p.in_image_size;
const float
*filter_ptr = filter_data + multi_index * p.in_channels * 9 + c * 9;
float *out_base =
output_data + b * p.out_batch_size + m * p.out_image_size;
index_t h, w;
// top
for (h = 0; h < p.valid_h_start; ++h) {
for (w = 0; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h * 2 - p.pad_top,
w * 2 - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
3,
3,
out_base);
}
}
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x4_t vf00, vf01, vf02;
vf00 = vld1q_f32(filter_ptr);
vf01 = vld1q_f32(filter_ptr + 3);
vf02 = vld1q_f32(filter_ptr + 5);
for (h = p.valid_h_start; h < p.valid_h_stop; ++h) {
// left
for (w = 0; w < p.valid_w_start; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h * 2 - p.pad_top,
w * 2 - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
3,
3,
out_base);
}
for (w = p.valid_w_start; w + 3 < p.valid_w_stop; w += 4) {
float32x4x2_t vi0, vi1, vi2;
float32x4_t vi0n, vi1n, vi2n;
// input (3 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02;
float32x4_t vi10, vi11, vi12;
float32x4_t vi20, vi21, vi22;
// output (1 outch x 1 height x 4 width): vo
float32x4_t vo;
// load input
index_t in_h = h * 2 - p.pad_top;
index_t in_w = w * 2 - p.pad_left;
index_t in_offset = in_h * p.in_width + in_w;
vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7]
vi1 = vld2q_f32(in_base + in_offset + p.in_width);
vi2 = vld2q_f32(in_base + in_offset + 2 * p.in_width);
vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11]
vi1n = vld1q_f32(in_base + in_offset + p.in_width + 8);
vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 8);
// load output
index_t out_offset = h * p.out_width + w;
vo = vld1q_f32(out_base + out_offset);
vi00 = vi0.val[0]; // [0.2.4.6]
vi01 = vi0.val[1]; // [1.3.5.7]
vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8]
vi10 = vi1.val[0];
vi11 = vi1.val[1];
vi12 = vextq_f32(vi10, vi1n, 1);
vi20 = vi2.val[0];
vi21 = vi2.val[1];
vi22 = vextq_f32(vi20, vi2n, 1);
#if defined(__aarch64__)
// outch 0, height 0
vo = vfmaq_laneq_f32(vo, vi00, vf00, 0);
vo = vfmaq_laneq_f32(vo, vi01, vf00, 1);
vo = vfmaq_laneq_f32(vo, vi02, vf00, 2);
vo = vfmaq_laneq_f32(vo, vi10, vf01, 0);
vo = vfmaq_laneq_f32(vo, vi11, vf01, 1);
vo = vfmaq_laneq_f32(vo, vi12, vf01, 2);
vo = vfmaq_laneq_f32(vo, vi20, vf02, 1);
vo = vfmaq_laneq_f32(vo, vi21, vf02, 2);
vo = vfmaq_laneq_f32(vo, vi22, vf02, 3);
#else
// outch 0, height 0
vo = vmlaq_lane_f32(vo, vi00, vget_low_f32(vf00), 0);
vo = vmlaq_lane_f32(vo, vi01, vget_low_f32(vf00), 1);
vo = vmlaq_lane_f32(vo, vi02, vget_high_f32(vf00), 0);
vo = vmlaq_lane_f32(vo, vi10, vget_low_f32(vf01), 0);
vo = vmlaq_lane_f32(vo, vi11, vget_low_f32(vf01), 1);
vo = vmlaq_lane_f32(vo, vi12, vget_high_f32(vf01), 0);
vo = vmlaq_lane_f32(vo, vi20, vget_low_f32(vf02), 1);
vo = vmlaq_lane_f32(vo, vi21, vget_high_f32(vf02), 0);
vo = vmlaq_lane_f32(vo, vi22, vget_high_f32(vf02), 1);
#endif
vst1q_f32(out_base + out_offset, vo);
} // w
// right
for (; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h * 2 - p.pad_top,
w * 2 - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
3,
3,
out_base);
}
} // h
// bottom
for (; h < p.out_height; ++h) {
for (w = 0; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h * 2 - p.pad_top,
w * 2 - p.pad_left,
p.out_width,
p.in_height,
p.in_width,
3,
3,
out_base);
}
}
} // m
} // b
}, 0, p.batch, 1, 0, p.out_channels, 1);
return MaceStatus::MACE_SUCCESS;
}
} // namespace arm
} // namespace ops
} // namespace mace
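The stride-2 kernel above leans on vld2q_f32, which de-interleaves eight consecutive input floats into even and odd columns, exactly the first two horizontal tap positions for four stride-2 output pixels. A standalone sketch of that load pattern (function and pointer names are illustrative; the caller must guarantee twelve readable floats at in):

#include <arm_neon.h>

// De-interleave one input row for a stride-2, 3-tap horizontal window.
inline void LoadStride2Row(const float *in, float32x4_t *tap0,
                           float32x4_t *tap1, float32x4_t *tap2) {
  float32x4x2_t v  = vld2q_f32(in);      // val[0] = {in[0], in[2], in[4], in[6]}
                                         // val[1] = {in[1], in[3], in[5], in[7]}
  float32x4_t next = vld1q_f32(in + 8);  // {in[8], in[9], in[10], in[11]}
  *tap0 = v.val[0];
  *tap1 = v.val[1];
  *tap2 = vextq_f32(v.val[0], next, 1);  // {in[2], in[4], in[6], in[8]}
}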
......@@ -14,8 +14,8 @@
#include <arm_neon.h>
#include "mace/ops/arm/base/common_neon.h"
#include "mace/ops/arm/base/depthwise_deconv_2d_3x3.h"
#include "mace/ops/arm/fp32/common_neon.h"
namespace mace {
namespace ops {
......
......@@ -14,8 +14,8 @@
#include <arm_neon.h>
#include "mace/ops/arm/base/common_neon.h"
#include "mace/ops/arm/base/depthwise_deconv_2d_4x4.h"
#include "mace/ops/arm/fp32/common_neon.h"
namespace mace {
namespace ops {
......
This diff has been collapsed.
......@@ -37,9 +37,7 @@ extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
#ifdef MACE_ENABLE_NEON
namespace arm {
namespace fp32 {
extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry);
} // namespace fp32
extern void RegisterActivationDelegator(OpDelegatorRegistry *registry);
extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry);
......@@ -98,7 +96,7 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_NEON
arm::fp32::RegisterConv2dK3x3WinogradDelegator(registry);
arm::RegisterConv2dK3x3WinogradDelegator(registry);
arm::RegisterActivationDelegator(registry);
arm::RegisterBiasAddDelegator(registry);
......
......@@ -10,6 +10,7 @@ load(
"if_android_armv7",
"if_hexagon_enabled",
"if_neon_enabled",
"if_bfloat16_enabled",
"if_opencl_enabled",
"if_quantize_enabled",
)
......@@ -58,6 +59,8 @@ cc_test(
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]),
......
......@@ -67,15 +67,24 @@ void ReluBenchmark(int iters, int batch, int channels, int height, int width) {
} \
MACE_BENCHMARK(MACE_BM_RELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_BM_RELU_BF16_MACRO(N, C, H, W) \
MACE_BM_RELU_MACRO(N, C, H, W, BFloat16, CPU)
#else
#define MACE_BM_RELU_BF16_MACRO(N, C, H, W)
#endif // MACE_ENABLE_BFLOAT16
#ifdef MACE_ENABLE_OPENCL
#define MACE_BM_RELU(N, C, H, W) \
MACE_BM_RELU_MACRO(N, C, H, W, float, CPU); \
MACE_BM_RELU_MACRO(N, C, H, W, float, GPU); \
#define MACE_BM_RELU_GPU_MACRO(N, C, H, W) \
MACE_BM_RELU_MACRO(N, C, H, W, float, GPU); \
MACE_BM_RELU_MACRO(N, C, H, W, half, GPU)
#else
#define MACE_BM_RELU(N, C, H, W) \
MACE_BM_RELU_MACRO(N, C, H, W, float, CPU)
#endif
#define MACE_BM_RELU_GPU_MACRO(N, C, H, W)
#endif // MACE_ENABLE_OPENCL
#define MACE_BM_RELU(N, C, H, W) \
MACE_BM_RELU_MACRO(N, C, H, W, float, CPU); \
MACE_BM_RELU_BF16_MACRO(N, C, H, W); \
MACE_BM_RELU_GPU_MACRO(N, C, H, W)
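// With MACE_ENABLE_BFLOAT16 and MACE_ENABLE_OPENCL both defined, each
// MACE_BM_RELU(N, C, H, W) below registers float/CPU, BFloat16/CPU, float/GPU
// and half/GPU benchmarks; if either option is disabled, the corresponding
// helper macro expands to nothing and only the remaining variants are built.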
MACE_BM_RELU(1, 1, 512, 512);
MACE_BM_RELU(1, 3, 128, 128);
......@@ -128,15 +137,24 @@ void ReluxBenchmark(int iters, int batch, int channels, int height, int width) {
} \
MACE_BENCHMARK(MACE_BM_RELUX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_BM_RELUX_BF16_MACRO(N, C, H, W) \
MACE_BM_RELUX_MACRO(N, C, H, W, BFloat16, CPU)
#else
#define MACE_BM_RELUX_BF16_MACRO(N, C, H, W)
#endif // MACE_ENABLE_BFLOAT16
#ifdef MACE_ENABLE_OPENCL
#define MACE_BM_RELUX(N, C, H, W) \
MACE_BM_RELUX_MACRO(N, C, H, W, float, CPU); \
MACE_BM_RELUX_MACRO(N, C, H, W, float, GPU); \
#define MACE_BM_RELUX_GPU_MACRO(N, C, H, W) \
MACE_BM_RELUX_MACRO(N, C, H, W, float, GPU); \
MACE_BM_RELUX_MACRO(N, C, H, W, half, GPU)
#else
#define MACE_BM_RELUX(N, C, H, W) \
MACE_BM_RELUX_MACRO(N, C, H, W, float, CPU)
#endif
#define MACE_BM_RELUX_GPU_MACRO(N, C, H, W)
#endif // MACE_ENABLE_OPENCL
#define MACE_BM_RELUX(N, C, H, W) \
MACE_BM_RELUX_MACRO(N, C, H, W, float, CPU); \
MACE_BM_RELUX_BF16_MACRO(N, C, H, W); \
MACE_BM_RELUX_GPU_MACRO(N, C, H, W)
MACE_BM_RELUX(1, 1, 512, 512);
MACE_BM_RELUX(1, 3, 128, 128);
......@@ -192,15 +210,24 @@ void PreluBenchmark(int iters, int batch, int channels, int height, int width) {
} \
MACE_BENCHMARK(MACE_BM_PRELU_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_BM_PRELU_BF16_MACRO(N, C, H, W) \
MACE_BM_PRELU_MACRO(N, C, H, W, BFloat16, CPU)
#else
#define MACE_BM_PRELU_BF16_MACRO(N, C, H, W)
#endif // MACE_ENABLE_BFLOAT16
#ifdef MACE_ENABLE_OPENCL
#define MACE_BM_PRELU(N, C, H, W) \
MACE_BM_PRELU_MACRO(N, C, H, W, float, CPU); \
MACE_BM_PRELU_MACRO(N, C, H, W, float, GPU); \
#define MACE_BM_PRELU_GPU_MACRO(N, C, H, W) \
MACE_BM_PRELU_MACRO(N, C, H, W, float, GPU); \
MACE_BM_PRELU_MACRO(N, C, H, W, half, GPU)
#else
#define MACE_BM_PRELU(N, C, H, W) \
MACE_BM_PRELU_MACRO(N, C, H, W, float, CPU)
#endif
#define MACE_BM_PRELU_GPU_MACRO(N, C, H, W)
#endif // MACE_ENABLE_OPENCL
#define MACE_BM_PRELU(N, C, H, W) \
MACE_BM_PRELU_MACRO(N, C, H, W, float, CPU); \
MACE_BM_PRELU_BF16_MACRO(N, C, H, W); \
MACE_BM_PRELU_GPU_MACRO(N, C, H, W)
MACE_BM_PRELU(1, 1, 512, 512);
MACE_BM_PRELU(1, 3, 128, 128);
......@@ -316,15 +343,24 @@ void TanhBenchmark(int iters, int batch, int channels, int height, int width) {
} \
MACE_BENCHMARK(MACE_BM_TANH_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_BM_TANH_BF16_MACRO(N, C, H, W) \
MACE_BM_TANH_MACRO(N, C, H, W, BFloat16, CPU)
#else
#define MACE_BM_TANH_BF16_MACRO(N, C, H, W)
#endif // MACE_ENABLE_BFLOAT16
#ifdef MACE_ENABLE_OPENCL
#define MACE_BM_TANH(N, C, H, W) \
MACE_BM_TANH_MACRO(N, C, H, W, float, CPU); \
MACE_BM_TANH_MACRO(N, C, H, W, float, GPU); \
#define MACE_BM_TANH_GPU_MACRO(N, C, H, W) \
MACE_BM_TANH_MACRO(N, C, H, W, float, GPU); \
MACE_BM_TANH_MACRO(N, C, H, W, half, GPU)
#else
#define MACE_BM_TANH(N, C, H, W) \
MACE_BM_TANH_MACRO(N, C, H, W, float, CPU)
#endif
#define MACE_BM_TANH_GPU_MACRO(N, C, H, W)
#endif // MACE_ENABLE_OPENCL
#define MACE_BM_TANH(N, C, H, W) \
MACE_BM_TANH_MACRO(N, C, H, W, float, CPU); \
MACE_BM_TANH_BF16_MACRO(N, C, H, W); \
MACE_BM_TANH_GPU_MACRO(N, C, H, W)
MACE_BM_TANH(1, 1, 512, 512);
MACE_BM_TANH(1, 3, 128, 128);
......@@ -377,15 +413,24 @@ void SigmoidBenchmark(
} \
MACE_BENCHMARK(MACE_BM_SIGMOID_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_BM_SIGMOID_BF16_MACRO(N, C, H, W) \
MACE_BM_SIGMOID_MACRO(N, C, H, W, BFloat16, CPU)
#else
#define MACE_BM_SIGMOID_BF16_MACRO(N, C, H, W)
#endif // MACE_ENABLE_BFLOAT16
#ifdef MACE_ENABLE_OPENCL
#define MACE_BM_SIGMOID(N, C, H, W) \
MACE_BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \
#define MACE_BM_SIGMOID_GPU_MACRO(N, C, H, W) \
MACE_BM_SIGMOID_MACRO(N, C, H, W, float, GPU); \
MACE_BM_SIGMOID_MACRO(N, C, H, W, half, GPU)
#else
#define MACE_BM_SIGMOID_GPU_MACRO(N, C, H, W)
#endif // MACE_ENABLE_OPENCL
#define MACE_BM_SIGMOID(N, C, H, W) \
MACE_BM_SIGMOID_MACRO(N, C, H, W, float, CPU)
#endif
MACE_BM_SIGMOID_MACRO(N, C, H, W, float, CPU); \
MACE_BM_SIGMOID_BF16_MACRO(N, C, H, W); \
MACE_BM_SIGMOID_GPU_MACRO(N, C, H, W)
MACE_BM_SIGMOID(1, 1, 512, 512);
MACE_BM_SIGMOID(1, 3, 128, 128);
......
......@@ -68,24 +68,31 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
} \
MACE_BENCHMARK(MACE_BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#if defined(MACE_ENABLE_OPENCL) && defined(MACE_ENABLE_QUANTIZE)
#define MACE_BM_BIAS_ADD(N, C, H, W) \
MACE_BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \
MACE_BM_BIAS_ADD_MACRO(N, C, H, W, uint8_t, CPU); \
#ifdef MACE_ENABLE_QUANTIZE
#define MACE_BM_BIAS_ADD_Q8_MACRO(N, C, H, W) \
MACE_BM_BIAS_ADD_MACRO(N, C, H, W, uint8_t, CPU)
#else
#define MACE_BM_BIAS_ADD_Q8_MACRO(N, C, H, W)
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_BM_BIAS_ADD_BF16_MACRO(N, C, H, W) \
MACE_BM_BIAS_ADD_MACRO(N, C, H, W, BFloat16, CPU)
#else
#define MACE_BM_BIAS_ADD_BF16_MACRO(N, C, H, W)
#endif // MACE_ENABLE_BFLOAT16
#ifdef MACE_ENABLE_OPENCL
#define MACE_BM_BIAS_ADD_GPU_MACRO(N, C, H, W) \
MACE_BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \
MACE_BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU);
#elif defined(MACE_ENABLE_OPENCL)
#define MACE_BM_BIAS_ADD(N, C, H, W) \
MACE_BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \
MACE_BM_BIAS_ADD_MACRO(N, C, H, W, float, GPU); \
MACE_BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU);
#elif defined(MACE_ENABLE_QUANTIZE)
MACE_BM_BIAS_ADD_MACRO(N, C, H, W, half, GPU)
#else
#define MACE_BM_BIAS_ADD_GPU_MACRO(N, C, H, W)
#endif // MACE_ENABLE_OPENCL
#define MACE_BM_BIAS_ADD(N, C, H, W) \
MACE_BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU); \
MACE_BM_BIAS_ADD_MACRO(N, C, H, W, uint8_t, CPU);
#define MACE_BM_BIAS_ADD(N, C, H, W) \
MACE_BM_BIAS_ADD_MACRO(N, C, H, W, float, CPU);
#endif
MACE_BM_BIAS_ADD_Q8_MACRO(N, C, H, W); \
MACE_BM_BIAS_ADD_BF16_MACRO(N, C, H, W); \
MACE_BM_BIAS_ADD_GPU_MACRO(N, C, H, W)
MACE_BM_BIAS_ADD(1, 1, 512, 512);
MACE_BM_BIAS_ADD(1, 3, 128, 128);
......
......@@ -42,7 +42,7 @@ void Conv2d(int iters,
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::GPU) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else {
......@@ -169,26 +169,31 @@ void Conv2d<CPU, uint8_t>(int iters,
MACE_BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##\
DILATION##_##P##_##OC##_##TYPE##_##DEVICE)
#if defined(MACE_ENABLE_OPENCL) && defined(MACE_ENABLE_QUANTIZE)
#define MACE_BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \
MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \
MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU); \
#ifdef MACE_ENABLE_QUANTIZE
#define MACE_BM_CONV_2D_Q8_MACRO(N, C, H, W, KH, KW, S, D, P, OC) \
MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, uint8_t, CPU)
#elif defined(MACE_ENABLE_OPENCL)
#define MACE_BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \
#else
#define MACE_BM_CONV_2D_Q8_MACRO(N, C, H, W, KH, KW, S, D, P, OC)
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_BM_CONV_2D_BF16_MACRO(N, C, H, W, KH, KW, S, D, P, OC) \
MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, BFloat16, CPU)
#else
#define MACE_BM_CONV_2D_BF16_MACRO(N, C, H, W, KH, KW, S, D, P, OC)
#endif // MACE_ENABLE_BFLOAT16
#ifdef MACE_ENABLE_OPENCL
#define MACE_BM_CONV_2D_GPU_MACRO(N, C, H, W, KH, KW, S, D, P, OC) \
MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, GPU); \
MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, GPU)
#elif defined(MACE_ENABLE_QUANTIZE)
#define MACE_BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \
MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, uint8_t, CPU)
#else
#define MACE_BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU)
#endif
#define MACE_BM_CONV_2D_GPU_MACRO(N, C, H, W, KH, KW, S, D, P, OC)
#endif // MACE_ENABLE_OPENCL
#define MACE_BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
MACE_BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \
MACE_BM_CONV_2D_Q8_MACRO(N, C, H, W, KH, KW, S, D, P, OC); \
MACE_BM_CONV_2D_BF16_MACRO(N, C, H, W, KH, KW, S, D, P, OC); \
MACE_BM_CONV_2D_GPU_MACRO(N, C, H, W, KH, KW, S, D, P, OC)
// Filter sizes and data alignments
MACE_BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, 1, VALID, 128);
......
......@@ -128,25 +128,31 @@ void DepthwiseConv2d(int iters,
MACE_BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE\
##_##P##_##M##_##TYPE##_##DEVICE)
#if defined(MACE_ENABLE_OPENCL) && defined(MACE_ENABLE_QUANTIZE)
#define MACE_BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \
MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU); \
#ifdef MACE_ENABLE_QUANTIZE
#define MACE_BM_DEPTHWISE_CONV_2D_Q8_MACRO(N, C, H, W, KH, KW, S, P, M) \
MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, uint8_t, CPU)
#elif defined(MACE_ENABLE_OPENCL)
#define MACE_BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
#else
#define MACE_BM_DEPTHWISE_CONV_2D_Q8_MACRO(N, C, H, W, KH, KW, S, P, M)
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_BFLOAT16
#define MACE_BM_DEPTHWISE_CONV_2D_BF16_MACRO(N, C, H, W, KH, KW, S, P, M) \
MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, BFloat16, CPU)
#else
#define MACE_BM_DEPTHWISE_CONV_2D_BF16_MACRO(N, C, H, W, KH, KW, S, P, M)
#endif // MACE_ENABLE_BFLOAT16
#ifdef MACE_ENABLE_OPENCL
#define MACE_BM_DEPTHWISE_CONV_2D_GPU_MACRO(N, C, H, W, KH, KW, S, P, M) \
MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, GPU); \
MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, GPU)
#elif defined(MACE_ENABLE_QUANTIZE)
#define MACE_BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, uint8_t, CPU)
#else
#define MACE_BM_DEPTHWISE_CONV_2D_GPU_MACRO(N, C, H, W, KH, KW, S, P, M)
#endif // MACE_ENABLE_OPENCL
#define MACE_BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU)
#endif
MACE_BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
MACE_BM_DEPTHWISE_CONV_2D_Q8_MACRO(N, C, H, W, KH, KW, S, P, M); \
MACE_BM_DEPTHWISE_CONV_2D_BF16_MACRO(N, C, H, W, KH, KW, S, P, M); \
MACE_BM_DEPTHWISE_CONV_2D_GPU_MACRO(N, C, H, W, KH, KW, S, P, M)
MACE_BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
MACE_BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1);
......
......@@ -11,6 +11,7 @@ load(
"if_hexagon_enabled",
"if_hta_enabled",
"if_neon_enabled",
"if_bfloat16_enabled",
"if_opencl_enabled",
"if_quantize_enabled",
)
......@@ -37,13 +38,19 @@ cc_test(
"mace/ops/arm/q8/*.cc",
"mace/ops/fixpoint_test.cc",
]
)) + if_bfloat16_enabled(glob(
[
"mace/ops/arm/bf16/*.cc",
]
)) + if_opencl_enabled(glob(
[
"mace/ops/opencl/*.cc",
]
)) + if_hta_enabled([
"mace/core/runtime/hexagon/hta_transform_test.cc",
]),
)) + if_hta_enabled(
[
"mace/core/runtime/hexagon/hta_transform_test.cc",
]
),
copts = [
"-Werror",
"-Wextra",
......@@ -57,6 +64,8 @@ cc_test(
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
]) + if_bfloat16_enabled([
"-DMACE_ENABLE_BFLOAT16",
]) + if_hexagon_enabled([
"-DMACE_ENABLE_HEXAGON",
]) + if_hta_enabled([
......
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.