提交 c2523ee2 编写于 作者: 刘托

Merge branch 'optimize_deconv_cpu' into 'master'

optimize deconv cpu

See merge request !797
#Wed May 02 11:53:35 CST 2018
#Wed Sep 12 13:35:24 CST 2018
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
zipStoreBase=GRADLE_USER_HOME
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_ARM_DECONV_2D_NEON_H_
#define MACE_KERNELS_ARM_DECONV_2D_NEON_H_
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/core/types.h"
namespace mace {
namespace kernels {
void Deconv2dNeonK3x3S1(const float *input,
const float *filter,
const float *bias,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Deconv2dNeonK3x3S2(const float *input,
const float *filter,
const float *bias,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Deconv2dNeonK4x4S1(const float *input,
const float *filter,
const float *bias,
const index_t *in_shape,
const index_t *out_shape,
float *output);
void Deconv2dNeonK4x4S2(const float *input,
const float *filter,
const float *bias,
const index_t *in_shape,
const index_t *out_shape,
float *output);
#ifdef MACE_ENABLE_NEON
inline float32x4_t neon_vfma_lane_0(float32x4_t a,
float32x4_t b,
float32x4_t c) {
#ifdef __aarch64__
return vfmaq_laneq_f32(a, b, c, 0);
#else
return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
#endif
}
inline float32x4_t neon_vfma_lane_1(float32x4_t a,
float32x4_t b,
float32x4_t c) {
#ifdef __aarch64__
return vfmaq_laneq_f32(a, b, c, 1);
#else
return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
#endif
}
inline float32x4_t neon_vfma_lane_2(float32x4_t a,
float32x4_t b,
float32x4_t c) {
#ifdef __aarch64__
return vfmaq_laneq_f32(a, b, c, 2);
#else
return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
#endif
}
inline float32x4_t neon_vfma_lane_3(float32x4_t a,
float32x4_t b,
float32x4_t c) {
#ifdef __aarch64__
return vfmaq_laneq_f32(a, b, c, 3);
#else
return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
#endif
}
#endif
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_ARM_DECONV_2D_NEON_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/macros.h"
#include "mace/kernels/arm/deconv_2d_neon.h"
namespace mace {
namespace kernels {
void Deconv2dNeonK3x3S1(const float *input,
const float *filter,
const float *bias,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < outch; oc += 2) {
if (oc + 1 < outch) {
float *out_base0 = output + (b * outch + oc) * out_img_size;
float *out_base1 = out_base0 + out_img_size;
const float bias_value0 = bias ? bias[oc] : 0.f;
const float bias_value1 = bias ? bias[oc + 1] : 0.f;
std::fill_n(out_base0, out_img_size, bias_value0);
std::fill_n(out_base1, out_img_size, bias_value1);
for (index_t ic = 0; ic < inch; ++ic) {
const float *input_base = input + (b * inch + ic) * h * w;
const float *kernel_base0 = filter + (oc * inch + ic) * 9;
const float *kernel_base1 = kernel_base0 + inch * 9;
const float *in = input_base;
// output channel 0
const float *k0_0 = kernel_base0;
const float *k0_1 = kernel_base0 + 3;
const float *k0_2 = kernel_base0 + 5;
// output channel 1
const float *k1_0 = kernel_base1;
const float *k1_1 = kernel_base1 + 3;
const float *k1_2 = kernel_base1 + 5;
#if defined(MACE_ENABLE_NEON)
// load filter
float32x4_t k00_vec, k01_vec, k02_vec;
float32x4_t k10_vec, k11_vec, k12_vec;
k00_vec = vld1q_f32(k0_0);
k01_vec = vld1q_f32(k0_1);
k02_vec = vld1q_f32(k0_2);
k10_vec = vld1q_f32(k1_0);
k11_vec = vld1q_f32(k1_1);
k12_vec = vld1q_f32(k1_2);
#endif
for (index_t i = 0; i < h; ++i) {
float *out_row_base0 = out_base0 + i * outw;
float *out_row0_0 = out_row_base0;
float *out_row0_1 = out_row_base0 + outw;
float *out_row0_2 = out_row_base0 + 2 * outw;
float *out_row_base1 = out_base1 + i * outw;
float *out_row1_0 = out_row_base1;
float *out_row1_1 = out_row_base1 + outw;
float *out_row1_2 = out_row_base1 + 2 * outw;
index_t j = 0;
#if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) {
float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02;
float32x4_t out10, out11, out12;
float32x4_t out20, out21, out22;
out00 = vld1q_f32(out_row0_0);
out00 = neon_vfma_lane_0(out00, in_vec, k00_vec);
vst1q_f32(out_row0_0, out00);
out01 = vld1q_f32(out_row0_0 + 1);
out01 = neon_vfma_lane_1(out01, in_vec, k00_vec);
vst1q_f32(out_row0_0 + 1, out01);
out02 = vld1q_f32(out_row0_0 + 2);
out02 = neon_vfma_lane_2(out02, in_vec, k00_vec);
vst1q_f32(out_row0_0 + 2, out02);
out10 = vld1q_f32(out_row0_1 + 0);
out10 = neon_vfma_lane_0(out10, in_vec, k01_vec);
vst1q_f32(out_row0_1 + 0, out10);
out11 = vld1q_f32(out_row0_1 + 1);
out11 = neon_vfma_lane_1(out11, in_vec, k01_vec);
vst1q_f32(out_row0_1 + 1, out11);
out12 = vld1q_f32(out_row0_1 + 2);
out12 = neon_vfma_lane_2(out12, in_vec, k01_vec);
vst1q_f32(out_row0_1 + 2, out12);
out20 = vld1q_f32(out_row0_2 + 0);
out20 = neon_vfma_lane_1(out20, in_vec, k02_vec);
vst1q_f32(out_row0_2 + 0, out20);
out21 = vld1q_f32(out_row0_2 + 1);
out21 = neon_vfma_lane_2(out21, in_vec, k02_vec);
vst1q_f32(out_row0_2 + 1, out21);
out22 = vld1q_f32(out_row0_2 + 2);
out22 = neon_vfma_lane_3(out22, in_vec, k02_vec);
vst1q_f32(out_row0_2 + 2, out22);
out00 = vld1q_f32(out_row1_0 + 0);
out00 = neon_vfma_lane_0(out00, in_vec, k10_vec);
vst1q_f32(out_row1_0 + 0, out00);
out01 = vld1q_f32(out_row1_0 + 1);
out01 = neon_vfma_lane_1(out01, in_vec, k10_vec);
vst1q_f32(out_row1_0 + 1, out01);
out02 = vld1q_f32(out_row1_0 + 2);
out02 = neon_vfma_lane_2(out02, in_vec, k10_vec);
vst1q_f32(out_row1_0 + 2, out02);
out10 = vld1q_f32(out_row1_1 + 0);
out10 = neon_vfma_lane_0(out10, in_vec, k11_vec);
vst1q_f32(out_row1_1 + 0, out10);
out11 = vld1q_f32(out_row1_1 + 1);
out11 = neon_vfma_lane_1(out11, in_vec, k11_vec);
vst1q_f32(out_row1_1 + 1, out11);
out12 = vld1q_f32(out_row1_1 + 2);
out12 = neon_vfma_lane_2(out12, in_vec, k11_vec);
vst1q_f32(out_row1_1 + 2, out12);
out20 = vld1q_f32(out_row1_2 + 0);
out20 = neon_vfma_lane_1(out20, in_vec, k12_vec);
vst1q_f32(out_row1_2 + 0, out20);
out21 = vld1q_f32(out_row1_2 + 1);
out21 = neon_vfma_lane_2(out21, in_vec, k12_vec);
vst1q_f32(out_row1_2 + 1, out21);
out22 = vld1q_f32(out_row1_2 + 2);
out22 = neon_vfma_lane_3(out22, in_vec, k12_vec);
vst1q_f32(out_row1_2 + 2, out22);
in += 4;
out_row0_0 += 4;
out_row0_1 += 4;
out_row0_2 += 4;
out_row1_0 += 4;
out_row1_1 += 4;
out_row1_2 += 4;
}
#endif
for (; j < w; ++j) {
float val = in[0];
for (int k = 0; k < 3; ++k) {
out_row0_0[k] += val * k0_0[k];
out_row0_1[k] += val * k0_1[k];
out_row0_2[k] += val * k0_2[k + 1];
out_row1_0[k] += val * k1_0[k];
out_row1_1[k] += val * k1_1[k];
out_row1_2[k] += val * k1_2[k + 1];
}
in++;
out_row0_0++;
out_row0_1++;
out_row0_2++;
out_row1_0++;
out_row1_1++;
out_row1_2++;
}
}
}
} else {
float *out_base0 = output + (b * outch + oc) * outh * outw;
const float bias_value0 = bias ? bias[oc] : 0.f;
std::fill_n(out_base0, outh * outw, bias_value0);
for (index_t ic = 0; ic < inch; ++ic) {
const float *input_base = input + (b * inch + ic) * h * w;
const float *kernel_base0 = filter + (oc * inch + ic) * 9;
const float *in = input_base;
const float *k0_0 = kernel_base0;
const float *k0_1 = kernel_base0 + 3;
const float *k0_2 = kernel_base0 + 5;
#if defined(MACE_ENABLE_NEON)
// load filter
float32x4_t k00_vec = vld1q_f32(k0_0);
float32x4_t k01_vec = vld1q_f32(k0_1);
float32x4_t k02_vec = vld1q_f32(k0_2);
#endif
for (index_t i = 0; i < h; ++i) {
float *out_row_base0 = out_base0 + i * outw;
float *out_row0_0 = out_row_base0;
float *out_row0_1 = out_row_base0 + outw;
float *out_row0_2 = out_row_base0 + 2 * outw;
index_t j = 0;
#if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) {
float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02;
float32x4_t out10, out11, out12;
float32x4_t out20, out21, out22;
out00 = vld1q_f32(out_row0_0 + 0);
out00 = neon_vfma_lane_0(out00, in_vec, k00_vec);
vst1q_f32(out_row0_0 + 0, out00);
out01 = vld1q_f32(out_row0_0 + 1);
out01 = neon_vfma_lane_1(out01, in_vec, k00_vec);
vst1q_f32(out_row0_0 + 1, out01);
out02 = vld1q_f32(out_row0_0 + 2);
out02 = neon_vfma_lane_2(out02, in_vec, k00_vec);
vst1q_f32(out_row0_0 + 2, out02);
out10 = vld1q_f32(out_row0_1 + 0);
out10 = neon_vfma_lane_0(out10, in_vec, k01_vec);
vst1q_f32(out_row0_1 + 0, out10);
out11 = vld1q_f32(out_row0_1 + 1);
out11 = neon_vfma_lane_1(out11, in_vec, k01_vec);
vst1q_f32(out_row0_1 + 1, out11);
out12 = vld1q_f32(out_row0_1 + 2);
out12 = neon_vfma_lane_2(out12, in_vec, k01_vec);
vst1q_f32(out_row0_1 + 2, out12);
out20 = vld1q_f32(out_row0_2 + 0);
out20 = neon_vfma_lane_1(out20, in_vec, k02_vec);
vst1q_f32(out_row0_2 + 0, out20);
out21 = vld1q_f32(out_row0_2 + 1);
out21 = neon_vfma_lane_2(out21, in_vec, k02_vec);
vst1q_f32(out_row0_2 + 1, out21);
out22 = vld1q_f32(out_row0_2 + 2);
out22 = neon_vfma_lane_3(out22, in_vec, k02_vec);
vst1q_f32(out_row0_2 + 2, out22);
in += 4;
out_row0_0 += 4;
out_row0_1 += 4;
out_row0_2 += 4;
}
#endif
for (; j < w; ++j) {
float val = in[0];
for (int k = 0; k < 3; ++k) {
out_row0_0[k] += val * k0_0[k];
out_row0_1[k] += val * k0_1[k];
out_row0_2[k] += val * k0_2[k + 1];
}
in++;
out_row0_0++;
out_row0_1++;
out_row0_2++;
}
}
}
}
}
}
}
void Deconv2dNeonK3x3S2(const float *input,
const float *filter,
const float *bias,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < outch; ++oc) {
float *out_base = output + (b * outch + oc) * out_img_size;
const float bias_value = bias ? bias[oc] : 0.f;
std::fill_n(out_base, out_img_size, bias_value);
for (index_t ic = 0; ic < inch; ++ic) {
const float *input_base = input + (b * inch + ic) * h * w;
const float *kernel_base = filter + (oc * inch + ic) * 9;
const float *in = input_base;
const float *k0 = kernel_base;
const float *k1 = kernel_base + 3;
const float *k2 = kernel_base + 5;
#if defined(MACE_ENABLE_NEON)
float32x4_t k0_vec = vld1q_f32(k0);
float32x4_t k1_vec = vld1q_f32(k1);
float32x4_t k2_vec = vld1q_f32(k2);
#endif
for (index_t i = 0; i < h; ++i) {
float *out_row_base = out_base + i * 2 * outw;
float *out_row_0 = out_row_base;
float *out_row_1 = out_row_0 + outw;
float *out_row_2 = out_row_1 + outw;
index_t j = 0;
#if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) {
float32x4_t in_vec = vld1q_f32(in);
// out row 0
float32x4x2_t out00 = vld2q_f32(out_row_0);
out00.val[0] =
neon_vfma_lane_0(out00.val[0], in_vec, k0_vec);
out00.val[1] =
neon_vfma_lane_1(out00.val[1], in_vec, k0_vec);
vst2q_f32(out_row_0, out00);
float32x4x2_t out01 = vld2q_f32(out_row_0 + 2);
out01.val[0] =
neon_vfma_lane_2(out01.val[0], in_vec, k0_vec);
vst2q_f32(out_row_0 + 2, out01);
// out row 1
float32x4x2_t out10 = vld2q_f32(out_row_1);
out10.val[0] =
neon_vfma_lane_0(out10.val[0], in_vec, k1_vec);
out10.val[1] =
neon_vfma_lane_1(out10.val[1], in_vec, k1_vec);
vst2q_f32(out_row_1, out10);
float32x4x2_t out11 = vld2q_f32(out_row_1 + 2);
out11.val[0] =
neon_vfma_lane_2(out11.val[0], in_vec, k1_vec);
vst2q_f32(out_row_1 + 2, out11);
// out row 2
float32x4x2_t out20 = vld2q_f32(out_row_2);
out20.val[0] =
neon_vfma_lane_1(out20.val[0], in_vec, k2_vec);
out20.val[1] =
neon_vfma_lane_2(out20.val[1], in_vec, k2_vec);
vst2q_f32(out_row_2, out20);
float32x4x2_t out21 = vld2q_f32(out_row_2 + 2);
out21.val[0] =
neon_vfma_lane_3(out21.val[0], in_vec, k2_vec);
vst2q_f32(out_row_2 + 2, out21);
in += 4;
out_row_0 += 8;
out_row_1 += 8;
out_row_2 += 8;
}
#endif
for (; j < w; ++j) {
float val = in[0];
for (int k = 0; k < 3; ++k) {
out_row_0[k] += val * k0[k];
out_row_1[k] += val * k1[k];
out_row_2[k] += val * k2[k + 1];
}
in++;
out_row_0 += 2;
out_row_1 += 2;
out_row_2 += 2;
}
}
}
}
}
}
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/macros.h"
#include "mace/kernels/arm/deconv_2d_neon.h"
namespace mace {
namespace kernels {
void Deconv2dNeonK4x4S1(const float *input,
const float *filter,
const float *bias,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t w = in_shape[3];
const index_t h = in_shape[2];
const index_t inch = in_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t outch = out_shape[1];
const index_t out_img_size = outh * outw;
#pragma omp parallel for
for (int b = 0; b < out_shape[0]; ++b) {
for (int oc = 0; oc < outch; oc += 2) {
if (oc + 1 < outch) {
float *out_base = output + (b * outch + oc) * out_img_size;
float *out_base1 = out_base + out_img_size;
const float bias_value = bias ? bias[oc] : 0.f;
std::fill_n(out_base, out_img_size, bias_value);
const float bias_value1 = bias ? bias[oc + 1] : 0.f;
std::fill_n(out_base1, out_img_size, bias_value1);
for (int q = 0; q < inch; q++) {
const float *input_base = input + (b * inch + q) * h * w;
const float *in = input_base;
const float *kernel_base = filter + (oc * inch + q) * 16;
const float *k0 = kernel_base;
const float *k1 = kernel_base + 4;
const float *k2 = kernel_base + 8;
const float *k3 = kernel_base + 12;
const float *kernel_base1 = kernel_base + inch * 16;
const float *k10 = kernel_base1;
const float *k11 = kernel_base1 + 4;
const float *k12 = kernel_base1 + 8;
const float *k13 = kernel_base1 + 12;
#if defined(MACE_ENABLE_NEON)
float32x4_t k0_vec = vld1q_f32(k0);
float32x4_t k1_vec = vld1q_f32(k1);
float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3);
float32x4_t k10_vec = vld1q_f32(k10);
float32x4_t k11_vec = vld1q_f32(k11);
float32x4_t k12_vec = vld1q_f32(k12);
float32x4_t k13_vec = vld1q_f32(k13);
#endif
for (int i = 0; i < h; i++) {
float *out_row = out_base + i * outw;
float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw;
float *out_row_2 = out_row_1 + outw;
float *out_row_3 = out_row_2 + outw;
float *out_row1 = out_base1 + i * outw;
float *out_row1_0 = out_row1;
float *out_row1_1 = out_row1_0 + outw;
float *out_row1_2 = out_row1_1 + outw;
float *out_row1_3 = out_row1_2 + outw;
int j = 0;
#if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) {
float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02, out03;
float32x4_t out10, out11, out12, out13;
out00 = vld1q_f32(out_row_0);
out00 = neon_vfma_lane_0(out00, in_vec, k0_vec);
vst1q_f32(out_row_0, out00);
out10 = vld1q_f32(out_row1_0);
out10 = neon_vfma_lane_0(out10, in_vec, k10_vec);
vst1q_f32(out_row1_0, out10);
out01 = vld1q_f32(out_row_0 + 1);
out01 = neon_vfma_lane_1(out01, in_vec, k0_vec);
vst1q_f32(out_row_0 + 1, out01);
out11 = vld1q_f32(out_row1_0 + 1);
out11 = neon_vfma_lane_1(out11, in_vec, k10_vec);
vst1q_f32(out_row1_0 + 1, out11);
out02 = vld1q_f32(out_row_0 + 2);
out02 = neon_vfma_lane_2(out02, in_vec, k0_vec);
vst1q_f32(out_row_0 + 2, out02);
out12 = vld1q_f32(out_row1_0 + 2);
out12 = neon_vfma_lane_2(out12, in_vec, k10_vec);
vst1q_f32(out_row1_0 + 2, out12);
out03 = vld1q_f32(out_row_0 + 3);
out03 = neon_vfma_lane_3(out03, in_vec, k0_vec);
vst1q_f32(out_row_0 + 3, out03);
out13 = vld1q_f32(out_row1_0 + 3);
out13 = neon_vfma_lane_3(out13, in_vec, k10_vec);
vst1q_f32(out_row1_0 + 3, out13);
//
out00 = vld1q_f32(out_row_1);
out00 = neon_vfma_lane_0(out00, in_vec, k1_vec);
vst1q_f32(out_row_1, out00);
out10 = vld1q_f32(out_row1_1);
out10 = neon_vfma_lane_0(out10, in_vec, k11_vec);
vst1q_f32(out_row1_1, out10);
out01 = vld1q_f32(out_row_1 + 1);
out01 = neon_vfma_lane_1(out01, in_vec, k1_vec);
vst1q_f32(out_row_1 + 1, out01);
out11 = vld1q_f32(out_row1_1 + 1);
out11 = neon_vfma_lane_1(out11, in_vec, k11_vec);
vst1q_f32(out_row1_1 + 1, out11);
out02 = vld1q_f32(out_row_1 + 2);
out02 = neon_vfma_lane_2(out02, in_vec, k1_vec);
vst1q_f32(out_row_1 + 2, out02);
out12 = vld1q_f32(out_row1_1 + 2);
out12 = neon_vfma_lane_2(out12, in_vec, k11_vec);
vst1q_f32(out_row1_1 + 2, out12);
out03 = vld1q_f32(out_row_1 + 3);
out03 = neon_vfma_lane_3(out03, in_vec, k1_vec);
vst1q_f32(out_row_1 + 3, out03);
out13 = vld1q_f32(out_row1_1 + 3);
out13 = neon_vfma_lane_3(out13, in_vec, k11_vec);
vst1q_f32(out_row1_1 + 3, out13);
//
out00 = vld1q_f32(out_row_2 + 0);
out00 = neon_vfma_lane_0(out00, in_vec, k2_vec);
vst1q_f32(out_row_2 + 0, out00);
out10 = vld1q_f32(out_row1_2 + 0);
out10 = neon_vfma_lane_0(out10, in_vec, k12_vec);
vst1q_f32(out_row1_2 + 0, out10);
out01 = vld1q_f32(out_row_2 + 1);
out01 = neon_vfma_lane_1(out01, in_vec, k2_vec);
vst1q_f32(out_row_2 + 1, out01);
out11 = vld1q_f32(out_row1_2 + 1);
out11 = neon_vfma_lane_1(out11, in_vec, k12_vec);
vst1q_f32(out_row1_2 + 1, out11);
out02 = vld1q_f32(out_row_2 + 2);
out02 = neon_vfma_lane_2(out02, in_vec, k2_vec);
vst1q_f32(out_row_2 + 2, out02);
out12 = vld1q_f32(out_row1_2 + 2);
out12 = neon_vfma_lane_2(out12, in_vec, k12_vec);
vst1q_f32(out_row1_2 + 2, out12);
out03 = vld1q_f32(out_row_2 + 3);
out03 = neon_vfma_lane_3(out03, in_vec, k2_vec);
vst1q_f32(out_row_2 + 3, out03);
out13 = vld1q_f32(out_row1_2 + 3);
out13 = neon_vfma_lane_3(out13, in_vec, k12_vec);
vst1q_f32(out_row1_2 + 3, out13);
//
out00 = vld1q_f32(out_row_3 + 0);
out00 = neon_vfma_lane_0(out00, in_vec, k3_vec);
vst1q_f32(out_row_3 + 0, out00);
out10 = vld1q_f32(out_row1_3 + 0);
out10 = neon_vfma_lane_0(out10, in_vec, k13_vec);
vst1q_f32(out_row1_3 + 0, out10);
out01 = vld1q_f32(out_row_3 + 1);
out01 = neon_vfma_lane_1(out01, in_vec, k3_vec);
vst1q_f32(out_row_3 + 1, out01);
out11 = vld1q_f32(out_row1_3 + 1);
out11 = neon_vfma_lane_1(out11, in_vec, k13_vec);
vst1q_f32(out_row1_3 + 1, out11);
out02 = vld1q_f32(out_row_3 + 2);
out02 = neon_vfma_lane_2(out02, in_vec, k3_vec);
vst1q_f32(out_row_3 + 2, out02);
out12 = vld1q_f32(out_row1_3 + 2);
out12 = neon_vfma_lane_2(out12, in_vec, k13_vec);
vst1q_f32(out_row1_3 + 2, out12);
out03 = vld1q_f32(out_row_3 + 3);
out03 = neon_vfma_lane_3(out03, in_vec, k3_vec);
vst1q_f32(out_row_3 + 3, out03);
out13 = vld1q_f32(out_row1_3 + 3);
out13 = neon_vfma_lane_3(out13, in_vec, k13_vec);
vst1q_f32(out_row1_3 + 3, out13);
in += 4;
out_row_0 += 4;
out_row_1 += 4;
out_row_2 += 4;
out_row_3 += 4;
out_row1_0 += 4;
out_row1_1 += 4;
out_row1_2 += 4;
out_row1_3 += 4;
}
#endif
for (; j < w; j++) {
float val = in[0];
for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k];
out_row_1[k] += val * k1[k];
out_row_2[k] += val * k2[k];
out_row_3[k] += val * k3[k];
out_row1_0[k] += val * k10[k];
out_row1_1[k] += val * k11[k];
out_row1_2[k] += val * k12[k];
out_row1_3[k] += val * k13[k];
}
in++;
out_row_0++;
out_row_1++;
out_row_2++;
out_row_3++;
out_row1_0++;
out_row1_1++;
out_row1_2++;
out_row1_3++;
}
}
}
} else {
float *out_base = output + (b * outch + oc) * out_img_size;
const float bias_value = bias ? bias[oc] : 0.f;
std::fill_n(out_base, out_img_size, bias_value);
for (int q = 0; q < inch; q++) {
const float *input_base = input + (b * inch + q) * h * w;
const float *kernel_base = filter + (oc * inch + q) * 16;
const float *in = input_base;
const float *k0 = kernel_base;
const float *k1 = kernel_base + 4;
const float *k2 = kernel_base + 8;
const float *k3 = kernel_base + 12;
#if defined(MACE_ENABLE_NEON)
float32x4_t k0_vec = vld1q_f32(k0);
float32x4_t k1_vec = vld1q_f32(k1);
float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3);
#endif
for (int i = 0; i < h; i++) {
float *out_row = out_base + i * outw;
float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw;
float *out_row_2 = out_row_1 + outw;
float *out_row_3 = out_row_2 + outw;
int j = 0;
#if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) {
float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00 = vld1q_f32(out_row_0);
out00 = neon_vfma_lane_0(out00, in_vec, k0_vec);
vst1q_f32(out_row_0, out00);
float32x4_t out01 = vld1q_f32(out_row_0 + 1);
out01 = neon_vfma_lane_1(out01, in_vec, k0_vec);
vst1q_f32(out_row_0 + 1, out01);
float32x4_t out02 = vld1q_f32(out_row_0 + 2);
out02 = neon_vfma_lane_2(out02, in_vec, k0_vec);
vst1q_f32(out_row_0 + 2, out02);
float32x4_t out03 = vld1q_f32(out_row_0 + 3);
out03 = neon_vfma_lane_3(out03, in_vec, k0_vec);
vst1q_f32(out_row_0 + 3, out03);
//
float32x4_t out10 = vld1q_f32(out_row_1);
out10 = neon_vfma_lane_0(out10, in_vec, k1_vec);
vst1q_f32(out_row_1, out10);
float32x4_t out11 = vld1q_f32(out_row_1 + 1);
out11 = neon_vfma_lane_1(out11, in_vec, k1_vec);
vst1q_f32(out_row_1 + 1, out11);
float32x4_t out12 = vld1q_f32(out_row_1 + 2);
out12 = neon_vfma_lane_2(out12, in_vec, k1_vec);
vst1q_f32(out_row_1 + 2, out12);
float32x4_t out13 = vld1q_f32(out_row_1 + 3);
out13 = neon_vfma_lane_3(out13, in_vec, k1_vec);
vst1q_f32(out_row_1 + 3, out13);
//
float32x4_t out20 = vld1q_f32(out_row_2 + 0);
out20 = neon_vfma_lane_0(out20, in_vec, k2_vec);
vst1q_f32(out_row_2 + 0, out20);
float32x4_t out21 = vld1q_f32(out_row_2 + 1);
out21 = neon_vfma_lane_1(out21, in_vec, k2_vec);
vst1q_f32(out_row_2 + 1, out21);
float32x4_t out22 = vld1q_f32(out_row_2 + 2);
out22 = neon_vfma_lane_2(out22, in_vec, k2_vec);
vst1q_f32(out_row_2 + 2, out22);
float32x4_t out23 = vld1q_f32(out_row_2 + 3);
out23 = neon_vfma_lane_3(out23, in_vec, k2_vec);
vst1q_f32(out_row_2 + 3, out23);
//
float32x4_t out30 = vld1q_f32(out_row_3 + 0);
out30 = neon_vfma_lane_0(out30, in_vec, k3_vec);
vst1q_f32(out_row_3 + 0, out30);
float32x4_t out31 = vld1q_f32(out_row_3 + 1);
out31 = neon_vfma_lane_1(out31, in_vec, k3_vec);
vst1q_f32(out_row_3 + 1, out31);
float32x4_t out32 = vld1q_f32(out_row_3 + 2);
out32 = neon_vfma_lane_2(out32, in_vec, k3_vec);
vst1q_f32(out_row_3 + 2, out32);
float32x4_t out33 = vld1q_f32(out_row_3 + 3);
out33 = neon_vfma_lane_3(out33, in_vec, k3_vec);
vst1q_f32(out_row_3 + 3, out33);
in += 4;
out_row_0 += 4;
out_row_1 += 4;
out_row_2 += 4;
out_row_3 += 4;
}
#endif
for (; j < w; j++) {
float val = in[0];
for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k];
out_row_1[k] += val * k1[k];
out_row_2[k] += val * k2[k];
out_row_3[k] += val * k3[k];
}
in++;
out_row_0++;
out_row_1++;
out_row_2++;
out_row_3++;
}
}
}
}
}
}
}
void Deconv2dNeonK4x4S2(const float *input,
const float *filter,
const float *bias,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t w = in_shape[3];
const index_t h = in_shape[2];
const index_t inch = in_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t outch = out_shape[1];
const index_t out_img_size = outh * outw;
#pragma omp parallel for
for (int b = 0; b < out_shape[0]; ++b) {
for (int p = 0; p < outch; p++) {
float *out_base = output + (b * outch + p) * out_img_size;
const float bias_value = bias ? bias[p] : 0.f;
std::fill_n(out_base, outh * outw, bias_value);
for (int q = 0; q < inch; q++) {
const float *input_base = input + (b * inch + q) * h * w;
const float *kernel_base = filter + (p * inch + q) * 16;
const float *in = input_base;
const float *k0 = kernel_base;
const float *k1 = kernel_base + 4;
const float *k2 = kernel_base + 8;
const float *k3 = kernel_base + 12;
#if defined(MACE_ENABLE_NEON)
float32x4_t k0_vec = vld1q_f32(k0);
float32x4_t k1_vec = vld1q_f32(k1);
float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3);
#endif
for (int i = 0; i < h; i++) {
float *out_row = out_base + 2 * i * outw;
float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw;
float *out_row_2 = out_row_1 + outw;
float *out_row_3 = out_row_2 + outw;
int j = 0;
#if defined(MACE_ENABLE_NEON)
for (; j + 3 < w; j += 4) {
float32x4_t in_vec = vld1q_f32(in);
// row 0
float32x4x2_t out0 = vld2q_f32(out_row_0);
out0.val[0] =
neon_vfma_lane_0(out0.val[0], in_vec, k0_vec);
out0.val[1] =
neon_vfma_lane_1(out0.val[1], in_vec, k0_vec);
vst2q_f32(out_row_0, out0);
out0 = vld2q_f32(out_row_0 + 2);
out0.val[0] =
neon_vfma_lane_2(out0.val[0], in_vec, k0_vec);
out0.val[1] =
neon_vfma_lane_3(out0.val[1], in_vec, k0_vec);
vst2q_f32(out_row_0 + 2, out0);
// row 1
float32x4x2_t out1 = vld2q_f32(out_row_1);
out1.val[0] =
neon_vfma_lane_0(out1.val[0], in_vec, k1_vec);
out1.val[1] =
neon_vfma_lane_1(out1.val[1], in_vec, k1_vec);
vst2q_f32(out_row_1, out1);
out1 = vld2q_f32(out_row_1 + 2);
out1.val[0] =
neon_vfma_lane_2(out1.val[0], in_vec, k1_vec);
out1.val[1] =
neon_vfma_lane_3(out1.val[1], in_vec, k1_vec);
vst2q_f32(out_row_1 + 2, out1);
// row 2
float32x4x2_t out2 = vld2q_f32(out_row_2);
out2.val[0] =
neon_vfma_lane_0(out2.val[0], in_vec, k2_vec);
out2.val[1] =
neon_vfma_lane_1(out2.val[1], in_vec, k2_vec);
vst2q_f32(out_row_2, out2);
out2 = vld2q_f32(out_row_2 + 2);
out2.val[0] =
neon_vfma_lane_2(out2.val[0], in_vec, k2_vec);
out2.val[1] =
neon_vfma_lane_3(out2.val[1], in_vec, k2_vec);
vst2q_f32(out_row_2 + 2, out2);
// row 3
float32x4x2_t out3 = vld2q_f32(out_row_3);
out3.val[0] =
neon_vfma_lane_0(out3.val[0], in_vec, k3_vec);
out3.val[1] =
neon_vfma_lane_1(out3.val[1], in_vec, k3_vec);
vst2q_f32(out_row_3, out3);
out3 = vld2q_f32(out_row_3 + 2);
out3.val[0] =
neon_vfma_lane_2(out3.val[0], in_vec, k3_vec);
out3.val[1] =
neon_vfma_lane_3(out3.val[1], in_vec, k3_vec);
vst2q_f32(out_row_3 + 2, out3);
in += 4;
out_row_0 += 8;
out_row_1 += 8;
out_row_2 += 8;
out_row_3 += 8;
}
#endif
for (; j < w; j++) {
float val = in[0];
for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k];
out_row_1[k] += val * k1[k];
out_row_2[k] += val * k2[k];
out_row_3[k] += val * k3[k];
}
in++;
out_row_0 += 2;
out_row_1 += 2;
out_row_2 += 2;
out_row_3 += 2;
}
}
}
}
}
}
} // namespace kernels
} // namespace mace
......@@ -15,76 +15,25 @@
#ifndef MACE_KERNELS_DECONV_2D_H_
#define MACE_KERNELS_DECONV_2D_H_
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include <algorithm>
#include <functional>
#include <memory>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/arm/deconv_2d_neon.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
namespace deconv {
template<typename T>
void Deconv2dNCHW(const T *input,
const T *filter,
const T *bias,
const index_t *in_shape,
const index_t *out_shape,
const index_t *kernel_hw,
const int *strides,
const int *padding,
float *output) {
#pragma omp parallel for collapse(4)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < out_shape[1]; ++oc) {
for (index_t oh = 0; oh < out_shape[2]; ++oh) {
for (index_t ow = 0; ow < out_shape[3]; ++ow) {
index_t filter_start_y, filter_start_x;
index_t start_x = std::max<int>(0, ow + strides[1] -1 - padding[1]);
index_t start_y = std::max<int>(0, oh + strides[0] -1 - padding[0]);
start_x /= strides[1];
start_y /= strides[0];
filter_start_x = padding[1] + strides[1] * start_x - ow;
filter_start_y = padding[0] + strides[0] * start_y - oh;
filter_start_x = kernel_hw[1] - 1 - filter_start_x;
filter_start_y = kernel_hw[0] - 1 - filter_start_y;
T out_value = 0;
index_t out_pos =
((b * out_shape[1] + oc) * out_shape[2] + oh) * out_shape[3] + ow;
for (index_t ic = 0; ic < in_shape[1]; ++ic) {
for (index_t f_y = filter_start_y, ih = start_y;
f_y >= 0 && ih < in_shape[2]; f_y -= strides[0], ++ih) {
for (index_t f_x = filter_start_x, iw = start_x;
f_x >= 0 && iw < in_shape[3]; f_x -= strides[1], ++iw) {
index_t weight_pos =
((oc * in_shape[1] + ic) * kernel_hw[0] + f_y)
* kernel_hw[1] + f_x;
index_t in_pos =
((b * in_shape[1] + ic) * in_shape[2] + ih)
* in_shape[3] + iw;
out_value += input[in_pos] * filter[weight_pos];
}
}
}
if (bias != nullptr)
out_value += bias[oc];
output[out_pos] = out_value;
}
}
}
}
}
} // namespace deconv
struct Deconv2dFunctorBase : OpKernel {
Deconv2dFunctorBase(OpKernelContext *context,
const std::vector<int> &strides,
......@@ -107,6 +56,7 @@ struct Deconv2dFunctorBase : OpKernel {
const int *strides,
index_t *output_shape,
const int *padding_size,
int *input_padding,
const bool isNCHW = false) {
MACE_CHECK_NOTNULL(output_shape);
MACE_CHECK_NOTNULL(padding_size);
......@@ -119,13 +69,18 @@ struct Deconv2dFunctorBase : OpKernel {
const index_t in_height = isNCHW ? input_shape[2] : input_shape[1];
const index_t in_width = isNCHW ? input_shape[3] : input_shape[2];
const index_t filter_h = filter_shape[2];
const index_t filter_w = filter_shape[3];
const index_t kernel_h = filter_shape[2];
const index_t kernel_w = filter_shape[3];
input_padding[0] = static_cast<int>((kernel_h -1) * 2 - padding_size[0]);
input_padding[1] = static_cast<int>((kernel_w -1) * 2 - padding_size[1]);
input_padding[0] = std::max<int>(0, input_padding[0]);
input_padding[1] = std::max<int>(0, input_padding[1]);
index_t out_height =
(in_height - 1) * strides[0] + filter_h -padding_size[0];
(in_height - 1) * strides[0] + kernel_h - padding_size[0];
index_t out_width =
(in_width - 1) * strides[1] + filter_w -padding_size[1];
(in_width - 1) * strides[1] + kernel_w - padding_size[1];
output_shape[0] = input_shape[0];
if (isNCHW) {
......@@ -206,8 +161,12 @@ struct Deconv2dFunctorBase : OpKernel {
const float relux_max_limit_;
};
template <DeviceType D, typename T>
struct Deconv2dFunctor : Deconv2dFunctorBase {
template<DeviceType D, typename T>
struct Deconv2dFunctor;
template<>
struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
Deconv2dFunctor(OpKernelContext *context,
const std::vector<int> &strides,
const Padding &padding_type,
......@@ -223,6 +182,92 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
activation,
relux_max_limit) {}
void Deconv2dGeneral(const float *input,
const float *filter,
const float *bias,
const index_t kernel_h,
const index_t kernel_w,
const int *strides,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t kernel_size = kernel_h * kernel_w;
std::vector<int> out_map(kernel_size);
int p0 = 0;
int p1 = 0;
index_t gap = out_shape[3] - kernel_w;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
out_map[p0] = p1;
p0++;
p1++;
}
p1 += gap;
}
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_img_size = out_height * out_width;
const index_t in_img_size = in_height * in_width;
#pragma omp parallel for
for (int b = 0; b < in_shape[0]; ++b) {
for (int oc = 0; oc < out_shape[1]; ++oc) {
float *out_base =
output + (b * out_shape[1] + oc) * out_img_size;
const float bias_value = bias ? bias[oc] : 0.f;
std::fill_n(out_base, out_img_size, bias_value);
for (int i = 0; i < in_height; ++i) {
for (int j = 0; j < in_width; ++j) {
const index_t out_offset =
i * strides[0] * out_width + j * strides[1];
for (int ic = 0; ic < in_shape[1]; ++ic) {
const index_t input_idx =
(b * in_shape[1] + ic) * in_img_size + i * in_width + j;
const float val = input[input_idx];
const index_t kernel_offset =
(oc * in_shape[1] + ic) * kernel_size;
for (int k = 0; k < kernel_size; ++k) {
const index_t out_idx = out_offset + out_map[k];
const index_t kernel_idx = kernel_offset + k;
out_base[out_idx] += val * filter[kernel_idx];
}
}
}
}
}
}
}
void CropPadOut(const float *input,
const index_t *in_shape,
const index_t *out_shape,
const index_t pad_h,
const index_t pad_w,
float *output) {
const index_t batch = in_shape[0];
const index_t channel = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
#pragma omp parallel for
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channel; ++j) {
for (int k = 0; k < out_height; ++k) {
const float *input_base =
input + ((i * channel + j) * in_height + (k + pad_h)) * in_width;
float *output_base =
output + ((i * channel + j) * out_height + k)* out_width;
memcpy(output_base, input_base + pad_w, out_width * sizeof(float));
}
}
}
}
MaceStatus operator()(const Tensor *input, // NCHW
const Tensor *filter, // OIHW
const Tensor *bias,
......@@ -235,6 +280,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
MACE_CHECK_NOTNULL(output);
std::vector<int> paddings(2);
std::vector<int> out_paddings(2);
std::vector<index_t> output_shape(4);
if (paddings_.empty()) { // tensorflow
paddings = std::vector<int>(2, 0);
......@@ -261,19 +307,20 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
output_shape.data(),
paddings.data(), true);
} else { // caffe
paddings = paddings_;
out_paddings = paddings_;
output_shape = std::vector<index_t>(4, 0);
CalcDeconvOutputSize(input->shape().data(),
filter->shape().data(),
strides_.data(),
output_shape.data(),
paddings.data(), true);
out_paddings.data(),
paddings.data(),
true);
}
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
const index_t *in_shape = input->shape().data();
const index_t kernel_hw[2] = {kernel_h, kernel_w};
MACE_CHECK(filter->dim(0) == output_shape[1], filter->dim(0), " != ",
output_shape[1]);
......@@ -281,28 +328,144 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
in_shape[1]);
MACE_CHECK(in_shape[0] == output_shape[0],
"Input/Output batch size mismatch");
std::function<void(const float *input,
const float *filter,
const float *bias,
const index_t *in_shape,
const index_t *out_shape,
float *output)> deconv_func;
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard bias_mapper(bias);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<T>();
auto filter_data = filter->data<T>();
auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
auto output_data = output->mutable_data<T>();
int padding[2];
padding[0] = (paddings[0] + 1) >> 1;
padding[1] = (paddings[1] + 1) >> 1;
deconv::Deconv2dNCHW(input_data,
filter_data,
bias_data,
in_shape,
output_shape.data(),
kernel_hw,
strides_.data(),
padding,
output_data);
DoActivation(output_data,
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
auto output_data = output->mutable_data<float>();
const index_t padded_out_h = (in_shape[2] - 1) * strides_[0] + kernel_h;
const index_t padded_out_w = (in_shape[3] - 1) * strides_[1] + kernel_w;
const index_t pad_h = (padded_out_h - output_shape[2]) / 2;
const index_t pad_w = (padded_out_w - output_shape[3]) / 2;
std::vector<index_t> padded_out_shape({output_shape[0], output_shape[1],
padded_out_h, padded_out_w});
index_t padded_out_size =
std::accumulate(padded_out_shape.begin(),
padded_out_shape.end(),
1,
std::multiplies<index_t>()) * sizeof(float);
ScratchBuffer *scratch = context_->device()->scratch_buffer();
scratch->Rewind();
scratch->GrowSize(padded_out_size);
Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT);
auto *padded_out_data = padded_out.mutable_data<float>();
bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 &&
strides_[0] == strides_[1] && strides_[0] == 1;
bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 &&
strides_[0] == strides_[1] && strides_[0] == 2;
bool use_neon_4x4_s1 = kernel_h == kernel_w && kernel_h == 4 &&
strides_[0] == strides_[1] && strides_[0] == 1;
bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 &&
strides_[0] == strides_[1] && strides_[0] == 2;
if (use_neon_3x3_s1) {
deconv_func = [=](const float *input,
const float *filter,
const float *bias,
const index_t *in_shape,
const index_t *padded_out_shape,
float *padded_output) {
Deconv2dNeonK3x3S1(input,
filter,
bias,
in_shape,
padded_out_shape,
padded_output);
};
} else if (use_neon_3x3_s2) {
deconv_func = [=](const float *input,
const float *filter,
const float *bias,
const index_t *in_shape,
const index_t *padded_out_shape,
float *padded_output) {
Deconv2dNeonK3x3S2(input,
filter,
bias,
in_shape,
padded_out_shape,
padded_output);
};
} else if (use_neon_4x4_s1) {
deconv_func = [=](const float *input,
const float *filter,
const float *bias,
const index_t *in_shape,
const index_t *padded_out_shape,
float *padded_output) {
Deconv2dNeonK4x4S1(input,
filter,
bias,
in_shape,
padded_out_shape,
padded_output);
};
} else if (use_neon_4x4_s2) {
deconv_func = [=](const float *input,
const float *filter,
const float *bias,
const index_t *in_shape,
const index_t *padded_out_shape,
float *padded_output) {
Deconv2dNeonK4x4S2(input,
filter,
bias,
in_shape,
padded_out_shape,
padded_output);
};
} else {
deconv_func = [=](const float *input,
const float *filter,
const float *bias,
const index_t *in_shape,
const index_t *padded_out_shape,
float *padded_output) {
Deconv2dGeneral(input,
filter,
bias,
kernel_h,
kernel_w,
strides_.data(),
in_shape,
padded_out_shape,
padded_output);
};
}
bool no_pad =
padded_out_h == output_shape[2] && padded_out_w == output_shape[3];
float *out_data = no_pad ? output_data : padded_out_data;
deconv_func(input_data,
filter_data,
bias_data,
in_shape,
padded_out_shape.data(),
out_data);
if (!no_pad) {
CropPadOut(out_data,
padded_out_shape.data(),
output_shape.data(),
pad_h,
pad_w,
output_data);
}
DoActivation<float>(output_data,
output_data,
output->size(),
activation_,
......
......@@ -53,6 +53,7 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<int> paddings(2);
std::vector<int> out_paddings(2);
std::vector<index_t> output_shape(4);
if (paddings_.empty()) {
paddings = std::vector<int>(2, 0);
......@@ -74,12 +75,14 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
output_shape.data(),
paddings.data());
} else {
paddings = paddings_;
out_paddings = paddings_;
paddings = std::vector<int>(2, 0);
output_shape = std::vector<index_t>(4, 0);
CalcDeconvOutputSize(input->shape().data(),
filter->shape().data(),
strides_.data(),
output_shape.data(),
out_paddings.data(),
paddings.data());
}
......
......@@ -116,6 +116,7 @@ static void Deconv2d(int iters,
// TODO(liutuo): add cpu benchmark when optimized.
#define MACE_BM_DECONV_2D(N, C, H, W, KH, KW, S, OH, OW, P, OC) \
MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, CPU); \
MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \
MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU);
......@@ -124,6 +125,11 @@ MACE_BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128);
MACE_BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128);
MACE_BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32);
MACE_BM_DECONV_2D(1, 128, 60, 60, 4, 4, 1, 63, 63, VALID, 128);
MACE_BM_DECONV_2D(1, 32, 60, 60, 4, 4, 1, 60, 60, SAME, 32);
MACE_BM_DECONV_2D(1, 3, 224, 224, 4, 4, 2, 448, 448, SAME, 32);
MACE_BM_DECONV_2D(1, 3, 224, 224, 4, 4, 2, 450, 450, VALID, 32);
MACE_BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32);
MACE_BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32);
MACE_BM_DECONV_2D(1, 128, 64, 64, 5, 5, 1, 68, 68, VALID, 32);
......@@ -134,6 +140,7 @@ MACE_BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128);
MACE_BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128);
MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32);
MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32);
MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 448, 448, SAME, 32);
} // namespace test
} // namespace ops
......
......@@ -380,11 +380,11 @@ void TestComplexDeconvNxNS12(const int batch,
1e-4);
};
for (int kernel_size : {1, 3, 5, 7}) {
for (int kernel_size : {3, 4, 5, 7}) {
func(kernel_size, kernel_size, stride, stride, VALID, -1);
func(kernel_size, kernel_size, stride, stride, SAME, -1);
func(kernel_size, kernel_size, stride, stride, VALID, 1);
func(kernel_size, kernel_size, stride, stride, VALID, 2);
func(kernel_size, kernel_size, stride, stride, VALID, 3);
}
}
} // namespace
......@@ -410,8 +410,8 @@ TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNS34) {
}
TEST_F(Deconv2dOpTest, OPENCLUnalignedDeconvNxNMultiBatch) {
TestComplexDeconvNxNS12<DeviceType::GPU, float>(3, {17, 13, 5, 7}, 1);
TestComplexDeconvNxNS12<DeviceType::GPU, float>(5, {17, 13, 5, 7}, 2);
TestComplexDeconvNxNS12<DeviceType::GPU, float>(3, {17, 113, 5, 7}, 1);
TestComplexDeconvNxNS12<DeviceType::GPU, float>(5, {17, 113, 5, 7}, 2);
}
} // namespace test
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册