Commit 9e2ab0d1 authored by 刘琦

Merge branch 'test-ndk' into 'master'

fix ndk-r15c openmp bugs

See merge request !491
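The diff below follows one pattern throughout: NEON/CPU kernels that previously took a long list of scalar dimension arguments (batch, in_height, in_width, ..., pad_top, pad_left) now take pointer arrays (in_shape, out_shape, pad_hw, stride_hw, ...), the bounds of the OpenMP collapse(2) loops index directly into those arrays, and per-dimension locals are re-derived inside the loop body, presumably to work around the ndk-r15c OpenMP issue named in the title. The sketch below only illustrates the shape of that refactoring and is not MACE source; KernelNew, the NCHW layout comments, and the index_t typedef are assumptions made for the example.

#include <cstdint>

typedef int64_t index_t;  // assumption: a MACE-style 64-bit index type

// Illustrative only. "Before": many scalar dimension parameters that the
// collapsed OpenMP loops had to reference.
//   void KernelOld(const float *input, const float *filter,
//                  index_t batch, index_t in_c, index_t in_h, index_t in_w,
//                  index_t out_c, index_t out_h, index_t out_w, float *out);
//
// "After", matching the style of this merge request: shapes are passed as
// NCHW arrays, loop bounds index into them, and per-iteration locals are
// derived inside the parallel region.
void KernelNew(const float *input, const float *filter,
               const index_t *in_shape,   // {N, C_in, H_in, W_in}
               const index_t *out_shape,  // {N, C_out, H_out, W_out}
               float *output) {
  const index_t in_image_size = in_shape[2] * in_shape[3];
  const index_t out_image_size = out_shape[2] * out_shape[3];
  const index_t in_batch_size = in_shape[1] * in_image_size;
  const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
  for (index_t b = 0; b < out_shape[0]; ++b) {
    for (index_t m = 0; m < out_shape[1]; ++m) {
      // Locals that the old code received as scalar parameters.
      const index_t in_channels = in_shape[1];
      const float *in_base = input + b * in_batch_size;
      float *out_base = output + b * out_batch_size + m * out_image_size;
      // ... actual kernel arithmetic elided ...
      (void)in_channels;
      (void)in_base;
      (void)out_base;
      (void)filter;
    }
  }
}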
@@ -18,6 +18,7 @@
 #include <omp.h>
 #endif
+#include <errno.h>
 #include <unistd.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
...
@@ -31,68 +31,38 @@ extern void Conv2dNeonK1x1S1(const float *input,
 extern void Conv2dNeonK3x3S1(const float *input,
                              const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                              float *output);
 extern void Conv2dNeonK3x3S2(const float *input,
                              const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                              float *output);
 extern void Conv2dNeonK5x5S1(const float *input,
                              const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                              float *output);
 extern void Conv2dNeonK7x7S1(const float *input,
                              const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                              float *output);
 extern void Conv2dNeonK7x7S2(const float *input,
                              const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                              float *output);
 extern void Conv2dNeonK7x7S3(const float *input,
                              const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
                              float *output);
 } // namespace kernels
...
@@ -24,22 +24,22 @@ namespace kernels {
 // Ho = 2, Wo = 4, Co = 2
 void Conv2dNeonK3x3S1(const float *input,
                       const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                       float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; m += 2) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 2) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
       if (m + 1 < out_channels) {
         float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
@@ -522,23 +522,22 @@ void Conv2dNeonK3x3S1(const float *input,
 void Conv2dNeonK3x3S2(const float *input,
                       const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                       float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; ++m) {
-      for (index_t c = 0; c < in_channels; ++c) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; ++m) {
+      for (index_t c = 0; c < in_shape[1]; ++c) {
+        const index_t in_channels = in_shape[1];
+        const index_t in_width = in_shape[3];
+        const index_t out_height = out_shape[2];
+        const index_t out_width = out_shape[3];
         const float *in_base = input + b * in_batch_size + c * in_image_size;
         const float
             *filter_ptr = filter + m * in_channels * 9 + c * 9;
...
@@ -103,22 +103,22 @@ inline void Conv2dCPUK5x5Calc(const float *in_ptr_base,
 // Ho = 1, Wo = 4, Co = 4
 void Conv2dNeonK5x5S1(const float *input,
                       const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                       float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; m += 4) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 4) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
       if (m + 3 < out_channels) {
         float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
...
@@ -180,22 +180,22 @@ inline void Conv2dCPUK7x7Calc(const float *in_ptr_base,
 // Ho = 1, Wo = 4, Co = 4
 void Conv2dNeonK7x7S1(const float *input,
                       const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                       float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; m += 4) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 4) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
       if (m + 3 < out_channels) {
         float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
@@ -336,22 +336,22 @@ void Conv2dNeonK7x7S1(const float *input,
 // Ho = 1, Wo = 4, Co = 4
 void Conv2dNeonK7x7S2(const float *input,
                       const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                       float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; m += 4) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 4) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
       if (m + 3 < out_channels) {
         float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
@@ -502,22 +502,22 @@ void Conv2dNeonK7x7S2(const float *input,
 // Ho = 1, Wo = 4, Co = 4
 void Conv2dNeonK7x7S3(const float *input,
                       const float *filter,
-                      const index_t batch,
-                      const index_t in_height,
-                      const index_t in_width,
-                      const index_t in_channels,
-                      const index_t out_height,
-                      const index_t out_width,
-                      const index_t out_channels,
+                      const index_t *in_shape,
+                      const index_t *out_shape,
                       float *output) {
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; m += 4) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; m += 4) {
+      const index_t out_channels = out_shape[1];
+      const index_t out_height = out_shape[2];
+      const index_t out_width = out_shape[3];
+      const index_t in_channels = in_shape[1];
+      const index_t in_width = in_shape[3];
       if (m + 3 < out_channels) {
         float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
 #if defined(MACE_ENABLE_NEON)
...
@@ -22,15 +22,9 @@ namespace kernels {
 void DepthwiseConv2dNeonK3x3S1(const float *input,
                                const float *filter,
-                               const index_t batch,
-                               const index_t in_height,
-                               const index_t in_width,
-                               const index_t in_channels,
-                               const index_t out_height,
-                               const index_t out_width,
-                               const index_t out_channels,
-                               const int pad_top,
-                               const int pad_left,
+                               const index_t *in_shape,
+                               const index_t *out_shape,
+                               const int *pad_hw,
                                const index_t valid_h_start,
                                const index_t valid_h_stop,
                                const index_t valid_w_start,
@@ -39,15 +33,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
 void DepthwiseConv2dNeonK3x3S2(const float *input,
                                const float *filter,
-                               const index_t batch,
-                               const index_t in_height,
-                               const index_t in_width,
-                               const index_t in_channels,
-                               const index_t out_height,
-                               const index_t out_width,
-                               const index_t out_channels,
-                               const int pad_top,
-                               const int pad_left,
+                               const index_t *in_shape,
+                               const index_t *out_shape,
+                               const int *pad_hw,
                                const index_t valid_h_start,
                                const index_t valid_h_stop,
                                const index_t valid_w_start,
...
@@ -52,15 +52,9 @@ void DepthwiseConv2dPixel(const float *in_base,
 // Ho = 2, Wo = 4, Co = 1
 void DepthwiseConv2dNeonK3x3S1(const float *input,
                                const float *filter,
-                               const index_t batch,
-                               const index_t in_height,
-                               const index_t in_width,
-                               const index_t in_channels,
-                               const index_t out_height,
-                               const index_t out_width,
-                               const index_t out_channels,
-                               const int pad_top,
-                               const int pad_left,
+                               const index_t* in_shape,
+                               const index_t* out_shape,
+                               const int* pad_hw,
                                const index_t valid_h_start,
                                const index_t valid_h_stop,
                                const index_t valid_w_start,
@@ -70,25 +64,30 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
   MACE_UNUSED(valid_w_start);
   MACE_UNUSED(valid_w_stop);
 #endif
-  const index_t multiplier = out_channels / in_channels;
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t multiplier = out_shape[1] / in_shape[1];
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; ++m) {
+  for (index_t b = 0; b < in_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; ++m) {
       index_t c = m / multiplier;
       index_t multi_index = m % multiplier;
       const float *in_base = input + b * in_batch_size + c * in_image_size;
-      const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9;
+      const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9;
       float *out_base = output + b * out_batch_size + m * out_image_size;
       index_t h, w;
+      const index_t pad_top = pad_hw[0];
+      const index_t pad_left = pad_hw[1];
+      const index_t out_width = out_shape[3];
+      const index_t in_height = in_shape[2];
+      const index_t in_width = in_shape[3];
       // top
       for (h = 0; h < valid_h_start; ++h) {
-        for (w = 0; w < out_width; ++w) {
+        for (w = 0; w < out_shape[3]; ++w) {
           DepthwiseConv2dPixel(in_base,
                                filter_ptr,
                                h,
@@ -256,7 +255,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
       } // h
 #else
       for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) {
-        for (index_t iw = 0; iw < out_width; ++iw) {
+        for (index_t iw = 0; iw < out_shape[3]; ++iw) {
           DepthwiseConv2dPixel(in_base,
                                filter_ptr,
                                ih,
@@ -274,8 +273,8 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
 #endif
       // bottom
-      for (; h < out_height; ++h) {
-        for (w = 0; w < out_width; ++w) {
+      for (; h < out_shape[2]; ++h) {
+        for (w = 0; w < out_shape[3]; ++w) {
           DepthwiseConv2dPixel(in_base,
                                filter_ptr,
                                h,
@@ -296,15 +295,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
 void DepthwiseConv2dNeonK3x3S2(const float *input,
                                const float *filter,
-                               const index_t batch,
-                               const index_t in_height,
-                               const index_t in_width,
-                               const index_t in_channels,
-                               const index_t out_height,
-                               const index_t out_width,
-                               const index_t out_channels,
-                               const int pad_top,
-                               const int pad_left,
+                               const index_t* in_shape,
+                               const index_t* out_shape,
+                               const int* pad_hw,
                                const index_t valid_h_start,
                                const index_t valid_h_stop,
                                const index_t valid_w_start,
@@ -314,22 +307,26 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
   MACE_UNUSED(valid_w_start);
   MACE_UNUSED(valid_w_stop);
 #endif
-  const index_t multiplier = out_channels / in_channels;
-  const index_t in_image_size = in_height * in_width;
-  const index_t out_image_size = out_height * out_width;
-  const index_t in_batch_size = in_channels * in_image_size;
-  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t multiplier = out_shape[1] / in_shape[1];
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
 #pragma omp parallel for collapse(2)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; ++m) {
+  for (index_t b = 0; b < in_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; ++m) {
       index_t c = m / multiplier;
       index_t multi_index = m % multiplier;
       const float *in_base = input + b * in_batch_size + c * in_image_size;
-      const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9;
+      const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9;
       float *out_base = output + b * out_batch_size + m * out_image_size;
       index_t h, w;
+      const index_t pad_top = pad_hw[0];
+      const index_t pad_left = pad_hw[1];
+      const index_t out_width = out_shape[3];
+      const index_t in_height = in_shape[2];
+      const index_t in_width = in_shape[3];
       // top
       for (h = 0; h < valid_h_start; ++h) {
         for (w = 0; w < out_width; ++w) {
@@ -472,8 +469,8 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
 #endif
       // bottom
-      for (; h < out_height; ++h) {
-        for (w = 0; w < out_width; ++w) {
+      for (; h < out_shape[2]; ++h) {
+        for (w = 0; w < out_shape[3]; ++w) {
           DepthwiseConv2dPixel(in_base,
                                filter_ptr,
                                h,
...
@@ -84,49 +84,46 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
   void Conv2dGeneral(const float *input,
                      const float *filter,
-                     const index_t batch,
-                     const index_t in_height,
-                     const index_t in_width,
-                     const index_t in_channels,
-                     const index_t out_height,
-                     const index_t out_width,
-                     const index_t out_channels,
-                     const int filter_height,
-                     const int filter_width,
-                     const int stride_h,
-                     const int stride_w,
-                     const int dilation_h,
-                     const int dilation_w,
+                     const index_t *in_shape,
+                     const index_t *out_shape,
+                     const index_t *filter_shape,
+                     const int *stride_hw,
+                     const int *dilation_hw,
                      float *output) {
-    const index_t in_image_size = in_height * in_width;
-    const index_t out_image_size = out_height * out_width;
-    const index_t in_batch_size = in_channels * in_image_size;
-    const index_t out_batch_size = out_channels * out_image_size;
-    const index_t filter_size = filter_height * filter_width;
+    const index_t in_image_size = in_shape[2] * in_shape[3];
+    const index_t out_image_size = out_shape[2] * out_shape[3];
+    const index_t in_batch_size = filter_shape[1] * in_image_size;
+    const index_t out_batch_size = filter_shape[0] * out_image_size;
+    const index_t filter_size = filter_shape[2] * filter_shape[3];
 #pragma omp parallel for collapse(2)
-    for (index_t b = 0; b < batch; ++b) {
-      for (index_t m = 0; m < out_channels; m += 4) {
+    for (index_t b = 0; b < in_shape[0]; b++) {
+      for (index_t m = 0; m < filter_shape[0]; m += 4) {
+        const index_t in_height = in_shape[2];
+        const index_t in_width = in_shape[3];
+        const index_t out_height = out_shape[2];
+        const index_t out_width = out_shape[3];
+        const index_t out_channels = filter_shape[0];
+        const index_t in_channels = filter_shape[1];
+        const int stride_h = stride_hw[0];
+        const int stride_w = stride_hw[1];
+        const int dilation_h = dilation_hw[0];
+        const int dilation_w = dilation_hw[1];
         if (m + 3 < out_channels) {
           float *out_ptr0_base =
               output + b * out_batch_size + m * out_image_size;
-          float *out_ptr1_base =
-              output + b * out_batch_size + (m + 1) * out_image_size;
-          float *out_ptr2_base =
-              output + b * out_batch_size + (m + 2) * out_image_size;
-          float *out_ptr3_base =
-              output + b * out_batch_size + (m + 3) * out_image_size;
+          float *out_ptr1_base = out_ptr0_base + out_image_size;
+          float *out_ptr2_base = out_ptr1_base + out_image_size;
+          float *out_ptr3_base = out_ptr2_base + out_image_size;
           for (index_t c = 0; c < in_channels; ++c) {
            const float *in_ptr_base =
                input + b * in_batch_size + c * in_image_size;
            const float *filter_ptr0 =
                filter + m * in_channels * filter_size + c * filter_size;
-           const float *filter_ptr1 =
-               filter + (m + 1) * in_channels * filter_size + c * filter_size;
-           const float *filter_ptr2 =
-               filter + (m + 2) * in_channels * filter_size + c * filter_size;
-           const float *filter_ptr3 =
-               filter + (m + 3) * in_channels * filter_size + c * filter_size;
+           const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size;
+           const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size;
+           const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size;
            for (index_t h = 0; h < out_height; ++h) {
              for (index_t w = 0; w + 3 < out_width; w += 4) {
                // input offset
@@ -144,8 +141,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                  vo3[ow] = out_ptr3_base[out_offset + ow];
                }
                // calc by row
-               for (index_t kh = 0; kh < filter_height; ++kh) {
-                 for (index_t kw = 0; kw < filter_width; ++kw) {
+               for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
+                 for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
                    // outch 0
                    vo0[0] += in_ptr_base[in_offset
                        + kw * dilation_w] * filter_ptr0[kw];
@@ -185,10 +182,10 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                  } // kw
                  in_offset += dilation_h * in_width;
-                 filter_ptr0 += filter_width;
-                 filter_ptr1 += filter_width;
-                 filter_ptr2 += filter_width;
-                 filter_ptr3 += filter_width;
+                 filter_ptr0 += filter_shape[3];
+                 filter_ptr1 += filter_shape[3];
+                 filter_ptr2 += filter_shape[3];
+                 filter_ptr3 += filter_shape[3];
                } // kh
                for (index_t ow = 0; ow < 4; ++ow) {
@@ -230,8 +227,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                }
                // calc by row
-               for (index_t kh = 0; kh < filter_height; ++kh) {
-                 for (index_t kw = 0; kw < filter_width; ++kw) {
+               for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
+                 for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
                    // outch 0
                    vo0[0] += in_ptr_base[in_offset
                        + kw * dilation_w] * filter_ptr0[kw];
@@ -244,7 +241,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                  } // kw
                  in_offset += dilation_h * in_width;
-                 filter_ptr0 += filter_width;
+                 filter_ptr0 += filter_shape[3];
                } // kh
                for (index_t ow = 0; ow < 4; ++ow) {
@@ -325,6 +322,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
     index_t dilation_h = dilations_[0];
     index_t dilation_w = dilations_[1];
+    const index_t filter_hw[2] = {filter_h, filter_w};
     MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
     index_t padded_input_height = input_height + paddings[0];
@@ -478,6 +477,10 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
         transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT);
     Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT);
     Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT);
+    const index_t extra_input_shape[4] =
+        {batch, input_channels, extra_input_height, extra_input_width};
+    const index_t extra_output_shape[4] =
+        {batch, channels, extra_output_height, extra_output_width};
     // decide which convolution function to call
     if (use_winograd) {
@@ -512,6 +515,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
      float *transformed_input_data = transformed_input.mutable_data<float>();
      float *transformed_output_data = transformed_output.mutable_data<float>();
      conv_func = [=](const float *pad_input, float *pad_output) {
        WinoGradConv3x3s1(pad_input,
                          transformed_filter_ptr,
@@ -529,26 +533,16 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
      conv_func = [=](const float *pad_input, float *pad_output) {
        Conv2dNeonK3x3S1(pad_input,
                         filter_data,
-                        batch,
-                        extra_input_height,
-                        extra_input_width,
-                        input_channels,
-                        extra_output_height,
-                        extra_output_width,
-                        channels,
+                        extra_input_shape,
+                        extra_output_shape,
                         pad_output);
      };
    } else if (use_neon_3x3_s2) {
      conv_func = [=](const float *pad_input, float *pad_output) {
        Conv2dNeonK3x3S2(pad_input,
                         filter_data,
-                        batch,
-                        extra_input_height,
-                        extra_input_width,
-                        input_channels,
-                        extra_output_height,
-                        extra_output_width,
-                        channels,
+                        extra_input_shape,
+                        extra_output_shape,
                         pad_output);
      };
    } else if (use_neon_1x1_s1) {
@@ -566,71 +560,43 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
      conv_func = [=](const float *pad_input, float *pad_output) {
        Conv2dNeonK5x5S1(pad_input,
                         filter_data,
-                        batch,
-                        extra_input_height,
-                        extra_input_width,
-                        input_channels,
-                        extra_output_height,
-                        extra_output_width,
-                        channels,
+                        extra_input_shape,
+                        extra_output_shape,
                         pad_output);
      };
    } else if (use_neon_7x7_s1) {
      conv_func = [=](const float *pad_input, float *pad_output) {
        Conv2dNeonK7x7S1(pad_input,
                         filter_data,
-                        batch,
-                        extra_input_height,
-                        extra_input_width,
-                        input_channels,
-                        extra_output_height,
-                        extra_output_width,
-                        channels,
+                        extra_input_shape,
+                        extra_output_shape,
                         pad_output);
      };
    } else if (use_neon_7x7_s2) {
      conv_func = [=](const float *pad_input, float *pad_output) {
        Conv2dNeonK7x7S2(pad_input,
                         filter_data,
-                        batch,
-                        extra_input_height,
-                        extra_input_width,
-                        input_channels,
-                        extra_output_height,
-                        extra_output_width,
-                        channels,
+                        extra_input_shape,
+                        extra_output_shape,
                         pad_output);
      };
    } else if (use_neon_7x7_s3) {
      conv_func = [=](const float *pad_input, float *pad_output) {
        Conv2dNeonK7x7S3(pad_input,
                         filter_data,
-                        batch,
-                        extra_input_height,
-                        extra_input_width,
-                        input_channels,
-                        extra_output_height,
-                        extra_output_width,
-                        channels,
+                        extra_input_shape,
+                        extra_output_shape,
                         pad_output);
      };
    } else {
      conv_func = [=](const float *pad_input, float *pad_output) {
        Conv2dGeneral(pad_input,
                      filter_data,
-                     batch,
-                     extra_input_height,
-                     extra_input_width,
-                     input_channels,
-                     extra_output_height,
-                     extra_output_width,
-                     channels,
-                     filter_h,
-                     filter_w,
-                     stride_h,
-                     stride_w,
-                     dilation_h,
-                     dilation_w,
+                     extra_input_shape,
+                     extra_output_shape,
+                     filter_shape.data(),
+                     strides_,
+                     dilations_,
                      pad_output);
      };
    }
...
@@ -41,48 +41,40 @@ template<typename T>
 void Deconv2dNCHW(const T *input,
                   const T *filter,
                   const T *bias,
-                  const index_t batch,
-                  const index_t in_height,
-                  const index_t in_width,
-                  const index_t in_channels,
-                  const index_t out_height,
-                  const index_t out_width,
-                  const index_t out_channels,
-                  const index_t filter_height,
-                  const index_t filter_width,
-                  const index_t stride_h,
-                  const index_t stride_w,
-                  const int padding_top,
-                  const int padding_left,
+                  const index_t *in_shape,
+                  const index_t *out_shape,
+                  const index_t *kernel_hw,
+                  const int *strides,
+                  const int *padding,
                   float *output) {
 #pragma omp parallel for collapse(4)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t oc = 0; oc < out_channels; ++oc) {
-      for (index_t oh = 0; oh < out_height; ++oh) {
-        for (index_t ow = 0; ow < out_width; ++ow) {
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t oc = 0; oc < out_shape[1]; ++oc) {
+      for (index_t oh = 0; oh < out_shape[2]; ++oh) {
+        for (index_t ow = 0; ow < out_shape[3]; ++ow) {
          index_t filter_start_y, filter_start_x;
-          index_t start_x = std::max<int>(0, ow + stride_w -1 - padding_left);
-          index_t start_y = std::max<int>(0, oh + stride_h -1 - padding_top);
-          start_x /= stride_w;
-          start_y /= stride_h;
-          filter_start_x = padding_left + stride_w * start_x - ow;
-          filter_start_y = padding_top + stride_h * start_y - oh;
-          filter_start_x = filter_width - 1 - filter_start_x;
-          filter_start_y = filter_height - 1 - filter_start_y;
+          index_t start_x = std::max<int>(0, ow + strides[1] -1 - padding[1]);
+          index_t start_y = std::max<int>(0, oh + strides[0] -1 - padding[0]);
+          start_x /= strides[1];
+          start_y /= strides[0];
+          filter_start_x = padding[1] + strides[1] * start_x - ow;
+          filter_start_y = padding[0] + strides[0] * start_y - oh;
+          filter_start_x = kernel_hw[1] - 1 - filter_start_x;
+          filter_start_y = kernel_hw[0] - 1 - filter_start_y;
          T out_value = 0;
          index_t out_pos =
-              ((b * out_channels + oc) * out_height + oh) * out_width + ow;
-          for (index_t ic = 0; ic < in_channels; ++ic) {
+              ((b * out_shape[1] + oc) * out_shape[2] + oh) * out_shape[3] + ow;
+          for (index_t ic = 0; ic < in_shape[1]; ++ic) {
            for (index_t f_y = filter_start_y, ih = start_y;
-                 f_y >= 0 && ih < in_height; f_y -= stride_h, ++ih) {
+                 f_y >= 0 && ih < in_shape[2]; f_y -= strides[0], ++ih) {
              for (index_t f_x = filter_start_x, iw = start_x;
-                   f_x >= 0 && iw < in_width; f_x -= stride_w, ++iw) {
+                   f_x >= 0 && iw < in_shape[3]; f_x -= strides[1], ++iw) {
                index_t weight_pos =
-                    ((oc * in_channels + ic) * filter_height + f_y)
-                        * filter_width + f_x;
+                    ((oc * in_shape[1] + ic) * kernel_hw[0] + f_y)
+                        * kernel_hw[1] + f_x;
                index_t in_pos =
-                    ((b * in_channels + ic) * in_height + ih)
-                        * in_width + iw;
+                    ((b * in_shape[1] + ic) * in_shape[2] + ih)
+                        * in_shape[3] + iw;
                out_value += input[in_pos] * filter[weight_pos];
              }
            }
@@ -269,26 +261,17 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
                          paddings_.data(), true);
      output->Resize(output_shape_);
    }
-    index_t batch = output->dim(0);
-    index_t channels = output->dim(1);
-    index_t height = output->dim(2);
-    index_t width = output->dim(3);
-    index_t input_batch = input->dim(0);
-    index_t input_channels = input->dim(1);
-    index_t input_height = input->dim(2);
-    index_t input_width = input->dim(3);
    index_t kernel_h = filter->dim(2);
    index_t kernel_w = filter->dim(3);
-    MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
-    MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
-               input_channels);
-    index_t stride_h = strides_[0];
-    index_t stride_w = strides_[1];
-    MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
+    const index_t *in_shape = input->shape().data();
+    const index_t *out_shape = output->shape().data();
+    const index_t kernel_hw[2] = {kernel_h, kernel_w};
+    MACE_CHECK(filter->dim(0) == out_shape[1], filter->dim(0), " != ",
+               output_shape[1]);
+    MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ",
+               in_shape[1]);
+    MACE_CHECK(in_shape[0] == out_shape[0], "Input/Output batch size mismatch");
    Tensor::MappingGuard input_mapper(input);
    Tensor::MappingGuard filter_mapper(filter);
    Tensor::MappingGuard bias_mapper(bias);
@@ -297,17 +280,23 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
    auto filter_data = filter->data<T>();
    auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
    auto output_data = output->mutable_data<T>();
-    int padding_top = (paddings_[0] + 1) >> 1;
-    int padding_left = (paddings_[1] + 1) >> 1;
+    int padding[2];
+    padding[0] = (paddings_[0] + 1) >> 1;
+    padding[1] = (paddings_[1] + 1) >> 1;
-    deconv::Deconv2dNCHW(input_data, filter_data, bias_data,
-                         batch, input_height, input_width, input_channels,
-                         height, width, channels,
-                         kernel_h, kernel_w,
-                         stride_h, stride_w, padding_top, padding_left,
+    deconv::Deconv2dNCHW(input_data,
+                         filter_data,
+                         bias_data,
+                         in_shape,
+                         out_shape,
+                         kernel_hw,
+                         strides_,
+                         padding,
                          output_data);
-    DoActivation(output_data, output_data, output->size(), activation_,
+    DoActivation(output_data,
+                 output_data,
+                 output->size(),
+                 activation_,
                  relux_max_limit_);
  }
 };
...
@@ -34,10 +34,10 @@ struct DepthToSpaceOpFunctor {
      : block_size_(block_size), d2s_(d2s) {}
  void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
    MACE_UNUSED(future);
-    const int batch_size = input->dim(0);
-    const int input_depth = input->dim(1);
-    const int input_height = input->dim(2);
-    const int input_width = input->dim(3);
+    const index_t batch_size = input->dim(0);
+    const index_t input_depth = input->dim(1);
+    const index_t input_height = input->dim(2);
+    const index_t input_width = input->dim(3);
    index_t output_depth, output_width, output_height;
@@ -62,11 +62,11 @@ struct DepthToSpaceOpFunctor {
    if (d2s_) {
 #pragma omp parallel for
-      for (int b = 0; b < batch_size; ++b) {
-        for (int d = 0; d < output_depth; ++d) {
-          for (int h = 0; h < output_height; ++h) {
-            const int in_h = h / block_size_;
-            const int offset_h = (h % block_size_);
+      for (index_t b = 0; b < batch_size; ++b) {
+        for (index_t d = 0; d < output_depth; ++d) {
+          for (index_t h = 0; h < output_height; ++h) {
+            const index_t in_h = h / block_size_;
+            const index_t offset_h = (h % block_size_);
            for (int w = 0; w < output_width; ++w) {
              const index_t in_w = w / block_size_;
              const index_t offset_w = w % block_size_;
@@ -86,18 +86,18 @@ struct DepthToSpaceOpFunctor {
      }
    } else {
 #pragma omp parallel for
-      for (int b = 0; b < batch_size; ++b) {
-        for (int d = 0; d < input_depth; ++d) {
-          for (int h = 0; h < input_height; ++h) {
-            const int out_h = h / block_size_;
-            const int offset_h = (h % block_size_);
-            for (int w = 0; w < input_width; ++w) {
-              const int out_w = w / block_size_;
-              const int offset_w = (w % block_size_);
-              const int offset_d =
+      for (index_t b = 0; b < batch_size; ++b) {
+        for (index_t d = 0; d < input_depth; ++d) {
+          for (index_t h = 0; h < input_height; ++h) {
+            const index_t out_h = h / block_size_;
+            const index_t offset_h = (h % block_size_);
+            for (index_t w = 0; w < input_width; ++w) {
+              const index_t out_w = w / block_size_;
+              const index_t offset_w = (w % block_size_);
+              const index_t offset_d =
                  (offset_h * block_size_ + offset_w) * input_depth;
-              const int out_d = d + offset_d;
+              const index_t out_d = d + offset_d;
              const index_t o_index =
                  ((b * output_depth + out_d) * output_height + out_h)
                      * output_width + out_w;
...
@@ -78,28 +78,27 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
  void DepthwiseConv2dGeneral(const float *input,
                              const float *filter,
-                             const index_t batch,
-                             const index_t in_height,
-                             const index_t in_width,
-                             const index_t in_channels,
-                             const index_t out_height,
-                             const index_t out_width,
-                             const index_t out_channels,
-                             const int filter_height,
-                             const int filter_width,
-                             const int stride_h,
-                             const int stride_w,
-                             const int dilation_h,
-                             const int dilation_w,
-                             const int pad_top,
-                             const int pad_left,
+                             const index_t *in_shape,
+                             const index_t *out_shape,
+                             const index_t *filter_shape,
+                             const int *stride_hw,
+                             const int *dilation_hw,
+                             const int *pad_hw,
                              float *output) {
-    const index_t multiplier = out_channels / in_channels;
+    const index_t multiplier = filter_shape[0] / filter_shape[1];
 #pragma omp parallel for collapse(2)
-    for (index_t b = 0; b < batch; ++b) {
-      for (index_t m = 0; m < out_channels; ++m) {
-        for (index_t h = 0; h < out_height; ++h) {
-          for (index_t w = 0; w < out_width; ++w) {
+    for (index_t b = 0; b < in_shape[0]; ++b) {
+      for (index_t m = 0; m < filter_shape[0]; ++m) {
+        for (index_t h = 0; h < out_shape[2]; ++h) {
+          for (index_t w = 0; w < out_shape[3]; ++w) {
+            const index_t out_channels = filter_shape[0];
+            const index_t in_channels = filter_shape[1];
+            const index_t filter_height = filter_shape[2];
+            const index_t filter_width = filter_shape[3];
+            const index_t in_height = in_shape[2];
+            const index_t in_width = in_shape[3];
+            const index_t out_height = out_shape[2];
+            const index_t out_width = out_shape[3];
            index_t out_offset =
                ((b * out_channels + m) * out_height + h) * out_width + w;
            index_t c = m / multiplier;
@@ -107,8 +106,8 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
            float sum = 0;
            for (index_t kh = 0; kh < filter_height; ++kh) {
              for (index_t kw = 0; kw < filter_width; ++kw) {
-                index_t ih = h * stride_h + kh * dilation_h - pad_top;
-                index_t iw = w * stride_w + kw * dilation_w - pad_left;
+                index_t ih = h * stride_hw[0] + kh * dilation_hw[0] - pad_hw[0];
+                index_t iw = w * stride_hw[1] + kw * dilation_hw[1] - pad_hw[1];
                if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
                  index_t in_offset =
                      ((b * in_channels + c) * in_height + ih) * in_width + iw;
@@ -214,20 +213,18 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
    auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
    auto output_data = output->mutable_data<float>();
+    const int pad_hw[2] = {pad_top, pad_left};
+    const index_t input_shape[4] =
+        {batch, input_channels, input_height, input_width};
    if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1
        && dilation_h == 1 && dilation_w == 1) {
      conv_func = [=](const float *input, float *output) {
        DepthwiseConv2dNeonK3x3S1(input,
                                  filter_data,
-                                  batch,
-                                  input_height,
-                                  input_width,
-                                  input_channels,
-                                  height,
-                                  width,
-                                  channels,
-                                  pad_top,
-                                  pad_left,
+                                  input_shape,
+                                  output_shape.data(),
+                                  pad_hw,
                                  valid_h_start,
                                  valid_h_stop,
                                  valid_w_start,
@@ -239,15 +236,9 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
      conv_func = [=](const float *input, float *output) {
        DepthwiseConv2dNeonK3x3S2(input,
                                  filter_data,
-                                  batch,
-                                  input_height,
-                                  input_width,
-                                  input_channels,
-                                  height,
-                                  width,
-                                  channels,
-                                  pad_top,
-                                  pad_left,
+                                  input_shape,
+                                  output_shape.data(),
+                                  pad_hw,
                                  valid_h_start,
                                  valid_h_stop,
                                  valid_w_start,
@@ -258,21 +249,12 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
      conv_func = [=](const float *input, float *output) {
        DepthwiseConv2dGeneral(input,
                               filter_data,
-                               batch,
-                               input_height,
-                               input_width,
-                               input_channels,
-                               height,
-                               width,
-                               channels,
-                               filter_h,
-                               filter_w,
-                               stride_h,
-                               stride_w,
-                               dilation_h,
-                               dilation_w,
-                               pad_top,
-                               pad_left,
+                               input_shape,
+                               output_shape.data(),
+                               filter_shape.data(),
+                               strides_,
+                               dilations_,
+                               pad_hw,
                               output);
      };
    }
...
@@ -37,6 +37,10 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
                  const BufferType type,
                  Tensor *output,
                  StatsFuture *future) {
+    MACE_UNUSED(input);
+    MACE_UNUSED(type);
+    MACE_UNUSED(output);
+    MACE_UNUSED(future);
    MACE_NOT_IMPLEMENTED;
  }
 };
...
@@ -90,7 +90,8 @@ void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
  } else {
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
-      roundup_gws[i] = RoundUp(gws[i], lws[i]);
+      if (lws[i] != 0)
+        roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
...
@@ -75,39 +75,38 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
  }
  void MaxPooling(const float *input,
-                 const index_t batch,
-                 const index_t in_height,
-                 const index_t in_width,
-                 const index_t channels,
-                 const index_t out_height,
-                 const index_t out_width,
-                 const int filter_height,
-                 const int filter_width,
-                 const int stride_h,
-                 const int stride_w,
-                 const int dilation_h,
-                 const int dilation_w,
-                 const int pad_top,
-                 const int pad_left,
+                 const index_t *in_shape,
+                 const index_t *out_shape,
+                 const int *filter_hw,
+                 const int *stride_hw,
+                 const int *dilation_hw,
+                 const int *pad_hw,
                  float *output) {
-    const index_t in_image_size = in_height * in_width;
-    const index_t out_image_size = out_height * out_width;
-    const index_t in_batch_size = channels * in_image_size;
-    const index_t out_batch_size = channels * out_image_size;
+    const index_t in_image_size = in_shape[2] * in_shape[3];
+    const index_t out_image_size = out_shape[2] * out_shape[3];
+    const index_t in_batch_size = in_shape[1] * in_image_size;
+    const index_t out_batch_size = out_shape[1] * out_image_size;
 #pragma omp parallel for collapse(2)
-    for (index_t b = 0; b < batch; ++b) {
-      for (index_t c = 0; c < channels; ++c) {
+    for (index_t b = 0; b < out_shape[0]; ++b) {
+      for (index_t c = 0; c < out_shape[1]; ++c) {
        const index_t out_base = b * out_batch_size + c * out_image_size;
        const index_t in_base = b * in_batch_size + c * in_image_size;
+        const index_t out_height = out_shape[2];
+        const index_t out_width = out_shape[3];
+        const index_t in_height = in_shape[2];
+        const index_t in_width = in_shape[3];
        for (index_t h = 0; h < out_height; ++h) {
          for (index_t w = 0; w < out_width; ++w) {
            const index_t out_offset = out_base + h * out_width + w;
            float res = std::numeric_limits<float>::lowest();
-            for (int fh = 0; fh < filter_height; ++fh) {
-              for (int fw = 0; fw < filter_width; ++fw) {
-                int inh = h * stride_h + dilation_h * fh - pad_top;
-                int inw = w * stride_w + dilation_w * fw - pad_left;
+            for (int fh = 0; fh < filter_hw[0]; ++fh) {
+              for (int fw = 0; fw < filter_hw[1]; ++fw) {
+                index_t inh =
+                    h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0];
+                index_t inw =
+                    w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1];
                if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
                  index_t input_offset = in_base + inh * in_width + inw;
                  res = std::max(res, input[input_offset]);
@@ -122,40 +121,38 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
  }
  void AvgPooling(const float *input,
-                 const index_t batch,
-                 const index_t in_height,
-                 const index_t in_width,
-                 const index_t channels,
-                 const index_t out_height,
-                 const index_t out_width,
-                 const int filter_height,
-                 const int filter_width,
-                 const int stride_h,
-                 const int stride_w,
-                 const int dilation_h,
-                 const int dilation_w,
-                 const int pad_top,
-                 const int pad_left,
+                 const index_t *in_shape,
+                 const index_t *out_shape,
+                 const int *filter_hw,
+                 const int *stride_hw,
+                 const int *dilation_hw,
+                 const int *pad_hw,
                  float *output) {
-    const index_t in_image_size = in_height * in_width;
-    const index_t out_image_size = out_height * out_width;
-    const index_t in_batch_size = channels * in_image_size;
-    const index_t out_batch_size = channels * out_image_size;
+    const index_t in_image_size = in_shape[2] * in_shape[3];
+    const index_t out_image_size = out_shape[2] * out_shape[3];
+    const index_t in_batch_size = in_shape[1] * in_image_size;
+    const index_t out_batch_size = out_shape[1] * out_image_size;
 #pragma omp parallel for collapse(2)
-    for (index_t b = 0; b < batch; ++b) {
-      for (index_t c = 0; c < channels; ++c) {
+    for (index_t b = 0; b < out_shape[0]; ++b) {
+      for (index_t c = 0; c < out_shape[1]; ++c) {
        const index_t out_base = b * out_batch_size + c * out_image_size;
        const index_t in_base = b * in_batch_size + c * in_image_size;
+        const index_t in_height = in_shape[2];
+        const index_t in_width = in_shape[3];
+        const index_t out_height = out_shape[2];
+        const index_t out_width = out_shape[3];
        for (index_t h = 0; h < out_height; ++h) {
          for (index_t w = 0; w < out_width; ++w) {
            const index_t out_offset = out_base + h * out_width + w;
            float res = 0;
            int block_size = 0;
-            for (int fh = 0; fh < filter_height; ++fh) {
-              for (int fw = 0; fw < filter_width; ++fw) {
-                int inh = h * stride_h + dilation_h * fh - pad_top;
-                int inw = w * stride_w + dilation_w * fw - pad_left;
+            for (int fh = 0; fh < filter_hw[0]; ++fh) {
+              for (int fw = 0; fw < filter_hw[1]; ++fw) {
+                index_t inh =
+                    h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0];
+                index_t inw =
+                    w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1];
                if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
                  index_t input_offset = in_base + inh * in_width + inw;
                  res += input[input_offset];
@@ -200,59 +197,25 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
    const float *input = input_tensor->data<float>();
    float *output = output_tensor->mutable_data<float>();
    const index_t *input_shape = input_tensor->shape().data();
-    index_t batch = output_shape[0];
-    index_t channels = output_shape[1];
-    index_t height = output_shape[2];
-    index_t width = output_shape[3];
-    index_t input_height = input_shape[2];
-    index_t input_width = input_shape[3];
-    int filter_h = kernels_[0];
-    int filter_w = kernels_[1];
-    int stride_h = strides_[0];
-    int stride_w = strides_[1];
-    int dilation_h = dilations_[0];
-    int dilation_w = dilations_[1];
-    int pad_top = paddings[0] / 2;
-    int pad_left = paddings[1] / 2;
+    int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2};
    if (pooling_type_ == PoolingType::MAX) {
      MaxPooling(input,
-                 batch,
-                 input_height,
-                 input_width,
-                 channels,
-                 height,
-                 width,
-                 filter_h,
-                 filter_w,
-                 stride_h,
-                 stride_w,
-                 dilation_h,
-                 dilation_w,
-                 pad_top,
-                 pad_left,
+                 input_shape,
+                 output_shape.data(),
+                 kernels_,
+                 strides_,
+                 dilations_,
+                 pad_hw,
                 output);
    } else if (pooling_type_ == PoolingType::AVG) {
      AvgPooling(input,
-                 batch,
-                 input_height,
-                 input_width,
-                 channels,
-                 height,
-                 width,
-                 filter_h,
-                 filter_w,
-                 stride_h,
-                 stride_w,
-                 dilation_h,
-                 dilation_w,
-                 pad_top,
-                 pad_left,
+                 input_shape,
+                 output_shape.data(),
+                 kernels_,
+                 strides_,
+                 dilations_,
+                 pad_hw,
                 output);
    } else {
      MACE_NOT_IMPLEMENTED;
...
@@ -111,6 +111,7 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
    MACE_UNUSED(input);
    MACE_UNUSED(bias);
    MACE_UNUSED(output);
+    MACE_UNUSED(future);
    MACE_NOT_IMPLEMENTED;
  }
 };
...
@@ -38,7 +38,7 @@ class ActivationOp : public Operator<D, T> {
    const Tensor *input_tensor = this->Input(0);
    const Tensor *alpha_tensor =
        this->InputSize() >= 2 ? this->Input(1) : nullptr;
-    Tensor *output_tensor = this->outputs_[0];
+    Tensor *output_tensor = this->Output(0);
    output_tensor->ResizeLike(input_tensor);
    functor_(input_tensor, alpha_tensor, output_tensor, future);
...
@@ -618,6 +618,8 @@ struct Expector<EXP_TYPE, RES_TYPE, false> {
  static void Near(const Tensor &x, const Tensor &y,
                   const double rel_err,
                   const double abs_err) {
+    MACE_UNUSED(rel_err);
+    MACE_UNUSED(abs_err);
    Equal(x, y);
  }
 };
...
@@ -15,6 +15,7 @@
 #include "mace/utils/logging.h"
 #include <stdlib.h>
+#include <string.h>
 #if defined(ANDROID) || defined(__ANDROID__)
 #include <android/log.h>
 #include <iostream>
...