Commit 1fcd812b authored by liutuo

Fix ndk-r15c OpenMP bugs

Parent 3b11a1b2
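The diff applies one pattern across the CPU conv/depthwise/deconv/pooling kernels: functions that used to take a long list of scalar dimension parameters (batch, in_height, in_width, in_channels, ...) now take const index_t *in_shape / const index_t *out_shape pointers in NCHW order (plus filter/stride/dilation/pad arrays where needed), and the #pragma omp parallel for collapse(2) loops read their bounds directly from those arrays, with any derived scalars declared inside the loop body. The sketch below only illustrates that pattern; the function name is made up, the kernel body is a placeholder, and index_t is assumed here to be a 64-bit signed index as in MACE.

#include <cstdint>
typedef int64_t index_t;  // assumption: MACE's index type is a 64-bit signed integer

// Before (signature style removed by this commit):
//   void Conv2dNeonKxxSx(const float *input, const float *filter,
//                        const index_t batch, const index_t in_height, ...,
//                        float *output);
//
// After: shapes arrive as NCHW arrays.
void Conv2dKernelSketch(const float *input,
                        const index_t *in_shape,   // {N, C_in, H_in, W_in}
                        const index_t *out_shape,  // {N, C_out, H_out, W_out}
                        float *output) {
  const index_t in_image_size = in_shape[2] * in_shape[3];
  const index_t out_image_size = out_shape[2] * out_shape[3];
  const index_t in_batch_size = in_shape[1] * in_image_size;
  const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
  for (index_t b = 0; b < out_shape[0]; ++b) {    // loop bounds read straight
    for (index_t m = 0; m < out_shape[1]; ++m) {  // from the shape arrays
      // Derived scalars are declared inside the parallel region rather than
      // passed (and captured) as separate const function parameters.
      const index_t in_channels = in_shape[1];
      const float *in_base = input + b * in_batch_size;
      float *out_base = output + b * out_batch_size + m * out_image_size;
      (void)in_channels;  // used by the real kernel body, elided here
      (void)in_base;
      // Placeholder body: the real kernels convolve; here we just zero out.
      for (index_t i = 0; i < out_image_size; ++i) {
        out_base[i] = 0.0f;
      }
    }
  }
}

With the scalar parameters gone, the collapsed loop bounds become plain array loads instead of captured const parameters, which is presumably what sidesteps the ndk-r15c OpenMP problem the commit title refers to.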
......@@ -18,6 +18,7 @@
#include <omp.h>
#endif
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
......
......@@ -31,68 +31,38 @@ extern void Conv2dNeonK1x1S1(const float *input,
extern void Conv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output);
extern void Conv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output);
extern void Conv2dNeonK5x5S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output);
extern void Conv2dNeonK7x7S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output);
extern void Conv2dNeonK7x7S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output);
extern void Conv2dNeonK7x7S3(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output);
} // namespace kernels
......
......@@ -24,22 +24,22 @@ namespace kernels {
// Ho = 2, Wo = 4, Co = 2
void Conv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; m += 2) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 2) {
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 1 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
......@@ -522,23 +522,22 @@ void Conv2dNeonK3x3S1(const float *input,
void Conv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
for (index_t c = 0; c < in_shape[1]; ++c) {
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const float *in_base = input + b * in_batch_size + c * in_image_size;
const float
*filter_ptr = filter + m * in_channels * 9 + c * 9;
......
......@@ -103,22 +103,22 @@ inline void Conv2dCPUK5x5Calc(const float *in_ptr_base,
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK5x5S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; m += 4) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
......
......@@ -180,22 +180,22 @@ inline void Conv2dCPUK7x7Calc(const float *in_ptr_base,
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK7x7S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; m += 4) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
......@@ -336,22 +336,22 @@ void Conv2dNeonK7x7S1(const float *input,
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK7x7S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; m += 4) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
......@@ -502,22 +502,22 @@ void Conv2dNeonK7x7S2(const float *input,
// Ho = 1, Wo = 4, Co = 4
void Conv2dNeonK7x7S3(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t *in_shape,
const index_t *out_shape,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; m += 4) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_channels = in_shape[1];
const index_t in_width = in_shape[3];
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
......
......@@ -22,15 +22,9 @@ namespace kernels {
void DepthwiseConv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int pad_top,
const int pad_left,
const index_t *in_shape,
const index_t *out_shape,
const int *pad_hw,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
......@@ -39,15 +33,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
void DepthwiseConv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int pad_top,
const int pad_left,
const index_t *in_shape,
const index_t *out_shape,
const int *pad_hw,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
......
......@@ -52,15 +52,9 @@ void DepthwiseConv2dPixel(const float *in_base,
// Ho = 2, Wo = 4, Co = 1
void DepthwiseConv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int pad_top,
const int pad_left,
const index_t* in_shape,
const index_t* out_shape,
const int* pad_hw,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
......@@ -70,25 +64,30 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
MACE_UNUSED(valid_w_start);
MACE_UNUSED(valid_w_stop);
#endif
const index_t multiplier = out_channels / in_channels;
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t multiplier = out_shape[1] / in_shape[1];
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t b = 0; b < in_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
index_t c = m / multiplier;
index_t multi_index = m % multiplier;
const float *in_base = input + b * in_batch_size + c * in_image_size;
const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9;
const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9;
float *out_base = output + b * out_batch_size + m * out_image_size;
index_t h, w;
const index_t pad_top = pad_hw[0];
const index_t pad_left = pad_hw[1];
const index_t out_width = out_shape[3];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
// top
for (h = 0; h < valid_h_start; ++h) {
for (w = 0; w < out_width; ++w) {
for (w = 0; w < out_shape[3]; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
......@@ -256,7 +255,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
} // h
#else
for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) {
for (index_t iw = 0; iw < out_width; ++iw) {
for (index_t iw = 0; iw < out_shape[3]; ++iw) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
ih,
......@@ -274,8 +273,8 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
#endif
// bottom
for (; h < out_height; ++h) {
for (w = 0; w < out_width; ++w) {
for (; h < out_shape[2]; ++h) {
for (w = 0; w < out_shape[3]; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
......@@ -296,15 +295,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
void DepthwiseConv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int pad_top,
const int pad_left,
const index_t* in_shape,
const index_t* out_shape,
const int* pad_hw,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
......@@ -314,22 +307,26 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
MACE_UNUSED(valid_w_start);
MACE_UNUSED(valid_w_stop);
#endif
const index_t multiplier = out_channels / in_channels;
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t multiplier = out_shape[1] / in_shape[1];
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t b = 0; b < in_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
index_t c = m / multiplier;
index_t multi_index = m % multiplier;
const float *in_base = input + b * in_batch_size + c * in_image_size;
const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9;
const float *filter_ptr = filter + multi_index * in_shape[1] * 9 + c * 9;
float *out_base = output + b * out_batch_size + m * out_image_size;
index_t h, w;
const index_t pad_top = pad_hw[0];
const index_t pad_left = pad_hw[1];
const index_t out_width = out_shape[3];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
// top
for (h = 0; h < valid_h_start; ++h) {
for (w = 0; w < out_width; ++w) {
......@@ -472,8 +469,8 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
#endif
// bottom
for (; h < out_height; ++h) {
for (w = 0; w < out_width; ++w) {
for (; h < out_shape[2]; ++h) {
for (w = 0; w < out_shape[3]; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
......
......@@ -84,49 +84,46 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
void Conv2dGeneral(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const index_t *in_shape,
const index_t *out_shape,
const index_t *filter_shape,
const int *stride_hw,
const int *dilation_hw,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t filter_size = filter_height * filter_width;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = filter_shape[1] * in_image_size;
const index_t out_batch_size = filter_shape[0] * out_image_size;
const index_t filter_size = filter_shape[2] * filter_shape[3];
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; m += 4) {
for (index_t b = 0; b < in_shape[0]; b++) {
for (index_t m = 0; m < filter_shape[0]; m += 4) {
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t out_channels = filter_shape[0];
const index_t in_channels = filter_shape[1];
const int stride_h = stride_hw[0];
const int stride_w = stride_hw[1];
const int dilation_h = dilation_hw[0];
const int dilation_w = dilation_hw[1];
if (m + 3 < out_channels) {
float *out_ptr0_base =
output + b * out_batch_size + m * out_image_size;
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
float *out_ptr1_base = out_ptr0_base + out_image_size;
float *out_ptr2_base = out_ptr1_base + out_image_size;
float *out_ptr3_base = out_ptr2_base + out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 =
filter + m * in_channels * filter_size + c * filter_size;
const float *filter_ptr1 =
filter + (m + 1) * in_channels * filter_size + c * filter_size;
const float *filter_ptr2 =
filter + (m + 2) * in_channels * filter_size + c * filter_size;
const float *filter_ptr3 =
filter + (m + 3) * in_channels * filter_size + c * filter_size;
const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size;
const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size;
const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -144,8 +141,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
vo3[ow] = out_ptr3_base[out_offset + ow];
}
// calc by row
for (index_t kh = 0; kh < filter_height; ++kh) {
for (index_t kw = 0; kw < filter_width; ++kw) {
for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
// outch 0
vo0[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr0[kw];
......@@ -185,10 +182,10 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
} // kw
in_offset += dilation_h * in_width;
filter_ptr0 += filter_width;
filter_ptr1 += filter_width;
filter_ptr2 += filter_width;
filter_ptr3 += filter_width;
filter_ptr0 += filter_shape[3];
filter_ptr1 += filter_shape[3];
filter_ptr2 += filter_shape[3];
filter_ptr3 += filter_shape[3];
} // kh
for (index_t ow = 0; ow < 4; ++ow) {
......@@ -230,8 +227,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
}
// calc by row
for (index_t kh = 0; kh < filter_height; ++kh) {
for (index_t kw = 0; kw < filter_width; ++kw) {
for (index_t kh = 0; kh < filter_shape[2]; ++kh) {
for (index_t kw = 0; kw < filter_shape[3]; ++kw) {
// outch 0
vo0[0] += in_ptr_base[in_offset
+ kw * dilation_w] * filter_ptr0[kw];
......@@ -244,7 +241,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
} // kw
in_offset += dilation_h * in_width;
filter_ptr0 += filter_width;
filter_ptr0 += filter_shape[3];
} // kh
for (index_t ow = 0; ow < 4; ++ow) {
......@@ -325,6 +322,8 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
index_t dilation_h = dilations_[0];
index_t dilation_w = dilations_[1];
const index_t filter_hw[2] = {filter_h, filter_w};
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
index_t padded_input_height = input_height + paddings[0];
......@@ -478,6 +477,10 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT);
Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT);
Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT);
const index_t extra_input_shape[4] =
{batch, input_channels, extra_input_height, extra_input_width};
const index_t extra_output_shape[4] =
{batch, channels, extra_output_height, extra_output_width};
// decide which convolution function to call
if (use_winograd) {
......@@ -512,6 +515,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
float *transformed_input_data = transformed_input.mutable_data<float>();
float *transformed_output_data = transformed_output.mutable_data<float>();
conv_func = [=](const float *pad_input, float *pad_output) {
WinoGradConv3x3s1(pad_input,
transformed_filter_ptr,
......@@ -529,26 +533,16 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK3x3S1(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
extra_input_shape,
extra_output_shape,
pad_output);
};
} else if (use_neon_3x3_s2) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK3x3S2(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
extra_input_shape,
extra_output_shape,
pad_output);
};
} else if (use_neon_1x1_s1) {
......@@ -566,71 +560,43 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK5x5S1(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
extra_input_shape,
extra_output_shape,
pad_output);
};
} else if (use_neon_7x7_s1) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK7x7S1(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
extra_input_shape,
extra_output_shape,
pad_output);
};
} else if (use_neon_7x7_s2) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK7x7S2(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
extra_input_shape,
extra_output_shape,
pad_output);
};
} else if (use_neon_7x7_s3) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK7x7S3(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
extra_input_shape,
extra_output_shape,
pad_output);
};
} else {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dGeneral(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
extra_input_shape,
extra_output_shape,
filter_shape.data(),
strides_,
dilations_,
pad_output);
};
}
......
......@@ -41,48 +41,40 @@ template<typename T>
void Deconv2dNCHW(const T *input,
const T *filter,
const T *bias,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const index_t filter_height,
const index_t filter_width,
const index_t stride_h,
const index_t stride_w,
const int padding_top,
const int padding_left,
const index_t *in_shape,
const index_t *out_shape,
const index_t *kernel_hw,
const int *strides,
const int *padding,
float *output) {
#pragma omp parallel for collapse(4)
for (index_t b = 0; b < batch; ++b) {
for (index_t oc = 0; oc < out_channels; ++oc) {
for (index_t oh = 0; oh < out_height; ++oh) {
for (index_t ow = 0; ow < out_width; ++ow) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < out_shape[1]; ++oc) {
for (index_t oh = 0; oh < out_shape[2]; ++oh) {
for (index_t ow = 0; ow < out_shape[3]; ++ow) {
index_t filter_start_y, filter_start_x;
index_t start_x = std::max<int>(0, ow + stride_w -1 - padding_left);
index_t start_y = std::max<int>(0, oh + stride_h -1 - padding_top);
start_x /= stride_w;
start_y /= stride_h;
filter_start_x = padding_left + stride_w * start_x - ow;
filter_start_y = padding_top + stride_h * start_y - oh;
filter_start_x = filter_width - 1 - filter_start_x;
filter_start_y = filter_height - 1 - filter_start_y;
index_t start_x = std::max<int>(0, ow + strides[1] -1 - padding[1]);
index_t start_y = std::max<int>(0, oh + strides[0] -1 - padding[0]);
start_x /= strides[1];
start_y /= strides[0];
filter_start_x = padding[1] + strides[1] * start_x - ow;
filter_start_y = padding[0] + strides[0] * start_y - oh;
filter_start_x = kernel_hw[1] - 1 - filter_start_x;
filter_start_y = kernel_hw[0] - 1 - filter_start_y;
T out_value = 0;
index_t out_pos =
((b * out_channels + oc) * out_height + oh) * out_width + ow;
for (index_t ic = 0; ic < in_channels; ++ic) {
((b * out_shape[1] + oc) * out_shape[2] + oh) * out_shape[3] + ow;
for (index_t ic = 0; ic < in_shape[1]; ++ic) {
for (index_t f_y = filter_start_y, ih = start_y;
f_y >= 0 && ih < in_height; f_y -= stride_h, ++ih) {
f_y >= 0 && ih < in_shape[2]; f_y -= strides[0], ++ih) {
for (index_t f_x = filter_start_x, iw = start_x;
f_x >= 0 && iw < in_width; f_x -= stride_w, ++iw) {
f_x >= 0 && iw < in_shape[3]; f_x -= strides[1], ++iw) {
index_t weight_pos =
((oc * in_channels + ic) * filter_height + f_y)
* filter_width + f_x;
((oc * in_shape[1] + ic) * kernel_hw[0] + f_y)
* kernel_hw[1] + f_x;
index_t in_pos =
((b * in_channels + ic) * in_height + ih)
* in_width + iw;
((b * in_shape[1] + ic) * in_shape[2] + ih)
* in_shape[3] + iw;
out_value += input[in_pos] * filter[weight_pos];
}
}
......@@ -269,26 +261,17 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
paddings_.data(), true);
output->Resize(output_shape_);
}
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t height = output->dim(2);
index_t width = output->dim(3);
index_t input_batch = input->dim(0);
index_t input_channels = input->dim(1);
index_t input_height = input->dim(2);
index_t input_width = input->dim(3);
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
index_t stride_h = strides_[0];
index_t stride_w = strides_[1];
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
const index_t *in_shape = input->shape().data();
const index_t *out_shape = output->shape().data();
const index_t kernel_hw[2] = {kernel_h, kernel_w};
MACE_CHECK(filter->dim(0) == out_shape[1], filter->dim(0), " != ",
output_shape[1]);
MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ",
in_shape[1]);
MACE_CHECK(in_shape[0] == out_shape[0], "Input/Output batch size mismatch");
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard bias_mapper(bias);
......@@ -297,17 +280,23 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
auto filter_data = filter->data<T>();
auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
auto output_data = output->mutable_data<T>();
int padding_top = (paddings_[0] + 1) >> 1;
int padding_left = (paddings_[1] + 1) >> 1;
deconv::Deconv2dNCHW(input_data, filter_data, bias_data,
batch, input_height, input_width, input_channels,
height, width, channels,
kernel_h, kernel_w,
stride_h, stride_w, padding_top, padding_left,
int padding[2];
padding[0] = (paddings_[0] + 1) >> 1;
padding[1] = (paddings_[1] + 1) >> 1;
deconv::Deconv2dNCHW(input_data,
filter_data,
bias_data,
in_shape,
out_shape,
kernel_hw,
strides_,
padding,
output_data);
DoActivation(output_data, output_data, output->size(), activation_,
DoActivation(output_data,
output_data,
output->size(),
activation_,
relux_max_limit_);
}
};
......
......@@ -34,10 +34,10 @@ struct DepthToSpaceOpFunctor {
: block_size_(block_size), d2s_(d2s) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
MACE_UNUSED(future);
const int batch_size = input->dim(0);
const int input_depth = input->dim(1);
const int input_height = input->dim(2);
const int input_width = input->dim(3);
const index_t batch_size = input->dim(0);
const index_t input_depth = input->dim(1);
const index_t input_height = input->dim(2);
const index_t input_width = input->dim(3);
index_t output_depth, output_width, output_height;
......@@ -62,11 +62,11 @@ struct DepthToSpaceOpFunctor {
if (d2s_) {
#pragma omp parallel for
for (int b = 0; b < batch_size; ++b) {
for (int d = 0; d < output_depth; ++d) {
for (int h = 0; h < output_height; ++h) {
const int in_h = h / block_size_;
const int offset_h = (h % block_size_);
for (index_t b = 0; b < batch_size; ++b) {
for (index_t d = 0; d < output_depth; ++d) {
for (index_t h = 0; h < output_height; ++h) {
const index_t in_h = h / block_size_;
const index_t offset_h = (h % block_size_);
for (int w = 0; w < output_width; ++w) {
const index_t in_w = w / block_size_;
const index_t offset_w = w % block_size_;
......@@ -86,18 +86,18 @@ struct DepthToSpaceOpFunctor {
}
} else {
#pragma omp parallel for
for (int b = 0; b < batch_size; ++b) {
for (int d = 0; d < input_depth; ++d) {
for (int h = 0; h < input_height; ++h) {
const int out_h = h / block_size_;
const int offset_h = (h % block_size_);
for (int w = 0; w < input_width; ++w) {
const int out_w = w / block_size_;
const int offset_w = (w % block_size_);
const int offset_d =
for (index_t b = 0; b < batch_size; ++b) {
for (index_t d = 0; d < input_depth; ++d) {
for (index_t h = 0; h < input_height; ++h) {
const index_t out_h = h / block_size_;
const index_t offset_h = (h % block_size_);
for (index_t w = 0; w < input_width; ++w) {
const index_t out_w = w / block_size_;
const index_t offset_w = (w % block_size_);
const index_t offset_d =
(offset_h * block_size_ + offset_w) * input_depth;
const int out_d = d + offset_d;
const index_t out_d = d + offset_d;
const index_t o_index =
((b * output_depth + out_d) * output_height + out_h)
* output_width + out_w;
......
......@@ -78,28 +78,27 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
void DepthwiseConv2dGeneral(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int pad_top,
const int pad_left,
const index_t *in_shape,
const index_t *out_shape,
const index_t *filter_shape,
const int *stride_hw,
const int *dilation_hw,
const int *pad_hw,
float *output) {
const index_t multiplier = out_channels / in_channels;
const index_t multiplier = filter_shape[0] / filter_shape[1];
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
for (index_t b = 0; b < in_shape[0]; ++b) {
for (index_t m = 0; m < filter_shape[0]; ++m) {
for (index_t h = 0; h < out_shape[2]; ++h) {
for (index_t w = 0; w < out_shape[3]; ++w) {
const index_t out_channels = filter_shape[0];
const index_t in_channels = filter_shape[1];
const index_t filter_height = filter_shape[2];
const index_t filter_width = filter_shape[3];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
index_t out_offset =
((b * out_channels + m) * out_height + h) * out_width + w;
index_t c = m / multiplier;
......@@ -107,8 +106,8 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
float sum = 0;
for (index_t kh = 0; kh < filter_height; ++kh) {
for (index_t kw = 0; kw < filter_width; ++kw) {
index_t ih = h * stride_h + kh * dilation_h - pad_top;
index_t iw = w * stride_w + kw * dilation_w - pad_left;
index_t ih = h * stride_hw[0] + kh * dilation_hw[0] - pad_hw[0];
index_t iw = w * stride_hw[1] + kw * dilation_hw[1] - pad_hw[1];
if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
index_t in_offset =
((b * in_channels + c) * in_height + ih) * in_width + iw;
......@@ -214,20 +213,18 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
auto output_data = output->mutable_data<float>();
const int pad_hw[2] = {pad_top, pad_left};
const index_t input_shape[4] =
{batch, input_channels, input_height, input_width};
if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1
&& dilation_h == 1 && dilation_w == 1) {
conv_func = [=](const float *input, float *output) {
DepthwiseConv2dNeonK3x3S1(input,
filter_data,
batch,
input_height,
input_width,
input_channels,
height,
width,
channels,
pad_top,
pad_left,
input_shape,
output_shape.data(),
pad_hw,
valid_h_start,
valid_h_stop,
valid_w_start,
......@@ -239,15 +236,9 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
conv_func = [=](const float *input, float *output) {
DepthwiseConv2dNeonK3x3S2(input,
filter_data,
batch,
input_height,
input_width,
input_channels,
height,
width,
channels,
pad_top,
pad_left,
input_shape,
output_shape.data(),
pad_hw,
valid_h_start,
valid_h_stop,
valid_w_start,
......@@ -258,21 +249,12 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
conv_func = [=](const float *input, float *output) {
DepthwiseConv2dGeneral(input,
filter_data,
batch,
input_height,
input_width,
input_channels,
height,
width,
channels,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_top,
pad_left,
input_shape,
output_shape.data(),
filter_shape.data(),
strides_,
dilations_,
pad_hw,
output);
};
}
......
......@@ -37,6 +37,10 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase {
const BufferType type,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(input);
MACE_UNUSED(type);
MACE_UNUSED(output);
MACE_UNUSED(future);
MACE_NOT_IMPLEMENTED;
}
};
......
......@@ -90,7 +90,8 @@ void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
} else {
std::vector<uint32_t> roundup_gws(lws.size());
for (size_t i = 0; i < lws.size(); ++i) {
roundup_gws[i] = RoundUp(gws[i], lws[i]);
if (lws[i] != 0)
roundup_gws[i] = RoundUp(gws[i], lws[i]);
}
error = runtime->command_queue().enqueueNDRangeKernel(
......
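The bias_add change above only calls RoundUp when the local work size is non-zero. MACE's actual RoundUp implementation is not shown in this diff; assuming it rounds its first argument up to a multiple of the second, the added guard avoids a division by zero, roughly as in this hypothetical sketch:

#include <cstdint>

// Hypothetical shape of the helper (not MACE's actual code):
inline uint32_t RoundUpSketch(uint32_t value, uint32_t base) {
  // Rounding up to a multiple divides by `base`, so base == 0 must be
  // handled by the caller -- hence the added `if (lws[i] != 0)` guard.
  return (value + base - 1) / base * base;
}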
......@@ -75,39 +75,38 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
}
void MaxPooling(const float *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t channels,
const index_t out_height,
const index_t out_width,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int pad_top,
const int pad_left,
const index_t *in_shape,
const index_t *out_shape,
const int *filter_hw,
const int *stride_hw,
const int *dilation_hw,
const int *pad_hw,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = channels * in_image_size;
const index_t out_batch_size = channels * out_image_size;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t c = 0; c < out_shape[1]; ++c) {
const index_t out_base = b * out_batch_size + c * out_image_size;
const index_t in_base = b * in_batch_size + c * in_image_size;
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
const index_t out_offset = out_base + h * out_width + w;
float res = std::numeric_limits<float>::lowest();
for (int fh = 0; fh < filter_height; ++fh) {
for (int fw = 0; fw < filter_width; ++fw) {
int inh = h * stride_h + dilation_h * fh - pad_top;
int inw = w * stride_w + dilation_w * fw - pad_left;
for (int fh = 0; fh < filter_hw[0]; ++fh) {
for (int fw = 0; fw < filter_hw[1]; ++fw) {
index_t inh =
h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0];
index_t inw =
w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1];
if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
index_t input_offset = in_base + inh * in_width + inw;
res = std::max(res, input[input_offset]);
......@@ -122,40 +121,38 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
}
void AvgPooling(const float *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t channels,
const index_t out_height,
const index_t out_width,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int pad_top,
const int pad_left,
const index_t *in_shape,
const index_t *out_shape,
const int *filter_hw,
const int *stride_hw,
const int *dilation_hw,
const int *pad_hw,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = channels * in_image_size;
const index_t out_batch_size = channels * out_image_size;
const index_t in_image_size = in_shape[2] * in_shape[3];
const index_t out_image_size = out_shape[2] * out_shape[3];
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t c = 0; c < out_shape[1]; ++c) {
const index_t out_base = b * out_batch_size + c * out_image_size;
const index_t in_base = b * in_batch_size + c * in_image_size;
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
const index_t out_offset = out_base + h * out_width + w;
float res = 0;
int block_size = 0;
for (int fh = 0; fh < filter_height; ++fh) {
for (int fw = 0; fw < filter_width; ++fw) {
int inh = h * stride_h + dilation_h * fh - pad_top;
int inw = w * stride_w + dilation_w * fw - pad_left;
for (int fh = 0; fh < filter_hw[0]; ++fh) {
for (int fw = 0; fw < filter_hw[1]; ++fw) {
index_t inh =
h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0];
index_t inw =
w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1];
if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
index_t input_offset = in_base + inh * in_width + inw;
res += input[input_offset];
......@@ -200,59 +197,25 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>();
const index_t *input_shape = input_tensor->shape().data();
index_t batch = output_shape[0];
index_t channels = output_shape[1];
index_t height = output_shape[2];
index_t width = output_shape[3];
index_t input_height = input_shape[2];
index_t input_width = input_shape[3];
int filter_h = kernels_[0];
int filter_w = kernels_[1];
int stride_h = strides_[0];
int stride_w = strides_[1];
int dilation_h = dilations_[0];
int dilation_w = dilations_[1];
int pad_top = paddings[0] / 2;
int pad_left = paddings[1] / 2;
int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2};
if (pooling_type_ == PoolingType::MAX) {
MaxPooling(input,
batch,
input_height,
input_width,
channels,
height,
width,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_top,
pad_left,
input_shape,
output_shape.data(),
kernels_,
strides_,
dilations_,
pad_hw,
output);
} else if (pooling_type_ == PoolingType::AVG) {
AvgPooling(input,
batch,
input_height,
input_width,
channels,
height,
width,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_top,
pad_left,
input_shape,
output_shape.data(),
kernels_,
strides_,
dilations_,
pad_hw,
output);
} else {
MACE_NOT_IMPLEMENTED;
......
......@@ -111,6 +111,7 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
MACE_UNUSED(input);
MACE_UNUSED(bias);
MACE_UNUSED(output);
MACE_UNUSED(future);
MACE_NOT_IMPLEMENTED;
}
};
......
......@@ -38,7 +38,7 @@ class ActivationOp : public Operator<D, T> {
const Tensor *input_tensor = this->Input(0);
const Tensor *alpha_tensor =
this->InputSize() >= 2 ? this->Input(1) : nullptr;
Tensor *output_tensor = this->outputs_[0];
Tensor *output_tensor = this->Output(0);
output_tensor->ResizeLike(input_tensor);
functor_(input_tensor, alpha_tensor, output_tensor, future);
......
......@@ -618,6 +618,8 @@ struct Expector<EXP_TYPE, RES_TYPE, false> {
static void Near(const Tensor &x, const Tensor &y,
const double rel_err,
const double abs_err) {
MACE_UNUSED(rel_err);
MACE_UNUSED(abs_err);
Equal(x, y);
}
};
......
......@@ -15,6 +15,7 @@
#include "mace/utils/logging.h"
#include <stdlib.h>
#include <string.h>
#if defined(ANDROID) || defined(__ANDROID__)
#include <android/log.h>
#include <iostream>
......