...
 
Commits (5)
    https://gitcode.net/wjd2002/ncnn/-/commit/a9a7be0e0a1938d6fca8c51c410f5867fdefb297 c_api: expose Mat border processing api (#4855) 2023-07-15T19:13:18+08:00 Mek101 mek101-dev.inv@slmail.me https://gitcode.net/wjd2002/ncnn/-/commit/411a098d5e009a1ef5ab2791b63b6f92ddd9a05c Expose layer_to_index in c-api (#4860) 2023-07-16T22:08:01+08:00 Mek101 mek101-dev.inv@slmail.me https://gitcode.net/wjd2002/ncnn/-/commit/9f29a1737c07ad797c9ecf62140bdc966cd18a5f c_api return null on null layer (#4865) 2023-07-18T13:03:29+08:00 Mek101 mek101-dev.inv@slmail.me https://gitcode.net/wjd2002/ncnn/-/commit/2303b77ac17ac880860252152f26f8d058abc1ee Update how-to-build.md (#4872) 2023-07-21T19:35:36+08:00 ฅ'ω'ฅ 1152383857@qq.com https://gitcode.net/wjd2002/ncnn/-/commit/55709708e998f21962763a40db1099630ef36458 x86 optimization for convolution int8 packed unified elempack (#4861) 2023-07-22T22:01:37+08:00 nihui nihuini@tencent.com
......@@ -155,7 +155,7 @@ jobs:
uses: actions/cache@v3
with:
path: qemu-install
key: qemu-aarch64-install-20220502-2
key: qemu-aarch64-install-20230717
- name: install-qemu-build-deps
if: steps.cache-qemu.outputs.cache-hit != 'true'
run: |
......@@ -167,7 +167,7 @@ jobs:
with:
repository: qemu/qemu
path: qemu
ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65
ref: ed8ad9728a9c0eec34db9dff61dfa2f1dd625637
- name: qemu
if: steps.cache-qemu.outputs.cache-hit != 'true'
run: |
......
......@@ -147,14 +147,16 @@ Download and Install Visual Studio Community 2017 from https://visualstudio.micr
Start the command prompt: `Start → Programs → Visual Studio 2017 → Visual Studio Tools → x64 Native Tools Command Prompt for VS 2017`
> You can also search `x64 Native Tools Command Prompt for VS 2017` directly.
Download protobuf-3.11.2 from https://github.com/google/protobuf/archive/v3.11.2.zip
Build protobuf library:
```shell
cd <protobuf-root-dir>
mkdir build
cd build
mkdir protobuf_build
cd protobuf_build
cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
cmake --build . --config Release -j 2
cmake --build . --config Release --target install
......
......@@ -1028,8 +1028,14 @@ ncnn_layer_t ncnn_layer_create()
ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex)
{
void* pthis = (void*)(ncnn::create_layer(typeindex));
if (!pthis)
{
return 0;
}
ncnn_layer_t layer = (ncnn_layer_t)malloc(sizeof(__ncnn_layer_t));
layer->pthis = (void*)(ncnn::create_layer(typeindex));
layer->pthis = pthis;
layer->load_param = __ncnn_layer_load_param;
layer->load_model = __ncnn_layer_load_model;
layer->create_pipeline = __ncnn_layer_create_pipeline;
......@@ -1044,8 +1050,14 @@ ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex)
#if NCNN_STRING
ncnn_layer_t ncnn_layer_create_by_type(const char* type)
{
void* pthis = (void*)(ncnn::create_layer(type));
if (!pthis)
{
return 0;
}
ncnn_layer_t layer = (ncnn_layer_t)malloc(sizeof(__ncnn_layer_t));
layer->pthis = (void*)(ncnn::create_layer(type));
layer->pthis = pthis;
layer->load_param = __ncnn_layer_load_param;
layer->load_model = __ncnn_layer_load_model;
layer->create_pipeline = __ncnn_layer_create_pipeline;
......@@ -1056,6 +1068,11 @@ ncnn_layer_t ncnn_layer_create_by_type(const char* type)
layer->forward_inplace_n = __ncnn_layer_forward_inplace_n;
return layer;
}
/* Resolves a layer type name (e.g. "Convolution") to its registered
 * integer type index, forwarding to ncnn::layer_to_index.
 * Only available when the library is built with NCNN_STRING. */
int ncnn_layer_type_to_index(const char* type)
{
    return ncnn::layer_to_index(type);
}
#endif /* NCNN_STRING */
void ncnn_layer_destroy(ncnn_layer_t layer)
......@@ -1417,6 +1434,30 @@ int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat
return ret;
}
void ncnn_copy_make_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int type, float v, const ncnn_option_t opt)
{
const Option _opt = opt ? *((const Option*)opt) : Option();
copy_make_border(*(const Mat*)src, *(Mat*)dst, top, bottom, left, right, type, v, _opt);
}
void ncnn_copy_make_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const ncnn_option_t opt)
{
const Option _opt = opt ? *((const Option*)opt) : Option();
copy_make_border_3d(*(const Mat*)src, *(Mat*)dst, top, bottom, left, right, front, behind, type, v, _opt);
}
void ncnn_copy_cut_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, const ncnn_option_t opt)
{
const Option _opt = opt ? *((const Option*)opt) : Option();
copy_cut_border(*(const Mat*)src, *(Mat*)dst, top, bottom, left, right, _opt);
}
void ncnn_copy_cut_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, const ncnn_option_t opt)
{
const Option _opt = opt ? *((const Option*)opt) : Option();
copy_cut_border_3d(*(const Mat*)src, *(Mat*)dst, top, bottom, left, right, front, behind, _opt);
}
#ifdef __cplusplus
} /* extern "C" */
#endif
......
......@@ -210,6 +210,7 @@ NCNN_EXPORT ncnn_layer_t ncnn_layer_create();
NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex);
#if NCNN_STRING
NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_type(const char* type);
NCNN_EXPORT int ncnn_layer_type_to_index(const char* type);
#endif /* NCNN_STRING */
NCNN_EXPORT void ncnn_layer_destroy(ncnn_layer_t layer);
......@@ -327,6 +328,16 @@ NCNN_EXPORT int ncnn_extractor_extract(ncnn_extractor_t ex, const char* name, nc
NCNN_EXPORT int ncnn_extractor_input_index(ncnn_extractor_t ex, int index, const ncnn_mat_t mat);
NCNN_EXPORT int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat);
/* mat process api */
#define NCNN_BORDER_CONSTANT 0
#define NCNN_BORDER_REPLICATE 1
#define NCNN_BORDER_REFLECT 2
#define NCNN_BORDER_TRANSPARENT -233
NCNN_EXPORT void ncnn_copy_make_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int type, float v, const ncnn_option_t opt);
NCNN_EXPORT void ncnn_copy_make_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const ncnn_option_t opt);
NCNN_EXPORT void ncnn_copy_cut_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, const ncnn_option_t opt);
NCNN_EXPORT void ncnn_copy_cut_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, const ncnn_option_t opt);
#ifdef __cplusplus
} /* extern "C" */
#endif
......
......@@ -6302,9 +6302,9 @@ static inline void conv3x3s1_winograd43_transform_input_tile(const Mat& bottom_b
float32x4x2_t _t01 = vzipq_f32(_t0, _t1);
_r0 = vget_low_f32(_t01.val[0]);
if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
if (tj * 4 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
if (tj * 4 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
if (tj * 4 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
if (tj * 4 + 4 < w)
{
float tmp[2] = {r0[4], r1[4]};
......@@ -8081,9 +8081,9 @@ static inline void conv3x3s1_winograd63_transform_input_tile(const Mat& bottom_b
float32x4x2_t _t01 = vzipq_f32(_t0, _t1);
_r0 = vget_low_f32(_t01.val[0]);
if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
if (tj * 6 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
if (tj * 6 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
if (tj * 6 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
if (tj * 6 + 4 < w)
{
_t0 = vld1q_f32(r0 + 4);
......
......@@ -1540,9 +1540,9 @@ static inline void conv3x3s1_winograd43_transform_input_tile_bf16s(const Mat& bo
float32x4_t _t1_fp32 = bfloat2float(_t01.val[1]);
_r0 = vget_low_f32(_t0_fp32);
if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
if (tj * 4 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
if (tj * 4 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
if (tj * 4 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
if (tj * 4 + 4 < w)
{
float tmp[2] = {bfloat16_to_float32(r0[4]), bfloat16_to_float32(r1[4])};
......@@ -3211,9 +3211,9 @@ static inline void conv3x3s1_winograd63_transform_input_tile_bf16s(const Mat& bo
float32x4_t _t1_fp32 = bfloat2float(_t01.val[1]);
_r0 = vget_low_f32(_t0_fp32);
if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
if (tj * 6 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
if (tj * 6 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
if (tj * 6 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
if (tj * 6 + 4 < w)
{
_t0 = vld1_u16(r0 + 4);
......
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// Reference (naive) int8 convolution: top_blob(int32) += bottom_blob(int8) conv weight(int8).
// Weight memory is read flat as kw-kh-inch-outch; top_blob must already be
// allocated with int32 elements and the expected output size.
static void convolution_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    const int w = bottom_blob.w;
    const int channels = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // Precompute the flat input offset of every kernel tap (dilation-aware),
    // so the innermost loop becomes a simple gather over space_ofs.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int idx = 0;
        int ofs = 0;
        const int gap = w * dilation_h - kernel_w * dilation_w;
        for (int ky = 0; ky < kernel_h; ky++)
        {
            for (int kx = 0; kx < kernel_w; kx++)
            {
                space_ofs[idx++] = ofs;
                ofs += dilation_w;
            }
            ofs += gap; // jump to the start of the next dilated kernel row
        }
    }

    // One output channel per task.
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                int sum = 0;

                // weights of output channel p, maxk values per input channel
                const signed char* kptr = (const signed char*)weight_data_int8 + maxk * channels * p;

                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        // int8 x int8 products accumulate exactly in int32
                        sum += sptr[space_ofs[k]] * kptr[k];
                    }

                    kptr += maxk;
                }

                outptr[j] = sum;
            }

            outptr += outw;
        }
    }
}
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// int8 convolution, input elempack 1 -> output elempack 4 (SSE2).
// For each output pixel, 4 output channels are computed at once in a single
// __m128i of int32 sums. top_blob must be pre-allocated (int32, pack4).
static void convolution_pack1to4_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets: flat input offset of every kernel tap, dilation-aware
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap; // skip to the next dilated kernel row
        }
    }

    // num_output: each p covers one pack4 group of 4 output channels
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                __m128i _sum0 = _mm_setzero_si128(); // 4 x int32 accumulators, one per output channel

                const signed char* kptr = weight_data_int8.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        // broadcast one int8 input value to all 8 int16 lanes
                        __m128i _val = _mm_set1_epi16((short)sptr[space_ofs[k]]);

                        // TODO use _mm_cvtepi8_epi16 on sse4.1
                        // loads 8 weight bytes and sign-extends to int16;
                        // only the low 4 weights are consumed below (kptr advances by 4)
                        __m128i _w = _mm_loadl_epi64((const __m128i*)kptr);
                        _w = _mm_unpacklo_epi8(_w, _mm_cmpgt_epi8(_mm_setzero_si128(), _w));

                        // 16x16->32-bit multiply via lo/hi halves; keep the low 4 int32 products
                        __m128i _sl = _mm_mullo_epi16(_val, _w);
                        __m128i _sh = _mm_mulhi_epi16(_val, _w);
                        __m128i _s0 = _mm_unpacklo_epi16(_sl, _sh);

                        _sum0 = _mm_add_epi32(_sum0, _s0);

                        kptr += 4; // 4 output-channel weights per tap
                    }
                }

                _mm_storeu_si128((__m128i*)(outptr + j * 4), _sum0);
            }

            outptr += outw * 4; // pack4 row stride
        }
    }
}
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// int8 convolution, input elempack 8 -> output elempack 1 (SSE2).
// For each output pixel, 8 packed input channels are multiplied per tap and
// horizontally reduced into a single scalar int32 sum.
static void convolution_pack8to1_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets: flat input offset (in pack8 units) of every kernel tap
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap; // skip to the next dilated kernel row
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                int sum = 0;

                const signed char* kptr = weight_data_int8.channel(p);

                // channels (each holds 8 packed input channels)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w * 8;

                    for (int k = 0; k < maxk; k++)
                    {
                        // TODO use _mm_cvtepi8_epi16 on sse4.1
                        // load 8 int8 inputs for this tap and sign-extend to int16
                        __m128i _val = _mm_loadl_epi64((const __m128i*)(sptr + space_ofs[k] * 8));
                        _val = _mm_unpacklo_epi8(_val, _mm_cmpgt_epi8(_mm_setzero_si128(), _val));

                        // load 8 int8 weights and sign-extend to int16
                        __m128i _w = _mm_loadl_epi64((const __m128i*)kptr);
                        _w = _mm_unpacklo_epi8(_w, _mm_cmpgt_epi8(_mm_setzero_si128(), _w));

                        // 16x16->32-bit multiply via lo/hi halves: 8 int32 products in _s0/_s1
                        __m128i _sl = _mm_mullo_epi16(_val, _w);
                        __m128i _sh = _mm_mulhi_epi16(_val, _w);
                        __m128i _s0 = _mm_unpacklo_epi16(_sl, _sh);
                        __m128i _s1 = _mm_unpackhi_epi16(_sl, _sh);

                        __m128i _s4 = _mm_add_epi32(_s0, _s1);

                        // TODO use _mm_hadd_epi32 on ssse3
                        // horizontal reduction of the 4 remaining int32 lanes to a scalar
                        int s4[4];
                        _mm_storeu_si128((__m128i*)s4, _s4);
                        sum += s4[0] + s4[1] + s4[2] + s4[3]; // dot

                        kptr += 8; // 8 input-channel weights per tap
                    }
                }

                outptr[j] = sum;
            }

            outptr += outw;
        }
    }
}
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// int8 convolution, input elempack 8 -> output elempack 4 (SSE2).
// Per tap: 8 packed input values are multiplied with the weights of 4 output
// channels (32 weight bytes). Four int32 accumulators (one per output channel)
// are reduced at the end via a 4x4 transpose + vertical adds.
static void convolution_pack8to4_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets: flat input offset (in pack8 units) of every kernel tap
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap; // skip to the next dilated kernel row
        }
    }

    // num_output: each p covers one pack4 group of 4 output channels
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // partial sums: _sumN collects products for output channel N
                __m128i _sum0 = _mm_setzero_si128();
                __m128i _sum1 = _mm_setzero_si128();
                __m128i _sum2 = _mm_setzero_si128();
                __m128i _sum3 = _mm_setzero_si128();

                const signed char* kptr = weight_data_int8.channel(p);

                // channels (each holds 8 packed input channels)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w * 8;

                    for (int k = 0; k < maxk; k++)
                    {
                        // TODO use _mm_cvtepi8_epi16 on sse4.1
                        // load 8 int8 inputs for this tap and sign-extend to int16
                        __m128i _val = _mm_loadl_epi64((const __m128i*)(sptr + space_ofs[k] * 8));
                        _val = _mm_unpacklo_epi8(_val, _mm_cmpgt_epi8(_mm_setzero_si128(), _val));

                        // TODO use _mm_cvtepi8_epi16 on sse4.1
                        // 32 weight bytes per tap = 8 inputs x 4 output channels;
                        // sign-extend into four vectors of 8 int16 weights (_w0.._w3, one per output channel)
                        __m128i _w01 = _mm_loadu_si128((const __m128i*)kptr);
                        __m128i _w23 = _mm_loadu_si128((const __m128i*)(kptr + 16));
                        __m128i _extw01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w01);
                        __m128i _extw23 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w23);
                        __m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01);
                        __m128i _w1 = _mm_unpackhi_epi8(_w01, _extw01);
                        __m128i _w2 = _mm_unpacklo_epi8(_w23, _extw23);
                        __m128i _w3 = _mm_unpackhi_epi8(_w23, _extw23);

                        // 16x16->32-bit multiply via lo/hi halves for each output channel
                        __m128i _sl0 = _mm_mullo_epi16(_val, _w0);
                        __m128i _sh0 = _mm_mulhi_epi16(_val, _w0);
                        __m128i _sl1 = _mm_mullo_epi16(_val, _w1);
                        __m128i _sh1 = _mm_mulhi_epi16(_val, _w1);
                        __m128i _sl2 = _mm_mullo_epi16(_val, _w2);
                        __m128i _sh2 = _mm_mulhi_epi16(_val, _w2);
                        __m128i _sl3 = _mm_mullo_epi16(_val, _w3);
                        __m128i _sh3 = _mm_mulhi_epi16(_val, _w3);

                        // fold all 8 int32 products of each channel into its 4-lane accumulator
                        _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0));
                        _sum1 = _mm_add_epi32(_sum1, _mm_unpacklo_epi16(_sl1, _sh1));
                        _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl2, _sh2));
                        _sum3 = _mm_add_epi32(_sum3, _mm_unpacklo_epi16(_sl3, _sh3));
                        _sum0 = _mm_add_epi32(_sum0, _mm_unpackhi_epi16(_sl0, _sh0));
                        _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl1, _sh1));
                        _sum2 = _mm_add_epi32(_sum2, _mm_unpackhi_epi16(_sl2, _sh2));
                        _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl3, _sh3));

                        kptr += 32;
                    }
                }

                // transpose 4x4 so the vertical adds below act as a horizontal
                // reduction of each channel's 4 partial lanes
                {
                    __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = _mm_unpacklo_epi32(_sum0, _sum1);
                    _tmp1 = _mm_unpacklo_epi32(_sum2, _sum3);
                    _tmp2 = _mm_unpackhi_epi32(_sum0, _sum1);
                    _tmp3 = _mm_unpackhi_epi32(_sum2, _sum3);
                    _sum0 = _mm_unpacklo_epi64(_tmp0, _tmp1);
                    _sum1 = _mm_unpackhi_epi64(_tmp0, _tmp1);
                    _sum2 = _mm_unpacklo_epi64(_tmp2, _tmp3);
                    _sum3 = _mm_unpackhi_epi64(_tmp2, _tmp3);
                }

                _sum0 = _mm_add_epi32(_sum0, _sum1);
                _sum2 = _mm_add_epi32(_sum2, _sum3);
                _sum0 = _mm_add_epi32(_sum0, _sum2);

                // _sum0 now holds [sum(ch0), sum(ch1), sum(ch2), sum(ch3)]
                _mm_storeu_si128((__m128i*)(outptr + j * 4), _sum0);
            }

            outptr += outw * 4; // pack4 row stride
        }
    }
}
(This diff has been collapsed.)
......@@ -45,16 +45,14 @@ namespace ncnn {
#include "convolution_sgemm_int8.h"
#include "convolution_1x1_int8.h"
#include "convolution_3x3_int8.h"
#include "convolution_int8.h"
#include "convolution_packed_int8.h"
#endif // NCNN_INT8
#if __SSE2__
#include "convolution_3x3_pack1to4.h"
#if NCNN_INT8
#include "convolution_pack8to4_int8.h"
#include "convolution_pack1to4_int8.h"
#include "convolution_pack8to1_int8.h"
#include "convolution_sgemm_pack8to4_int8.h"
#include "convolution_sgemm_pack1to4_int8.h"
#include "convolution_sgemm_pack8to1_int8.h"
......@@ -1237,42 +1235,6 @@ int Convolution_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
}
#if NCNN_INT8
// Repacks int8 convolution weights for the packed SSE kernels.
// src layout = kw-kh-inch-outch (flat, maxk fastest)
// dst layout = pa-pb-kw-kh-inch/pa-outch/pb, i.e. for every kernel tap the
// elempack x out_elempack weight tile is stored contiguously so the compute
// kernels can read one tap's tile with a single sequential load.
static void convolution_transform_kernel_packed_int8_sse(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
{
    const int maxk = kernel_w * kernel_h;

    // src = kw-kh-inch-outch
    // dst = pa-pb-kw-kh-inch/pa-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack);

        // one dst channel per group of out_elempack output channels
        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            signed char* g00 = weight_data_tm.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    // interleave: out_elempack output channels x elempack input channels per tap
                    for (int i = 0; i < out_elempack; i++)
                    {
                        for (int j = 0; j < elempack; j++)
                        {
                            const signed char* k00 = weight_data_r2.channel(q + i).row<const signed char>(p + j);

                            g00[0] = k00[k];

                            g00++;
                        }
                    }
                }
            }
        }
    }
}
int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
{
const int maxk = kernel_w * kernel_h;
......@@ -1309,7 +1271,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
}
else
{
convolution_transform_kernel_packed_int8_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
}
......@@ -1341,7 +1303,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
}
else
{
convolution_transform_kernel_packed_int8_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
}
......@@ -1365,7 +1327,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
}
else
{
convolution_transform_kernel_packed_int8_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
}
#endif // __SSE2__
......@@ -1391,7 +1353,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
}
else
{
weight_data_tm = weight_data;
convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
}
......@@ -1501,7 +1463,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
}
else
{
convolution_pack8to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
}
......@@ -1533,7 +1495,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
}
else
{
convolution_pack1to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
}
......@@ -1557,7 +1519,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
}
else
{
convolution_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
}
#endif // __SSE2__
......@@ -1583,7 +1545,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
}
else
{
convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
}
......
......@@ -18,6 +18,7 @@
namespace ncnn {
#include "convolution_packed_int8.h"
#include "convolution_sgemm_int8.h"
#include "convolution_sgemm_pack1to4_int8.h"
#include "convolution_sgemm_pack8to1_int8.h"
......@@ -25,6 +26,17 @@ namespace ncnn {
#include "convolution_3x3_pack8to1_int8.h"
#include "convolution_3x3_pack8to4_int8.h"
// packed
// Thin forwarder to the shared convolution_transform_kernel_packed_int8.
// NOTE(review): presumably this translation unit is compiled with AVX2 flags
// so the shared header specializes for AVX2 — confirm against the build setup.
void convolution_transform_kernel_packed_int8_avx2(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
    convolution_transform_kernel_packed_int8(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
}
// Thin forwarder to the shared convolution_packed_int8 implementation.
// NOTE(review): presumably this translation unit is compiled with AVX2 flags
// so the shared header specializes for AVX2 — confirm against the build setup.
void convolution_packed_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
// pack1
void im2col_sgemm_int8_sse_avx2(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
......
......@@ -18,6 +18,7 @@
namespace ncnn {
#include "convolution_packed_int8.h"
#include "convolution_sgemm_int8.h"
#include "convolution_sgemm_pack1to4_int8.h"
#include "convolution_sgemm_pack8to1_int8.h"
......@@ -25,6 +26,12 @@ namespace ncnn {
#include "convolution_3x3_pack8to1_int8.h"
#include "convolution_3x3_pack8to4_int8.h"
// packed
// Thin forwarder to the shared convolution_packed_int8 implementation.
// NOTE(review): presumably this translation unit is compiled with AVX-512 VNNI
// flags so the shared header specializes accordingly — confirm build setup.
void convolution_packed_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
// pack1
void im2col_sgemm_int8_sse_avx512vnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
......
......@@ -18,6 +18,7 @@
namespace ncnn {
#include "convolution_packed_int8.h"
#include "convolution_sgemm_int8.h"
#include "convolution_sgemm_pack1to4_int8.h"
#include "convolution_sgemm_pack8to1_int8.h"
......@@ -25,6 +26,12 @@ namespace ncnn {
#include "convolution_3x3_pack8to1_int8.h"
#include "convolution_3x3_pack8to4_int8.h"
// packed
// Thin forwarder to the shared convolution_packed_int8 implementation.
// NOTE(review): presumably this translation unit is compiled with AVX-VNNI
// flags so the shared header specializes accordingly — confirm build setup.
void convolution_packed_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
// pack1
void im2col_sgemm_int8_sse_avxvnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
......
......@@ -18,6 +18,7 @@
namespace ncnn {
#include "convolution_packed_int8.h"
#include "convolution_sgemm_int8.h"
#include "convolution_sgemm_pack1to4_int8.h"
#include "convolution_sgemm_pack8to1_int8.h"
......@@ -25,6 +26,12 @@ namespace ncnn {
#include "convolution_3x3_pack8to1_int8.h"
#include "convolution_3x3_pack8to4_int8.h"
// packed
// Thin forwarder to the shared convolution_packed_int8 implementation.
// NOTE(review): presumably this translation unit is compiled with XOP flags
// so the shared header specializes accordingly — confirm build setup.
void convolution_packed_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
// pack1
void im2col_sgemm_int8_sse_xop(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
......
......@@ -967,6 +967,60 @@ static NCNN_FORCEINLINE void transpose16x8_epi16(__m256i& _r0, __m256i& _r1, __m
_r7 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1));
}
// In-place transpose of a 16x8 int16 matrix held in sixteen 8-lane registers.
// Input: _r0.._rf are the 16 rows (8 epi16 lanes each). On return, input
// column j (16 values) is stored split across the register pair
// (_r[2j], _r[2j+1]): _r[2j] holds column j of rows 0-7, _r[2j+1] holds
// column j of rows 8-15.
static NCNN_FORCEINLINE void transpose8x16_epi16(__m128i& _r0, __m128i& _r1, __m128i& _r2, __m128i& _r3, __m128i& _r4, __m128i& _r5, __m128i& _r6, __m128i& _r7, __m128i& _r8, __m128i& _r9, __m128i& _ra, __m128i& _rb, __m128i& _rc, __m128i& _rd, __m128i& _re, __m128i& _rf)
{
    // stage 1: interleave 16-bit lanes of adjacent row pairs
    __m128i _tmp0 = _mm_unpacklo_epi16(_r0, _r1);
    __m128i _tmp1 = _mm_unpackhi_epi16(_r0, _r1);
    __m128i _tmp2 = _mm_unpacklo_epi16(_r2, _r3);
    __m128i _tmp3 = _mm_unpackhi_epi16(_r2, _r3);
    __m128i _tmp4 = _mm_unpacklo_epi16(_r4, _r5);
    __m128i _tmp5 = _mm_unpackhi_epi16(_r4, _r5);
    __m128i _tmp6 = _mm_unpacklo_epi16(_r6, _r7);
    __m128i _tmp7 = _mm_unpackhi_epi16(_r6, _r7);
    __m128i _tmp8 = _mm_unpacklo_epi16(_r8, _r9);
    __m128i _tmp9 = _mm_unpackhi_epi16(_r8, _r9);
    __m128i _tmpa = _mm_unpacklo_epi16(_ra, _rb);
    __m128i _tmpb = _mm_unpackhi_epi16(_ra, _rb);
    __m128i _tmpc = _mm_unpacklo_epi16(_rc, _rd);
    __m128i _tmpd = _mm_unpackhi_epi16(_rc, _rd);
    __m128i _tmpe = _mm_unpacklo_epi16(_re, _rf);
    __m128i _tmpf = _mm_unpackhi_epi16(_re, _rf);
    // stage 2: interleave 32-bit pairs, gathering 4-row column fragments
    __m128i _tmpg = _mm_unpacklo_epi32(_tmp0, _tmp2);
    __m128i _tmph = _mm_unpackhi_epi32(_tmp0, _tmp2);
    __m128i _tmpi = _mm_unpacklo_epi32(_tmp1, _tmp3);
    __m128i _tmpj = _mm_unpackhi_epi32(_tmp1, _tmp3);
    __m128i _tmpk = _mm_unpacklo_epi32(_tmp4, _tmp6);
    __m128i _tmpl = _mm_unpackhi_epi32(_tmp4, _tmp6);
    __m128i _tmpm = _mm_unpacklo_epi32(_tmp5, _tmp7);
    __m128i _tmpn = _mm_unpackhi_epi32(_tmp5, _tmp7);
    __m128i _tmpo = _mm_unpacklo_epi32(_tmp8, _tmpa);
    __m128i _tmpp = _mm_unpackhi_epi32(_tmp8, _tmpa);
    __m128i _tmpq = _mm_unpacklo_epi32(_tmp9, _tmpb);
    __m128i _tmpr = _mm_unpackhi_epi32(_tmp9, _tmpb);
    __m128i _tmps = _mm_unpacklo_epi32(_tmpc, _tmpe);
    __m128i _tmpt = _mm_unpackhi_epi32(_tmpc, _tmpe);
    __m128i _tmpu = _mm_unpacklo_epi32(_tmpd, _tmpf);
    __m128i _tmpv = _mm_unpackhi_epi32(_tmpd, _tmpf);
    // stage 3: interleave 64-bit halves, producing full 8-row column halves;
    // even outputs come from rows 0-7, odd outputs from rows 8-15
    _r0 = _mm_unpacklo_epi64(_tmpg, _tmpk);
    _r1 = _mm_unpacklo_epi64(_tmpo, _tmps);
    _r2 = _mm_unpackhi_epi64(_tmpg, _tmpk);
    _r3 = _mm_unpackhi_epi64(_tmpo, _tmps);
    _r4 = _mm_unpacklo_epi64(_tmph, _tmpl);
    _r5 = _mm_unpacklo_epi64(_tmpp, _tmpt);
    _r6 = _mm_unpackhi_epi64(_tmph, _tmpl);
    _r7 = _mm_unpackhi_epi64(_tmpp, _tmpt);
    _r8 = _mm_unpacklo_epi64(_tmpi, _tmpm);
    _r9 = _mm_unpacklo_epi64(_tmpq, _tmpu);
    _ra = _mm_unpackhi_epi64(_tmpi, _tmpm);
    _rb = _mm_unpackhi_epi64(_tmpq, _tmpu);
    _rc = _mm_unpacklo_epi64(_tmpj, _tmpn);
    _rd = _mm_unpacklo_epi64(_tmpr, _tmpv);
    _re = _mm_unpackhi_epi64(_tmpj, _tmpn);
    _rf = _mm_unpackhi_epi64(_tmpr, _tmpv);
}
static NCNN_FORCEINLINE float _mm512_comp_reduce_add_ps(__m512 x)
{
const __m256 x256 = _mm256_add_ps(_mm512_castps512_ps256(x), _mm512_extractf32x8_ps(x, 1));
......
......@@ -46,6 +46,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
if (ret != 0)
{
fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
return ret;
}
{
......@@ -65,6 +66,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
if (ret != 0)
{
fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
return ret;
}
}
......@@ -85,6 +87,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
if (ret != 0)
{
fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
return ret;
}
}
......@@ -98,6 +101,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
if (ret != 0)
{
fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
return ret;
}
}
#endif // __aarch64__
......
......@@ -46,6 +46,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
if (ret != 0)
{
fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
return ret;
}
{
......@@ -65,6 +66,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
if (ret != 0)
{
fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
return ret;
}
}
......@@ -85,6 +87,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
if (ret != 0)
{
fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
return ret;
}
}
......@@ -98,6 +101,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
if (ret != 0)
{
fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
return ret;
}
}
#endif // __aarch64__
......
......@@ -48,6 +48,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
if (ret != 0)
{
fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
return ret;
}
{
......@@ -67,6 +68,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
if (ret != 0)
{
fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
return ret;
}
}
......@@ -87,6 +89,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
if (ret != 0)
{
fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
return ret;
}
}
......@@ -99,6 +102,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
if (ret != 0)
{
fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
return ret;
}
}
......
......@@ -187,6 +187,7 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
if (ret != 0)
{
fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
return ret;
}
{
......@@ -206,6 +207,7 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
if (ret != 0)
{
fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
return ret;
}
}
......@@ -226,6 +228,7 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
if (ret != 0)
{
fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
return ret;
}
}
......@@ -309,6 +312,82 @@ static int test_convolution_1()
|| test_convolution_int8(25, 33, 16, 15, 3, 1, 1, 1, 0)
|| test_convolution_int8(7, 7, 15, 12, 3, 1, 1, 1, 0);
}
static int test_convolution_1_2()
{
    // Exhaustive int8 convolution sweep over input/output channel counts that
    // straddle common SIMD packing boundaries (1, 2, 7, 8, 15, 16, 31, 32).
    // Every case uses a 19x17 input with dilation=2, stride=2, pad=0.
    // outch==1 cases use a 3x3 kernel; all other output widths use 5x5.
    // Bias is enabled exactly when the input channel count is odd, matching
    // the original flat expansion of 64 calls.
    static const int chans[] = {1, 2, 7, 8, 15, 16, 31, 32};
    const int nchans = (int)(sizeof(chans) / sizeof(chans[0]));

    for (int oi = 0; oi < nchans; oi++)
    {
        const int outch = chans[oi];
        const int kernel = outch == 1 ? 3 : 5;

        for (int ci = 0; ci < nchans; ci++)
        {
            const int c = chans[ci];

            // stop at the first failing case, reporting failure as 1
            // (same as the short-circuiting || chain this replaces)
            if (test_convolution_int8(19, 17, c, outch, kernel, 2, 2, 0, c % 2))
                return 1;
        }
    }

    return 0;
}
#endif // NCNN_INT8
int main()
......@@ -318,6 +397,7 @@ int main()
#if NCNN_INT8
return 0
|| test_convolution_1()
|| test_convolution_1_2()
|| test_convolution_2()
|| test_convolution_3();
#else
......