提交 d21a05dc 编写于 作者: T TianXiaogang 提交者: GitHub

fix winograd ReInitWhenNeeded (#2511)


* add winograd c4 implementation (#2494)
test=develop
fix: fix conv_block prepack_input_nxwc4 bug
* fix: optimize sgemm_c4 in armv7
     change the condition for choosing the winograd kernel
* fix: change the conv kernel selection condition
test=develop
上级 51cdec42
......@@ -68,10 +68,6 @@ void ConvCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
VLOG(3) << "invoking dw conv";
} else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal &&
no_dilation) {
int tile_block = 8;
#ifdef __aarch64__
tile_block = 16;
#endif
bool use_winograd =
(threads == 1 && oc >= 4 && ic >= 4 && hout >= 6 && wout >= 6 &&
pads_equal) ||
......
......@@ -49,6 +49,10 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::ReInitWhenNeeded() {
int parallel_threads =
(((ow + 5) / 6) * ((oh + 5) / 6) + tile_block - 1) / tile_block;
if (threads <= 2 && parallel_threads >= threads) {
if (last_kernel_is_c4_ == 1) {
return;
}
last_kernel_is_c4_ = 1;
auto pad = *(param.paddings);
int pad_h = pad[0];
int pad_w = pad[2];
......@@ -68,6 +72,10 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::ReInitWhenNeeded() {
weights_data_, param.filter->data<float>(), ic, oc, trans_tmp_ptr);
free(trans_tmp_ptr);
} else {
if (last_kernel_is_c4_ == 0) {
return;
}
last_kernel_is_c4_ = 0;
int tile_w = (ow + 5) / 6;
int tile_h = (oh + 5) / 6;
......
......@@ -40,6 +40,7 @@ class WinogradConv : public KernelLite<TARGET(kARM), Ptype> {
Tensor weights_;
DDim last_shape_;
int workspace_size_{0};
int last_kernel_is_c4_{-1};
};
} // namespace arm
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册