未验证 提交 fdeac73a 编写于 作者: Y Yosshi999 提交者: GitHub

Merge pull request #18983 from Yosshi999:bitexact-gaussian-16U-faster

Support SIMD for larger symmetric bit-exact 16U Gaussian blur

* support SIMD for bit-exact 16U symmetric gaussian blur

* use tighter SIMD registers
上级 d6a7f5e1
......@@ -1197,6 +1197,78 @@ void hlineSmoothONa_yzy_a<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, co
}
}
}
// Horizontal smoothing pass, specialized for the bit-exact 16U Gaussian blur
// with a symmetric odd-length kernel ("yzy": m[j] == m[n-1-j]).  Convolves a
// row of `len` pixels (`cn` interleaved channels) with the n-tap kernel `m`,
// writing fixed-point partial sums to `dst` for the later vertical pass.
//   src        - input row, uint16 samples with cn channels interleaved
//   m          - n fixed-point coefficients, symmetric around m[n/2]
//   dst        - output row of ufixedpoint32 values (one per input value)
//   borderType - policy for taps that fall outside [0, len)
template <>
void hlineSmoothONa_yzy_a<uint16_t, ufixedpoint32>(const uint16_t* src, int cn, const ufixedpoint32* m, int n, ufixedpoint32* dst, int len, int borderType)
{
int pre_shift = n / 2;          // number of taps left of the center tap
int post_shift = n - pre_shift; // center tap plus taps to the right
int i = 0;
// --- 1) Left border: pixels whose kernel footprint extends left of src[0].
for (; i < min(pre_shift, len); i++, dst += cn) // Points that fall left from border
{
// Seed the accumulator with the tap that aligns with column 0.
for (int k = 0; k < cn; k++)
dst[k] = m[pre_shift - i] * src[k];
if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped
for (int j = i - pre_shift, mid = 0; j < 0; j++, mid++)
{
int src_idx = borderInterpolate(j, len, borderType);
for (int k = 0; k < cn; k++)
dst[k] = dst[k] + m[mid] * src[src_idx*cn + k];
}
int j, mid;
// In-bounds taps to the right of column 0.
for (j = 1, mid = pre_shift - i + 1; j < min(i + post_shift, len); j++, mid++)
for (int k = 0; k < cn; k++)
dst[k] = dst[k] + m[mid] * src[j*cn + k];
// Taps that also run off the right edge (only happens for very small len).
if (borderType != BORDER_CONSTANT)
for (; j < i + post_shift; j++, mid++)
{
int src_idx = borderInterpolate(j, len, borderType);
for (int k = 0; k < cn; k++)
dst[k] = dst[k] + m[mid] * src[src_idx*cn + k];
}
}
i *= cn;                               // switch from per-pixel to flat (per-value) indexing
int lencn = (len - post_shift + 1)*cn; // first flat index whose footprint is NOT fully in bounds
#if CV_SIMD
// --- 2) Interior, vectorized.  Kernel symmetry is exploited by summing each
// mirrored sample pair first, so only one widening multiply per coefficient
// is needed; 16x16 -> 32-bit v_mul_expand keeps the accumulation exact.
const int VECSZ = v_uint32::nlanes;
for (; i <= lencn - VECSZ * 2; i += VECSZ * 2, src += VECSZ * 2, dst += VECSZ * 2)
{
v_uint32 v_res0, v_res1;
// Center tap.  NOTE(review): the ufixedpoint32 coefficient is reinterpreted
// as a raw uint32 and truncated to its low 16 bits — this assumes Gaussian
// coefficients (< 1.0) occupy only 16 bits of the fixed-point layout; confirm
// against the ufixedpoint32 representation.
v_mul_expand(vx_load(src + pre_shift * cn), vx_setall_u16((uint16_t) *((uint32_t*)(m + pre_shift))), v_res0, v_res1);
for (int j = 0; j < pre_shift; j ++)
{
v_uint32 v_add0, v_add1;
// Mirrored pair m[j] == m[n-1-j]: add the two 16-bit samples, then widen-
// multiply once by the shared coefficient.  NOTE(review): this is a 16-bit
// lane addition — verify the pair sum cannot exceed 0xFFFF (or that the
// intrinsic's add semantics are accounted for) to preserve bit-exactness.
v_mul_expand(vx_load(src + j * cn) + vx_load(src + (n - 1 - j)*cn), vx_setall_u16((uint16_t) *((uint32_t*)(m + j))), v_add0, v_add1);
v_res0 += v_add0;
v_res1 += v_add1;
}
// Store the two widened halves as raw uint32 fixed-point values.
v_store((uint32_t*)dst, v_res0);
v_store((uint32_t*)dst + VECSZ, v_res1);
}
#endif
// --- 3) Interior, scalar tail (and the whole interior when CV_SIMD is off).
for (; i < lencn; i++, src++, dst++)
{
*dst = m[pre_shift] * src[pre_shift*cn];
for (int j = 0; j < pre_shift; j++)
*dst = *dst + m[j] * src[j*cn] + m[j] * src[(n - 1 - j)*cn];
}
i /= cn;                               // back to per-pixel indexing
// --- 4) Right border: pixels whose footprint extends past src[len-1].
for (i -= pre_shift; i < len - pre_shift; i++, src += cn, dst += cn) // Points that fall right from border
{
for (int k = 0; k < cn; k++)
dst[k] = m[0] * src[k];
int j = 1;
for (; j < len - i; j++)
for (int k = 0; k < cn; k++)
dst[k] = dst[k] + m[j] * src[j*cn + k];
if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped
for (; j < n; j++)
{
int src_idx = borderInterpolate(i + j, len, borderType) - i;
for (int k = 0; k < cn; k++)
dst[k] = dst[k] + m[j] * src[src_idx*cn + k];
}
}
}
template <typename ET, typename FT>
void vlineSmooth1N(const FT* const * src, const FT* m, int, ET* dst, int len)
{
......@@ -1788,6 +1860,62 @@ void vlineSmoothONa_yzy_a<uint8_t, ufixedpoint16>(const ufixedpoint16* const * s
dst[i] = val;
}
}
// Vertical smoothing pass, specialized for the bit-exact 16U Gaussian blur
// with a symmetric odd-length kernel (m[j] == m[n-1-j]).  Each src[j] points
// to one row of ufixedpoint32 intermediates produced by the horizontal pass;
// the weighted column sum is accumulated in 64 bits and narrowed to uint16.
//   src - n row pointers (fixed-point intermediate rows)
//   m   - n fixed-point coefficients, symmetric around m[n/2]
//   dst - output row of uint16 pixels
//   len - number of values per row
template <>
void vlineSmoothONa_yzy_a<uint16_t, ufixedpoint32>(const ufixedpoint32* const * src, const ufixedpoint32* m, int n, uint16_t* dst, int len)
{
int i = 0;
#if CV_SIMD
int pre_shift = n / 2; // index of the (unique) center tap
const int VECSZ = v_uint32::nlanes;
// Vectorized path: 2*VECSZ outputs per iteration; 32x32 -> 64-bit widening
// multiplies (v_mul_expand) keep the fixed-point accumulation exact.
for (; i <= len - 2*VECSZ; i += 2*VECSZ)
{
v_uint32 v_src00, v_src10, v_src01, v_src11;
v_uint64 v_res0, v_res1, v_res2, v_res3;
v_uint64 v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4, v_tmp5, v_tmp6, v_tmp7;
// NOTE(review): ufixedpoint32 storage is reinterpreted as raw uint32 here —
// assumes the wrapper holds exactly one uint32 and nothing else; confirm
// against the ufixedpoint32 definition.
v_uint32 v_mul = vx_setall_u32(*((uint32_t*)(m + pre_shift)));
const uint32_t* srcp = (const uint32_t*)src[pre_shift] + i;
// Seed the four 64-bit accumulators with the center-row contribution.
v_src00 = vx_load(srcp);
v_src10 = vx_load(srcp + VECSZ);
v_mul_expand(v_src00, v_mul, v_res0, v_res1);
v_mul_expand(v_src10, v_mul, v_res2, v_res3);
int j = 0;
for (; j < pre_shift; j++)
{
// Symmetric pair: rows j and n-1-j share coefficient m[j], so one broadcast
// coefficient serves both rows' widening multiplies.
v_mul = vx_setall_u32(*((uint32_t*)(m + j)));
const uint32_t* srcj0 = (const uint32_t*)src[j] + i;
const uint32_t* srcj1 = (const uint32_t*)src[n - 1 - j] + i;
v_src00 = vx_load(srcj0);
v_src01 = vx_load(srcj1);
v_mul_expand(v_src00, v_mul, v_tmp0, v_tmp1);
v_mul_expand(v_src01, v_mul, v_tmp2, v_tmp3);
v_res0 += v_tmp0 + v_tmp2;
v_res1 += v_tmp1 + v_tmp3;
// Second vector of VECSZ lanes for the same row pair.
v_src10 = vx_load(srcj0 + VECSZ);
v_src11 = vx_load(srcj1 + VECSZ);
v_mul_expand(v_src10, v_mul, v_tmp4, v_tmp5);
v_mul_expand(v_src11, v_mul, v_tmp6, v_tmp7);
v_res2 += v_tmp4 + v_tmp6;
v_res3 += v_tmp5 + v_tmp7;
}
// Rounding right-shift by 32 drops the fractional bits, then narrow
// 64 -> 32 -> 16 and store 2*VECSZ uint16 results.
v_store(dst + i, v_pack(v_rshr_pack<32>(v_res0, v_res1),
v_rshr_pack<32>(v_res2, v_res3)));
}
#endif
// Scalar tail (and the whole row when CV_SIMD is off): plain fixed-point
// dot product per output value, accumulated in ufixedpoint64.
for (; i < len; i++)
{
ufixedpoint64 val = m[0] * src[0][i];
for (int j = 1; j < n; j++)
{
val = val + m[j] * src[j][i];
}
// Conversion back to uint16 — presumably rounds consistently with the
// SIMD v_rshr_pack path; confirm against ufixedpoint64's cast operator.
dst[i] = (uint16_t)val;
}
}
template <typename ET, typename FT>
class fixedSmoothInvoker : public ParallelLoopBody
{
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册