未验证 提交 698b2bf7 编写于 作者: Y Yosshi999 提交者: GitHub

Merge pull request #18167 from Yosshi999:bit-exact-gaussian

Bit exact gaussian blur for 16bit unsigned int

* bit-exact gaussian kernel for CV_16U

* SIMD optimization

* template GaussianBlurFixedPoint

* remove template specialization

* simd support for h3N121 uint16

* test for u16 gaussian blur

* remove unnecessary comments

* fix return type of raw()

* add typedef of native internal type in fixedpoint

* update return type of raw()
上级 1d425600
......@@ -14,13 +14,14 @@ namespace {
class fixedpoint64
{
private:
static const int fixedShift = 32;
int64_t val;
fixedpoint64(int64_t _val) : val(_val) {}
static CV_ALWAYS_INLINE uint64_t fixedround(const uint64_t& _val) { return (_val + ((1LL << fixedShift) >> 1)); }
public:
static const int fixedShift = 32;
typedef fixedpoint64 WT;
typedef int64_t raw_t;
CV_ALWAYS_INLINE fixedpoint64() { val = 0; }
CV_ALWAYS_INLINE fixedpoint64(const fixedpoint64& v) { val = v.val; }
CV_ALWAYS_INLINE fixedpoint64(const int8_t& _val) { val = ((int64_t)_val) << fixedShift; }
......@@ -97,13 +98,14 @@ public:
class ufixedpoint64
{
private:
static const int fixedShift = 32;
uint64_t val;
ufixedpoint64(uint64_t _val) : val(_val) {}
static CV_ALWAYS_INLINE uint64_t fixedround(const uint64_t& _val) { return (_val + ((1LL << fixedShift) >> 1)); }
public:
static const int fixedShift = 32;
typedef ufixedpoint64 WT;
typedef uint64_t raw_t;
CV_ALWAYS_INLINE ufixedpoint64() { val = 0; }
CV_ALWAYS_INLINE ufixedpoint64(const ufixedpoint64& v) { val = v.val; }
CV_ALWAYS_INLINE ufixedpoint64(const uint8_t& _val) { val = ((uint64_t)_val) << fixedShift; }
......@@ -157,6 +159,9 @@ public:
CV_ALWAYS_INLINE bool isZero() { return val == 0; }
static CV_ALWAYS_INLINE ufixedpoint64 zero() { return ufixedpoint64(); }
static CV_ALWAYS_INLINE ufixedpoint64 one() { return ufixedpoint64((uint64_t)(1ULL << fixedShift)); }
static CV_ALWAYS_INLINE ufixedpoint64 fromRaw(uint64_t v) { return ufixedpoint64(v); }
CV_ALWAYS_INLINE uint64_t raw() { return val; }
CV_ALWAYS_INLINE uint32_t cvFloor() const { return cv::saturate_cast<uint32_t>(val >> fixedShift); }
friend class ufixedpoint32;
};
......@@ -164,13 +169,14 @@ public:
class fixedpoint32
{
private:
static const int fixedShift = 16;
int32_t val;
fixedpoint32(int32_t _val) : val(_val) {}
static CV_ALWAYS_INLINE uint32_t fixedround(const uint32_t& _val) { return (_val + ((1 << fixedShift) >> 1)); }
public:
static const int fixedShift = 16;
typedef fixedpoint64 WT;
typedef int32_t raw_t;
CV_ALWAYS_INLINE fixedpoint32() { val = 0; }
CV_ALWAYS_INLINE fixedpoint32(const fixedpoint32& v) { val = v.val; }
CV_ALWAYS_INLINE fixedpoint32(const int8_t& _val) { val = ((int32_t)_val) << fixedShift; }
......@@ -218,13 +224,14 @@ public:
class ufixedpoint32
{
private:
static const int fixedShift = 16;
uint32_t val;
ufixedpoint32(uint32_t _val) : val(_val) {}
static CV_ALWAYS_INLINE uint32_t fixedround(const uint32_t& _val) { return (_val + ((1 << fixedShift) >> 1)); }
public:
static const int fixedShift = 16;
typedef ufixedpoint64 WT;
typedef uint32_t raw_t;
CV_ALWAYS_INLINE ufixedpoint32() { val = 0; }
CV_ALWAYS_INLINE ufixedpoint32(const ufixedpoint32& v) { val = v.val; }
CV_ALWAYS_INLINE ufixedpoint32(const uint8_t& _val) { val = ((uint32_t)_val) << fixedShift; }
......@@ -262,19 +269,23 @@ public:
CV_ALWAYS_INLINE bool isZero() { return val == 0; }
static CV_ALWAYS_INLINE ufixedpoint32 zero() { return ufixedpoint32(); }
static CV_ALWAYS_INLINE ufixedpoint32 one() { return ufixedpoint32((1U << fixedShift)); }
static CV_ALWAYS_INLINE ufixedpoint32 fromRaw(uint32_t v) { return ufixedpoint32(v); }
CV_ALWAYS_INLINE uint32_t raw() { return val; }
friend class ufixedpoint16;
};
class fixedpoint16
{
private:
static const int fixedShift = 8;
int16_t val;
fixedpoint16(int16_t _val) : val(_val) {}
static CV_ALWAYS_INLINE uint16_t fixedround(const uint16_t& _val) { return (_val + ((1 << fixedShift) >> 1)); }
public:
static const int fixedShift = 8;
typedef fixedpoint32 WT;
typedef int16_t raw_t;
CV_ALWAYS_INLINE fixedpoint16() { val = 0; }
CV_ALWAYS_INLINE fixedpoint16(const fixedpoint16& v) { val = v.val; }
CV_ALWAYS_INLINE fixedpoint16(const int8_t& _val) { val = ((int16_t)_val) << fixedShift; }
......@@ -315,13 +326,14 @@ public:
class ufixedpoint16
{
private:
static const int fixedShift = 8;
uint16_t val;
ufixedpoint16(uint16_t _val) : val(_val) {}
static CV_ALWAYS_INLINE uint16_t fixedround(const uint16_t& _val) { return (_val + ((1 << fixedShift) >> 1)); }
public:
static const int fixedShift = 8;
typedef ufixedpoint32 WT;
typedef uint16_t raw_t;
CV_ALWAYS_INLINE ufixedpoint16() { val = 0; }
CV_ALWAYS_INLINE ufixedpoint16(const ufixedpoint16& v) { val = v.val; }
CV_ALWAYS_INLINE ufixedpoint16(const uint8_t& _val) { val = ((uint16_t)_val) << fixedShift; }
......@@ -358,7 +370,7 @@ public:
static CV_ALWAYS_INLINE ufixedpoint16 one() { return ufixedpoint16((uint16_t)(1 << fixedShift)); }
static CV_ALWAYS_INLINE ufixedpoint16 fromRaw(uint16_t v) { return ufixedpoint16(v); }
CV_ALWAYS_INLINE ufixedpoint16 raw() { return val; }
CV_ALWAYS_INLINE uint16_t raw() { return val; }
};
}
......
......@@ -258,23 +258,20 @@ softdouble getGaussianKernelFixedPoint_ED(CV_OUT std::vector<int64_t>& result, c
}
static void getGaussianKernel(int n, double sigma, int ktype, Mat& res) { res = getGaussianKernel(n, sigma, ktype); }
template <typename T> static void getGaussianKernel(int n, double sigma, int, std::vector<T>& res);
//{ res = getFixedpointGaussianKernel<T>(n, sigma); }
template<> void getGaussianKernel<ufixedpoint16>(int n, double sigma, int, std::vector<ufixedpoint16>& res)
template <typename FT> static void getGaussianKernel(int n, double sigma, int, std::vector<FT>& res)
{
std::vector<softdouble> res_sd;
softdouble s0 = getGaussianKernelBitExact(res_sd, n, sigma);
CV_UNUSED(s0);
std::vector<int64_t> fixed_256;
softdouble approx_err = getGaussianKernelFixedPoint_ED(fixed_256, res_sd, 8);
softdouble approx_err = getGaussianKernelFixedPoint_ED(fixed_256, res_sd, FT::fixedShift);
CV_UNUSED(approx_err);
res.resize(n);
for (int i = 0; i < n; i++)
{
res[i] = ufixedpoint16::fromRaw((uint16_t)fixed_256[i]);
res[i] = FT::fromRaw((typename FT::raw_t)fixed_256[i]);
//printf("%03d: %d\n", i, res[i].raw());
}
}
......@@ -688,6 +685,43 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
return;
}
}
if(sdepth == CV_16U && ((borderType & BORDER_ISOLATED) || !_src.isSubmatrix()))
{
CV_LOG_INFO(NULL, "GaussianBlur: running bit-exact version...");
std::vector<ufixedpoint32> fkx, fky;
createGaussianKernels(fkx, fky, type, ksize, sigma1, sigma2);
static bool param_check_gaussian_blur_bitexact_kernels = utils::getConfigurationParameterBool("OPENCV_GAUSSIANBLUR_CHECK_BITEXACT_KERNELS", false);
if (param_check_gaussian_blur_bitexact_kernels && !validateGaussianBlurKernel(fkx))
{
CV_LOG_INFO(NULL, "GaussianBlur: bit-exact fx kernel can't be applied: ksize=" << ksize << " sigma=" << Size2d(sigma1, sigma2));
}
else if (param_check_gaussian_blur_bitexact_kernels && !validateGaussianBlurKernel(fky))
{
CV_LOG_INFO(NULL, "GaussianBlur: bit-exact fy kernel can't be applied: ksize=" << ksize << " sigma=" << Size2d(sigma1, sigma2));
}
else
{
// TODO: implement ocl_sepFilter2D_BitExact -- how to deal with bdepth?
// CV_OCL_RUN(useOpenCL,
// ocl_sepFilter2D_BitExact(_src, _dst, sdepth,
// ksize,
// (const uint32_t*)&fkx[0], (const uint32_t*)&fky[0],
// Point(-1, -1), 0, borderType,
// 16/*shift_bits*/)
// );
Mat src = _src.getMat();
Mat dst = _dst.getMat();
if (src.data == dst.data)
src = src.clone();
CV_CPU_DISPATCH(GaussianBlurFixedPoint, (src, dst, (const uint32_t*)&fkx[0], (int)fkx.size(), (const uint32_t*)&fky[0], (int)fky.size(), borderType),
CV_CPU_DISPATCH_MODES_ALL);
return;
}
}
#ifdef HAVE_OPENCL
if (useOpenCL)
......
......@@ -54,9 +54,10 @@
namespace cv {
CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
// forward declarations
void GaussianBlurFixedPoint(const Mat& src, /*const*/ Mat& dst,
const uint16_t/*ufixedpoint16*/* fkx, int fkx_size,
const uint16_t/*ufixedpoint16*/* fky, int fky_size,
template <typename RFT>
void GaussianBlurFixedPoint(const Mat& src, Mat& dst,
const RFT* fkx, int fkx_size,
const RFT* fky, int fky_size,
int borderType);
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
......@@ -192,8 +193,9 @@ void hlineSmooth3N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi
}
}
}
template <typename ET, typename FT>
void hlineSmooth3N121(const ET* src, int cn, const FT*, int, FT* dst, int len, int borderType)
template <typename ET, typename FT, typename VFT>
void hlineSmooth3N121Impl(const ET* src, int cn, const FT*, int, FT* dst, int len, int borderType)
{
if (len == 1)
{
......@@ -217,7 +219,13 @@ void hlineSmooth3N121(const ET* src, int cn, const FT*, int, FT* dst, int len, i
}
src += cn; dst += cn;
for (int i = cn; i < (len - 1)*cn; i++, src++, dst++)
int i = cn, lencn = (len - 1)*cn;
#if CV_SIMD
const int VECSZ = VFT::nlanes;
for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
v_store((typename FT::raw_t*)dst, (vx_load_expand(src - cn) + vx_load_expand(src + cn) + (vx_load_expand(src) << 1)) << (FT::fixedShift-2));
#endif
for (; i < lencn; i++, src++, dst++)
*dst = (FT(src[-cn])>>2) + (FT(src[cn])>>2) + (FT(src[0])>>1);
// Point that fall right from border
......@@ -231,51 +239,19 @@ void hlineSmooth3N121(const ET* src, int cn, const FT*, int, FT* dst, int len, i
}
}
}
template <typename ET, typename FT>
void hlineSmooth3N121(const ET* src, int cn, const FT*, int, FT* dst, int len, int borderType);
template <>
void hlineSmooth3N121<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufixedpoint16*, int, ufixedpoint16* dst, int len, int borderType)
void hlineSmooth3N121<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufixedpoint16* _m, int _n, ufixedpoint16* dst, int len, int borderType)
{
if (len == 1)
{
if (borderType != BORDER_CONSTANT)
for (int k = 0; k < cn; k++)
dst[k] = ufixedpoint16(src[k]);
else
for (int k = 0; k < cn; k++)
dst[k] = ufixedpoint16(src[k]) >> 1;
}
else
{
// Point that fall left from border
for (int k = 0; k < cn; k++)
dst[k] = (ufixedpoint16(src[k])>>1) + (ufixedpoint16(src[cn + k])>>2);
if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped
{
int src_idx = borderInterpolate(-1, len, borderType);
for (int k = 0; k < cn; k++)
dst[k] = dst[k] + (ufixedpoint16(src[src_idx*cn + k])>>2);
}
src += cn; dst += cn;
int i = cn, lencn = (len - 1)*cn;
#if CV_SIMD
const int VECSZ = v_uint16::nlanes;
for (; i <= lencn - VECSZ; i += VECSZ, src += VECSZ, dst += VECSZ)
v_store((uint16_t*)dst, (vx_load_expand(src - cn) + vx_load_expand(src + cn) + (vx_load_expand(src) << 1)) << 6);
#endif
for (; i < lencn; i++, src++, dst++)
*((uint16_t*)dst) = (uint16_t(src[-cn]) + uint16_t(src[cn]) + (uint16_t(src[0]) << 1)) << 6;
// Point that fall right from border
for (int k = 0; k < cn; k++)
dst[k] = (ufixedpoint16(src[k - cn])>>2) + (ufixedpoint16(src[k])>>1);
if (borderType != BORDER_CONSTANT)// If BORDER_CONSTANT out of border values are equal to zero and could be skipped
{
int src_idx = (borderInterpolate(len, len, borderType) - (len - 1))*cn;
for (int k = 0; k < cn; k++)
dst[k] = dst[k] + (ufixedpoint16(src[src_idx + k])>>2);
}
}
hlineSmooth3N121Impl<uint8_t, ufixedpoint16, v_uint16>(src, cn, _m, _n, dst, len, borderType);
}
// 16-bit specialization of the horizontal 1-2-1 smoothing pass:
// forwards to the shared implementation, selecting v_uint32 as the SIMD
// vector type matching the ufixedpoint32 (Q16.16) accumulator.
template <>
void hlineSmooth3N121<uint16_t, ufixedpoint32>(const uint16_t* src, int cn, const ufixedpoint32* _m, int _n, ufixedpoint32* dst, int len, int borderType)
{
hlineSmooth3N121Impl<uint16_t, ufixedpoint32, v_uint32>(src, cn, _m, _n, dst, len, borderType);
}
template <typename ET, typename FT>
void hlineSmooth3Naba(const ET* src, int cn, const FT* m, int, FT* dst, int len, int borderType)
{
......@@ -1376,6 +1352,28 @@ void vlineSmooth3N121<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src,
for (; i < len; i++)
dst[i] = (((uint32_t)(((uint16_t*)(src[0]))[i]) + (uint32_t)(((uint16_t*)(src[2]))[i]) + ((uint32_t)(((uint16_t*)(src[1]))[i]) << 1)) + (1 << 9)) >> 10;
}
// Vertical 1-2-1 smoothing pass for CV_16U: combines three rows of Q16.16
// fixed-point intermediates (src[0..2]) into the final uint16 row.
// dst = (r0 + 2*r1 + r2 + round) >> 18, where 18 = 16 fractional bits
// (ufixedpoint32::fixedShift) + 2 bits for the kernel sum (4).
// The second and third parameters (kernel pointer / length) are unused:
// the 1-2-1 weights are hard-coded.
template <>
void vlineSmooth3N121<uint16_t, ufixedpoint32>(const ufixedpoint32* const * src, const ufixedpoint32*, int, uint16_t* dst, int len)
{
int i = 0;
#if CV_SIMD
const int VECSZ = v_uint32::nlanes;
// Process 2*VECSZ pixels per iteration; each 32-bit lane is widened to
// 64 bits before accumulation so the weighted sum cannot overflow prior
// to the rounding shift, then results are narrowed back to uint16.
for (; i <= len - 2*VECSZ; i += 2*VECSZ)
{
v_uint64 v_src00, v_src01, v_src02, v_src03, v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
v_expand(vx_load((uint32_t*)(src[0]) + i), v_src00, v_src01);
v_expand(vx_load((uint32_t*)(src[0]) + i + VECSZ), v_src02, v_src03);
v_expand(vx_load((uint32_t*)(src[1]) + i), v_src10, v_src11);
v_expand(vx_load((uint32_t*)(src[1]) + i + VECSZ), v_src12, v_src13);
v_expand(vx_load((uint32_t*)(src[2]) + i), v_src20, v_src21);
v_expand(vx_load((uint32_t*)(src[2]) + i + VECSZ), v_src22, v_src23);
// v_rshr_pack<18> performs the rounding right-shift and narrowing.
v_store(dst + i, v_pack(v_rshr_pack<18>(v_src00 + v_src20 + (v_src10 + v_src10), v_src01 + v_src21 + (v_src11 + v_src11)),
v_rshr_pack<18>(v_src02 + v_src22 + (v_src12 + v_src12), v_src03 + v_src23 + (v_src13 + v_src13))));
}
#endif
// Scalar tail (and non-SIMD builds): same formula in 64-bit arithmetic.
for (; i < len; i++)
dst[i] = (((uint64_t)((uint32_t*)(src[0]))[i]) + (uint64_t)(((uint32_t*)(src[2]))[i]) + ((uint64_t(((uint32_t*)(src[1]))[i]) << 1)) + (1 << 17)) >> 18;
}
template <typename ET, typename FT>
void vlineSmooth5N(const FT* const * src, const FT* m, int, ET* dst, int len)
{
......@@ -1525,6 +1523,39 @@ void vlineSmooth5N14641<uint8_t, ufixedpoint16>(const ufixedpoint16* const * src
(((uint32_t)(((uint16_t*)(src[1]))[i]) + (uint32_t)(((uint16_t*)(src[3]))[i])) << 2) +
(uint32_t)(((uint16_t*)(src[0]))[i]) + (uint32_t)(((uint16_t*)(src[4]))[i]) + (1 << 11)) >> 12;
}
// Vertical 1-4-6-4-1 (size-5 Gaussian) pass for CV_16U: combines five rows
// of Q16.16 fixed-point intermediates into the final uint16 row.
// dst = (r0 + 4*r1 + 6*r2 + 4*r3 + r4 + round) >> 20, where 20 =
// 16 fractional bits (ufixedpoint32::fixedShift) + 4 bits for the kernel
// sum (16). Kernel pointer/length parameters are unused: weights are
// hard-coded.
template <>
void vlineSmooth5N14641<uint16_t, ufixedpoint32>(const ufixedpoint32* const * src, const ufixedpoint32*, int, uint16_t* dst, int len)
{
int i = 0;
#if CV_SIMD
const int VECSZ = v_uint32::nlanes;
// Widen each 32-bit lane to 64 bits before accumulating so the weighted
// sum cannot overflow; 6*x is formed as (x<<2) + (x<<1).
for (; i <= len - 2*VECSZ; i += 2*VECSZ)
{
v_uint64 v_src00, v_src10, v_src20, v_src30, v_src40;
v_uint64 v_src01, v_src11, v_src21, v_src31, v_src41;
v_uint64 v_src02, v_src12, v_src22, v_src32, v_src42;
v_uint64 v_src03, v_src13, v_src23, v_src33, v_src43;
v_expand(vx_load((uint32_t*)(src[0]) + i), v_src00, v_src01);
v_expand(vx_load((uint32_t*)(src[0]) + i + VECSZ), v_src02, v_src03);
v_expand(vx_load((uint32_t*)(src[1]) + i), v_src10, v_src11);
v_expand(vx_load((uint32_t*)(src[1]) + i + VECSZ), v_src12, v_src13);
v_expand(vx_load((uint32_t*)(src[2]) + i), v_src20, v_src21);
v_expand(vx_load((uint32_t*)(src[2]) + i + VECSZ), v_src22, v_src23);
v_expand(vx_load((uint32_t*)(src[3]) + i), v_src30, v_src31);
v_expand(vx_load((uint32_t*)(src[3]) + i + VECSZ), v_src32, v_src33);
v_expand(vx_load((uint32_t*)(src[4]) + i), v_src40, v_src41);
v_expand(vx_load((uint32_t*)(src[4]) + i + VECSZ), v_src42, v_src43);
// v_rshr_pack<20> performs the rounding right-shift and narrowing.
v_store(dst + i, v_pack(v_rshr_pack<20>((v_src20 << 2) + (v_src20 << 1) + ((v_src10 + v_src30) << 2) + v_src00 + v_src40,
(v_src21 << 2) + (v_src21 << 1) + ((v_src11 + v_src31) << 2) + v_src01 + v_src41),
v_rshr_pack<20>((v_src22 << 2) + (v_src22 << 1) + ((v_src12 + v_src32) << 2) + v_src02 + v_src42,
(v_src23 << 2) + (v_src23 << 1) + ((v_src13 + v_src33) << 2) + v_src03 + v_src43)));
}
#endif
// Scalar tail (and non-SIMD builds): same formula in 64-bit arithmetic.
for (; i < len; i++)
dst[i] = ((uint64_t)(((uint32_t*)(src[2]))[i]) * 6 +
(((uint64_t)(((uint32_t*)(src[1]))[i]) + (uint64_t)(((uint32_t*)(src[3]))[i])) << 2) +
(uint64_t)(((uint32_t*)(src[0]))[i]) + (uint64_t)(((uint32_t*)(src[4]))[i]) + (1 << 19)) >> 20;
}
template <typename ET, typename FT>
void vlineSmooth(const FT* const * src, const FT* m, int n, ET* dst, int len)
{
......@@ -2029,25 +2060,42 @@ private:
} // namespace anon
void GaussianBlurFixedPoint(const Mat& src, /*const*/ Mat& dst,
const uint16_t/*ufixedpoint16*/* fkx, int fkx_size,
const uint16_t/*ufixedpoint16*/* fky, int fky_size,
int borderType)
template <typename RFT, typename ET, typename FT>
void GaussianBlurFixedPointImpl(const Mat& src, /*const*/ Mat& dst,
const RFT* fkx, int fkx_size,
const RFT* fky, int fky_size,
int borderType)
{
CV_INSTRUMENT_REGION();
CV_Assert(src.depth() == CV_8U && ((borderType & BORDER_ISOLATED) || !src.isSubmatrix()));
fixedSmoothInvoker<uint8_t, ufixedpoint16> invoker(
src.ptr<uint8_t>(), src.step1(),
dst.ptr<uint8_t>(), dst.step1(), dst.cols, dst.rows, dst.channels(),
(const ufixedpoint16*)fkx, fkx_size, (const ufixedpoint16*)fky, fky_size,
CV_Assert(src.depth() == DataType<ET>::depth && ((borderType & BORDER_ISOLATED) || !src.isSubmatrix()));
fixedSmoothInvoker<ET, FT> invoker(
src.ptr<ET>(), src.step1(),
dst.ptr<ET>(), dst.step1(), dst.cols, dst.rows, dst.channels(),
(const FT*)fkx, fkx_size, (const FT*)fky, fky_size,
borderType & ~BORDER_ISOLATED);
{
// TODO AVX guard (external call)
parallel_for_(Range(0, dst.rows), invoker, std::max(1, std::min(getNumThreads(), getNumberOfCPUs())));
}
}
// 8-bit image path: raw uint16_t kernel coefficients are treated as
// ufixedpoint16 (Q8.8) by the shared implementation.
template <>
void GaussianBlurFixedPoint<uint16_t>(const Mat& src, /*const*/ Mat& dst,
const uint16_t/*ufixedpoint16*/* fkx, int fkx_size,
const uint16_t/*ufixedpoint16*/* fky, int fky_size,
int borderType)
{
GaussianBlurFixedPointImpl<uint16_t, uint8_t, ufixedpoint16>(src, dst, fkx, fkx_size, fky, fky_size, borderType);
}
// 16-bit image path: raw uint32_t kernel coefficients are treated as
// ufixedpoint32 (Q16.16) by the shared implementation.
template <>
void GaussianBlurFixedPoint<uint32_t>(const Mat& src, /*const*/ Mat& dst,
const uint32_t/*ufixedpoint32*/* fkx, int fkx_size,
const uint32_t/*ufixedpoint32*/* fky, int fky_size,
int borderType)
{
GaussianBlurFixedPointImpl<uint32_t, uint16_t, ufixedpoint32>(src, dst, fkx, fkx_size, fky, fky_size, borderType);
}
#endif
CV_CPU_OPTIMIZATION_NAMESPACE_END
} // namespace
......@@ -7,13 +7,15 @@
namespace opencv_test { namespace {
static const int fixedShiftU8 = 8;
static const int64_t fixedOne = (1L << fixedShiftU8);
int64_t v[][9] = {
{ fixedOne }, // size 1, sigma 0
{ fixedOne >> 2, fixedOne >> 1, fixedOne >> 2 }, // size 3, sigma 0
{ fixedOne >> 4, fixedOne >> 2, 6 * (fixedOne >> 4), fixedOne >> 2, fixedOne >> 4 }, // size 5, sigma 0
{ fixedOne >> 5, 7 * (fixedOne >> 6), 7 * (fixedOne >> 5), 9 * (fixedOne >> 5), 7 * (fixedOne >> 5), 7 * (fixedOne >> 6), fixedOne >> 5 }, // size 7, sigma 0
static const int64_t fixedOneU8 = (1L << fixedShiftU8);
static const int fixedShiftU16 = 16;
static const int64_t fixedOneU16 = (1L << fixedShiftU16);
int64_t vU8[][9] = {
{ fixedOneU8 }, // size 1, sigma 0
{ fixedOneU8 >> 2, fixedOneU8 >> 1, fixedOneU8 >> 2 }, // size 3, sigma 0
{ fixedOneU8 >> 4, fixedOneU8 >> 2, 6 * (fixedOneU8 >> 4), fixedOneU8 >> 2, fixedOneU8 >> 4 }, // size 5, sigma 0
{ fixedOneU8 >> 5, 7 * (fixedOneU8 >> 6), 7 * (fixedOneU8 >> 5), 9 * (fixedOneU8 >> 5), 7 * (fixedOneU8 >> 5), 7 * (fixedOneU8 >> 6), fixedOneU8 >> 5 }, // size 7, sigma 0
{ 4, 13, 30, 51, 60, 51, 30, 13, 4 }, // size 9, sigma 0
#if 1
#define CV_TEST_INACCURATE_GAUSSIAN_BLUR
......@@ -24,6 +26,14 @@ namespace opencv_test { namespace {
#endif
};
// Reference Gaussian kernels in Q16 fixed point (fixedShiftU16 fractional
// bits) for the 16-bit tests. Rows correspond to kernel sizes 1/3/5/7/9
// with sigma 0; the size-9 row is the corresponding vU8 row scaled by 2^8.
int64_t vU16[][9] = {
{ fixedOneU16 }, // size 1, sigma 0
{ fixedOneU16 >> 2, fixedOneU16 >> 1, fixedOneU16 >> 2 }, // size 3, sigma 0
{ fixedOneU16 >> 4, fixedOneU16 >> 2, 6 * (fixedOneU16 >> 4), fixedOneU16 >> 2, fixedOneU16 >> 4 }, // size 5, sigma 0
{ fixedOneU16 >> 5, 7 * (fixedOneU16 >> 6), 7 * (fixedOneU16 >> 5), 9 * (fixedOneU16 >> 5), 7 * (fixedOneU16 >> 5), 7 * (fixedOneU16 >> 6), fixedOneU16 >> 5 }, // size 7, sigma 0
{ 4<<8, 13<<8, 30<<8, 51<<8, 60<<8, 51<<8, 30<<8, 13<<8, 4<<8 } // size 9, sigma 0
};
template <typename T, int fixedShift>
T eval(Mat src, vector<int64_t> kernelx, vector<int64_t> kernely)
{
......@@ -39,8 +49,6 @@ namespace opencv_test { namespace {
return saturate_cast<T>((val + fixedRound) >> (fixedShift * 2));
}
TEST(GaussianBlur_Bitexact, Linear8U)
{
struct testmode
{
int type;
......@@ -50,34 +58,6 @@ TEST(GaussianBlur_Bitexact, Linear8U)
double sigma_y;
vector<int64_t> kernel_x;
vector<int64_t> kernel_y;
} modes[] = {
{ CV_8UC1, Size( 1, 1), Size(3, 3), 0, 0, vector<int64_t>(v[1], v[1]+3), vector<int64_t>(v[1], v[1]+3) },
{ CV_8UC1, Size( 2, 2), Size(3, 3), 0, 0, vector<int64_t>(v[1], v[1]+3), vector<int64_t>(v[1], v[1]+3) },
{ CV_8UC1, Size( 3, 1), Size(3, 3), 0, 0, vector<int64_t>(v[1], v[1]+3), vector<int64_t>(v[1], v[1]+3) },
{ CV_8UC1, Size( 1, 3), Size(3, 3), 0, 0, vector<int64_t>(v[1], v[1]+3), vector<int64_t>(v[1], v[1]+3) },
{ CV_8UC1, Size( 3, 3), Size(3, 3), 0, 0, vector<int64_t>(v[1], v[1]+3), vector<int64_t>(v[1], v[1]+3) },
{ CV_8UC1, Size( 3, 3), Size(5, 5), 0, 0, vector<int64_t>(v[2], v[2]+5), vector<int64_t>(v[2], v[2]+5) },
{ CV_8UC1, Size( 3, 3), Size(7, 7), 0, 0, vector<int64_t>(v[3], v[3]+7), vector<int64_t>(v[3], v[3]+7) },
{ CV_8UC1, Size( 5, 5), Size(3, 3), 0, 0, vector<int64_t>(v[1], v[1]+3), vector<int64_t>(v[1], v[1]+3) },
{ CV_8UC1, Size( 5, 5), Size(5, 5), 0, 0, vector<int64_t>(v[2], v[2]+5), vector<int64_t>(v[2], v[2]+5) },
{ CV_8UC1, Size( 3, 5), Size(5, 5), 0, 0, vector<int64_t>(v[2], v[2]+5), vector<int64_t>(v[2], v[2]+5) },
{ CV_8UC1, Size( 5, 5), Size(5, 5), 0, 0, vector<int64_t>(v[2], v[2]+5), vector<int64_t>(v[2], v[2]+5) },
{ CV_8UC1, Size( 5, 5), Size(7, 7), 0, 0, vector<int64_t>(v[3], v[3]+7), vector<int64_t>(v[3], v[3]+7) },
{ CV_8UC1, Size( 7, 7), Size(7, 7), 0, 0, vector<int64_t>(v[3], v[3]+7), vector<int64_t>(v[3], v[3]+7) },
{ CV_8UC1, Size( 256, 128), Size(3, 3), 0, 0, vector<int64_t>(v[1], v[1]+3), vector<int64_t>(v[1], v[1]+3) },
{ CV_8UC2, Size( 256, 128), Size(3, 3), 0, 0, vector<int64_t>(v[1], v[1]+3), vector<int64_t>(v[1], v[1]+3) },
{ CV_8UC3, Size( 256, 128), Size(3, 3), 0, 0, vector<int64_t>(v[1], v[1]+3), vector<int64_t>(v[1], v[1]+3) },
{ CV_8UC4, Size( 256, 128), Size(3, 3), 0, 0, vector<int64_t>(v[1], v[1]+3), vector<int64_t>(v[1], v[1]+3) },
{ CV_8UC1, Size( 256, 128), Size(5, 5), 0, 0, vector<int64_t>(v[2], v[2]+5), vector<int64_t>(v[2], v[2]+5) },
{ CV_8UC1, Size( 256, 128), Size(7, 7), 0, 0, vector<int64_t>(v[3], v[3]+7), vector<int64_t>(v[3], v[3]+7) },
{ CV_8UC1, Size( 256, 128), Size(9, 9), 0, 0, vector<int64_t>(v[4], v[4]+9), vector<int64_t>(v[4], v[4]+9) },
#ifdef CV_TEST_INACCURATE_GAUSSIAN_BLUR
{ CV_8UC1, Size( 256, 128), Size(3, 3), 1.75, 0.875, vector<int64_t>(v[5], v[5]+3), vector<int64_t>(v[6], v[6]+3) },
{ CV_8UC2, Size( 256, 128), Size(3, 3), 1.75, 0.875, vector<int64_t>(v[5], v[5]+3), vector<int64_t>(v[6], v[6]+3) },
{ CV_8UC3, Size( 256, 128), Size(3, 3), 1.75, 0.875, vector<int64_t>(v[5], v[5]+3), vector<int64_t>(v[6], v[6]+3) },
{ CV_8UC4, Size( 256, 128), Size(3, 3), 1.75, 0.875, vector<int64_t>(v[5], v[5]+3), vector<int64_t>(v[6], v[6]+3) },
{ CV_8UC1, Size( 256, 128), Size(5, 5), 0.375, 0.75, vector<int64_t>(v[7], v[7]+5), vector<int64_t>(v[8], v[8]+5) }
#endif
};
int bordermodes[] = {
......@@ -93,11 +73,12 @@ TEST(GaussianBlur_Bitexact, Linear8U)
// BORDER_REFLECT_101
};
for (int modeind = 0, _modecnt = sizeof(modes) / sizeof(modes[0]); modeind < _modecnt; ++modeind)
template <int fixedShift>
void checkMode(const testmode& mode)
{
int type = modes[modeind].type, depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
int dcols = modes[modeind].sz.width, drows = modes[modeind].sz.height;
Size kernel = modes[modeind].kernel;
int type = mode.type, depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
int dcols = mode.sz.width, drows = mode.sz.height;
Size kernel = mode.kernel;
int rows = drows + 20, cols = dcols + 20;
Mat src(rows, cols, type), refdst(drows, dcols, type), dst;
......@@ -142,25 +123,93 @@ TEST(GaussianBlur_Bitexact, Linear8U)
for (int i = 0; i < dcols; i++)
{
if (depth == CV_8U)
dst_chan.at<uint8_t>(j, i) = eval<uint8_t, fixedShiftU8>(src_chan(Rect(i,j,kernel.width,kernel.height)), modes[modeind].kernel_x, modes[modeind].kernel_y);
dst_chan.at<uint8_t>(j, i) = eval<uint8_t, fixedShift>(src_chan(Rect(i,j,kernel.width,kernel.height)), mode.kernel_x, mode.kernel_y);
else if (depth == CV_16U)
dst_chan.at<uint16_t>(j, i) = eval<uint16_t, fixedShiftU8>(src_chan(Rect(i, j, kernel.width, kernel.height)), modes[modeind].kernel_x, modes[modeind].kernel_y);
dst_chan.at<uint16_t>(j, i) = eval<uint16_t, fixedShift>(src_chan(Rect(i, j, kernel.width, kernel.height)), mode.kernel_x, mode.kernel_y);
else if (depth == CV_16S)
dst_chan.at<int16_t>(j, i) = eval<int16_t, fixedShiftU8>(src_chan(Rect(i, j, kernel.width, kernel.height)), modes[modeind].kernel_x, modes[modeind].kernel_y);
dst_chan.at<int16_t>(j, i) = eval<int16_t, fixedShift>(src_chan(Rect(i, j, kernel.width, kernel.height)), mode.kernel_x, mode.kernel_y);
else if (depth == CV_32S)
dst_chan.at<int32_t>(j, i) = eval<int32_t, fixedShiftU8>(src_chan(Rect(i, j, kernel.width, kernel.height)), modes[modeind].kernel_x, modes[modeind].kernel_y);
dst_chan.at<int32_t>(j, i) = eval<int32_t, fixedShift>(src_chan(Rect(i, j, kernel.width, kernel.height)), mode.kernel_x, mode.kernel_y);
else
CV_Assert(0);
}
mixChannels(dst_chan, refdst, toFrom, 1);
}
cv::GaussianBlur(src_roi, dst, kernel, modes[modeind].sigma_x, modes[modeind].sigma_y, bordermodes[borderind]);
cv::GaussianBlur(src_roi, dst, kernel, mode.sigma_x, mode.sigma_y, bordermodes[borderind]);
EXPECT_GE(0, cvtest::norm(refdst, dst, cv::NORM_L1))
<< "GaussianBlur " << cn << "-chan mat " << drows << "x" << dcols << " by kernel " << kernel << " sigma(" << modes[modeind].sigma_x << ";" << modes[modeind].sigma_y << ") failed with max diff " << cvtest::norm(refdst, dst, cv::NORM_INF);
<< "GaussianBlur " << cn << "-chan mat " << drows << "x" << dcols << " by kernel " << kernel << " sigma(" << mode.sigma_x << ";" << mode.sigma_y << ") failed with max diff " << cvtest::norm(refdst, dst, cv::NORM_INF);
}
}
// Bit-exact GaussianBlur test for 8-bit images. Each mode pairs an image
// type/size, kernel size and sigmas with the expected Q8 fixed-point
// separable kernels taken from the vU8 table; checkMode() runs
// cv::GaussianBlur and compares it against a naive fixed-point reference
// built from those kernels.
TEST(GaussianBlur_Bitexact, Linear8U)
{
testmode modes[] = {
{ CV_8UC1, Size( 1, 1), Size(3, 3), 0, 0, vector<int64_t>(vU8[1], vU8[1]+3), vector<int64_t>(vU8[1], vU8[1]+3) },
{ CV_8UC1, Size( 2, 2), Size(3, 3), 0, 0, vector<int64_t>(vU8[1], vU8[1]+3), vector<int64_t>(vU8[1], vU8[1]+3) },
{ CV_8UC1, Size( 3, 1), Size(3, 3), 0, 0, vector<int64_t>(vU8[1], vU8[1]+3), vector<int64_t>(vU8[1], vU8[1]+3) },
{ CV_8UC1, Size( 1, 3), Size(3, 3), 0, 0, vector<int64_t>(vU8[1], vU8[1]+3), vector<int64_t>(vU8[1], vU8[1]+3) },
{ CV_8UC1, Size( 3, 3), Size(3, 3), 0, 0, vector<int64_t>(vU8[1], vU8[1]+3), vector<int64_t>(vU8[1], vU8[1]+3) },
{ CV_8UC1, Size( 3, 3), Size(5, 5), 0, 0, vector<int64_t>(vU8[2], vU8[2]+5), vector<int64_t>(vU8[2], vU8[2]+5) },
{ CV_8UC1, Size( 3, 3), Size(7, 7), 0, 0, vector<int64_t>(vU8[3], vU8[3]+7), vector<int64_t>(vU8[3], vU8[3]+7) },
{ CV_8UC1, Size( 5, 5), Size(3, 3), 0, 0, vector<int64_t>(vU8[1], vU8[1]+3), vector<int64_t>(vU8[1], vU8[1]+3) },
{ CV_8UC1, Size( 5, 5), Size(5, 5), 0, 0, vector<int64_t>(vU8[2], vU8[2]+5), vector<int64_t>(vU8[2], vU8[2]+5) },
{ CV_8UC1, Size( 3, 5), Size(5, 5), 0, 0, vector<int64_t>(vU8[2], vU8[2]+5), vector<int64_t>(vU8[2], vU8[2]+5) },
{ CV_8UC1, Size( 5, 5), Size(5, 5), 0, 0, vector<int64_t>(vU8[2], vU8[2]+5), vector<int64_t>(vU8[2], vU8[2]+5) },
{ CV_8UC1, Size( 5, 5), Size(7, 7), 0, 0, vector<int64_t>(vU8[3], vU8[3]+7), vector<int64_t>(vU8[3], vU8[3]+7) },
{ CV_8UC1, Size( 7, 7), Size(7, 7), 0, 0, vector<int64_t>(vU8[3], vU8[3]+7), vector<int64_t>(vU8[3], vU8[3]+7) },
{ CV_8UC1, Size( 256, 128), Size(3, 3), 0, 0, vector<int64_t>(vU8[1], vU8[1]+3), vector<int64_t>(vU8[1], vU8[1]+3) },
{ CV_8UC2, Size( 256, 128), Size(3, 3), 0, 0, vector<int64_t>(vU8[1], vU8[1]+3), vector<int64_t>(vU8[1], vU8[1]+3) },
{ CV_8UC3, Size( 256, 128), Size(3, 3), 0, 0, vector<int64_t>(vU8[1], vU8[1]+3), vector<int64_t>(vU8[1], vU8[1]+3) },
{ CV_8UC4, Size( 256, 128), Size(3, 3), 0, 0, vector<int64_t>(vU8[1], vU8[1]+3), vector<int64_t>(vU8[1], vU8[1]+3) },
{ CV_8UC1, Size( 256, 128), Size(5, 5), 0, 0, vector<int64_t>(vU8[2], vU8[2]+5), vector<int64_t>(vU8[2], vU8[2]+5) },
{ CV_8UC1, Size( 256, 128), Size(7, 7), 0, 0, vector<int64_t>(vU8[3], vU8[3]+7), vector<int64_t>(vU8[3], vU8[3]+7) },
{ CV_8UC1, Size( 256, 128), Size(9, 9), 0, 0, vector<int64_t>(vU8[4], vU8[4]+9), vector<int64_t>(vU8[4], vU8[4]+9) },
#ifdef CV_TEST_INACCURATE_GAUSSIAN_BLUR
// Non-zero sigma cases use asymmetric x/y kernels (vU8 rows 5-8).
{ CV_8UC1, Size( 256, 128), Size(3, 3), 1.75, 0.875, vector<int64_t>(vU8[5], vU8[5]+3), vector<int64_t>(vU8[6], vU8[6]+3) },
{ CV_8UC2, Size( 256, 128), Size(3, 3), 1.75, 0.875, vector<int64_t>(vU8[5], vU8[5]+3), vector<int64_t>(vU8[6], vU8[6]+3) },
{ CV_8UC3, Size( 256, 128), Size(3, 3), 1.75, 0.875, vector<int64_t>(vU8[5], vU8[5]+3), vector<int64_t>(vU8[6], vU8[6]+3) },
{ CV_8UC4, Size( 256, 128), Size(3, 3), 1.75, 0.875, vector<int64_t>(vU8[5], vU8[5]+3), vector<int64_t>(vU8[6], vU8[6]+3) },
{ CV_8UC1, Size( 256, 128), Size(5, 5), 0.375, 0.75, vector<int64_t>(vU8[7], vU8[7]+5), vector<int64_t>(vU8[8], vU8[8]+5) }
#endif
};
for (int modeind = 0, _modecnt = sizeof(modes) / sizeof(modes[0]); modeind < _modecnt; ++modeind)
{
checkMode<fixedShiftU8>(modes[modeind]);
}
}
// Bit-exact GaussianBlur test for 16-bit unsigned images. Each mode pairs
// an image type/size and kernel size with the expected Q16 fixed-point
// separable kernels from the vU16 table; checkMode() runs cv::GaussianBlur
// and compares it against a naive fixed-point reference built from those
// kernels.
TEST(GaussianBlur_Bitexact, Linear16U)
{
    testmode modes[] = {
        { CV_16UC1, Size( 1, 1), Size(3, 3), 0, 0, vector<int64_t>(vU16[1], vU16[1]+3), vector<int64_t>(vU16[1], vU16[1]+3) },
        { CV_16UC1, Size( 2, 2), Size(3, 3), 0, 0, vector<int64_t>(vU16[1], vU16[1]+3), vector<int64_t>(vU16[1], vU16[1]+3) },
        { CV_16UC1, Size( 3, 1), Size(3, 3), 0, 0, vector<int64_t>(vU16[1], vU16[1]+3), vector<int64_t>(vU16[1], vU16[1]+3) },
        { CV_16UC1, Size( 1, 3), Size(3, 3), 0, 0, vector<int64_t>(vU16[1], vU16[1]+3), vector<int64_t>(vU16[1], vU16[1]+3) },
        { CV_16UC1, Size( 3, 3), Size(3, 3), 0, 0, vector<int64_t>(vU16[1], vU16[1]+3), vector<int64_t>(vU16[1], vU16[1]+3) },
        { CV_16UC1, Size( 3, 3), Size(5, 5), 0, 0, vector<int64_t>(vU16[2], vU16[2]+5), vector<int64_t>(vU16[2], vU16[2]+5) },
        { CV_16UC1, Size( 3, 3), Size(7, 7), 0, 0, vector<int64_t>(vU16[3], vU16[3]+7), vector<int64_t>(vU16[3], vU16[3]+7) },
        { CV_16UC1, Size( 5, 5), Size(3, 3), 0, 0, vector<int64_t>(vU16[1], vU16[1]+3), vector<int64_t>(vU16[1], vU16[1]+3) },
        { CV_16UC1, Size( 5, 5), Size(5, 5), 0, 0, vector<int64_t>(vU16[2], vU16[2]+5), vector<int64_t>(vU16[2], vU16[2]+5) },
        { CV_16UC1, Size( 3, 5), Size(5, 5), 0, 0, vector<int64_t>(vU16[2], vU16[2]+5), vector<int64_t>(vU16[2], vU16[2]+5) },
        { CV_16UC1, Size( 5, 5), Size(5, 5), 0, 0, vector<int64_t>(vU16[2], vU16[2]+5), vector<int64_t>(vU16[2], vU16[2]+5) },
        { CV_16UC1, Size( 5, 5), Size(7, 7), 0, 0, vector<int64_t>(vU16[3], vU16[3]+7), vector<int64_t>(vU16[3], vU16[3]+7) },
        { CV_16UC1, Size( 7, 7), Size(7, 7), 0, 0, vector<int64_t>(vU16[3], vU16[3]+7), vector<int64_t>(vU16[3], vU16[3]+7) },
        { CV_16UC1, Size( 256, 128), Size(3, 3), 0, 0, vector<int64_t>(vU16[1], vU16[1]+3), vector<int64_t>(vU16[1], vU16[1]+3) },
        { CV_16UC2, Size( 256, 128), Size(3, 3), 0, 0, vector<int64_t>(vU16[1], vU16[1]+3), vector<int64_t>(vU16[1], vU16[1]+3) },
        { CV_16UC3, Size( 256, 128), Size(3, 3), 0, 0, vector<int64_t>(vU16[1], vU16[1]+3), vector<int64_t>(vU16[1], vU16[1]+3) },
        { CV_16UC4, Size( 256, 128), Size(3, 3), 0, 0, vector<int64_t>(vU16[1], vU16[1]+3), vector<int64_t>(vU16[1], vU16[1]+3) },
        { CV_16UC1, Size( 256, 128), Size(5, 5), 0, 0, vector<int64_t>(vU16[2], vU16[2]+5), vector<int64_t>(vU16[2], vU16[2]+5) },
        { CV_16UC1, Size( 256, 128), Size(7, 7), 0, 0, vector<int64_t>(vU16[3], vU16[3]+7), vector<int64_t>(vU16[3], vU16[3]+7) },
        { CV_16UC1, Size( 256, 128), Size(9, 9), 0, 0, vector<int64_t>(vU16[4], vU16[4]+9), vector<int64_t>(vU16[4], vU16[4]+9) },
    };

    for (int modeind = 0, _modecnt = sizeof(modes) / sizeof(modes[0]); modeind < _modecnt; ++modeind)
    {
        // Use the named shift constant (value 16) instead of a magic number,
        // mirroring the Linear8U test which passes fixedShiftU8.
        checkMode<fixedShiftU16>(modes[modeind]);
    }
}
TEST(GaussianBlur_Bitexact, regression_15015)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册