Merge remote-tracking branch 'upstream/3.4' into merge-3.4

44d473fb · Alexander Alekhin · 988bc804 · 5bc6b6fc · 44d473fb · 44d473fb
18 changed files
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -2744,8 +2744,7 @@ struct Net::Impl : public detail::NetImplBase
            // (and so we eliminate the concatenation layer, because the channels
            // are concatenated implicitly).
            Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>();
-            if( !concatLayer.empty() && concatLayer->axis == 1 && !concatLayer->padding &&
-                ld.outputBlobs.size() == 1 )
+            if( !concatLayer.empty() && !concatLayer->padding && ld.outputBlobs.size() == 1 )
            {
                Mat& output = ld.outputBlobs[0];
                UMat umat_output;
@@ -2782,7 +2781,8 @@ struct Net::Impl : public detail::NetImplBase
                // the concatenation optimization is applied with batch_size > 1.
                // so, for now, we only apply this optimization in the most popular
                // case batch_size == 1.
-                if( output.dims == 4 && output.size[0] == 1 )
+                int axis = clamp(concatLayer->axis, output.dims);
+                if( output.total(0, axis) == 1 )
                {
                    size_t i, ninputs = ld.inputBlobsId.size();
                    std::vector<LayerPin> realinputs(ninputs);
@@ -2836,18 +2836,20 @@ struct Net::Impl : public detail::NetImplBase
                            OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
                        }
 #endif
+
 #ifdef HAVE_CUDA
                        if (preferableBackend == DNN_BACKEND_CUDA)
                            ld.outputBlobsWrappers[0] = wrap(output);
 #endif
-                        Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
+                        std::vector<Range> chrange(output.dims, Range::all());
+
                        int ofs = 0;
                        for( i = 0; i < ninputs; i++ )
                        {
                            LayerPin pin = realinputs[i];
                            LayerData* inp_i_data = &layers[pin.lid];
-                            int channels_i = ld.inputBlobs[i]->size[1];
-                            chrange[1] = Range(ofs, ofs + channels_i);
+                            int channels_i = ld.inputBlobs[i]->size[axis];
+                            chrange[axis] = Range(ofs, ofs + channels_i);
                            printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(),
                                   pin.oid, ofs, ofs + channels_i));
                            ofs += channels_i;

--- a/modules/features2d/src/sift.simd.hpp
+++ b/modules/features2d/src/sift.simd.hpp
@@ -167,9 +167,23 @@ float calcOrientationHist(
    int i, j, k, len = (radius*2+1)*(radius*2+1);

    float expf_scale = -1.f/(2.f * sigma * sigma);
+#if CV_SIMD
+    AutoBuffer<float> bufX(len + v_float32::nlanes);
+    AutoBuffer<float> bufY(len + v_float32::nlanes);
+    AutoBuffer<float> bufO(len + v_float32::nlanes);
+    AutoBuffer<float> bufW(len + v_float32::nlanes);
+    AutoBuffer<float> bufT(n+4 + v_float32::nlanes);
+    float *X = alignPtr(bufX.data(), CV_SIMD_WIDTH);
+    float *Y = alignPtr(bufY.data(), CV_SIMD_WIDTH);
+    float *Mag = X;
+    float *Ori = alignPtr(bufO.data(), CV_SIMD_WIDTH);
+    float *W = alignPtr(bufW.data(), CV_SIMD_WIDTH);
+    float *temphist = alignPtr(bufT.data(), CV_SIMD_WIDTH)+2;
+#else
    AutoBuffer<float> buf(len*4 + n+4);
    float *X = buf.data(), *Y = X + len, *Mag = X, *Ori = Y + len, *W = Ori + len;
    float* temphist = W + len + 2;
+#endif

    for( i = 0; i < n; i++ )
        temphist[i] = 0.f;
@@ -201,32 +215,29 @@ float calcOrientationHist(
    cv::hal::magnitude32f(X, Y, Mag, len);

    k = 0;
-#if CV_AVX2
+#if CV_SIMD
+    const int vecsize = v_float32::nlanes;
+    v_float32 nd360 = vx_setall_f32(n/360.f);
+    v_int32 __n = vx_setall_s32(n);
+    int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[vecsize];
+    float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[vecsize];
+
+    for( ; k <= len - vecsize; k += vecsize )
    {
-        __m256 __nd360 = _mm256_set1_ps(n/360.f);
-        __m256i __n = _mm256_set1_epi32(n);
-        int CV_DECL_ALIGNED(32) bin_buf[8];
-        float CV_DECL_ALIGNED(32) w_mul_mag_buf[8];
-        for ( ; k <= len - 8; k+=8 )
+        v_float32 w = vx_load_aligned( W + k );
+        v_float32 mag = vx_load_aligned( Mag + k );
+        v_float32 ori = vx_load_aligned( Ori + k );
+        v_int32 bin = v_round( nd360 * ori );
+
+        bin = v_select(bin >= __n, bin - __n, bin);
+        bin = v_select(bin < vx_setzero_s32(), bin + __n, bin);
+
+        w = w * mag;
+        v_store_aligned(bin_buf, bin);
+        v_store_aligned(w_mul_mag_buf, w);
+        for(int vi = 0; vi < vecsize; vi++)
        {
-            __m256i __bin = _mm256_cvtps_epi32(_mm256_mul_ps(__nd360, _mm256_loadu_ps(&Ori[k])));
-
-            __bin = _mm256_sub_epi32(__bin, _mm256_andnot_si256(_mm256_cmpgt_epi32(__n, __bin), __n));
-            __bin = _mm256_add_epi32(__bin, _mm256_and_si256(__n, _mm256_cmpgt_epi32(_mm256_setzero_si256(), __bin)));
-
-            __m256 __w_mul_mag = _mm256_mul_ps(_mm256_loadu_ps(&W[k]), _mm256_loadu_ps(&Mag[k]));
-
-            _mm256_store_si256((__m256i *) bin_buf, __bin);
-            _mm256_store_ps(w_mul_mag_buf, __w_mul_mag);
-
-            temphist[bin_buf[0]] += w_mul_mag_buf[0];
-            temphist[bin_buf[1]] += w_mul_mag_buf[1];
-            temphist[bin_buf[2]] += w_mul_mag_buf[2];
-            temphist[bin_buf[3]] += w_mul_mag_buf[3];
-            temphist[bin_buf[4]] += w_mul_mag_buf[4];
-            temphist[bin_buf[5]] += w_mul_mag_buf[5];
-            temphist[bin_buf[6]] += w_mul_mag_buf[6];
-            temphist[bin_buf[7]] += w_mul_mag_buf[7];
+            temphist[bin_buf[vi]] += w_mul_mag_buf[vi];
        }
    }
 #endif
@@ -247,34 +258,20 @@ float calcOrientationHist(
    temphist[n+1] = temphist[1];

    i = 0;
-#if CV_AVX2
+#if CV_SIMD
+    v_float32 d_1_16 = vx_setall_f32(1.f/16.f);
+    v_float32 d_4_16 = vx_setall_f32(4.f/16.f);
+    v_float32 d_6_16 = vx_setall_f32(6.f/16.f);
+    for( ; i <= n - v_float32::nlanes; i += v_float32::nlanes )
    {
-        __m256 __d_1_16 = _mm256_set1_ps(1.f/16.f);
-        __m256 __d_4_16 = _mm256_set1_ps(4.f/16.f);
-        __m256 __d_6_16 = _mm256_set1_ps(6.f/16.f);
-        for( ; i <= n - 8; i+=8 )
-        {
-#if CV_FMA3
-            __m256 __hist = _mm256_fmadd_ps(
-                _mm256_add_ps(_mm256_loadu_ps(&temphist[i-2]), _mm256_loadu_ps(&temphist[i+2])),
-                __d_1_16,
-                _mm256_fmadd_ps(
-                    _mm256_add_ps(_mm256_loadu_ps(&temphist[i-1]), _mm256_loadu_ps(&temphist[i+1])),
-                    __d_4_16,
-                    _mm256_mul_ps(_mm256_loadu_ps(&temphist[i]), __d_6_16)));
-#else
-            __m256 __hist = _mm256_add_ps(
-                _mm256_mul_ps(
-                        _mm256_add_ps(_mm256_loadu_ps(&temphist[i-2]), _mm256_loadu_ps(&temphist[i+2])),
-                        __d_1_16),
-                _mm256_add_ps(
-                    _mm256_mul_ps(
-                        _mm256_add_ps(_mm256_loadu_ps(&temphist[i-1]), _mm256_loadu_ps(&temphist[i+1])),
-                        __d_4_16),
-                    _mm256_mul_ps(_mm256_loadu_ps(&temphist[i]), __d_6_16)));
-#endif
-            _mm256_storeu_ps(&hist[i], __hist);
-        }
+        v_float32 tn2 = vx_load_aligned(temphist + i-2);
+        v_float32 tn1 = vx_load(temphist + i-1);
+        v_float32 t0 = vx_load(temphist + i);
+        v_float32 t1 = vx_load(temphist + i+1);
+        v_float32 t2 = vx_load(temphist + i+2);
+        v_float32 _hist = v_fma(tn2 + t2, d_1_16,
+            v_fma(tn1 + t1, d_4_16, t0 * d_6_16));
+        v_store(hist + i, _hist);
    }
 #endif
    for( ; i < n; i++ )
@@ -623,91 +620,65 @@ void calcSIFTDescriptor(
    cv::hal::exp32f(W, W, len);

    k = 0;
-#if CV_AVX2
+#if CV_SIMD
    {
-        int CV_DECL_ALIGNED(32) idx_buf[8];
-        float CV_DECL_ALIGNED(32) rco_buf[64];
-        const __m256 __ori = _mm256_set1_ps(ori);
-        const __m256 __bins_per_rad = _mm256_set1_ps(bins_per_rad);
-        const __m256i __n = _mm256_set1_epi32(n);
-        for( ; k <= len - 8; k+=8 )
+        const int vecsize = v_float32::nlanes;
+        int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[vecsize];
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*vecsize];
+        const v_float32 __ori  = vx_setall_f32(ori);
+        const v_float32 __bins_per_rad = vx_setall_f32(bins_per_rad);
+        const v_int32 __n = vx_setall_s32(n);
+        const v_int32 __1 = vx_setall_s32(1);
+        const v_int32 __d_plus_2 = vx_setall_s32(d+2);
+        const v_int32 __n_plus_2 = vx_setall_s32(n+2);
+        for( ; k <= len - vecsize; k += vecsize )
        {
-            __m256 __rbin = _mm256_loadu_ps(&RBin[k]);
-            __m256 __cbin = _mm256_loadu_ps(&CBin[k]);
-            __m256 __obin = _mm256_mul_ps(_mm256_sub_ps(_mm256_loadu_ps(&Ori[k]), __ori), __bins_per_rad);
-            __m256 __mag = _mm256_mul_ps(_mm256_loadu_ps(&Mag[k]), _mm256_loadu_ps(&W[k]));
-
-            __m256 __r0 = _mm256_floor_ps(__rbin);
-            __rbin = _mm256_sub_ps(__rbin, __r0);
-            __m256 __c0 = _mm256_floor_ps(__cbin);
-            __cbin = _mm256_sub_ps(__cbin, __c0);
-            __m256 __o0 = _mm256_floor_ps(__obin);
-            __obin = _mm256_sub_ps(__obin, __o0);
-
-            __m256i __o0i = _mm256_cvtps_epi32(__o0);
-            __o0i = _mm256_add_epi32(__o0i, _mm256_and_si256(__n, _mm256_cmpgt_epi32(_mm256_setzero_si256(), __o0i)));
-            __o0i = _mm256_sub_epi32(__o0i, _mm256_andnot_si256(_mm256_cmpgt_epi32(__n, __o0i), __n));
-
-            __m256 __v_r1 = _mm256_mul_ps(__mag, __rbin);
-            __m256 __v_r0 = _mm256_sub_ps(__mag, __v_r1);
-
-            __m256 __v_rc11 = _mm256_mul_ps(__v_r1, __cbin);
-            __m256 __v_rc10 = _mm256_sub_ps(__v_r1, __v_rc11);
-
-            __m256 __v_rc01 = _mm256_mul_ps(__v_r0, __cbin);
-            __m256 __v_rc00 = _mm256_sub_ps(__v_r0, __v_rc01);
-
-            __m256 __v_rco111 = _mm256_mul_ps(__v_rc11, __obin);
-            __m256 __v_rco110 = _mm256_sub_ps(__v_rc11, __v_rco111);
-
-            __m256 __v_rco101 = _mm256_mul_ps(__v_rc10, __obin);
-            __m256 __v_rco100 = _mm256_sub_ps(__v_rc10, __v_rco101);
-
-            __m256 __v_rco011 = _mm256_mul_ps(__v_rc01, __obin);
-            __m256 __v_rco010 = _mm256_sub_ps(__v_rc01, __v_rco011);
-
-            __m256 __v_rco001 = _mm256_mul_ps(__v_rc00, __obin);
-            __m256 __v_rco000 = _mm256_sub_ps(__v_rc00, __v_rco001);
-
-            __m256i __one = _mm256_set1_epi32(1);
-            __m256i __idx = _mm256_add_epi32(
-                _mm256_mullo_epi32(
-                    _mm256_add_epi32(
-                        _mm256_mullo_epi32(_mm256_add_epi32(_mm256_cvtps_epi32(__r0), __one), _mm256_set1_epi32(d + 2)),
-                        _mm256_add_epi32(_mm256_cvtps_epi32(__c0), __one)),
-                    _mm256_set1_epi32(n + 2)),
-                __o0i);
-
-            _mm256_store_si256((__m256i *)idx_buf, __idx);
-
-            _mm256_store_ps(&(rco_buf[0]),  __v_rco000);
-            _mm256_store_ps(&(rco_buf[8]),  __v_rco001);
-            _mm256_store_ps(&(rco_buf[16]), __v_rco010);
-            _mm256_store_ps(&(rco_buf[24]), __v_rco011);
-            _mm256_store_ps(&(rco_buf[32]), __v_rco100);
-            _mm256_store_ps(&(rco_buf[40]), __v_rco101);
-            _mm256_store_ps(&(rco_buf[48]), __v_rco110);
-            _mm256_store_ps(&(rco_buf[56]), __v_rco111);
-            #define HIST_SUM_HELPER(id)                                  \
-                hist[idx_buf[(id)]] += rco_buf[(id)];                    \
-                hist[idx_buf[(id)]+1] += rco_buf[8 + (id)];              \
-                hist[idx_buf[(id)]+(n+2)] += rco_buf[16 + (id)];         \
-                hist[idx_buf[(id)]+(n+3)] += rco_buf[24 + (id)];         \
-                hist[idx_buf[(id)]+(d+2)*(n+2)] += rco_buf[32 + (id)];   \
-                hist[idx_buf[(id)]+(d+2)*(n+2)+1] += rco_buf[40 + (id)]; \
-                hist[idx_buf[(id)]+(d+3)*(n+2)] += rco_buf[48 + (id)];   \
-                hist[idx_buf[(id)]+(d+3)*(n+2)+1] += rco_buf[56 + (id)];
-
-            HIST_SUM_HELPER(0);
-            HIST_SUM_HELPER(1);
-            HIST_SUM_HELPER(2);
-            HIST_SUM_HELPER(3);
-            HIST_SUM_HELPER(4);
-            HIST_SUM_HELPER(5);
-            HIST_SUM_HELPER(6);
-            HIST_SUM_HELPER(7);
-
-            #undef HIST_SUM_HELPER
+            v_float32 rbin = vx_load(RBin + k);
+            v_float32 cbin = vx_load(CBin + k);
+            v_float32 obin = (vx_load(Ori + k) - __ori) * __bins_per_rad;
+            v_float32 mag = vx_load(Mag + k) * vx_load(W + k);
+
+            v_int32 r0 = v_floor(rbin);
+            v_int32 c0 = v_floor(cbin);
+            v_int32 o0 = v_floor(obin);
+            rbin -= v_cvt_f32(r0);
+            cbin -= v_cvt_f32(c0);
+            obin -= v_cvt_f32(o0);
+
+            o0 = v_select(o0 < vx_setzero_s32(), o0 + __n, o0);
+            o0 = v_select(o0 >= __n, o0 - __n, o0);
+
+            v_float32 v_r1 = mag*rbin, v_r0 = mag - v_r1;
+            v_float32 v_rc11 = v_r1*cbin, v_rc10 = v_r1 - v_rc11;
+            v_float32 v_rc01 = v_r0*cbin, v_rc00 = v_r0 - v_rc01;
+            v_float32 v_rco111 = v_rc11*obin, v_rco110 = v_rc11 - v_rco111;
+            v_float32 v_rco101 = v_rc10*obin, v_rco100 = v_rc10 - v_rco101;
+            v_float32 v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011;
+            v_float32 v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001;
+
+            v_int32 idx = v_fma(v_fma(r0+__1, __d_plus_2, c0+__1), __n_plus_2, o0);
+            v_store_aligned(idx_buf, idx);
+
+            v_store_aligned(rco_buf,           v_rco000);
+            v_store_aligned(rco_buf+vecsize,   v_rco001);
+            v_store_aligned(rco_buf+vecsize*2, v_rco010);
+            v_store_aligned(rco_buf+vecsize*3, v_rco011);
+            v_store_aligned(rco_buf+vecsize*4, v_rco100);
+            v_store_aligned(rco_buf+vecsize*5, v_rco101);
+            v_store_aligned(rco_buf+vecsize*6, v_rco110);
+            v_store_aligned(rco_buf+vecsize*7, v_rco111);
+
+            for(int id = 0; id < vecsize; id++)
+            {
+                hist[idx_buf[id]] += rco_buf[id];
+                hist[idx_buf[id]+1] += rco_buf[vecsize + id];
+                hist[idx_buf[id]+(n+2)] += rco_buf[2*vecsize + id];
+                hist[idx_buf[id]+(n+3)] += rco_buf[3*vecsize + id];
+                hist[idx_buf[id]+(d+2)*(n+2)] += rco_buf[4*vecsize + id];
+                hist[idx_buf[id]+(d+2)*(n+2)+1] += rco_buf[5*vecsize + id];
+                hist[idx_buf[id]+(d+3)*(n+2)] += rco_buf[6*vecsize + id];
+                hist[idx_buf[id]+(d+3)*(n+2)+1] += rco_buf[7*vecsize + id];
+            }
        }
    }
 #endif
@@ -766,23 +737,16 @@ void calcSIFTDescriptor(
    float nrm2 = 0;
    len = d*d*n;
    k = 0;
-#if CV_AVX2
+#if CV_SIMD
    {
-        float CV_DECL_ALIGNED(32) nrm2_buf[8];
-        __m256 __nrm2 = _mm256_setzero_ps();
-        __m256 __dst;
-        for( ; k <= len - 8; k += 8 )
+        v_float32 __nrm2 = vx_setzero_f32();
+        v_float32 __dst;
+        for( ; k <= len - v_float32::nlanes; k += v_float32::nlanes )
        {
-            __dst = _mm256_loadu_ps(&dst[k]);
-#if CV_FMA3
-            __nrm2 = _mm256_fmadd_ps(__dst, __dst, __nrm2);
-#else
-            __nrm2 = _mm256_add_ps(__nrm2, _mm256_mul_ps(__dst, __dst));
-#endif
+            __dst = vx_load(dst + k);
+            __nrm2 = v_fma(__dst, __dst, __nrm2);
        }
-        _mm256_store_ps(nrm2_buf, __nrm2);
-        nrm2 = nrm2_buf[0] + nrm2_buf[1] + nrm2_buf[2] + nrm2_buf[3] +
-               nrm2_buf[4] + nrm2_buf[5] + nrm2_buf[6] + nrm2_buf[7];
+        nrm2 = (float)v_reduce_sum(__nrm2);
    }
 #endif
    for( ; k < len; k++ )
@@ -795,7 +759,7 @@ void calcSIFTDescriptor(
    // This code cannot be enabled because it sums nrm2 in a different order,
    // thus producing slightly different results
    {
-        float CV_DECL_ALIGNED(32) nrm2_buf[8];
+        float CV_DECL_ALIGNED(CV_SIMD_WIDTH) nrm2_buf[8];
        __m256 __dst;
        __m256 __nrm2 = _mm256_setzero_ps();
        __m256 __thr = _mm256_set1_ps(thr);
@@ -825,17 +789,17 @@ void calcSIFTDescriptor(

 #if 1
    k = 0;
-#if CV_AVX2
+#if CV_SIMD
    {
-        __m256 __dst;
-        __m256 __min = _mm256_setzero_ps();
-        __m256 __max = _mm256_set1_ps(255.0f); // max of uchar
-        __m256 __nrm2 = _mm256_set1_ps(nrm2);
-        for( k = 0; k <= len - 8; k+=8 )
+        v_float32 __dst;
+        v_float32 __min = vx_setzero_f32();
+        v_float32 __max = vx_setall_f32(255.0f); // max of uchar
+        v_float32 __nrm2 = vx_setall_f32(nrm2);
+        for( k = 0; k <= len - v_float32::nlanes; k += v_float32::nlanes )
        {
-            __dst = _mm256_loadu_ps(&dst[k]);
-            __dst = _mm256_min_ps(_mm256_max_ps(_mm256_round_ps(_mm256_mul_ps(__dst, __nrm2), _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC), __min), __max);
-            _mm256_storeu_ps(&dst[k], __dst);
+            __dst = vx_load(dst + k);
+            __dst = v_min(v_max(v_cvt_f32(v_round(__dst * __nrm2)), __min), __max);
+            v_store(dst + k, __dst);
        }
    }
 #endif

--- a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h
+++ b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h
@@ -547,7 +547,7 @@ public:
    void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
    {

-        int maxChecks = get_param(searchParams,"checks",32);
+        const int maxChecks = get_param(searchParams,"checks",32);

        // Priority queue storing intermediate branches in the best-bin-first search
        Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);
@@ -556,6 +556,8 @@ public:
        int checks = 0;
        for (int i=0; i<trees_; ++i) {
            findNN(root[i], result, vec, checks, maxChecks, heap, checked);
+            if ((checks >= maxChecks) && result.full())
+                break;
        }

        BranchSt branch;
@@ -747,8 +749,8 @@ private:
                Heap<BranchSt>* heap, std::vector<bool>& checked)
    {
        if (node->childs==NULL) {
-            if (checks>=maxChecks) {
-                if (result.full()) return;
+            if ((checks>=maxChecks) && result.full()) {
+                return;
            }
            for (int i=0; i<node->size; ++i) {
                int index = node->indices[i];

--- a/modules/imgproc/perf/opencl/perf_filters.cpp
+++ b/modules/imgproc/perf/opencl/perf_filters.cpp
@@ -313,6 +313,62 @@ OCL_PERF_TEST_P(Filter2DFixture, Filter2D,
    SANITY_CHECK(dst, eps);
 }

+///////////// SepFilter2D /////////////
+
+typedef FilterFixture OCL_SepFilter2D;
+
+PERF_TEST_P_(OCL_SepFilter2D, SepFilter2D)  // perf case: cv::sepFilter2D on UMat input with random float kernels
+{
+    const FilterParams& params = GetParam();  // tuple of (image size, matrix type, kernel size)
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = get<2>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);  // NOTE(review): presumably guards against images the device cannot allocate - confirm
+
+    UMat src(srcSize, type), dst(srcSize, type);  // source and destination share size and type
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    Mat kernelX(1, ksize, CV_32FC1);  // 1 x ksize single-channel float kernel
+    randu(kernelX, -3.0, 3.0);  // arbitrary (non bit-exact) coefficients drawn from the -3..3 range
+    Mat kernelY(1, ksize, CV_32FC1);  // second kernel, same shape, filled independently
+    randu(kernelY, -3.0, 3.0);
+
+    OCL_TEST_CYCLE() cv::sepFilter2D(src, dst, -1, kernelX, kernelY, cv::Point(-1, -1), 1.0f, cv::BORDER_CONSTANT);  // ddepth=-1, anchor=(-1,-1), delta=1.0
+
+    SANITY_CHECK_NOTHING();  // no result comparison is requested for this case
+}
+
+PERF_TEST_P_(OCL_SepFilter2D, SepFilter2D_BitExact)  // perf case: cv::sepFilter2D with kernels whose taps are multiples of 1/16
+{
+    const FilterParams& params = GetParam();  // tuple of (image size, matrix type, kernel size)
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = get<2>(params);
+
+    checkDeviceMaxMemoryAllocSize(srcSize, type);  // NOTE(review): presumably guards against images the device cannot allocate - confirm
+
+    UMat src(srcSize, type), dst(srcSize, type);  // source and destination share size and type
+    declare.in(src, WARMUP_RNG).out(dst);
+
+    Mat kernelX(1, ksize, CV_32SC1);  // start from 32-bit integer taps
+    randu(kernelX, -16.0, 16.0);
+    kernelX.convertTo(kernelX, CV_32FC1, 1/16.0f, 0);  // to float, scaled by 1/16 with zero offset
+    Mat kernelY(1, ksize, CV_32SC1);  // second kernel built the same way
+    randu(kernelY, -16.0, 16.0);
+    kernelY.convertTo(kernelY, CV_32FC1, 1/16.0f, 0);
+
+    OCL_TEST_CYCLE() cv::sepFilter2D(src, dst, -1, kernelX, kernelY, cv::Point(-1, -1), 1.0f, cv::BORDER_CONSTANT);  // NOTE(review): presumably meant to exercise the integer/bit-exact OpenCL path - confirm
+
+    SANITY_CHECK_NOTHING();  // no result comparison is requested for this case
+}
+
+INSTANTIATE_TEST_CASE_P(/*nothing*/, OCL_SepFilter2D,  // empty instantiation prefix; parameters for both SepFilter2D cases above
+    ::testing::Combine(
+        ::testing::Values(sz1080p),  // a single image size
+        OCL_TEST_TYPES,  // matrix types come from this macro, defined outside this hunk
+        OCL_PERF_ENUM(3, 5, 7, 9, 11)  // kernel sizes (read as ksize via get<2>)
+    )
+);
+
 ///////////// Bilateral ////////////////////////

 typedef TestBaseWithParam<Size> BilateralFixture;

--- a/modules/imgproc/src/filter.dispatch.cpp
+++ b/modules/imgproc/src/filter.dispatch.cpp
@@ -729,11 +729,12 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
    return k.run(2, globalsize, localsize, false);
 }

-const int shift_bits = 8;
-
 static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor,
-                               int borderType, int ddepth, bool fast8uc1, bool int_arithm)
+        int borderType, int ddepth, bool fast8uc1,
+        bool int_arithm, int shift_bits)
 {
+    CV_Assert(shift_bits == 0 || int_arithm);
+
    int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type);
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    Size bufSize = buf.size();
@@ -801,8 +802,11 @@ static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX
    return k.run(2, globalsize, localsize, false);
 }

-static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm)
+static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor,
+        bool int_arithm, int shift_bits)
 {
+    CV_Assert(shift_bits == 0 || int_arithm);
+
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    if (dst.depth() == CV_64F && !doubleSupport)
        return false;
@@ -821,13 +825,16 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY
    globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1];
    globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];

-    char cvt[40];
+    char cvt[2][40];
+    int floatT = std::max(CV_32F, bdepth);
    cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d"
-                                          " -D srcT=%s -D dstT=%s -D convertToDstT=%s"
+                                          " -D srcT=%s -D dstT=%s -D convertToFloatT=%s -D floatT=%s -D convertToDstT=%s"
                                          " -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s",
                                          anchor, (int)localsize[0], (int)localsize[1], cn,
                                          ocl::typeToStr(buf_type), ocl::typeToStr(dtype),
-                                          ocl::convertTypeStr(bdepth, ddepth, cn, cvt),
+                                          ocl::convertTypeStr(bdepth, floatT, cn, cvt[0]),
+                                          ocl::typeToStr(CV_MAKETYPE(floatT, cn)),
+                                          ocl::convertTypeStr(shift_bits ? floatT : bdepth, ddepth, cn, cvt[1]),
                                          ocl::typeToStr(bdepth), ocl::typeToStr(ddepth),
                                          2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                                          int_arithm ? " -D INTEGER_ARITHMETIC" : "");
@@ -839,7 +846,7 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY
        return false;

    k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst),
-           static_cast<float>(delta));
+           static_cast<float>(delta * (1u << (2 * shift_bits))));

    return k.run(2, globalsize, localsize, false);
 }
@@ -848,16 +855,21 @@ const int optimizedSepFilterLocalWidth  = 16;
 const int optimizedSepFilterLocalHeight = 8;

 static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
-                                       Mat row_kernel, Mat col_kernel,
-                                       double delta, int borderType, int ddepth, int bdepth, bool int_arithm)
+                                       const Mat& kernelX_, const Mat& kernelY_,
+                                       double delta, int borderType, int ddepth, int bdepth,
+                                       bool int_arithm, int shift_bits)
 {
-    Size size = _src.size(), wholeSize;
-    Point origin;
+    //CV_Assert(shift_bits == 0 || int_arithm);
+
+    const ocl::Device& d = ocl::Device::getDefault();
+
+    Size size = _src.size();
    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
            esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth),
            dtype = CV_MAKE_TYPE(ddepth, cn);
    size_t src_step = _src.step(), src_offset = _src.offset();
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
+
+    bool doubleSupport = d.doubleFPConfig() > 0;

    if (esz == 0 || src_step == 0
        || (src_offset % src_step) % esz != 0
@@ -869,6 +881,13 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
             || borderType == BORDER_REFLECT_101))
        return false;

+    Mat kernelX, kernelY;
+    kernelX_.convertTo(kernelX, wdepth);
+    if (kernelX_.data != kernelY_.data)
+        kernelY_.convertTo(kernelY, wdepth);
+    else
+        kernelY = kernelX;
+
    size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight };
    size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]};

@@ -879,9 +898,9 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
    String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
                             " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
                             " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s",
-                             (int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2,
-                             ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(),
-                             ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(),
+                             (int)lt2[0], (int)lt2[1], kernelX.cols / 2, kernelY.cols / 2,
+                             ocl::kernelToStr(kernelX, wdepth, "KERNEL_MATRIX_X").c_str(),
+                             ocl::kernelToStr(kernelY, wdepth, "KERNEL_MATRIX_Y").c_str(),
                             ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
                             ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype),
                             ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType],
@@ -896,21 +915,30 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
    _dst.create(size, dtype);
    UMat dst = _dst.getUMat();

-    int src_offset_x = static_cast<int>((src_offset % src_step) / esz);
-    int src_offset_y = static_cast<int>(src_offset / src_step);
+    // TODO Future: emit error on inplace processing
+    //CV_Assert(src.u != dst.u && "Inplace processing is not allowed with UMat");
+    if (src.u == dst.u)
+    {
+        CV_LOG_ONCE_WARNING(NULL, "sepFilter2D: inplace arguments are not allowed for non-inplace operations. Performance impact warning.");
+        src = src.clone();
+    }

+    Size wholeSize;
+    Point origin;
    src.locateROI(wholeSize, origin);

-    k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y,
+    k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, origin.x, origin.y,
           wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst),
-           static_cast<float>(delta));
+           static_cast<float>(delta * (1u << (2 * shift_bits))));

    return k.run(2, gt2, lt2, false);
 }

-bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
-                      InputArray _kernelX, InputArray _kernelY, Point anchor,
-                      double delta, int borderType )
+bool ocl_sepFilter2D(
+        InputArray _src, OutputArray _dst, int ddepth,
+        InputArray _kernelX, InputArray _kernelY, Point anchor,
+        double delta, int borderType
+)
 {
    const ocl::Device & d = ocl::Device::getDefault();
    Size imgSize = _src.size();
@@ -934,59 +962,152 @@ bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
    if (anchor.y < 0)
        anchor.y = kernelY.cols >> 1;

-    int rtype = getKernelType(kernelX,
-        kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x));
-    int ctype = getKernelType(kernelY,
-        kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y));
-
    int bdepth = CV_32F;
    bool int_arithm = false;
-    if( sdepth == CV_8U && ddepth == CV_8U &&
-        rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
-        ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL)
+    int shift_bits = 0;
+
+    while (sdepth == CV_8U && ddepth == CV_8U)
    {
-        if (ocl::Device::getDefault().isIntel())
+        int bits_ = 8;
+        if (delta * 256.0f != (float)(int)(delta * 256))
        {
-            for (int i=0; i<kernelX.cols; i++)
-                kernelX.at<float>(0, i) = (float) cvRound(kernelX.at<float>(0, i) * (1 << shift_bits));
-            if (kernelX.data != kernelY.data)
-                for (int i=0; i<kernelX.cols; i++)
-                    kernelY.at<float>(0, i) = (float) cvRound(kernelY.at<float>(0, i) * (1 << shift_bits));
-        } else
+            CV_LOG_DEBUG(NULL, "ocl_sepFilter2D: bit-exact delta can't be applied: delta=" << delta);
+            break;
+        }
+        Mat kernelX_BitExact, kernelY_BitExact;
+        bool isValidBitExactRowKernel = createBitExactKernel_32S(kernelX, kernelX_BitExact, bits_);
+        bool isValidBitExactColumnKernel = createBitExactKernel_32S(kernelY, kernelY_BitExact, bits_);
+        if (!isValidBitExactRowKernel)
+        {
+            CV_LOG_DEBUG(NULL, "ocl_sepFilter2D: bit-exact row-kernel can't be applied: ksize=" << kernelX_BitExact.total());
+        }
+        else if (!isValidBitExactColumnKernel)
+        {
+            CV_LOG_DEBUG(NULL, "ocl_sepFilter2D: bit-exact column-kernel can't be applied: ksize=" << kernelY_BitExact.total());
+        }
+        else
        {
            bdepth = CV_32S;
-            kernelX.convertTo( kernelX, bdepth, 1 << shift_bits );
-            kernelY.convertTo( kernelY, bdepth, 1 << shift_bits );
+            shift_bits = bits_;
+            int_arithm = true;
+
+            kernelX = kernelX_BitExact;
+            kernelY = kernelY_BitExact;
        }
-        int_arithm = true;
+        break;
    }

-    CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 &&
-                imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
-                imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
-                (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
-                anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
-                OCL_PERFORMANCE_CHECK(d.isIntel()),  // TODO FIXIT
-                ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta,
-                                           borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true)
+    CV_OCL_RUN_(
+            kernelY.cols <= 21 && kernelX.cols <= 21 &&
+            imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
+            imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
+            (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
+            anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
+            OCL_PERFORMANCE_CHECK(d.isIntel()),  // TODO FIXIT
+            ocl_sepFilter2D_SinglePass(
+                    _src, _dst, kernelX, kernelY, delta,
+                   borderType & ~BORDER_ISOLATED, ddepth,
+                   CV_32F,  // force FP32 mode
+                   false, shift_bits
+            ),
+            true
+    );

    UMat src = _src.getUMat();
-    Size srcWholeSize; Point srcOffset;
-    src.locateROI(srcWholeSize, srcOffset);

-    bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 &&
-            src.cols % 4 == 0 && src.step % 4 == 0;
+    bool fast8uc1 = false;
+    if (type == CV_8UC1)
+    {
+        Size srcWholeSize;
+        Point srcOffset;
+        src.locateROI(srcWholeSize, srcOffset);
+        fast8uc1 = srcOffset.x % 4 == 0 &&
+                src.cols % 4 == 0 && src.step % 4 == 0;
+    }
+
+    Size srcSize = src.size();
+    Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
+    UMat buf(bufSize, CV_MAKETYPE(bdepth, cn));
+    if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm, shift_bits))
+        return false;
+
+    _dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
+    UMat dst = _dst.getUMat();
+
+    return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm, shift_bits);
+}
+
+bool ocl_sepFilter2D_BitExact(  // OpenCL separable filter fed with pre-quantized 16-bit kernels; returns false when this path cannot handle the request
+        InputArray _src, OutputArray _dst, int ddepth,  // ddepth < 0 selects the source depth
+        const Size& ksize,  // kernel extent; both dimensions must be odd
+        const uint16_t *fkx, const uint16_t *fky,  // row / column coefficients: ksize.width and ksize.height entries respectively
+        Point anchor,  // negative components default to the kernel centre
+        double delta, int borderType,
+        int shift_bits  // forwarded unchanged to the single-pass, row and column helpers
+)
+{
+    const ocl::Device & d = ocl::Device::getDefault();
+    Size imgSize = _src.size();
+
+    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    if (cn > 4)  // more than 4 channels is not handled by this path
+        return false;
+
+    if (ksize.width % 2 != 1)  // even kernel sizes are rejected
+        return false;
+    if (ksize.height % 2 != 1)
+        return false;
+
+    Mat kernelX(1, ksize.width, CV_16SC1, (void*)fkx);  // 1-row header over the caller's buffer (user-data constructor, no copy); the uint16_t data is typed as signed 16-bit here
+    Mat kernelY(1, ksize.height, CV_16SC1, (void*)fky);  // also a single row, so kernelY.cols below is the vertical kernel size
+
+    if (ddepth < 0)
+        ddepth = sdepth;
+
+    if (anchor.x < 0)
+        anchor.x = kernelX.cols >> 1;
+    if (anchor.y < 0)
+        anchor.y = kernelY.cols >> 1;
+
+    int bdepth = sdepth == CV_8U ? CV_32S : CV_32F;  // depth of the intermediate row-pass buffer `buf` created below
+
+    CV_OCL_RUN_(  // try the fused single-pass kernel first; NOTE(review): macro is defined elsewhere, presumably returns `true` from this function when the call succeeds
+            kernelY.cols <= 21 && kernelX.cols <= 21 &&
+            imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
+            imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
+            (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
+            anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
+            OCL_PERFORMANCE_CHECK(d.isIntel()),  // TODO FIXIT
+            ocl_sepFilter2D_SinglePass(
+                    _src, _dst, kernelX, kernelY, delta,
+                   borderType & ~BORDER_ISOLATED, ddepth, bdepth,
+                   true, shift_bits  // `true` sits in the int_arithm argument position
+            ),
+            true
+    );
+
+    UMat src = _src.getUMat();
+
+    bool fast8uc1 = false;  // set only for CV_8UC1 input whose ROI x-offset, width and step are all multiples of 4
+    if (type == CV_8UC1)
+    {
+        Size srcWholeSize;
+        Point srcOffset;
+        src.locateROI(srcWholeSize, srcOffset);
+        fast8uc1 = srcOffset.x % 4 == 0 &&
+                src.cols % 4 == 0 && src.step % 4 == 0;
+    }

    Size srcSize = src.size();
    Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
    UMat buf(bufSize, CV_MAKETYPE(bdepth, cn));
-    if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm))
+    if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, true, shift_bits))  // row pass into `buf`; `true` is the int_arithm argument
        return false;

    _dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
    UMat dst = _dst.getUMat();

-    return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm);
+    return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, true, shift_bits);  // column pass from `buf` into `dst`; its result is this function's result
 }

 #endif
@@ -1444,7 +1565,7 @@ void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth,
    CV_Assert(!_kernelX.empty());
    CV_Assert(!_kernelY.empty());

-    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > _kernelY.total() && (size_t)_src.cols() > _kernelX.total(),
+    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() >= _kernelY.total() && (size_t)_src.cols() >= _kernelX.total(),
               ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType))

    Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat();

--- a/modules/imgproc/src/filter.hpp
+++ b/modules/imgproc/src/filter.hpp
@@ -46,13 +46,25 @@
 namespace cv
 {
 #ifdef HAVE_OPENCL
-    bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
-                          InputArray _kernelX, InputArray _kernelY, Point anchor,
-                          double delta, int borderType );
+bool ocl_sepFilter2D(  // OpenCL separable filter taking the row/column kernels as arrays
+        InputArray _src, OutputArray _dst, int ddepth,
+        InputArray _kernelX, InputArray _kernelY, Point anchor,
+        double delta, int borderType
+);
+
+bool ocl_sepFilter2D_BitExact(  // variant taking raw 16-bit coefficient arrays plus an explicit shift_bits value
+        InputArray _src, OutputArray _dst, int ddepth,
+        const Size& ksize,  // number of entries in fkx (width) and fky (height)
+        const uint16_t *fkx, const uint16_t *fky,
+        Point anchor,
+        double delta, int borderType,
+        int shift_bits
+);
 #endif

-    void preprocess2DKernel(const Mat& kernel, std::vector<Point>& coords, std::vector<uchar>& coeffs);
-}
+void preprocess2DKernel(const Mat& kernel, std::vector<Point>& coords, std::vector<uchar>& coeffs);
+
+}  // namespace cv

 #endif


--- a/modules/imgproc/src/opencl/filterSepCol.cl
+++ b/modules/imgproc/src/opencl/filterSepCol.cl
@@ -61,7 +61,11 @@
 #endif

 #define DIG(a) a,
+#if defined(INTEGER_ARITHMETIC)
+__constant int mat_kernel[] = { COEFF };
+#else
 __constant srcT1 mat_kernel[] = { COEFF };
+#endif

 __kernel void col_filter(__global const uchar * src, int src_step, int src_offset, int src_whole_rows, int src_whole_cols,
                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta)
@@ -92,30 +96,28 @@ __kernel void col_filter(__global const uchar * src, int src_step, int src_offse
    barrier(CLK_LOCAL_MEM_FENCE);

    // read pixels from lds and calculate the result
-    sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY];
+    sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY] + (srcT)delta;
    for (int i = 1; i <= RADIUSY; ++i)
    {
        temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x];
        temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x];
-#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
+#if defined(INTEGER_ARITHMETIC)
        sum += mad24(temp[0],mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
 #else
        sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
 #endif
    }

-#ifdef INTEGER_ARITHMETIC
-#ifdef INTEL_DEVICE
-    sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
-#else
-    sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
-#endif
-#endif
-
    // write the result to dst
    if (x < dst_cols && y < dst_rows)
    {
+#if defined(SHIFT_BITS) && SHIFT_BITS > 0
+        dstT result = convertToDstT(convertToFloatT(sum) * (floatT)(1.0f / (1 << SHIFT_BITS)));
+#else
+        dstT result = convertToDstT(sum);
+#endif
+
        start_addr = mad24(y, dst_step, mad24(DSTSIZE, x, dst_offset));
-        storepix(convertToDstT(sum + (srcT)(delta)), dst + start_addr);
+        storepix(result, dst + start_addr);
    }
 }
--- a/modules/imgproc/src/opencl/filterSepRow.cl
+++ b/modules/imgproc/src/opencl/filterSepRow.cl
@@ -139,9 +139,13 @@
 #endif

 #define DIG(a) a,
+#if defined(INTEGER_ARITHMETIC)
+__constant int mat_kernel[] = { COEFF };
+#else
 __constant dstT1 mat_kernel[] = { COEFF };
+#endif

-#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
+#if defined(INTEGER_ARITHMETIC)
 #define dstT4 int4
 #define convertDstVec convert_int4
 #else
@@ -263,7 +267,7 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel
    {
        temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i);
        temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i);
-#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
+#if defined(INTEGER_ARITHMETIC)
        sum += mad24(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
 #else
        sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
@@ -368,7 +372,7 @@ __kernel void row_filter(__global const uchar * src, int src_step, int src_offse
    {
        temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i];
        temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i];
-#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
+#if defined(INTEGER_ARITHMETIC)
        sum += mad24(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
 #else
        sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);

--- a/modules/imgproc/src/opencl/filterSep_singlePass.cl
+++ b/modules/imgproc/src/opencl/filterSep_singlePass.cl
@@ -160,7 +160,7 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
        {
            sum = (WT) 0;
            for (i=0; i<=2*RADIUSY; i++)
-#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
+#if defined(INTEGER_ARITHMETIC)
                sum = mad24(lsmem[liy + i][clocX], mat_kernelY[i], sum);
 #else
                sum = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum);
@@ -177,25 +177,27 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
        {
            // do second horizontal filter pass
            // and calculate final result
-            sum = 0.0f;
+            sum = (WT)(delta);
            for (i=0; i<=2*RADIUSX; i++)
-#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
+#if defined(INTEGER_ARITHMETIC)
                sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
 #else
                sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
 #endif

-#ifdef INTEGER_ARITHMETIC
-#ifdef INTEL_DEVICE
-            sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
+#if defined(SHIFT_BITS) && SHIFT_BITS > 0
+#if !defined(INTEGER_ARITHMETIC)
+            sum = sum * (1.0f / (1 << SHIFT_BITS));
 #else
            sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
 #endif
 #endif
+
            // store result into destination image
-            storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));
+            storepix(convertToDstT(sum), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));
        }

+        barrier(CLK_LOCAL_MEM_FENCE);
        for (int i = liy * BLK_X + lix; i < (RADIUSY*2) * (BLK_X+(RADIUSX*2)); i += BLK_X * BLK_Y)
        {
            int clocX = i % (BLK_X+(RADIUSX*2));

--- a/modules/imgproc/src/smooth.dispatch.cpp
+++ b/modules/imgproc/src/smooth.dispatch.cpp
@@ -48,6 +48,7 @@
 #include <opencv2/core/utils/configuration.private.hpp>

 #include <vector>
+#include <iostream>

 #include "opencv2/core/hal/intrin.hpp"
 #include "opencl_kernels_imgproc.hpp"
@@ -637,10 +638,9 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
        return;
    }

-    bool useOpenCL = (ocl::isOpenCLActivated() && _dst.isUMat() && _src.dims() <= 2 &&
-               ((ksize.width == 3 && ksize.height == 3) ||
-               (ksize.width == 5 && ksize.height == 5)) &&
-               _src.rows() > ksize.height && _src.cols() > ksize.width);
+    bool useOpenCL = ocl::isOpenCLActivated() && _dst.isUMat() && _src.dims() <= 2 &&
+               _src.rows() >= ksize.height && _src.cols() >= ksize.width &&
+               ksize.width > 1 && ksize.height > 1;
    CV_UNUSED(useOpenCL);

    int sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
@@ -648,27 +648,13 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
    Mat kx, ky;
    createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);

-    CV_OCL_RUN(useOpenCL, ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType));
+    CV_OCL_RUN(useOpenCL && sdepth == CV_8U &&
+            ((ksize.width == 3 && ksize.height == 3) ||
+            (ksize.width == 5 && ksize.height == 5)),
+            ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType)
+    );

-    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > kx.total() && (size_t)_src.cols() > kx.total(),
-               ocl_sepFilter2D(_src, _dst, sdepth, kx, ky, Point(-1, -1), 0, borderType))
-
-    Mat src = _src.getMat();
-    Mat dst = _dst.getMat();
-
-    Point ofs;
-    Size wsz(src.cols, src.rows);
-    if(!(borderType & BORDER_ISOLATED))
-        src.locateROI( wsz, ofs );
-
-    CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn,
-             ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height,
-             sigma1, sigma2, borderType&~BORDER_ISOLATED);
-
-    CV_OVX_RUN(true,
-               openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType))
-
-    if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.getMat().isSubmatrix()))
+    if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.isSubmatrix()))
    {
        std::vector<ufixedpoint16> fkx, fky;
        createGaussianKernels(fkx, fky, type, ksize, sigma1, sigma2);
@@ -684,6 +670,17 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
        }
        else
        {
+            CV_OCL_RUN(useOpenCL,
+                    ocl_sepFilter2D_BitExact(_src, _dst, sdepth,
+                            ksize,
+                            (const uint16_t*)&fkx[0], (const uint16_t*)&fky[0],
+                            Point(-1, -1), 0, borderType,
+                            8/*shift_bits*/)
+            );
+
+            Mat src = _src.getMat();
+            Mat dst = _dst.getMat();
+
            if (src.data == dst.data)
                src = src.clone();
            CV_CPU_DISPATCH(GaussianBlurFixedPoint, (src, dst, (const uint16_t*)&fkx[0], (int)fkx.size(), (const uint16_t*)&fky[0], (int)fky.size(), borderType),
@@ -692,6 +689,29 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
        }
    }

+#ifdef HAVE_OPENCL
+    if (useOpenCL)
+    {
+        sepFilter2D(_src, _dst, sdepth, kx, ky, Point(-1, -1), 0, borderType);
+        return;
+    }
+#endif
+
+    Mat src = _src.getMat();
+    Mat dst = _dst.getMat();
+
+    Point ofs;
+    Size wsz(src.cols, src.rows);
+    if(!(borderType & BORDER_ISOLATED))
+        src.locateROI( wsz, ofs );
+
+    CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn,
+             ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height,
+             sigma1, sigma2, borderType&~BORDER_ISOLATED);
+
+    CV_OVX_RUN(true,
+               openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType))
+
 #if defined ENABLE_IPP_GAUSSIAN_BLUR
    // IPP is not bit-exact to OpenCV implementation
    CV_IPP_RUN_FAST(ipp_GaussianBlur(src, dst, ksize, sigma1, sigma2, borderType));

--- a/modules/imgproc/test/ocl/test_sepfilter2d.cpp
+++ b/modules/imgproc/test/ocl/test_sepfilter2d.cpp
@@ -73,7 +73,7 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
        useRoi = GET_PARAM(4);
    }

-    void random_roi()
+    void random_roi(bool bitExact)
    {
        Size ksize = randomSize(kernelMinSize, kernelMaxSize);
        if (1 != ksize.width % 2)
@@ -81,11 +81,19 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
        if (1 != ksize.height % 2)
            ksize.height++;

-        Mat temp = randomMat(Size(ksize.width, 1), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE);
+        Mat temp = randomMat(Size(ksize.width, 1), CV_32FC1, -0.5, 1.0);
        cv::normalize(temp, kernelX, 1.0, 0.0, NORM_L1);
-        temp = randomMat(Size(1, ksize.height),  CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE);
+        temp = randomMat(Size(1, ksize.height), CV_32FC1, -0.5, 1.0);
        cv::normalize(temp, kernelY, 1.0, 0.0, NORM_L1);

+        if (bitExact)
+        {
+            kernelX.convertTo(temp, CV_32S, 256);
+            temp.convertTo(kernelX, CV_32F, 1.0 / 256);
+            kernelY.convertTo(temp, CV_32S, 256);
+            temp.convertTo(kernelY, CV_32F, 1.0 / 256);
+        }
+
        Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE);
        Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
        randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
@@ -96,6 +104,11 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
        anchor.x = anchor.y = -1;
        delta = randomDouble(-100, 100);

+        if (bitExact)
+        {
+            delta = (int)(delta * 256) / 256.0;
+        }
+
        UMAT_UPLOAD_INPUT_PARAMETER(src);
        UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
    }
@@ -110,7 +123,7 @@ OCL_TEST_P(SepFilter2D, Mat)
 {
    for (int j = 0; j < test_loop_times + 3; j++)
    {
-        random_roi();
+        random_roi(false);

        OCL_OFF(cv::sepFilter2D(src_roi, dst_roi, -1, kernelX, kernelY, anchor, delta, borderType));
        OCL_ON(cv::sepFilter2D(usrc_roi, udst_roi, -1, kernelX, kernelY, anchor, delta, borderType));
@@ -119,6 +132,22 @@ OCL_TEST_P(SepFilter2D, Mat)
    }
 }

+OCL_TEST_P(SepFilter2D, Mat_BitExact)  // runs sepFilter2D on Mat and UMat inputs using kernels/delta quantized to multiples of 1/256
+{
+    for (int j = 0; j < test_loop_times + 3; j++)
+    {
+        random_roi(true);  // bitExact=true: kernels and delta are rounded to multiples of 1/256
+
+        OCL_OFF(cv::sepFilter2D(src_roi, dst_roi, -1, kernelX, kernelY, anchor, delta, borderType));  // Mat input; NOTE(review): OCL_OFF presumably runs with OpenCL disabled
+        OCL_ON(cv::sepFilter2D(usrc_roi, udst_roi, -1, kernelX, kernelY, anchor, delta, borderType));  // UMat input; NOTE(review): OCL_ON presumably runs with OpenCL enabled
+
+        if (src_roi.depth() < CV_32F)  // depths below CV_32F use a zero threshold
+            Near(0.0);
+        else
+            Near(1e-3);  // CV_32F and above use a 1e-3 threshold
+    }
+}
+
 OCL_INSTANTIATE_TEST_CASE_P(ImageProc, SepFilter2D,
                            Combine(
                                Values(CV_8U, CV_32F),

--- a/modules/stitching/src/exposure_compensate.cpp
+++ b/modules/stitching/src/exposure_compensate.cpp
@@ -416,7 +416,11 @@ void BlocksCompensator::feed(const std::vector<Point> &corners, const std::vecto
            bl_idx += bl_per_img.width*bl_per_img.height;

            for (int i=0; i<nr_gain_filtering_iterations_; ++i)
-                sepFilter2D(gain_map, gain_map, CV_32F, ker, ker);
+            {
+                UMat tmp;
+                sepFilter2D(gain_map, tmp, CV_32F, ker, ker);
+                swap(gain_map, tmp);
+            }

            gain_maps_[img_idx] = gain_map;
        }

--- a/platforms/winpack_dldt/2020.4/20200413-dldt-pdb.patch
+++ b/platforms/winpack_dldt/2020.4/20200413-dldt-pdb.patch
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 1f981ed2..90eb500a 100644
+--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+@@ -22,6 +22,9 @@ endif()
+ 
+ project(OpenVINO)
+ 
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi /FS")
+set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
+
+ set(OpenVINO_MAIN_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+ set(IE_MAIN_SOURCE_DIR ${OpenVINO_MAIN_SOURCE_DIR}/inference-engine)
+ list(APPEND CMAKE_MODULE_PATH "${OpenVINO_MAIN_SOURCE_DIR}/cmake")
--- a/platforms/winpack_dldt/2020.4/20200604-dldt-disable-multidevice.patch
+++ b/platforms/winpack_dldt/2020.4/20200604-dldt-disable-multidevice.patch
+diff --git a/inference-engine/src/CMakeLists.txt b/inference-engine/src/CMakeLists.txt
+index 0ba0dd78..7d34e7cb 100644
+--- a/inference-engine/src/CMakeLists.txt
+++ b/inference-engine/src/CMakeLists.txt
+@@ -26,7 +26,7 @@ endif()
+ 
+ add_subdirectory(hetero_plugin)
+ 
+-add_subdirectory(multi_device)
+#add_subdirectory(multi_device)
+ 
+ add_subdirectory(transformations)
+ 
--- a/platforms/winpack_dldt/2020.4/20200701-dldt-disable-unused-targets.patch
+++ b/platforms/winpack_dldt/2020.4/20200701-dldt-disable-unused-targets.patch
+diff --git a/inference-engine/CMakeLists.txt b/inference-engine/CMakeLists.txt
+index 39ff413b..df4e89c7 100644
+--- a/inference-engine/CMakeLists.txt
+++ b/inference-engine/CMakeLists.txt
+@@ -66,7 +66,7 @@ if(ENABLE_TESTS)
+     add_subdirectory(tests)
+ endif()
+ 
+-add_subdirectory(tools)
+#add_subdirectory(tools)
+ 
+ function(ie_build_samples)
+     # samples should be build with the same flags as from OpenVINO package,
+@@ -85,7 +85,7 @@ endfunction()
+ 
+ # gflags and format_reader targets are kept inside of samples directory and
+ # they must be built even if samples build is disabled (required for tests and tools).
+-ie_build_samples()
+#ie_build_samples()
+ 
+ file(GLOB_RECURSE SAMPLES_SOURCES samples/*.cpp samples/*.hpp samples/*.h)
+ add_cpplint_target(sample_cpplint
+@@ -174,10 +174,10 @@ endif()
+ # Developer package
+ #
+ 
+-ie_developer_export_targets(format_reader)
+#ie_developer_export_targets(format_reader)
+ ie_developer_export_targets(${NGRAPH_LIBRARIES})
+ 
+-ie_developer_export()
+#ie_developer_export()
+ 
+ configure_file(
+     "${IE_MAIN_SOURCE_DIR}/cmake/developer_package_config.cmake.in"
+diff --git a/inference-engine/src/inference_engine/CMakeLists.txt b/inference-engine/src/inference_engine/CMakeLists.txt
+index 4ae0d560..e37acbe0 100644
+--- a/inference-engine/src/inference_engine/CMakeLists.txt
+++ b/inference-engine/src/inference_engine/CMakeLists.txt
+@@ -99,7 +99,7 @@ add_cpplint_target(${TARGET_NAME}_plugin_api_cpplint FOR_SOURCES ${plugin_api_sr
+ 
+ # Create common base object library
+ 
+-add_library(${TARGET_NAME}_common_obj OBJECT
+add_library(${TARGET_NAME}_common_obj OBJECT EXCLUDE_FROM_ALL
+             ${IE_BASE_SOURCE_FILES})
+ 
+ target_compile_definitions(${TARGET_NAME}_common_obj PRIVATE IMPLEMENT_INFERENCE_ENGINE_API)
+@@ -112,7 +112,7 @@ target_include_directories(${TARGET_NAME}_common_obj SYSTEM PRIVATE
+ 
+ # Create object library
+ 
+-add_library(${TARGET_NAME}_obj OBJECT
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
+             ${LIBRARY_SRC}
+             ${LIBRARY_HEADERS}
+             ${PUBLIC_HEADERS})
+@@ -177,7 +177,7 @@ ie_register_plugins(MAIN_TARGET ${TARGET_NAME}
+ 
+ # Static library used for unit tests which are always built
+ 
+-add_library(${TARGET_NAME}_s STATIC
+add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL
+             $<TARGET_OBJECTS:${TARGET_NAME}_obj>
+             $<TARGET_OBJECTS:${TARGET_NAME}_common_obj>
+             $<TARGET_OBJECTS:${TARGET_NAME}_legacy_obj>
+diff --git a/inference-engine/src/legacy_api/CMakeLists.txt b/inference-engine/src/legacy_api/CMakeLists.txt
+index 85524310..ed27e058 100644
+--- a/inference-engine/src/legacy_api/CMakeLists.txt
+++ b/inference-engine/src/legacy_api/CMakeLists.txt
+@@ -21,7 +21,7 @@ source_group("include" FILES ${PUBLIC_HEADERS})
+ 
+ # Create object library
+ 
+-add_library(${TARGET_NAME}_obj OBJECT
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
+             ${LIBRARY_SRC}
+             ${PUBLIC_HEADERS})
+ 
+diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+index 297783da..06da35c3 100644
+--- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+@@ -192,7 +192,7 @@ cross_compiled_file(${TARGET_NAME}
+ 
+ #  add test object library
+ 
+-add_library(${TARGET_NAME}_obj OBJECT ${SOURCES} ${HEADERS})
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL ${SOURCES} ${HEADERS})
+ 
+ target_include_directories(${TARGET_NAME}_obj PRIVATE $<TARGET_PROPERTY:inference_engine_preproc_s,INTERFACE_INCLUDE_DIRECTORIES>
+                                                       $<TARGET_PROPERTY:inference_engine_lp_transformations,INTERFACE_INCLUDE_DIRECTORIES>
+diff --git a/inference-engine/src/preprocessing/CMakeLists.txt b/inference-engine/src/preprocessing/CMakeLists.txt
+index adc52f06..6b7d0ffe 100644
+--- a/inference-engine/src/preprocessing/CMakeLists.txt
+++ b/inference-engine/src/preprocessing/CMakeLists.txt
+@@ -124,7 +124,7 @@ endif()
+ 
+ # Create object library
+ 
+-add_library(${TARGET_NAME}_obj OBJECT
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
+             ${LIBRARY_SRC}
+             ${LIBRARY_HEADERS})
+ 
+@@ -183,7 +183,7 @@ add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME}
+ 
+ # Static library used for unit tests which are always built
+ 
+-add_library(${TARGET_NAME}_s STATIC
+add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL
+             $<TARGET_OBJECTS:${TARGET_NAME}_obj>)
+ 
+ set_ie_threading_interface_for(${TARGET_NAME}_s)
+diff --git a/inference-engine/src/vpu/common/CMakeLists.txt b/inference-engine/src/vpu/common/CMakeLists.txt
+index 43e9308f..2e40dd31 100644
+--- a/inference-engine/src/vpu/common/CMakeLists.txt
+++ b/inference-engine/src/vpu/common/CMakeLists.txt
+@@ -55,7 +55,7 @@ add_common_target("vpu_common_lib" FALSE)
+ 
+ # Unit tests support for graph transformer
+ if(WIN32)
+-    add_common_target("vpu_common_lib_test_static" TRUE)
+    #add_common_target("vpu_common_lib_test_static" TRUE)
+ else()
+     add_library("vpu_common_lib_test_static" ALIAS "vpu_common_lib")
+ endif()
+diff --git a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
+index 982d3c7f..15fcf3e8 100644
+--- a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
+++ b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
+@@ -64,7 +64,7 @@ add_graph_transformer_target("vpu_graph_transformer" FALSE)
+ 
+ # Unit tests support for graph transformer
+ if(WIN32)
+-    add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE)
+    #add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE)
+ else()
+     add_library("vpu_graph_transformer_test_static" ALIAS "vpu_graph_transformer")
+ endif()
+diff --git a/inference-engine/thirdparty/CMakeLists.txt b/inference-engine/thirdparty/CMakeLists.txt
+index f94453e0..c80e75c5 100644
+--- a/inference-engine/thirdparty/CMakeLists.txt
+++ b/inference-engine/thirdparty/CMakeLists.txt
+@@ -43,13 +43,13 @@ function(build_with_lto)
+     endfunction()
+ 
+     ie_build_pugixml()
+-    add_subdirectory(stb_lib)
+    #add_subdirectory(stb_lib)
+     add_subdirectory(ade)
+     add_subdirectory(fluid/modules/gapi)
+ 
+     target_include_directories(pugixml INTERFACE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/pugixml/src>")
+ 
+-    set_target_properties(pugixml ade fluid stb_image
+    set_target_properties(pugixml ade fluid
+                           PROPERTIES FOLDER thirdparty)
+ 
+     # developer package
+diff --git a/inference-engine/thirdparty/pugixml/CMakeLists.txt b/inference-engine/thirdparty/pugixml/CMakeLists.txt
+index 8bcb2801..380fb468 100644
+--- a/inference-engine/thirdparty/pugixml/CMakeLists.txt
+++ b/inference-engine/thirdparty/pugixml/CMakeLists.txt
+@@ -41,7 +41,7 @@ if(BUILD_SHARED_LIBS)
+ else()
+ 	add_library(pugixml STATIC ${SOURCES})
+ 	if (MSVC)
+-		add_library(pugixml_mt STATIC ${SOURCES})
+               #add_library(pugixml_mt STATIC ${SOURCES})
+ 		#if (WIN32)
+ 		#	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
+ 		#	set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
--- a/platforms/winpack_dldt/2020.4/patch.config.py
+++ b/platforms/winpack_dldt/2020.4/patch.config.py
+applyPatch('20200701-dldt-disable-unused-targets.patch')  # comments out tools/samples/developer-export and marks test-only object/static libraries EXCLUDE_FROM_ALL
+applyPatch('20200413-dldt-pdb.patch')  # adds /Zi /FS and /DEBUG /OPT:REF /OPT:ICF to the Release compiler/linker flags
+applyPatch('20200604-dldt-disable-multidevice.patch')  # comments out add_subdirectory(multi_device)
--- a/platforms/winpack_dldt/2020.4/sysroot.config.py
+++ b/platforms/winpack_dldt/2020.4/sysroot.config.py
+sysroot_bin_dir = prepare_dir(self.sysrootdir / 'bin')  # NOTE(review): `self`, prepare_dir and copytree are not defined in this file; presumably supplied by build_package.py when it loads this config
+copytree(self.build_dir / 'install', self.sysrootdir / 'ngraph')  # the build tree's install/ directory becomes sysroot/ngraph
+#rm_one(self.sysrootdir / 'ngraph' / 'lib' / 'ngraph.dll')
+
+build_config = 'Release' if not self.config.build_debug else 'Debug'
+build_bin_dir = self.build_dir / 'bin' / 'intel64' / build_config  # per-configuration output directory the copy helpers below read from
+
+def copy_bin(name):  # copy one entry called `name` from build_bin_dir to sysroot_bin_dir under the same name
+    global build_bin_dir, sysroot_bin_dir  # NOTE(review): presumably needed because of how this config is executed; confirm before removing
+    copytree(build_bin_dir / name, sysroot_bin_dir / name)
+
+dll_suffix = 'd' if self.config.build_debug else ''  # debug builds use a trailing 'd' in library base names
+def copy_dll(name):  # copy <name><suffix>.dll together with its matching .pdb via copy_bin
+    global copy_bin, dll_suffix
+    copy_bin(name + dll_suffix + '.dll')
+    copy_bin(name + dll_suffix + '.pdb')  # the .pdb is copied unconditionally, for both Release and Debug
+
+copy_bin('cache.json')  # this section copies plugin DLLs and their data files from the build output into sysroot/bin
+copy_dll('clDNNPlugin')
+copy_dll('HeteroPlugin')
+copy_dll('inference_engine')
+copy_dll('inference_engine_ir_reader')
+copy_dll('inference_engine_legacy')
+copy_dll('inference_engine_transformations')  # runtime
+copy_dll('inference_engine_lp_transformations')  # runtime
+copy_dll('MKLDNNPlugin')  # runtime
+copy_dll('myriadPlugin')  # runtime
+#copy_dll('MultiDevicePlugin')  # runtime, not used
+copy_dll('ngraph')
+copy_bin('plugins.xml')
+copytree(self.build_dir / 'bin' / 'intel64' / 'pcie-ma248x.elf', sysroot_bin_dir / 'pcie-ma248x.elf')  # read from bin/intel64 directly, not the per-config subdirectory; presumably device firmware (TODO confirm)
+copytree(self.build_dir / 'bin' / 'intel64' / 'usb-ma2x8x.mvcmd', sysroot_bin_dir / 'usb-ma2x8x.mvcmd')
+copytree(self.build_dir / 'bin' / 'intel64' / 'usb-ma2450.mvcmd', sysroot_bin_dir / 'usb-ma2450.mvcmd')
+
+copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb' / 'bin', sysroot_bin_dir)  # TBB's bin/ contents go next to the other binaries
+copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb', self.sysrootdir / 'tbb')  # the whole TBB tree is also kept under sysroot/tbb
+
+sysroot_ie_dir = prepare_dir(self.sysrootdir / 'deployment_tools' / 'inference_engine')
+sysroot_ie_lib_dir = prepare_dir(sysroot_ie_dir / 'lib' / 'intel64')
+
+copytree(self.srcdir / 'inference-engine' / 'include', sysroot_ie_dir / 'include')  # headers come from the source tree, not the build tree
+if not self.config.build_debug:  # Release: .lib files without a suffix
+    copytree(self.build_dir / 'install' / 'lib' / 'ngraph.lib', sysroot_ie_lib_dir / 'ngraph.lib')
+    copytree(build_bin_dir / 'inference_engine.lib', sysroot_ie_lib_dir / 'inference_engine.lib')
+    copytree(build_bin_dir / 'inference_engine_ir_reader.lib', sysroot_ie_lib_dir / 'inference_engine_ir_reader.lib')
+    copytree(build_bin_dir / 'inference_engine_legacy.lib', sysroot_ie_lib_dir / 'inference_engine_legacy.lib')
+else:  # Debug: same libraries with a 'd' appended to the base name
+    copytree(self.build_dir / 'install' / 'lib' / 'ngraphd.lib', sysroot_ie_lib_dir / 'ngraphd.lib')
+    copytree(build_bin_dir / 'inference_engined.lib', sysroot_ie_lib_dir / 'inference_engined.lib')
+    copytree(build_bin_dir / 'inference_engine_ir_readerd.lib', sysroot_ie_lib_dir / 'inference_engine_ir_readerd.lib')
+    copytree(build_bin_dir / 'inference_engine_legacyd.lib', sysroot_ie_lib_dir / 'inference_engine_legacyd.lib')
+
+sysroot_license_dir = prepare_dir(self.sysrootdir / 'etc' / 'licenses')  # license files are gathered under sysroot/etc/licenses with per-component names
+copytree(self.srcdir / 'LICENSE', sysroot_license_dir / 'dldt-LICENSE')
+copytree(self.srcdir / 'ngraph/LICENSE', sysroot_license_dir / 'ngraph-LICENSE')
+copytree(self.sysrootdir / 'tbb/LICENSE', sysroot_license_dir / 'tbb-LICENSE')  # read from the sysroot/tbb copy made earlier in this script, so that copy must come first
--- a/platforms/winpack_dldt/build_package.py
+++ b/platforms/winpack_dldt/build_package.py
@@ -133,9 +133,10 @@ def git_checkout(dst, url, branch, revision, clone_extra_args, noFetch=False):
                (['-b', branch] if branch else []) +
                clone_extra_args + [url, '.'], cwd=dst)
    else:
-        execute(cmd=['git', 'fetch', 'origin'] + ([branch] if branch else []), cwd=dst)
+        execute(cmd=['git', 'fetch', 'origin'] + ([branch + ':' + branch] if branch else []), cwd=dst)  # refspec '<branch>:<branch>' also updates the local branch of the same name
    execute(cmd=['git', 'reset', '--hard'], cwd=dst)
-    execute(cmd=['git', 'checkout', '-B', 'winpack_dldt', revision], cwd=dst)
+    execute(cmd=['git', 'clean', '-f', '-d'], cwd=dst)  # remove untracked files and directories before switching revisions
+    execute(cmd=['git', 'checkout', '--force', '-B', 'winpack_dldt', revision], cwd=dst)  # create or reset branch 'winpack_dldt' at revision, discarding local changes
    execute(cmd=['git', 'clean', '-f', '-d'], cwd=dst)
    execute(cmd=['git', 'submodule', 'init'], cwd=dst)
    execute(cmd=['git', 'submodule', 'update', '--force', '--depth=1000'], cwd=dst)
@@ -149,6 +150,7 @@ def git_apply_patch(src_dir, patch_file):
    patch_file = str(patch_file)  # Python 3.5 may not handle Path
    assert os.path.exists(patch_file), patch_file
    execute(cmd=['git', 'apply', '--3way', '-v', '--ignore-space-change', str(patch_file)], cwd=src_dir)
+    execute(cmd=['git', 'diff', 'HEAD'], cwd=src_dir)  # show what the patch changed relative to HEAD (presumably for the build log)


 #===================================================================================================
@@ -186,6 +188,17 @@ class BuilderDLDT:
        self.build_dir = prepare_dir(self.outdir / 'build', clean=self.config.clean_dldt)
        self.sysrootdir = prepare_dir(self.outdir / 'sysroot', clean=self.config.clean_dldt)

+        subst_drive = self.config.build_subst_drive
+        if subst_drive:
+            subst_root = subst_drive + ':'  # drive spec that stands in for the outdir prefix below
+            if os.path.exists(subst_root + '\\'):
+                execute(['subst', subst_root, '/D'])  # drop an existing mapping before re-creating it
+            execute(['subst', subst_root, str(self.outdir)])
+            self.srcdir, self.build_dir, self.sysrootdir = (
+                Path(str(p).replace(str(self.outdir), subst_root))
+                for p in (self.srcdir, self.build_dir, self.sysrootdir))
+
+
    def init_patchset(self):
        cpath = self.cpath
        self.patch_file = str(cpath / 'patch.config.py')  # Python 3.5 may not handle Path
@@ -255,12 +268,14 @@ class BuilderDLDT:
            BUILD_TESTS='OFF',
            ENABLE_OPENCV='OFF',
            ENABLE_GNA='OFF',
+            ENABLE_SPEECH_DEMO='OFF',  # 2020.4+
            NGRAPH_DOC_BUILD_ENABLE='OFF',
            NGRAPH_UNIT_TEST_ENABLE='OFF',
            NGRAPH_UNIT_TEST_OPENVINO_ENABLE='OFF',
            NGRAPH_TEST_UTIL_ENABLE='OFF',
            NGRAPH_ONNX_IMPORT_ENABLE='OFF',
            CMAKE_INSTALL_PREFIX=str(self.build_dir / 'install'),
+            OUTPUT_ROOT=str(self.build_dir),  # 2020.4+
        )

        cmd += [ '-D%s=%s' % (k, v) for (k, v) in cmake_vars.items() if v is not None]
@@ -270,14 +285,6 @@ class BuilderDLDT:
        cmd.append(str(self.srcdir))

        build_dir = self.build_dir
-        if self.config.build_subst_drive:
-            if os.path.exists(self.config.build_subst_drive + ':\\'):
-                execute(['subst', self.config.build_subst_drive + ':', '/D'])
-            def fix_path(p):
-                return str(p).replace(str(self.outdir), self.config.build_subst_drive + ':')
-            execute(['subst', self.config.build_subst_drive + ':', str(self.outdir)])
-            cmd = [fix_path(c) for c in cmd]
-            build_dir = Path(fix_path(build_dir))
        try:
            execute(cmd, cwd=build_dir)

@@ -291,8 +298,6 @@ class BuilderDLDT:
            cmd = [self.cmake_path, '-DBUILD_TYPE=' + build_config, '-P', 'cmake_install.cmake']
            execute(cmd, cwd=build_dir / 'ngraph')
        except:
-            if self.config.build_subst_drive:
-                execute(['subst', self.config.build_subst_drive + ':', '/D'])
            raise

        log.info('DLDT build completed')
@@ -307,6 +312,11 @@ class BuilderDLDT:
        log.info('DLDT sysroot preparation completed')


+    def cleanup(self):  # Remove the SUBST drive mapping; no-op when none is configured or the drive is already gone
+        if self.config.build_subst_drive and os.path.exists(self.config.build_subst_drive + ':\\'):  # same guard as the setup code, so a failed delete cannot hide the caller's original error
+            execute(['subst', self.config.build_subst_drive + ':', '/D'])
+
+
 #===================================================================================================

 class Builder:
@@ -466,7 +476,7 @@ def main():
    parser.add_argument('--dldt_reference_dir', help='DLDT reference git repository (optional)')
    parser.add_argument('--dldt_src_dir', help='DLDT custom source repository (skip git checkout and patching, use for TESTING only)')

-    parser.add_argument('--dldt_config', help='Specify DLDT build configuration (defaults to DLDT commit)')
+    parser.add_argument('--dldt_config', help='Specify DLDT build configuration (defaults to evaluate from DLDT commit/branch)')

    args = parser.parse_args()

@@ -492,7 +502,10 @@ def main():
        args.opencv_dir = os.path.abspath(args.opencv_dir)

    if not args.dldt_config:
-        args.dldt_config = args.dldt_src_commit
+        # No explicit config given: the 2020/4 release is recognized by commit or branch name,
+        # anything else keeps using the commit value as the configuration name.
+        release_2020_4 = 'releases/2020/4' in (args.dldt_src_commit, args.dldt_src_branch)
+        args.dldt_config = '2020.4' if release_2020_4 else args.dldt_src_commit

    _opencv_dir = check_dir(args.opencv_dir)
    _outdir = prepare_dir(args.output_dir)
@@ -504,14 +517,18 @@ def main():

    builder_dldt = BuilderDLDT(args)

-    builder_dldt.prepare_sources()
-    builder_dldt.build()
-    builder_dldt.make_sysroot()
-
-    builder_opencv = Builder(args)
-    builder_opencv.build(builder_dldt)
-    builder_opencv.copy_sysroot(builder_dldt)
-    builder_opencv.package_sources()
+    try:  # on any failure below, cleanup() is called before the error propagates
+        builder_dldt.prepare_sources()
+        builder_dldt.build()
+        builder_dldt.make_sysroot()
+
+        builder_opencv = Builder(args)
+        builder_opencv.build(builder_dldt)
+        builder_opencv.copy_sysroot(builder_dldt)
+        builder_opencv.package_sources()
+    except:  # bare except also covers KeyboardInterrupt/SystemExit; the exception is re-raised unchanged
+        builder_dldt.cleanup()  # NOTE(review): only reached on failure - after a successful run the SUBST mapping stays in place; confirm this is intended
+        raise

    log.info("=====")
    log.info("===== Build finished")