提交 44d473fb 编写于 作者: A Alexander Alekhin

Merge remote-tracking branch 'upstream/3.4' into merge-3.4

......@@ -2744,8 +2744,7 @@ struct Net::Impl : public detail::NetImplBase
// (and so we eliminate the concatenation layer, because the channels
// are concatenated implicitly).
Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>();
if( !concatLayer.empty() && concatLayer->axis == 1 && !concatLayer->padding &&
ld.outputBlobs.size() == 1 )
if( !concatLayer.empty() && !concatLayer->padding && ld.outputBlobs.size() == 1 )
{
Mat& output = ld.outputBlobs[0];
UMat umat_output;
......@@ -2782,7 +2781,8 @@ struct Net::Impl : public detail::NetImplBase
// the concatenation optimization is applied with batch_size > 1.
// so, for now, we only apply this optimization in the most popular
// case batch_size == 1.
if( output.dims == 4 && output.size[0] == 1 )
int axis = clamp(concatLayer->axis, output.dims);
if( output.total(0, axis) == 1 )
{
size_t i, ninputs = ld.inputBlobsId.size();
std::vector<LayerPin> realinputs(ninputs);
......@@ -2836,18 +2836,20 @@ struct Net::Impl : public detail::NetImplBase
OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
}
#endif
#ifdef HAVE_CUDA
if (preferableBackend == DNN_BACKEND_CUDA)
ld.outputBlobsWrappers[0] = wrap(output);
#endif
Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
std::vector<Range> chrange(output.dims, Range::all());
int ofs = 0;
for( i = 0; i < ninputs; i++ )
{
LayerPin pin = realinputs[i];
LayerData* inp_i_data = &layers[pin.lid];
int channels_i = ld.inputBlobs[i]->size[1];
chrange[1] = Range(ofs, ofs + channels_i);
int channels_i = ld.inputBlobs[i]->size[axis];
chrange[axis] = Range(ofs, ofs + channels_i);
printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(),
pin.oid, ofs, ofs + channels_i));
ofs += channels_i;
......
......@@ -167,9 +167,23 @@ float calcOrientationHist(
int i, j, k, len = (radius*2+1)*(radius*2+1);
float expf_scale = -1.f/(2.f * sigma * sigma);
#if CV_SIMD
AutoBuffer<float> bufX(len + v_float32::nlanes);
AutoBuffer<float> bufY(len + v_float32::nlanes);
AutoBuffer<float> bufO(len + v_float32::nlanes);
AutoBuffer<float> bufW(len + v_float32::nlanes);
AutoBuffer<float> bufT(n+4 + v_float32::nlanes);
float *X = alignPtr(bufX.data(), CV_SIMD_WIDTH);
float *Y = alignPtr(bufY.data(), CV_SIMD_WIDTH);
float *Mag = X;
float *Ori = alignPtr(bufO.data(), CV_SIMD_WIDTH);
float *W = alignPtr(bufW.data(), CV_SIMD_WIDTH);
float *temphist = alignPtr(bufT.data(), CV_SIMD_WIDTH)+2;
#else
AutoBuffer<float> buf(len*4 + n+4);
float *X = buf.data(), *Y = X + len, *Mag = X, *Ori = Y + len, *W = Ori + len;
float* temphist = W + len + 2;
#endif
for( i = 0; i < n; i++ )
temphist[i] = 0.f;
......@@ -201,32 +215,29 @@ float calcOrientationHist(
cv::hal::magnitude32f(X, Y, Mag, len);
k = 0;
#if CV_AVX2
#if CV_SIMD
const int vecsize = v_float32::nlanes;
v_float32 nd360 = vx_setall_f32(n/360.f);
v_int32 __n = vx_setall_s32(n);
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) bin_buf[vecsize];
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) w_mul_mag_buf[vecsize];
for( ; k <= len - vecsize; k += vecsize )
{
__m256 __nd360 = _mm256_set1_ps(n/360.f);
__m256i __n = _mm256_set1_epi32(n);
int CV_DECL_ALIGNED(32) bin_buf[8];
float CV_DECL_ALIGNED(32) w_mul_mag_buf[8];
for ( ; k <= len - 8; k+=8 )
v_float32 w = vx_load_aligned( W + k );
v_float32 mag = vx_load_aligned( Mag + k );
v_float32 ori = vx_load_aligned( Ori + k );
v_int32 bin = v_round( nd360 * ori );
bin = v_select(bin >= __n, bin - __n, bin);
bin = v_select(bin < vx_setzero_s32(), bin + __n, bin);
w = w * mag;
v_store_aligned(bin_buf, bin);
v_store_aligned(w_mul_mag_buf, w);
for(int vi = 0; vi < vecsize; vi++)
{
__m256i __bin = _mm256_cvtps_epi32(_mm256_mul_ps(__nd360, _mm256_loadu_ps(&Ori[k])));
__bin = _mm256_sub_epi32(__bin, _mm256_andnot_si256(_mm256_cmpgt_epi32(__n, __bin), __n));
__bin = _mm256_add_epi32(__bin, _mm256_and_si256(__n, _mm256_cmpgt_epi32(_mm256_setzero_si256(), __bin)));
__m256 __w_mul_mag = _mm256_mul_ps(_mm256_loadu_ps(&W[k]), _mm256_loadu_ps(&Mag[k]));
_mm256_store_si256((__m256i *) bin_buf, __bin);
_mm256_store_ps(w_mul_mag_buf, __w_mul_mag);
temphist[bin_buf[0]] += w_mul_mag_buf[0];
temphist[bin_buf[1]] += w_mul_mag_buf[1];
temphist[bin_buf[2]] += w_mul_mag_buf[2];
temphist[bin_buf[3]] += w_mul_mag_buf[3];
temphist[bin_buf[4]] += w_mul_mag_buf[4];
temphist[bin_buf[5]] += w_mul_mag_buf[5];
temphist[bin_buf[6]] += w_mul_mag_buf[6];
temphist[bin_buf[7]] += w_mul_mag_buf[7];
temphist[bin_buf[vi]] += w_mul_mag_buf[vi];
}
}
#endif
......@@ -247,34 +258,20 @@ float calcOrientationHist(
temphist[n+1] = temphist[1];
i = 0;
#if CV_AVX2
#if CV_SIMD
v_float32 d_1_16 = vx_setall_f32(1.f/16.f);
v_float32 d_4_16 = vx_setall_f32(4.f/16.f);
v_float32 d_6_16 = vx_setall_f32(6.f/16.f);
for( ; i <= n - v_float32::nlanes; i += v_float32::nlanes )
{
__m256 __d_1_16 = _mm256_set1_ps(1.f/16.f);
__m256 __d_4_16 = _mm256_set1_ps(4.f/16.f);
__m256 __d_6_16 = _mm256_set1_ps(6.f/16.f);
for( ; i <= n - 8; i+=8 )
{
#if CV_FMA3
__m256 __hist = _mm256_fmadd_ps(
_mm256_add_ps(_mm256_loadu_ps(&temphist[i-2]), _mm256_loadu_ps(&temphist[i+2])),
__d_1_16,
_mm256_fmadd_ps(
_mm256_add_ps(_mm256_loadu_ps(&temphist[i-1]), _mm256_loadu_ps(&temphist[i+1])),
__d_4_16,
_mm256_mul_ps(_mm256_loadu_ps(&temphist[i]), __d_6_16)));
#else
__m256 __hist = _mm256_add_ps(
_mm256_mul_ps(
_mm256_add_ps(_mm256_loadu_ps(&temphist[i-2]), _mm256_loadu_ps(&temphist[i+2])),
__d_1_16),
_mm256_add_ps(
_mm256_mul_ps(
_mm256_add_ps(_mm256_loadu_ps(&temphist[i-1]), _mm256_loadu_ps(&temphist[i+1])),
__d_4_16),
_mm256_mul_ps(_mm256_loadu_ps(&temphist[i]), __d_6_16)));
#endif
_mm256_storeu_ps(&hist[i], __hist);
}
v_float32 tn2 = vx_load_aligned(temphist + i-2);
v_float32 tn1 = vx_load(temphist + i-1);
v_float32 t0 = vx_load(temphist + i);
v_float32 t1 = vx_load(temphist + i+1);
v_float32 t2 = vx_load(temphist + i+2);
v_float32 _hist = v_fma(tn2 + t2, d_1_16,
v_fma(tn1 + t1, d_4_16, t0 * d_6_16));
v_store(hist + i, _hist);
}
#endif
for( ; i < n; i++ )
......@@ -623,91 +620,65 @@ void calcSIFTDescriptor(
cv::hal::exp32f(W, W, len);
k = 0;
#if CV_AVX2
#if CV_SIMD
{
int CV_DECL_ALIGNED(32) idx_buf[8];
float CV_DECL_ALIGNED(32) rco_buf[64];
const __m256 __ori = _mm256_set1_ps(ori);
const __m256 __bins_per_rad = _mm256_set1_ps(bins_per_rad);
const __m256i __n = _mm256_set1_epi32(n);
for( ; k <= len - 8; k+=8 )
const int vecsize = v_float32::nlanes;
int CV_DECL_ALIGNED(CV_SIMD_WIDTH) idx_buf[vecsize];
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rco_buf[8*vecsize];
const v_float32 __ori = vx_setall_f32(ori);
const v_float32 __bins_per_rad = vx_setall_f32(bins_per_rad);
const v_int32 __n = vx_setall_s32(n);
const v_int32 __1 = vx_setall_s32(1);
const v_int32 __d_plus_2 = vx_setall_s32(d+2);
const v_int32 __n_plus_2 = vx_setall_s32(n+2);
for( ; k <= len - vecsize; k += vecsize )
{
__m256 __rbin = _mm256_loadu_ps(&RBin[k]);
__m256 __cbin = _mm256_loadu_ps(&CBin[k]);
__m256 __obin = _mm256_mul_ps(_mm256_sub_ps(_mm256_loadu_ps(&Ori[k]), __ori), __bins_per_rad);
__m256 __mag = _mm256_mul_ps(_mm256_loadu_ps(&Mag[k]), _mm256_loadu_ps(&W[k]));
__m256 __r0 = _mm256_floor_ps(__rbin);
__rbin = _mm256_sub_ps(__rbin, __r0);
__m256 __c0 = _mm256_floor_ps(__cbin);
__cbin = _mm256_sub_ps(__cbin, __c0);
__m256 __o0 = _mm256_floor_ps(__obin);
__obin = _mm256_sub_ps(__obin, __o0);
__m256i __o0i = _mm256_cvtps_epi32(__o0);
__o0i = _mm256_add_epi32(__o0i, _mm256_and_si256(__n, _mm256_cmpgt_epi32(_mm256_setzero_si256(), __o0i)));
__o0i = _mm256_sub_epi32(__o0i, _mm256_andnot_si256(_mm256_cmpgt_epi32(__n, __o0i), __n));
__m256 __v_r1 = _mm256_mul_ps(__mag, __rbin);
__m256 __v_r0 = _mm256_sub_ps(__mag, __v_r1);
__m256 __v_rc11 = _mm256_mul_ps(__v_r1, __cbin);
__m256 __v_rc10 = _mm256_sub_ps(__v_r1, __v_rc11);
__m256 __v_rc01 = _mm256_mul_ps(__v_r0, __cbin);
__m256 __v_rc00 = _mm256_sub_ps(__v_r0, __v_rc01);
__m256 __v_rco111 = _mm256_mul_ps(__v_rc11, __obin);
__m256 __v_rco110 = _mm256_sub_ps(__v_rc11, __v_rco111);
__m256 __v_rco101 = _mm256_mul_ps(__v_rc10, __obin);
__m256 __v_rco100 = _mm256_sub_ps(__v_rc10, __v_rco101);
__m256 __v_rco011 = _mm256_mul_ps(__v_rc01, __obin);
__m256 __v_rco010 = _mm256_sub_ps(__v_rc01, __v_rco011);
__m256 __v_rco001 = _mm256_mul_ps(__v_rc00, __obin);
__m256 __v_rco000 = _mm256_sub_ps(__v_rc00, __v_rco001);
__m256i __one = _mm256_set1_epi32(1);
__m256i __idx = _mm256_add_epi32(
_mm256_mullo_epi32(
_mm256_add_epi32(
_mm256_mullo_epi32(_mm256_add_epi32(_mm256_cvtps_epi32(__r0), __one), _mm256_set1_epi32(d + 2)),
_mm256_add_epi32(_mm256_cvtps_epi32(__c0), __one)),
_mm256_set1_epi32(n + 2)),
__o0i);
_mm256_store_si256((__m256i *)idx_buf, __idx);
_mm256_store_ps(&(rco_buf[0]), __v_rco000);
_mm256_store_ps(&(rco_buf[8]), __v_rco001);
_mm256_store_ps(&(rco_buf[16]), __v_rco010);
_mm256_store_ps(&(rco_buf[24]), __v_rco011);
_mm256_store_ps(&(rco_buf[32]), __v_rco100);
_mm256_store_ps(&(rco_buf[40]), __v_rco101);
_mm256_store_ps(&(rco_buf[48]), __v_rco110);
_mm256_store_ps(&(rco_buf[56]), __v_rco111);
#define HIST_SUM_HELPER(id) \
hist[idx_buf[(id)]] += rco_buf[(id)]; \
hist[idx_buf[(id)]+1] += rco_buf[8 + (id)]; \
hist[idx_buf[(id)]+(n+2)] += rco_buf[16 + (id)]; \
hist[idx_buf[(id)]+(n+3)] += rco_buf[24 + (id)]; \
hist[idx_buf[(id)]+(d+2)*(n+2)] += rco_buf[32 + (id)]; \
hist[idx_buf[(id)]+(d+2)*(n+2)+1] += rco_buf[40 + (id)]; \
hist[idx_buf[(id)]+(d+3)*(n+2)] += rco_buf[48 + (id)]; \
hist[idx_buf[(id)]+(d+3)*(n+2)+1] += rco_buf[56 + (id)];
HIST_SUM_HELPER(0);
HIST_SUM_HELPER(1);
HIST_SUM_HELPER(2);
HIST_SUM_HELPER(3);
HIST_SUM_HELPER(4);
HIST_SUM_HELPER(5);
HIST_SUM_HELPER(6);
HIST_SUM_HELPER(7);
#undef HIST_SUM_HELPER
v_float32 rbin = vx_load(RBin + k);
v_float32 cbin = vx_load(CBin + k);
v_float32 obin = (vx_load(Ori + k) - __ori) * __bins_per_rad;
v_float32 mag = vx_load(Mag + k) * vx_load(W + k);
v_int32 r0 = v_floor(rbin);
v_int32 c0 = v_floor(cbin);
v_int32 o0 = v_floor(obin);
rbin -= v_cvt_f32(r0);
cbin -= v_cvt_f32(c0);
obin -= v_cvt_f32(o0);
o0 = v_select(o0 < vx_setzero_s32(), o0 + __n, o0);
o0 = v_select(o0 >= __n, o0 - __n, o0);
v_float32 v_r1 = mag*rbin, v_r0 = mag - v_r1;
v_float32 v_rc11 = v_r1*cbin, v_rc10 = v_r1 - v_rc11;
v_float32 v_rc01 = v_r0*cbin, v_rc00 = v_r0 - v_rc01;
v_float32 v_rco111 = v_rc11*obin, v_rco110 = v_rc11 - v_rco111;
v_float32 v_rco101 = v_rc10*obin, v_rco100 = v_rc10 - v_rco101;
v_float32 v_rco011 = v_rc01*obin, v_rco010 = v_rc01 - v_rco011;
v_float32 v_rco001 = v_rc00*obin, v_rco000 = v_rc00 - v_rco001;
v_int32 idx = v_fma(v_fma(r0+__1, __d_plus_2, c0+__1), __n_plus_2, o0);
v_store_aligned(idx_buf, idx);
v_store_aligned(rco_buf, v_rco000);
v_store_aligned(rco_buf+vecsize, v_rco001);
v_store_aligned(rco_buf+vecsize*2, v_rco010);
v_store_aligned(rco_buf+vecsize*3, v_rco011);
v_store_aligned(rco_buf+vecsize*4, v_rco100);
v_store_aligned(rco_buf+vecsize*5, v_rco101);
v_store_aligned(rco_buf+vecsize*6, v_rco110);
v_store_aligned(rco_buf+vecsize*7, v_rco111);
for(int id = 0; id < vecsize; id++)
{
hist[idx_buf[id]] += rco_buf[id];
hist[idx_buf[id]+1] += rco_buf[vecsize + id];
hist[idx_buf[id]+(n+2)] += rco_buf[2*vecsize + id];
hist[idx_buf[id]+(n+3)] += rco_buf[3*vecsize + id];
hist[idx_buf[id]+(d+2)*(n+2)] += rco_buf[4*vecsize + id];
hist[idx_buf[id]+(d+2)*(n+2)+1] += rco_buf[5*vecsize + id];
hist[idx_buf[id]+(d+3)*(n+2)] += rco_buf[6*vecsize + id];
hist[idx_buf[id]+(d+3)*(n+2)+1] += rco_buf[7*vecsize + id];
}
}
}
#endif
......@@ -766,23 +737,16 @@ void calcSIFTDescriptor(
float nrm2 = 0;
len = d*d*n;
k = 0;
#if CV_AVX2
#if CV_SIMD
{
float CV_DECL_ALIGNED(32) nrm2_buf[8];
__m256 __nrm2 = _mm256_setzero_ps();
__m256 __dst;
for( ; k <= len - 8; k += 8 )
v_float32 __nrm2 = vx_setzero_f32();
v_float32 __dst;
for( ; k <= len - v_float32::nlanes; k += v_float32::nlanes )
{
__dst = _mm256_loadu_ps(&dst[k]);
#if CV_FMA3
__nrm2 = _mm256_fmadd_ps(__dst, __dst, __nrm2);
#else
__nrm2 = _mm256_add_ps(__nrm2, _mm256_mul_ps(__dst, __dst));
#endif
__dst = vx_load(dst + k);
__nrm2 = v_fma(__dst, __dst, __nrm2);
}
_mm256_store_ps(nrm2_buf, __nrm2);
nrm2 = nrm2_buf[0] + nrm2_buf[1] + nrm2_buf[2] + nrm2_buf[3] +
nrm2_buf[4] + nrm2_buf[5] + nrm2_buf[6] + nrm2_buf[7];
nrm2 = (float)v_reduce_sum(__nrm2);
}
#endif
for( ; k < len; k++ )
......@@ -795,7 +759,7 @@ void calcSIFTDescriptor(
// This code cannot be enabled because it sums nrm2 in a different order,
// thus producing slightly different results
{
float CV_DECL_ALIGNED(32) nrm2_buf[8];
float CV_DECL_ALIGNED(CV_SIMD_WIDTH) nrm2_buf[8];
__m256 __dst;
__m256 __nrm2 = _mm256_setzero_ps();
__m256 __thr = _mm256_set1_ps(thr);
......@@ -825,17 +789,17 @@ void calcSIFTDescriptor(
#if 1
k = 0;
#if CV_AVX2
#if CV_SIMD
{
__m256 __dst;
__m256 __min = _mm256_setzero_ps();
__m256 __max = _mm256_set1_ps(255.0f); // max of uchar
__m256 __nrm2 = _mm256_set1_ps(nrm2);
for( k = 0; k <= len - 8; k+=8 )
v_float32 __dst;
v_float32 __min = vx_setzero_f32();
v_float32 __max = vx_setall_f32(255.0f); // max of uchar
v_float32 __nrm2 = vx_setall_f32(nrm2);
for( k = 0; k <= len - v_float32::nlanes; k += v_float32::nlanes )
{
__dst = _mm256_loadu_ps(&dst[k]);
__dst = _mm256_min_ps(_mm256_max_ps(_mm256_round_ps(_mm256_mul_ps(__dst, __nrm2), _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC), __min), __max);
_mm256_storeu_ps(&dst[k], __dst);
__dst = vx_load(dst + k);
__dst = v_min(v_max(v_cvt_f32(v_round(__dst * __nrm2)), __min), __max);
v_store(dst + k, __dst);
}
}
#endif
......
......@@ -547,7 +547,7 @@ public:
void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
{
int maxChecks = get_param(searchParams,"checks",32);
const int maxChecks = get_param(searchParams,"checks",32);
// Priority queue storing intermediate branches in the best-bin-first search
Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);
......@@ -556,6 +556,8 @@ public:
int checks = 0;
for (int i=0; i<trees_; ++i) {
findNN(root[i], result, vec, checks, maxChecks, heap, checked);
if ((checks >= maxChecks) && result.full())
break;
}
BranchSt branch;
......@@ -747,8 +749,8 @@ private:
Heap<BranchSt>* heap, std::vector<bool>& checked)
{
if (node->childs==NULL) {
if (checks>=maxChecks) {
if (result.full()) return;
if ((checks>=maxChecks) && result.full()) {
return;
}
for (int i=0; i<node->size; ++i) {
int index = node->indices[i];
......
......@@ -313,6 +313,62 @@ OCL_PERF_TEST_P(Filter2DFixture, Filter2D,
SANITY_CHECK(dst, eps);
}
///////////// SepFilter2D /////////////
// Fixture alias: the SepFilter2D perf tests reuse the (size, type, ksize) parameters of FilterFixture.
typedef FilterFixture OCL_SepFilter2D;

// Measures cv::sepFilter2D on UMat input with random floating-point row/column kernels.
PERF_TEST_P_(OCL_SepFilter2D, SepFilter2D)
{
    const FilterParams& testParams = GetParam();
    const Size imageSize = get<0>(testParams);
    const int matType = get<1>(testParams);
    const int kernelLen = get<2>(testParams);

    checkDeviceMaxMemoryAllocSize(imageSize, matType);

    UMat src(imageSize, matType);
    UMat dst(imageSize, matType);
    declare.in(src, WARMUP_RNG).out(dst);

    // Both 1 x kernelLen kernels are filled by randu with the bounds -3.0 and 3.0.
    Mat rowKernel(1, kernelLen, CV_32FC1);
    randu(rowKernel, -3.0, 3.0);

    Mat colKernel(1, kernelLen, CV_32FC1);
    randu(colKernel, -3.0, 3.0);

    // ddepth = -1, default anchor (-1, -1), delta = 1, constant border.
    OCL_TEST_CYCLE() cv::sepFilter2D(src, dst, -1, rowKernel, colKernel, cv::Point(-1, -1), 1.0f, cv::BORDER_CONSTANT);

    // Timing only: the output values are not validated here.
    SANITY_CHECK_NOTHING();
}
// Measures cv::sepFilter2D on UMat input with kernels whose coefficients are
// integers scaled by 1/16 (i.e. multiples of 1/16 stored as float).
PERF_TEST_P_(OCL_SepFilter2D, SepFilter2D_BitExact)
{
    const FilterParams& testParams = GetParam();
    const Size imageSize = get<0>(testParams);
    const int matType = get<1>(testParams);
    const int kernelLen = get<2>(testParams);

    checkDeviceMaxMemoryAllocSize(imageSize, matType);

    UMat src(imageSize, matType);
    UMat dst(imageSize, matType);
    declare.in(src, WARMUP_RNG).out(dst);

    // Row kernel: random 32-bit integer taps, then converted in place to float with scale 1/16.
    Mat rowKernel(1, kernelLen, CV_32SC1);
    randu(rowKernel, -16.0, 16.0);
    rowKernel.convertTo(rowKernel, CV_32FC1, 1/16.0f, 0);

    // Column kernel: generated the same way, after the row kernel.
    Mat colKernel(1, kernelLen, CV_32SC1);
    randu(colKernel, -16.0, 16.0);
    colKernel.convertTo(colKernel, CV_32FC1, 1/16.0f, 0);

    // ddepth = -1, default anchor (-1, -1), delta = 1, constant border.
    OCL_TEST_CYCLE() cv::sepFilter2D(src, dst, -1, rowKernel, colKernel, cv::Point(-1, -1), 1.0f, cv::BORDER_CONSTANT);

    // Timing only: the output values are not validated here.
    SANITY_CHECK_NOTHING();
}
// Parameter grid for the OCL_SepFilter2D perf tests. The tuple order matches what the
// tests read with get<0..2>: image size (only sz1080p), matrix type (OCL_TEST_TYPES),
// and kernel length (3, 5, 7, 9, 11).
INSTANTIATE_TEST_CASE_P(/*nothing*/, OCL_SepFilter2D,
::testing::Combine(
::testing::Values(sz1080p),
OCL_TEST_TYPES,
OCL_PERF_ENUM(3, 5, 7, 9, 11)
)
);
///////////// Bilateral ////////////////////////
typedef TestBaseWithParam<Size> BilateralFixture;
......
......@@ -729,11 +729,12 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
return k.run(2, globalsize, localsize, false);
}
const int shift_bits = 8;
static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor,
int borderType, int ddepth, bool fast8uc1, bool int_arithm)
int borderType, int ddepth, bool fast8uc1,
bool int_arithm, int shift_bits)
{
CV_Assert(shift_bits == 0 || int_arithm);
int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
Size bufSize = buf.size();
......@@ -801,8 +802,11 @@ static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX
return k.run(2, globalsize, localsize, false);
}
static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm)
static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor,
bool int_arithm, int shift_bits)
{
CV_Assert(shift_bits == 0 || int_arithm);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if (dst.depth() == CV_64F && !doubleSupport)
return false;
......@@ -821,13 +825,16 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY
globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1];
globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];
char cvt[40];
char cvt[2][40];
int floatT = std::max(CV_32F, bdepth);
cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d"
" -D srcT=%s -D dstT=%s -D convertToDstT=%s"
" -D srcT=%s -D dstT=%s -D convertToFloatT=%s -D floatT=%s -D convertToDstT=%s"
" -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s",
anchor, (int)localsize[0], (int)localsize[1], cn,
ocl::typeToStr(buf_type), ocl::typeToStr(dtype),
ocl::convertTypeStr(bdepth, ddepth, cn, cvt),
ocl::convertTypeStr(bdepth, floatT, cn, cvt[0]),
ocl::typeToStr(CV_MAKETYPE(floatT, cn)),
ocl::convertTypeStr(shift_bits ? floatT : bdepth, ddepth, cn, cvt[1]),
ocl::typeToStr(bdepth), ocl::typeToStr(ddepth),
2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
int_arithm ? " -D INTEGER_ARITHMETIC" : "");
......@@ -839,7 +846,7 @@ static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY
return false;
k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst),
static_cast<float>(delta));
static_cast<float>(delta * (1u << (2 * shift_bits))));
return k.run(2, globalsize, localsize, false);
}
......@@ -848,16 +855,21 @@ const int optimizedSepFilterLocalWidth = 16;
const int optimizedSepFilterLocalHeight = 8;
static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
Mat row_kernel, Mat col_kernel,
double delta, int borderType, int ddepth, int bdepth, bool int_arithm)
const Mat& kernelX_, const Mat& kernelY_,
double delta, int borderType, int ddepth, int bdepth,
bool int_arithm, int shift_bits)
{
Size size = _src.size(), wholeSize;
Point origin;
//CV_Assert(shift_bits == 0 || int_arithm);
const ocl::Device& d = ocl::Device::getDefault();
Size size = _src.size();
int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth),
dtype = CV_MAKE_TYPE(ddepth, cn);
size_t src_step = _src.step(), src_offset = _src.offset();
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
bool doubleSupport = d.doubleFPConfig() > 0;
if (esz == 0 || src_step == 0
|| (src_offset % src_step) % esz != 0
......@@ -869,6 +881,13 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
|| borderType == BORDER_REFLECT_101))
return false;
Mat kernelX, kernelY;
kernelX_.convertTo(kernelX, wdepth);
if (kernelX_.data != kernelY_.data)
kernelY_.convertTo(kernelY, wdepth);
else
kernelY = kernelX;
size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight };
size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]};
......@@ -879,9 +898,9 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
" -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
" -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s",
(int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2,
ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(),
ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(),
(int)lt2[0], (int)lt2[1], kernelX.cols / 2, kernelY.cols / 2,
ocl::kernelToStr(kernelX, wdepth, "KERNEL_MATRIX_X").c_str(),
ocl::kernelToStr(kernelY, wdepth, "KERNEL_MATRIX_Y").c_str(),
ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype),
ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType],
......@@ -896,21 +915,30 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
_dst.create(size, dtype);
UMat dst = _dst.getUMat();
int src_offset_x = static_cast<int>((src_offset % src_step) / esz);
int src_offset_y = static_cast<int>(src_offset / src_step);
// TODO Future: emit error on inplace processing
//CV_Assert(src.u != dst.u && "Inplace processing is not allowed with UMat");
if (src.u == dst.u)
{
CV_LOG_ONCE_WARNING(NULL, "sepFilter2D: inplace arguments are not allowed for non-inplace operations. Performance impact warning.");
src = src.clone();
}
Size wholeSize;
Point origin;
src.locateROI(wholeSize, origin);
k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y,
k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, origin.x, origin.y,
wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst),
static_cast<float>(delta));
static_cast<float>(delta * (1u << (2 * shift_bits))));
return k.run(2, gt2, lt2, false);
}
bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
InputArray _kernelX, InputArray _kernelY, Point anchor,
double delta, int borderType )
bool ocl_sepFilter2D(
InputArray _src, OutputArray _dst, int ddepth,
InputArray _kernelX, InputArray _kernelY, Point anchor,
double delta, int borderType
)
{
const ocl::Device & d = ocl::Device::getDefault();
Size imgSize = _src.size();
......@@ -934,59 +962,152 @@ bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
if (anchor.y < 0)
anchor.y = kernelY.cols >> 1;
int rtype = getKernelType(kernelX,
kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x));
int ctype = getKernelType(kernelY,
kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y));
int bdepth = CV_32F;
bool int_arithm = false;
if( sdepth == CV_8U && ddepth == CV_8U &&
rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL)
int shift_bits = 0;
while (sdepth == CV_8U && ddepth == CV_8U)
{
if (ocl::Device::getDefault().isIntel())
int bits_ = 8;
if (delta * 256.0f != (float)(int)(delta * 256))
{
for (int i=0; i<kernelX.cols; i++)
kernelX.at<float>(0, i) = (float) cvRound(kernelX.at<float>(0, i) * (1 << shift_bits));
if (kernelX.data != kernelY.data)
for (int i=0; i<kernelX.cols; i++)
kernelY.at<float>(0, i) = (float) cvRound(kernelY.at<float>(0, i) * (1 << shift_bits));
} else
CV_LOG_DEBUG(NULL, "ocl_sepFilter2D: bit-exact delta can't be applied: delta=" << delta);
break;
}
Mat kernelX_BitExact, kernelY_BitExact;
bool isValidBitExactRowKernel = createBitExactKernel_32S(kernelX, kernelX_BitExact, bits_);
bool isValidBitExactColumnKernel = createBitExactKernel_32S(kernelY, kernelY_BitExact, bits_);
if (!isValidBitExactRowKernel)
{
CV_LOG_DEBUG(NULL, "ocl_sepFilter2D: bit-exact row-kernel can't be applied: ksize=" << kernelX_BitExact.total());
}
else if (!isValidBitExactColumnKernel)
{
CV_LOG_DEBUG(NULL, "ocl_sepFilter2D: bit-exact column-kernel can't be applied: ksize=" << kernelY_BitExact.total());
}
else
{
bdepth = CV_32S;
kernelX.convertTo( kernelX, bdepth, 1 << shift_bits );
kernelY.convertTo( kernelY, bdepth, 1 << shift_bits );
shift_bits = bits_;
int_arithm = true;
kernelX = kernelX_BitExact;
kernelY = kernelY_BitExact;
}
int_arithm = true;
break;
}
CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 &&
imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
(!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
OCL_PERFORMANCE_CHECK(d.isIntel()), // TODO FIXIT
ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta,
borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true)
CV_OCL_RUN_(
kernelY.cols <= 21 && kernelX.cols <= 21 &&
imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
(!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
OCL_PERFORMANCE_CHECK(d.isIntel()), // TODO FIXIT
ocl_sepFilter2D_SinglePass(
_src, _dst, kernelX, kernelY, delta,
borderType & ~BORDER_ISOLATED, ddepth,
CV_32F, // force FP32 mode
false, shift_bits
),
true
);
UMat src = _src.getUMat();
Size srcWholeSize; Point srcOffset;
src.locateROI(srcWholeSize, srcOffset);
bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 &&
src.cols % 4 == 0 && src.step % 4 == 0;
bool fast8uc1 = false;
if (type == CV_8UC1)
{
Size srcWholeSize;
Point srcOffset;
src.locateROI(srcWholeSize, srcOffset);
fast8uc1 = srcOffset.x % 4 == 0 &&
src.cols % 4 == 0 && src.step % 4 == 0;
}
Size srcSize = src.size();
Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
UMat buf(bufSize, CV_MAKETYPE(bdepth, cn));
if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm, shift_bits))
return false;
_dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
UMat dst = _dst.getUMat();
return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm, shift_bits);
}
/**
 * OpenCL separable filter driven by 16-bit fixed-point kernel coefficients.
 *
 * @param _src        source image
 * @param _dst        destination image; created with the source size and depth `ddepth`
 * @param ddepth      destination depth; a negative value selects the source depth
 * @param ksize       kernel size: width = number of `fkx` taps, height = number of `fky` taps;
 *                    both must be odd
 * @param fkx         row-kernel coefficients (ksize.width values)
 * @param fky         column-kernel coefficients (ksize.height values)
 * @param anchor      kernel anchor; a negative component selects the kernel center
 * @param delta       value forwarded to the filtering passes
 * @param borderType  border mode; BORDER_ISOLATED is stripped for the single-pass attempt
 * @param shift_bits  fixed-point shift forwarded to the filtering passes
 * @return false when the configuration is rejected here (more than 4 channels, even kernel
 *         dimension) or when the row pass fails; otherwise the result of the filtering pass
 *         that was run.
 */
bool ocl_sepFilter2D_BitExact(
        InputArray _src, OutputArray _dst, int ddepth,
        const Size& ksize,
        const uint16_t *fkx, const uint16_t *fky,
        Point anchor,
        double delta, int borderType,
        int shift_bits
    )
{
    const ocl::Device & d = ocl::Device::getDefault();
    Size imgSize = _src.size();
    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);

    if (cn > 4)
        return false;
    // Only odd kernel dimensions are accepted.
    if (ksize.width % 2 != 1)
        return false;
    if (ksize.height % 2 != 1)
        return false;

    // Single-row Mat headers constructed over the caller's coefficient arrays.
    // NOTE(review): the data is declared uint16_t but is labelled CV_16SC1 here - confirm
    // the coefficients always fit the signed range.
    Mat kernelX(1, ksize.width, CV_16SC1, (void*)fkx);
    Mat kernelY(1, ksize.height, CV_16SC1, (void*)fky);

    if (ddepth < 0)
        ddepth = sdepth;
    if (anchor.x < 0)
        anchor.x = kernelX.cols >> 1;
    if (anchor.y < 0)
        anchor.y = kernelY.cols >> 1;

    // Intermediate (row-pass) depth: 32-bit integers for 8-bit sources, float otherwise.
    int bdepth = sdepth == CV_8U ? CV_32S : CV_32F;

    // Single-pass kernel attempt, gated on small kernels, a large enough image,
    // a centered anchor and the device check below.
    // NOTE(review): presumably CV_OCL_RUN_ returns `true` from this function when the
    // call succeeds and falls through otherwise - confirm against the macro definition.
    CV_OCL_RUN_(
        kernelY.cols <= 21 && kernelX.cols <= 21 &&
        imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
        imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
        (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
        anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
        OCL_PERFORMANCE_CHECK(d.isIntel()), // TODO FIXIT
        ocl_sepFilter2D_SinglePass(
            _src, _dst, kernelX, kernelY, delta,
            borderType & ~BORDER_ISOLATED, ddepth, bdepth,
            true, shift_bits
        ),
        true
    );

    UMat src = _src.getUMat();

    // The fast 8UC1 row path is enabled only when the ROI x-offset, the width and the
    // step are all multiples of 4.
    bool fast8uc1 = false;
    if (type == CV_8UC1)
    {
        Size srcWholeSize;
        Point srcOffset;
        src.locateROI(srcWholeSize, srcOffset);
        fast8uc1 = srcOffset.x % 4 == 0 &&
                   src.cols % 4 == 0 && src.step % 4 == 0;
    }

    // Two-pass path: the row pass writes into a buffer that is kernelY.cols - 1 rows
    // taller than the source, then the column pass produces the destination.
    Size srcSize = src.size();
    Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
    UMat buf(bufSize, CV_MAKETYPE(bdepth, cn));

    // Integer arithmetic is always requested on this path (int_arithm = true).
    if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, true, shift_bits))
        return false;

    _dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
    UMat dst = _dst.getUMat();

    return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, true, shift_bits);
}
#endif
......@@ -1444,7 +1565,7 @@ void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth,
CV_Assert(!_kernelX.empty());
CV_Assert(!_kernelY.empty());
CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > _kernelY.total() && (size_t)_src.cols() > _kernelX.total(),
CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() >= _kernelY.total() && (size_t)_src.cols() >= _kernelX.total(),
ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType))
Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat();
......
......@@ -46,13 +46,25 @@
namespace cv
{

#ifdef HAVE_OPENCL
// OpenCL implementation of separable filtering with user-supplied row/column kernels.
bool ocl_sepFilter2D(
        InputArray _src, OutputArray _dst, int ddepth,
        InputArray _kernelX, InputArray _kernelY, Point anchor,
        double delta, int borderType
    );

// OpenCL separable filtering variant taking 16-bit kernel coefficients
// (ksize.width values in fkx, ksize.height values in fky) and a fixed-point shift.
bool ocl_sepFilter2D_BitExact(
        InputArray _src, OutputArray _dst, int ddepth,
        const Size& ksize,
        const uint16_t *fkx, const uint16_t *fky,
        Point anchor,
        double delta, int borderType,
        int shift_bits
    );
#endif

void preprocess2DKernel(const Mat& kernel, std::vector<Point>& coords, std::vector<uchar>& coeffs);

} // namespace
#endif
......
......@@ -61,7 +61,11 @@
#endif
#define DIG(a) a,
#if defined(INTEGER_ARITHMETIC)
__constant int mat_kernel[] = { COEFF };
#else
__constant srcT1 mat_kernel[] = { COEFF };
#endif
__kernel void col_filter(__global const uchar * src, int src_step, int src_offset, int src_whole_rows, int src_whole_cols,
__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta)
......@@ -92,30 +96,28 @@ __kernel void col_filter(__global const uchar * src, int src_step, int src_offse
barrier(CLK_LOCAL_MEM_FENCE);
// read pixels from lds and calculate the result
sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY];
sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY] + (srcT)delta;
for (int i = 1; i <= RADIUSY; ++i)
{
temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x];
temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x];
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
#if defined(INTEGER_ARITHMETIC)
sum += mad24(temp[0],mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
#else
sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
#endif
}
#ifdef INTEGER_ARITHMETIC
#ifdef INTEL_DEVICE
sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
#else
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
#endif
#endif
// write the result to dst
if (x < dst_cols && y < dst_rows)
{
#if defined(SHIFT_BITS) && SHIFT_BITS > 0
dstT result = convertToDstT(convertToFloatT(sum) * (floatT)(1.0f / (1 << SHIFT_BITS)));
#else
dstT result = convertToDstT(sum);
#endif
start_addr = mad24(y, dst_step, mad24(DSTSIZE, x, dst_offset));
storepix(convertToDstT(sum + (srcT)(delta)), dst + start_addr);
storepix(result, dst + start_addr);
}
}
......@@ -139,9 +139,13 @@
#endif
#define DIG(a) a,
#if defined(INTEGER_ARITHMETIC)
__constant int mat_kernel[] = { COEFF };
#else
__constant dstT1 mat_kernel[] = { COEFF };
#endif
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
#if defined(INTEGER_ARITHMETIC)
#define dstT4 int4
#define convertDstVec convert_int4
#else
......@@ -263,7 +267,7 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel
{
temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i);
temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i);
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
#if defined(INTEGER_ARITHMETIC)
sum += mad24(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
#else
sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
......@@ -368,7 +372,7 @@ __kernel void row_filter(__global const uchar * src, int src_step, int src_offse
{
temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i];
temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i];
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
#if defined(INTEGER_ARITHMETIC)
sum += mad24(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
#else
sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
......
......@@ -160,7 +160,7 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
{
sum = (WT) 0;
for (i=0; i<=2*RADIUSY; i++)
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
#if defined(INTEGER_ARITHMETIC)
sum = mad24(lsmem[liy + i][clocX], mat_kernelY[i], sum);
#else
sum = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum);
......@@ -177,25 +177,27 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
{
// do second horizontal filter pass
// and calculate final result
sum = 0.0f;
sum = (WT)(delta);
for (i=0; i<=2*RADIUSX; i++)
#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
#if defined(INTEGER_ARITHMETIC)
sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
#else
sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
#endif
#ifdef INTEGER_ARITHMETIC
#ifdef INTEL_DEVICE
sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
#if defined(SHIFT_BITS) && SHIFT_BITS > 0
#if !defined(INTEGER_ARITHMETIC)
sum = sum * (1.0f / (1 << SHIFT_BITS));
#else
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
#endif
#endif
// store result into destination image
storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));
storepix(convertToDstT(sum), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = liy * BLK_X + lix; i < (RADIUSY*2) * (BLK_X+(RADIUSX*2)); i += BLK_X * BLK_Y)
{
int clocX = i % (BLK_X+(RADIUSX*2));
......
......@@ -48,6 +48,7 @@
#include <opencv2/core/utils/configuration.private.hpp>
#include <vector>
#include <iostream>
#include "opencv2/core/hal/intrin.hpp"
#include "opencl_kernels_imgproc.hpp"
......@@ -637,10 +638,9 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
return;
}
bool useOpenCL = (ocl::isOpenCLActivated() && _dst.isUMat() && _src.dims() <= 2 &&
((ksize.width == 3 && ksize.height == 3) ||
(ksize.width == 5 && ksize.height == 5)) &&
_src.rows() > ksize.height && _src.cols() > ksize.width);
bool useOpenCL = ocl::isOpenCLActivated() && _dst.isUMat() && _src.dims() <= 2 &&
_src.rows() >= ksize.height && _src.cols() >= ksize.width &&
ksize.width > 1 && ksize.height > 1;
CV_UNUSED(useOpenCL);
int sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
......@@ -648,27 +648,13 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
Mat kx, ky;
createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);
CV_OCL_RUN(useOpenCL, ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType));
CV_OCL_RUN(useOpenCL && sdepth == CV_8U &&
((ksize.width == 3 && ksize.height == 3) ||
(ksize.width == 5 && ksize.height == 5)),
ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType)
);
CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > kx.total() && (size_t)_src.cols() > kx.total(),
ocl_sepFilter2D(_src, _dst, sdepth, kx, ky, Point(-1, -1), 0, borderType))
Mat src = _src.getMat();
Mat dst = _dst.getMat();
Point ofs;
Size wsz(src.cols, src.rows);
if(!(borderType & BORDER_ISOLATED))
src.locateROI( wsz, ofs );
CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn,
ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height,
sigma1, sigma2, borderType&~BORDER_ISOLATED);
CV_OVX_RUN(true,
openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType))
if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.getMat().isSubmatrix()))
if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.isSubmatrix()))
{
std::vector<ufixedpoint16> fkx, fky;
createGaussianKernels(fkx, fky, type, ksize, sigma1, sigma2);
......@@ -684,6 +670,17 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
}
else
{
CV_OCL_RUN(useOpenCL,
ocl_sepFilter2D_BitExact(_src, _dst, sdepth,
ksize,
(const uint16_t*)&fkx[0], (const uint16_t*)&fky[0],
Point(-1, -1), 0, borderType,
8/*shift_bits*/)
);
Mat src = _src.getMat();
Mat dst = _dst.getMat();
if (src.data == dst.data)
src = src.clone();
CV_CPU_DISPATCH(GaussianBlurFixedPoint, (src, dst, (const uint16_t*)&fkx[0], (int)fkx.size(), (const uint16_t*)&fky[0], (int)fky.size(), borderType),
......@@ -692,6 +689,29 @@ void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize,
}
}
#ifdef HAVE_OPENCL
if (useOpenCL)
{
sepFilter2D(_src, _dst, sdepth, kx, ky, Point(-1, -1), 0, borderType);
return;
}
#endif
Mat src = _src.getMat();
Mat dst = _dst.getMat();
Point ofs;
Size wsz(src.cols, src.rows);
if(!(borderType & BORDER_ISOLATED))
src.locateROI( wsz, ofs );
CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn,
ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height,
sigma1, sigma2, borderType&~BORDER_ISOLATED);
CV_OVX_RUN(true,
openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType))
#if defined ENABLE_IPP_GAUSSIAN_BLUR
// IPP is not bit-exact to OpenCV implementation
CV_IPP_RUN_FAST(ipp_GaussianBlur(src, dst, ksize, sigma1, sigma2, borderType));
......
......@@ -73,7 +73,7 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
useRoi = GET_PARAM(4);
}
void random_roi()
void random_roi(bool bitExact)
{
Size ksize = randomSize(kernelMinSize, kernelMaxSize);
if (1 != ksize.width % 2)
......@@ -81,11 +81,19 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
if (1 != ksize.height % 2)
ksize.height++;
Mat temp = randomMat(Size(ksize.width, 1), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE);
Mat temp = randomMat(Size(ksize.width, 1), CV_32FC1, -0.5, 1.0);
cv::normalize(temp, kernelX, 1.0, 0.0, NORM_L1);
temp = randomMat(Size(1, ksize.height), CV_MAKE_TYPE(CV_32F, 1), -MAX_VALUE, MAX_VALUE);
temp = randomMat(Size(1, ksize.height), CV_32FC1, -0.5, 1.0);
cv::normalize(temp, kernelY, 1.0, 0.0, NORM_L1);
if (bitExact)
{
kernelX.convertTo(temp, CV_32S, 256);
temp.convertTo(kernelX, CV_32F, 1.0 / 256);
kernelY.convertTo(temp, CV_32S, 256);
temp.convertTo(kernelY, CV_32F, 1.0 / 256);
}
Size roiSize = randomSize(ksize.width, MAX_VALUE, ksize.height, MAX_VALUE);
Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
randomSubMat(src, src_roi, roiSize, srcBorder, type, -MAX_VALUE, MAX_VALUE);
......@@ -96,6 +104,11 @@ PARAM_TEST_CASE(SepFilter2D, MatDepth, Channels, BorderType, bool, bool)
anchor.x = anchor.y = -1;
delta = randomDouble(-100, 100);
if (bitExact)
{
delta = (int)(delta * 256) / 256.0;
}
UMAT_UPLOAD_INPUT_PARAMETER(src);
UMAT_UPLOAD_OUTPUT_PARAMETER(dst);
}
......@@ -110,7 +123,7 @@ OCL_TEST_P(SepFilter2D, Mat)
{
for (int j = 0; j < test_loop_times + 3; j++)
{
random_roi();
random_roi(false);
OCL_OFF(cv::sepFilter2D(src_roi, dst_roi, -1, kernelX, kernelY, anchor, delta, borderType));
OCL_ON(cv::sepFilter2D(usrc_roi, udst_roi, -1, kernelX, kernelY, anchor, delta, borderType));
......@@ -119,6 +132,22 @@ OCL_TEST_P(SepFilter2D, Mat)
}
}
// Compares cv::sepFilter2D on the Mat inputs (src_roi/dst_roi) against the
// UMat inputs (usrc_roi/udst_roi) for data generated by random_roi(true).
// Sources with a depth below CV_32F are checked with a zero threshold,
// all other depths with a threshold of 1e-3.
// NOTE(review): OCL_OFF/OCL_ON and Near are defined outside this view;
// presumably they toggle the OpenCL path and compare the two results.
OCL_TEST_P(SepFilter2D, Mat_BitExact)
{
    for (int iter = 0; iter < test_loop_times + 3; ++iter)
    {
        random_roi(true);

        OCL_OFF(cv::sepFilter2D(src_roi, dst_roi, -1, kernelX, kernelY, anchor, delta, borderType));
        OCL_ON(cv::sepFilter2D(usrc_roi, udst_roi, -1, kernelX, kernelY, anchor, delta, borderType));

        const bool belowFloatDepth = src_roi.depth() < CV_32F;
        Near(belowFloatDepth ? 0.0 : 1e-3);
    }
}
OCL_INSTANTIATE_TEST_CASE_P(ImageProc, SepFilter2D,
Combine(
Values(CV_8U, CV_32F),
......
......@@ -416,7 +416,11 @@ void BlocksCompensator::feed(const std::vector<Point> &corners, const std::vecto
bl_idx += bl_per_img.width*bl_per_img.height;
for (int i=0; i<nr_gain_filtering_iterations_; ++i)
sepFilter2D(gain_map, gain_map, CV_32F, ker, ker);
{
UMat tmp;
sepFilter2D(gain_map, tmp, CV_32F, ker, ker);
swap(gain_map, tmp);
}
gain_maps_[img_idx] = gain_map;
}
......
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1f981ed2..90eb500a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,9 @@ endif()
project(OpenVINO)
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi /FS")
+set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
+
set(OpenVINO_MAIN_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(IE_MAIN_SOURCE_DIR ${OpenVINO_MAIN_SOURCE_DIR}/inference-engine)
list(APPEND CMAKE_MODULE_PATH "${OpenVINO_MAIN_SOURCE_DIR}/cmake")
diff --git a/inference-engine/src/CMakeLists.txt b/inference-engine/src/CMakeLists.txt
index 0ba0dd78..7d34e7cb 100644
--- a/inference-engine/src/CMakeLists.txt
+++ b/inference-engine/src/CMakeLists.txt
@@ -26,7 +26,7 @@ endif()
add_subdirectory(hetero_plugin)
-add_subdirectory(multi_device)
+#add_subdirectory(multi_device)
add_subdirectory(transformations)
diff --git a/inference-engine/CMakeLists.txt b/inference-engine/CMakeLists.txt
index 39ff413b..df4e89c7 100644
--- a/inference-engine/CMakeLists.txt
+++ b/inference-engine/CMakeLists.txt
@@ -66,7 +66,7 @@ if(ENABLE_TESTS)
add_subdirectory(tests)
endif()
-add_subdirectory(tools)
+#add_subdirectory(tools)
function(ie_build_samples)
# samples should be build with the same flags as from OpenVINO package,
@@ -85,7 +85,7 @@ endfunction()
# gflags and format_reader targets are kept inside of samples directory and
# they must be built even if samples build is disabled (required for tests and tools).
-ie_build_samples()
+#ie_build_samples()
file(GLOB_RECURSE SAMPLES_SOURCES samples/*.cpp samples/*.hpp samples/*.h)
add_cpplint_target(sample_cpplint
@@ -174,10 +174,10 @@ endif()
# Developer package
#
-ie_developer_export_targets(format_reader)
+#ie_developer_export_targets(format_reader)
ie_developer_export_targets(${NGRAPH_LIBRARIES})
-ie_developer_export()
+#ie_developer_export()
configure_file(
"${IE_MAIN_SOURCE_DIR}/cmake/developer_package_config.cmake.in"
diff --git a/inference-engine/src/inference_engine/CMakeLists.txt b/inference-engine/src/inference_engine/CMakeLists.txt
index 4ae0d560..e37acbe0 100644
--- a/inference-engine/src/inference_engine/CMakeLists.txt
+++ b/inference-engine/src/inference_engine/CMakeLists.txt
@@ -99,7 +99,7 @@ add_cpplint_target(${TARGET_NAME}_plugin_api_cpplint FOR_SOURCES ${plugin_api_sr
# Create common base object library
-add_library(${TARGET_NAME}_common_obj OBJECT
+add_library(${TARGET_NAME}_common_obj OBJECT EXCLUDE_FROM_ALL
${IE_BASE_SOURCE_FILES})
target_compile_definitions(${TARGET_NAME}_common_obj PRIVATE IMPLEMENT_INFERENCE_ENGINE_API)
@@ -112,7 +112,7 @@ target_include_directories(${TARGET_NAME}_common_obj SYSTEM PRIVATE
# Create object library
-add_library(${TARGET_NAME}_obj OBJECT
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
${LIBRARY_SRC}
${LIBRARY_HEADERS}
${PUBLIC_HEADERS})
@@ -177,7 +177,7 @@ ie_register_plugins(MAIN_TARGET ${TARGET_NAME}
# Static library used for unit tests which are always built
-add_library(${TARGET_NAME}_s STATIC
+add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL
$<TARGET_OBJECTS:${TARGET_NAME}_obj>
$<TARGET_OBJECTS:${TARGET_NAME}_common_obj>
$<TARGET_OBJECTS:${TARGET_NAME}_legacy_obj>
diff --git a/inference-engine/src/legacy_api/CMakeLists.txt b/inference-engine/src/legacy_api/CMakeLists.txt
index 85524310..ed27e058 100644
--- a/inference-engine/src/legacy_api/CMakeLists.txt
+++ b/inference-engine/src/legacy_api/CMakeLists.txt
@@ -21,7 +21,7 @@ source_group("include" FILES ${PUBLIC_HEADERS})
# Create object library
-add_library(${TARGET_NAME}_obj OBJECT
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
${LIBRARY_SRC}
${PUBLIC_HEADERS})
diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
index 297783da..06da35c3 100644
--- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
@@ -192,7 +192,7 @@ cross_compiled_file(${TARGET_NAME}
# add test object library
-add_library(${TARGET_NAME}_obj OBJECT ${SOURCES} ${HEADERS})
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL ${SOURCES} ${HEADERS})
target_include_directories(${TARGET_NAME}_obj PRIVATE $<TARGET_PROPERTY:inference_engine_preproc_s,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:inference_engine_lp_transformations,INTERFACE_INCLUDE_DIRECTORIES>
diff --git a/inference-engine/src/preprocessing/CMakeLists.txt b/inference-engine/src/preprocessing/CMakeLists.txt
index adc52f06..6b7d0ffe 100644
--- a/inference-engine/src/preprocessing/CMakeLists.txt
+++ b/inference-engine/src/preprocessing/CMakeLists.txt
@@ -124,7 +124,7 @@ endif()
# Create object library
-add_library(${TARGET_NAME}_obj OBJECT
+add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
${LIBRARY_SRC}
${LIBRARY_HEADERS})
@@ -183,7 +183,7 @@ add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME}
# Static library used for unit tests which are always built
-add_library(${TARGET_NAME}_s STATIC
+add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL
$<TARGET_OBJECTS:${TARGET_NAME}_obj>)
set_ie_threading_interface_for(${TARGET_NAME}_s)
diff --git a/inference-engine/src/vpu/common/CMakeLists.txt b/inference-engine/src/vpu/common/CMakeLists.txt
index 43e9308f..2e40dd31 100644
--- a/inference-engine/src/vpu/common/CMakeLists.txt
+++ b/inference-engine/src/vpu/common/CMakeLists.txt
@@ -55,7 +55,7 @@ add_common_target("vpu_common_lib" FALSE)
# Unit tests support for graph transformer
if(WIN32)
- add_common_target("vpu_common_lib_test_static" TRUE)
+ #add_common_target("vpu_common_lib_test_static" TRUE)
else()
add_library("vpu_common_lib_test_static" ALIAS "vpu_common_lib")
endif()
diff --git a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
index 982d3c7f..15fcf3e8 100644
--- a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
+++ b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
@@ -64,7 +64,7 @@ add_graph_transformer_target("vpu_graph_transformer" FALSE)
# Unit tests support for graph transformer
if(WIN32)
- add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE)
+ #add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE)
else()
add_library("vpu_graph_transformer_test_static" ALIAS "vpu_graph_transformer")
endif()
diff --git a/inference-engine/thirdparty/CMakeLists.txt b/inference-engine/thirdparty/CMakeLists.txt
index f94453e0..c80e75c5 100644
--- a/inference-engine/thirdparty/CMakeLists.txt
+++ b/inference-engine/thirdparty/CMakeLists.txt
@@ -43,13 +43,13 @@ function(build_with_lto)
endfunction()
ie_build_pugixml()
- add_subdirectory(stb_lib)
+ #add_subdirectory(stb_lib)
add_subdirectory(ade)
add_subdirectory(fluid/modules/gapi)
target_include_directories(pugixml INTERFACE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/pugixml/src>")
- set_target_properties(pugixml ade fluid stb_image
+ set_target_properties(pugixml ade fluid
PROPERTIES FOLDER thirdparty)
# developer package
diff --git a/inference-engine/thirdparty/pugixml/CMakeLists.txt b/inference-engine/thirdparty/pugixml/CMakeLists.txt
index 8bcb2801..380fb468 100644
--- a/inference-engine/thirdparty/pugixml/CMakeLists.txt
+++ b/inference-engine/thirdparty/pugixml/CMakeLists.txt
@@ -41,7 +41,7 @@ if(BUILD_SHARED_LIBS)
else()
add_library(pugixml STATIC ${SOURCES})
if (MSVC)
- add_library(pugixml_mt STATIC ${SOURCES})
+ #add_library(pugixml_mt STATIC ${SOURCES})
#if (WIN32)
# set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
# set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
# Patches applied to the DLDT sources, in this order.
# NOTE(review): applyPatch is defined outside this view. Judging only by the
# file names, these disable unused build targets, enable PDB generation and
# disable the MultiDevice plugin - confirm against the patch files themselves.
applyPatch('20200701-dldt-disable-unused-targets.patch')
applyPatch('20200413-dldt-pdb.patch')
applyPatch('20200604-dldt-disable-multidevice.patch')
# Assemble the DLDT sysroot: runtime binaries, TBB, headers, import libraries
# and license files are copied from the build/source trees into self.sysrootdir.
# NOTE(review): the script reads `self`, prepare_dir and copytree from its
# surrounding scope; presumably it is executed from a builder method - confirm.
sysroot_bin_dir = prepare_dir(self.sysrootdir / 'bin')
# The install tree produced under <build_dir>/install is placed at <sysroot>/ngraph.
copytree(self.build_dir / 'install', self.sysrootdir / 'ngraph')
#rm_one(self.sysrootdir / 'ngraph' / 'lib' / 'ngraph.dll')
# Build outputs are read from bin/intel64/<Release|Debug>, chosen by config.build_debug.
build_config = 'Release' if not self.config.build_debug else 'Debug'
build_bin_dir = self.build_dir / 'bin' / 'intel64' / build_config
# Copy one entry, keeping its name, from the build bin directory to the sysroot bin directory.
def copy_bin(name):
global build_bin_dir, sysroot_bin_dir
copytree(build_bin_dir / name, sysroot_bin_dir / name)
# Debug builds carry a 'd' suffix on library base names.
dll_suffix = 'd' if self.config.build_debug else ''
# Copy <name><suffix>.dll together with its matching .pdb file.
def copy_dll(name):
global copy_bin, dll_suffix
copy_bin(name + dll_suffix + '.dll')
copy_bin(name + dll_suffix + '.pdb')
copy_bin('cache.json')
copy_dll('clDNNPlugin')
copy_dll('HeteroPlugin')
copy_dll('inference_engine')
copy_dll('inference_engine_ir_reader')
copy_dll('inference_engine_legacy')
copy_dll('inference_engine_transformations') # runtime
copy_dll('inference_engine_lp_transformations') # runtime
copy_dll('MKLDNNPlugin') # runtime
copy_dll('myriadPlugin') # runtime
#copy_dll('MultiDevicePlugin') # runtime, not used
copy_dll('ngraph')
copy_bin('plugins.xml')
# These three files are taken from bin/intel64 directly, not from the per-config
# subdirectory. NOTE(review): presumably device firmware for myriadPlugin - confirm.
copytree(self.build_dir / 'bin' / 'intel64' / 'pcie-ma248x.elf', sysroot_bin_dir / 'pcie-ma248x.elf')
copytree(self.build_dir / 'bin' / 'intel64' / 'usb-ma2x8x.mvcmd', sysroot_bin_dir / 'usb-ma2x8x.mvcmd')
copytree(self.build_dir / 'bin' / 'intel64' / 'usb-ma2450.mvcmd', sysroot_bin_dir / 'usb-ma2450.mvcmd')
# TBB: its bin directory is merged into the sysroot bin directory, and the whole
# TBB tree is also copied to <sysroot>/tbb.
copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb' / 'bin', sysroot_bin_dir)
copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb', self.sysrootdir / 'tbb')
# Headers and import libraries go under deployment_tools/inference_engine.
sysroot_ie_dir = prepare_dir(self.sysrootdir / 'deployment_tools' / 'inference_engine')
sysroot_ie_lib_dir = prepare_dir(sysroot_ie_dir / 'lib' / 'intel64')
copytree(self.srcdir / 'inference-engine' / 'include', sysroot_ie_dir / 'include')
# Import library names differ between configurations: the debug variants end in 'd'.
if not self.config.build_debug:
copytree(self.build_dir / 'install' / 'lib' / 'ngraph.lib', sysroot_ie_lib_dir / 'ngraph.lib')
copytree(build_bin_dir / 'inference_engine.lib', sysroot_ie_lib_dir / 'inference_engine.lib')
copytree(build_bin_dir / 'inference_engine_ir_reader.lib', sysroot_ie_lib_dir / 'inference_engine_ir_reader.lib')
copytree(build_bin_dir / 'inference_engine_legacy.lib', sysroot_ie_lib_dir / 'inference_engine_legacy.lib')
else:
copytree(self.build_dir / 'install' / 'lib' / 'ngraphd.lib', sysroot_ie_lib_dir / 'ngraphd.lib')
copytree(build_bin_dir / 'inference_engined.lib', sysroot_ie_lib_dir / 'inference_engined.lib')
copytree(build_bin_dir / 'inference_engine_ir_readerd.lib', sysroot_ie_lib_dir / 'inference_engine_ir_readerd.lib')
copytree(build_bin_dir / 'inference_engine_legacyd.lib', sysroot_ie_lib_dir / 'inference_engine_legacyd.lib')
# License files are collected under <sysroot>/etc/licenses with a component prefix.
sysroot_license_dir = prepare_dir(self.sysrootdir / 'etc' / 'licenses')
copytree(self.srcdir / 'LICENSE', sysroot_license_dir / 'dldt-LICENSE')
copytree(self.srcdir / 'ngraph/LICENSE', sysroot_license_dir / 'ngraph-LICENSE')
copytree(self.sysrootdir / 'tbb/LICENSE', sysroot_license_dir / 'tbb-LICENSE')
......@@ -133,9 +133,10 @@ def git_checkout(dst, url, branch, revision, clone_extra_args, noFetch=False):
(['-b', branch] if branch else []) +
clone_extra_args + [url, '.'], cwd=dst)
else:
execute(cmd=['git', 'fetch', 'origin'] + ([branch] if branch else []), cwd=dst)
execute(cmd=['git', 'fetch', 'origin'] + ([branch + ':' + branch] if branch else []), cwd=dst)
execute(cmd=['git', 'reset', '--hard'], cwd=dst)
execute(cmd=['git', 'checkout', '-B', 'winpack_dldt', revision], cwd=dst)
execute(cmd=['git', 'clean', '-f', '-d'], cwd=dst)
execute(cmd=['git', 'checkout', '--force', '-B', 'winpack_dldt', revision], cwd=dst)
execute(cmd=['git', 'clean', '-f', '-d'], cwd=dst)
execute(cmd=['git', 'submodule', 'init'], cwd=dst)
execute(cmd=['git', 'submodule', 'update', '--force', '--depth=1000'], cwd=dst)
......@@ -149,6 +150,7 @@ def git_apply_patch(src_dir, patch_file):
patch_file = str(patch_file) # Python 3.5 may not handle Path
assert os.path.exists(patch_file), patch_file
execute(cmd=['git', 'apply', '--3way', '-v', '--ignore-space-change', str(patch_file)], cwd=src_dir)
execute(cmd=['git', 'diff', 'HEAD'], cwd=src_dir)
#===================================================================================================
......@@ -186,6 +188,17 @@ class BuilderDLDT:
self.build_dir = prepare_dir(self.outdir / 'build', clean=self.config.clean_dldt)
self.sysrootdir = prepare_dir(self.outdir / 'sysroot', clean=self.config.clean_dldt)
if self.config.build_subst_drive:
if os.path.exists(self.config.build_subst_drive + ':\\'):
execute(['subst', self.config.build_subst_drive + ':', '/D'])
execute(['subst', self.config.build_subst_drive + ':', str(self.outdir)])
def fix_path(p):
return str(p).replace(str(self.outdir), self.config.build_subst_drive + ':')
self.srcdir = Path(fix_path(self.srcdir))
self.build_dir = Path(fix_path(self.build_dir))
self.sysrootdir = Path(fix_path(self.sysrootdir))
def init_patchset(self):
cpath = self.cpath
self.patch_file = str(cpath / 'patch.config.py') # Python 3.5 may not handle Path
......@@ -255,12 +268,14 @@ class BuilderDLDT:
BUILD_TESTS='OFF',
ENABLE_OPENCV='OFF',
ENABLE_GNA='OFF',
ENABLE_SPEECH_DEMO='OFF', # 2020.4+
NGRAPH_DOC_BUILD_ENABLE='OFF',
NGRAPH_UNIT_TEST_ENABLE='OFF',
NGRAPH_UNIT_TEST_OPENVINO_ENABLE='OFF',
NGRAPH_TEST_UTIL_ENABLE='OFF',
NGRAPH_ONNX_IMPORT_ENABLE='OFF',
CMAKE_INSTALL_PREFIX=str(self.build_dir / 'install'),
OUTPUT_ROOT=str(self.build_dir), # 2020.4+
)
cmd += [ '-D%s=%s' % (k, v) for (k, v) in cmake_vars.items() if v is not None]
......@@ -270,14 +285,6 @@ class BuilderDLDT:
cmd.append(str(self.srcdir))
build_dir = self.build_dir
if self.config.build_subst_drive:
if os.path.exists(self.config.build_subst_drive + ':\\'):
execute(['subst', self.config.build_subst_drive + ':', '/D'])
def fix_path(p):
return str(p).replace(str(self.outdir), self.config.build_subst_drive + ':')
execute(['subst', self.config.build_subst_drive + ':', str(self.outdir)])
cmd = [fix_path(c) for c in cmd]
build_dir = Path(fix_path(build_dir))
try:
execute(cmd, cwd=build_dir)
......@@ -291,8 +298,6 @@ class BuilderDLDT:
cmd = [self.cmake_path, '-DBUILD_TYPE=' + build_config, '-P', 'cmake_install.cmake']
execute(cmd, cwd=build_dir / 'ngraph')
except:
if self.config.build_subst_drive:
execute(['subst', self.config.build_subst_drive + ':', '/D'])
raise
log.info('DLDT build completed')
......@@ -307,6 +312,11 @@ class BuilderDLDT:
log.info('DLDT sysroot preparation completed')
def cleanup(self):
    """Run ``subst <drive>: /D`` for the configured substitute drive.

    Does nothing when ``config.build_subst_drive`` is empty or unset.
    """
    drive = self.config.build_subst_drive
    if not drive:
        return
    execute(['subst', drive + ':', '/D'])
#===================================================================================================
class Builder:
......@@ -466,7 +476,7 @@ def main():
parser.add_argument('--dldt_reference_dir', help='DLDT reference git repository (optional)')
parser.add_argument('--dldt_src_dir', help='DLDT custom source repository (skip git checkout and patching, use for TESTING only)')
parser.add_argument('--dldt_config', help='Specify DLDT build configuration (defaults to DLDT commit)')
parser.add_argument('--dldt_config', help='Specify DLDT build configuration (defaults to evaluate from DLDT commit/branch)')
args = parser.parse_args()
......@@ -492,7 +502,10 @@ def main():
args.opencv_dir = os.path.abspath(args.opencv_dir)
if not args.dldt_config:
args.dldt_config = args.dldt_src_commit
if args.dldt_src_commit == 'releases/2020/4' or args.dldt_src_branch == 'releases/2020/4':
args.dldt_config = '2020.4'
else:
args.dldt_config = args.dldt_src_commit
_opencv_dir = check_dir(args.opencv_dir)
_outdir = prepare_dir(args.output_dir)
......@@ -504,14 +517,18 @@ def main():
builder_dldt = BuilderDLDT(args)
builder_dldt.prepare_sources()
builder_dldt.build()
builder_dldt.make_sysroot()
builder_opencv = Builder(args)
builder_opencv.build(builder_dldt)
builder_opencv.copy_sysroot(builder_dldt)
builder_opencv.package_sources()
try:
builder_dldt.prepare_sources()
builder_dldt.build()
builder_dldt.make_sysroot()
builder_opencv = Builder(args)
builder_opencv.build(builder_dldt)
builder_opencv.copy_sysroot(builder_dldt)
builder_opencv.package_sources()
except:
builder_dldt.cleanup()
raise
log.info("=====")
log.info("===== Build finished")
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册