/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Arithmetic and logical operations: +, -, *, /, &, |, ^, ~, abs ...
//
// */

#include "precomp.hpp"
#include "opencl_kernels.hpp"

namespace cv
{

struct NOP {};

#if CV_SSE2 || CV_NEON

#define FUNCTOR_TEMPLATE(name)          \
    template<typename T> struct name {}

FUNCTOR_TEMPLATE(VLoadStore128);
#if CV_SSE2
FUNCTOR_TEMPLATE(VLoadStore64);
FUNCTOR_TEMPLATE(VLoadStore128Aligned);
#endif

#endif
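
// vBinOp applies the scalar functor Op and its SIMD counterpart VOp to two
// same-sized 2D arrays, row by row. Each row is processed in tiers: 32-byte
// SIMD blocks (two 128-bit registers per iteration), an 8-byte SSE2 tail via
// VLoadStore64, an optional 4x-unrolled scalar loop, and finally a plain
// scalar loop for the remaining elements.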

template<typename T, class Op, class VOp>
void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz)
{
#if CV_SSE2 || CV_NEON
    VOp vop;
#endif
    Op op;

    for( ; sz.height--; src1 += step1/sizeof(src1[0]),
                        src2 += step2/sizeof(src2[0]),
                        dst += step/sizeof(dst[0]) )
    {
        int x = 0;

#if CV_NEON || CV_SSE2
#if CV_SSE2
        if( USE_SSE2 )
        {
#endif
            for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
            {
                typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x               );
                typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 16/sizeof(T));
                r0 = vop(r0, VLoadStore128<T>::load(src2 + x               ));
                r1 = vop(r1, VLoadStore128<T>::load(src2 + x + 16/sizeof(T)));
                VLoadStore128<T>::store(dst + x               , r0);
                VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
            }
#if CV_SSE2
        }
#endif
#endif
#if CV_SSE2
        if( USE_SSE2 )
        {
            for( ; x <= sz.width - 8/(int)sizeof(T); x += 8/sizeof(T) )
            {
                typename VLoadStore64<T>::reg_type r = VLoadStore64<T>::load(src1 + x);
                r = vop(r, VLoadStore64<T>::load(src2 + x));
                VLoadStore64<T>::store(dst + x, r);
            }
        }
#endif
#if CV_ENABLE_UNROLLED
        for( ; x <= sz.width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }
#endif

        for( ; x < sz.width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}
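
// vBinOp32 is the variant for 32-bit elements (int, float): when src1, src2
// and dst are all 16-byte aligned it takes the VLoadStore128Aligned path,
// otherwise it falls back to unaligned SIMD loads and then to scalar code.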

template<typename T, class Op, class Op32>
void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, Size sz)
{
#if CV_SSE2 || CV_NEON
    Op32 op32;
#endif
    Op op;

    for( ; sz.height--; src1 += step1/sizeof(src1[0]),
        src2 += step2/sizeof(src2[0]),
        dst += step/sizeof(dst[0]) )
    {
        int x = 0;

#if CV_SSE2
        if( USE_SSE2 )
        {
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
            {
                for( ; x <= sz.width - 8; x += 8 )
                {
                    typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x    );
                    typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 4);
                    r0 = op32(r0, VLoadStore128Aligned<T>::load(src2 + x    ));
                    r1 = op32(r1, VLoadStore128Aligned<T>::load(src2 + x + 4));
                    VLoadStore128Aligned<T>::store(dst + x    , r0);
                    VLoadStore128Aligned<T>::store(dst + x + 4, r1);
                }
            }
        }
#endif
#if CV_NEON || CV_SSE2
#if CV_SSE2
        if( USE_SSE2 )
        {
#endif
            for( ; x <= sz.width - 8; x += 8 )
            {
                typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x    );
                typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 4);
                r0 = op32(r0, VLoadStore128<T>::load(src2 + x    ));
                r1 = op32(r1, VLoadStore128<T>::load(src2 + x + 4));
                VLoadStore128<T>::store(dst + x    , r0);
                VLoadStore128<T>::store(dst + x + 4, r1);
            }
#if CV_SSE2
        }
#endif
#endif
#if CV_ENABLE_UNROLLED
        for( ; x <= sz.width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }
#endif

        for( ; x < sz.width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}

template<typename T, class Op, class Op64>
void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2,
               T* dst, size_t step, Size sz)
{
#if CV_SSE2
    Op64 op64;
#endif
    Op op;

    for( ; sz.height--; src1 += step1/sizeof(src1[0]),
        src2 += step2/sizeof(src2[0]),
        dst += step/sizeof(dst[0]) )
    {
        int x = 0;

#if CV_SSE2
        if( USE_SSE2 )
        {
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
            {
                for( ; x <= sz.width - 4; x += 4 )
                {
                    typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x    );
                    typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 2);
                    r0 = op64(r0, VLoadStore128Aligned<T>::load(src2 + x    ));
                    r1 = op64(r1, VLoadStore128Aligned<T>::load(src2 + x + 2));
                    VLoadStore128Aligned<T>::store(dst + x    , r0);
                    VLoadStore128Aligned<T>::store(dst + x + 2, r1);
                }
            }
        }
#endif

        for( ; x <= sz.width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }

        for( ; x < sz.width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}
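
// The FUNCTOR_* macros below generate, per element type, small traits structs
// for SIMD loads/stores (VLoadStore*) and stateless closures (VAdd, VSub,
// VMin, ...) that vBinOp/vBinOp32/vBinOp64 receive as their VOp arguments.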

#if CV_SSE2

#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\
    template <>                                                                                  \
    struct name<template_arg>{                                                                   \
        typedef register_type reg_type;                                                          \
        static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
        static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); }       \
    }

#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
    template <>                                                                \
    struct name<template_arg>{                                                 \
        typedef register_type reg_type;                                        \
        static reg_type load(const template_arg * p) { return load_body (p); } \
        static void store(template_arg * p, reg_type v) { store_body (p, v); } \
    }

#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
    template<>                                                                 \
    struct name<template_arg>                                                  \
    {                                                                          \
        VLoadStore128<template_arg>::reg_type operator()(                      \
                        const VLoadStore128<template_arg>::reg_type & a,       \
                        const VLoadStore128<template_arg>::reg_type & b) const \
        {                                                                      \
            body;                                                              \
        }                                                                      \
    }

#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
    template<>                                                                 \
    struct name<template_arg>                                                  \
    {                                                                          \
        VLoadStore128<template_arg>::reg_type operator()(                      \
                        const VLoadStore128<template_arg>::reg_type & a,       \
                        const VLoadStore128<template_arg>::reg_type &  ) const \
        {                                                                      \
            body;                                                              \
        }                                                                      \
    }
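
// SSE2 lacks packed min/max for signed 8-bit and 32-bit integers, so the
// closures below build them from a comparison mask m: a ^ ((a ^ b) & m)
// yields b where m is all ones and a where m is zero. For example, with
// a = 7, b = 3, m = (a > b) = ~0, the expression gives 7 ^ (7 ^ 3) = 3,
// i.e. min(a, b).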

FUNCTOR_LOADSTORE_CAST(VLoadStore128,  uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128,  schar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128,  short, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE_CAST(VLoadStore128,    int, __m128i, _mm_loadu_si128, _mm_storeu_si128);
FUNCTOR_LOADSTORE(     VLoadStore128,  float, __m128 , _mm_loadu_ps   , _mm_storeu_ps   );
FUNCTOR_LOADSTORE(     VLoadStore128, double, __m128d, _mm_loadu_pd   , _mm_storeu_pd   );

FUNCTOR_LOADSTORE_CAST(VLoadStore64,  uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64,  schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
FUNCTOR_LOADSTORE_CAST(VLoadStore64,  short, __m128i, _mm_loadl_epi64, _mm_storel_epi64);

FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned,    int, __m128i, _mm_load_si128, _mm_store_si128);
FUNCTOR_LOADSTORE(     VLoadStore128Aligned,  float, __m128 , _mm_load_ps   , _mm_store_ps   );
FUNCTOR_LOADSTORE(     VLoadStore128Aligned, double, __m128d, _mm_load_pd   , _mm_store_pd   );

FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd,  uchar, return _mm_adds_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  schar, return _mm_adds_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  short, return _mm_adds_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,    int, return _mm_add_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  float, return _mm_add_ps    (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd    (a, b));

FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub,  uchar, return _mm_subs_epu8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  schar, return _mm_subs_epi8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,  short, return _mm_subs_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,    int, return _mm_sub_epi32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  float, return _mm_sub_ps    (a, b));
FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd    (a, b));

FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b));
FUNCTOR_CLOSURE_2arg(VMin, schar,
        __m128i m = _mm_cmpgt_epi8(a, b);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b)));
FUNCTOR_CLOSURE_2arg(VMin,  short, return _mm_min_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMin,    int,
        __m128i m = _mm_cmpgt_epi32(a, b);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMin,  float, return _mm_min_ps(a, b));
FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b));

FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b));
FUNCTOR_CLOSURE_2arg(VMax, schar,
        __m128i m = _mm_cmpgt_epi8(b, a);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b));
FUNCTOR_CLOSURE_2arg(VMax,  short, return _mm_max_epi16(a, b));
FUNCTOR_CLOSURE_2arg(VMax,    int,
        __m128i m = _mm_cmpgt_epi32(b, a);
        return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
    );
FUNCTOR_CLOSURE_2arg(VMax,  float, return _mm_max_ps(a, b));
FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b));
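
// v32f_absmask/v64f_absmask clear the IEEE 754 sign bit, so the float and
// double VAbsDiff closures compute |a - b| with one subtract and one AND.
// The schar/int variants use conditional negation instead: with d = a - b
// and m = (b > a) ? ~0 : 0, (d ^ m) - m negates d exactly when it is negative.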


static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };

FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar,
        return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar,
        __m128i d = _mm_subs_epi8(a, b);
        __m128i m = _mm_cmpgt_epi8(b, a);
        return _mm_subs_epi8(_mm_xor_si128(d, m), m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
        return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  short,
        __m128i M = _mm_max_epi16(a, b);
        __m128i m = _mm_min_epi16(a, b);
        return _mm_subs_epi16(M, m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,    int,
        __m128i d = _mm_sub_epi32(a, b);
        __m128i m = _mm_cmpgt_epi32(b, a);
        return _mm_sub_epi32(_mm_xor_si128(d, m), m);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff,  float,
        return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask);
    );
FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
        return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask);
    );

FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));
#endif

#if CV_NEON

#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
    template <>                                                                \
    struct name<template_arg>{                                                 \
        typedef register_type reg_type;                                        \
        static reg_type load(const template_arg * p) { return load_body (p);}; \
        static void store(template_arg * p, reg_type v) { store_body (p, v);}; \
    }

#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
    template<>                                                         \
    struct name<template_arg>                                          \
    {                                                                  \
        VLoadStore128<template_arg>::reg_type operator()(              \
                        VLoadStore128<template_arg>::reg_type a,       \
                        VLoadStore128<template_arg>::reg_type b) const \
        {                                                              \
            return body;                                               \
        };                                                             \
    }

#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
    template<>                                                         \
    struct name<template_arg>                                          \
    {                                                                  \
        VLoadStore128<template_arg>::reg_type operator()(              \
                        VLoadStore128<template_arg>::reg_type a,       \
                        VLoadStore128<template_arg>::reg_type  ) const \
        {                                                              \
            return body;                                               \
        };                                                             \
    }

FUNCTOR_LOADSTORE(VLoadStore128,  uchar,  uint8x16_t, vld1q_u8 , vst1q_u8 );
FUNCTOR_LOADSTORE(VLoadStore128,  schar,   int8x16_t, vld1q_s8 , vst1q_s8 );
FUNCTOR_LOADSTORE(VLoadStore128, ushort,  uint16x8_t, vld1q_u16, vst1q_u16);
FUNCTOR_LOADSTORE(VLoadStore128,  short,   int16x8_t, vld1q_s16, vst1q_s16);
FUNCTOR_LOADSTORE(VLoadStore128,    int,   int32x4_t, vld1q_s32, vst1q_s32);
FUNCTOR_LOADSTORE(VLoadStore128,  float, float32x4_t, vld1q_f32, vst1q_f32);

FUNCTOR_TEMPLATE(VAdd);
FUNCTOR_CLOSURE_2arg(VAdd,  uchar, vqaddq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  schar, vqaddq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  short, vqaddq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VAdd,    int, vaddq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VAdd,  float, vaddq_f32 (a, b));

FUNCTOR_TEMPLATE(VSub);
FUNCTOR_CLOSURE_2arg(VSub,  uchar, vqsubq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  schar, vqsubq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,  short, vqsubq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VSub,    int, vsubq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VSub,  float, vsubq_f32 (a, b));

FUNCTOR_TEMPLATE(VMin);
FUNCTOR_CLOSURE_2arg(VMin,  uchar, vminq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin,  schar, vminq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VMin,  short, vminq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VMin,    int, vminq_s32(a, b));
FUNCTOR_CLOSURE_2arg(VMin,  float, vminq_f32(a, b));

FUNCTOR_TEMPLATE(VMax);
FUNCTOR_CLOSURE_2arg(VMax,  uchar, vmaxq_u8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax,  schar, vmaxq_s8 (a, b));
FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b));
FUNCTOR_CLOSURE_2arg(VMax,  short, vmaxq_s16(a, b));
FUNCTOR_CLOSURE_2arg(VMax,    int, vmaxq_s32(a, b));
FUNCTOR_CLOSURE_2arg(VMax,  float, vmaxq_f32(a, b));

FUNCTOR_TEMPLATE(VAbsDiff);
FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar, vabdq_u8  (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar, vqabsq_s8 (vqsubq_s8(a, b)));
FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff,  short, vqabsq_s16(vqsubq_s16(a, b)));
FUNCTOR_CLOSURE_2arg(VAbsDiff,    int, vabdq_s32 (a, b));
FUNCTOR_CLOSURE_2arg(VAbsDiff,  float, vabdq_f32 (a, b));

FUNCTOR_TEMPLATE(VAnd);
FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b));
FUNCTOR_TEMPLATE(VOr);
FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b));
FUNCTOR_TEMPLATE(VXor);
FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b));
FUNCTOR_TEMPLATE(VNot);
FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a   ));
#endif
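
// IF_SIMD(op) substitutes the SIMD functor when CV_SSE2 or CV_NEON is
// available and the placeholder NOP struct otherwise, so the vBinOp*
// templates can be instantiated unchanged on plain scalar builds.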

#if CV_SSE2 || CV_NEON
#define IF_SIMD(op) op
#else
#define IF_SIMD(op) NOP
#endif

template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a + b); }
template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const
{ return CV_FAST_CAST_8U(a - b); }

template<typename T> struct OpAbsDiff
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()(T a, T b) const { return (T)std::abs(a - b); }
};

template<> inline short OpAbsDiff<short>::operator ()(short a, short b) const
{ return saturate_cast<short>(std::abs(a - b)); }

template<> inline schar OpAbsDiff<schar>::operator ()(schar a, schar b) const
{ return saturate_cast<schar>(std::abs(a - b)); }

template<typename T, typename WT=T> struct OpAbsDiffS
{
    typedef T type1;
    typedef WT type2;
    typedef T rtype;
    T operator()(T a, WT b) const { return saturate_cast<T>(std::abs(a - b)); }
};

template<typename T> struct OpAnd
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T b ) const { return a & b; }
};

template<typename T> struct OpOr
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T b ) const { return a | b; }
};

template<typename T> struct OpXor
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T b ) const { return a ^ b; }
};

template<typename T> struct OpNot
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T a, T ) const { return ~a; }
};
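
// binary_op below may flatten continuous arrays into a single long row, in
// which case the original step values no longer match the row length;
// fixSteps resets the strides so the IPP calls see a consistent layout.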

#if (ARITHM_USE_IPP == 1)
static inline void fixSteps(Size sz, size_t elemSize, size_t& step1, size_t& step2, size_t& step)
{
    if( sz.height == 1 )
        step1 = step2 = step = sz.width*elemSize;
}
#endif
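
// Each type-specific primitive below follows the same pattern: try the IPP
// call first and return on success; otherwise record the failure with
// setIppErrorStatus() and fall through to the vBinOp* template, which picks
// the SIMD or scalar path.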

static void add8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    if (0 <= ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0))
        return;
    setIppErrorStatus();
#endif
    (vBinOp<uchar, OpAdd<uchar>, IF_SIMD(VAdd<uchar>)>(src1, step1, src2, step2, dst, step, sz));
}

static void add8s( const schar* src1, size_t step1,
                   const schar* src2, size_t step2,
                   schar* dst, size_t step, Size sz, void* )
{
    vBinOp<schar, OpAdd<schar>, IF_SIMD(VAdd<schar>)>(src1, step1, src2, step2, dst, step, sz);
}

static void add16u( const ushort* src1, size_t step1,
                    const ushort* src2, size_t step2,
                    ushort* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    if (0 <= ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0))
        return;
    setIppErrorStatus();
#endif
    (vBinOp<ushort, OpAdd<ushort>, IF_SIMD(VAdd<ushort>)>(src1, step1, src2, step2, dst, step, sz));
}

static void add16s( const short* src1, size_t step1,
                    const short* src2, size_t step2,
                    short* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    if (0 <= ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0))
        return;
    setIppErrorStatus();
#endif
    (vBinOp<short, OpAdd<short>, IF_SIMD(VAdd<short>)>(src1, step1, src2, step2, dst, step, sz));
}

static void add32s( const int* src1, size_t step1,
                    const int* src2, size_t step2,
                    int* dst, size_t step, Size sz, void* )
{
    vBinOp32<int, OpAdd<int>, IF_SIMD(VAdd<int>)>(src1, step1, src2, step2, dst, step, sz);
}

static void add32f( const float* src1, size_t step1,
                    const float* src2, size_t step2,
                    float* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    if (0 <= ippiAdd_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
        return;
    setIppErrorStatus();
#endif
    (vBinOp32<float, OpAdd<float>, IF_SIMD(VAdd<float>)>(src1, step1, src2, step2, dst, step, sz));
}

static void add64f( const double* src1, size_t step1,
                    const double* src2, size_t step2,
                    double* dst, size_t step, Size sz, void* )
{
    vBinOp64<double, OpAdd<double>, IF_SIMD(VAdd<double>)>(src1, step1, src2, step2, dst, step, sz);
}
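
// Note the argument order in the ippiSub_* calls: IPP subtracts its first
// image operand from the second, so src2 is passed first to obtain the
// conventional src1 - src2.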

static void sub8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    if (0 <= ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0))
        return;
    setIppErrorStatus();
#endif
    (vBinOp<uchar, OpSub<uchar>, IF_SIMD(VSub<uchar>)>(src1, step1, src2, step2, dst, step, sz));
}

static void sub8s( const schar* src1, size_t step1,
                   const schar* src2, size_t step2,
                   schar* dst, size_t step, Size sz, void* )
{
    vBinOp<schar, OpSub<schar>, IF_SIMD(VSub<schar>)>(src1, step1, src2, step2, dst, step, sz);
}

static void sub16u( const ushort* src1, size_t step1,
                    const ushort* src2, size_t step2,
                    ushort* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    if (0 <= ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0))
        return;
    setIppErrorStatus();
#endif
    (vBinOp<ushort, OpSub<ushort>, IF_SIMD(VSub<ushort>)>(src1, step1, src2, step2, dst, step, sz));
}

static void sub16s( const short* src1, size_t step1,
                    const short* src2, size_t step2,
                    short* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    if (0 <= ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0))
        return;
    setIppErrorStatus();
#endif
    (vBinOp<short, OpSub<short>, IF_SIMD(VSub<short>)>(src1, step1, src2, step2, dst, step, sz));
}

static void sub32s( const int* src1, size_t step1,
                    const int* src2, size_t step2,
                    int* dst, size_t step, Size sz, void* )
{
    vBinOp32<int, OpSub<int>, IF_SIMD(VSub<int>)>(src1, step1, src2, step2, dst, step, sz);
}

static void sub32f( const float* src1, size_t step1,
                   const float* src2, size_t step2,
                   float* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    if (0 <= ippiSub_32f_C1R(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz)))
        return;
    setIppErrorStatus();
#endif
    (vBinOp32<float, OpSub<float>, IF_SIMD(VSub<float>)>(src1, step1, src2, step2, dst, step, sz));
}

static void sub64f( const double* src1, size_t step1,
                    const double* src2, size_t step2,
                    double* dst, size_t step, Size sz, void* )
{
    vBinOp64<double, OpSub<double>, IF_SIMD(VSub<double>)>(src1, step1, src2, step2, dst, step, sz);
}

template<> inline uchar OpMin<uchar>::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); }
template<> inline uchar OpMax<uchar>::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); }
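
// The min/max primitives rely on the 1D ippsMinEvery/ippsMaxEvery functions,
// applied one row at a time; if any row fails, the loop breaks out and the
// whole operation is redone by the generic vBinOp fallback.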

static void max8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    uchar* s1 = (uchar*)src1;
    uchar* s2 = (uchar*)src2;
    uchar* d  = dst;
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    int i = 0;
    for(; i < sz.height; i++)
    {
        if (0 > ippsMaxEvery_8u(s1, s2, d, sz.width))
            break;
        s1 += step1;
        s2 += step2;
        d  += step;
    }
    if (i == sz.height)
        return;
    setIppErrorStatus();
#endif
    vBinOp<uchar, OpMax<uchar>, IF_SIMD(VMax<uchar>)>(src1, step1, src2, step2, dst, step, sz);
}

static void max8s( const schar* src1, size_t step1,
                   const schar* src2, size_t step2,
                   schar* dst, size_t step, Size sz, void* )
{
    vBinOp<schar, OpMax<schar>, IF_SIMD(VMax<schar>)>(src1, step1, src2, step2, dst, step, sz);
}

static void max16u( const ushort* src1, size_t step1,
                    const ushort* src2, size_t step2,
                    ushort* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    ushort* s1 = (ushort*)src1;
    ushort* s2 = (ushort*)src2;
    ushort* d  = dst;
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    int i = 0;
    for(; i < sz.height; i++)
    {
        if (0 > ippsMaxEvery_16u(s1, s2, d, sz.width))
            break;
        s1 = (ushort*)((uchar*)s1 + step1);
        s2 = (ushort*)((uchar*)s2 + step2);
        d  = (ushort*)((uchar*)d + step);
    }
    if (i == sz.height)
        return;
    setIppErrorStatus();
#endif
    vBinOp<ushort, OpMax<ushort>, IF_SIMD(VMax<ushort>)>(src1, step1, src2, step2, dst, step, sz);
}

static void max16s( const short* src1, size_t step1,
                    const short* src2, size_t step2,
                    short* dst, size_t step, Size sz, void* )
{
    vBinOp<short, OpMax<short>, IF_SIMD(VMax<short>)>(src1, step1, src2, step2, dst, step, sz);
}

static void max32s( const int* src1, size_t step1,
                    const int* src2, size_t step2,
                    int* dst, size_t step, Size sz, void* )
{
    vBinOp32<int, OpMax<int>, IF_SIMD(VMax<int>)>(src1, step1, src2, step2, dst, step, sz);
}

static void max32f( const float* src1, size_t step1,
                    const float* src2, size_t step2,
                    float* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    float* s1 = (float*)src1;
    float* s2 = (float*)src2;
    float* d  = dst;
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    int i = 0;
    for(; i < sz.height; i++)
    {
        if (0 > ippsMaxEvery_32f(s1, s2, d, sz.width))
            break;
        s1 = (float*)((uchar*)s1 + step1);
        s2 = (float*)((uchar*)s2 + step2);
        d  = (float*)((uchar*)d + step);
    }
    if (i == sz.height)
        return;
    setIppErrorStatus();
#endif
    vBinOp32<float, OpMax<float>, IF_SIMD(VMax<float>)>(src1, step1, src2, step2, dst, step, sz);
}

static void max64f( const double* src1, size_t step1,
                    const double* src2, size_t step2,
                    double* dst, size_t step, Size sz, void* )
{
#if ARITHM_USE_IPP == 1
    double* s1 = (double*)src1;
    double* s2 = (double*)src2;
    double* d  = dst;
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    int i = 0;
    for(; i < sz.height; i++)
    {
        if (0 > ippsMaxEvery_64f(s1, s2, d, sz.width))
            break;
        s1 = (double*)((uchar*)s1 + step1);
        s2 = (double*)((uchar*)s2 + step2);
        d  = (double*)((uchar*)d + step);
    }
    if (i == sz.height)
        return;
    setIppErrorStatus();
#endif
    vBinOp64<double, OpMax<double>, IF_SIMD(VMax<double>)>(src1, step1, src2, step2, dst, step, sz);
}

static void min8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    uchar* s1 = (uchar*)src1;
    uchar* s2 = (uchar*)src2;
    uchar* d  = dst;
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    int i = 0;
    for(; i < sz.height; i++)
    {
        if (0 > ippsMinEvery_8u(s1, s2, d, sz.width))
            break;
        s1 += step1;
        s2 += step2;
        d  += step;
    }
    if (i == sz.height)
        return;
    setIppErrorStatus();
#endif
    vBinOp<uchar, OpMin<uchar>, IF_SIMD(VMin<uchar>)>(src1, step1, src2, step2, dst, step, sz);
}

static void min8s( const schar* src1, size_t step1,
                   const schar* src2, size_t step2,
                   schar* dst, size_t step, Size sz, void* )
{
    vBinOp<schar, OpMin<schar>, IF_SIMD(VMin<schar>)>(src1, step1, src2, step2, dst, step, sz);
}

static void min16u( const ushort* src1, size_t step1,
                    const ushort* src2, size_t step2,
                    ushort* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    ushort* s1 = (ushort*)src1;
    ushort* s2 = (ushort*)src2;
    ushort* d  = dst;
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    int i = 0;
    for(; i < sz.height; i++)
    {
        if (0 > ippsMinEvery_16u(s1, s2, d, sz.width))
            break;
        s1 = (ushort*)((uchar*)s1 + step1);
        s2 = (ushort*)((uchar*)s2 + step2);
        d  = (ushort*)((uchar*)d + step);
    }
    if (i == sz.height)
        return;
    setIppErrorStatus();
#endif
    vBinOp<ushort, OpMin<ushort>, IF_SIMD(VMin<ushort>)>(src1, step1, src2, step2, dst, step, sz);
}

static void min16s( const short* src1, size_t step1,
                    const short* src2, size_t step2,
                    short* dst, size_t step, Size sz, void* )
{
    vBinOp<short, OpMin<short>, IF_SIMD(VMin<short>)>(src1, step1, src2, step2, dst, step, sz);
}

static void min32s( const int* src1, size_t step1,
                    const int* src2, size_t step2,
                    int* dst, size_t step, Size sz, void* )
{
    vBinOp32<int, OpMin<int>, IF_SIMD(VMin<int>)>(src1, step1, src2, step2, dst, step, sz);
}

static void min32f( const float* src1, size_t step1,
                    const float* src2, size_t step2,
                    float* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    float* s1 = (float*)src1;
    float* s2 = (float*)src2;
    float* d  = dst;
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    int i = 0;
    for(; i < sz.height; i++)
    {
        if (0 > ippsMinEvery_32f(s1, s2, d, sz.width))
            break;
        s1 = (float*)((uchar*)s1 + step1);
        s2 = (float*)((uchar*)s2 + step2);
        d  = (float*)((uchar*)d + step);
    }
    if (i == sz.height)
        return;
    setIppErrorStatus();
#endif
    vBinOp32<float, OpMin<float>, IF_SIMD(VMin<float>)>(src1, step1, src2, step2, dst, step, sz);
}

static void min64f( const double* src1, size_t step1,
                    const double* src2, size_t step2,
                    double* dst, size_t step, Size sz, void* )
{
#if ARITHM_USE_IPP == 1
    double* s1 = (double*)src1;
    double* s2 = (double*)src2;
    double* d  = dst;
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    int i = 0;
    for(; i < sz.height; i++)
    {
        if (0 > ippsMinEvery_64f(s1, s2, d, sz.width))
            break;
        s1 = (double*)((uchar*)s1 + step1);
        s2 = (double*)((uchar*)s2 + step2);
        d  = (double*)((uchar*)d + step);
    }
    if (i == sz.height)
        return;
    setIppErrorStatus();
#endif
    vBinOp64<double, OpMin<double>, IF_SIMD(VMin<double>)>(src1, step1, src2, step2, dst, step, sz);
}

static void absdiff8u( const uchar* src1, size_t step1,
                       const uchar* src2, size_t step2,
                       uchar* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    if (0 <= ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
        return;
    setIppErrorStatus();
#endif
    (vBinOp<uchar, OpAbsDiff<uchar>, IF_SIMD(VAbsDiff<uchar>)>(src1, step1, src2, step2, dst, step, sz));
}

static void absdiff8s( const schar* src1, size_t step1,
                       const schar* src2, size_t step2,
                       schar* dst, size_t step, Size sz, void* )
{
    vBinOp<schar, OpAbsDiff<schar>, IF_SIMD(VAbsDiff<schar>)>(src1, step1, src2, step2, dst, step, sz);
}

static void absdiff16u( const ushort* src1, size_t step1,
                        const ushort* src2, size_t step2,
                        ushort* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    if (0 <= ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
        return;
    setIppErrorStatus();
#endif
    (vBinOp<ushort, OpAbsDiff<ushort>, IF_SIMD(VAbsDiff<ushort>)>(src1, step1, src2, step2, dst, step, sz));
}

static void absdiff16s( const short* src1, size_t step1,
                        const short* src2, size_t step2,
                        short* dst, size_t step, Size sz, void* )
{
    vBinOp<short, OpAbsDiff<short>, IF_SIMD(VAbsDiff<short>)>(src1, step1, src2, step2, dst, step, sz);
}

static void absdiff32s( const int* src1, size_t step1,
                        const int* src2, size_t step2,
                        int* dst, size_t step, Size sz, void* )
{
    vBinOp32<int, OpAbsDiff<int>, IF_SIMD(VAbsDiff<int>)>(src1, step1, src2, step2, dst, step, sz);
}

static void absdiff32f( const float* src1, size_t step1,
                        const float* src2, size_t step2,
                        float* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    if (0 <= ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
        return;
    setIppErrorStatus();
#endif
    (vBinOp32<float, OpAbsDiff<float>, IF_SIMD(VAbsDiff<float>)>(src1, step1, src2, step2, dst, step, sz));
}

static void absdiff64f( const double* src1, size_t step1,
                        const double* src2, size_t step2,
                        double* dst, size_t step, Size sz, void* )
{
    vBinOp64<double, OpAbsDiff<double>, IF_SIMD(VAbsDiff<double>)>(src1, step1, src2, step2, dst, step, sz);
}


static void and8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    if (0 <= ippiAnd_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
        return;
    setIppErrorStatus();
#endif
    (vBinOp<uchar, OpAnd<uchar>, IF_SIMD(VAnd<uchar>)>(src1, step1, src2, step2, dst, step, sz));
}

static void or8u( const uchar* src1, size_t step1,
                  const uchar* src2, size_t step2,
                  uchar* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    if (0 <= ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
        return;
    setIppErrorStatus();
#endif
    (vBinOp<uchar, OpOr<uchar>, IF_SIMD(VOr<uchar>)>(src1, step1, src2, step2, dst, step, sz));
}

static void xor8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    fixSteps(sz, sizeof(dst[0]), step1, step2, step);
    if (0 <= ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
        return;
    setIppErrorStatus();
#endif
    (vBinOp<uchar, OpXor<uchar>, IF_SIMD(VXor<uchar>)>(src1, step1, src2, step2, dst, step, sz));
}

static void not8u( const uchar* src1, size_t step1,
                   const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* )
{
#if (ARITHM_USE_IPP == 1)
    fixSteps(sz, sizeof(dst[0]), step1, step2, step); (void)src2;
    if (0 <= ippiNot_8u_C1R(src1, (int)step1, dst, (int)step, ippiSize(sz)))
        return;
    setIppErrorStatus();
#endif
    (vBinOp<uchar, OpNot<uchar>, IF_SIMD(VNot<uchar>)>(src1, step1, src2, step2, dst, step, sz));
}

/****************************************************************************************\
*                                   logical operations                                   *
\****************************************************************************************/

void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize )
{
    int scn = (int)sc.total(), cn = CV_MAT_CN(buftype);
    size_t esz = CV_ELEM_SIZE(buftype);
    getConvertFunc(sc.depth(), buftype)(sc.data, 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0);
    // unroll the scalar
    if( scn < cn )
    {
        CV_Assert( scn == 1 );
        size_t esz1 = CV_ELEM_SIZE1(buftype);
        for( size_t i = esz1; i < esz; i++ )
            scbuf[i] = scbuf[i - esz1];
    }
    for( size_t i = esz; i < blocksize*esz; i++ )
        scbuf[i] = scbuf[i - esz];
}
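
// Usage sketch (hypothetical values): converting a 1x1 scalar Mat holding 5
// to buftype CV_8UC3 with blocksize 2 writes one converted element,
// replicates it across the 3 channels, then across the 2 blocks, leaving
// scbuf = { 5, 5, 5, 5, 5, 5 }.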


enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4,
       OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8,
       OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14,
       OCL_OP_RDIV_SCALE=15 };

#ifdef HAVE_OPENCL

static const char* oclop2str[] = { "OP_ADD", "OP_SUB", "OP_RSUB", "OP_ABSDIFF",
    "OP_MUL", "OP_MUL_SCALE", "OP_DIV_SCALE", "OP_RECIP_SCALE",
    "OP_ADDW", "OP_AND", "OP_OR", "OP_XOR", "OP_NOT", "OP_MIN", "OP_MAX", "OP_RDIV_SCALE", 0 };
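
// ocl_binary_op builds the "KF" kernel from the arithm OpenCL source with -D
// options that select the operation (via oclop2str), the masked/scalar
// variants and the element types; kercn widens the kernel vector width when
// neither a mask nor a scalar operand is involved.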

static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                          InputArray _mask, bool bitwise, int oclop, bool haveScalar )
{
    bool haveMask = !_mask.empty();
    int srctype = _src1.type();
    int srcdepth = CV_MAT_DEPTH(srctype);
    int cn = CV_MAT_CN(srctype);

    const ocl::Device d = ocl::Device::getDefault();
    bool doubleSupport = d.doubleFPConfig() > 0;
    if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) ||
            (!doubleSupport && srcdepth == CV_64F && !bitwise))
        return false;

    char opts[1024];
    int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
    int scalarcn = kercn == 3 ? 4 : kercn;
    int rowsPerWI = d.isIntel() ? 4 : 1;

    sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d",
            haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop],
            bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) :
                ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
            bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) :
                ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)),
            bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) :
                ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)),
            kercn, rowsPerWI);

    ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
    if (k.empty())
        return false;

    UMat src1 = _src1.getUMat(), src2;
    UMat dst = _dst.getUMat(), mask = _mask.getUMat();

    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn);
    ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) :
                                       ocl::KernelArg::WriteOnly(dst, cn, kercn);
    ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);

    if( haveScalar )
    {
        size_t esz = CV_ELEM_SIZE1(srctype)*scalarcn;
        double buf[4] = {0,0,0,0};

        if( oclop != OCL_OP_NOT )
        {
            Mat src2sc = _src2.getMat();
            convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1);
        }

        ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);

        if( !haveMask )
            k.args(src1arg, dstarg, scalararg);
        else
            k.args(src1arg, maskarg, dstarg, scalararg);
    }
    else
    {
        src2 = _src2.getUMat();
        ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn);

        if( !haveMask )
            k.args(src1arg, src2arg, dstarg);
        else
            k.args(src1arg, src2arg, maskarg, dstarg);
    }

    size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, 0, false);
}

#endif
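
// binary_op is the common dispatcher for the element-wise binary functions:
// it takes a fast path (one flat row, optionally OpenCL) when both inputs
// are plain arrays of identical size and type and there is no mask;
// otherwise it resolves scalar operands, validates the mask and processes
// the data plane by plane in blocks.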

static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
                       InputArray _mask, const BinaryFunc* tab,
                       bool bitwise, int oclop )
{
    const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
    int kind1 = psrc1->kind(), kind2 = psrc2->kind();
    int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
    int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
    int dims1 = psrc1->dims(), dims2 = psrc2->dims();
    Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
    Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
#ifdef HAVE_OPENCL
    bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) &&
            dims1 <= 2 && dims2 <= 2;
#endif
    bool haveMask = !_mask.empty(), haveScalar = false;
    BinaryFunc func;

    if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask )
    {
        _dst.create(sz1, type1);
        CV_OCL_RUN(use_opencl,
                   ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false))

        if( bitwise )
        {
            func = *tab;
            cn = (int)CV_ELEM_SIZE(type1);
        }
        else
            func = tab[depth1];

        Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
        Size sz = getContinuousSize(src1, src2, dst);
        size_t len = sz.width*(size_t)cn;
        if( len == (size_t)(int)len )
        {
            sz.width = (int)len;
            func(src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, 0);
            return;
        }
    }

    if( oclop == OCL_OP_NOT )
        haveScalar = true;
    else if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 ||
        !psrc1->sameSize(*psrc2) || type1 != type2 )
    {
        if( checkScalar(*psrc1, type2, kind1, kind2) )
        {
            // src1 is a scalar; swap it with src2
            swap(psrc1, psrc2);
            swap(type1, type2);
            swap(depth1, depth2);
            swap(cn, cn2);
            swap(sz1, sz2);
        }
        else if( !checkScalar(*psrc2, type1, kind2, kind1) )
            CV_Error( CV_StsUnmatchedSizes,
                      "The operation is neither 'array op array' (where arrays have the same size and type), "
                      "nor 'array op scalar', nor 'scalar op array'" );
        haveScalar = true;
    }
    else
    {
        CV_Assert( psrc1->sameSize(*psrc2) && type1 == type2 );
    }

    size_t esz = CV_ELEM_SIZE(type1);
    size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz;
    BinaryFunc copymask = 0;
    bool reallocate = false;

    if( haveMask )
    {
        int mtype = _mask.type();
        CV_Assert( (mtype == CV_8U || mtype == CV_8S) && _mask.sameSize(*psrc1));
        copymask = getCopyMaskFunc(esz);
        reallocate = !_dst.sameSize(*psrc1) || _dst.type() != type1;
    }

    AutoBuffer<uchar> _buf;
    uchar *scbuf = 0, *maskbuf = 0;

    _dst.createSameSize(*psrc1, type1);
    // if this is mask operation and dst has been reallocated,
    // we have to clear the destination
    if( haveMask && reallocate )
        _dst.setTo(0.);

    CV_OCL_RUN(use_opencl,
               ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar))


    Mat src1 = psrc1->getMat(), src2 = psrc2->getMat();
    Mat dst = _dst.getMat(), mask = _mask.getMat();

    if( bitwise )
    {
        func = *tab;
        cn = (int)esz;
    }
    else
        func = tab[depth1];

    if( !haveScalar )
    {
        const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
        uchar* ptrs[4];

        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size, blocksize = total;

        if( blocksize*cn > INT_MAX )
            blocksize = INT_MAX/cn;

        if( haveMask )
        {
            blocksize = std::min(blocksize, blocksize0);
            _buf.allocate(blocksize*esz);
            maskbuf = _buf;
        }

        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
            for( size_t j = 0; j < total; j += blocksize )
            {
                int bsz = (int)MIN(total - j, blocksize);

                func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*cn, 1), 0 );
                if( haveMask )
                {
                    copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz );
                    ptrs[3] += bsz;
                }

                bsz *= (int)esz;
                ptrs[0] += bsz; ptrs[1] += bsz; ptrs[2] += bsz;
            }
        }
    }
    else
    {
        const Mat* arrays[] = { &src1, &dst, &mask, 0 };
        uchar* ptrs[3];

        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size, blocksize = std::min(total, blocksize0);

        _buf.allocate(blocksize*(haveMask ? 2 : 1)*esz + 32);
        scbuf = _buf;
        maskbuf = alignPtr(scbuf + blocksize*esz, 16);

        convertAndUnrollScalar( src2, src1.type(), scbuf, blocksize);

        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
            for( size_t j = 0; j < total; j += blocksize )
            {
                int bsz = (int)MIN(total - j, blocksize);

                func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*cn, 1), 0 );
                if( haveMask )
                {
                    copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz );
                    ptrs[2] += bsz;
                }

                bsz *= (int)esz;
                ptrs[0] += bsz; ptrs[1] += bsz;
            }
        }
    }
}

static BinaryFunc* getMaxTab()
{
    static BinaryFunc maxTab[] =
    {
        (BinaryFunc)GET_OPTIMIZED(max8u), (BinaryFunc)GET_OPTIMIZED(max8s),
        (BinaryFunc)GET_OPTIMIZED(max16u), (BinaryFunc)GET_OPTIMIZED(max16s),
        (BinaryFunc)GET_OPTIMIZED(max32s),
        (BinaryFunc)GET_OPTIMIZED(max32f), (BinaryFunc)max64f,
        0
    };

    return maxTab;
}

static BinaryFunc* getMinTab()
{
    static BinaryFunc minTab[] =
    {
        (BinaryFunc)GET_OPTIMIZED(min8u), (BinaryFunc)GET_OPTIMIZED(min8s),
        (BinaryFunc)GET_OPTIMIZED(min16u), (BinaryFunc)GET_OPTIMIZED(min16s),
        (BinaryFunc)GET_OPTIMIZED(min32s),
        (BinaryFunc)GET_OPTIMIZED(min32f), (BinaryFunc)min64f,
        0
    };

    return minTab;
}

}

void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask)
{
    BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(and8u);
    binary_op(a, b, c, mask, &f, true, OCL_OP_AND);
}

void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask)
{
    BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(or8u);
    binary_op(a, b, c, mask, &f, true, OCL_OP_OR);
}

void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask)
{
    BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(xor8u);
    binary_op(a, b, c, mask, &f, true, OCL_OP_XOR);
}

void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask)
{
    BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(not8u);
    binary_op(a, a, c, mask, &f, true, OCL_OP_NOT);
}
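
// Minimal usage sketch of the bitwise wrappers above (comment only; the Mats
// named here are hypothetical caller-side variables of identical size/type,
// with roi being a CV_8U mask):
//
//     Mat a, b, roi, out;
//     bitwise_and(a, b, out, roi);   // out = a & b, only where roi != 0
//     bitwise_not(a, out);           // out = ~a, flipping every bit per element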

void cv::max( InputArray src1, InputArray src2, OutputArray dst )
{
    binary_op(src1, src2, dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
}

void cv::min( InputArray src1, InputArray src2, OutputArray dst )
{
    binary_op(src1, src2, dst, noArray(), getMinTab(), false, OCL_OP_MIN );
}

void cv::max(const Mat& src1, const Mat& src2, Mat& dst)
{
    OutputArray _dst(dst);
    binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
}

void cv::min(const Mat& src1, const Mat& src2, Mat& dst)
{
    OutputArray _dst(dst);
    binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
}

void cv::max(const UMat& src1, const UMat& src2, UMat& dst)
{
    OutputArray _dst(dst);
    binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
}

void cv::min(const UMat& src1, const UMat& src2, UMat& dst)
{
    OutputArray _dst(dst);
    binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
}
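
// Sketch: the per-element min/max above combine two arrays without arithmetic
// saturation, e.g. taking a lower envelope of two depth maps (hypothetical Mats):
//
//     Mat d1, d2, envelope;
//     cv::min(d1, d2, envelope);     // envelope(i) = min(d1(i), d2(i))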


/****************************************************************************************\
*                                      add/subtract                                      *
\****************************************************************************************/

namespace cv
{

static int actualScalarDepth(const double* data, int len)
{
    int i = 0, minval = INT_MAX, maxval = INT_MIN;
    for(; i < len; ++i)
    {
        int ival = cvRound(data[i]);
        if( ival != data[i] )
            break;
        minval = MIN(minval, ival);
        maxval = MAX(maxval, ival);
    }
    return i < len ? CV_64F :
        minval >= 0 && maxval <= (int)UCHAR_MAX ? CV_8U :
        minval >= (int)SCHAR_MIN && maxval <= (int)SCHAR_MAX ? CV_8S :
        minval >= 0 && maxval <= (int)USHRT_MAX ? CV_16U :
        minval >= (int)SHRT_MIN && maxval <= (int)SHRT_MAX ? CV_16S :
        CV_32S;
}
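
// Worked examples of the mapping above (comment only):
//   {1, 2, 3}  -> CV_8U   (integers within [0, UCHAR_MAX])
//   {-1, 400}  -> CV_16S  (fits a short, but not uchar/schar/ushort)
//   {0.5}      -> CV_64F  (a non-integer value stops the scan)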

#ifdef HAVE_OPENCL

static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                          InputArray _mask, int wtype,
                          void* usrdata, int oclop,
                          bool haveScalar )
{
    const ocl::Device d = ocl::Device::getDefault();
    bool doubleSupport = d.doubleFPConfig() > 0;
    int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
    bool haveMask = !_mask.empty();

    if ( (haveMask || haveScalar) && cn > 4 )
        return false;

    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype));
    if (!doubleSupport)
        wdepth = std::min(wdepth, CV_32F);

    wtype = CV_MAKETYPE(wdepth, cn);
    int type2 = haveScalar ? wtype : _src2.type(), depth2 = CV_MAT_DEPTH(type2);
    if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F))
        return false;

    if( (oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE) && (depth1 >= CV_32F || depth2 >= CV_32F || ddepth >= CV_32F) )
        return false;

    int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
    int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1;

    char cvtstr[4][32], opts[1024];
    sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s "
            "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s "
            "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d -D convertFromU=%s",
            (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
            oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
            ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
            ocl::typeToStr(depth2), ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)),
            ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)),
            ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)),
            ocl::typeToStr(wdepth), wdepth,
            ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
            ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
            ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]),
            doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI,
            oclop == OCL_OP_ABSDIFF && wdepth == CV_32S && ddepth == wdepth ?
            ocl::convertTypeStr(CV_8U, ddepth, kercn, cvtstr[3]) : "noconvert");

    size_t usrdata_esz = CV_ELEM_SIZE(wdepth);
    const uchar* usrdata_p = (const uchar*)usrdata;
    const double* usrdata_d = (const double*)usrdata;
    float usrdata_f[3];
    int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE ||
        oclop == OCL_OP_RDIV_SCALE || oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 3 : 0;
    if( n > 0 && wdepth == CV_32F )
    {
        for( i = 0; i < n; i++ )
            usrdata_f[i] = (float)usrdata_d[i];
        usrdata_p = (const uchar*)usrdata_f;
    }

    ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
    if (k.empty())
        return false;

    UMat src1 = _src1.getUMat(), src2;
    UMat dst = _dst.getUMat(), mask = _mask.getUMat();

    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn);
    ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) :
                                       ocl::KernelArg::WriteOnly(dst, cn, kercn);
    ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);

    if( haveScalar )
    {
        size_t esz = CV_ELEM_SIZE1(wtype)*scalarcn;
        double buf[4]={0,0,0,0};
        Mat src2sc = _src2.getMat();

        if( !src2sc.empty() )
            convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1);
        ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);

        if( !haveMask )
        {
            if(n == 0)
                k.args(src1arg, dstarg, scalararg);
            else if(n == 1)
                k.args(src1arg, dstarg, scalararg,
                       ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz));
            else
                CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
        }
        else
            k.args(src1arg, maskarg, dstarg, scalararg);
    }
    else
    {
        src2 = _src2.getUMat();
        ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn);

        if( !haveMask )
        {
            if (n == 0)
                k.args(src1arg, src2arg, dstarg);
            else if (n == 1)
                k.args(src1arg, src2arg, dstarg,
                       ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz));
            else if (n == 3)
                k.args(src1arg, src2arg, dstarg,
                       ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz),
                       ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz),
                       ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz));
            else
                CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
        }
        else
            k.args(src1arg, src2arg, maskarg, dstarg);
    }

    size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

#endif

static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
                      InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false,
                      void* usrdata=0, int oclop=-1 )
{
    const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
    int kind1 = psrc1->kind(), kind2 = psrc2->kind();
    bool haveMask = !_mask.empty();
    bool reallocate = false;
    int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
    int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
    int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims();
    Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
    Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
#ifdef HAVE_OPENCL
    bool use_opencl = OCL_PERFORMANCE_CHECK(_dst.isUMat()) && dims1 <= 2 && dims2 <= 2;
#endif
    bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2);
    bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1);

    if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 &&
        !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) ||
                       (_dst.fixedType() && _dst.type() == type1)) &&
        ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) )
    {
        _dst.createSameSize(*psrc1, type1);
        CV_OCL_RUN(use_opencl,
            ocl_arithm_op(*psrc1, *psrc2, _dst, _mask,
                          (!usrdata ? type1 : std::max(depth1, CV_32F)),
                          usrdata, oclop, false))

        Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
        Size sz = getContinuousSize(src1, src2, dst, src1.channels());
        tab[depth1](src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, usrdata);
        return;
    }

    bool haveScalar = false, swapped12 = false;

    if( dims1 != dims2 || sz1 != sz2 || cn != cn2 ||
        (kind1 == _InputArray::MATX && (sz1 == Size(1,4) || sz1 == Size(1,1))) ||
        (kind2 == _InputArray::MATX && (sz2 == Size(1,4) || sz2 == Size(1,1))) )
    {
        if( checkScalar(*psrc1, type2, kind1, kind2) )
        {
            // src1 is a scalar; swap it with src2
            swap(psrc1, psrc2);
            swap(sz1, sz2);
            swap(type1, type2);
            swap(depth1, depth2);
            swap(cn, cn2);
            swap(dims1, dims2);
            swapped12 = true;
            if( oclop == OCL_OP_SUB )
                oclop = OCL_OP_RSUB;
            if ( oclop == OCL_OP_DIV_SCALE )
                oclop = OCL_OP_RDIV_SCALE;
        }
        else if( !checkScalar(*psrc2, type1, kind2, kind1) )
            CV_Error( CV_StsUnmatchedSizes,
                     "The operation is neither 'array op array' "
                     "(where arrays have the same size and the same number of channels), "
                     "nor 'array op scalar', nor 'scalar op array'" );
        haveScalar = true;
        CV_Assert(type2 == CV_64F && (sz2.height == 1 || sz2.height == 4));

        if (!muldiv)
        {
            Mat sc = psrc2->getMat();
            depth2 = actualScalarDepth(sc.ptr<double>(), cn);
            if( depth2 == CV_64F && (depth1 < CV_32S || depth1 == CV_32F) )
                depth2 = CV_32F;
        }
        else
            depth2 = CV_64F;
    }

    if( dtype < 0 )
    {
        if( _dst.fixedType() )
            dtype = _dst.type();
        else
        {
            if( !haveScalar && type1 != type2 )
                CV_Error(CV_StsBadArg,
                     "When the input arrays in add/subtract/multiply/divide functions have different types, "
                     "the output array type must be explicitly specified");
            dtype = type1;
        }
    }
    dtype = CV_MAT_DEPTH(dtype);

    if( depth1 == depth2 && dtype == depth1 )
        wtype = dtype;
    else if( !muldiv )
    {
        wtype = depth1 <= CV_8S && depth2 <= CV_8S ? CV_16S :
                depth1 <= CV_32S && depth2 <= CV_32S ? CV_32S : std::max(depth1, depth2);
        wtype = std::max(wtype, dtype);

        // when the result of the addition is to be converted to an integer type and
        // at most one of the inputs is floating-point, it is cheaper to convert that
        // input to an integer type before the operation than to convert the other
        // input to floating-point and then convert the result back to integers.
        if( dtype < CV_32F && (depth1 < CV_32F || depth2 < CV_32F) )
            wtype = CV_32S;
    }
    else
    {
        wtype = std::max(depth1, std::max(depth2, CV_32F));
        wtype = std::max(wtype, dtype);
    }

    dtype = CV_MAKETYPE(dtype, cn);
    wtype = CV_MAKETYPE(wtype, cn);
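
    // Worked example of the choice above (comment only): adding CV_8U to CV_8U
    // with dtype = CV_16S first yields wtype = CV_16S, then the integer shortcut
    // (dtype < CV_32F and both inputs < CV_32F) widens it to CV_32S, so the sum
    // is computed in 32-bit integers and saturated to CV_16S at the end.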

    if( haveMask )
    {
        int mtype = _mask.type();
        CV_Assert( (mtype == CV_8UC1 || mtype == CV_8SC1) && _mask.sameSize(*psrc1) );
        reallocate = !_dst.sameSize(*psrc1) || _dst.type() != dtype;
    }

    _dst.createSameSize(*psrc1, dtype);
    if( reallocate )
        _dst.setTo(0.);

    CV_OCL_RUN(use_opencl,
               ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype,
               usrdata, oclop, haveScalar))

    BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype);
    BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype);
    BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);

    size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2);
    size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
    size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz;
    BinaryFunc copymask = getCopyMaskFunc(dsz);
    Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(), mask = _mask.getMat();

    AutoBuffer<uchar> _buf;
    uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0;
    size_t bufesz = (cvtsrc1 ? wsz : 0) +
                    (cvtsrc2 || haveScalar ? wsz : 0) +
                    (cvtdst ? wsz : 0) +
                    (haveMask ? dsz : 0);
    BinaryFunc func = tab[CV_MAT_DEPTH(wtype)];

    if( !haveScalar )
    {
        const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
        uchar* ptrs[4];

        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size, blocksize = total;

        if( haveMask || cvtsrc1 || cvtsrc2 || cvtdst )
            blocksize = std::min(blocksize, blocksize0);

        _buf.allocate(bufesz*blocksize + 64);
        buf = _buf;
        if( cvtsrc1 )
            buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
        if( cvtsrc2 )
            buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
        wbuf = maskbuf = buf;
        if( cvtdst )
            buf = alignPtr(buf + blocksize*wsz, 16);
        if( haveMask )
            maskbuf = buf;

        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
            for( size_t j = 0; j < total; j += blocksize )
            {
                int bsz = (int)MIN(total - j, blocksize);
                Size bszn(bsz*cn, 1);
                const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1];
                uchar* dptr = ptrs[2];
                if( cvtsrc1 )
                {
                    cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
                    sptr1 = buf1;
                }
                if( ptrs[0] == ptrs[1] )
                    sptr2 = sptr1;
                else if( cvtsrc2 )
                {
                    cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 );
                    sptr2 = buf2;
                }

                if( !haveMask && !cvtdst )
                    func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata );
                else
                {
                    func( sptr1, 1, sptr2, 1, wbuf, 0, bszn, usrdata );
                    if( !haveMask )
                        cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
                    else if( !cvtdst )
                    {
                        copymask( wbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
                        ptrs[3] += bsz;
                    }
                    else
                    {
                        cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
                        copymask( maskbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
                        ptrs[3] += bsz;
                    }
                }
                ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz;
            }
        }
    }
    else
    {
        const Mat* arrays[] = { &src1, &dst, &mask, 0 };
        uchar* ptrs[3];

        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size, blocksize = std::min(total, blocksize0);

        _buf.allocate(bufesz*blocksize + 64);
        buf = _buf;
        if( cvtsrc1 )
            buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
        buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16);
        wbuf = maskbuf = buf;
        if( cvtdst )
            buf = alignPtr(buf + blocksize*wsz, 16);
        if( haveMask )
            maskbuf = buf;

        convertAndUnrollScalar( src2, wtype, buf2, blocksize);

        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
            for( size_t j = 0; j < total; j += blocksize )
            {
                int bsz = (int)MIN(total - j, blocksize);
                Size bszn(bsz*cn, 1);
                const uchar *sptr1 = ptrs[0];
                const uchar* sptr2 = buf2;
                uchar* dptr = ptrs[1];

                if( cvtsrc1 )
                {
                    cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
                    sptr1 = buf1;
                }

                if( swapped12 )
                    std::swap(sptr1, sptr2);

                if( !haveMask && !cvtdst )
                    func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata );
                else
                {
                    func( sptr1, 1, sptr2, 1, wbuf, 1, bszn, usrdata );
                    if( !haveMask )
                        cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
                    else if( !cvtdst )
                    {
                        copymask( wbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
                        ptrs[2] += bsz;
                    }
                    else
                    {
                        cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
                        copymask( maskbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
                        ptrs[2] += bsz;
                    }
                }
                ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz;
            }
        }
    }
}

static BinaryFunc* getAddTab()
{
    static BinaryFunc addTab[] =
    {
        (BinaryFunc)GET_OPTIMIZED(add8u), (BinaryFunc)GET_OPTIMIZED(add8s),
        (BinaryFunc)GET_OPTIMIZED(add16u), (BinaryFunc)GET_OPTIMIZED(add16s),
        (BinaryFunc)GET_OPTIMIZED(add32s),
        (BinaryFunc)GET_OPTIMIZED(add32f), (BinaryFunc)add64f,
        0
    };

    return addTab;
}

static BinaryFunc* getSubTab()
{
    static BinaryFunc subTab[] =
    {
        (BinaryFunc)GET_OPTIMIZED(sub8u), (BinaryFunc)GET_OPTIMIZED(sub8s),
        (BinaryFunc)GET_OPTIMIZED(sub16u), (BinaryFunc)GET_OPTIMIZED(sub16s),
        (BinaryFunc)GET_OPTIMIZED(sub32s),
        (BinaryFunc)GET_OPTIMIZED(sub32f), (BinaryFunc)sub64f,
        0
    };

    return subTab;
}

static BinaryFunc* getAbsDiffTab()
{
    static BinaryFunc absDiffTab[] =
    {
        (BinaryFunc)GET_OPTIMIZED(absdiff8u), (BinaryFunc)GET_OPTIMIZED(absdiff8s),
        (BinaryFunc)GET_OPTIMIZED(absdiff16u), (BinaryFunc)GET_OPTIMIZED(absdiff16s),
        (BinaryFunc)GET_OPTIMIZED(absdiff32s),
        (BinaryFunc)GET_OPTIMIZED(absdiff32f), (BinaryFunc)absdiff64f,
        0
    };

    return absDiffTab;
}

}

void cv::add( InputArray src1, InputArray src2, OutputArray dst,
          InputArray mask, int dtype )
{
    arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD );
}
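
// Illustrative usage (hypothetical Mats): summing two CV_8U images into a
// CV_16S destination avoids the saturation an 8-bit sum would suffer:
//
//     Mat a, b, sum16;
//     cv::add(a, b, sum16, noArray(), CV_16S);   // 200 + 200 -> 400, not 255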

void cv::subtract( InputArray _src1, InputArray _src2, OutputArray _dst,
               InputArray mask, int dtype )
{
#ifdef HAVE_TEGRA_OPTIMIZATION
    int kind1 = _src1.kind(), kind2 = _src2.kind();
    Mat src1 = _src1.getMat(), src2 = _src2.getMat();
    bool src1Scalar = checkScalar(src1, _src2.type(), kind1, kind2);
    bool src2Scalar = checkScalar(src2, _src1.type(), kind2, kind1);

    if (!src1Scalar && !src2Scalar &&
        src1.depth() == CV_8U && src2.type() == src1.type() &&
        src1.dims == 2 && src2.size() == src1.size() &&
        mask.empty())
    {
        if (dtype < 0)
        {
            if (_dst.fixedType())
            {
                dtype = _dst.depth();
            }
            else
            {
                dtype = src1.depth();
            }
        }

        dtype = CV_MAT_DEPTH(dtype);

        if (!_dst.fixedType() || dtype == _dst.depth())
        {
            _dst.create(src1.size(), CV_MAKE_TYPE(dtype, src1.channels()));

            if (dtype == CV_16S)
            {
                Mat dst = _dst.getMat();
                if(tegra::subtract_8u8u16s(src1, src2, dst))
                    return;
            }
            else if (dtype == CV_32F)
            {
                Mat dst = _dst.getMat();
                if(tegra::subtract_8u8u32f(src1, src2, dst))
                    return;
            }
            else if (dtype == CV_8S)
            {
                Mat dst = _dst.getMat();
                if(tegra::subtract_8u8u8s(src1, src2, dst))
                    return;
            }
        }
    }
#endif
    arithm_op(_src1, _src2, _dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB );
}

void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst )
{
    arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF);
}
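
// Illustrative usage (hypothetical Mats): absdiff is the usual building block
// for simple frame differencing:
//
//     Mat prev, curr, motion;
//     cv::absdiff(prev, curr, motion);   // motion(i) = |prev(i) - curr(i)|, saturated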

/****************************************************************************************\
*                                    multiply/divide                                     *
\****************************************************************************************/

namespace cv
{

template<typename T, typename WT> static void
mul_( const T* src1, size_t step1, const T* src2, size_t step2,
      T* dst, size_t step, Size size, WT scale )
{
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    if( scale == (WT)1. )
    {
        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
        {
            int i = 0;
            #if CV_ENABLE_UNROLLED
            for(; i <= size.width - 4; i += 4 )
            {
                T t0 = saturate_cast<T>(src1[i  ] * src2[i  ]);
                T t1 = saturate_cast<T>(src1[i+1] * src2[i+1]);
                dst[i  ] = t0;
                dst[i+1] = t1;

                t0 = saturate_cast<T>(src1[i+2] * src2[i+2]);
                t1 = saturate_cast<T>(src1[i+3] * src2[i+3]);
                dst[i+2] = t0;
                dst[i+3] = t1;
            }
            #endif
            for( ; i < size.width; i++ )
                dst[i] = saturate_cast<T>(src1[i] * src2[i]);
        }
    }
    else
    {
        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
        {
            int i = 0;
            #if CV_ENABLE_UNROLLED
            for(; i <= size.width - 4; i += 4 )
            {
                T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
                T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
                dst[i] = t0; dst[i+1] = t1;

                t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
                t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
                dst[i+2] = t0; dst[i+3] = t1;
            }
            #endif
            for( ; i < size.width; i++ )
                dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
        }
    }
}
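
// Note on div_ and recip_ below: when all four divisors of an unrolled group
// are non-zero (writing s0..s3 for src2[i..i+3]), a single division is issued,
// d = scale/(s0*s1*s2*s3), and each quotient is then recovered by
// multiplication only, e.g.
//     src1[i]*scale/s0 == s1 * (src1[i] * (s2*s3*d)),
// trading four divisions for one division plus a few multiplies. Groups
// containing a zero divisor fall back to the guarded per-element path, which
// writes 0 for those elements.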

template<typename T> static void
div_( const T* src1, size_t step1, const T* src2, size_t step2,
      T* dst, size_t step, Size size, double scale )
{
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    for( ; size.height--; src1 += step1, src2 += step2, dst += step )
    {
        int i = 0;
        #if CV_ENABLE_UNROLLED
        for( ; i <= size.width - 4; i += 4 )
        {
            if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
            {
                double a = (double)src2[i] * src2[i+1];
                double b = (double)src2[i+2] * src2[i+3];
                double d = scale/(a * b);
                b *= d;
                a *= d;

                T z0 = saturate_cast<T>(src2[i+1] * ((double)src1[i] * b));
                T z1 = saturate_cast<T>(src2[i] * ((double)src1[i+1] * b));
                T z2 = saturate_cast<T>(src2[i+3] * ((double)src1[i+2] * a));
                T z3 = saturate_cast<T>(src2[i+2] * ((double)src1[i+3] * a));

                dst[i] = z0; dst[i+1] = z1;
                dst[i+2] = z2; dst[i+3] = z3;
            }
            else
            {
                T z0 = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
                T z1 = src2[i+1] != 0 ? saturate_cast<T>(src1[i+1]*scale/src2[i+1]) : 0;
                T z2 = src2[i+2] != 0 ? saturate_cast<T>(src1[i+2]*scale/src2[i+2]) : 0;
                T z3 = src2[i+3] != 0 ? saturate_cast<T>(src1[i+3]*scale/src2[i+3]) : 0;

                dst[i] = z0; dst[i+1] = z1;
                dst[i+2] = z2; dst[i+3] = z3;
            }
        }
        #endif
        for( ; i < size.width; i++ )
            dst[i] = src2[i] != 0 ? saturate_cast<T>(src1[i]*scale/src2[i]) : 0;
    }
}

template<typename T> static void
recip_( const T*, size_t, const T* src2, size_t step2,
        T* dst, size_t step, Size size, double scale )
{
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    for( ; size.height--; src2 += step2, dst += step )
    {
        int i = 0;
        #if CV_ENABLE_UNROLLED
        for( ; i <= size.width - 4; i += 4 )
        {
            if( src2[i] != 0 && src2[i+1] != 0 && src2[i+2] != 0 && src2[i+3] != 0 )
            {
                double a = (double)src2[i] * src2[i+1];
                double b = (double)src2[i+2] * src2[i+3];
                double d = scale/(a * b);
                b *= d;
                a *= d;

                T z0 = saturate_cast<T>(src2[i+1] * b);
                T z1 = saturate_cast<T>(src2[i] * b);
                T z2 = saturate_cast<T>(src2[i+3] * a);
                T z3 = saturate_cast<T>(src2[i+2] * a);

                dst[i] = z0; dst[i+1] = z1;
                dst[i+2] = z2; dst[i+3] = z3;
            }
            else
            {
                T z0 = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
                T z1 = src2[i+1] != 0 ? saturate_cast<T>(scale/src2[i+1]) : 0;
                T z2 = src2[i+2] != 0 ? saturate_cast<T>(scale/src2[i+2]) : 0;
                T z3 = src2[i+3] != 0 ? saturate_cast<T>(scale/src2[i+3]) : 0;

                dst[i] = z0; dst[i+1] = z1;
                dst[i+2] = z2; dst[i+3] = z3;
            }
        }
        #endif
        for( ; i < size.width; i++ )
            dst[i] = src2[i] != 0 ? saturate_cast<T>(scale/src2[i]) : 0;
    }
}


static void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* scale)
{
    float fscale = (float)*(const double*)scale;
#if defined HAVE_IPP
    if (std::fabs(fscale - 1) <= FLT_EPSILON)
    {
        if (ippiMul_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0)
            return;
        setIppErrorStatus();
    }
#endif
    mul_(src1, step1, src2, step2, dst, step, sz, fscale);
}

static void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
                   schar* dst, size_t step, Size sz, void* scale)
{
    mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale);
}

static void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                    ushort* dst, size_t step, Size sz, void* scale)
{
    float fscale = (float)*(const double*)scale;
#if defined HAVE_IPP
    if (std::fabs(fscale - 1) <= FLT_EPSILON)
    {
        if (ippiMul_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0)
            return;
        setIppErrorStatus();
    }
#endif
    mul_(src1, step1, src2, step2, dst, step, sz, fscale);
}

static void mul16s( const short* src1, size_t step1, const short* src2, size_t step2,
                    short* dst, size_t step, Size sz, void* scale)
{
    float fscale = (float)*(const double*)scale;
#if defined HAVE_IPP
    if (std::fabs(fscale - 1) <= FLT_EPSILON)
    {
        if (ippiMul_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0)
            return;
        setIppErrorStatus();
    }
#endif
    mul_(src1, step1, src2, step2, dst, step, sz, fscale);
}

static void mul32s( const int* src1, size_t step1, const int* src2, size_t step2,
                    int* dst, size_t step, Size sz, void* scale)
{
    mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}

static void mul32f( const float* src1, size_t step1, const float* src2, size_t step2,
                    float* dst, size_t step, Size sz, void* scale)
{
    float fscale = (float)*(const double*)scale;
#if defined HAVE_IPP
    if (std::fabs(fscale - 1) <= FLT_EPSILON)
    {
        if (ippiMul_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)) >= 0)
            return;
        setIppErrorStatus();
    }
#endif
    mul_(src1, step1, src2, step2, dst, step, sz, fscale);
}

static void mul64f( const double* src1, size_t step1, const double* src2, size_t step2,
                    double* dst, size_t step, Size sz, void* scale)
{
    mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}

static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                   uchar* dst, size_t step, Size sz, void* scale)
{
    if( src1 )
        div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
    else
        recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}

static void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
                  schar* dst, size_t step, Size sz, void* scale)
{
    div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}

static void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                    ushort* dst, size_t step, Size sz, void* scale)
{
    div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}

static void div16s( const short* src1, size_t step1, const short* src2, size_t step2,
                    short* dst, size_t step, Size sz, void* scale)
{
    div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}

static void div32s( const int* src1, size_t step1, const int* src2, size_t step2,
                    int* dst, size_t step, Size sz, void* scale)
{
    div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}

static void div32f( const float* src1, size_t step1, const float* src2, size_t step2,
                    float* dst, size_t step, Size sz, void* scale)
{
    div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}

static void div64f( const double* src1, size_t step1, const double* src2, size_t step2,
                    double* dst, size_t step, Size sz, void* scale)
{
    div_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}

static void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                  uchar* dst, size_t step, Size sz, void* scale)
{
    recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}

static void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
                  schar* dst, size_t step, Size sz, void* scale)
{
    recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}

static void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                   ushort* dst, size_t step, Size sz, void* scale)
{
    recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}

static void recip16s( const short* src1, size_t step1, const short* src2, size_t step2,
                   short* dst, size_t step, Size sz, void* scale)
{
    recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}

static void recip32s( const int* src1, size_t step1, const int* src2, size_t step2,
                   int* dst, size_t step, Size sz, void* scale)
{
    recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}

static void recip32f( const float* src1, size_t step1, const float* src2, size_t step2,
                   float* dst, size_t step, Size sz, void* scale)
{
    recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}

static void recip64f( const double* src1, size_t step1, const double* src2, size_t step2,
                   double* dst, size_t step, Size sz, void* scale)
{
    recip_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
}


static BinaryFunc* getMulTab()
{
    static BinaryFunc mulTab[] =
    {
        (BinaryFunc)mul8u, (BinaryFunc)mul8s, (BinaryFunc)mul16u,
        (BinaryFunc)mul16s, (BinaryFunc)mul32s, (BinaryFunc)mul32f,
        (BinaryFunc)mul64f, 0
    };

    return mulTab;
}

static BinaryFunc* getDivTab()
{
    static BinaryFunc divTab[] =
    {
        (BinaryFunc)div8u, (BinaryFunc)div8s, (BinaryFunc)div16u,
        (BinaryFunc)div16s, (BinaryFunc)div32s, (BinaryFunc)div32f,
        (BinaryFunc)div64f, 0
    };

    return divTab;
}

static BinaryFunc* getRecipTab()
{
    static BinaryFunc recipTab[] =
    {
        (BinaryFunc)recip8u, (BinaryFunc)recip8s, (BinaryFunc)recip16u,
        (BinaryFunc)recip16s, (BinaryFunc)recip32s, (BinaryFunc)recip32f,
        (BinaryFunc)recip64f, 0
    };

    return recipTab;
}

}

void cv::multiply(InputArray src1, InputArray src2,
                  OutputArray dst, double scale, int dtype)
{
    arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(),
              true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE);
}
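
// Illustrative usage (hypothetical Mats): the scale factor is applied to every
// product, which is handy for fixed-point-style normalization:
//
//     Mat a, b, prod;
//     cv::multiply(a, b, prod, 1.0/255.0);   // prod(i) = a(i)*b(i)/255, saturated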

void cv::divide(InputArray src1, InputArray src2,
                OutputArray dst, double scale, int dtype)
{
    arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE);
}

void cv::divide(double scale, InputArray src2,
                OutputArray dst, int dtype)
{
    arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE);
}
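
// Illustrative usage (hypothetical Mats): the scalar overload computes a scaled
// per-element reciprocal, writing 0 wherever the divisor is 0:
//
//     Mat num, den, q;
//     cv::divide(num, den, q);    // q(i) = num(i)/den(i), or 0 if den(i) == 0
//     cv::divide(1.0, den, q);    // q(i) = 1/den(i), or 0 if den(i) == 0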

/****************************************************************************************\
*                                      addWeighted                                       *
\****************************************************************************************/

namespace cv
{

template<typename T, typename WT> static void
addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, Size size, void* _scalars )
{
    const double* scalars = (const double*)_scalars;
    WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2];
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    for( ; size.height--; src1 += step1, src2 += step2, dst += step )
    {
        int x = 0;
        #if CV_ENABLE_UNROLLED
        for( ; x <= size.width - 4; x += 4 )
        {
            T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
            T t1 = saturate_cast<T>(src1[x+1]*alpha + src2[x+1]*beta + gamma);
            dst[x] = t0; dst[x+1] = t1;

            t0 = saturate_cast<T>(src1[x+2]*alpha + src2[x+2]*beta + gamma);
            t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
            dst[x+2] = t0; dst[x+3] = t1;
        }
        #endif
        for( ; x < size.width; x++ )
            dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
    }
}


static void
addWeighted8u( const uchar* src1, size_t step1,
               const uchar* src2, size_t step2,
               uchar* dst, size_t step, Size size,
               void* _scalars )
{
    const double* scalars = (const double*)_scalars;
    float alpha = (float)scalars[0], beta = (float)scalars[1], gamma = (float)scalars[2];

    for( ; size.height--; src1 += step1, src2 += step2, dst += step )
    {
        int x = 0;

#if CV_SSE2
        if( USE_SSE2 )
        {
            __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma);
            __m128i z = _mm_setzero_si128();

            for( ; x <= size.width - 8; x += 8 )
            {
                __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z);
                __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z);

                __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z));
                __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z));
                __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z));
                __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z));

                u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4));
                u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4));
                u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4);

                u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1));
                u = _mm_packus_epi16(u, u);

                _mm_storel_epi64((__m128i*)(dst + x), u);
            }
        }
#endif
        #if CV_ENABLE_UNROLLED
        for( ; x <= size.width - 4; x += 4 )
        {
            float t0, t1;
            t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
            t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma;

            dst[x] = saturate_cast<uchar>(t0);
            dst[x+1] = saturate_cast<uchar>(t1);

            t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma;
            t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma;

            dst[x+2] = saturate_cast<uchar>(t0);
            dst[x+3] = saturate_cast<uchar>(t1);
        }
        #endif

        for( ; x < size.width; x++ )
        {
            float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
            dst[x] = saturate_cast<uchar>(t0);
        }
    }
}

static void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
                           schar* dst, size_t step, Size sz, void* scalars )
{
    addWeighted_<schar, float>(src1, step1, src2, step2, dst, step, sz, scalars);
}

static void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                            ushort* dst, size_t step, Size sz, void* scalars )
{
    addWeighted_<ushort, float>(src1, step1, src2, step2, dst, step, sz, scalars);
}

static void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2,
                            short* dst, size_t step, Size sz, void* scalars )
{
    addWeighted_<short, float>(src1, step1, src2, step2, dst, step, sz, scalars);
}

static void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2,
                            int* dst, size_t step, Size sz, void* scalars )
{
    addWeighted_<int, double>(src1, step1, src2, step2, dst, step, sz, scalars);
}

static void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2,
                            float* dst, size_t step, Size sz, void* scalars )
{
    addWeighted_<float, double>(src1, step1, src2, step2, dst, step, sz, scalars);
}

static void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2,
                            double* dst, size_t step, Size sz, void* scalars )
{
    addWeighted_<double, double>(src1, step1, src2, step2, dst, step, sz, scalars);
}

static BinaryFunc* getAddWeightedTab()
{
    static BinaryFunc addWeightedTab[] =
    {
        (BinaryFunc)GET_OPTIMIZED(addWeighted8u), (BinaryFunc)GET_OPTIMIZED(addWeighted8s), (BinaryFunc)GET_OPTIMIZED(addWeighted16u),
        (BinaryFunc)GET_OPTIMIZED(addWeighted16s), (BinaryFunc)GET_OPTIMIZED(addWeighted32s), (BinaryFunc)addWeighted32f,
        (BinaryFunc)addWeighted64f, 0
    };

    return addWeightedTab;
}

}

void cv::addWeighted( InputArray src1, double alpha, InputArray src2,
                      double beta, double gamma, OutputArray dst, int dtype )
{
    double scalars[] = {alpha, beta, gamma};
    arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars, OCL_OP_ADDW);
}
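
// Illustrative usage (hypothetical Mats): a 70/30 cross-fade of two frames,
// computing dst(i) = src1(i)*alpha + src2(i)*beta + gamma with saturation:
//
//     Mat f0, f1, blend;
//     cv::addWeighted(f0, 0.7, f1, 0.3, 0.0, blend);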


/****************************************************************************************\
*                                          compare                                       *
\****************************************************************************************/

namespace cv
{
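
// The scalar loops in the cmp kernels below build the 0/255 result without
// branches: the comparison yields 0 or 1, negating that gives 0x00 or 0xFF,
// and XOR with m (0 for CMP_GT/CMP_EQ, 255 for CMP_LE/CMP_NE) selects either
// the predicate or its complement. CMP_GE and CMP_LT are first reduced to
// CMP_LE and CMP_GT by swapping the operands.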

template<typename T> static void
cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
     uchar* dst, size_t step, Size size, int code)
{
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    if( code == CMP_GE || code == CMP_LT )
    {
        std::swap(src1, src2);
        std::swap(step1, step2);
        code = code == CMP_GE ? CMP_LE : CMP_GT;
    }

    if( code == CMP_GT || code == CMP_LE )
    {
        int m = code == CMP_GT ? 0 : 255;
        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
        {
            int x = 0;
            #if CV_ENABLE_UNROLLED
            for( ; x <= size.width - 4; x += 4 )
            {
                int t0, t1;
                t0 = -(src1[x] > src2[x]) ^ m;
                t1 = -(src1[x+1] > src2[x+1]) ^ m;
                dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
                t0 = -(src1[x+2] > src2[x+2]) ^ m;
                t1 = -(src1[x+3] > src2[x+3]) ^ m;
                dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
            }
            #endif
            for( ; x < size.width; x++ )
                dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
        }
    }
    else if( code == CMP_EQ || code == CMP_NE )
    {
        int m = code == CMP_EQ ? 0 : 255;
        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
        {
            int x = 0;
            #if CV_ENABLE_UNROLLED
            for( ; x <= size.width - 4; x += 4 )
            {
                int t0, t1;
                t0 = -(src1[x] == src2[x]) ^ m;
                t1 = -(src1[x+1] == src2[x+1]) ^ m;
                dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
                t0 = -(src1[x+2] == src2[x+2]) ^ m;
                t1 = -(src1[x+3] == src2[x+3]) ^ m;
                dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
            }
            #endif
            for( ; x < size.width; x++ )
                dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
        }
    }
}

#if ARITHM_USE_IPP
inline static IppCmpOp convert_cmp(int _cmpop)
{
    return _cmpop == CMP_EQ ? ippCmpEq :
        _cmpop == CMP_GT ? ippCmpGreater :
        _cmpop == CMP_GE ? ippCmpGreaterEq :
        _cmpop == CMP_LT ? ippCmpLess :
        _cmpop == CMP_LE ? ippCmpLessEq :
        (IppCmpOp)-1;
}
#endif
2605

2606 2607
static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                  uchar* dst, size_t step, Size size, void* _cmpop)
2608
{
K
kdrobnyh 已提交
2609 2610 2611 2612 2613
#if ARITHM_USE_IPP
    IppCmpOp op = convert_cmp(*(int *)_cmpop);
    if( op  >= 0 )
    {
        fixSteps(size, sizeof(dst[0]), step1, step2, step);
I
Ilya Lavrenov 已提交
2614
        if (0 <= ippiCompare_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op))
K
kdrobnyh 已提交
2615
            return;
I
Ilya Lavrenov 已提交
2616
        setIppErrorStatus();
K
kdrobnyh 已提交
2617 2618
    }
#endif
2619
  //vz optimized  cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
2620
    int code = *(int*)_cmpop;
2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    if( code == CMP_GE || code == CMP_LT )
    {
        std::swap(src1, src2);
        std::swap(step1, step2);
        code = code == CMP_GE ? CMP_LE : CMP_GT;
    }

    if( code == CMP_GT || code == CMP_LE )
    {
        int m = code == CMP_GT ? 0 : 255;
        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
        {
            int x =0;
2636 2637
            #if CV_SSE2
            if( USE_SSE2 ){
2638 2639
                __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
                __m128i c128 = _mm_set1_epi8 (-128);
2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652
                for( ; x <= size.width - 16; x += 16 )
                {
                    __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
                    __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
                    // no simd for 8u comparison, that's why we need the trick
                    r00 = _mm_sub_epi8(r00,c128);
                    r10 = _mm_sub_epi8(r10,c128);

                    r00 =_mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128);
                    _mm_storeu_si128((__m128i*)(dst + x),r00);

                }
            }
2653 2654
           #endif

2655
            for( ; x < size.width; x++ ){
2656
                dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
2657
            }
2658 2659 2660 2661 2662
        }
    }
    else if( code == CMP_EQ || code == CMP_NE )
    {
        int m = code == CMP_EQ ? 0 : 255;
        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
        {
            int x = 0;
            #if CV_SSE2
            if( USE_SSE2 )
            {
                __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
                for( ; x <= size.width - 16; x += 16 )
                {
                    __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
                    __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
                    r00 = _mm_xor_si128 ( _mm_cmpeq_epi8 (r00, r10), m128);
                    _mm_storeu_si128((__m128i*)(dst + x), r00);
                }
            }
            #endif
            for( ; x < size.width; x++ )
                dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
        }
    }
}

static void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2,
                  uchar* dst, size_t step, Size size, void* _cmpop)
{
    cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
}

static void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                  uchar* dst, size_t step, Size size, void* _cmpop)
{
#if ARITHM_USE_IPP
    IppCmpOp op = convert_cmp(*(int *)_cmpop);
    if( op  >= 0 )
    {
        fixSteps(size, sizeof(dst[0]), step1, step2, step);
        if (0 <= ippiCompare_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op))
            return;
        setIppErrorStatus();
    }
#endif
    cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
}

static void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2,
                  uchar* dst, size_t step, Size size, void* _cmpop)
{
#if ARITHM_USE_IPP
    IppCmpOp op = convert_cmp(*(int *)_cmpop);
    if( op  >= 0 )
    {
        fixSteps(size, sizeof(dst[0]), step1, step2, step);
        if (0 <= ippiCompare_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op))
            return;
        setIppErrorStatus();
    }
#endif
   //vz optimized cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);

    int code = *(int*)_cmpop;
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    if( code == CMP_GE || code == CMP_LT )
    {
        std::swap(src1, src2);
        std::swap(step1, step2);
        code = code == CMP_GE ? CMP_LE : CMP_GT;
    }

    if( code == CMP_GT || code == CMP_LE )
    {
        int m = code == CMP_GT ? 0 : 255;
        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
        {
            int x = 0;
            #if CV_SSE2
            if( USE_SSE2 )
            {
                __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
                for( ; x <= size.width - 16; x += 16 )
                {
                    __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
                    __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
                    r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128);
                    __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
                    __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
                    r01 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r01, r11), m128);
                    r11 = _mm_packs_epi16(r00, r01);
                    _mm_storeu_si128((__m128i*)(dst + x), r11);
                }
                if( x <= size.width-8)
                {
                    __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
                    __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
                    r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128);
                    r10 = _mm_packs_epi16(r00, r00);
                    _mm_storel_epi64((__m128i*)(dst + x), r10);

                    x += 8;
                }
            }
            #endif

            for( ; x < size.width; x++ )
                dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
        }
    }
    else if( code == CMP_EQ || code == CMP_NE )
    {
        int m = code == CMP_EQ ? 0 : 255;
        for( ; size.height--; src1 += step1, src2 += step2, dst += step )
        {
            int x = 0;
            #if CV_SSE2
            if( USE_SSE2 )
            {
                __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
                for( ; x <= size.width - 16; x += 16 )
                {
                    __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
                    __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
                    r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128);
                    __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
                    __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
                    r01 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r01, r11), m128);
                    r11 = _mm_packs_epi16(r00, r01);
                    _mm_storeu_si128((__m128i*)(dst + x), r11);
                }
                if( x <= size.width - 8)
                {
                    __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
                    __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
                    r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128);
                    r10 = _mm_packs_epi16(r00, r00);
                    _mm_storel_epi64((__m128i*)(dst + x), r10);

                    x += 8;
                }
            }
            #endif
            for( ; x < size.width; x++ )
                dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
        }
    }
}

static void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2,
                   uchar* dst, size_t step, Size size, void* _cmpop)
{
    cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
}

static void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2,
                  uchar* dst, size_t step, Size size, void* _cmpop)
{
#if ARITHM_USE_IPP
    IppCmpOp op = convert_cmp(*(int *)_cmpop);
    if( op  >= 0 )
    {
        fixSteps(size, sizeof(dst[0]), step1, step2, step);
        if (0 <= ippiCompare_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op))
            return;
        setIppErrorStatus();
    }
#endif
    cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
}

static void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2,
                  uchar* dst, size_t step, Size size, void* _cmpop)
{
    cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
}

static BinaryFunc getCmpFunc(int depth)
{
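    // Indexed by depth: CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F;
    // the trailing 0 corresponds to the unsupported user type.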
    static BinaryFunc cmpTab[] =
    {
        (BinaryFunc)GET_OPTIMIZED(cmp8u), (BinaryFunc)GET_OPTIMIZED(cmp8s),
        (BinaryFunc)GET_OPTIMIZED(cmp16u), (BinaryFunc)GET_OPTIMIZED(cmp16s),
        (BinaryFunc)GET_OPTIMIZED(cmp32s),
        (BinaryFunc)GET_OPTIMIZED(cmp32f), (BinaryFunc)cmp64f,
        0
    };

    return cmpTab[depth];
}

static double getMinVal(int depth)
{
    static const double tab[] = {0, -128, 0, -32768, INT_MIN, -FLT_MAX, -DBL_MAX, 0};
    return tab[depth];
}

static double getMaxVal(int depth)
{
    static const double tab[] = {255, 127, 65535, 32767, INT_MAX, FLT_MAX, DBL_MAX, 0};
    return tab[depth];
}
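
// getMinVal()/getMaxVal() bound the representable range per depth; the scalar
// paths of compare() below use them to short-circuit comparisons against
// scalars that no value of the given depth can ever match.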

#ifdef HAVE_OPENCL

static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op, bool haveScalar)
{
    const ocl::Device& dev = ocl::Device::getDefault();
    bool doubleSupport = dev.doubleFPConfig() > 0;
    int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1),
            type2 = _src2.type(), depth2 = CV_MAT_DEPTH(type2);

    if (!doubleSupport && depth1 == CV_64F)
        return false;

    if (!haveScalar && (!_src1.sameSize(_src2) || type1 != type2))
        return false;

I
Ilya Lavrenov 已提交
2876
    int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst), rowsPerWI = dev.isIntel() ? 4 : 1;
A
Alexander Alekhin 已提交
2877
    // Workaround for bug with "?:" operator in AMD OpenCL compiler
I
Ilya Lavrenov 已提交
2878
    if (depth1 >= CV_16U)
A
Alexander Alekhin 已提交
2879 2880
        kercn = 1;

    int scalarcn = kercn == 3 ? 4 : kercn;
    const char * const operationMap[] = { "==", ">", ">=", "<", "<=", "!=" };
    char cvt[40];

    String opts = format("-D %s -D srcT1=%s -D dstT=%s -D workT=srcT1 -D cn=%d"
                         " -D convertToDT=%s -D OP_CMP -D CMP_OPERATOR=%s -D srcT1_C1=%s"
                         " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s -D rowsPerWI=%d%s",
                         haveScalar ? "UNARY_OP" : "BINARY_OP",
                         ocl::typeToStr(CV_MAKE_TYPE(depth1, kercn)),
                         ocl::typeToStr(CV_8UC(kercn)), kercn,
                         ocl::convertTypeStr(depth1, CV_8U, kercn, cvt),
                         operationMap[op], ocl::typeToStr(depth1),
                         ocl::typeToStr(depth1), ocl::typeToStr(CV_8U),
                         ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)), rowsPerWI,
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "");

    ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
    if (k.empty())
        return false;

    UMat src1 = _src1.getUMat();
    Size size = src1.size();
    _dst.create(size, CV_8UC(cn));
    UMat dst = _dst.getUMat();

    if (haveScalar)
    {
        size_t esz = CV_ELEM_SIZE1(type1) * scalarcn;
        double buf[4] = { 0, 0, 0, 0 };
        Mat src2 = _src2.getMat();

        if( depth1 > CV_32S )
            convertAndUnrollScalar( src2, depth1, (uchar *)buf, kercn );
        else
        {
            double fval = 0;
            getConvertFunc(depth2, CV_64F)(src2.data, 1, 0, 1, (uchar *)&fval, 1, Size(1, 1), 0);
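
            // A scalar outside the representable range of depth1 makes the
            // comparison result constant, so the mask can be filled directly
            // without running the OpenCL kernel.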
            if( fval < getMinVal(depth1) )
                return dst.setTo(Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0)), true;

            if( fval > getMaxVal(depth1) )
                return dst.setTo(Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0)), true;

            int ival = cvRound(fval);
            if( fval != ival )
            {
                if( op == CMP_LT || op == CMP_GE )
                    ival = cvCeil(fval);
                else if( op == CMP_LE || op == CMP_GT )
                    ival = cvFloor(fval);
                else
                    return dst.setTo(Scalar::all(op == CMP_NE ? 255 : 0)), true;
            }
            convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, (uchar *)buf, kercn);
        }

        ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);

        k.args(ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn),
               ocl::KernelArg::WriteOnly(dst, cn, kercn), scalararg);
    }
    else
    {
        UMat src2 = _src2.getUMat();

        k.args(ocl::KernelArg::ReadOnlyNoSize(src1),
               ocl::KernelArg::ReadOnlyNoSize(src2),
               ocl::KernelArg::WriteOnly(dst, cn, kercn));
    }

    size_t globalsize[2] = { dst.cols * cn / kercn, (dst.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

#endif

}

void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
{
    CV_Assert( op == CMP_LT || op == CMP_LE || op == CMP_EQ ||
               op == CMP_NE || op == CMP_GE || op == CMP_GT );
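
    // Usage sketch (illustrative only):
    //     Mat a, b;  // same size and type
    //     Mat mask;
    //     compare(a, b, mask, CMP_GT);   // mask: CV_8U, 255 where a > b, else 0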

    bool haveScalar = false;

    if ((_src1.isMatx() + _src2.isMatx()) == 1
            || !_src1.sameSize(_src2)
            || _src1.type() != _src2.type())
    {
        if (checkScalar(_src1, _src2.type(), _src1.kind(), _src2.kind()))
        {
            op = op == CMP_LT ? CMP_GT : op == CMP_LE ? CMP_GE :
                op == CMP_GE ? CMP_LE : op == CMP_GT ? CMP_LT : op;
            // src1 is a scalar; swap it with src2
            compare(_src2, _src1, _dst, op);
            return;
        }
        else if( !checkScalar(_src2, _src1.type(), _src2.kind(), _src1.kind()) )
            CV_Error( CV_StsUnmatchedSizes,
                     "The operation is neither 'array op array' (where arrays have the same size and the same type), "
                     "nor 'array op scalar', nor 'scalar op array'" );
        haveScalar = true;
    }

    CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && OCL_PERFORMANCE_CHECK(_dst.isUMat()),
               ocl_compare(_src1, _src2, _dst, op, haveScalar))

    int kind1 = _src1.kind(), kind2 = _src2.kind();
    Mat src1 = _src1.getMat(), src2 = _src2.getMat();

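    // Fast path: same-kind 2D arrays of equal size and type are processed as
    // one (possibly continuous) block with the depth-specific comparison function.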
    if( kind1 == kind2 && src1.dims <= 2 && src2.dims <= 2 && src1.size() == src2.size() && src1.type() == src2.type() )
    {
        int cn = src1.channels();
        _dst.create(src1.size(), CV_8UC(cn));
        Mat dst = _dst.getMat();
        Size sz = getContinuousSize(src1, src2, dst, src1.channels());
        getCmpFunc(src1.depth())(src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, &op);
        return;
    }

    int cn = src1.channels(), depth1 = src1.depth(), depth2 = src2.depth();

    _dst.create(src1.dims, src1.size, CV_8UC(cn));
    src1 = src1.reshape(1); src2 = src2.reshape(1);
    Mat dst = _dst.getMat().reshape(1);

    size_t esz = src1.elemSize();
    size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;
    BinaryFunc func = getCmpFunc(depth1);

    if( !haveScalar )
    {
        const Mat* arrays[] = { &src1, &src2, &dst, 0 };
        uchar* ptrs[3];

        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size;

        for( size_t i = 0; i < it.nplanes; i++, ++it )
            func( ptrs[0], 0, ptrs[1], 0, ptrs[2], 0, Size((int)total, 1), &op );
    }
    else
    {
        const Mat* arrays[] = { &src1, &dst, 0 };
        uchar* ptrs[2];

        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size, blocksize = std::min(total, blocksize0);

        AutoBuffer<uchar> _buf(blocksize*esz);
        uchar *buf = _buf;

        if( depth1 > CV_32S )
            convertAndUnrollScalar( src2, depth1, buf, blocksize );
        else
        {
            double fval = 0;
            getConvertFunc(depth2, CV_64F)(src2.data, 1, 0, 1, (uchar*)&fval, 1, Size(1,1), 0);
            if( fval < getMinVal(depth1) )
            {
                dst = Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0);
                return;
            }

            if( fval > getMaxVal(depth1) )
            {
                dst = Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0);
                return;
            }

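            // A fractional scalar can never compare equal to an integer pixel:
            // shift the threshold with ceil/floor for the ordering operators,
            // and turn CMP_EQ/CMP_NE into a constant mask.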
            int ival = cvRound(fval);
            if( fval != ival )
            {
                if( op == CMP_LT || op == CMP_GE )
                    ival = cvCeil(fval);
                else if( op == CMP_LE || op == CMP_GT )
                    ival = cvFloor(fval);
                else
                {
                    dst = Scalar::all(op == CMP_NE ? 255 : 0);
                    return;
                }
            }
            convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, buf, blocksize);
        }

        for( size_t i = 0; i < it.nplanes; i++, ++it )
        {
            for( size_t j = 0; j < total; j += blocksize )
            {
                int bsz = (int)MIN(total - j, blocksize);
                func( ptrs[0], 0, buf, 0, ptrs[1], 0, Size(bsz, 1), &op);
                ptrs[0] += bsz*esz;
                ptrs[1] += bsz;
            }
        }
    }
}

/****************************************************************************************\
*                                        inRange                                         *
\****************************************************************************************/

namespace cv
{
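
// InRange_SSE<T> handles the vectorizable prefix of a row and returns the
// number of elements it processed; the scalar loops in inRange_() then resume
// from that index. The generic template processes nothing and returns 0.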

template <typename T>
struct InRange_SSE
{
    int operator () (const T *, const T *, const T *, uchar *, int) const
    {
        return 0;
    }
};

#if CV_SSE2

template <>
struct InRange_SSE<uchar>
{
    int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
                     uchar * dst, int len) const
    {
        int x = 0;

        if (USE_SSE2)
        {
            __m128i v_full = _mm_set1_epi8(-1), v_128 = _mm_set1_epi8(-128);

            for ( ; x <= len - 16; x += 16 )
            {
                __m128i v_src = _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), v_128);
                __m128i v_mask1 = _mm_cmpgt_epi8(_mm_add_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_128), v_src);
                __m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src3 + x)), v_128));
                _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
            }
        }

        return x;
    }
};

template <>
struct InRange_SSE<schar>
{
    int operator () (const schar * src1, const schar * src2, const schar * src3,
                     uchar * dst, int len) const
    {
        int x = 0;

        if (USE_SSE2)
        {
            __m128i v_full = _mm_set1_epi8(-1);

            for ( ; x <= len - 16; x += 16 )
            {
                __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
                __m128i v_mask1 = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
                __m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
                _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
            }
        }

        return x;
    }
};

template <>
struct InRange_SSE<ushort>
{
    int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
                     uchar * dst, int len) const
    {
        int x = 0;

        if (USE_SSE2)
        {
            __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1), v_32768 = _mm_set1_epi16(-32768);

            for ( ; x <= len - 8; x += 8 )
            {
                __m128i v_src = _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src1 + x)), v_32768);
                __m128i v_mask1 = _mm_cmpgt_epi16(_mm_add_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_32768), v_src);
                __m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src3 + x)), v_32768));
                __m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
                _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
            }
        }

        return x;
    }
};

template <>
struct InRange_SSE<short>
{
    int operator () (const short * src1, const short * src2, const short * src3,
                     uchar * dst, int len) const
    {
        int x = 0;

        if (USE_SSE2)
        {
            __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1);

            for ( ; x <= len - 8; x += 8 )
            {
                __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
                __m128i v_mask1 = _mm_cmpgt_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
                __m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
                __m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
                _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
            }
        }

        return x;
    }
};

template <>
struct InRange_SSE<int>
{
    int operator () (const int * src1, const int * src2, const int * src3,
                     uchar * dst, int len) const
    {
        int x = 0;

        if (USE_SSE2)
        {
            __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi32(-1);

            for ( ; x <= len - 8; x += 8 )
            {
                __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
                __m128i v_res1 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src),
                    _mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x))));

                v_src = _mm_loadu_si128((const __m128i *)(src1 + x + 4));
                __m128i v_res2 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x + 4)), v_src),
                    _mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x + 4))));

                __m128i v_res = _mm_packs_epi32(_mm_srli_epi32(_mm_andnot_si128(v_res1, v_full), 16),
                                                _mm_srli_epi32(_mm_andnot_si128(v_res2, v_full), 16));
                _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
            }
        }

        return x;
    }
};

template <>
struct InRange_SSE<float>
{
    int operator () (const float * src1, const float * src2, const float * src3,
                     uchar * dst, int len) const
    {
        int x = 0;

        if (USE_SSE2)
        {
            __m128i v_zero = _mm_setzero_si128();

            for ( ; x <= len - 8; x += 8 )
            {
                __m128 v_src = _mm_loadu_ps(src1 + x);
                __m128 v_res1 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x), v_src),
                    _mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x)));

                v_src = _mm_loadu_ps(src1 + x + 4);
                __m128 v_res2 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x + 4), v_src),
                    _mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x + 4)));

                __m128i v_res1i = _mm_cvtps_epi32(v_res1), v_res2i = _mm_cvtps_epi32(v_res2);
                __m128i v_res = _mm_packs_epi32(_mm_srli_epi32(v_res1i, 16), _mm_srli_epi32(v_res2i, 16));
                _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
            }
        }

        return x;
    }
};

#endif

template <typename T>
static void inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
         const T* src3, size_t step3, uchar* dst, size_t step,
         Size size)
{
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step3 /= sizeof(src3[0]);

    InRange_SSE<T> vop;

    for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
    {
        int x = vop(src1, src2, src3, dst, size.width);
        #if CV_ENABLE_UNROLLED
        for( ; x <= size.width - 4; x += 4 )
        {
            int t0, t1;
            t0 = src2[x] <= src1[x] && src1[x] <= src3[x];
            t1 = src2[x+1] <= src1[x+1] && src1[x+1] <= src3[x+1];
            dst[x] = (uchar)-t0; dst[x+1] = (uchar)-t1;
            t0 = src2[x+2] <= src1[x+2] && src1[x+2] <= src3[x+2];
            t1 = src2[x+3] <= src1[x+3] && src1[x+3] <= src3[x+3];
            dst[x+2] = (uchar)-t0; dst[x+3] = (uchar)-t1;
        }
        #endif
        for( ; x < size.width; x++ )
            dst[x] = (uchar)-(src2[x] <= src1[x] && src1[x] <= src3[x]);
    }
}


static void inRange8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                      const uchar* src3, size_t step3, uchar* dst, size_t step, Size size)
{
    inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}

static void inRange8s(const schar* src1, size_t step1, const schar* src2, size_t step2,
                      const schar* src3, size_t step3, uchar* dst, size_t step, Size size)
{
    inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}

static void inRange16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                       const ushort* src3, size_t step3, uchar* dst, size_t step, Size size)
{
    inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}

static void inRange16s(const short* src1, size_t step1, const short* src2, size_t step2,
                       const short* src3, size_t step3, uchar* dst, size_t step, Size size)
{
    inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}

static void inRange32s(const int* src1, size_t step1, const int* src2, size_t step2,
                       const int* src3, size_t step3, uchar* dst, size_t step, Size size)
{
    inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}

static void inRange32f(const float* src1, size_t step1, const float* src2, size_t step2,
                       const float* src3, size_t step3, uchar* dst, size_t step, Size size)
{
    inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}

static void inRange64f(const double* src1, size_t step1, const double* src2, size_t step2,
                       const double* src3, size_t step3, uchar* dst, size_t step, Size size)
{
    inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}

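// Collapse the per-channel in-range masks (cn interleaved bytes per pixel)
// into a single 8-bit mask by AND-ing the channels together, four at a time.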
static void inRangeReduce(const uchar* src, uchar* dst, size_t len, int cn)
{
    int k = cn % 4 ? cn % 4 : 4;
    size_t i, j;
    if( k == 1 )
        for( i = j = 0; i < len; i++, j += cn )
            dst[i] = src[j];
    else if( k == 2 )
        for( i = j = 0; i < len; i++, j += cn )
            dst[i] = src[j] & src[j+1];
    else if( k == 3 )
        for( i = j = 0; i < len; i++, j += cn )
            dst[i] = src[j] & src[j+1] & src[j+2];
    else
        for( i = j = 0; i < len; i++, j += cn )
            dst[i] = src[j] & src[j+1] & src[j+2] & src[j+3];

    for( ; k < cn; k += 4 )
    {
        for( i = 0, j = k; i < len; i++, j += cn )
            dst[i] &= src[j] & src[j+1] & src[j+2] & src[j+3];
    }
}

typedef void (*InRangeFunc)( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                             const uchar* src3, size_t step3, uchar* dst, size_t step, Size sz );

static InRangeFunc getInRangeFunc(int depth)
{
    static InRangeFunc inRangeTab[] =
    {
        (InRangeFunc)GET_OPTIMIZED(inRange8u), (InRangeFunc)GET_OPTIMIZED(inRange8s), (InRangeFunc)GET_OPTIMIZED(inRange16u),
        (InRangeFunc)GET_OPTIMIZED(inRange16s), (InRangeFunc)GET_OPTIMIZED(inRange32s), (InRangeFunc)GET_OPTIMIZED(inRange32f),
        (InRangeFunc)inRange64f, 0
    };

    return inRangeTab[depth];
}

#ifdef HAVE_OPENCL

static bool ocl_inRange( InputArray _src, InputArray _lowerb,
                         InputArray _upperb, OutputArray _dst )
{
    const ocl::Device & d = ocl::Device::getDefault();
    int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
    Size ssize = _src.size(), lsize = _lowerb.size(), usize = _upperb.size();
    int stype = _src.type(), ltype = _lowerb.type(), utype = _upperb.type();
    int sdepth = CV_MAT_DEPTH(stype), ldepth = CV_MAT_DEPTH(ltype), udepth = CV_MAT_DEPTH(utype);
    int cn = CV_MAT_CN(stype), rowsPerWI = d.isIntel() ? 4 : 1;
    bool lbScalar = false, ubScalar = false;

    if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
        ssize != lsize || stype != ltype )
    {
        if( !checkScalar(_lowerb, stype, lkind, skind) )
            CV_Error( CV_StsUnmatchedSizes,
                     "The lower bounary is neither an array of the same size and same type as src, nor a scalar");
        lbScalar = true;
    }

    if( (ukind == _InputArray::MATX && skind != _InputArray::MATX) ||
        ssize != usize || stype != utype )
    {
        if( !checkScalar(_upperb, stype, ukind, skind) )
            CV_Error( CV_StsUnmatchedSizes,
                     "The upper bounary is neither an array of the same size and same type as src, nor a scalar");
        ubScalar = true;
    }

    if (lbScalar != ubScalar)
        return false;

    bool doubleSupport = d.doubleFPConfig() > 0,
            haveScalar = lbScalar && ubScalar;

    if ( (!doubleSupport && sdepth == CV_64F) ||
         (!haveScalar && (sdepth != ldepth || sdepth != udepth)) )
        return false;

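    // Pick a vector width that is a multiple of the channel count so one work
    // item can process colsPerWI = kercn / cn whole pixels per row.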
    int kercn = haveScalar ? cn : std::max(std::min(ocl::predictOptimalVectorWidth(_src, _lowerb, _upperb, _dst), 4), cn);
    if (kercn % cn != 0)
        kercn = cn;
    int colsPerWI = kercn / cn;
    String opts = format("%s-D cn=%d -D srcT=%s -D srcT1=%s -D dstT=%s -D kercn=%d -D depth=%d%s -D colsPerWI=%d",
                           haveScalar ? "-D HAVE_SCALAR " : "", cn, ocl::typeToStr(CV_MAKE_TYPE(sdepth, kercn)),
                           ocl::typeToStr(sdepth), ocl::typeToStr(CV_8UC(colsPerWI)), kercn, sdepth,
                           doubleSupport ? " -D DOUBLE_SUPPORT" : "", colsPerWI);

    ocl::Kernel ker("inrange", ocl::core::inrange_oclsrc, opts);
    if (ker.empty())
        return false;

    _dst.create(ssize, CV_8UC1);
    UMat src = _src.getUMat(), dst = _dst.getUMat(), lscalaru, uscalaru;
    Mat lscalar, uscalar;

    if (lbScalar && ubScalar)
    {
        lscalar = _lowerb.getMat();
        uscalar = _upperb.getMat();

        size_t esz = src.elemSize();
        size_t blocksize = 36;

        AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128);
        uchar *buf = alignPtr(_buf + blocksize*cn, 16);

        if( ldepth != sdepth && sdepth < CV_32S )
        {
            int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16);
            int* iubuf = ilbuf + cn;

            BinaryFunc sccvtfunc = getConvertFunc(ldepth, CV_32S);
            sccvtfunc(lscalar.data, 1, 0, 1, (uchar*)ilbuf, 1, Size(cn, 1), 0);
            sccvtfunc(uscalar.data, 1, 0, 1, (uchar*)iubuf, 1, Size(cn, 1), 0);
            int minval = cvRound(getMinVal(sdepth)), maxval = cvRound(getMaxVal(sdepth));

            for( int k = 0; k < cn; k++ )
            {
                if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval )
                    ilbuf[k] = minval+1, iubuf[k] = minval;
            }
            lscalar = Mat(cn, 1, CV_32S, ilbuf);
            uscalar = Mat(cn, 1, CV_32S, iubuf);
        }

        lscalar.convertTo(lscalar, stype);
        uscalar.convertTo(uscalar, stype);
    }
    else
    {
        lscalaru = _lowerb.getUMat();
        uscalaru = _upperb.getUMat();
    }

    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
            dstarg = ocl::KernelArg::WriteOnly(dst, 1, colsPerWI);

    if (haveScalar)
    {
        lscalar.copyTo(lscalaru);
        uscalar.copyTo(uscalaru);

        ker.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(lscalaru),
               ocl::KernelArg::PtrReadOnly(uscalaru), rowsPerWI);
    }
    else
        ker.args(srcarg, dstarg, ocl::KernelArg::ReadOnlyNoSize(lscalaru),
               ocl::KernelArg::ReadOnlyNoSize(uscalaru), rowsPerWI);

    size_t globalsize[2] = { ssize.width / colsPerWI, (ssize.height + rowsPerWI - 1) / rowsPerWI };
    return ker.run(2, globalsize, NULL, false);
}

#endif

}

void cv::inRange(InputArray _src, InputArray _lowerb,
                 InputArray _upperb, OutputArray _dst)
{
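    // Usage sketch (illustrative only):
    //     Mat bgr;   // 8UC3 image
    //     Mat mask;
    //     inRange(bgr, Scalar(0, 0, 100), Scalar(50, 50, 255), mask);  // reddish pixels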
    CV_OCL_RUN(_src.dims() <= 2 && _lowerb.dims() <= 2 &&
               _upperb.dims() <= 2 && OCL_PERFORMANCE_CHECK(_dst.isUMat()),
               ocl_inRange(_src, _lowerb, _upperb, _dst))

    int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
    Mat src = _src.getMat(), lb = _lowerb.getMat(), ub = _upperb.getMat();

    bool lbScalar = false, ubScalar = false;

    if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
        src.size != lb.size || src.type() != lb.type() )
    {
        if( !checkScalar(lb, src.type(), lkind, skind) )
            CV_Error( CV_StsUnmatchedSizes,
                     "The lower bounary is neither an array of the same size and same type as src, nor a scalar");
        lbScalar = true;
    }

    if( (ukind == _InputArray::MATX && skind != _InputArray::MATX) ||
        src.size != ub.size || src.type() != ub.type() )
    {
        if( !checkScalar(ub, src.type(), ukind, skind) )
            CV_Error( CV_StsUnmatchedSizes,
                     "The upper bounary is neither an array of the same size and same type as src, nor a scalar");
        ubScalar = true;
    }

    CV_Assert(lbScalar == ubScalar);

    int cn = src.channels(), depth = src.depth();

    size_t esz = src.elemSize();
    size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;

    _dst.create(src.dims, src.size, CV_8UC1);
    Mat dst = _dst.getMat();
    InRangeFunc func = getInRangeFunc(depth);

    const Mat* arrays_sc[] = { &src, &dst, 0 };
    const Mat* arrays_nosc[] = { &src, &dst, &lb, &ub, 0 };
    uchar* ptrs[4];

    NAryMatIterator it(lbScalar && ubScalar ? arrays_sc : arrays_nosc, ptrs);
    size_t total = it.size, blocksize = std::min(total, blocksize0);

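    // Single scratch allocation: blocksize*cn bytes for the multi-channel mask
    // (mbuf), then, when the bounds are scalars, two aligned blocks of unrolled
    // lower/upper bounds, plus room for 2*cn ints used by the conversion below.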
    AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128);
    uchar *buf = _buf, *mbuf = buf, *lbuf = 0, *ubuf = 0;
    buf = alignPtr(buf + blocksize*cn, 16);

    if( lbScalar && ubScalar )
    {
        lbuf = buf;
        ubuf = buf = alignPtr(buf + blocksize*esz, 16);

        CV_Assert( lb.type() == ub.type() );
        int scdepth = lb.depth();

        if( scdepth != depth && depth < CV_32S )
        {
            int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16);
            int* iubuf = ilbuf + cn;

            BinaryFunc sccvtfunc = getConvertFunc(scdepth, CV_32S);
            sccvtfunc(lb.data, 1, 0, 1, (uchar*)ilbuf, 1, Size(cn, 1), 0);
            sccvtfunc(ub.data, 1, 0, 1, (uchar*)iubuf, 1, Size(cn, 1), 0);
            int minval = cvRound(getMinVal(depth)), maxval = cvRound(getMaxVal(depth));

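            // An inverted or out-of-range bound pair is replaced by an empty
            // range (lower > upper), which makes the mask all zeros for that
            // channel instead of overflowing during conversion.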
            for( int k = 0; k < cn; k++ )
            {
                if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval )
                    ilbuf[k] = minval+1, iubuf[k] = minval;
            }
            lb = Mat(cn, 1, CV_32S, ilbuf);
            ub = Mat(cn, 1, CV_32S, iubuf);
        }

        convertAndUnrollScalar( lb, src.type(), lbuf, blocksize );
        convertAndUnrollScalar( ub, src.type(), ubuf, blocksize );
    }

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( size_t j = 0; j < total; j += blocksize )
        {
            int bsz = (int)MIN(total - j, blocksize);
            size_t delta = bsz*esz;
            uchar *lptr = lbuf, *uptr = ubuf;
            if( !lbScalar )
            {
                lptr = ptrs[2];
                ptrs[2] += delta;
            }
            if( !ubScalar )
            {
                int idx = !lbScalar ? 3 : 2;
                uptr = ptrs[idx];
                ptrs[idx] += delta;
            }
            func( ptrs[0], 0, lptr, 0, uptr, 0, cn == 1 ? ptrs[1] : mbuf, 0, Size(bsz*cn, 1));
            if( cn > 1 )
                inRangeReduce(mbuf, ptrs[1], bsz, cn);
            ptrs[0] += delta;
            ptrs[1] += bsz;
        }
    }
}

/****************************************************************************************\
*                                Earlier API: cvAdd etc.                                 *
\****************************************************************************************/

CV_IMPL void
cvNot( const CvArr* srcarr, CvArr* dstarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src.size == dst.size && src.type() == dst.type() );
    cv::bitwise_not( src, dst );
}


CV_IMPL void
cvAnd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_and( src1, src2, dst, mask );
}


CV_IMPL void
cvOr( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_or( src1, src2, dst, mask );
}


CV_IMPL void
cvXor( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_xor( src1, src2, dst, mask );
}


CV_IMPL void
cvAndS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src.size == dst.size && src.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_and( src, (const cv::Scalar&)s, dst, mask );
}


CV_IMPL void
cvOrS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src.size == dst.size && src.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_or( src, (const cv::Scalar&)s, dst, mask );
}


CV_IMPL void
cvXorS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src.size == dst.size && src.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_xor( src, (const cv::Scalar&)s, dst, mask );
}


CV_IMPL void cvAdd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::add( src1, src2, dst, mask, dst.type() );
}


CV_IMPL void cvSub( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::subtract( src1, src2, dst, mask, dst.type() );
}


CV_IMPL void cvAddS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::add( src1, (const cv::Scalar&)value, dst, mask, dst.type() );
}


CV_IMPL void cvSubRS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::subtract( (const cv::Scalar&)value, src1, dst, mask, dst.type() );
}


CV_IMPL void cvMul( const CvArr* srcarr1, const CvArr* srcarr2,
                    CvArr* dstarr, double scale )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    cv::multiply( src1, src2, dst, scale, dst.type() );
}


CV_IMPL void cvDiv( const CvArr* srcarr1, const CvArr* srcarr2,
                    CvArr* dstarr, double scale )
{
    cv::Mat src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src2.size == dst.size && src2.channels() == dst.channels() );

    if( srcarr1 )
        cv::divide( cv::cvarrToMat(srcarr1), src2, dst, scale, dst.type() );
    else
        cv::divide( scale, src2, dst, dst.type() );
}


CV_IMPL void
cvAddWeighted( const CvArr* srcarr1, double alpha,
               const CvArr* srcarr2, double beta,
               double gamma, CvArr* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    cv::addWeighted( src1, alpha, src2, beta, gamma, dst, dst.type() );
}


CV_IMPL  void
cvAbsDiff( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::absdiff( src1, cv::cvarrToMat(srcarr2), dst );
}


CV_IMPL void
cvAbsDiffS( const CvArr* srcarr1, CvArr* dstarr, CvScalar scalar )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::absdiff( src1, (const cv::Scalar&)scalar, dst );
}


CV_IMPL void
cvInRange( const void* srcarr1, const void* srcarr2,
           const void* srcarr3, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && dst.type() == CV_8U );

    cv::inRange( src1, cv::cvarrToMat(srcarr2), cv::cvarrToMat(srcarr3), dst );
}


CV_IMPL void
cvInRangeS( const void* srcarr1, CvScalar lowerb, CvScalar upperb, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && dst.type() == CV_8U );

    cv::inRange( src1, (const cv::Scalar&)lowerb, (const cv::Scalar&)upperb, dst );
}


CV_IMPL void
cvCmp( const void* srcarr1, const void* srcarr2, void* dstarr, int cmp_op )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && dst.type() == CV_8U );

    cv::compare( src1, cv::cvarrToMat(srcarr2), dst, cmp_op );
}


CV_IMPL void
cvCmpS( const void* srcarr1, double value, void* dstarr, int cmp_op )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && dst.type() == CV_8U );

    cv::compare( src1, value, dst, cmp_op );
}


CV_IMPL void
cvMin( const void* srcarr1, const void* srcarr2, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::min( src1, cv::cvarrToMat(srcarr2), dst );
}


CV_IMPL void
cvMax( const void* srcarr1, const void* srcarr2, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::max( src1, cv::cvarrToMat(srcarr2), dst );
}


CV_IMPL void
cvMinS( const void* srcarr1, double value, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::min( src1, value, dst );
}


CV_IMPL void
cvMaxS( const void* srcarr1, double value, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::max( src1, value, dst );
}

/* End of file. */