/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Zhang Ying, zhangying913@gmail.com
//	  Niko Li, newlife20080214@gmail.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

// When DOUBLE_SUPPORT is defined, enable a double-precision extension:
// cl_amd_fp64 if the compiler advertises it, otherwise cl_khr_fp64 if that
// one is advertised. If neither macro is defined, nothing is enabled.
#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif

// Index of the next sample along an axis of length l, clamped to the last
// valid index (l - 1). Both arguments are parenthesized so that expression
// arguments (e.g. `a & b`) expand with the intended precedence.
#define INC(x, l) min((x) + 1, (l) - 1)

// Expands to nothing.
// NOTE(review): presumably lets a build option name a conversion macro that
// should be a no-op -- confirm against the host code.
#define noconvert
// Pixel access helpers. `addr` is a byte address inside a __global buffer.
//
//   loadpix(addr)        read one pixel
//   storepix(val, addr)  write one pixel
//   TSIZE                size of one pixel in bytes, as an int
//
// For cn != 3 a pixel is accessed directly as a single T. For cn == 3 it is
// accessed as three consecutive T1 elements through vload3/vstore3, and TSIZE
// is sizeof(T1) * cn.
//
// Every expansion is fully parenthesized so the macros can be embedded in
// larger expressions without precedence surprises.
#if cn != 3
#define loadpix(addr)  (*(__global const T *)(addr))
#define storepix(val, addr)  (*(__global T *)(addr) = (val))
#define TSIZE ((int)sizeof(T))
#else
#define loadpix(addr)  vload3(0, (__global const T1 *)(addr))
#define storepix(val, addr) vstore3((val), 0, (__global T1 *)(addr))
#define TSIZE ((int)sizeof(T1) * (cn))
#endif

#if defined USE_SAMPLER

// Per-channel-count helpers for the sampler path:
//   READ_IMAGE(X,Y,Z)   read_imagef(X,Y,Z) narrowed to the first cn components
//   INTERMEDIATE_TYPE   float vector type with cn components
// Only cn in 1..4 is handled; other values leave both macros undefined.
#if cn == 1
#define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).x
#define INTERMEDIATE_TYPE  float
#elif cn == 2
#define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).xy
#define INTERMEDIATE_TYPE  float2
#elif cn == 3
#define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).xyz
#define INTERMEDIATE_TYPE  float3
#elif cn == 4
#define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z)
#define INTERMEDIATE_TYPE  float4
#endif

// Token pasting with argument expansion: CAT(a, b) first expands a and b,
// then __CAT glues the results into a single token.
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)

// Disabled alternative to the explicit per-cn INTERMEDIATE_TYPE definitions:
//#define INTERMEDIATE_TYPE CAT(float, cn)

// Alias so that a pasted `float1` (e.g. CAT(float, 1)) names plain float.
#define float1 float

// RESULT_SCALE: factor applied to the float value returned by the sampler
// before it is converted to the destination type. It is chosen from the
// build-time `depth` code.
#if depth == 0
#  define RESULT_SCALE    255.0f      /* largest unsigned 8-bit value  */
#elif depth == 1
#  define RESULT_SCALE    127.0f      /* largest signed 8-bit value    */
#elif depth == 2
#  define RESULT_SCALE    65535.0f    /* largest unsigned 16-bit value */
#elif depth == 3
#  define RESULT_SCALE    32767.0f    /* largest signed 16-bit value   */
#else
#  define RESULT_SCALE    1.0f        /* every other depth: no scaling */
#endif

// Resize kernel that delegates interpolation to the image sampler.
//
// One work-item produces one destination pixel (dx, dy). The source is read
// through a sampler configured for unnormalized coordinates, clamp-to-edge
// addressing and linear filtering, at the position of the destination pixel
// centre scaled by (ifx, ify). The sampled float value is multiplied by
// RESULT_SCALE, rounded when depth <= 4, converted with convertToDT and
// stored.
//
//   srcImage             source image
//   dstptr               destination buffer, addressed in bytes
//   dststep, dstoffset   byte stride between rows / byte offset of pixel (0,0)
//   dstrows, dstcols     destination size; work-items outside it do nothing
//   ifx, ify             factors mapping destination to source coordinates
__kernel void resizeSampler(__read_only image2d_t srcImage,
                            __global uchar* dstptr, int dststep, int dstoffset,
                            int dstrows, int dstcols,
                            float ifx, float ify)
{
    const sampler_t linearClamp = CLK_FILTER_LINEAR |
                                  CLK_ADDRESS_CLAMP_TO_EDGE |
                                  CLK_NORMALIZED_COORDS_FALSE;

    const int dx = get_global_id(0);
    const int dy = get_global_id(1);

    // Work-items that fall outside the destination have nothing to write.
    if (dx >= dstcols || dy >= dstrows)
        return;

    // Centre of the destination pixel expressed in source coordinates.
    const float2 srcPos = (float2)((dx + 0.5f) * ifx, (dy + 0.5f) * ify);

    INTERMEDIATE_TYPE fetched = READ_IMAGE(srcImage, linearClamp, srcPos);
    INTERMEDIATE_TYPE scaled = fetched * RESULT_SCALE;

#if depth <= 4
    T pixel = convertToDT(round(scaled));
#else
    T pixel = convertToDT(scaled);
#endif

    __global uchar* out = dstptr + mad24(dy, dststep, dstoffset + dx*TSIZE);
    storepix(pixel, out);
}

#elif defined INTER_LINEAR_INTEGER
// Fixed-point linear resize driven by precomputed tables.
//
// One work-item produces one destination pixel (dx, dy).
//
// `buffer` is read with the following layout:
//   int   xofs[dst_cols]         source column for each destination column
//   int   yofs[dst_rows]         source row for each destination row
//   short ialpha[2 * dst_cols]   pair of horizontal weights per column
//   short ibeta[2 * dst_rows]    pair of vertical weights per row
//
//   srcptr / dstptr      source / destination buffers, addressed in bytes
//   *_step, *_offset     byte stride between rows / byte offset of pixel (0,0)
//   *_rows, *_cols       image sizes in pixels
__kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                       __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                       __global const uchar * buffer)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if (dx < dst_cols && dy < dst_rows)
    {
        __global const int * xofs = (__global const int *)(buffer), * yofs = xofs + dst_cols;
        __global const short * ialpha = (__global const short *)(yofs + dst_rows);
        __global const short * ibeta = ialpha + ((dst_cols + dy) << 1);
        ialpha += dx << 1;

        // Clamp both taps on each axis so that neither the "+1" neighbour nor
        // an out-of-range table entry can address memory outside the image.
        int sx0 = clamp(xofs[dx], 0, src_cols - 1),
            sx1 = clamp(xofs[dx] + 1, 0, src_cols - 1);
        int sy0 = clamp(yofs[dy], 0, src_rows - 1),
            sy1 = clamp(yofs[dy] + 1, 0, src_rows - 1);
        short a0 = ialpha[0], a1 = ialpha[1];
        short b0 = ibeta[0], b1 = ibeta[1];

        // Byte offsets of the left/right taps within a row.
        int col0 = mad24(sx0, TSIZE, src_offset),
            col1 = mad24(sx1, TSIZE, src_offset);

        WT data0 = convertToWT(loadpix(srcptr + mad24(sy0, src_step, col0)));
        WT data1 = convertToWT(loadpix(srcptr + mad24(sy0, src_step, col1)));
        WT data2 = convertToWT(loadpix(srcptr + mad24(sy1, src_step, col0)));
        WT data3 = convertToWT(loadpix(srcptr + mad24(sy1, src_step, col1)));

        // Horizontal blend, then vertical blend. The shifts (4 + 16 here, 2
        // more below with rounding) remove 22 fractional bits in total.
        // NOTE(review): presumably two 11-bit weights -- confirm with the host
        // code that fills `buffer`.
        WT val = ( (((data0 * a0 + data1 * a1) >> 4) * b0) >> 16) +
                 ( (((data2 * a0 + data3 * a1) >> 4) * b1) >> 16);

        storepix(convertToDT((val + 2) >> 2),
                 dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
    }
}
#elif defined INTER_LINEAR
// Linear resize computed per pixel from the scale factors.
//
// One work-item produces one destination pixel (dx, dy). The centre of the
// destination pixel is mapped into source coordinates, the four surrounding
// source pixels are loaded (with indices clamped to the image), and they are
// blended with weights derived from the fractional position (u, v).
//
// For depth <= 1 the blend uses integer weights scaled by
// 2^INTER_RESIZE_COEF_BITS; for other depths it uses float weights.
//
//   srcptr / dstptr      source / destination buffers, addressed in bytes
//   *_step, *_offset     byte stride between rows / byte offset of pixel (0,0)
//   *_rows, *_cols       image sizes in pixels
//   ifx, ify             factors mapping destination to source coordinates
__kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                       __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                       float ifx, float ify)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if (dx < dst_cols && dy < dst_rows)
    {
        // Destination pixel centre expressed in source pixel coordinates.
        float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f);
        int x = floor(sx), y = floor(sy);

        // Fractional position inside the source cell.
        float u = sx - x, v = sy - y;

        // Outside the image: snap to the border pixel and drop the fraction.
        if (x < 0) { x = 0; u = 0; }
        if (x >= src_cols) { x = src_cols - 1; u = 0; }
        if (y < 0) { y = 0; v = 0; }
        if (y >= src_rows) { y = src_rows - 1; v = 0; }

        // Neighbour indices, clamped to the last row/column.
        int y_ = INC(y, src_rows);
        int x_ = INC(x, src_cols);

        // The four neighbours are needed by both arithmetic paths.
        WT data0 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))));
        WT data1 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset))));
        WT data2 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset))));
        WT data3 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset))));

#if depth <= 1  // 8U/8S only, 16U+ cause integer overflows
#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
#define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
        u = u * INTER_RESIZE_COEF_SCALE;
        v = v * INTER_RESIZE_COEF_SCALE;

        int U = rint(u);
        int V = rint(v);
        int U1 = rint(INTER_RESIZE_COEF_SCALE - u);
        int V1 = rint(INTER_RESIZE_COEF_SCALE - v);

        WT val = mul24((WT)mul24(U1, V1), data0) + mul24((WT)mul24(U, V1), data1) +
                   mul24((WT)mul24(U1, V), data2) + mul24((WT)mul24(U, V), data3);

        // Each weight product carries CAST_BITS fractional bits; round, then drop them.
        T uval = convertToDT((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
#else
        float u1 = 1.f - u;
        float v1 = 1.f - v;

        T uval = convertToDT((u1 * v1) * data0 + (u * v1) * data1 + (u1 * v) * data2 + (u * v) * data3);
#endif
        storepix(uval, dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
    }
}

#elif defined INTER_NEAREST

// Nearest-neighbour resize.
//
// One work-item produces one destination pixel (dx, dy) by copying the source
// pixel at (trunc(dx * ifx), trunc(dy * ify)), with each index limited to the
// last column/row of the source.
//
//   srcptr / dstptr      source / destination buffers, addressed in bytes
//   *_step, *_offset     byte stride between rows / byte offset of pixel (0,0)
//   *_rows, *_cols       image sizes in pixels
//   ifx, ify             factors mapping destination to source coordinates
__kernel void resizeNN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                       __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                       float ifx, float ify)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if (dx < dst_cols && dy < dst_rows)
    {
        float s1 = dx * ifx;
        float s2 = dy * ify;
        // Round toward zero, then keep the index inside the source image.
        int sx = min(convert_int_rtz(s1), src_cols - 1);
        int sy = min(convert_int_rtz(s2), src_rows - 1);

        storepix(loadpix(srcptr + mad24(sy, src_step, mad24(sx, TSIZE, src_offset))),
                 dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
    }
}

#elif defined INTER_AREA

#ifdef INTER_AREA_FAST

// Area resize for integer shrink factors.
//
// One work-item produces one destination pixel (dx, dy) by summing the
// XSCALE x YSCALE window of source pixels whose top-left corner is
// (XSCALE * dx, YSCALE * dy), multiplying the sum by SCALE and converting it
// to T. Window coordinates are limited to the last source row/column.
//
// XSCALE, YSCALE and SCALE are build-time constants.
// NOTE(review): SCALE is presumably 1 / (XSCALE * YSCALE) -- confirm with the
// host code.
//
//   src / dst            source / destination buffers, addressed in bytes
//   *_step, *_offset     byte stride between rows / byte offset of pixel (0,0)
//   *_rows, *_cols       image sizes in pixels
__kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                              __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if (dx < dst_cols && dy < dst_rows)
    {
        int dst_index = mad24(dy, dst_step, dst_offset);

        // Top-left corner of the source window.
        int sx = XSCALE * dx;
        int sy = YSCALE * dy;
        WTV sum = (WTV)(0);

        #pragma unroll
        for (int py = 0; py < YSCALE; ++py)
        {
            int y = min(sy + py, src_rows - 1);
            int src_index = mad24(y, src_step, src_offset);
            #pragma unroll
            for (int px = 0; px < XSCALE; ++px)
            {
                int x = min(sx + px, src_cols - 1);
                sum += convertToWTV(loadpix(src + src_index + x*TSIZE));
            }
        }

        storepix(convertToT(convertToWT2V(sum) * (WT2V)(SCALE)), dst + mad24(dx, TSIZE, dst_index));
    }
}

#else

// Area resize driven by precomputed tables.
//
// One work-item produces one destination pixel (dx, dy) as a weighted sum of
// a rectangle of source pixels.
//
// Tables, as they are read below:
//   ofs_tab    xofs at [0 .. dst_cols], yofs starting at index dst_cols + 1.
//              Entries [d] and [d + 1] bracket the range of table indices
//              that belong to destination coordinate d.
//   map_tab    xmap at the start, ymap starting at index 2 * src_cols;
//              maps a table index to a source column / row.
//   alpha_tab  xalpha at the start, yalpha starting at index 2 * src_cols;
//              weight for each table index.
//
//   src / dst            source / destination buffers, addressed in bytes
//   *_step, *_offset     byte stride between rows / byte offset of pixel (0,0)
//   *_rows, *_cols       image sizes in pixels
//   ifx, ify             not used by this kernel body
__kernel void resizeAREA(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                         float ifx, float ify, __global const int * ofs_tab,
                         __global const int * map_tab, __global const float * alpha_tab)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if (dx < dst_cols && dy < dst_rows)
    {
        int dst_index = mad24(dy, dst_step, dst_offset);

        __global const int * xmap_tab = map_tab;
        __global const int * ymap_tab = (__global const int *)(map_tab + (src_cols << 1));
        __global const float * xalpha_tab = alpha_tab;
        __global const float * yalpha_tab = (__global const float *)(alpha_tab + (src_cols << 1));
        __global const int * xofs_tab = ofs_tab;
        __global const int * yofs_tab = (__global const int *)(ofs_tab + dst_cols + 1);

        // Half-open ranges [k0, k1) of table indices for this destination pixel.
        int xk0 = xofs_tab[dx], xk1 = xofs_tab[dx + 1];
        int yk0 = yofs_tab[dy], yk1 = yofs_tab[dy + 1];

        // First and last source row / column covered by those ranges.
        int sy0 = ymap_tab[yk0], sy1 = ymap_tab[yk1 - 1];
        int sx0 = xmap_tab[xk0], sx1 = xmap_tab[xk1 - 1];

        WTV sum = (WTV)(0), buf;
        int src_index = mad24(sy0, src_step, src_offset);

        for (int sy = sy0, yk = yk0; sy <= sy1; ++sy, src_index += src_step, ++yk)
        {
            WTV beta = (WTV)(yalpha_tab[yk]);
            buf = (WTV)(0);

            // Weighted sum along the row, then weight the row as a whole.
            for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)
            {
                WTV alpha = (WTV)(xalpha_tab[xk]);
                buf += convertToWTV(loadpix(src + mad24(sx, TSIZE, src_index))) * alpha;
            }
            sum += buf * beta;
        }

        storepix(convertToT(sum), dst + mad24(dx, TSIZE, dst_index));
    }
}

#endif

#endif