/** * By downloading, copying, installing or using the software you agree to this license. * If you do not agree to this license, do not download, install, * copy or use the software. * * * License Agreement * For Open Source Computer Vision Library * (3-clause BSD License) * * Copyright (C) 2000-2020, Intel Corporation, all rights reserved. * Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. * Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved. * Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. * Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved. * Copyright (C) 2015-2016, Itseez Inc., all rights reserved. * Copyright (C) 2019-2020, Xperience AI, all rights reserved. * Third party copyrights are property of their respective owners. * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * * Neither the names of the copyright holders nor the names of the contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * This software is provided by the copyright holders and contributors "as is" and * any express or implied warranties, including, but not limited to, the implied * warranties of merchantability and fitness for a particular purpose are disclaimed. * In no event shall copyright holders or contributors be liable for any direct, * indirect, incidental, special, exemplary, or consequential damages * (including, but not limited to, procurement of substitute goods or services; * loss of use, data, or profits; or business interruption) however caused * and on any theory of liability, whether in contract, strict liability, * or tort (including negligence or otherwise) arising in any way out of * the use of this software, even if advised of the possibility of such damage. * * --------------------------------------------------------------------------- * \file dnn/src/common/warp_common.h * * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * * This file has been modified by Megvii ("Megvii Modifications"). * All Megvii Modifications are Copyright (C) 2014-2021 Megvii Inc. All rights reserved. * * --------------------------------------------------------------------------- */ #pragma once #include "megdnn/dtype.h" #include "src/common/cv/common.h" #include "src/common/cv/helper.h" #include "src/common/cv/interp_helper.h" #include "src/common/rounding_converter.cuh" #include "src/common/utils.h" #include "include/megdnn/oprs.h" #include "midout.h" #if MEGDNN_X86 #include #elif MEGDNN_AARCH64 || MEGDNN_ARMV7 #include "src/arm_common/simd_macro/marm_neon.h" #endif MIDOUT_DECL(megdnn_warp) MIDOUT_DECL(remapBilinear_bmode) MIDOUT_DECL(remapBilinear_ch) namespace megdnn { namespace warp { bool is_cv_available( const TensorLayout& src, const TensorLayout& mat, const TensorLayout& dst, param::WarpAffine::InterpolationMode imode, param::WarpAffine::Format format); bool is_dnn_available( const TensorLayout&, const TensorLayout&, const TensorLayout&, param::WarpAffine::InterpolationMode imode, param::WarpAffine::Format format); bool is_dnn_available( const TensorLayoutArray&, const TensorLayout&, const TensorLayout&, param::WarpAffine::InterpolationMode imode, param::WarpAffine::Format format); using namespace megcv; using IMode = InterpolationMode; using BMode = BorderMode; using InterpTable = InterpolationTable<>; constexpr int INTER_REMAP_COEF_BITS = InterpTable::INTER_REMAP_COEF_BITS; constexpr int INTER_BITS = InterpTable::INTER_BITS; constexpr int INTER_TAB_SIZE = InterpTable::INTER_TAB_SIZE; constexpr int INTER_TAB_SIZE2 = InterpTable::INTER_TAB_SIZE2; constexpr int INTER_REMAP_COEF_SCALE = InterpTable::INTER_REMAP_COEF_SCALE; template struct RemapVec { int operator()( const Mat&, void*, const short*, const ushort*, const void*, int) const { return 0; } }; #if MEGDNN_X86 template struct RemapVec { int operator()( const Mat8u& _src, void* _dst, const short* XY, const ushort* FXY, const void* _wtab, int width) const { int x = 0, sstep = (int)_src.step(); if ((CH != 1 && CH != 3) || sstep > 0x8000) return 0; const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1); const short* wtab = CH == 1 ? (const short*)_wtab : InterpTable::get_linear_ic4_table(); uchar* D = (uchar*)_dst; __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE / 2); __m128i xy2ofs = _mm_set1_epi32(CH + (sstep << 16)); __m128i z = _mm_setzero_si128(); alignas(16) int iofs0[4]; alignas(16) int iofs1[4]; if (CH == 1) { for (; x <= width - 8; x += 8) { __m128i xy0 = _mm_loadu_si128((const __m128i*)(XY + x * 2)); __m128i xy1 = _mm_loadu_si128((const __m128i*)(XY + x * 2 + 8)); __m128i v0, v1, v2, v3, a0, a1, b0, b1; unsigned i0, i1; xy0 = _mm_madd_epi16(xy0, xy2ofs); xy1 = _mm_madd_epi16(xy1, xy2ofs); _mm_store_si128((__m128i*)iofs0, xy0); _mm_store_si128((__m128i*)iofs1, xy1); i0 = *(ushort*)(S0 + iofs0[0]) + (*(ushort*)(S0 + iofs0[1]) << 16); i1 = *(ushort*)(S0 + iofs0[2]) + (*(ushort*)(S0 + iofs0[3]) << 16); v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); i0 = *(ushort*)(S1 + iofs0[0]) + (*(ushort*)(S1 + iofs0[1]) << 16); i1 = *(ushort*)(S1 + iofs0[2]) + (*(ushort*)(S1 + iofs0[3]) << 16); v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); v0 = _mm_unpacklo_epi8(v0, z); v1 = _mm_unpacklo_epi8(v1, z); a0 = _mm_unpacklo_epi32( _mm_loadl_epi64((__m128i*)(wtab + FXY[x] * 4)), _mm_loadl_epi64((__m128i*)(wtab + FXY[x + 1] * 4))); a1 = _mm_unpacklo_epi32( _mm_loadl_epi64((__m128i*)(wtab + FXY[x + 2] * 4)), _mm_loadl_epi64((__m128i*)(wtab + FXY[x + 3] * 4))); b0 = _mm_unpacklo_epi64(a0, a1); b1 = _mm_unpackhi_epi64(a0, a1); v0 = _mm_madd_epi16(v0, b0); v1 = _mm_madd_epi16(v1, b1); v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta); i0 = *(ushort*)(S0 + iofs1[0]) + (*(ushort*)(S0 + iofs1[1]) << 16); i1 = *(ushort*)(S0 + iofs1[2]) + (*(ushort*)(S0 + iofs1[3]) << 16); v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); i0 = *(ushort*)(S1 + iofs1[0]) + (*(ushort*)(S1 + iofs1[1]) << 16); i1 = *(ushort*)(S1 + iofs1[2]) + (*(ushort*)(S1 + iofs1[3]) << 16); v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); v2 = _mm_unpacklo_epi8(v2, z); v3 = _mm_unpacklo_epi8(v3, z); a0 = _mm_unpacklo_epi32( _mm_loadl_epi64((__m128i*)(wtab + FXY[x + 4] * 4)), _mm_loadl_epi64((__m128i*)(wtab + FXY[x + 5] * 4))); a1 = _mm_unpacklo_epi32( _mm_loadl_epi64((__m128i*)(wtab + FXY[x + 6] * 4)), _mm_loadl_epi64((__m128i*)(wtab + FXY[x + 7] * 4))); b0 = _mm_unpacklo_epi64(a0, a1); b1 = _mm_unpackhi_epi64(a0, a1); v2 = _mm_madd_epi16(v2, b0); v3 = _mm_madd_epi16(v3, b1); v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta); v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS); v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS); v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z); _mm_storel_epi64((__m128i*)(D + x), v0); } } else if (CH == 3) { for (; x <= width - 5; x += 4, D += 12) { __m128i xy0 = _mm_loadu_si128((const __m128i*)(XY + x * 2)); __m128i u0, v0, u1, v1; xy0 = _mm_madd_epi16(xy0, xy2ofs); _mm_store_si128((__m128i*)iofs0, xy0); const __m128i *w0, *w1; w0 = (const __m128i*)(wtab + FXY[x] * 16); w1 = (const __m128i*)(wtab + FXY[x + 1] * 16); u0 = _mm_unpacklo_epi8( _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])), _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3))); v0 = _mm_unpacklo_epi8( _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])), _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3))); u1 = _mm_unpacklo_epi8( _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])), _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3))); v1 = _mm_unpacklo_epi8( _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])), _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3))); u0 = _mm_unpacklo_epi8(u0, z); v0 = _mm_unpacklo_epi8(v0, z); u1 = _mm_unpacklo_epi8(u1, z); v1 = _mm_unpacklo_epi8(v1, z); u0 = _mm_add_epi32( _mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1])); u1 = _mm_add_epi32( _mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); u0 = _mm_slli_si128(u0, 4); u0 = _mm_packs_epi32(u0, u1); u0 = _mm_packus_epi16(u0, u0); _mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0, 1)); w0 = (const __m128i*)(wtab + FXY[x + 2] * 16); w1 = (const __m128i*)(wtab + FXY[x + 3] * 16); u0 = _mm_unpacklo_epi8( _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])), _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3))); v0 = _mm_unpacklo_epi8( _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])), _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3))); u1 = _mm_unpacklo_epi8( _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])), _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3))); v1 = _mm_unpacklo_epi8( _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])), _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3))); u0 = _mm_unpacklo_epi8(u0, z); v0 = _mm_unpacklo_epi8(v0, z); u1 = _mm_unpacklo_epi8(u1, z); v1 = _mm_unpacklo_epi8(v1, z); u0 = _mm_add_epi32( _mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1])); u1 = _mm_add_epi32( _mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); u0 = _mm_slli_si128(u0, 4); u0 = _mm_packs_epi32(u0, u1); u0 = _mm_packus_epi16(u0, u0); _mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0, 1)); } } return x; } }; #endif template using RemapNNFunc = void (*)( const Mat& _src, Mat& _dst, const Mat& _xy, const T* bvalue); template using RemapFunc = void (*)( const Mat& _src, Mat& _dst, const Mat& _xy, const Mat& _fxy, const void* _wtab, const T* bvalue); template static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy, const T* bvalue) { const T* S0 = _src.ptr(); size_t sstep = _src.step(); int dx, dy; int width1 = _src.width(), height1 = _src.height(); int swidth = _src.width(), sheight = _src.height(); int dwidth = _dst.width(), dheight = _dst.height(); if (_dst.is_continuous() && _xy.is_continuous()) { dwidth *= dheight; dheight = 1; } for (dy = 0; dy < dheight; dy++) { T* D = _dst.ptr(dy); const short* XY = _xy.ptr(dy); if (CH == 1) { for (dx = 0; dx < dwidth; dx++) { int sx = XY[dx * 2], sy = XY[dx * 2 + 1]; if ((unsigned)sx < (unsigned)width1 && (unsigned)sy < (unsigned)height1) { D[dx] = S0[sy * sstep + sx]; } else { if (bmode == BMode::BORDER_REPLICATE) { sx = saturate(sx, 0, swidth); sy = saturate(sy, 0, sheight); D[dx] = S0[sy * sstep + sx]; } else if (bmode == BMode::BORDER_CONSTANT) D[dx] = bvalue[0]; else if (bmode != BMode::BORDER_TRANSPARENT) { sx = border_interpolate(sx, swidth); sy = border_interpolate(sy, sheight); D[dx] = S0[sy * sstep + sx]; } } } } else { for (dx = 0; dx < dwidth; dx++, D += CH) { int sx = XY[dx * 2], sy = XY[dx * 2 + 1]; const T* S; if ((unsigned)sx < (unsigned)width1 && (unsigned)sy < (unsigned)height1) { S = S0 + sy * sstep + sx * CH; for (size_t i = 0; i < CH; i++) { D[i] = S[i]; } } else if (bmode != BMode::BORDER_TRANSPARENT) { if (bmode == BMode::BORDER_REPLICATE) { sx = saturate(sx, 0, swidth); sy = saturate(sy, 0, sheight); S = S0 + sy * sstep + sx * CH; } else if (bmode == BMode::BORDER_CONSTANT) S = bvalue; else { sx = border_interpolate(sx, swidth); sy = border_interpolate(sy, sheight); S = S0 + sy * sstep + sx * CH; } for (size_t i = 0; i < CH; i++) { D[i] = S[i]; } } } } } } template static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy, const Mat& _fxy, const void* _wtab, const T* bvalue) { typedef typename CastOp::type1 WT; const AT* wtab = (const AT*)_wtab; const T* S0 = _src.ptr(); size_t sstep = _src.step(); int dx, dy; CastOp castOp; int swidth = _src.width(), sheight = _src.height(); int dwidth = _dst.width(), dheight = _dst.height(); unsigned width1 = std::max(swidth - 3, 0), height1 = std::max(sheight - 3, 0); if (_dst.is_continuous() && _xy.is_continuous() && _fxy.is_continuous()) { dwidth *= dheight; dheight = 1; } for (dy = 0; dy < dheight; dy++) { T* D = _dst.ptr(dy); const short* XY = _xy.ptr(dy); const ushort* FXY = _fxy.ptr(dy); for (dx = 0; dx < dwidth; dx++, D += CH) { int sx = XY[dx * 2] - 1, sy = XY[dx * 2 + 1] - 1; const AT* w = wtab + FXY[dx] * 16; size_t i, k; if ((unsigned)sx < width1 && (unsigned)sy < height1) { const T* S = S0 + sy * sstep + sx * CH; for (k = 0; k < CH; k++) { WT sum = S[0] * w[0] + S[CH] * w[1] + S[CH * 2] * w[2] + S[CH * 3] * w[3]; S += sstep; sum += S[0] * w[4] + S[CH] * w[5] + S[CH * 2] * w[6] + S[CH * 3] * w[7]; S += sstep; sum += S[0] * w[8] + S[CH] * w[9] + S[CH * 2] * w[10] + S[CH * 3] * w[11]; S += sstep; sum += S[0] * w[12] + S[CH] * w[13] + S[CH * 2] * w[14] + S[CH * 3] * w[15]; S += 1 - sstep * 3; D[k] = castOp(sum); } } else { int x[4], y[4]; if (bmode == BMode::BORDER_TRANSPARENT && ((unsigned)(sx + 1) >= (unsigned)swidth || (unsigned)(sy + 1) >= (unsigned)sheight)) continue; if (bmode == BMode::BORDER_CONSTANT && (sx >= swidth || sx + 4 <= 0 || sy >= sheight || sy + 4 <= 0)) { for (size_t i = 0; i < CH; i++) { D[i] = bvalue[i]; } continue; } for (i = 0; i < 4; i++) { x[i] = border_interpolate(sx + i, swidth) * CH; y[i] = border_interpolate(sy + i, sheight); } for (k = 0; k < CH; k++, S0++, w -= 16) { WT cv = bvalue[k], sum = cv * ONE; for (i = 0; i < 4; i++, w += 4) { int yi = y[i]; const T* S = S0 + yi * sstep; if (yi < 0) continue; if (x[0] >= 0) sum += (S[x[0]] - cv) * w[0]; if (x[1] >= 0) sum += (S[x[1]] - cv) * w[1]; if (x[2] >= 0) sum += (S[x[2]] - cv) * w[2]; if (x[3] >= 0) sum += (S[x[3]] - cv) * w[3]; } D[k] = castOp(sum); } S0 -= CH; } } } } template < class CastOp, class VecOp, typename AT, typename T, BorderMode bmode, size_t CH> static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy, const Mat& _fxy, const void* _wtab, const T* bvalue) { MIDOUT_BEGIN(remapBilinear_bmode, midout_iv(bmode)) { typedef typename CastOp::type1 WT; const AT* wtab = (const AT*)_wtab; const T* S0 = _src.ptr(); size_t sstep = _src.step(); int dx, dy; CastOp castOp; VecOp vecOp; int swidth = _src.width(), sheight = _src.height(); int dwidth = _dst.width(), dheight = _dst.height(); unsigned width1 = std::max(swidth - 1, 0), height1 = std::max(sheight - 1, 0); for (dy = 0; dy < dheight; dy++) { T* D = _dst.ptr(dy); const short* XY = _xy.ptr(dy); const ushort* FXY = _fxy.ptr(dy); int X0 = 0; bool prevInlier = false; for (dx = 0; dx <= dwidth; dx++) { bool curInlier = dx < dwidth ? (unsigned)XY[dx * 2] < width1 && (unsigned)XY[dx * 2 + 1] < height1 : !prevInlier; if (curInlier == prevInlier) continue; int X1 = dx; dx = X0; X0 = X1; prevInlier = curInlier; if (!curInlier) { int len = vecOp(_src, D, XY + dx * 2, FXY + dx, wtab, X1 - dx); D += len * CH; dx += len; if (CH == 1) { MIDOUT_BEGIN(remapBilinear_bmode, 0, 1) { for (; dx < X1; dx++, D++) { int sx = XY[dx * 2], sy = XY[dx * 2 + 1]; const AT* w = wtab + FXY[dx] * 4; const T* S = S0 + sy * sstep + sx; *D = castOp( WT(S[0] * w[0] + S[1] * w[1] + S[sstep] * w[2] + S[sstep + 1] * w[3])); } } MIDOUT_END(); } else if (CH == 2) { MIDOUT_BEGIN(remapBilinear_bmode, 0, 2) { for (; dx < X1; dx++, D += 2) { int sx = XY[dx * 2], sy = XY[dx * 2 + 1]; const AT* w = wtab + FXY[dx] * 4; const T* S = S0 + sy * sstep + sx * 2; WT t0 = S[0] * w[0] + S[2] * w[1] + S[sstep] * w[2] + S[sstep + 2] * w[3]; WT t1 = S[1] * w[0] + S[3] * w[1] + S[sstep + 1] * w[2] + S[sstep + 3] * w[3]; D[0] = castOp(t0); D[1] = castOp(t1); } } MIDOUT_END(); } else if (CH == 3) MIDOUT_BEGIN(remapBilinear_bmode, 0, 3) { for (; dx < X1; dx++, D += 3) { int sx = XY[dx * 2], sy = XY[dx * 2 + 1]; const AT* w = wtab + FXY[dx] * 4; const T* S = S0 + sy * sstep + sx * 3; WT t0 = S[0] * w[0] + S[3] * w[1] + S[sstep] * w[2] + S[sstep + 3] * w[3]; WT t1 = S[1] * w[0] + S[4] * w[1] + S[sstep + 1] * w[2] + S[sstep + 4] * w[3]; WT t2 = S[2] * w[0] + S[5] * w[1] + S[sstep + 2] * w[2] + S[sstep + 5] * w[3]; D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2); } } MIDOUT_END(); else megdnn_throw("nr. of channels must be 1/2/3."); } else { if (bmode == BMode::BORDER_TRANSPARENT && CH != 3) { megdnn_throw( "unsupported Linear InterpolationMode" " with BORDER_TRANSPARENT and channel size 1"); continue; } if (CH == 1) { MIDOUT_BEGIN(remapBilinear_bmode, 1, 1) { for (; dx < X1; dx++, D++) { int sx = XY[dx * 2], sy = XY[dx * 2 + 1]; if (bmode == BMode::BORDER_CONSTANT && (sx >= swidth || sx + 1 < 0 || sy >= sheight || sy + 1 < 0)) { D[0] = bvalue[0]; } else { int sx0, sx1, sy0, sy1; T v0, v1, v2, v3; const AT* w = wtab + FXY[dx] * 4; if (bmode == BMode::BORDER_REPLICATE) { sx0 = saturate(sx, 0, swidth); sx1 = saturate(sx + 1, 0, swidth); sy0 = saturate(sy, 0, sheight); sy1 = saturate(sy + 1, 0, sheight); v0 = S0[sy0 * sstep + sx0]; v1 = S0[sy0 * sstep + sx1]; v2 = S0[sy1 * sstep + sx0]; v3 = S0[sy1 * sstep + sx1]; } else { sx0 = border_interpolate(sx, swidth); sx1 = border_interpolate(sx + 1, swidth); sy0 = border_interpolate(sy, sheight); sy1 = border_interpolate( sy + 1, sheight); v0 = sx0 >= 0 && sy0 >= 0 ? S0[sy0 * sstep + sx0] : bvalue[0]; v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0 * sstep + sx1] : bvalue[0]; v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1 * sstep + sx0] : bvalue[0]; v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1 * sstep + sx1] : bvalue[0]; } D[0] = castOp( WT(v0 * w[0] + v1 * w[1] + v2 * w[2] + v3 * w[3])); } } } MIDOUT_END(); } else { for (; dx < X1; dx++, D += CH) { int sx = XY[dx * 2], sy = XY[dx * 2 + 1]; if (bmode == BMode::BORDER_CONSTANT && (sx >= swidth || sx + 1 < 0 || sy >= sheight || sy + 1 < 0)) { for (size_t k = 0; k < CH; k++) D[k] = bvalue[k]; } else { int sx0, sx1, sy0, sy1; const T *v0, *v1, *v2, *v3; const AT* w = wtab + FXY[dx] * 4; if (bmode == BMode::BORDER_REPLICATE) { sx0 = saturate(sx, 0, swidth); sx1 = saturate(sx + 1, 0, swidth); sy0 = saturate(sy, 0, sheight); sy1 = saturate(sy + 1, 0, sheight); v0 = S0 + sy0 * sstep + sx0 * CH; v1 = S0 + sy0 * sstep + sx1 * CH; v2 = S0 + sy1 * sstep + sx0 * CH; v3 = S0 + sy1 * sstep + sx1 * CH; } else if ( bmode == BMode::BORDER_TRANSPARENT && ((unsigned)sx >= (unsigned)(swidth - 1) || (unsigned)sy >= (unsigned)(sheight - 1))) continue; else { sx0 = border_interpolate(sx, swidth); sx1 = border_interpolate(sx + 1, swidth); sy0 = border_interpolate(sy, sheight); sy1 = border_interpolate(sy + 1, sheight); v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0 * sstep + sx0 * CH : &bvalue[0]; v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0 * sstep + sx1 * CH : &bvalue[0]; v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1 * sstep + sx0 * CH : &bvalue[0]; v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1 * sstep + sx1 * CH : &bvalue[0]; } for (size_t k = 0; k < CH; k++) { D[k] = castOp( WT(v0[k] * w[0] + v1[k] * w[1] + v2[k] * w[2] + v3[k] * w[3])); } } } } } } } } MIDOUT_END(); } template static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy, const Mat& _fxy, const void* _wtab, const T* bvalue) { typedef typename CastOp::type1 WT; const AT* wtab = (const AT*)_wtab; const T* S0 = _src.ptr(); size_t sstep = _src.step(); int dx, dy; CastOp castOp; int swidth = _src.width(), sheight = _src.height(); int dwidth = _dst.width(), dheight = _dst.height(); unsigned width1 = std::max(swidth - 7, 0), height1 = std::max(sheight - 7, 0); if (_dst.is_continuous() && _xy.is_continuous() && _fxy.is_continuous()) { dwidth *= dheight; dheight = 1; } for (dy = 0; dy < dheight; dy++) { T* D = _dst.ptr(dy); const short* XY = _xy.ptr(dy); const ushort* FXY = _fxy.ptr(dy); for (dx = 0; dx < dwidth; dx++, D += CH) { int sx = XY[dx * 2] - 3, sy = XY[dx * 2 + 1] - 3; const AT* w = wtab + FXY[dx] * 64; const T* S = S0 + sy * sstep + sx * CH; size_t i, k; if ((unsigned)sx < width1 && (unsigned)sy < height1) { for (k = 0; k < CH; k++) { WT sum = 0; for (int r = 0; r < 8; r++, S += sstep, w += 8) sum += S[0] * w[0] + S[CH] * w[1] + S[CH * 2] * w[2] + S[CH * 3] * w[3] + S[CH * 4] * w[4] + S[CH * 5] * w[5] + S[CH * 6] * w[6] + S[CH * 7] * w[7]; w -= 64; S -= sstep * 8 - 1; D[k] = castOp(sum); } } else { int x[8], y[8]; if (bmode == BMode::BORDER_TRANSPARENT && ((unsigned)(sx + 3) >= (unsigned)swidth || (unsigned)(sy + 3) >= (unsigned)sheight)) continue; if (bmode == BMode::BORDER_CONSTANT && (sx >= swidth || sx + 8 <= 0 || sy >= sheight || sy + 8 <= 0)) { for (size_t i = 0; i < CH; i++) { D[i] = bvalue[i]; } continue; } for (i = 0; i < 8; i++) { x[i] = border_interpolate(sx + i, swidth) * CH; y[i] = border_interpolate(sy + i, sheight); } for (k = 0; k < CH; k++, S0++, w -= 64) { WT cv = bvalue[k], sum = cv * ONE; for (i = 0; i < 8; i++, w += 8) { int yi = y[i]; const T* S1 = S0 + yi * sstep; if (yi < 0) continue; if (x[0] >= 0) sum += (S1[x[0]] - cv) * w[0]; if (x[1] >= 0) sum += (S1[x[1]] - cv) * w[1]; if (x[2] >= 0) sum += (S1[x[2]] - cv) * w[2]; if (x[3] >= 0) sum += (S1[x[3]] - cv) * w[3]; if (x[4] >= 0) sum += (S1[x[4]] - cv) * w[4]; if (x[5] >= 0) sum += (S1[x[5]] - cv) * w[5]; if (x[6] >= 0) sum += (S1[x[6]] - cv) * w[6]; if (x[7] >= 0) sum += (S1[x[7]] - cv) * w[7]; } D[k] = castOp(sum); } S0 -= CH; } } } } template < typename T, InterpolationMode imode, BorderMode bmode, size_t CH, typename RemapVec> struct RemapFuncHolder; template struct RemapFuncHolder { static void get_funcs( RemapNNFunc& nnfunc, RemapFunc& ifunc) { switch (imode) { case IMode::INTER_NEAREST: MIDOUT_BEGIN(megdnn_warp, midout_iv(0)) { nnfunc = remapNearest; } MIDOUT_END(); break; case IMode::INTER_LINEAR: MIDOUT_BEGIN(megdnn_warp, midout_iv(1)) { ifunc = remapBilinear< FixedPtCast, RemapVec, short, uchar, bmode, CH>; } MIDOUT_END(); break; case IMode::INTER_CUBIC: MIDOUT_BEGIN(megdnn_warp, midout_iv(2)) { ifunc = remapBicubic< FixedPtCast, short, INTER_REMAP_COEF_SCALE, uchar, bmode, CH>; } MIDOUT_END(); break; case IMode::INTER_LANCZOS4: MIDOUT_BEGIN(megdnn_warp, midout_iv(3)) { ifunc = remapLanczos4< FixedPtCast, short, INTER_REMAP_COEF_SCALE, uchar, bmode, CH>; } MIDOUT_END(); break; default: megdnn_throw(("unrecognized interpolation mode")); } } }; template struct RemapFuncHolder { static void get_funcs( RemapNNFunc& nnfunc, RemapFunc& ifunc) { switch (imode) { case IMode::INTER_NEAREST: MIDOUT_BEGIN(megdnn_warp, midout_iv(0)) { nnfunc = remapNearest; } MIDOUT_END(); break; case IMode::INTER_LINEAR: MIDOUT_BEGIN(megdnn_warp, midout_iv(1)) { ifunc = remapBilinear< Cast, RemapVec, float, float, bmode, CH>; } MIDOUT_END(); break; case IMode::INTER_CUBIC: MIDOUT_BEGIN(megdnn_warp, midout_iv(2)) { ifunc = remapBicubic< Cast, float, 1, float, bmode, CH>; } MIDOUT_END(); break; case IMode::INTER_LANCZOS4: MIDOUT_BEGIN(megdnn_warp, midout_iv(3)) { ifunc = remapLanczos4< Cast, float, 1, float, bmode, CH>; } MIDOUT_END(); break; default: megdnn_throw(("unrecognized interpolation mode")); } } }; template < typename T, InterpolationMode imode, BorderMode bmode, size_t CH, typename RemapVec> #if MEGDNN_X86 MEGDNN_ATTRIBUTE_TARGET("sse3") #endif void remap( const Mat& src, Mat& dst, Mat& map1, Mat& map2, const T* bvalue) { RemapNNFunc nnfunc = 0; RemapFunc ifunc = 0; bool fixpt = std::is_same::value; const void* ctab = 0; RemapFuncHolder::get_funcs(nnfunc, ifunc); if (imode != IMode::INTER_NEAREST) { ctab = InterpTable::get_table(imode, fixpt); } { // remap invoker int x, y, x1, y1; const int buf_size = 1 << 14; int dstcols = dst.cols(), dstrows = dst.rows(); int brows0 = std::min(128, dstrows); int bcols0 = std::min(buf_size / brows0, dstcols); brows0 = std::min(buf_size / bcols0, dstrows); Mat _bufxy(brows0, bcols0, 2); Mat _bufa(brows0, bcols0, 1); for (y = 0; y < dstrows; y += brows0) for (x = 0; x < dstcols; x += bcols0) { int brows = std::min(brows0, dstrows - y); int bcols = std::min(bcols0, dstcols - x); Mat dpart(dst, y, brows, x, bcols); Mat bufxy(_bufxy, 0, brows, 0, bcols); if (nnfunc) { bufxy = Mat(map1, y, brows, x, bcols); nnfunc(src, dpart, bufxy, bvalue); continue; } Mat bufa(_bufa, 0, brows, 0, bcols); for (y1 = 0; y1 < brows; ++y1) { ushort* A = bufa.ptr(y1); bufxy = Mat(map1, y, brows, x, bcols); const ushort* sA = map2.ptr(y + y1) + x; x1 = 0; #if MEGDNN_X86 __m128i sA_data, d_data; __m128i v_INTER_TAB_SIZE2 = _mm_set1_epi16(INTER_TAB_SIZE2 - 1); for (; x1 <= bcols - 8; x1 += 8) { __m128i const* src = (__m128i const*)(sA + x1); __m128i* dst = (__m128i*)(A + x1); sA_data = _mm_loadu_si128(src); d_data = _mm_and_si128(sA_data, v_INTER_TAB_SIZE2); _mm_storeu_si128(dst, d_data); } #elif MEGDNN_AARCH64 || MEGDNN_ARMV7 uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2 - 1); for (; x1 <= bcols - 8; x1 += 8) vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale)); #endif for (; x1 < bcols; ++x1) A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2 - 1)); } ifunc(src, dpart, bufxy, bufa, ctab, bvalue); } } } #define DISPATCH_CHANNEL(_imode, _bmode, _ch, _cb) \ switch (_ch) { \ case 1: { \ _cb(_imode, _bmode, 1); \ break; \ } \ case 2: { \ _cb(_imode, _bmode, 2); \ break; \ } \ case 3: { \ _cb(_imode, _bmode, 3); \ break; \ } \ default: { \ megdnn_assert(0, "unsupport channels: %zu, only supprt 1/2/3", _ch); \ } \ } #define DISPATCH_BMODE(_imode, _bmode, _ch, _cb) \ switch (_bmode) { \ case BorderMode::REPLICATE: { \ DISPATCH_CHANNEL(_imode, BorderMode::REPLICATE, _ch, _cb); \ break; \ } \ case BorderMode::REFLECT: { \ DISPATCH_CHANNEL(_imode, BorderMode::REFLECT, _ch, _cb); \ break; \ } \ case BorderMode::REFLECT_101: { \ DISPATCH_CHANNEL(_imode, BorderMode::REFLECT_101, _ch, _cb); \ break; \ } \ case BorderMode::WRAP: { \ DISPATCH_CHANNEL(_imode, BorderMode::WRAP, _ch, _cb); \ break; \ } \ case BorderMode::CONSTANT: { \ DISPATCH_CHANNEL(_imode, BorderMode::CONSTANT, _ch, _cb); \ break; \ } \ default: { \ megdnn_assert(0, "unsupport border mode for cv"); \ } \ } #define DISPATCH_IMODE(_imode, _bmode, _ch, _cb) \ switch (_imode) { \ case InterpolationMode::NEAREST: { \ MIDOUT_BEGIN( \ megdnn_warp, midout_iv(0), midout_iv("DISPATCH_IMODE"_hash)) { \ DISPATCH_BMODE(InterpolationMode::NEAREST, _bmode, _ch, _cb); \ } \ MIDOUT_END(); \ break; \ } \ case InterpolationMode::LINEAR: { \ MIDOUT_BEGIN( \ megdnn_warp, midout_iv(1), midout_iv("DISPATCH_IMODE"_hash)) { \ DISPATCH_BMODE(InterpolationMode::LINEAR, _bmode, _ch, _cb); \ } \ MIDOUT_END(); \ break; \ } \ case InterpolationMode::AREA: { \ MIDOUT_BEGIN( \ megdnn_warp, midout_iv(2), midout_iv("DISPATCH_IMODE"_hash)) { \ DISPATCH_BMODE(InterpolationMode::AREA, _bmode, _ch, _cb); \ } \ MIDOUT_END(); \ break; \ } \ case InterpolationMode::CUBIC: { \ MIDOUT_BEGIN( \ megdnn_warp, midout_iv(3), midout_iv("DISPATCH_IMODE"_hash)) { \ DISPATCH_BMODE(InterpolationMode::CUBIC, _bmode, _ch, _cb); \ } \ MIDOUT_END(); \ break; \ } \ case InterpolationMode::LANCZOS4: { \ MIDOUT_BEGIN( \ megdnn_warp, midout_iv(4), midout_iv("DISPATCH_IMODE"_hash)) { \ DISPATCH_BMODE(InterpolationMode::LANCZOS4, _bmode, _ch, _cb); \ } \ MIDOUT_END(); \ break; \ } \ default: { \ megdnn_assert(0, "unsupport interpolation mode for cv"); \ } \ } } // namespace warp } // namespace megdnn // vim: syntax=cpp.doxygen