forward.cu

/**
 * \file dnn/src/cuda/resize/forward.cu
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "src/common/rounding_converter.cuh"
#include "src/common/utils.cuh"
#include "src/cuda/resize/common.cuh"
#include "src/cuda/resize/common.h"
#include "src/cuda/resize/resize_cv.cuh"

#include "src/cuda/cv/kernel_common.cuh"
#include "src/common/resize.cuh"

using namespace megdnn;
using namespace cuda;
using namespace megdnn::cuda::resize;
using megdnn::resize::interpolate_cubic;
using megdnn::megcv::saturate;

namespace {

template <typename ctype>
struct DirectSrcVisitor {
    const ctype* ptr;

    __device__ __forceinline__ const ctype* get(int batch, int im_size) {
        return ptr + batch * im_size;
    }

    void move_batch(size_t batch, size_t im_size) { ptr += batch * im_size; }
};

template <typename ctype, typename SrcVisitor, typename OutputConverter>
__global__ void kern_general_linear(SrcVisitor src, ctype* __restrict dst,
                                    int C, int IH, int IW, int OH, int OW,
                                    int S_IN, int S_IC, int S_IH, int S_IW,
                                    float scale_h, float scale_w) {
    OutputConverter output_converter;
    int ow = blockIdx.x * blockDim.x + threadIdx.x;
    int oh = blockIdx.y * blockDim.y + threadIdx.y;
    const ctype* __restrict sptr = src.get(blockIdx.z, S_IN);
    dst += blockIdx.z * C * OH * OW;

    if (ow < OW && oh < OH) {
        float alphah, alphaw;
        int ih0, iw0;
        get_origin_coord(scale_h, IH, oh, alphah, ih0);
        get_origin_coord(scale_w, IW, ow, alphaw, iw0);

        int ih1 = ih0 + 1;
        int iw1 = iw0 + 1;

        for (int c = 0; c < C; ++c) {
            dst[oh * OW + ow] = output_converter(
                    sptr[ih0 * S_IH + iw0 * S_IW] * (1.0f - alphaw) *
                            (1.0f - alphah) +
                    sptr[ih0 * S_IH + iw1 * S_IW] * alphaw * (1.0f - alphah) +
                    sptr[ih1 * S_IH + iw0 * S_IW] * (1.0f - alphaw) * alphah +
                    sptr[ih1 * S_IH + iw1 * S_IW] * alphaw * alphah);

            sptr += S_IC;
            dst += OH * OW;
        }
    }
}

template <typename ctype, typename SrcVisitor, typename OutputConverter>
__global__ void kern_general_nearest(SrcVisitor src, ctype* __restrict dst,
                                     int C, int IH, int IW, int OH, int OW,
                                     int S_IN, int S_IC, int S_IH, int S_IW,
                                     float scale_h, float scale_w) {
    OutputConverter output_converter;
    int ow = blockIdx.x * blockDim.x + threadIdx.x;
    int oh = blockIdx.y * blockDim.y + threadIdx.y;
    const ctype* __restrict sptr = src.get(blockIdx.z, S_IN);
    dst += blockIdx.z * C * OH * OW;

    if (ow < OW && oh < OH) {
        int ih = get_nearest_src(scale_h, IH, oh);
        int iw = get_nearest_src(scale_w, IW, ow);

        for (int c = 0; c < C; ++c) {
            dst[oh * OW + ow] = output_converter(sptr[ih * S_IH + iw * S_IW]);

            sptr += S_IC;
            dst += OH * OW;
        }
    }
}

template <typename ctype, typename SrcVisitor, typename OutputConverter>
__global__ void kern_general_cubic(SrcVisitor src, ctype* __restrict dst, int C,
                                   int IH, int IW, int OH, int OW, int S_IN,
                                   int S_IC, int S_IH, int S_IW, float scale_h,
                                   float scale_w) {
    OutputConverter output_converter;
    int ow = blockIdx.x * blockDim.x + threadIdx.x;
    int oh = blockIdx.y * blockDim.y + threadIdx.y;
    const ctype* __restrict sptr = src.get(blockIdx.z, S_IN);
    dst += blockIdx.z * C * OH * OW;

    if (ow < OW && oh < OH) {
        float alphah, alphaw;
        int ih0, iw0;
        get_origin_coord(scale_h, IH, oh, alphah, ih0, true);
        get_origin_coord(scale_w, IW, ow, alphaw, iw0, true);
        ih0--;
        iw0--;
        float h_coeff[4], w_coeff[4];
        interpolate_cubic(alphah, h_coeff);
        interpolate_cubic(alphaw, w_coeff);
        for (int c = 0; c < C; ++c) {
            float ret = 0;
            constexpr int ksize = 4;
            for (int kh = 0; kh < ksize; kh++) {
                int ih = saturate(ih0 + kh, 0, IH - 1);
                for (int kw = 0; kw < ksize; kw++) {
                    int iw = saturate(iw0 + kw, 0, IW - 1);
                    ret += sptr[ih * S_IH + iw * S_IW] * h_coeff[kh] *
                           w_coeff[kw];
                }
            }
            dst[oh * OW + ow] = output_converter(ret);

            sptr += S_IC;
            dst += OH * OW;
        }
    }
}
template <typename ctype, typename SrcVisitor, typename OutputConverter>
__global__ void kern_general_nhwc(SrcVisitor src, ctype* __restrict dst, int C,
                                  int IH, int IW, int OH, int OW, float scale_h,
                                  float scale_w) {
    OutputConverter output_converter;
    int ow = blockIdx.x * blockDim.x + threadIdx.x;
    int oh = blockIdx.y * blockDim.y + threadIdx.y;
    const ctype* __restrict sptr = src.get(blockIdx.z, C * IH * IW);
    dst += blockIdx.z * C * OH * OW;
    if (ow < OW && oh < OH) {
        float alphah, alphaw;
        int ih0, iw0;
        get_origin_coord(scale_h, IH, oh, alphah, ih0);
        get_origin_coord(scale_w, IW, ow, alphaw, iw0);

        int ih1 = ih0 + 1;
        int iw1 = iw0 + 1;

        for (int c = 0; c < C; ++c) {
            dst[(oh * OW + ow) * C + c] = output_converter(
                    sptr[(ih0 * IW + iw0) * C + c] * (1.0f - alphaw) *
                            (1.0f - alphah) +
                    sptr[(ih0 * IW + iw1) * C + c] * alphaw * (1.0f - alphah) +
                    sptr[(ih1 * IW + iw0) * C + c] * (1.0f - alphaw) * alphah +
                    sptr[(ih1 * IW + iw1) * C + c] * alphaw * alphah);
        }
    }
}

template <typename ctype, typename SrcVisitor>
void dispatch_with_visitor(bool is_nhwc, InterpolationMode imode,
                           SrcVisitor src, ctype* dst, int N, int C, int IH,
                           int IW, int OH, int OW, int S_IN, int S_IC, int S_IH,
                           int S_IW, cudaStream_t stream) {
    const int BY = 16, BX = 32;

    const int max_batch_size = 65535;
    while (N) {
        size_t curr_batch_size = N < max_batch_size ? N : max_batch_size;
        dim3 threads(BX, BY);
        dim3 blocks((OW + BX - 1) / BX, (OH + BY - 1) / BY, curr_batch_size);

        float scale_h = static_cast<float>(OH) / IH;
        float scale_w = static_cast<float>(OW) / IW;
        if (is_nhwc) {
            kern_general_nhwc<ctype, SrcVisitor,
                              rounding::RoundingConverter<ctype>>
                    <<<blocks, threads, 0, stream>>>(src, dst, C, IH, IW, OH,
                                                     OW, scale_h, scale_w);
        } else {
            switch (imode) {
                case InterpolationMode::INTER_LINEAR:
                    kern_general_linear<ctype, SrcVisitor,
                                        rounding::RoundingConverter<ctype>>
                            <<<blocks, threads, 0, stream>>>(
                                    src, dst, C, IH, IW, OH, OW, S_IN, S_IC,
                                    S_IH, S_IW, scale_h, scale_w);
                    break;
                case InterpolationMode::INTER_NEAREST:
                    kern_general_nearest<ctype, SrcVisitor,
                                         rounding::RoundingConverter<ctype>>
                            <<<blocks, threads, 0, stream>>>(
                                    src, dst, C, IH, IW, OH, OW, S_IN, S_IC,
                                    S_IH, S_IW, scale_h, scale_w);
                    break;
                case InterpolationMode::INTER_CUBIC:
                    kern_general_cubic<ctype, SrcVisitor,
                                       rounding::RoundingConverter<ctype>>
                            <<<blocks, threads, 0, stream>>>(
                                    src, dst, C, IH, IW, OH, OW, S_IN, S_IC,
                                    S_IH, S_IW, scale_h, scale_w);
                    break;
                default:
                    megdnn_throw("unsupported interpolation mode");
                    break;
            }
        }
        N -= curr_batch_size;
        src.move_batch(curr_batch_size, C * IH * IW);
        dst += curr_batch_size * C * OH * OW;
    }
}

template <typename ctype, typename SrcVisitor, typename OutputConverter>
__global__ void kern_general_nchw4(SrcVisitor src, ctype* __restrict dst, int C,
                                   int IH, int IW, int OH, int OW,
                                   float scale_h, float scale_w) {
    OutputConverter output_converter;
    int ow = blockIdx.x * blockDim.x + threadIdx.x;
    int oh = blockIdx.y * blockDim.y + threadIdx.y;
    const ctype* __restrict sptr = src.get(blockIdx.z, C * IH * IW);
    dst += blockIdx.z * C * OH * OW;

    if (ow < OW && oh < OH) {
        float alphah, alphaw;
        int ih0, iw0;
        get_origin_coord(scale_h, IH, oh, alphah, ih0);
        get_origin_coord(scale_w, IW, ow, alphaw, iw0);

        int ih1 = ih0 + 1;
        int iw1 = iw0 + 1;

        int o_coor = (oh * OW + ow) << 2;
        int i_coor00 = (ih0 * IW + iw0) << 2;
        int i_coor01 = (ih0 * IW + iw1) << 2;
        int i_coor10 = (ih1 * IW + iw0) << 2;
        int i_coor11 = (ih1 * IW + iw1) << 2;
        for (int c0 = 0, nr_chan = C >> 2; c0 < nr_chan; ++c0) {
#pragma unroll
            for (int c1 = 0; c1 < 4; ++c1) {
                dst[o_coor + c1] = output_converter(
                        sptr[i_coor00 + c1] * (1.0f - alphaw) *
                                (1.0f - alphah) +
                        sptr[i_coor01 + c1] * alphaw * (1.0f - alphah) +
                        sptr[i_coor10 + c1] * (1.0f - alphaw) * alphah +
                        sptr[i_coor11 + c1] * alphaw * alphah);
            }
            dst += OH * OW * 4;
            sptr += IH * IW * 4;
        }
    }
}

template <typename ctype, typename SrcVisitor>
void dispatch_with_visitor_nchw4(SrcVisitor src, ctype* dst, int N, int C,
                                 int IH, int IW, int OH, int OW,
                                 cudaStream_t stream) {
    const int BY = 16, BX = 32;

    const int max_batch_size = 65535;
    while (N) {
        size_t curr_batch_size = N < max_batch_size ? N : max_batch_size;
        dim3 threads(BX, BY);
        dim3 blocks((OW + BX - 1) / BX, (OH + BY - 1) / BY, curr_batch_size);

        float scale_h = static_cast<float>(OH) / IH;
        float scale_w = static_cast<float>(OW) / IW;
        kern_general_nchw4<ctype, SrcVisitor,
                           rounding::RoundingConverter<ctype>>
                <<<blocks, threads, 0, stream>>>(src, dst, C, IH, IW, OH, OW,
                                                 scale_h, scale_w);
        N -= curr_batch_size;
        src.move_batch(curr_batch_size, C * IH * IW);
        dst += curr_batch_size * C * OH * OW;
    }
}

}  // anonymous namespace

namespace megdnn {
namespace cuda {
namespace resize {

template <typename ctype>
void forward_proxy(bool is_nhwc, InterpolationMode imode, const ctype* src,
                   ctype* dst, int N, int C, int IH, int IW, int OH, int OW,
                   int S_IN, int S_IC, int S_IH, int S_IW,
                   cudaStream_t stream) {
    DirectSrcVisitor<ctype> visitor;
    visitor.ptr = src;
    dispatch_with_visitor(is_nhwc, imode, visitor, dst, N, C, IH, IW, OH, OW,
                          S_IN, S_IC, S_IH, S_IW, stream);
    after_kernel_launch();
}

template <typename ctype>
void forward_proxy_nchw4(const ctype* src, ctype* dst, int N, int C, int IH,
                         int IW, int OH, int OW, cudaStream_t stream) {
    DirectSrcVisitor<ctype> visitor;
    visitor.ptr = src;
    dispatch_with_visitor_nchw4(visitor, dst, N, C, IH, IW, OH, OW, stream);
    after_kernel_launch();
}

#define INST(ctype)                                                            \
    template void forward_proxy(bool, InterpolationMode, const ctype*, ctype*, \
                                int, int, int, int, int, int, int, int, int,   \
                                int, cudaStream_t);
INST(float)
INST(uint8_t)
INST(int8_t)
#undef INST

#define INST(ctype)                                                        \
    template void forward_proxy_nchw4(const ctype*, ctype*, int, int, int, \
                                      int, int, int, cudaStream_t)

INST(int8_t);
#undef INST
}  // namespace resize
}  // namespace cuda
}  // namespace megdnn

// vim: syntax=cpp.doxygen