提交 ba66e1d0 编写于 作者: M Megvii Engine Team

feat(dnn): add nchw_fp32 nchw44_qint8 cuda dct

GitOrigin-RevId: 581e31fc209008418f9821f32f7c71db76f84ddf
上级 b3278229
...@@ -182,6 +182,48 @@ class WarpPerspectiveBackwardMat: public WarpPerspectiveBase { ...@@ -182,6 +182,48 @@ class WarpPerspectiveBackwardMat: public WarpPerspectiveBase {
size_t workspace_in_bytes); size_t workspace_in_bytes);
}; };
class DctChannelSelectForward : public OperatorBase {
DEF_OPR_PARAM(DctChannelSelect);
DEF_OPR_IMPL(DctChannelSelectForward, OperatorBase, 3, 1);
public:
/**
* \param[in] DctChannelSelectForward input, must be uint8 nchw tensor
* \param[in] mask_offset input, must be int32 nchw tensor
* \param[in] mask_val input, must be int32 nchw tensor
* \param[dst] DctChannelSelectForward output, default fp32 nchw tensor
* \param[out] workspace temporary workspace to perform forward
*/
virtual void exec(_megdnn_tensor_in src,
_megdnn_tensor_in mask_offset,
_megdnn_tensor_in mask_val,
_megdnn_tensor_out dst,
_megdnn_workspace workspace) = 0;
void deduce_layout(const TensorLayout& src,
const TensorLayout& mask_offset,
const TensorLayout& mask_val,
TensorLayout& dst);
virtual size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& mask_offset,
const TensorLayout& mask_val,
const TensorLayout& dst) = 0;
protected:
void check_layout_fwd(const TensorLayout& src,
const TensorLayout& mask_offset,
const TensorLayout& mask_val,
const TensorLayout& dst);
void deduce_layout_fwd(const TensorLayout& src,
const TensorLayout& mask_offset,
const TensorLayout& mask_val,
TensorLayout& dst);
std::string param_msg() const;
};
} // namespace megdnn } // namespace megdnn
#include "megdnn/internal/opr_header_epilogue.h" #include "megdnn/internal/opr_header_epilogue.h"
......
...@@ -411,6 +411,9 @@ pdef('ElemwiseMultiType').add_enum( ...@@ -411,6 +411,9 @@ pdef('ElemwiseMultiType').add_enum(
pdef('PowC', 'power with constant exponent').add_fields('float32', 'exp', 0) pdef('PowC', 'power with constant exponent').add_fields('float32', 'exp', 0)
(pdef('DctChannelSelect', '2d discrete cosine transform').add_enum_alias('Format', 'ConvolutionV0').
add_enum('FastImpl', 'NONE', 'FIX_32_MASK').add_fields('int32', 'dct_block_size', 8))
(pdef('MatrixMul', version=0, is_legacy=True). (pdef('MatrixMul', version=0, is_legacy=True).
add_fields('bool', 'transposeA', 'false', 'transposeB', 'false'). add_fields('bool', 'transposeA', 'false', 'transposeB', 'false').
add_enum('DataType', add_enum('DataType',
......
/**
* \file dnn/src/common/dct.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megdnn/oprs.h"
#include "src/common/utils.h"
namespace megdnn {
void DctChannelSelectForward::deduce_layout_fwd(const TensorLayout& src,
const TensorLayout& mask_offset,
const TensorLayout& mask_val,
TensorLayout& dst) {
const size_t dct_block = param().dct_block_size;
const size_t in = src.shape[0];
const size_t ic = src.shape[1];
const size_t ih = src.shape[2];
const size_t iw = src.shape[3];
check_layout_fwd(src, mask_offset, mask_val, dst);
const size_t oh = ih / dct_block;
const size_t ow = iw / dct_block;
//! mask will be empty or (ic + 1) elements
size_t oc = mask_offset.ndim > 0 && mask_offset[0] >= 2
? mask_val.shape[0]
: ic * dct_block * dct_block;
if (param().fastImpl == Param::FastImpl::FIX_32_MASK) {
megdnn_assert(oc == 32,
"Param::FastImpl::FIX_32_MASK oc must be 32, but %zu",
oc);
}
if (param().format == Param::Format::NCHW) {
dst = TensorLayout(TensorShape({in, oc, oh, ow}), dst.dtype);
} else {
megdnn_assert(param().format == Param::Format::NCHW4,
"dct format must be nchw or nchw4");
megdnn_assert(oc % 4 == 0, "oc mod 4 == 0 in nchw4");
dst = TensorLayout(TensorShape({in, oc / 4, oh, ow, 4}), dst.dtype);
}
}
void DctChannelSelectForward::deduce_layout(const TensorLayout& src,
const TensorLayout& mask_offset,
const TensorLayout& mask_val,
TensorLayout& dst) {
deduce_layout_fwd(src, mask_offset, mask_val, dst);
}
void DctChannelSelectForward::check_layout_fwd(const TensorLayout& src,
const TensorLayout& mask_offset,
const TensorLayout& mask_val,
const TensorLayout& dst) {
const size_t dct_block = param().dct_block_size;
const size_t ih = src.shape[2];
const size_t iw = src.shape[3];
megdnn_assert(mask_offset.ndim == 0 || (mask_offset.ndim == 1 &&
(mask_offset.shape[0] == 0 ||
mask_offset.shape[0] >= 2) &&
mask_val.ndim == 1),
"mask only support one valid dim");
megdnn_assert(mask_val.ndim <= 1, "only support one dim");
megdnn_assert(src.dtype.enumv() == DTypeEnum::Uint8,
"src.dtype == dtype::Uint8");
megdnn_assert(dst.dtype.enumv() == DTypeEnum::Float32 ||
dst.dtype.enumv() == DTypeEnum::QuantizedS8,
"dst.dtype == dtype::Float32 || dst.dtype.enumv() == "
"DTypeEnum::QuantizedS8");
megdnn_assert(ih % dct_block == 0, "ih mod dctblock == 0");
megdnn_assert(iw % dct_block == 0, "iw mod dctblock == 0");
}
} // namespace megdnn
// vim: syntax=cpp.doxygen
...@@ -201,6 +201,7 @@ private: ...@@ -201,6 +201,7 @@ private:
cb(RemapBackwardMat) \ cb(RemapBackwardMat) \
cb(AdaptivePoolingForward) \ cb(AdaptivePoolingForward) \
cb(AdaptivePoolingBackward) \ cb(AdaptivePoolingBackward) \
cb(DctChannelSelectForward)
/*! /*!
* \brief specialize HandleImpl::create_operator for a single opr type; * \brief specialize HandleImpl::create_operator for a single opr type;
......
/**
* \file dnn/src/cuda/dct/dct_channel_select.cu
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megcore_cdefs.h"
#include "src/cuda/dct/dct_channel_select.cuh"
#include "src/cuda/error_info.cuh"
namespace megdnn {
namespace cuda {
template <typename T>
struct CudaPostProcess;
template <>
struct CudaPostProcess<float> {
CudaPostProcess(float){};
static inline __device__ float func(float val) { return val; }
};
template <>
struct CudaPostProcess<int8_t> {
CudaDTypeParamImpl<dt_qint8> m_type_cvt;
CudaPostProcess(float scale) { m_type_cvt.inv_scale = 1.f / scale; };
inline __device__ int8_t func(float val) {
return m_type_cvt.quantize(val).as_int8();
}
};
template <uint32_t format>
struct ChannelBlockHelper;
template <>
struct ChannelBlockHelper<dct::DctLayoutFormat::NCHW4> {
static constexpr int channel_block = 4;
};
template <>
struct ChannelBlockHelper<dct::DctLayoutFormat::NCHW> {
static constexpr int channel_block = 1;
};
namespace dct {
namespace {
inline __device__ void load_row(float (&row_cache)[8], const uint8_t* src) {
int2 row = *((int2*)src);
row_cache[0] = (float)(((uchar4*)&(row.x))->x);
row_cache[1] = (float)(((uchar4*)&(row.x))->y);
row_cache[2] = (float)(((uchar4*)&(row.x))->z);
row_cache[3] = (float)(((uchar4*)&(row.x))->w);
row_cache[4] = (float)(((uchar4*)&(row.y))->x);
row_cache[5] = (float)(((uchar4*)&(row.y))->y);
row_cache[6] = (float)(((uchar4*)&(row.y))->z);
row_cache[7] = (float)(((uchar4*)&(row.y))->w);
}
inline __device__ void fast_dct_1d_internel(float& src0, float& src1,
float& src2, float& src3,
float& src4, float& src5,
float& src6, float& src7) {
constexpr float rsqrt_8 = 0.3535533905932737f; //!< rsqrt_8 = sqrt(1 / 8)
constexpr float a = 1.387039845322148f; //!< a = sqrt2 * cos(pi * 1 / 16)
constexpr float b = 1.306562964876377f; //!< b = sqrt2 * cos(pi * 2 / 16)
constexpr float c = 1.175875602419359f; //!< c = sqrt2 * cos(pi * 3 / 16)
constexpr float d = 0.785694958387102f; //!< d = sqrt2 * cos(pi * 5 / 16)
constexpr float e = 0.541196100146197f; //!< e = sqrt2 * cos(pi * 6 / 16)
constexpr float f = 0.275899379282943f; //!< f = sqrt2 * cos(pi * 7 / 16)
const float add_0_7 = src0 + src7;
const float add_1_6 = src1 + src6;
const float add_2_5 = src2 + src5;
const float add_3_4 = src3 + src4;
const float sub_0_7 = src0 - src7;
const float sub_6_1 = src6 - src1;
const float sub_2_5 = src2 - src5;
const float sub_4_3 = src4 - src3;
const float add_0_7_3_4 = add_0_7 + add_3_4;
const float add_1_6_2_5 = add_1_6 + add_2_5;
const float add_0_7_sub_3_4 = add_0_7 - add_3_4;
const float add_1_6_sub_2_5 = add_1_6 - add_2_5;
src0 = rsqrt_8 * (add_0_7_3_4 + add_1_6_2_5);
src2 = rsqrt_8 * (b * add_0_7_sub_3_4 + e * add_1_6_sub_2_5);
src4 = rsqrt_8 * (add_0_7_3_4 - add_1_6_2_5);
src6 = rsqrt_8 * (e * add_0_7_sub_3_4 - b * add_1_6_sub_2_5);
src1 = rsqrt_8 * (a * sub_0_7 - c * sub_6_1 + d * sub_2_5 - f * sub_4_3);
src3 = rsqrt_8 * (c * sub_0_7 + f * sub_6_1 - a * sub_2_5 + d * sub_4_3);
src5 = rsqrt_8 * (d * sub_0_7 + a * sub_6_1 + f * sub_2_5 - c * sub_4_3);
src7 = rsqrt_8 * (f * sub_0_7 + d * sub_6_1 + c * sub_2_5 + a * sub_4_3);
}
inline __device__ void fast_dct_1d(float (&src)[8]) {
fast_dct_1d_internel(src[0], src[1], src[2], src[3], src[4], src[5], src[6],
src[7]);
}
inline __device__ void fast_dct_1d_col(float (&src)[8][8], const int col) {
fast_dct_1d_internel(src[0][col], src[1][col], src[2][col], src[3][col],
src[4][col], src[5][col], src[6][col], src[7][col]);
}
enum class MaskType {
NO_MASK = 0,
USER_DEFINE_MASK = 1,
FIX_32_MASK = 2,
MASK_END
};
template <const int dct_block, const int block_oh, const int block_ow,
uint32_t format, MaskType mask_type, typename DstDtype, typename T2>
struct StoreMask;
template <const int dct_block, const int block_oh, const int block_ow,
typename T2>
struct StoreMask<dct_block, block_oh, block_ow, DctLayoutFormat::NCHW,
MaskType::USER_DEFINE_MASK, float, T2> {
static inline __device__ void func(
const float (&thread_cache)[dct_block][dct_block], float* dst_tid,
const int oc_stride, int channel_idx, const int* mask_offset,
const int* mask_val, CudaPostProcess<T2>& quant_param,
megcore::AsyncErrorInfo* error_info, void* error_tracker) {
__shared__ float shared[dct_block][dct_block][block_oh][block_ow];
#pragma unroll
for (int i = 0; i < dct_block; ++i)
#pragma unroll
for (int j = 0; j < dct_block; ++j) {
shared[i][j][threadIdx.y][threadIdx.x] = thread_cache[i][j];
}
const int store_channel_offset = mask_offset[channel_idx];
const int nr_store_channel =
mask_offset[channel_idx + 1] - store_channel_offset;
if (nr_store_channel < 0) {
set_async_error_info(error_info, error_tracker,
"nchw sub mask len must > 0");
}
for (int store_channel_idx = 0; store_channel_idx < nr_store_channel;
++store_channel_idx) {
const int index =
mask_val[store_channel_offset + store_channel_idx];
dst_tid[store_channel_idx * oc_stride] =
shared[index / dct_block][index % dct_block][threadIdx.y]
[threadIdx.x];
}
}
};
template <const int dct_block, const int block_oh, const int block_ow,
typename T2>
struct StoreMask<dct_block, block_oh, block_ow, DctLayoutFormat::NCHW4,
MaskType::USER_DEFINE_MASK, int8_t, T2> {
static inline __device__ void func(
const float (&thread_cache)[dct_block][dct_block], int8_t* dst_tid,
const int oc_stride, int channel_idx, const int* mask_offset,
const int* mask_val, CudaPostProcess<T2>& quant_param,
megcore::AsyncErrorInfo* error_info, void* error_tracker) {
//! nchw4 channel_block is 4
constexpr int channel_block =
ChannelBlockHelper<DctLayoutFormat::NCHW4>::channel_block;
__shared__ float shared[dct_block][dct_block][block_oh][block_ow];
#pragma unroll
for (int i = 0; i < dct_block; ++i)
#pragma unroll
for (int j = 0; j < dct_block; ++j) {
shared[i][j][threadIdx.y][threadIdx.x] = thread_cache[i][j];
}
const int store_channel_offset = mask_offset[channel_idx];
const int nr_store_channel =
mask_offset[channel_idx + 1] - store_channel_offset;
if (nr_store_channel % 4 != 0 || nr_store_channel < 0) {
set_async_error_info(error_info, error_tracker,
"nchw4 sub_mask_len mod 4 should be 0 and "
"sub_mask_len must > 0");
}
for (int store_channel_idx = 0; store_channel_idx < nr_store_channel;
store_channel_idx += channel_block) {
const int index0 =
mask_val[store_channel_offset + store_channel_idx];
const int index1 =
mask_val[store_channel_offset + store_channel_idx + 1];
const int index2 =
mask_val[store_channel_offset + store_channel_idx + 2];
const int index3 =
mask_val[store_channel_offset + store_channel_idx + 3];
const int store_c4_idx = store_channel_idx / channel_block;
*(char4*)(&dst_tid[store_c4_idx * channel_block * oc_stride]) = {
quant_param.func(
shared[index0 / dct_block][index0 % dct_block]
[threadIdx.y][threadIdx.x]),
quant_param.func(
shared[index1 / dct_block][index1 % dct_block]
[threadIdx.y][threadIdx.x]),
quant_param.func(
shared[index2 / dct_block][index2 % dct_block]
[threadIdx.y][threadIdx.x]),
quant_param.func(
shared[index3 / dct_block][index3 % dct_block]
[threadIdx.y][threadIdx.x])};
}
}
};
template <const int dct_block, const int block_oh, const int block_ow,
uint32_t format, typename DstDtype, typename T2>
struct StoreMask<dct_block, block_oh, block_ow, format, MaskType::NO_MASK,
DstDtype, T2> {
static inline __device__ void func(
const float (&thread_cache)[dct_block][dct_block],
DstDtype* dst_tid, const int oc_stride, int channel_idx,
const int* mask_offset, const int* mask_val,
CudaPostProcess<T2>& quant_param,
megcore::AsyncErrorInfo* error_info, void* error_tracker) {
constexpr int channel_block = ChannelBlockHelper<format>::channel_block;
#pragma unroll
for (int i = 0; i < dct_block; i++) {
#pragma unroll
for (int j = 0; j < dct_block; j++) {
dst_tid[(i * dct_block + j) / channel_block * channel_block *
oc_stride +
(i * dct_block + j) % channel_block] =
quant_param.func(thread_cache[i][j]);
}
}
}
};
template <const int dct_block, const int block_oh, const int block_ow,
typename T2>
struct StoreMask<dct_block, block_oh, block_ow, DctLayoutFormat::NCHW,
MaskType::FIX_32_MASK, float, T2> {
static inline __device__ void func(
const float (&thread_cache)[dct_block][dct_block], float* dst_tid,
const int oc_stride, int channel_idx, const int* mask_offset,
const int* mask_val, CudaPostProcess<T2>& quant_param,
megcore::AsyncErrorInfo* error_info, void* error_tracker) {
#define STORE(store_index, index) \
dst_tid[store_index * oc_stride] = \
thread_cache[index / dct_block][index % dct_block]
STORE(0, 0);
STORE(1, 1);
STORE(2, 8);
STORE(3, 16);
STORE(4, 9);
STORE(5, 2);
STORE(6, 3);
STORE(7, 10);
if (channel_idx == 0) {
STORE(8, 17);
STORE(9, 24);
STORE(10, 32);
STORE(11, 25);
STORE(12, 18);
STORE(13, 11);
STORE(14, 4);
STORE(15, 5);
}
#undef STORE
}
};
template <const int dct_block, const int block_oh, const int block_ow,
typename T2>
struct StoreMask<dct_block, block_oh, block_ow, DctLayoutFormat::NCHW4,
MaskType::FIX_32_MASK, int8_t, T2> {
static inline __device__ void func(
const float (&thread_cache)[dct_block][dct_block], int8_t* dst_tid,
const int oc_stride, int channel_idx, const int* mask_offset,
const int* mask_val, CudaPostProcess<T2>& quant_param,
megcore::AsyncErrorInfo* error_info, void* error_tracker) {
#define STORE(store_index, index0, index1, index2, index3) \
*(char4*)(&dst_tid[store_index * oc_stride]) = { \
quant_param.func( \
thread_cache[index0 / dct_block][index0 % dct_block]), \
quant_param.func( \
thread_cache[index1 / dct_block][index1 % dct_block]), \
quant_param.func( \
thread_cache[index2 / dct_block][index2 % dct_block]), \
quant_param.func( \
thread_cache[index3 / dct_block][index3 % dct_block])}
STORE(0, 0, 1, 8, 16);
STORE(4, 9, 2, 3, 10);
if (channel_idx == 0) {
STORE(8, 17, 24, 32, 25);
STORE(12, 18, 11, 4, 5);
}
#undef STORE
}
};
template <const int dct_block, MaskType mask_type, const int ker_block_h,
const int ker_block_w, uint32_t format, typename DstDtype,
typename T2>
__global__ void kern_dct(const uint8_t* src, DstDtype* dst, const int n,
const int c, const int h, const int w, const int oh,
const int ow, const int oc_stride, const int oc,
const int* mask_offset, const int* mask_val,
CudaPostProcess<T2> quant_param,
megcore::AsyncErrorInfo* error_info,
void* error_tracker) {
constexpr int block_oh = ker_block_h / dct_block;
constexpr int block_ow = ker_block_w / dct_block;
const int channel_stride = h * w;
const int oc_idx = blockIdx.z % c;
const int oh_idx = blockIdx.y * block_oh + threadIdx.y;
const int ow_idx = blockIdx.x * block_ow + threadIdx.x;
float thread_cache[dct_block][dct_block];
const uint8_t* src_tid =
src + blockIdx.z * channel_stride +
(blockIdx.y * ker_block_h + threadIdx.y * dct_block) * w +
(blockIdx.x * ker_block_w + threadIdx.x * dct_block);
const int inner_channel_offset =
(oh_idx * ow + ow_idx) * ChannelBlockHelper<format>::channel_block;
DstDtype* dst_tid =
dst + blockIdx.z * channel_stride + inner_channel_offset;
if (mask_type != MaskType::NO_MASK) {
const int batch_idx = blockIdx.z / c;
const int batch_stride = oc_stride * oc;
int out_channel_offset = 0;
if (mask_type == MaskType::FIX_32_MASK) {
//! trick out_channel_offset = {0, 16, 24}[oc_idx]; oc_idx = 0, 1, 2
out_channel_offset = 16 * oc_idx - 8 * (oc_idx >> 1);
} else {
out_channel_offset = mask_offset[oc_idx];
}
dst_tid = dst + batch_idx * batch_stride +
out_channel_offset * oc_stride + inner_channel_offset;
}
if (oh_idx < oh && ow_idx < ow) {
load_row(thread_cache[0], src_tid + 0 * w);
load_row(thread_cache[1], src_tid + 1 * w);
load_row(thread_cache[2], src_tid + 2 * w);
load_row(thread_cache[3], src_tid + 3 * w);
load_row(thread_cache[4], src_tid + 4 * w);
load_row(thread_cache[5], src_tid + 5 * w);
load_row(thread_cache[6], src_tid + 6 * w);
load_row(thread_cache[7], src_tid + 7 * w);
//! TMP = A @ C.T
fast_dct_1d(thread_cache[0]);
fast_dct_1d(thread_cache[1]);
fast_dct_1d(thread_cache[2]);
fast_dct_1d(thread_cache[3]);
fast_dct_1d(thread_cache[4]);
fast_dct_1d(thread_cache[5]);
fast_dct_1d(thread_cache[6]);
fast_dct_1d(thread_cache[7]);
//! TMP = C @ TMP
fast_dct_1d_col(thread_cache, 0);
fast_dct_1d_col(thread_cache, 1);
fast_dct_1d_col(thread_cache, 2);
fast_dct_1d_col(thread_cache, 3);
fast_dct_1d_col(thread_cache, 4);
fast_dct_1d_col(thread_cache, 5);
fast_dct_1d_col(thread_cache, 6);
fast_dct_1d_col(thread_cache, 7);
StoreMask<dct_block, block_oh, block_ow, format, mask_type, DstDtype,
T2>::func(thread_cache, dst_tid, oc_stride, oc_idx,
mask_offset, mask_val, quant_param, error_info,
error_tracker);
}
}
} // namespace
template <int dct_block, uint32_t format, typename DstDtype>
void call_kern_dct(const uint8_t* d_src, DstDtype* d_dst, const int n,
const int c, const int h, const int w, const int oc,
bool fix_32_mask, const int* mask_offset,
const int* mask_val, cudaStream_t stream,
megcore::AsyncErrorInfo* error_info, void* error_tracker,
float scale) {
constexpr int ker_block_h = 32;
constexpr int ker_block_w = 256;
const int oh = h / dct_block;
const int ow = w / dct_block;
const int oc_stride = oh * ow;
const dim3 block_dim(DIVUP(w, ker_block_w), DIVUP(h, ker_block_h), n * c);
const dim3 thread_dim(DIVUP(ker_block_w, dct_block),
DIVUP(ker_block_h, dct_block));
auto cuda_dtype_param = CudaPostProcess<DstDtype>(scale);
if (fix_32_mask) {
kern_dct<dct_block, MaskType::FIX_32_MASK, ker_block_h, ker_block_w,
format><<<block_dim, thread_dim, 0, stream>>>(
d_src, d_dst, n, c, h, w, oh, ow, oc_stride, oc, mask_offset,
mask_val, cuda_dtype_param, error_info, error_tracker);
} else if (mask_offset && mask_val) {
kern_dct<dct_block, MaskType::USER_DEFINE_MASK, ker_block_h,
ker_block_w, format><<<block_dim, thread_dim, 0, stream>>>(
d_src, d_dst, n, c, h, w, oh, ow, oc_stride, oc, mask_offset,
mask_val, cuda_dtype_param, error_info, error_tracker);
} else {
kern_dct<dct_block, MaskType::NO_MASK, ker_block_h, ker_block_w, format>
<<<block_dim, thread_dim, 0, stream>>>(
d_src, d_dst, n, c, h, w, oh, ow, oc_stride, oc,
mask_offset, mask_val, cuda_dtype_param, error_info,
error_tracker);
}
}
template void call_kern_dct<8, DctLayoutFormat::NCHW, float>(
const uint8_t* d_src, float* d_dst, const int n, const int c,
const int h, const int w, const int oc, bool fix_32_mask,
const int* mask_offset, const int* mask_val, cudaStream_t stream,
megcore::AsyncErrorInfo* error_info, void* error_tracker, float scale);
template void call_kern_dct<8, DctLayoutFormat::NCHW4, int8_t>(
const uint8_t* d_src, int8_t* d_dst, const int n, const int c,
const int h, const int w, const int oc, bool fix_32_mask,
const int* mask_offset, const int* mask_val, cudaStream_t stream,
megcore::AsyncErrorInfo* error_info, void* error_tracker, float scale);
} // namespace dct
} // namespace cuda
} // namespace megdnn
// vim: syntax=cpp.doxygen
\ No newline at end of file
/**
* \file dnn/src/cuda/dct/dct_channel_select.cuh
* MegEngine is Licensed under the Apache License, Version 2.0 (the
"License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express
or
* implied.
*/
#pragma once
#include <stdint.h>
#include <cstdio>
#include "src/common/opr_param_defs_enumv.cuh"
#include "src/cuda/utils.cuh"
namespace megdnn {
namespace cuda {
namespace dct {
using DctLayoutFormat = megdnn::param_enumv::DctChannelSelect::Format;
template <int dct_block, uint32_t format, typename DstDtype>
void call_kern_dct(const uint8_t* d_src, DstDtype* d_dst, const int n,
const int c, const int h, const int w, const int oc,
bool fix_32_mask, const int* mask_offset,
const int* mask_val, cudaStream_t stream,
megcore::AsyncErrorInfo* error_info, void* error_tracker,
float scale = 1.f);
} // namespace dct
} // namespace cuda
} // namespace megdnn
// vim: syntax=cpp.doxygen
\ No newline at end of file
/**
* \file dnn/src/naive/dct/opr_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/common/utils.h"
#include "src/cuda/dct/dct_channel_select.cuh"
#include "src/cuda/dct/opr_impl.h"
#include "src/cuda/handle.h"
#include "src/cuda/utils.h"
namespace megdnn {
namespace cuda {
void DctChannelSelectForwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_in mask_offset,
_megdnn_tensor_in mask_val,
_megdnn_tensor_out dst,
_megdnn_workspace /*workspace*/) {
auto stream = cuda_stream(this->handle());
const int in = src.layout.shape[0];
const int ic = src.layout.shape[1];
const int ih = src.layout.shape[2];
const int iw = src.layout.shape[3];
int oc = dst.layout.shape[1];
const bool with_fix_32_mask =
param().fastImpl == Param::FastImpl::FIX_32_MASK;
if (param().format == Param::Format::NCHW4) {
megdnn_assert(dst.layout.ndim == 5 && dst.layout.shape[4] == 4,
"dst must be nchw4");
oc = oc * 4;
}
megdnn_assert(!with_fix_32_mask || (with_fix_32_mask && oc == 32),
"only support specify mask");
megdnn_assert(param().dct_block_size == 8, "only support dct block = 8");
auto error_info =
concrete_handle(this->handle())->megcore_context().error_info;
constexpr int dct_block = 8;
const int* mask_offset_ptr = nullptr;
const int* mask_val_ptr = nullptr;
if (mask_offset.layout.ndim == 1 && mask_offset.layout.shape[0] >= 2) {
mask_offset_ptr = mask_offset.ptr<int32_t>();
mask_val_ptr = mask_val.ptr<int32_t>();
}
if (dst.layout.dtype.enumv() == DTypeEnum::Float32) {
megdnn_assert(param().format == Param::Format::NCHW,
"fp32 only support nchw");
dct::call_kern_dct<dct_block, dct::DctLayoutFormat::NCHW>(
src.ptr<uint8_t>(), dst.ptr<float>(), in, ic, ih, iw, oc,
with_fix_32_mask, mask_offset_ptr, mask_val_ptr, stream,
error_info, m_error_tracker);
} else {
megdnn_assert(dst.layout.dtype.enumv() == DTypeEnum::QuantizedS8,
"only support fp32 and qs8");
megdnn_assert(param().format == Param::Format::NCHW4,
"qint8 only support nchw4");
dct::call_kern_dct<dct_block, dct::DctLayoutFormat::NCHW4>(
src.ptr<uint8_t>(), (int8_t*)dst.raw_ptr, in, ic, ih, iw, oc,
with_fix_32_mask, mask_offset_ptr, mask_val_ptr, stream,
error_info, m_error_tracker,
dst.layout.dtype.param<::megdnn::dtype::QuantizedS8>().scale);
}
}
} // namespace cuda
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/cuda/dct/opr_impl.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megdnn/oprs.h"
namespace megdnn {
namespace cuda {
class DctChannelSelectForwardImpl : public DctChannelSelectForward {
public:
using DctChannelSelectForward::DctChannelSelectForward;
void* m_error_tracker = nullptr;
void exec(_megdnn_tensor_in src, _megdnn_tensor_in mask_offset,
_megdnn_tensor_in mask_val, _megdnn_tensor_out dst,
_megdnn_workspace workspace) override;
size_t get_workspace_in_bytes(const TensorLayout& /*src*/,
const TensorLayout& /*mask_offset*/,
const TensorLayout& /*mask_val*/,
const TensorLayout& /*dst*/) {
return 0;
};
void set_error_tracker(void* tracker) override {
m_error_tracker = tracker;
}
};
} // namespace cuda
} // namespace megdnn
// vim: syntax=cpp.doxygen
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include "src/cuda/convpooling/opr_impl.h" #include "src/cuda/convpooling/opr_impl.h"
#include "src/cuda/cumsum/opr_impl.h" #include "src/cuda/cumsum/opr_impl.h"
#include "src/cuda/cvt_color/opr_impl.h" #include "src/cuda/cvt_color/opr_impl.h"
#include "src/cuda/dct/opr_impl.h"
#include "src/cuda/deformable_conv/opr_impl.h" #include "src/cuda/deformable_conv/opr_impl.h"
#include "src/cuda/deformable_ps_roi_pooling/opr_impl.h" #include "src/cuda/deformable_ps_roi_pooling/opr_impl.h"
#include "src/cuda/dot/opr_impl.h" #include "src/cuda/dot/opr_impl.h"
......
/**
* \file dnn/src/naive/dct/opr_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include <cmath>
#include "megdnn/basic_types.h"
#include "megdnn/dtype.h"
#include "midout.h"
#include "src/naive/dct/opr_impl.h"
#include "src/naive/handle.h"
#include "src/naive/matrix_mul/matrix_mul_helper.h"
MIDOUT_DECL(megdnn_naive_dct_fwd)
namespace megdnn {
namespace naive {
namespace {
static inline void generate_c_matrix(float* result, int block) {
constexpr float pi = M_PI;
for (int i = 0; i < block; ++i) {
for (int j = 0; j < block; ++j) {
float alpha = i == 0 ? sqrt(1.f / static_cast<float>(block))
: sqrt(2.f / static_cast<float>(block));
result[i * block + j] = alpha * cos((2.f * j + 1.f) * i * pi /
static_cast<float>(2 * block));
}
}
}
template <typename T>
void matmul(int m, int n, int k, int lda, int ldb, int ldc, const float* a,
const T* b, float* c, bool trans_a, bool trans_b) {
for (int m_idx = 0; m_idx < m; ++m_idx) {
for (int n_idx = 0; n_idx < n; ++n_idx) {
float res = 0.f;
for (int k_idx = 0; k_idx < k; ++k_idx) {
float av = trans_a ? a[k_idx * lda + m_idx]
: a[m_idx * lda + k_idx];
float bv = trans_b ? b[n_idx * ldb + k_idx]
: b[k_idx * ldb + n_idx];
res += av * bv;
}
c[m_idx * ldc + n_idx] = res;
}
}
}
std::vector<std::vector<int>> mask_offset_to_2dmask(
_megdnn_tensor_in mask_offset, _megdnn_tensor_in mask_val) {
std::vector<std::vector<int>> mask;
if (mask_offset.layout.ndim > 0 && mask_offset.layout[0] >= 2) {
const int offset_len = mask_offset.layout.shape[0];
const int32_t* mask_offset_ptr = mask_offset.ptr<int32_t>();
const int32_t* mask_val_ptr = mask_val.ptr<int32_t>();
megdnn_assert(
mask_val.layout.shape[0] ==
static_cast<size_t>(mask_offset_ptr[offset_len - 1]),
"check mask offset %zu != %zu", mask_val.layout.shape[0],
static_cast<size_t>(mask_offset_ptr[offset_len - 1]));
for (int offset_idx = 1; offset_idx < offset_len; ++offset_idx) {
mask.push_back({});
const int mask_len = mask_offset_ptr[offset_idx] -
mask_offset_ptr[offset_idx - 1];
const int32_t* mask_ptr =
&mask_val_ptr[mask_offset_ptr[offset_idx - 1]];
for (int val_idx = 0; val_idx < mask_len; ++val_idx) {
mask[offset_idx - 1].push_back(mask_ptr[val_idx]);
}
}
}
return mask;
}
inline bool is_layout_nchw4(const TensorLayout& layout) {
if (layout.ndim == 5 && layout[4] == 4) {
return true;
} else {
return false;
}
}
template <typename T>
using QuantizedCType =
std::enable_if_t<DTypeTrait<T>::category == DTypeCategory::QUANTIZED,
typename DTypeTrait<T>::ctype>;
inline int8_t quant_float_2_int8(float val, DType dtype) {
return dtype.param<::megdnn::dtype::QuantizedS8>().quantize(val).as_int8();
}
template <param::DctChannelSelect::Format format, typename Dtype>
inline void dct_output(Dtype* dst_ptr, const int oc_idx, const int img_size,
float val, DType) {
dst_ptr[oc_idx * img_size] = val;
}
template <>
inline void dct_output<param::DctChannelSelect::Format::NCHW4>(
int8_t* dst_ptr, const int oc_idx, const int img_size, float val,
DType dtype) {
dst_ptr[oc_idx / 4 * 4 * img_size + oc_idx % 4] =
quant_float_2_int8(val, dtype);
}
template <param::DctChannelSelect::Format format>
struct ChannleBlock {
static constexpr int block = 1;
};
template <>
struct ChannleBlock<param::DctChannelSelect::Format::NCHW4> {
static constexpr int block = 4;
};
template <param::DctChannelSelect::Format format, typename Dtype>
void naive_dct(const uint8_t* src, Dtype* dst, int n, int c, int h, int w,
int block, const std::vector<std::vector<int>>& mask,
DType dtype) {
constexpr int block_channel = ChannleBlock<format>::block;
const int block_h = block;
const int block_w = block;
std::vector<float> c_matrix(block * block);
std::vector<float> tmp(block * block);
std::vector<float> tmp_result(block * block);
generate_c_matrix(&c_matrix[0], block);
megdnn_assert(h % block_h == 0, "h mod block_h == 0");
megdnn_assert(w % block_w == 0, "w mod block_w == 0");
const int oh = h / block_h;
const int ow = w / block_w;
const int o_img_size = oh * ow;
std::vector<int> mask_offset;
int mask_len_sum = 0;
if (mask.size() > 0) {
for (auto& sub_mask : mask) {
mask_offset.push_back(mask_len_sum);
mask_len_sum += sub_mask.size();
}
} else {
for (int c_idx = 0; c_idx < c; ++c_idx) {
mask_offset.push_back(mask_len_sum);
mask_len_sum += block_h * block_w;
}
}
const size_t o_batch_stride = mask_len_sum * oh * ow;
for (int n_idx = 0; n_idx < n; ++n_idx) {
for (int c_idx = 0; c_idx < c; ++c_idx) {
megdnn_assert(mask_offset[c_idx] % block_channel == 0,
"%d mod %d == 0", mask_offset[c_idx], block_channel);
const size_t src_offset = n_idx * c * h * w + c_idx * h * w;
const uint8_t* src_channel = src + src_offset;
const size_t dst_offset = n_idx * o_batch_stride +
mask_offset[c_idx] / block_channel * oh *
ow * block_channel;
Dtype* dst_channel = dst + dst_offset;
for (int oh_idx = 0; oh_idx < oh; ++oh_idx) {
for (int ow_idx = 0; ow_idx < ow; ++ow_idx) {
matmul(block, block, block, block, w, block, &c_matrix[0],
&src_channel[oh_idx * block_h * w +
ow_idx * block_w],
&tmp[0], false, false);
matmul(block, block, block, block, block, block, &tmp[0],
&c_matrix[0], &tmp_result[0], false, true);
Dtype* dst_start = dst_channel +
(oh_idx * ow + ow_idx) * block_channel;
if (mask.size() == 0) {
for (int inner_h_idx = 0; inner_h_idx < block_h;
++inner_h_idx) {
for (int inner_w_idx = 0; inner_w_idx < block_w;
++inner_w_idx) {
const int oc_idx =
inner_h_idx * block_w + inner_w_idx;
dct_output<format>(
dst_start, oc_idx, o_img_size,
tmp_result[inner_h_idx * block +
inner_w_idx],
dtype);
}
}
} else {
//! with mask
auto& sub_mask = mask[c_idx];
int dst_offset = 0;
for (auto mask_idx : sub_mask) {
dct_output<format>(dst_start, dst_offset,
o_img_size, tmp_result[mask_idx],
dtype);
++dst_offset;
}
}
}
}
}
}
}
} // namespace
void DctChannelSelectForwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_in mask_offset,
_megdnn_tensor_in mask_val,
_megdnn_tensor_out dst,
_megdnn_workspace /*workspace*/) {
MIDOUT_BEGIN(megdnn_naive_dct_fwd) {
int in = src.layout.shape[0];
int ic = src.layout.shape[1];
int ih = src.layout.shape[2];
int iw = src.layout.shape[3];
megdnn_assert(dst.raw_ptr, "dst can not be nullptr");
const int block = param().dct_block_size;
auto mask = mask_offset_to_2dmask(mask_offset, mask_val);
if (dst.layout.dtype.enumv() == DTypeEnum::Float32) {
megdnn_assert(!is_layout_nchw4(dst.layout) &&
param().format == Param::Format::NCHW,
"dst must be nchw");
MEGDNN_DISPATCH_CPU_KERN_OPR(naive_dct<Param::Format::NCHW>(
src.ptr<uint8_t>(), dst.ptr<float>(), in, ic, ih, iw, block,
mask, dst.layout.dtype));
} else {
megdnn_assert(dst.layout.dtype.enumv() == DTypeEnum::QuantizedS8,
"dst must be q8");
megdnn_assert(is_layout_nchw4(dst.layout) &&
param().format == Param::Format::NCHW4,
"dst must be nchw4");
MEGDNN_DISPATCH_CPU_KERN_OPR(naive_dct<Param::Format::NCHW4>(
src.ptr<uint8_t>(), static_cast<int8_t*>(dst.raw_ptr), in,
ic, ih, iw, block, mask, dst.layout.dtype));
}
}
MIDOUT_END();
}
} // namespace naive
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/naive/dct/opr_impl.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megdnn/oprs.h"
namespace megdnn {
namespace naive {
class DctChannelSelectForwardImpl : public DctChannelSelectForward {
public:
using DctChannelSelectForward::DctChannelSelectForward;
void exec(_megdnn_tensor_in src, _megdnn_tensor_in mask_offset,
_megdnn_tensor_in mask_val, _megdnn_tensor_out dst,
_megdnn_workspace workspace) override;
size_t get_workspace_in_bytes(const TensorLayout& /*src*/,
const TensorLayout&, const TensorLayout&,
const TensorLayout&) override {
return 0;
};
};
} // namespace naive
} // namespace megdnn
// vim: syntax=cpp.doxygen
...@@ -6,7 +6,8 @@ ...@@ -6,7 +6,8 @@
* *
* Unless required by applicable law or agreed to in writing, * Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an * software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/ */
#include "src/naive/handle.h" #include "src/naive/handle.h"
...@@ -29,6 +30,7 @@ ...@@ -29,6 +30,7 @@
#include "src/naive/convpooling/opr_impl.h" #include "src/naive/convpooling/opr_impl.h"
#include "src/naive/cumsum/opr_impl.h" #include "src/naive/cumsum/opr_impl.h"
#include "src/naive/cvt_color/opr_impl.h" #include "src/naive/cvt_color/opr_impl.h"
#include "src/naive/dct/opr_impl.h"
#include "src/naive/deformable_conv/opr_impl.h" #include "src/naive/deformable_conv/opr_impl.h"
#include "src/naive/deformable_ps_roi_pooling/opr_impl.h" #include "src/naive/deformable_ps_roi_pooling/opr_impl.h"
#include "src/naive/dot/opr_impl.h" #include "src/naive/dot/opr_impl.h"
...@@ -56,6 +58,7 @@ ...@@ -56,6 +58,7 @@
#include "src/naive/reduce/opr_impl.h" #include "src/naive/reduce/opr_impl.h"
#include "src/naive/relayout/opr_impl.h" #include "src/naive/relayout/opr_impl.h"
#include "src/naive/relayout_format/opr_impl.h" #include "src/naive/relayout_format/opr_impl.h"
#include "src/naive/remap/opr_impl.h"
#include "src/naive/repeat/opr_impl.h" #include "src/naive/repeat/opr_impl.h"
#include "src/naive/resize/opr_impl.h" #include "src/naive/resize/opr_impl.h"
#include "src/naive/rng/opr_impl.h" #include "src/naive/rng/opr_impl.h"
...@@ -76,7 +79,6 @@ ...@@ -76,7 +79,6 @@
#include "src/naive/warp_affine/opr_impl.h" #include "src/naive/warp_affine/opr_impl.h"
#include "src/naive/warp_perspective/opr_impl.h" #include "src/naive/warp_perspective/opr_impl.h"
#include "src/naive/winograd_filter_preprocess/opr_impl.h" #include "src/naive/winograd_filter_preprocess/opr_impl.h"
#include "src/naive/remap/opr_impl.h"
static size_t g_image2d_pitch_alignment = 1; static size_t g_image2d_pitch_alignment = 1;
......
...@@ -6,20 +6,21 @@ ...@@ -6,20 +6,21 @@
* *
* Unless required by applicable law or agreed to in writing, * Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an * software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/ */
#pragma once #pragma once
#include <map> #include <map>
#include <memory> #include <memory>
#include <vector>
#include <regex> #include <regex>
#include <vector>
#include "megdnn/basic_types.h" #include "megdnn/basic_types.h"
#include "megdnn/tensor_format.h" #include "megdnn/tensor_format.h"
#include "test/common/opr_algo_proxy.h"
#include "test/common/opr_proxy.h" #include "test/common/opr_proxy.h"
#include "test/common/rng.h" #include "test/common/rng.h"
#include "test/common/timer.h" #include "test/common/timer.h"
#include "test/common/opr_algo_proxy.h"
namespace megdnn { namespace megdnn {
namespace test { namespace test {
...@@ -31,6 +32,7 @@ public: ...@@ -31,6 +32,7 @@ public:
using TensorValueArray = TensorNDArray; using TensorValueArray = TensorNDArray;
using BeforeExecCallback = using BeforeExecCallback =
std::function<void(Opr*, const TensorValueArray&)>; std::function<void(Opr*, const TensorValueArray&)>;
using TensorsConstriant = std::function<void(TensorValueArray& tensors)>;
BenchmarkerBase(Handle* handle, T timer) BenchmarkerBase(Handle* handle, T timer)
: m_timer(timer), : m_timer(timer),
...@@ -54,6 +56,8 @@ public: ...@@ -54,6 +56,8 @@ public:
} }
float exec(TensorLayoutArray layouts); float exec(TensorLayoutArray layouts);
float exect(const TensorValueArray& testcase_in);
//! disabiguate overloaded exec //! disabiguate overloaded exec
float execs(const TensorShapeArray& shapes) { return exec(shapes); } float execs(const TensorShapeArray& shapes) { return exec(shapes); }
float execl(const TensorLayoutArray& layouts) { return exec(layouts); } float execl(const TensorLayoutArray& layouts) { return exec(layouts); }
...@@ -73,6 +77,11 @@ public: ...@@ -73,6 +77,11 @@ public:
m_fmt[idx] = fmt; m_fmt[idx] = fmt;
return *this; return *this;
} }
BenchmarkerBase& set_tensors_constraint(
const TensorsConstriant& tensor_constraint) {
m_tensor_constraint = tensor_constraint;
return *this;
}
TensorLayoutArray make_layouts(const TensorShapeArray& shapes) { TensorLayoutArray make_layouts(const TensorShapeArray& shapes) {
TensorLayoutArray layouts(shapes.size()); TensorLayoutArray layouts(shapes.size());
for (size_t i = 0; i < shapes.size(); ++i) { for (size_t i = 0; i < shapes.size(); ++i) {
...@@ -142,6 +151,7 @@ private: ...@@ -142,6 +151,7 @@ private:
std::unique_ptr<OprProxy<Opr>> m_proxy; std::unique_ptr<OprProxy<Opr>> m_proxy;
BeforeExecCallback m_before_exec_callback; BeforeExecCallback m_before_exec_callback;
std::unique_ptr<Opr> m_opr; std::unique_ptr<Opr> m_opr;
TensorsConstriant m_tensor_constraint;
}; };
template <typename Opr, typename T> template <typename Opr, typename T>
...@@ -184,10 +194,16 @@ float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) { ...@@ -184,10 +194,16 @@ float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) {
auto rng = m_rng[i]; auto rng = m_rng[i];
if (!rng) if (!rng)
rng = m_default_rng.get(); rng = m_default_rng.get();
auto size = tensor.layout.span().high_byte;
rng->gen(tensor); rng->gen(tensor);
}
if (m_tensor_constraint) {
m_tensor_constraint(tensors_cur_host);
}
for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
TensorND& tensor = tensors_cur_host[i];
if (tensor.layout.ndim == 0) if (tensor.layout.ndim == 0)
continue; continue;
auto size = tensor.layout.span().high_byte;
megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr, tensor.raw_ptr, megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr, tensor.raw_ptr,
size); size);
} }
...@@ -243,6 +259,105 @@ float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) { ...@@ -243,6 +259,105 @@ float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) {
return time_in_ms; return time_in_ms;
} }
template <typename Opr, typename T>
float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) {
auto opr = this->opr();
opr->param() = m_param;
TensorLayoutArray layouts;
TensorNDArray tensors_cur_host;
for (auto& inp : testcase_in) {
layouts.push_back(inp.layout);
tensors_cur_host.emplace_back(inp);
}
auto user_layouts = layouts;
m_proxy->deduce_layout(opr, layouts);
for (size_t i = 0; i < layouts.size(); ++i)
if (user_layouts[i].ndim > 0) {
auto run = [&]() {
ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i]))
<< "User provided shape is "
<< user_layouts[i].TensorShape::to_string()
<< "\nExpected shape is "
<< layouts[i].TensorShape::to_string();
};
run();
}
auto allocate = [&layouts](Handle* handle) {
TensorNDArray tensors(layouts.size());
auto trans_func = [handle](const TensorLayout& layout) {
auto span = layout.span();
TensorND res;
res.raw_ptr = static_cast<uint8_t*>(
megdnn_malloc(handle, span.dist_byte())) +
span.low_byte;
res.layout = layout;
return res;
};
std::transform(layouts.begin(), layouts.end(), tensors.begin(),
trans_func);
return tensors;
};
auto tensors_cur = allocate(m_handle);
//! init
for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
TensorND& tensor = tensors_cur_host[i];
auto size = tensor.layout.span().high_byte;
if (tensor.layout.ndim == 0)
continue;
megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr, tensor.raw_ptr,
size);
}
if (m_before_exec_callback) {
m_before_exec_callback(opr, tensors_cur);
}
//! run
//! warm up
m_proxy->exec(opr, tensors_cur);
megcoreSynchronize(m_handle->megcore_computing_handle());
if (m_adaptive_secs) {
//! find m_times for adaptive benchmarking
m_times = 0;
int cur_times = 1;
auto remain_time = m_adaptive_secs * 1e6;
while (remain_time > 0) {
m_timer.reset();
m_timer.start();
for (int i = 0; i < cur_times; ++i)
m_proxy->exec(opr, tensors_cur);
megcoreSynchronize(m_handle->megcore_computing_handle());
m_timer.stop();
m_times += cur_times;
auto this_run_time = m_timer.get_time_in_us();
remain_time -= this_run_time;
cur_times = std::min(
cur_times * 2,
std::max<int>(1, remain_time / this_run_time * cur_times));
}
}
m_timer.reset();
m_timer.start();
for (size_t t = 0; t < m_times; ++t)
m_proxy->exec(opr, tensors_cur);
megcoreSynchronize(m_handle->megcore_computing_handle());
m_timer.stop();
auto time_in_ms = m_timer.get_time_in_us() / 1e3;
if (m_display) {
std::cout << "Total time is " << time_in_ms << "ms "
<< "for " << m_times << " run(s)." << std::endl;
}
auto free = [](Handle* handle, TensorNDArray& tensors) {
std::for_each(tensors.begin(), tensors.end(),
[handle](const TensorND& tensor) {
megdnn_free(handle, tensor.raw_ptr);
});
};
free(m_handle, tensors_cur);
if (m_adaptive_secs)
time_in_ms /= m_times;
return time_in_ms;
}
template <typename Opr, typename T = Timer> template <typename Opr, typename T = Timer>
class Benchmarker; class Benchmarker;
......
...@@ -294,8 +294,6 @@ void CheckerHelper::do_exec_with_testcases(const TensorValueArray& testcase_in, ...@@ -294,8 +294,6 @@ void CheckerHelper::do_exec_with_testcases(const TensorValueArray& testcase_in,
ASSERT_TRUE(testcase_in[i].layout.ndim == 0 || ASSERT_TRUE(testcase_in[i].layout.ndim == 0 ||
testcase_out[i].layout.ndim == 0 || testcase_out[i].layout.ndim == 0 ||
testcase_in[i].layout.eq_layout(testcase_out[i].layout)); testcase_in[i].layout.eq_layout(testcase_out[i].layout));
ASSERT_TRUE(testcase_in[i].layout.ndim != 0 ||
testcase_out[i].layout.ndim != 0);
layouts.emplace_back(testcase_in[i].layout.ndim > 0 layouts.emplace_back(testcase_in[i].layout.ndim > 0
? testcase_in[i].layout ? testcase_in[i].layout
: testcase_out[i].layout); : testcase_out[i].layout);
......
...@@ -392,7 +392,8 @@ TensorND TensorValue(const TensorShape& shape, T dtype, ...@@ -392,7 +392,8 @@ TensorND TensorValue(const TensorShape& shape, T dtype,
tensor.layout = {shape, dtype}; tensor.layout = {shape, dtype};
tensor.raw_ptr = tensor.raw_ptr =
static_cast<dt_byte*>(malloc(tensor.layout.span().dist_byte())); static_cast<dt_byte*>(malloc(tensor.layout.span().dist_byte()));
megdnn_assert(values.size() == tensor.layout.total_nr_elems()); megdnn_assert(values.size() == tensor.layout.total_nr_elems(), "%zu == %zu", values.size(),
tensor.layout.total_nr_elems());
auto ptr = tensor.ptr<typename DTypeTrait<T>::ctype>(); auto ptr = tensor.ptr<typename DTypeTrait<T>::ctype>();
for (const auto& v : values) { for (const auto& v : values) {
*ptr++ = typename DTypeTrait<T>::ctype(v); *ptr++ = typename DTypeTrait<T>::ctype(v);
......
/**
* \file
* dnn/test/common/dct_ref.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "test/common/dct_ref.h"
namespace megdnn {
namespace test {
struct FixCase {
std::vector<int> mask_offset;
std::vector<int> mask_val;
};
using Param = DctChannelSelectForward::Param;
static inline FixCase get_fix_mask(Param::FastImpl impl) {
std::vector<int> fix_32_mask_offset{0, 16, 24, 32};
std::vector<int> fix_32_mask_val{0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32,
25, 18, 11, 4, 5, 0, 1, 8, 16, 9, 2,
3, 10, 0, 1, 8, 16, 9, 2, 3, 10};
megdnn_assert(impl == Param::FastImpl::FIX_32_MASK,
"only support gen FIX_32_MASK");
return {fix_32_mask_offset, fix_32_mask_val};
}
CheckerHelper::TensorsConstriant gen_dct_constriant(
const size_t /* n */, const size_t ic, const size_t ih, const size_t iw,
const size_t oc, Param param) {
auto constraint = [=](CheckerHelper::TensorValueArray& tensors_orig) {
const size_t block = param.dct_block_size;
const int block_c = param.format == Param::Format::NCHW4 ? 4 : 1;
megdnn_assert(oc % block_c == 0, "oc mod block_c must == 0");
std::shared_ptr<DctTestcase> test_case_ptr = DctTestcase::make();
DctTestcase& test_case = *test_case_ptr.get();
UniformIntRNG rng(0, 255);
UniformIntRNG mask_rng(0, 64 / block_c - 1);
const size_t no_mask_oc = ic * block * block;
megdnn_assert(ih % block == 0, "%zu mod %zu == 0", ih, block);
megdnn_assert(iw % block == 0, "%zu mod %zu == 0", iw, block);
TensorND mask_offset;
TensorND mask_val;
std::vector<int>& mask_offset_vec = test_case.mask_offset_vec;
std::vector<int>& mask_val_vec = test_case.mask_val_vec;
UniformIntRNG rng_oc(0, oc);
if (param.fastImpl == Param::FastImpl::FIX_32_MASK) {
auto fix_32_mask = get_fix_mask(Param::FastImpl::FIX_32_MASK);
mask_offset_vec = fix_32_mask.mask_offset;
mask_val_vec = fix_32_mask.mask_val;
megdnn_assert(oc == 32, "oc must eq 32");
} else if (no_mask_oc > oc) {
size_t remain_oc = oc;
mask_offset_vec.resize(ic + 1);
mask_val_vec.resize(oc);
mask_offset_vec[0] = 0;
for (size_t ic_idx = 0; ic_idx < ic; ++ic_idx) {
size_t random_len = (int)rng_oc.gen_single_val() * block_c;
size_t mask_len = (ic_idx == ic - 1) || (remain_oc == 0)
? remain_oc
: random_len % remain_oc;
megdnn_assert(mask_len % block_c == 0,
"mask_len mod block_c == 0, but %zu mod %d ",
mask_len, block_c);
const size_t oc_idx = mask_offset_vec[ic_idx];
remain_oc -= mask_len;
mask_offset_vec[ic_idx + 1] = oc_idx + mask_len;
for (size_t mask_idx = 0; mask_idx < mask_len; ++mask_idx) {
mask_val_vec[oc_idx + mask_idx] =
(int)mask_rng.gen_single_val();
}
}
}
mask_offset = TensorND(mask_offset_vec.data(),
{{mask_offset_vec.size()}, dtype::Int32()});
mask_val = TensorND(mask_val_vec.data(),
{{mask_val_vec.size()}, dtype::Int32()});
if (tensors_orig.size() > 1) {
megdnn_assert(tensors_orig.size() == 4, "tensors_orig.size() == 4");
megdnn_assert(mask_offset_vec.size() >= 2,
"mask_offset_vec.size() >= 2");
megdnn_assert(tensors_orig[1].layout == mask_offset.layout,
"tensors_orig[1].layout == mask_offset.layout");
megdnn_assert(tensors_orig[2].layout == mask_val.layout,
"tensors_orig[2].layout == mask_val.layout");
auto naive_handle = create_cpu_handle(2, false);
megdnn_memcpy_D2D(naive_handle.get(), tensors_orig[1].raw_ptr,
mask_offset.raw_ptr,
mask_offset.layout.span().dist_byte());
megdnn_memcpy_D2D(naive_handle.get(), tensors_orig[2].raw_ptr,
mask_val.raw_ptr,
mask_val.layout.span().dist_byte());
}
};
return constraint;
}
std::shared_ptr<DctTestcase> gen_dct_case(const size_t n, const size_t ic,
const size_t ih, const size_t iw,
const size_t oc, Param param,
DType dst_dtype,
bool correct_result) {
const size_t block = param.dct_block_size;
const int block_c = param.format == Param::Format::NCHW4 ? 4 : 1;
megdnn_assert(oc % block_c == 0, "oc mod block_c must == 0");
std::shared_ptr<DctTestcase> test_case_ptr = DctTestcase::make();
DctTestcase& test_case = *test_case_ptr.get();
UniformIntRNG rng(0, 255);
UniformIntRNG mask_rng(0, 64 / block_c - 1);
const size_t input_elements = n * ic * ih * iw;
const size_t no_mask_oc = ic * block * block;
megdnn_assert(ih % block == 0, "%zu mod %zu == 0", ih, block);
megdnn_assert(iw % block == 0, "%zu mod %zu == 0", iw, block);
std::vector<uint8_t>& inp_vec = test_case.inp_vec;
inp_vec.resize(input_elements);
TensorShape input_shape{n, ic, ih, iw};
for (auto& elm : inp_vec) {
elm = (uint8_t)rng.gen_single_val();
}
auto src = TensorND(inp_vec.data(), {input_shape, dtype::Uint8()});
TensorND mask_offset;
TensorND mask_val;
std::vector<int>& mask_offset_vec = test_case.mask_offset_vec;
std::vector<int>& mask_val_vec = test_case.mask_val_vec;
UniformIntRNG rng_oc(0, oc);
if (param.fastImpl == Param::FastImpl::FIX_32_MASK) {
auto fix_32_mask = get_fix_mask(Param::FastImpl::FIX_32_MASK);
mask_offset_vec = fix_32_mask.mask_offset;
mask_val_vec = fix_32_mask.mask_val;
megdnn_assert(oc == 32, "oc must eq 32");
} else if (no_mask_oc > oc) {
size_t remain_oc = oc;
mask_offset_vec.resize(ic + 1);
mask_val_vec.resize(oc);
mask_offset_vec[0] = 0;
for (size_t ic_idx = 0; ic_idx < ic; ++ic_idx) {
size_t random_len = (int)rng_oc.gen_single_val() * block_c;
size_t mask_len = (ic_idx == ic - 1) || (remain_oc == 0)
? remain_oc
: random_len % remain_oc;
megdnn_assert(mask_len % block_c == 0,
"mask_len mod block_c == 0, but %zu mod %d ",
mask_len, block_c);
const size_t oc_idx = mask_offset_vec[ic_idx];
remain_oc -= mask_len;
mask_offset_vec[ic_idx + 1] = oc_idx + mask_len;
for (size_t mask_idx = 0; mask_idx < mask_len; ++mask_idx) {
mask_val_vec[oc_idx + mask_idx] =
(int)mask_rng.gen_single_val();
}
}
}
mask_offset = TensorND(mask_offset_vec.data(),
{{mask_offset_vec.size()}, dtype::Int32()});
mask_val = TensorND(mask_val_vec.data(),
{{mask_val_vec.size()}, dtype::Int32()});
if (mask_offset_vec.size() >= 2) {
test_case.testcase_in = {
src, mask_offset, mask_val, {nullptr, {{}, dst_dtype}}};
} else {
test_case.testcase_in = {src, {}, {}, {nullptr, {{}, dst_dtype}}};
}
auto naive_handle = create_cpu_handle(2, false);
auto opr_naive = naive_handle->create_operator<DctChannelSelectForward>();
opr_naive->param() = param;
using Proxy = OprProxy<DctChannelSelectForward>;
Proxy naive_proxy;
TensorLayout temp_dst_layout;
temp_dst_layout.dtype = dst_dtype;
TensorLayoutArray layouts{src.layout, mask_offset.layout, mask_val.layout,
temp_dst_layout};
naive_proxy.deduce_layout(opr_naive.get(), layouts);
const size_t output_elements = layouts[3].total_nr_elems();
std::vector<float>& output_vec = test_case.output_vec;
output_vec.resize(output_elements);
auto dst = TensorND(output_vec.data(), layouts[3]);
DctTestcase::TensorValueArray testcase_naive;
testcase_naive.emplace_back(test_case.testcase_in[0]);
testcase_naive.emplace_back(test_case.testcase_in[1]);
testcase_naive.emplace_back(test_case.testcase_in[2]);
testcase_naive.emplace_back(dst);
if (correct_result) {
naive_proxy.exec(opr_naive.get(), testcase_naive);
}
test_case.testcase_out = {{}, {}, {}, dst};
return test_case_ptr;
}
} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen
\ No newline at end of file
/**
* \file
* dnn/test/common/dct_ref.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include <math.h>
#include <vector>
#include "megdnn/dtype.h"
#include "megdnn/oprs/nn.h"
#include "test/common/checker.h"
#include "test/common/opr_proxy.h"
#include "test/common/rng.h"
namespace megdnn {
namespace test {
using Param = DctChannelSelectForward::Param;
struct DctTestcase {
using TensorValueArray = TensorNDArray;
TensorValueArray testcase_in;
TensorValueArray testcase_out;
std::vector<uint8_t> inp_vec;
std::vector<int> mask_offset_vec;
std::vector<int> mask_val_vec;
std::vector<float> output_vec;
static std::shared_ptr<DctTestcase> make() {
return std::make_shared<DctTestcase>();
}
};
CheckerHelper::TensorsConstriant gen_dct_constriant(
const size_t n, const size_t ic, const size_t ih, const size_t iw,
const size_t oc, Param param);
std::shared_ptr<DctTestcase> gen_dct_case(const size_t n, const size_t ic,
const size_t ih, const size_t iw,
const size_t oc, Param param,
DType dst_dtype = dtype::Float32(),
bool correct_result = true);
} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen
...@@ -110,6 +110,7 @@ DEF(BatchConvBiasForward, 5, true, true); ...@@ -110,6 +110,7 @@ DEF(BatchConvBiasForward, 5, true, true);
DEF(Remap, 3, true, true); DEF(Remap, 3, true, true);
DEF(RemapBackwardData, 3, true, false); DEF(RemapBackwardData, 3, true, false);
DEF(RemapBackwardMat, 4, true, false); DEF(RemapBackwardMat, 4, true, false);
DEF(DctChannelSelectForward, 4, true, true);
} // namespace test } // namespace test
} // namespace megdnn } // namespace megdnn
......
/**
* \file dnn/test/cuda/dct.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megdnn/oprs/nn.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/dct_ref.h"
#include "test/common/rng.h"
#include "test/cuda/fixture.h"
namespace megdnn {
namespace test {
TEST_F(CUDA, DCT) {
DctChannelSelectForward::Param param;
Checker<DctChannelSelectForward> checker(handle_cuda());
for (size_t n : {1, 3}) {
for (size_t ic : {1, 3}) {
for (size_t ih : {8, 16, 32, 512, 1024}) {
for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
checker.set_param(param)
.set_dtype(0, dtype::Uint8())
.set_dtype(1, dtype::Int32())
.set_dtype(2, dtype::Int32())
.execs({TensorShape{n, ic, ih, iw}, {}, {}, {}});
}
}
}
}
}
TEST_F(CUDA, DCT_QINT8) {
DctChannelSelectForward::Param param;
Checker<DctChannelSelectForward> checker(handle_cuda());
param.format = Param::Format::NCHW4;
for (size_t n : {1, 3}) {
for (size_t ic : {1, 3}) {
for (size_t ih : {8, 16, 32, 512, 1024}) {
for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
checker.set_param(param)
.set_dtype(0, dtype::Uint8())
.set_dtype(1, dtype::Int32())
.set_dtype(2, dtype::Int32())
.set_dtype(3, dtype::QuantizedS8(10.f))
.set_epsilon(1)
.execs({TensorShape{n, ic, ih, iw}, {}, {}, {}});
}
}
}
}
}
TEST_F(CUDA, DCT_WITH_FIX_32_MASK) {
using Param = DctChannelSelectForward::Param;
Param param;
Checker<DctChannelSelectForward> checker(handle_cuda(), false);
param.fastImpl = Param::FastImpl::FIX_32_MASK;
auto test_case = gen_dct_case(3, 3, 1024, 768, 32, param);
checker.set_param(param).exect(test_case->testcase_in,
test_case->testcase_out);
}
TEST_F(CUDA, DCT_WITH_FIX_32_MASK_QINT8) {
using Param = DctChannelSelectForward::Param;
Param param;
Checker<DctChannelSelectForward> checker(handle_cuda(), false);
param.fastImpl = Param::FastImpl::FIX_32_MASK;
param.format = Param::Format::NCHW4;
auto test_case =
gen_dct_case(3, 3, 1024, 768, 32, param, dtype::QuantizedS8(10.f));
checker.set_param(param).set_epsilon(1).exect(test_case->testcase_in,
test_case->testcase_out);
}
TEST_F(CUDA, DCT_WITH_MASK) {
Checker<DctChannelSelectForward> checker(handle_cuda(), false);
DctChannelSelectForward::Param param;
checker.set_param(param).exect(
Testcase{TensorValue(
{1, 3, 8, 16}, dtype::Uint8(),
{109, 39, 30, 115, 71, 15, 206, 139, 221, 5,
18, 16, 93, 185, 99, 102, 205, 172, 191, 29,
185, 6, 47, 84, 0, 47, 105, 203, 251, 73,
196, 83, 3, 211, 32, 181, 49, 111, 114, 83,
148, 232, 77, 17, 35, 2, 154, 100, 41, 135,
141, 206, 56, 91, 137, 199, 104, 192, 75, 122,
78, 65, 184, 69, 91, 82, 2, 172, 194, 240,
49, 145, 87, 210, 97, 190, 179, 93, 125, 105,
181, 207, 148, 178, 133, 53, 25, 198, 238, 151,
14, 120, 213, 195, 145, 20, 122, 107, 217, 185,
65, 5, 115, 110, 82, 206, 163, 86, 2, 2,
44, 125, 50, 38, 41, 106, 30, 5, 151, 243,
238, 181, 232, 191, 161, 57, 23, 204,
109, 39, 30, 115, 71, 15, 206, 139, 221, 5,
18, 16, 93, 185, 99, 102, 205, 172, 191, 29,
185, 6, 47, 84, 0, 47, 105, 203, 251, 73,
196, 83, 3, 211, 32, 181, 49, 111, 114, 83,
148, 232, 77, 17, 35, 2, 154, 100, 41, 135,
141, 206, 56, 91, 137, 199, 104, 192, 75, 122,
78, 65, 184, 69, 91, 82, 2, 172, 194, 240,
49, 145, 87, 210, 97, 190, 179, 93, 125, 105,
181, 207, 148, 178, 133, 53, 25, 198, 238, 151,
14, 120, 213, 195, 145, 20, 122, 107, 217, 185,
65, 5, 115, 110, 82, 206, 163, 86, 2, 2,
44, 125, 50, 38, 41, 106, 30, 5, 151, 243,
238, 181, 232, 191, 161, 57, 23, 204,
109, 39, 30, 115, 71, 15, 206, 139, 221, 5,
18, 16, 93, 185, 99, 102, 205, 172, 191, 29,
185, 6, 47, 84, 0, 47, 105, 203, 251, 73,
196, 83, 3, 211, 32, 181, 49, 111, 114, 83,
148, 232, 77, 17, 35, 2, 154, 100, 41, 135,
141, 206, 56, 91, 137, 199, 104, 192, 75, 122,
78, 65, 184, 69, 91, 82, 2, 172, 194, 240,
49, 145, 87, 210, 97, 190, 179, 93, 125, 105,
181, 207, 148, 178, 133, 53, 25, 198, 238, 151,
14, 120, 213, 195, 145, 20, 122, 107, 217, 185,
65, 5, 115, 110, 82, 206, 163, 86, 2, 2,
44, 125, 50, 38, 41, 106, 30, 5, 151, 243,
238, 181, 232, 191, 161, 57, 23, 204}),
TensorValue({4}, dtype::Int32(), {0, 14, 22, 30}),
TensorValue({30}, dtype::Int32(),
{8, 16, 9, 2, 3, 10, 17, 24, 32, 25,
18, 11, 4, 5, 0, 1, 8, 16, 9, 2,
3, 10, 0, 1, 8, 16, 9, 2, 3, 10}),
{}},
Testcase{{},
{},
{},
TensorValue({1, 30, 1, 2}, dtype::Float32(),
{-22.850792, -97.862236, -101.043236,
-4.727012, 28.275675, -157.96654,
42.1377, 45.06531, -149.77373,
24.487143, -8.054966, -13.990831,
-6.9395194, -3.9211385, 64.79172,
-12.363858, -47.875, 59.,
56.271786, -62.725567, 120.522675,
16.559765, 85.74334, 112.904495,
99.375, 29.499973, 2.0220923,
-19.681704, 890.12494, 941.25,
-7.0498576, 99.47632, -22.850792,
-97.862236, -101.043236, -4.727012,
28.275675, -157.96654, 42.1377,
45.06531, -149.77373, 24.487143,
-8.054966, -13.990831, 890.12494,
941.25, -7.0498576, 99.47632,
-22.850792, -97.862236, -101.043236,
-4.727012, 28.275675, -157.96654,
42.1377, 45.06531, -149.77373,
24.487143, -8.054966, -13.990831})});
}
TEST_F(CUDA, DCT_WITH_MASK2) {
Checker<DctChannelSelectForward> checker(handle_cuda(), false);
DctChannelSelectForward::Param param;
UniformIntRNG rng_oc(0, 3 * 64);
for (size_t n : {1, 3}) {
for (size_t ic : {1, 3}) {
for (size_t ih : {8, 16, 32, 512, 1024}) {
for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
int random_oc = static_cast<int>(rng_oc.gen_single_val());
int max_oc = ic * 64;
int mask_oc = (random_oc % max_oc) + 1;
auto test_case =
gen_dct_case(n, ic, ih, iw, mask_oc, param);
checker.set_param(param).exect(test_case->testcase_in,
test_case->testcase_out);
}
}
}
}
}
TEST_F(CUDA, DCT_WITH_MASK2_QINT8) {
Checker<DctChannelSelectForward> checker(handle_cuda(), false);
DctChannelSelectForward::Param param;
param.format = DctChannelSelectForward::Param::Format::NCHW4;
UniformIntRNG rng_oc(0, 3 * 64);
for (size_t n : {1, 3}) {
for (size_t ic : {1, 3}) {
for (size_t ih : {8, 16, 32, 512, 1024}) {
for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
int random_oc = static_cast<int>(rng_oc.gen_single_val());
int max_oc = ic * 64;
int mask_oc = (random_oc % max_oc) + 1;
mask_oc = (mask_oc + 3) / 4 * 4;
auto test_case = gen_dct_case(n, ic, ih, iw, mask_oc, param,
dtype::QuantizedS8(10.f));
checker.set_param(param).set_epsilon(1).exect(
test_case->testcase_in, test_case->testcase_out);
}
}
}
}
}
TEST_F(CUDA, DCT_WITH_MASK2_QINT8_CONSTRAINT) {
DctChannelSelectForward::Param param;
param.format = DctChannelSelectForward::Param::Format::NCHW4;
Checker<DctChannelSelectForward> checker(handle_cuda(), false);
checker.set_param(param)
.set_dtype(0, dtype::Uint8())
.set_dtype(1, dtype::Int32())
.set_dtype(2, dtype::Int32())
.set_dtype(3, dtype::QuantizedS8(10.f))
.set_epsilon(1);
UniformIntRNG rng_oc(0, 3 * 64);
for (size_t n : {1, 3}) {
for (size_t ic : {1, 3}) {
for (size_t ih : {8, 16, 32, 512, 1024}) {
for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
int random_oc = static_cast<int>(rng_oc.gen_single_val());
int max_oc = ic * 64;
int mask_oc = (random_oc % max_oc) + 1;
mask_oc = (mask_oc + 3) / 4 * 4;
if (mask_oc < max_oc) {
checker
.set_tensors_constraint(gen_dct_constriant(
n, ic, ih, iw, mask_oc, param))
.exec({TensorShape{n, ic, ih, iw},
TensorShape{ic + 1},
TensorShape{(size_t)mask_oc},
{}});
} else {
checker.set_tensors_constraint({}).exec(
{TensorShape{n, ic, ih, iw}, {}, {}, {}});
}
}
}
}
}
}
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_DCT) {
using Param = DctChannelSelectForward::Param;
auto run = [&](const TensorShapeArray& shapes, Param param) {
Benchmarker<DctChannelSelectForward> benchmarker(handle_cuda());
benchmarker.set_param(param);
benchmarker.set_dtype(0, dtype::Uint8())
.set_dtype(1, dtype::Int32())
.set_dtype(2, dtype::Int32());
for (auto&& shape : shapes) {
double computation = double(shape[0]) * shape[1] * shape[2] *
shape[3] * 32.0 * 1e-6;
auto time_ms = benchmarker.execs({shape, {}, {}, {}});
printf("execute %s, %.4f Gops\n", shape.to_string().c_str(),
computation / time_ms);
}
};
auto run_case = [&](const DctTestcase& testcase, Param param,
std::string comment = "") {
Benchmarker<DctChannelSelectForward> benchmarker(handle_cuda());
benchmarker.set_param(param);
benchmarker.set_dtype(0, dtype::Uint8())
.set_dtype(1, dtype::Int32())
.set_dtype(2, dtype::Int32())
.set_dtype(3, testcase.testcase_out[3].layout.dtype);
auto src_shape = testcase.testcase_in[0].layout;
double computation = double(src_shape[0]) * src_shape[1] *
src_shape[2] * src_shape[3] * 32.0 * 1e-6;
auto time_ms = benchmarker.exect(testcase.testcase_in);
printf("[%s] execute %s, %.4f Gops\n", comment.c_str(),
src_shape.to_string().c_str(), computation / time_ms);
};
auto run_case_constraint =
[&](const Benchmarker<DctChannelSelectForward>::TensorsConstriant&
constraint,
Param param, const TensorShapeArray& shapes,
std::string comment = "", DType output_dtype) {
Benchmarker<DctChannelSelectForward> benchmarker(handle_cuda());
benchmarker.set_param(param)
.set_dtype(0, dtype::Uint8())
.set_dtype(1, dtype::Int32())
.set_dtype(2, dtype::Int32())
.set_dtype(3, output_dtype)
.set_tensors_constraint(constraint);
auto src_shape = shapes[0];
double computation = double(src_shape[0]) * src_shape[1] *
src_shape[2] * src_shape[3] * 32.0 * 1e-6;
auto time_ms = benchmarker.exec(shapes);
printf("[%s] execute %s, %.4f Gops\n", comment.c_str(),
src_shape.to_string().c_str(), computation / time_ms);
};
TensorShapeArray shapes = {
{1, 3, 512, 512},
{8, 3, 2176, 3840},
};
{
Param param;
run(shapes, param);
}
Param fix_32_param;
fix_32_param.fastImpl = Param::FastImpl::FIX_32_MASK;
{
auto test_case = gen_dct_case(8, 3, 2176, 3840, 32, fix_32_param);
run_case(*test_case, fix_32_param, "FIX_32_MASK");
}
{
Param param;
auto test_case = gen_dct_case(8, 3, 2176, 3840, 32, fix_32_param);
run_case(*test_case, param, "MASK 32");
}
{
Param fix_32_nchw4_param;
fix_32_nchw4_param.fastImpl = Param::FastImpl::FIX_32_MASK;
fix_32_nchw4_param.format = Param::Format::NCHW4;
auto test_case = gen_dct_case(8, 3, 2176, 3840, 32, fix_32_nchw4_param,
dtype::QuantizedS8(10.f));
run_case(*test_case, fix_32_nchw4_param, "FIX_32_MASK QINT8");
}
{
Param fix_32_nchw4_param;
fix_32_nchw4_param.fastImpl = Param::FastImpl::FIX_32_MASK;
fix_32_nchw4_param.format = Param::Format::NCHW4;
auto test_case = gen_dct_case(8, 3, 2176, 3840, 32, fix_32_nchw4_param,
dtype::QuantizedS8(10.f));
fix_32_nchw4_param.fastImpl = Param::FastImpl::NONE;
run_case(*test_case, fix_32_nchw4_param, "MASK 32 QINT8");
}
{
Param fix_32_nchw4_param;
fix_32_nchw4_param.fastImpl = Param::FastImpl::FIX_32_MASK;
fix_32_nchw4_param.format = Param::Format::NCHW4;
TensorShapeArray shapes = {{8, 3, 2176, 3840}, {4}, {32}, {}};
auto constraint =
gen_dct_constriant(8, 3, 2176, 3840, 32, fix_32_nchw4_param);
run_case_constraint(constraint, fix_32_nchw4_param, shapes,
"FIX_32_MASK QINT8 Constraint",
dtype::QuantizedS8(10.f));
}
}
#endif
} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册