Commit 95101362 authored by: Megvii Engine Team

fix(mgb/rocm): remove begin-internal of rocm

GitOrigin-RevId: 1523833fcbc48b1e56bbb27e4d07738cd8ece90c
Parent 92f7cceb
@@ -537,6 +537,11 @@ set(MGB_CUDA ${MGE_WITH_CUDA})
set(MEGDNN_WITH_CUDA ${MGE_WITH_CUDA})
#ROCM
set(MGB_ROCM ${MGE_WITH_ROCM})
set(MEGDNN_WITH_ROCM ${MGE_WITH_ROCM})
# CAMBRICON
set(MGB_CAMBRICON ${MGE_WITH_CAMBRICON})
set(MEGDNN_WITH_CAMBRICON ${MGE_WITH_CAMBRICON})
/**
* \file dnn/include/hcc_detail/hcc_defs_epilogue.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#ifdef __HIP_PLATFORM_HCC__
#undef __HIP_PLATFORM_HCC__
#else
#error "hcc_defs_epilogue.h must be included after hcc_defs_prologue.h"
#endif
// vim: syntax=cpp.doxygen
/**
* \file dnn/include/hcc_detail/hcc_defs_prologue.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#define __HIP_PLATFORM_HCC__
// vim: syntax=cpp.doxygen
/**
* \file dnn/include/hip_header.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
/**
* \remarks The files in the subdirectory include/hip are copied from the HIP
* headers provided by ROCm-Developer-Tools/HIP, which can be found at
* https://github.com/ROCm-Developer-Tools/HIP. They are included so that
* MegDNN can be compiled with both CUDA and ROCm backends, with both
* backends sharing the same code.
*/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#pragma GCC diagnostic ignored "-Wsign-compare"
#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#pragma GCC diagnostic pop
#if !defined(__HIP_PLATFORM_HCC__)
#error "platform macro __HIP_PLATFORM_HCC__ must be defined"
#endif
// vim: syntax=cpp.doxygen
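For reference, a minimal sketch (file name and contents hypothetical, not part of this commit) of the include order the prologue/epilogue guards above enforce: the prologue defines __HIP_PLATFORM_HCC__ before any HIP header is pulled in, hip_header.h then errors out if the macro is missing, and the epilogue undefines it again.
// example_rocm_unit.cpp.hip -- hypothetical translation unit in the ROCm backend
#include "hcc_detail/hcc_defs_prologue.h"  // defines __HIP_PLATFORM_HCC__
#include "hip_header.h"                    // wraps the HIP runtime headers; checks the macro

__global__ void noop_kernel() {}           // trivial kernel using the HIP runtime

#include "hcc_detail/hcc_defs_epilogue.h"  // undefines __HIP_PLATFORM_HCC__ again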
@@ -19,6 +19,7 @@
typedef enum {
megcorePlatformCPU = 1,
megcorePlatformCUDA = 4,
megcorePlatformROCM = 6,
megcorePlatformCambricon = 7,
megcorePlatformAtlas = 8,
} megcorePlatform_t;
/**
* \file dnn/include/megcore_rocm.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "./megcore.h"
#include "hip_header.h"
#include "megdnn/internal/visibility_prologue.h"
namespace megcore {
struct ROCMContext {
hipStream_t stream = nullptr;
static std::atomic_bool sm_miopen_algo_search;
static inline bool enable_miopen_algo_search() { return sm_miopen_algo_search.load(); }
static inline void enable_miopen_algo_search(bool enable_algo_search) {
sm_miopen_algo_search.store(enable_algo_search);
}
//! device pointer to buffer for error reporting from kernels
AsyncErrorInfo* error_info = nullptr;
ROCMContext() = default;
ROCMContext(hipStream_t s, AsyncErrorInfo* e) : stream{s}, error_info{e} {}
};
megcoreStatus_t createComputingHandleWithROCMContext(
megcoreComputingHandle_t* compHandle, megcoreDeviceHandle_t devHandle,
unsigned int flags, const ROCMContext& ctx);
megcoreStatus_t getROCMContext(megcoreComputingHandle_t handle,
ROCMContext* ctx);
// Set MIOpen algo search enabled or disabled
megcoreStatus_t enableMIOpenAlgoSearch(bool enable_algo_search = true);
// Find out whether MIOpen algo search is enabled or disabled
megcoreStatus_t getMIOpenAlgoSearchStatus(bool* algo_search_enabled);
} // namespace megcore
static inline megcoreStatus_t megcoreCreateComputingHandleWithROCMStream(
megcoreComputingHandle_t* compHandle, megcoreDeviceHandle_t devHandle,
unsigned int flags, hipStream_t stream) {
megcore::ROCMContext ctx;
ctx.stream = stream;
return megcore::createComputingHandleWithROCMContext(compHandle, devHandle,
flags, ctx);
}
static inline megcoreStatus_t megcoreGetROCMStream(
megcoreComputingHandle_t handle, hipStream_t* stream) {
megcore::ROCMContext ctx;
auto ret = megcore::getROCMContext(handle, &ctx);
*stream = ctx.stream;
return ret;
}
#include "megdnn/internal/visibility_epilogue.h"
// vim: syntax=cpp.doxygen
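A minimal host-side usage sketch of the API declared above (not part of this commit). The megcoreCreateDeviceHandle and megcoreDestroy* calls, including their argument order, come from the generic megcore.h API and are assumptions here; error checking is omitted.
#include "megcore_rocm.h"

// Wrap an existing hipStream_t in a megcore computing handle and toggle
// MIOpen algorithm search.
void rocm_handle_sketch(hipStream_t stream) {
    megcoreDeviceHandle_t dev_handle;
    megcoreCreateDeviceHandle(&dev_handle, megcorePlatformROCM);  // assumed signature from megcore.h

    megcoreComputingHandle_t comp_handle;
    megcoreCreateComputingHandleWithROCMStream(&comp_handle, dev_handle, 0, stream);

    megcore::enableMIOpenAlgoSearch(true);        // allow exhaustive MIOpen algo search

    hipStream_t queried = nullptr;
    megcoreGetROCMStream(comp_handle, &queried);  // round-trips the stream set above

    megcoreDestroyComputingHandle(comp_handle);   // assumed cleanup calls from megcore.h
    megcoreDestroyDeviceHandle(dev_handle);
}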
@@ -33,6 +33,7 @@ class Handle {
ARMV7 = 4,
AARCH64 = 5,
CUDA = 6,
ROCM = 11,
ATLAS = 13,
CAMBRICON = 12,
};
@@ -71,6 +72,13 @@ class Handle {
template <typename opr>
std::unique_ptr<opr> create_cuda_operator();
#endif
#if MEGDNN_WITH_ROCM
static std::unique_ptr<Handle> make_rocm_handle(
megcoreComputingHandle_t computing_handle);
template <typename opr>
std::unique_ptr<opr> create_rocm_operator();
#endif
virtual ~Handle();
@@ -11,6 +11,7 @@ def main():
description='generate elemwise impl files',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--type', type=str, choices=['cuda',
'hip',
'cpp'],
default='cpp', help='generate cuda/hip kernel file')
parser.add_argument('output', help='output directory')
@@ -21,6 +22,8 @@ def main():
if args.type == 'cuda':
cpp_ext = 'cu'
elif args.type == 'hip':
cpp_ext = 'cpp.hip'
else:
assert args.type == 'cpp'
cpp_ext = 'cpp'
@@ -11,6 +11,7 @@ def main():
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--type', type=str, choices=[
'cuda',
'hip'
],
default='cuda',
help='generate cuda/hip elemwise special kernel file')
@@ -22,6 +23,9 @@ def main():
if args.type == 'cuda':
cpp_ext = 'cu'
else:
assert args.type == 'hip'
cpp_ext = 'cpp.hip'
for dtype in DTYPES.keys():
fname = 'special_{}.{}'.format(dtype, cpp_ext)
@@ -91,6 +91,13 @@ std::unique_ptr<Handle> Handle::make(megcoreComputingHandle_t computing_handle,
}
}
MIDOUT_END();
#endif
}
else if (platform == megcorePlatformROCM) {
#if MEGDNN_WITH_ROCM
return make_rocm_handle(computing_handle);
#else
return nullptr;
#endif
}
else if (platform == megcorePlatformCambricon) {
@@ -193,6 +200,14 @@ std::unique_ptr<Handle> Handle::make(megcoreComputingHandle_t computing_handle,
#if MEGDNN_WITH_ATLAS
CASE(ATLAS, atlas);
#endif
#if MEGDNN_WITH_ROCM
case HandleType::ROCM: {
MIDOUT_BEGIN(HandleOpr, Opr, midout_iv(HandleType::ROCM)) {
return create_rocm_operator<Opr>();
}
MIDOUT_END();
}
#endif
#if MEGDNN_WITH_CAMBRICON
CASE(CAMBRICON, cambricon);
#endif
@@ -18,6 +18,10 @@
#endif
#if MEGDNN_WITH_ROCM
#include "src/rocm/megcore/computing_context.hpp"
#endif
#if MEGDNN_WITH_CAMBRICON
#include "src/cambricon/megcore/cambricon_computing_context.hpp"
#endif
@@ -41,6 +45,10 @@ std::unique_ptr<ComputingContext> ComputingContext::make(
case megcorePlatformCUDA:
return make_unique<cuda::CUDAComputingContext>(dev_handle, flags);
#endif
#if MEGDNN_WITH_ROCM
case megcorePlatformROCM:
return make_rocm_computing_context(dev_handle, flags);
#endif
#if MEGDNN_WITH_CAMBRICON
case megcorePlatformCambricon:
return make_unique<cambricon::CambriconComputingContext>(dev_handle,
@@ -15,6 +15,9 @@
#if MEGDNN_WITH_CUDA
#include "src/cuda/megcore/cuda_device_context.hpp"
#endif
#if MEGDNN_WITH_ROCM
#include "src/rocm/megcore/device_context.hpp"
#endif
#if MEGDNN_WITH_CAMBRICON
#include "src/cambricon/megcore/cambricon_device_context.hpp"
#endif
@@ -36,6 +39,10 @@ std::unique_ptr<DeviceContext> DeviceContext::make(megcorePlatform_t platform,
case megcorePlatformCUDA:
return make_unique<cuda::CUDADeviceContext>(deviceID, flags);
#endif
#if MEGDNN_WITH_ROCM
case megcorePlatformROCM:
return make_rocm_device_context(deviceID, flags);
#endif
#if MEGDNN_WITH_CAMBRICON
case megcorePlatformCambricon:
return make_unique<cambricon::CambriconDeviceContext>(deviceID,
/**
* \file src/rocm/add_update/add_update.cpp.hip
*
* This file is part of MegDNN, a deep neural network run-time library
* developed by Megvii.
*
* \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
*/
#include "hcc_detail/hcc_defs_prologue.h"
#include "./add_update.h.hip"
namespace megdnn {
namespace rocm {
#define cb(_dtype) \
INST_RUN_ELEMWISE(AddUpdateKernOp<DTypeTrait<_dtype>::ctype>, \
DTypeTrait<_dtype>::ctype, 1); \
INST_RUN_ELEMWISE(AddUpdateKernOpNonContig<DTypeTrait<_dtype>::ctype>, \
DTypeTrait<_dtype>::ctype, 2);
MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
} // namespace rocm
} // namespace megdnn
// vim: ft=cpp syntax=cpp.doxygen
/**
*
* \file src/rocm/add_update/add_update.h.hip
*
* This file is part of MegDNN, a deep neural network run-time library
* developed by Megvii.
*
* \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
*/
#pragma once
#include "hip_header.h"
#include "src/rocm/elemwise_helper.h.hip"
#if MEGDNN_CC_HOST
#include "megdnn/oprs.h"
#endif
namespace megdnn {
namespace rocm {
template<typename ctype>
struct AddUpdateKernOp {
ctype *dst;
ctype alpha, beta, bias;
__device__ void operator() (uint32_t idx, ctype delta) {
dst[idx] = dst[idx] * alpha + delta * beta + bias;
}
#if MEGDNN_CC_HOST
AddUpdateKernOp(const TensorND &dest, const AddUpdate::Param &param):
dst{dest.ptr<ctype>()},
alpha(param.alpha), beta(param.beta), bias(param.bias)
{
}
#endif
};
template<typename ctype>
struct AddUpdateKernOpNonContig {
ctype alpha, beta, bias;
__device__ void operator() (uint32_t /*idx*/, ctype &dst, ctype delta) {
dst = dst * alpha + delta * beta + bias;
}
#if MEGDNN_CC_HOST
AddUpdateKernOpNonContig(const AddUpdate::Param &param):
alpha(param.alpha), beta(param.beta), bias(param.bias)
{
}
#endif
};
}
}
// vim: ft=cpp syntax=cpp.doxygen
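The two functors above implement the same elementwise update, dst = dst * alpha + delta * beta + bias, for contiguous and non-contiguous destinations. A tiny host-only illustration of that formula (plain C++, not the MegDNN API):
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> dst{1.f, 2.f, 3.f}, delta{10.f, 20.f, 30.f};
    const float alpha = 0.5f, beta = 0.1f, bias = 1.f;
    for (size_t i = 0; i < dst.size(); ++i)
        dst[i] = dst[i] * alpha + delta[i] * beta + bias;   // the AddUpdate rule
    for (float v : dst) std::printf("%g ", v);               // prints: 2.5 4 5.5
    std::printf("\n");
}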
/**
* \file dnn/src/rocm/add_update/opr_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "hcc_detail/hcc_defs_prologue.h"
#include "./opr_impl.h"
#include "src/rocm/add_update/add_update.h.hip"
#include "src/common/utils.h"
using namespace megdnn;
using namespace rocm;
void AddUpdateForwardImpl::exec(_megdnn_tensor_inout dest,
_megdnn_tensor_in delta) {
check_exec(dest.layout, delta.layout);
if (!dest.layout.is_contiguous()) {
return exec_noncontig(dest, delta);
}
ElemwiseOpParamN<1> param;
param[0] = delta;
param[0].layout = param[0].layout.broadcast(dest.layout);
param.init_from_given_tensor();
auto stream = hip_stream(handle());
switch (dest.layout.dtype.enumv()) {
#define cb(_dt) \
case DTypeTrait<_dt>::enumv: { \
using ctype = DTypeTrait<_dt>::ctype; \
return run_elemwise<AddUpdateKernOp<ctype>, ctype, 1>( \
param, stream, {dest, m_param}); \
}
MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
#undef cb
default:
megdnn_throw(megdnn_mangle("unsupported dtype for AddUpdate"));
}
}
void AddUpdateForwardImpl::exec_noncontig(_megdnn_tensor_inout dest,
_megdnn_tensor_in delta) {
ElemwiseOpParamN<2> param = make_param(dest, delta);
auto stream = hip_stream(handle());
switch (dest.layout.dtype.enumv()) {
#define cb(_dt) \
case DTypeTrait<_dt>::enumv: { \
using ctype = DTypeTrait<_dt>::ctype; \
return run_elemwise<AddUpdateKernOpNonContig<ctype>, ctype, 2>( \
param, stream, {m_param}); \
}
MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
#undef cb
default:
megdnn_throw(megdnn_mangle("unsupported dtype for AddUpdate"));
}
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/add_update/opr_impl.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "megdnn/oprs.h"
#include "src/common/add_update_helper.h"
#include "src/rocm/utils.h"
namespace megdnn {
namespace rocm {
class AddUpdateForwardImpl final : public AddUpdateForwardHelper {
void exec_noncontig(_megdnn_tensor_inout dest, _megdnn_tensor_in delta);
public:
using AddUpdateForwardHelper::AddUpdateForwardHelper;
void exec(_megdnn_tensor_inout dest, _megdnn_tensor_in delta) override;
bool is_thread_safe() const override { return true; }
};
} // namespace rocm
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file src/rocm/argmxx/argmxx.cpp.hip
*
* This file is part of MegDNN, a deep neural network run-time library
* developed by Megvii.
*
* \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
*/
#include "hcc_detail/hcc_defs_prologue.h"
#include "hip_header.h"
#include "src/common/argmxx_helper.h"
#include "src/rocm/reduce_helper.h.hip"
#include "megdnn/dtype.h"
namespace megdnn {
namespace rocm {
#define INST(_dt) \
INST_REDUCE(argmxx::ArgmxxOp<DTypeTrait<_dt>::ctype MEGDNN_COMMA false>, false); \
INST_REDUCE(argmxx::ArgmxxOp<DTypeTrait<_dt>::ctype MEGDNN_COMMA true>, false);
MEGDNN_FOREACH_COMPUTING_DTYPE(INST)
} // namespace rocm
} // namespace megdnn
/**
* \file dnn/src/rocm/argmxx/opr_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "hcc_detail/hcc_defs_prologue.h"
#include "src/rocm/argmxx/opr_impl.h"
#include "src/rocm/utils.h"
#include "src/common/reduce_helper.h"
#include "src/common/argmxx_helper.h"
#include "src/rocm/reduce_helper.h.hip"
namespace {
using namespace megdnn;
using namespace rocm;
using namespace argmxx;
template <typename T, bool is_max>
size_t get_workspace_in_bytes_impl(const TensorLayout &src,
const TensorLayout & /* dst */,
size_t axis)
{
size_t A, B, C;
reduce::get_ABC(src, A, B, C, axis);
return get_reduce_workspace_in_bytes<argmxx::ArgmxxOp<T, is_max>>(
A, B, C);
}
template <typename T, bool is_max>
void exec_impl(const T *src, int *dst, void *workspace,
size_t A, size_t B, size_t C,
hipStream_t stream)
{
argmxx::ArgmxxOp<T, is_max> opr(const_cast<T *>(src), dst, A, B, C);
run_reduce<argmxx::ArgmxxOp<T, is_max>, false>(
(typename argmxx::ArgmxxOp<T, is_max>::wtype *)workspace,
A, B, C,
stream, opr);
after_kernel_launch();
}
} // anonymous namespace
namespace megdnn {
namespace rocm {
size_t ArgmaxForwardImpl::get_workspace_in_bytes(const TensorLayout &src,
const TensorLayout &dst)
{
#define cb(dt) \
if (src.dtype == dt()) { \
using ctype = typename DTypeTrait<dt>::ctype; \
return get_workspace_in_bytes_impl<ctype, true>(src, dst, param().axis); \
}
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
megdnn_assert_internal(false);
}
void ArgmaxForwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_out dst,
_megdnn_workspace workspace)
{
check_exec(src.layout, dst.layout, workspace.size);
size_t A, B, C;
reduce::get_ABC(src.layout, A, B, C, param().axis);
auto stream = hip_stream(handle());
#define cb(dt) \
if (src.layout.dtype.enumv() == DTypeTrait<dt>::enumv) { \
using ctype = typename DTypeTrait<dt>::ctype; \
exec_impl<ctype, true>(src.ptr<ctype>(), \
dst.ptr<dt_int32>(), \
workspace.raw_ptr, \
A, B, C, stream); \
return; \
}
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
megdnn_throw(megdnn_mangle(ssprintf("Unsupported DType: %s",
src.layout.dtype.name())));
}
size_t ArgminForwardImpl::get_workspace_in_bytes(const TensorLayout &src,
const TensorLayout &dst)
{
#define cb(dt) \
if (src.dtype == dt()) { \
using ctype = typename DTypeTrait<dt>::ctype; \
return get_workspace_in_bytes_impl<ctype, false>(src, dst, param().axis); \
}
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
megdnn_assert_internal(false);
}
void ArgminForwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_out dst,
_megdnn_workspace workspace)
{
check_exec(src.layout, dst.layout, workspace.size);
size_t A, B, C;
reduce::get_ABC(src.layout, A, B, C, param().axis);
auto stream = hip_stream(handle());
#define cb(dt) \
if (src.layout.dtype.enumv() == DTypeTrait<dt>::enumv) { \
using ctype = typename DTypeTrait<dt>::ctype; \
exec_impl<ctype, false>(src.ptr<ctype>(), \
dst.ptr<dt_int32>(), \
workspace.raw_ptr, \
A, B, C, stream); \
return; \
}
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
megdnn_throw(megdnn_mangle(ssprintf("Unsupported DType: %s",
src.layout.dtype.name())));
}
} // namespace rocm
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/argmxx/opr_impl.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "megdnn/oprs.h"
namespace megdnn {
namespace rocm {
class ArgmaxForwardImpl final: public ArgmaxForward {
public:
using ArgmaxForward::ArgmaxForward;
void exec(_megdnn_tensor_in src,
_megdnn_tensor_out dst,
_megdnn_workspace workspace) override;
size_t get_workspace_in_bytes(const TensorLayout &src,
const TensorLayout &dst) override;
};
class ArgminForwardImpl: public ArgminForward {
public:
using ArgminForward::ArgminForward;
void exec(_megdnn_tensor_in src,
_megdnn_tensor_out dst,
_megdnn_workspace) override;
size_t get_workspace_in_bytes(const TensorLayout &src,
const TensorLayout &dst) override;
};
} // namespace rocm
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/batched_matrix_mul/opr_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "hcc_detail/hcc_defs_prologue.h"
#include "./opr_impl.h"
#include "src/common/utils.cuh"
#include "src/rocm/handle.h"
#include "src/rocm/utils.h"
namespace megdnn {
namespace rocm {
void BatchedMatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B,
_megdnn_tensor_out C,
_megdnn_workspace workspace) {
check_exec(A.layout, B.layout, C.layout, workspace.size);
auto dtype = A.layout.dtype;
megdnn_assert(dtype.category() == DTypeCategory::FLOAT &&
param().format == param::MatrixMul::Format::DEFAULT);
if (dtype == dtype::Float32() ||
MEGDNN_FLOAT16_SELECT(dtype == dtype::Float16(), false)) {
auto batch = A.layout.shape[0];
auto m = C.layout.shape[1], n = C.layout.shape[2];
auto k = A.layout.shape[param().transposeA ? 1 : 2];
auto handle = concrete_handle(this->handle());
auto rocblas_handle_ = handle->get_rocblas_handle();
auto io32_c32 = [&]() {
auto zero = handle->zero_device();
auto one = handle->one_device();
rocblas_check(rocblas_sgemm_strided_batched(
rocblas_handle_,
param().transposeB ? rocblas_operation_transpose
: rocblas_operation_none,
param().transposeA ? rocblas_operation_transpose
: rocblas_operation_none,
n, m, k, one, B.ptr<dt_float32>(),
(rocblas_int)(B.layout.stride[1]),
(rocblas_int)(B.layout.stride[0]), A.ptr<dt_float32>(),
(rocblas_int)(A.layout.stride[1]),
(rocblas_int)(A.layout.stride[0]), zero,
C.ptr<dt_float32>(), (rocblas_int)(C.layout.stride[1]),
(rocblas_int)(C.layout.stride[0]), (rocblas_int)(batch)));
};
#if !MEGDNN_DISABLE_FLOAT16
auto io16_c32 = [&]() {
auto zero = handle->zero_device();
auto one = handle->one_device();
int32_t solution_index = 0;
uint32_t flags = 1;
size_t ws_size = 0;
rocblas_check(rocblas_gemm_strided_batched_ex(
rocblas_handle_,
param().transposeB ? rocblas_operation_transpose
: rocblas_operation_none,
param().transposeA ? rocblas_operation_transpose
: rocblas_operation_none,
n, m, k, one, B.raw_ptr, rocblas_datatype_i8_r,
B.layout.stride[1], B.layout.stride[0], A.raw_ptr,
rocblas_datatype_i8_r, A.layout.stride[1],
A.layout.stride[0], zero, C.raw_ptr, rocblas_datatype_i32_r,
C.layout.stride[1], C.layout.stride[0], C.raw_ptr,
rocblas_datatype_i32_r, C.layout.stride[1],
C.layout.stride[0], batch, rocblas_datatype_i32_r,
rocblas_gemm_algo_standard, solution_index, flags, &ws_size,
nullptr));
};
auto io16_c16 = [&]() {
auto zero_half = handle->zero_device_h();
auto one_half = handle->one_device_h();
rocblas_check(rocblas_hgemm_strided_batched(
rocblas_handle_,
param().transposeB ? rocblas_operation_transpose
: rocblas_operation_none,
param().transposeA ? rocblas_operation_transpose
: rocblas_operation_none,
n, m, k, reinterpret_cast<const rocblas_half*>(one_half),
static_cast<const rocblas_half*>(B.raw_ptr),
B.layout.stride[1], B.layout.stride[0],
static_cast<const rocblas_half*>(A.raw_ptr),
A.layout.stride[1], A.layout.stride[0],
reinterpret_cast<const rocblas_half*>(zero_half),
static_cast<rocblas_half*>(C.raw_ptr),
C.layout.stride[1], C.layout.stride[0], batch));
};
#endif
if (dtype == dtype::Float32()) {
io32_c32();
}
#if !MEGDNN_DISABLE_FLOAT16
else {
if (param().compute_mode == Param::ComputeMode::FLOAT32) {
io16_c32();
} else {
io16_c16();
}
}
#endif
}
}
} // namespace rocm
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/batched_matrix_mul/opr_impl.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "megdnn/oprs.h"
namespace megdnn {
namespace rocm {
class BatchedMatrixMulForwardImpl : public BatchedMatrixMulForward {
public:
using BatchedMatrixMulForward::BatchedMatrixMulForward;
BatchedMatrixMulForwardImpl(Handle* handle)
: BatchedMatrixMul(handle),
m_opr(handle->create_operator<MatrixMul>()) {}
void exec(_megdnn_tensor_in A, _megdnn_tensor_in B, _megdnn_tensor_out C,
_megdnn_workspace workspace) override;
size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&,
const TensorLayout&) override {
return 0;
}
bool is_thread_safe() const override { return true; }
private:
std::unique_ptr<MatrixMul> m_opr;
};
} // namespace rocm
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file src/rocm/checksum/kern.cpp.hip
*
* This file is part of MegDNN, a deep neural network run-time library
* developed by Megvii.
*
* \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
*/
#include "hcc_detail/hcc_defs_prologue.h"
#include "hip_header.h"
#include "./kern.h.hip"
#include "src/rocm/reduce_helper.h.hip"
namespace megdnn {
namespace rocm {
namespace checksum {
namespace {
struct ChecksumOp {
typedef uint32_t wtype;
const uint32_t* src;
uint32_t* dst;
static const uint32_t INIT = 0;
__host__ __device__ void write(uint32_t idx, uint32_t val) {
dst[idx] = val;
}
__host__ __device__ static uint32_t apply(uint32_t a, uint32_t b) {
return a + b;
}
};
struct NonFourAlignedChecksumOp : ChecksumOp {
__host__ __device__ uint32_t read(uint32_t idx) {
uint8_t* data = (uint8_t*)(src + idx);
return (data[0] | ((uint32_t)data[1] << 8) | ((uint32_t)data[2] << 16) |
((uint32_t)data[3] << 24)) *
(idx + 1);
}
};
struct FourAlignedChecksumOp : ChecksumOp {
__host__ __device__ uint32_t read(uint32_t idx) {
return src[idx] * (idx + 1);
}
};
} // anonymous namespace
void calc(uint32_t* dest, const uint32_t* buf, uint32_t* workspace,
size_t nr_elem, hipStream_t stream) {
if (!nr_elem)
return;
if (reinterpret_cast<uint64_t>(buf) & 0b11) {
NonFourAlignedChecksumOp op;
op.src = buf;
op.dst = dest;
run_reduce<NonFourAlignedChecksumOp, false>(workspace, 1, nr_elem, 1,
stream, op);
} else {
FourAlignedChecksumOp op;
op.src = buf;
op.dst = dest;
run_reduce<FourAlignedChecksumOp, false>(workspace, 1, nr_elem, 1,
stream, op);
}
}
size_t get_workspace_in_bytes(size_t nr_elem) {
return get_reduce_workspace_in_bytes<ChecksumOp>(1, nr_elem, 1);
}
} // namespace checksum
} // namespace rocm
} // namespace megdnn
// vim: ft=cpp syntax=cpp.doxygen
/**
* \file src/rocm/checksum/kern.h.hip
*
* This file is part of MegDNN, a deep neural network run-time library
* developed by Megvii.
*
* \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
*/
#pragma once
#include "hip_header.h"
namespace megdnn {
namespace rocm {
namespace checksum {
void calc(uint32_t* dest, const uint32_t* buf, uint32_t* workspace,
size_t nr_elem, hipStream_t stream);
size_t get_workspace_in_bytes(size_t nr_elem);
} // namespace checksum
} // namespace rocm
} // namespace megdnn
// vim: ft=cpp syntax=cpp.doxygen
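For clarity, a host-side reference (plain C++, not the MegDNN API) of the checksum the kernels above compute: every 32-bit word is multiplied by its 1-based index and summed with wrap-around uint32_t arithmetic; unaligned buffers are read byte-wise in the kernel, and trailing bytes that do not fill a word are reported separately as last_val by the operator below.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Weighted sum over whole 32-bit words; mirrors the (idx + 1) weighting of
// ChecksumOp::read and the addition in ChecksumOp::apply.
static uint32_t checksum_words(const uint8_t* data, size_t size_bytes) {
    uint32_t sum = 0;
    for (size_t i = 0; i < size_bytes / sizeof(uint32_t); ++i) {
        uint32_t word;
        std::memcpy(&word, data + i * sizeof(uint32_t), sizeof(word));  // alignment-safe read
        sum += word * static_cast<uint32_t>(i + 1);
    }
    return sum;
}

int main() {
    std::vector<uint8_t> buf(16, 0x01);   // four words of 0x01010101
    std::printf("checksum = %u\n", checksum_words(buf.data(), buf.size()));
}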
/**
* \file dnn/src/rocm/checksum/opr_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "hcc_detail/hcc_defs_prologue.h"
#include "./opr_impl.h"
#include "src/rocm/checksum/kern.h.hip"
#include "src/common/utils.h"
#include "src/rocm/reduce_helper.h.hip"
#include <algorithm>
using namespace megdnn;
using namespace rocm;
namespace {
WorkspaceBundle get_wbundle(const TensorLayout& data) {
size_t size_all = data.shape[0], size_ints = size_all / sizeof(uint32_t);
size_t part1 = checksum::get_workspace_in_bytes(size_ints);
size_t part2 = sizeof(ChecksumForward::Result::checksum);
return {nullptr, {part1, part2}};
}
} // anonymous namespace
size_t ChecksumForwardImpl::get_workspace_in_bytes(const TensorLayout& data) {
auto wbundle = get_wbundle(data);
return wbundle.total_size_in_bytes();
}
ChecksumForward::Result ChecksumForwardImpl::exec(_megdnn_tensor_in data,
_megdnn_workspace workspace) {
auto wbundle = get_wbundle(data.layout);
wbundle.set(workspace.raw_ptr);
Result result;
memset(&result, 0, sizeof(result));
check_exec(data.layout, workspace.size);
auto stream = hip_stream(handle());
auto ptr = static_cast<uint8_t*>(data.raw_ptr);
size_t size_all = data.layout.shape[0],
size_ints = size_all / sizeof(uint32_t);
auto last_val_size = std::min<size_t>(size_all, 4);
hip_check(hipMemcpyAsync(&result.last_val, ptr + size_all - last_val_size,
last_val_size, hipMemcpyDeviceToHost, stream));
if (size_ints) {
checksum::calc(static_cast<uint32_t*>(wbundle.get(1)),
static_cast<uint32_t*>(data.raw_ptr),
static_cast<uint32_t*>(wbundle.get(0)), size_ints,
stream);
hip_check(hipMemcpyAsync(&result.checksum, wbundle.get(1),
sizeof(result.checksum), hipMemcpyDeviceToHost,
stream));
}
hip_check(hipStreamSynchronize(stream));
return result;
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/checksum/opr_impl.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "megdnn/oprs.h"
#include "src/rocm/utils.h"
namespace megdnn {
namespace rocm {
class ChecksumForwardImpl final : public ChecksumForward {
public:
using ChecksumForward::ChecksumForward;
size_t get_workspace_in_bytes(const TensorLayout&) override;
bool is_thread_safe() const override { return true; }
Result exec(_megdnn_tensor_in data, _megdnn_workspace workspace) override;
};
} // namespace rocm
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/backward_data/algo.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "hcc_detail/hcc_defs_prologue.h"
#include "./algo.h"
#include "src/rocm/utils.h"
using namespace megdnn;
using namespace rocm;
ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() {
all_algos.push_back(&miopen);
all_algos.push_back(&matmul);
all_algos.push_back(&chanwise);
non_miopen_algos.push_back(&matmul);
non_miopen_algos.push_back(&chanwise);
miopen_algos.push_back(&miopen);
}
ConvolutionBackwardDataImpl::AlgoPack ConvolutionBackwardDataImpl::sm_algo_pack;
ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs(
ConvolutionBackwardDataImpl* o, const TensorLayout& filter,
const TensorLayout& diff, const TensorLayout& grad)
: SizeArgs(o, o->check_layout_fwd(grad, filter, diff), diff, grad) {}
ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs(
ConvolutionBackwardDataImpl* o, const CanonizedFilterMeta& filter,
const TensorLayout& diff, const TensorLayout& grad)
: handle{concrete_handle(o->handle())},
filter_meta{filter},
diff_layout{&diff},
grad_layout{&grad},
opr{o} {}
ConvolutionBackwardDataImpl::AlgoBase::ExecArgs::ExecArgs(
ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter,
_megdnn_tensor_in diff, _megdnn_tensor_out grad,
_megdnn_workspace workspace)
: SizeArgs(opr, filter.layout, diff.layout, grad.layout),
filter_tensor{&filter},
diff_tensor{&diff},
grad_tensor{&grad},
workspace{workspace} {}
std::string ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::to_string() const {
auto&& fm = filter_meta;
MEGDNN_MARK_USED_VAR(fm);
return megdnn_mangle(ssprintf(
"filter=%u{%u,%u,%u,%u}, diff=%s, grad=%s, "
"pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s",
fm.group, fm.ocpg, fm.icpg, fm.spatial[0], fm.spatial[1],
diff_layout->to_string().c_str(), grad_layout->to_string().c_str(),
fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1],
fm.dilation[0], fm.dilation[1], !fm.should_flip,
diff_layout->dtype.name(), grad_layout->dtype.name()));
}
convolution::MIOpenCacheKey
ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::to_miopen_algo_cache_key()
const {
convolution::MIOpenCacheKey res;
res.miopen_handle = reinterpret_cast<intptr_t>(handle->miopen_handle());
res.batch = grad_layout->operator[](0);
res.IC = grad_layout->operator[](1);
res.IH = grad_layout->operator[](2);
res.IW = grad_layout->operator[](3);
res.OH = diff_layout->operator[](2);
res.OW = diff_layout->operator[](3);
res.FH = filter_meta.spatial[0];
res.FW = filter_meta.spatial[1];
res.SH = filter_meta.stride[0];
res.SW = filter_meta.stride[1];
res.PH = filter_meta.padding[0];
res.PW = filter_meta.padding[1];
res.DH = filter_meta.dilation[0];
res.DW = filter_meta.dilation[1];
res.group = filter_meta.group;
res.ocpg = filter_meta.ocpg;
res.icpg = filter_meta.icpg;
res.dtype_enum = static_cast<uint32_t>(diff_layout->dtype.enumv());
res.exhaustive_search =
static_cast<int32_t>(handle->enable_miopen_algo_search());
res.OC = res.group * res.ocpg;
return res;
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/backward_data/algo.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "src/rocm/convolution/helper.h"
namespace megdnn {
namespace rocm {
/*!
* \brief base class for convolution algos
*
*/
class ConvolutionBackwardDataImpl::AlgoBase : public Algorithm {
protected:
~AlgoBase() = default;
public:
struct SizeArgs {
HandleImpl* handle;
CanonizedFilterMeta filter_meta;
const TensorLayout *diff_layout, *grad_layout;
ConvolutionBackwardDataImpl* opr;
std::string to_string() const;
convolution::MIOpenCacheKey to_miopen_algo_cache_key() const;
void init_desc(convolution::MIOpenBwdDataDescs& desc) const {
desc.set(filter_meta, *diff_layout, *grad_layout, opr->param());
}
SizeArgs(ConvolutionBackwardDataImpl* opr, const TensorLayout& filter,
const TensorLayout& diff, const TensorLayout& grad);
SizeArgs(ConvolutionBackwardDataImpl* opr,
const CanonizedFilterMeta& filter, const TensorLayout& diff,
const TensorLayout& grad);
convolution::ForwardSizeArgs as_fwd_args() const {
return {handle, grad_layout, filter_meta, diff_layout};
}
};
struct ExecArgs : public SizeArgs {
const TensorND *filter_tensor, *diff_tensor, *grad_tensor;
Workspace workspace;
ExecArgs(ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter,
_megdnn_tensor_in diff, _megdnn_tensor_out grad,
_megdnn_workspace workspace);
};
virtual bool is_available(const SizeArgs& args) const = 0;
virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
virtual void exec(const ExecArgs& args) const = 0;
bool is_available_wk(const SizeArgs& args, size_t limit) {
return is_available(args) && get_workspace_in_bytes(args) <= limit;
}
bool is_available_reproducible(
const SizeArgs& args, bool reproducible = true,
size_t limit = std::numeric_limits<size_t>::max()) {
return (!reproducible || is_reproducible()) &&
is_available_wk(args, limit);
}
AlgoBase& check_workspace(const SizeArgs& args,
const Workspace& workspace) {
auto req = get_workspace_in_bytes(args);
megdnn_assert(req <= workspace.size,
"conv bwd data algo %s: "
"required workspace %zu bytes, got %zu",
name(), req, workspace.size);
return *this;
}
virtual bool is_miopen() const { return false; }
};
class ConvolutionBackwardDataImpl::AlgoMIOpen final : public AlgoBase {
bool m_is_reproducible;
const char* m_name;
miopenConvBwdDataAlgorithm_t find_best_algo(const ExecArgs& args);
public:
AlgoMIOpen() = delete;
AlgoMIOpen(bool is_reproducible) : m_is_reproducible(is_reproducible) {}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
bool is_reproducible() const override { return m_is_reproducible; }
const char* name() const override {
return "MIOpenConvolutionBackwardData";
}
bool is_miopen() const override { return true; }
static convolution::MIOpenCache<SizeArgs, miopenConvBwdDataAlgorithm_t>
sm_miopen_algo_cache;
static convolution::MIOpenCache<SizeArgs, size_t> sm_miopen_ws_cache;
};
class ConvolutionBackwardDataImpl::AlgoMatmul final : public AlgoBase {
template <typename T>
static void exec_internal(const ExecArgs& args);
public:
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
const char* name() const override { return "MATMUL"; }
bool is_reproducible() const override { return true; }
};
class ConvolutionBackwardDataImpl::AlgoChanwise final : public AlgoBase {
public:
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
const char* name() const override { return "CHANNEL_WISE"; }
bool is_reproducible() const override { return true; }
};
class ConvolutionBackwardDataImpl::AlgoPack {
// defined in miopen.cpp
void fill_miopen_algos();
AlgoPack(const AlgoPack&) = delete;
AlgoPack& operator=(const AlgoPack&) = delete;
public:
AlgoPack();
AlgoMIOpen miopen{true};
AlgoMatmul matmul;
AlgoChanwise chanwise;
std::vector<AlgoBase*>
//! all algorithms
all_algos, miopen_algos, non_miopen_algos;
};
} // namespace rocm
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/backward_data/chanwise.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "./algo.h"
#include "src/rocm/utils.h"
#include "src/rocm/convolution/chanwise/kern.h.hip"
using namespace megdnn;
using namespace rocm;
using namespace convolution;
bool ConvolutionBackwardDataImpl::AlgoChanwise::is_available(
const SizeArgs& args) const {
auto&& fm = args.filter_meta;
return args.filter_meta.format == Param::Format::NCHW &&
args.diff_layout->dtype.category() == DTypeCategory::FLOAT &&
args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 &&
fm.spatial_ndim == 2 && fm.icpg == 1 && fm.dilation[0] == 1 &&
fm.dilation[1] == 1 && !fm.should_flip;
}
size_t ConvolutionBackwardDataImpl::AlgoChanwise::get_workspace_in_bytes(
const SizeArgs&) const {
return 0;
}
void ConvolutionBackwardDataImpl::AlgoChanwise::exec(
const ExecArgs& args) const {
auto kparam = chanwise::Param::from_fwd_args(args.as_fwd_args());
auto stream = hip_stream(args.handle);
switch (args.diff_layout->dtype.enumv()) {
#define cb(_dt) \
case DTypeTrait<_dt>::enumv: { \
using ctype = DTypeTrait<_dt>::ctype; \
return chanwise::run_bwd_data(args.grad_tensor->ptr<ctype>(), \
args.diff_tensor->ptr<ctype>(), \
args.filter_tensor->ptr<ctype>(), \
kparam, stream); \
}
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
default:
break;
}
megdnn_assert_internal(0);
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/backward_data/matmul.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "./algo.h"
#include "src/rocm/utils.h"
#include "src/rocm/convolution/helper.h"
#include "src/rocm/convolution/im2col.h.hip"
using namespace megdnn;
using namespace rocm;
bool ConvolutionBackwardDataImpl::AlgoMatmul::is_available(
const SizeArgs& args) const {
auto&& fm = args.filter_meta;
return args.filter_meta.format == Param::Format::NCHW &&
args.diff_layout->dtype.category() == DTypeCategory::FLOAT &&
args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 &&
fm.group == 1 && fm.spatial_ndim == 2;
}
size_t ConvolutionBackwardDataImpl::AlgoMatmul::get_workspace_in_bytes(
const SizeArgs& args) const {
return matmul_get_workspace_bundle(args.as_fwd_args())
.total_size_in_bytes();
}
void ConvolutionBackwardDataImpl::AlgoMatmul::exec(const ExecArgs& args) const {
#define cb(DType) \
if (args.diff_layout->dtype == DType()) { \
using ctype = typename DTypeTrait<DType>::ctype; \
exec_internal<ctype>(args); \
return; \
}
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
megdnn_assert_internal(0);
}
template <typename T>
void ConvolutionBackwardDataImpl::AlgoMatmul::exec_internal(
const ExecArgs& args) {
auto&& fm = args.filter_meta;
size_t N = args.grad_layout->shape[0], IC = fm.icpg,
IH = args.grad_layout->shape[2], IW = args.grad_layout->shape[3],
OC = fm.ocpg, OH = args.diff_layout->shape[2],
OW = args.diff_layout->shape[3], FH = fm.spatial[0],
FW = fm.spatial[1], PH = fm.padding[0], PW = fm.padding[1],
SH = fm.stride[0], SW = fm.stride[1], DH = fm.dilation[0],
DW = fm.dilation[1];
auto stream = hip_stream(args.handle);
auto wbundle = matmul_get_workspace_bundle(args.as_fwd_args());
wbundle.set(args.workspace.raw_ptr);
T* diff_t = static_cast<T*>(wbundle.get(0));
T* col = static_cast<T*>(wbundle.get(1));
{
// transpose diff
TensorLayout froml({N, OC * OH * OW}, typename DTypeTrait<T>::dtype()),
tol(froml);
froml.stride[0] = args.diff_layout->stride[0];
tol.stride[0] = 1;
tol.stride[1] = N;
TensorND from(args.diff_tensor->ptr<T>(), froml), to(diff_t, tol);
args.handle->relayout_opr()->exec(from, to);
}
{
// take gemm grad
TensorLayout Al({OC, IC * FH * FW}, typename DTypeTrait<T>::dtype()),
Bl({IC * FH * FW, OH * OW * N},
typename DTypeTrait<T>::dtype()),
Cl({OC, OH * OW * N}, typename DTypeTrait<T>::dtype());
TensorND A(args.filter_tensor->ptr<T>(), Al), B(col, Bl), C(diff_t, Cl);
if (fm.should_flip) {
convolution::flip_filter(args.as_fwd_args(),
wbundle.get_workspace(2), A.raw_ptr);
}
args.handle->matmul_aT_opr()->exec(A, C, B, Workspace());
}
{
convolution::col2im<T>(col, args.grad_tensor->ptr<T>(), N,
args.grad_layout->stride[0], IC, IH, IW, FH, FW,
OH, OW, PH, PW, SH, SW, DH, DW, stream);
}
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/backward_data/miopen.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "hcc_detail/hcc_defs_prologue.h"
#include "./algo.h"
#include "src/rocm/utils.h"
#include "src/rocm/miopen_wrapper.h"
#include "src/rocm/convolution/helper.h"
using namespace megdnn;
using namespace rocm;
using namespace convolution;
MIOpenCache<ConvolutionBackwardDataImpl::AlgoBase::SizeArgs,
miopenConvBwdDataAlgorithm_t>
ConvolutionBackwardDataImpl::AlgoMIOpen::sm_miopen_algo_cache;
MIOpenCache<ConvolutionBackwardDataImpl::AlgoBase::SizeArgs, size_t>
ConvolutionBackwardDataImpl::AlgoMIOpen::sm_miopen_ws_cache;
bool ConvolutionBackwardDataImpl::AlgoMIOpen::is_available(
const SizeArgs& args) const {
MIOpenBwdDataDescs D;
if (!is_miopen_supported(args.as_fwd_args()))
return false;
auto got = sm_miopen_ws_cache.get(args);
if (got.first)
return true;
args.init_desc(D);
size_t workspace_size;
auto status = miopenConvolutionBackwardDataGetWorkSpaceSize(
args.handle->miopen_handle(), D.diff_desc.desc, D.filter_desc.desc,
D.conv_desc.desc, D.grad_desc.desc, &workspace_size);
if (status == miopenStatusSuccess) {
sm_miopen_ws_cache.set(args, workspace_size);
return true;
}
return false;
}
size_t ConvolutionBackwardDataImpl::AlgoMIOpen::get_workspace_in_bytes(
const SizeArgs& args) const {
auto got = sm_miopen_ws_cache.get(args);
if (got.first)
return got.second;
MIOpenBwdDataDescs D;
args.init_desc(D);
size_t workspace_size;
auto status = miopenConvolutionBackwardDataGetWorkSpaceSize(
args.handle->miopen_handle(), D.diff_desc.desc, D.filter_desc.desc,
D.conv_desc.desc, D.grad_desc.desc, &workspace_size);
megdnn_assert(status == miopenStatusSuccess,
"conv bwd_data get workspace failed: %s; info: %s",
miopenGetErrorString(status), args.to_string().c_str());
sm_miopen_ws_cache.set(args, workspace_size);
return workspace_size;
}
miopenConvBwdDataAlgorithm_t
ConvolutionBackwardDataImpl::AlgoMIOpen::find_best_algo(const ExecArgs& args) {
auto find_algo = sm_miopen_algo_cache.get(args);
if (find_algo.first)
return find_algo.second;
bool exhaustive_search = args.handle->enable_miopen_algo_search();
MIOpenBwdDataDescs D;
args.init_desc(D);
const int req_algo_count = 1;
int ret_algo_count;
miopenConvAlgoPerf_t algo_perf;
miopen_check(miopenFindConvolutionBackwardDataAlgorithm(
args.handle->miopen_handle(), D.diff_desc.desc,
args.diff_tensor->raw_ptr, D.filter_desc.desc,
args.filter_tensor->raw_ptr, D.conv_desc.desc, D.grad_desc.desc,
args.grad_tensor->raw_ptr, req_algo_count, &ret_algo_count,
&algo_perf, args.workspace.raw_ptr, args.workspace.size,
exhaustive_search));
sm_miopen_algo_cache.set(args, algo_perf.bwd_data_algo);
return algo_perf.bwd_data_algo;
}
void ConvolutionBackwardDataImpl::AlgoMIOpen::exec(const ExecArgs& args) const {
MIOpenBwdDataDescs D;
args.init_desc(D);
auto algo = const_cast<ConvolutionBackwardDataImpl::AlgoMIOpen*>(this)
->find_best_algo(args);
float alpha = 1.0f, beta = 0.0f;
auto status = miopenConvolutionBackwardData(
args.handle->miopen_handle(), &alpha, D.diff_desc.desc,
args.diff_tensor->raw_ptr, D.filter_desc.desc,
args.filter_tensor->raw_ptr, D.conv_desc.desc, algo, &beta,
D.grad_desc.desc, args.grad_tensor->raw_ptr, args.workspace.raw_ptr,
args.workspace.size);
megdnn_assert(status == miopenStatusSuccess,
"conv bwd_data failed: %s; info: %s",
miopenGetErrorString(status), args.to_string().c_str());
}
void ConvolutionBackwardDataImpl::AlgoPack::fill_miopen_algos() {}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/backward_filter/algo.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "hcc_detail/hcc_defs_prologue.h"
#include "./algo.h"
#include "src/rocm/utils.h"
using namespace megdnn;
using namespace rocm;
ConvolutionBackwardFilterImpl::AlgoPack::AlgoPack() {
all_algos.push_back(&miopen);
all_algos.push_back(&matmul);
all_algos.push_back(&chanwise);
non_miopen_algos.push_back(&matmul);
non_miopen_algos.push_back(&chanwise);
non_miopen_algos.push_back(all_algos.back());
miopen_algos.push_back(&miopen);
}
ConvolutionBackwardFilterImpl::AlgoPack
ConvolutionBackwardFilterImpl::sm_algo_pack;
ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs(
ConvolutionBackwardFilterImpl* o, const TensorLayout& src,
const TensorLayout& diff, const TensorLayout& grad)
: SizeArgs(o, src, diff, o->check_layout_fwd(src, grad, diff)) {}
ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs(
ConvolutionBackwardFilterImpl* o, const TensorLayout& src,
const TensorLayout& diff, const CanonizedFilterMeta& grad)
: handle{concrete_handle(o->handle())},
src_layout{&src},
diff_layout{&diff},
grad_filter_meta{grad},
opr{o} {}
ConvolutionBackwardFilterImpl::AlgoBase::ExecArgs::ExecArgs(
ConvolutionBackwardFilterImpl* opr, _megdnn_tensor_in src,
_megdnn_tensor_in diff, _megdnn_tensor_out grad,
_megdnn_workspace workspace)
: SizeArgs(opr, src.layout, diff.layout, grad.layout),
src_tensor{&src},
diff_tensor{&diff},
grad_tensor{&grad},
workspace{workspace} {}
std::string ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::to_string()
const {
auto&& fm = grad_filter_meta;
MEGDNN_MARK_USED_VAR(fm);
return megdnn_mangle(ssprintf(
"src=%s diff=%s grad_filter=%u{%u,%u,%u,%u}, "
"pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s",
src_layout->to_string().c_str(), diff_layout->to_string().c_str(),
fm.group, fm.ocpg, fm.icpg, fm.spatial[0], fm.spatial[1],
fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1],
fm.dilation[0], fm.dilation[1], !fm.should_flip,
src_layout->dtype.name(), diff_layout->dtype.name()));
}
convolution::MIOpenCacheKey
ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::to_miopen_algo_cache_key()
const {
convolution::MIOpenCacheKey res;
res.miopen_handle = reinterpret_cast<intptr_t>(handle->miopen_handle());
res.batch = src_layout->operator[](0);
res.IC = src_layout->operator[](1);
res.IH = src_layout->operator[](2);
res.IW = src_layout->operator[](3);
res.OH = diff_layout->operator[](2);
res.OW = diff_layout->operator[](3);
res.FH = grad_filter_meta.spatial[0];
res.FW = grad_filter_meta.spatial[1];
res.SH = grad_filter_meta.stride[0];
res.SW = grad_filter_meta.stride[1];
res.PH = grad_filter_meta.padding[0];
res.PW = grad_filter_meta.padding[1];
res.DH = grad_filter_meta.dilation[0];
res.DW = grad_filter_meta.dilation[1];
res.group = grad_filter_meta.group;
res.ocpg = grad_filter_meta.ocpg;
res.icpg = grad_filter_meta.icpg;
res.dtype_enum = static_cast<uint32_t>(src_layout->dtype.enumv());
res.exhaustive_search =
static_cast<int32_t>(handle->enable_miopen_algo_search());
res.OC = res.group * res.ocpg;
return res;
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/backward_filter/algo.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include <unordered_map>
#include "src/rocm/convolution/helper.h"
namespace megdnn {
namespace rocm {
/*!
* \brief base class for convolution algos
*
*/
class ConvolutionBackwardFilterImpl::AlgoBase : public Algorithm {
protected:
~AlgoBase() = default;
public:
struct SizeArgs {
HandleImpl* handle;
const TensorLayout *src_layout, *diff_layout;
CanonizedFilterMeta grad_filter_meta;
ConvolutionBackwardFilterImpl* opr;
std::string to_string() const;
convolution::MIOpenCacheKey to_miopen_algo_cache_key() const;
void init_desc(convolution::MIOpenBwdFilterDescs& desc) const {
desc.set(*src_layout, *diff_layout, grad_filter_meta, opr->param());
}
SizeArgs(ConvolutionBackwardFilterImpl* opr, const TensorLayout& src,
const TensorLayout& diff, const TensorLayout& grad);
SizeArgs(ConvolutionBackwardFilterImpl* opr, const TensorLayout& src,
const TensorLayout& diff, const CanonizedFilterMeta& grad);
convolution::ForwardSizeArgs as_fwd_args() const {
return {handle, src_layout, grad_filter_meta, diff_layout};
}
};
struct ExecArgs : public SizeArgs {
const TensorND *src_tensor, *diff_tensor, *grad_tensor;
Workspace workspace;
ExecArgs(ConvolutionBackwardFilterImpl* opr, _megdnn_tensor_in src,
_megdnn_tensor_in diff, _megdnn_tensor_out grad,
_megdnn_workspace workspace);
};
virtual bool is_available(const SizeArgs& args) const = 0;
virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
virtual void exec(const ExecArgs& args) const = 0;
bool is_available_wk(const SizeArgs& args, size_t limit) {
return is_available(args) && get_workspace_in_bytes(args) <= limit;
}
bool is_available_reproducible(
const SizeArgs& args, bool reproducible = true,
size_t limit = std::numeric_limits<size_t>::max()) {
return (!reproducible || is_reproducible()) &&
is_available_wk(args, limit);
}
AlgoBase& check_workspace(const SizeArgs& args,
const Workspace& workspace) {
auto req = get_workspace_in_bytes(args);
megdnn_assert(req <= workspace.size,
"conv bwd filter algo %s: "
"required workspace %zu bytes, got %zu",
name(), req, workspace.size);
return *this;
}
virtual bool is_miopen() const { return false; }
};
class ConvolutionBackwardFilterImpl::AlgoMIOpen final : public AlgoBase {
bool m_is_reproducible;
const char* m_name;
miopenConvBwdWeightsAlgorithm_t find_best_algo(const ExecArgs& args);
public:
AlgoMIOpen() = delete;
AlgoMIOpen(bool is_reproducible) : m_is_reproducible(is_reproducible) {}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
bool is_reproducible() const override { return m_is_reproducible; }
const char* name() const override {
return "MIOpenConvolutionBackwardFilter";
}
bool is_miopen() const override { return true; }
static convolution::MIOpenCache<SizeArgs, miopenConvBwdWeightsAlgorithm_t>
sm_miopen_algo_cache;
static convolution::MIOpenCache<SizeArgs, size_t> sm_miopen_ws_cache;
};
class ConvolutionBackwardFilterImpl::AlgoMatmul final : public AlgoBase {
template <typename T>
static void exec_internal(const ExecArgs& args);
public:
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
const char* name() const override { return "MATMUL"; }
bool is_reproducible() const override { return true; }
};
class ConvolutionBackwardFilterImpl::AlgoChanwise final : public AlgoBase {
public:
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
const char* name() const override { return "CHANNEL_WISE"; }
bool is_reproducible() const override { return true; }
};
class ConvolutionBackwardFilterImpl::AlgoPack {
void fill_miopen_algos();
AlgoPack(const AlgoPack&) = delete;
AlgoPack& operator=(const AlgoPack&) = delete;
public:
AlgoPack();
AlgoMIOpen miopen{true};
AlgoMatmul matmul;
AlgoChanwise chanwise;
std::vector<AlgoBase*>
//! all algorithms
all_algos, miopen_algos, non_miopen_algos;
};
} // namespace rocm
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/backward_filter/chanwise.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "./algo.h"
#include "src/rocm/utils.h"
#include "src/rocm/convolution/chanwise/kern.h.hip"
using namespace megdnn;
using namespace rocm;
using namespace convolution;
bool ConvolutionBackwardFilterImpl::AlgoChanwise::is_available(
const SizeArgs& args) const {
auto&& fm = args.grad_filter_meta;
return fm.format == Param::Format::NCHW &&
args.diff_layout->dtype.category() == DTypeCategory::FLOAT &&
args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 &&
fm.spatial_ndim == 2 && fm.icpg == 1 && fm.dilation[0] == 1 &&
fm.dilation[1] == 1 && !fm.should_flip;
}
size_t ConvolutionBackwardFilterImpl::AlgoChanwise::get_workspace_in_bytes(
const SizeArgs&) const {
return 0;
}
void ConvolutionBackwardFilterImpl::AlgoChanwise::exec(
const ExecArgs& args) const {
auto kparam = chanwise::Param::from_fwd_args(args.as_fwd_args());
auto stream = hip_stream(args.handle);
switch (args.diff_layout->dtype.enumv()) {
#define cb(_dt) \
case DTypeTrait<_dt>::enumv: { \
using ctype = DTypeTrait<_dt>::ctype; \
return chanwise::run_bwd_filter( \
args.grad_tensor->ptr<ctype>(), args.src_tensor->ptr<ctype>(), \
args.diff_tensor->ptr<ctype>(), kparam, stream); \
}
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
default:
break;
}
megdnn_assert_internal(0);
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/backward_filter/matmul.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "./algo.h"
#include "src/rocm/utils.h"
#include "src/rocm/convolution/helper.h"
#include "src/rocm/convolution/im2col.h.hip"
using namespace megdnn;
using namespace rocm;
bool ConvolutionBackwardFilterImpl::AlgoMatmul::is_available(
const SizeArgs& args) const {
auto&& fm = args.grad_filter_meta;
return fm.format == Param::Format::NCHW &&
args.diff_layout->dtype.category() == DTypeCategory::FLOAT &&
args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 &&
fm.group == 1 && fm.spatial_ndim == 2;
}
size_t ConvolutionBackwardFilterImpl::AlgoMatmul::get_workspace_in_bytes(
const SizeArgs& args) const {
return matmul_get_workspace_bundle(args.as_fwd_args())
.total_size_in_bytes();
}
void ConvolutionBackwardFilterImpl::AlgoMatmul::exec(
const ExecArgs& args) const {
#define cb(DType) \
if (args.diff_layout->dtype == DType()) { \
using ctype = typename DTypeTrait<DType>::ctype; \
exec_internal<ctype>(args); \
return; \
}
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
megdnn_assert_internal(0);
}
template <typename T>
void ConvolutionBackwardFilterImpl::AlgoMatmul::exec_internal(
const ExecArgs& args) {
auto&& fm = args.grad_filter_meta;
size_t N = args.src_layout->shape[0], IC = fm.icpg,
IH = args.src_layout->shape[2], IW = args.src_layout->shape[3],
OC = fm.ocpg, OH = args.diff_layout->shape[2],
OW = args.diff_layout->shape[3], FH = fm.spatial[0],
FW = fm.spatial[1], PH = fm.padding[0], PW = fm.padding[1],
SH = fm.stride[0], SW = fm.stride[1], DH = fm.dilation[0],
DW = fm.dilation[1];
auto stream = hip_stream(args.handle);
auto wbundle = matmul_get_workspace_bundle(args.as_fwd_args());
wbundle.set(args.workspace.raw_ptr);
T* diff_t = static_cast<T*>(wbundle.get(0));
T* col = static_cast<T*>(wbundle.get(1));
{
// transpose diff
TensorLayout froml({N, OC * OH * OW}, typename DTypeTrait<T>::dtype()),
tol(froml);
froml.stride[0] = args.diff_layout->stride[0];
tol.stride[0] = 1;
tol.stride[1] = N;
TensorND from(args.diff_tensor->ptr<T>(), froml), to(diff_t, tol);
args.handle->relayout_opr()->exec(from, to);
}
{
convolution::im2col<T>(args.src_tensor->ptr<T>(), col, N,
args.src_tensor->layout.stride[0], IC, IH, IW,
FH, FW, OH, OW, PH, PW, SH, SW, DH, DW, stream);
}
{
// take gemm grad
TensorLayout Al({OC, IC * FH * FW}, typename DTypeTrait<T>::dtype()),
Bl({IC * FH * FW, OH * OW * N},
typename DTypeTrait<T>::dtype()),
Cl({OC, OH * OW * N}, typename DTypeTrait<T>::dtype());
TensorND A(args.grad_tensor->ptr<T>(), Al), B(col, Bl), C(diff_t, Cl);
if (fm.should_flip) {
A.raw_ptr = wbundle.get(2);
}
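        // grad filter = diff * col^T; with should_flip the GEMM first writes
        // into the scratch slot, and the flipped result is copied back into
        // the grad tensor below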
args.handle->matmul_bT_opr()->exec(C, B, A, Workspace());
if (fm.should_flip) {
convolution::flip_filter(
args.as_fwd_args(),
{static_cast<dt_byte*>(args.grad_tensor->raw_ptr),
wbundle.get_size(2)},
A.raw_ptr);
}
}
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/backward_filter/miopen.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "hcc_detail/hcc_defs_prologue.h"
#include "./algo.h"
#include "src/rocm/utils.h"
#include "src/rocm/miopen_wrapper.h"
#include "src/rocm/convolution/helper.h"
using namespace megdnn;
using namespace rocm;
using namespace convolution;
MIOpenCache<ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs,
miopenConvBwdWeightsAlgorithm_t>
ConvolutionBackwardFilterImpl::AlgoMIOpen::sm_miopen_algo_cache;
MIOpenCache<ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs, size_t>
ConvolutionBackwardFilterImpl::AlgoMIOpen::sm_miopen_ws_cache;
bool ConvolutionBackwardFilterImpl::AlgoMIOpen::is_available(
const SizeArgs& args) const {
MIOpenBwdFilterDescs D;
if (!is_miopen_supported(args.as_fwd_args()))
return false;
auto got = sm_miopen_ws_cache.get(args);
if (got.first)
return true;
args.init_desc(D);
size_t workspace_size;
auto status = miopenConvolutionBackwardWeightsGetWorkSpaceSize(
args.handle->miopen_handle(), D.diff_desc.desc, D.src_desc.desc,
D.conv_desc.desc, D.grad_desc.desc, &workspace_size);
if (status == miopenStatusSuccess) {
sm_miopen_ws_cache.set(args, workspace_size);
return true;
}
return false;
}
size_t ConvolutionBackwardFilterImpl::AlgoMIOpen::get_workspace_in_bytes(
const SizeArgs& args) const {
auto got = sm_miopen_ws_cache.get(args);
if (got.first)
return got.second;
MIOpenBwdFilterDescs D;
args.init_desc(D);
size_t workspace_size;
auto status = miopenConvolutionBackwardWeightsGetWorkSpaceSize(
args.handle->miopen_handle(), D.diff_desc.desc, D.src_desc.desc,
D.conv_desc.desc, D.grad_desc.desc, &workspace_size);
megdnn_assert(status == miopenStatusSuccess,
"conv bwd_filter get workspace failed: %s; info: %s",
miopenGetErrorString(status), args.to_string().c_str());
sm_miopen_ws_cache.set(args, workspace_size);
return workspace_size;
}
miopenConvBwdWeightsAlgorithm_t
ConvolutionBackwardFilterImpl::AlgoMIOpen::find_best_algo(const ExecArgs& args) {
auto find_algo = sm_miopen_algo_cache.get(args);
if (find_algo.first)
return find_algo.second;
bool exhaustive_search = args.handle->enable_miopen_algo_search();
MIOpenBwdFilterDescs D;
args.init_desc(D);
const int req_algo_count = 1;
int ret_algo_count;
miopenConvAlgoPerf_t algo_perf;
miopen_check(miopenFindConvolutionBackwardWeightsAlgorithm(
args.handle->miopen_handle(), D.diff_desc.desc,
args.diff_tensor->raw_ptr, D.src_desc.desc,
args.src_tensor->raw_ptr, D.conv_desc.desc, D.grad_desc.desc,
args.grad_tensor->raw_ptr, req_algo_count, &ret_algo_count,
&algo_perf, args.workspace.raw_ptr, args.workspace.size,
exhaustive_search));
// algo_perf.bwd_weights_algo = miopenConvolutionBwdWeightsAlgoGEMM;
sm_miopen_algo_cache.set(args, algo_perf.bwd_weights_algo);
return algo_perf.bwd_weights_algo;
}
void ConvolutionBackwardFilterImpl::AlgoMIOpen::exec(
const ExecArgs& args) const {
MIOpenBwdFilterDescs D;
args.init_desc(D);
auto algo = const_cast<ConvolutionBackwardFilterImpl::AlgoMIOpen*>(this)
->find_best_algo(args);
float alpha = 1.0f, beta = 0.0f;
auto status = miopenConvolutionBackwardWeights(
args.handle->miopen_handle(), &alpha, D.diff_desc.desc,
args.diff_tensor->raw_ptr, D.src_desc.desc,
args.src_tensor->raw_ptr, D.conv_desc.desc, algo, &beta,
D.grad_desc.desc, args.grad_tensor->raw_ptr, args.workspace.raw_ptr,
args.workspace.size);
megdnn_assert(status == miopenStatusSuccess,
"conv bwd_filter failed: %s; info: %s",
miopenGetErrorString(status), args.to_string().c_str());
}
void ConvolutionBackwardFilterImpl::AlgoPack::fill_miopen_algos() {}
// vim: syntax=cpp.doxygen
/**
* \file src/rocm/convolution/chanwise/bwd_data.cpp.hip
*
* This file is part of MegDNN, a deep neural network run-time library
* developed by Megvii.
*
* \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
*/
#include "hip_header.h"
#include "./kern.h.hip"
#include "./kern_helper.h.hip"
using namespace megdnn;
using namespace rocm;
using namespace convolution;
using namespace chanwise;
namespace {
// grid idx is (inp_chl, worker_index)
// each y-slice of a block works on an (N, IH, IW) spatial image at given
// inp_chl
template <typename T, int CHL_MUL_SET, int FH_SET, int FW_SET, int SH_SET,
int SW_SET>
__global__ void kern_bwd_data(T* src_grad, const T* dst_grad, const T* flt_tot,
Param param) {
extern __shared__ uint8_t flt_storage[];
T* const flt = reinterpret_cast<T*>(flt_storage);
const uint32_t N = param.batch, IC = param.src_chl, ic = blockIdx.x,
IH = param.src_h, IW = param.src_w,
CHL_MUL = CHL_MUL_SET ? CHL_MUL_SET : param.chl_mul,
FH = FH_SET ? FH_SET : param.flt_h,
FW = FW_SET ? FW_SET : param.flt_w, FSIZE = FH * FW,
PH = param.pad_h, PW = param.pad_w,
SH = SH_SET ? SH_SET : param.stride_h,
SW = SW_SET ? SW_SET : param.stride_w, OH = param.out_h,
OW = param.out_w, TOT_OUT = N * IH * IW;
block_memcpy(flt, flt_tot + ic * FSIZE * CHL_MUL, FSIZE * CHL_MUL);
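    // stage this input channel's filter taps (CHL_MUL * FH * FW values) in
    // shared memory before the grid-stride loop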
dst_grad += ic * CHL_MUL * OH * OW;
src_grad += ic * IH * IW;
uint32_t out_idx_ = blockIdx.y * blockDim.x + threadIdx.x,
nr_out_per_launch = blockDim.x * gridDim.y;
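    // grid-stride loop: each thread computes the gradient for several
    // (n, ih, iw) input positions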
for (; out_idx_ < TOT_OUT; out_idx_ += nr_out_per_launch) {
uint32_t out_idx = out_idx_, n, ih, iw;
out_idx = div_mod(out_idx, IW, iw);
out_idx = div_mod(out_idx, IH, ih);
n = out_idx;
const T* dst_grad_base = dst_grad + n * (IC * CHL_MUL * OH * OW);
T sum(0);
// o >= max(0, floor_div((i+P-F+1), S))
uint32_t ohmin = max(int32_t(ih + PH - FH + SH), 0) / SH,
owmin = max(int32_t(iw + PW - FW + SW), 0) / SW,
ohmax = min((ih + PH) / SH, OH - 1),
owmax = min((iw + PW) / SW, OW - 1);
if (SH_SET == 1 && SW_SET == 1 && FH_SET && FW_SET) {
#pragma unroll
for (uint32_t doh = 0; doh < FH; ++doh) {
uint32_t oh = ohmin + doh;
if (oh <= ohmax) {
uint32_t fh = ih - oh * SH + PH;
#pragma unroll
for (uint32_t dow = 0; dow < FW; ++dow) {
uint32_t ow = owmin + dow;
if (ow <= owmax) {
uint32_t fw = iw - ow * SW + PW;
const T* pd = dst_grad_base + oh * OW + ow;
const T* pf = flt + fh * FW + fw;
#pragma unroll
for (uint32_t chl_mul = 0; chl_mul < CHL_MUL;
++chl_mul) {
sum += *pd * *pf;
pd += OH * OW;
pf += FSIZE;
}
}
}
}
}
} else {
for (uint32_t oh = ohmin; oh <= ohmax; ++oh) {
uint32_t fh = ih - oh * SH + PH;
for (uint32_t ow = owmin; ow <= owmax; ++ow) {
uint32_t fw = iw - ow * SW + PW;
const T* pd = dst_grad_base + oh * OW + ow;
const T* pf = flt + fh * FW + fw;
#pragma unroll
for (uint32_t chl_mul = 0; chl_mul < CHL_MUL; ++chl_mul) {
sum += *pd * *pf;
pd += OH * OW;
pf += FSIZE;
}
}
}
}
src_grad[(n * (IC * IH) + ih) * IW + iw] = sum;
}
}
template <typename T>
class KernDispatch {
public:
typedef void (*kern_ptr_t)(T*, const T*, const T*, Param);
static kern_ptr_t dispatch(int chl_mul, int fh, int fw, int sh, int sw) {
if (chl_mul == 1) {
if (fh == 3 && fw == 3)
return d1<1, 3, 3>(sh, sw);
if (fh == 4 && fw == 4)
return d1<1, 4, 4>(sh, sw);
}
return d1<0, 0, 0>(sh, sw);
}
private:
template <int chl_mul, int fh, int fw>
static kern_ptr_t d1(int sh, int sw) {
if (sh == 1 && sw == 1)
return kern_bwd_data<T, chl_mul, fh, fw, 1, 1>;
if (sh == 1 && sw == 2)
return kern_bwd_data<T, chl_mul, fh, fw, 1, 2>;
if (sh == 2 && sw == 1)
return kern_bwd_data<T, chl_mul, fh, fw, 2, 1>;
if (sh == 2 && sw == 2)
return kern_bwd_data<T, chl_mul, fh, fw, 2, 2>;
return kern_bwd_data<T, chl_mul, fh, fw, 0, 0>;
}
};
} // anonymous namespace
template <typename T>
void chanwise::run_bwd_data(T* src_grad, const T* dst_grad, const T* flt,
const Param& param, hipStream_t stream) {
typename KernDispatch<T>::kern_ptr_t kern =
KernDispatch<T>::dispatch(param.chl_mul, param.flt_h, param.flt_w,
param.stride_h, param.stride_w);
int nr_thread = 256, nr_out_dimx = param.src_h * param.src_w * param.batch;
dim3 nr_block(param.src_chl,
std::min(512, max(nr_out_dimx / (nr_thread * 4), 1)));
uint32_t shared = param.chl_mul * param.flt_h * param.flt_w * sizeof(T);
kern<<<nr_block, nr_thread, shared, stream>>>(src_grad, dst_grad, flt,
param);
after_kernel_launch();
}
namespace megdnn {
namespace rocm {
namespace convolution {
namespace chanwise {
#define INST(_dt) \
template void run_bwd_data( \
DTypeTrait<_dt>::ctype*, const DTypeTrait<_dt>::ctype*, \
const DTypeTrait<_dt>::ctype*, const Param&, hipStream_t);
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(INST)
#undef INST
} // namespace chanwise
} // namespace convolution
} // namespace rocm
} // namespace megdnn
// vim: syntax=cuda.doxygen
/**
* \file src/rocm/convolution/chanwise/bwd_filter.cpp.hip
*
* This file is part of MegDNN, a deep neural network run-time library
* developed by Megvii.
*
* \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
*/
#include "hip_header.h"
#include "./kern.h.hip"
#include "./kern_helper.h.hip"
const uint32_t WARP_SIZE = 32, BATCH_UNROLL = 4;
using namespace megdnn;
using namespace rocm;
using namespace convolution;
using namespace chanwise;
namespace {
/*!
* \brief compute grad w.r.t. filter
*
* block dim: out_id * kern_id
 * threads with the same out_id compute the grad for the corresponding kernel element
 * \tparam nr_thpf number of threads for one element in the filter; must be a
 * power of 2
*/
template <typename T, uint32_t nr_thpf>
__global__ void kern_bwd_filter(T* flt_grad, const T* src, const T* dst_grad,
Param param) {
const uint32_t N = param.batch, IC = param.src_chl, IH = param.src_h,
IW = param.src_w, CHL_MUL = param.chl_mul, FH = param.flt_h,
FW = param.flt_w, PH = param.pad_h, PW = param.pad_w,
SH = param.stride_h, SW = param.stride_w, OH = param.out_h,
OW = param.out_w, SRC_BATCH_STRIDE = IC * IH * IW,
DST_BATCH_STRIDE = IC * CHL_MUL * OH * OW,
BLKDIM_X = blockDim.x / nr_thpf,
THREADID_X = threadIdx.x / nr_thpf,
OUT_IDX = blockIdx.x * BLKDIM_X + THREADID_X;
uint32_t ic, chl_mul, fh, fw;
{
uint32_t i = OUT_IDX;
i = div_mod(i, FW, fw);
i = div_mod(i, FH, fh);
i = div_mod(i, CHL_MUL, chl_mul);
ic = i;
}
if (ic >= IC) {
return;
}
src += ic * IH * IW;
dst_grad += (ic * CHL_MUL + chl_mul) * OH * OW;
const uint32_t oh_lo = max(int32_t(PH - fh + SH - 1), 0) / SH,
oh_hi = min((IH - 1 + PH - fh) / SH + 1, OH),
ow_lo = max(int32_t(PW - fw + SW - 1), 0) / SW,
ow_hi = min((IW - 1 + PW - fw) / SW + 1, OW),
oblk_h = oh_hi - oh_lo, oblk_w = ow_hi - ow_lo,
oblk_tot = oblk_h * oblk_w *
((N + BATCH_UNROLL - 1) / BATCH_UNROLL),
tid = threadIdx.x % nr_thpf;
if (IH + PH < fh + 1 || oh_lo >= oh_hi || IW + PW < fw + 1 ||
ow_lo >= ow_hi) {
if (!tid)
flt_grad[OUT_IDX] = 0;
return;
}
T sum(0);
for (uint32_t oblk_idx = tid; oblk_idx < oblk_tot; oblk_idx += nr_thpf) {
uint32_t n, oh, ow;
n = div_mod(div_mod(oblk_idx, oblk_w, ow), oblk_h, oh) * BATCH_UNROLL;
oh += oh_lo;
ow += ow_lo;
uint32_t ih = oh * SH - PH + fh, iw = ow * SW - PW + fw,
soff = ih * IW + iw + n * SRC_BATCH_STRIDE,
doff = oh * OW + ow + n * DST_BATCH_STRIDE;
#pragma unroll
for (uint32_t i = 0; i < BATCH_UNROLL; ++i) {
if (!i || n + i < N) {
sum += src[soff] * dst_grad[doff];
}
soff += SRC_BATCH_STRIDE;
doff += DST_BATCH_STRIDE;
}
}
if (nr_thpf == 1) {
flt_grad[OUT_IDX] = sum;
} else {
// reduce all sums in a block
extern __shared__ uint8_t shared_storage[];
volatile T* thread_sum = reinterpret_cast<T*>(shared_storage);
thread_sum += THREADID_X * nr_thpf;
thread_sum[tid] = sum;
#pragma unroll
for (uint32_t i = nr_thpf / 2; i; i >>= 1) {
bool cond = nr_thpf >= i * 2 && tid < i;
if (i >= WARP_SIZE) {
__syncthreads();
}
if (cond) {
T v0 = thread_sum[tid], v1 = v0 + thread_sum[tid + i];
thread_sum[tid] = v1;
}
}
if (!tid)
flt_grad[OUT_IDX] = thread_sum[0];
}
}
} // anonymous namespace
template <typename T>
void convolution::chanwise::run_bwd_filter(T* filter_grad, const T* src,
const T* dst_grad,
const Param& param,
hipStream_t stream) {
void (*kern)(T*, const T*, const T*, Param) = NULL;
uint32_t nr_thread = 256,
nr_thpf = std::min(
nr_thread,
std::max<uint32_t>(1, param.out_h * param.out_w *
param.batch /
(BATCH_UNROLL * 16)));
    // round nr_thpf down to the nearest power of 2
do {
#define CK(_n) \
if (nr_thpf >= _n) { \
kern = kern_bwd_filter<T, _n>; \
nr_thpf = _n; \
break; \
}
CK(1 << 10);
CK(1 << 9);
CK(1 << 8);
CK(1 << 7);
CK(1 << 6);
CK(1 << 5);
CK(1 << 4);
CK(1 << 3);
CK(1 << 2);
CK(1 << 1);
CK(1 << 0);
#undef CK
} while (0);
megdnn_assert(kern);
nr_thread = 256;
uint32_t nr_flt_per_blk = nr_thread / nr_thpf;
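    // shrink the number of filter elements handled per block until the total
    // thread count is a multiple of WARP_SIZE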
while (nr_flt_per_blk * nr_thpf % WARP_SIZE)
--nr_flt_per_blk;
megdnn_assert(nr_flt_per_blk);
int nr_block =
DIVUP(param.flt_h * param.flt_w * param.src_chl * param.chl_mul,
nr_flt_per_blk);
nr_thread = nr_flt_per_blk * nr_thpf;
uint32_t shared = nr_thread * 2 * sizeof(T);
hipLaunchKernelGGL(kern, nr_block, nr_thread, shared, stream, filter_grad,
src, dst_grad, param);
after_kernel_launch();
}
namespace megdnn {
namespace rocm {
namespace convolution {
namespace chanwise {
#define DO_INST(_ct) \
template void run_bwd_filter(_ct*, const _ct*, const _ct*, const Param&, \
hipStream_t);
#define INST(_dt) DO_INST(DTypeTrait<_dt>::ctype)
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(INST)
#undef INST
#undef DO_INST
} // namespace chanwise
} // namespace convolution
} // namespace rocm
} // namespace megdnn
// vim: syntax=cuda.doxygen
/**
* \file src/rocm/convolution/chanwise/fwd.cpp.hip
*
* This file is part of MegDNN, a deep neural network run-time library
* developed by Megvii.
*
* \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
*/
#include "hip_header.h"
#include "./kern.h.hip"
#include "./kern_helper.h.hip"
using namespace megdnn;
using namespace rocm;
using namespace convolution;
using namespace chanwise;
namespace {
// grid idx is (inp_chl, worker_index)
// each y-slice of a block works on an (N, CHL_MUL, OH, OW) spatial image at
// given inp_chl
template <typename T, int CHL_MUL_SET, int FH_SET, int FW_SET>
__global__ void kern_fwd(T* dst, const T* src, const T* flt_tot, Param param) {
extern __shared__ uint8_t flt_storage[];
T* const flt = reinterpret_cast<T*>(flt_storage);
const uint32_t N = param.batch, IC = param.src_chl, ic = blockIdx.x,
IH = param.src_h, IW = param.src_w,
CHL_MUL = CHL_MUL_SET ? CHL_MUL_SET : param.chl_mul,
FH = FH_SET ? FH_SET : param.flt_h,
FW = FW_SET ? FW_SET : param.flt_w, FSIZE = FH * FW,
PH = param.pad_h, PW = param.pad_w, SH = param.stride_h,
SW = param.stride_w, OH = param.out_h, OW = param.out_w,
TOT_OUT = N * CHL_MUL * OH * OW;
block_memcpy(flt, flt_tot + ic * FSIZE * CHL_MUL, FSIZE * CHL_MUL);
uint32_t out_idx_ = blockIdx.y * blockDim.x + threadIdx.x,
nr_out_per_launch = blockDim.x * gridDim.y;
for (; out_idx_ < TOT_OUT; out_idx_ += nr_out_per_launch) {
uint32_t out_idx = out_idx_, n, chl_mul, oh, ow;
out_idx = div_mod(out_idx, OW, ow);
out_idx = div_mod(out_idx, OH, oh);
if (CHL_MUL_SET == 1) {
chl_mul = 0;
n = out_idx;
} else {
n = div_mod(out_idx, CHL_MUL, chl_mul);
}
int ih = int(oh * SH) - int(PH), iw = int(ow * SW) - int(PW);
const T* flt_base = flt + chl_mul * FSIZE;
const T* src_base = src + int(((n * IC + ic) * IH + ih) * IW + iw);
T sum(0);
if (FH_SET && FW_SET) {
#pragma unroll
for (uint32_t fh = 0; fh < FH; ++fh) {
if (static_cast<uint32_t>(fh + ih) < IH) {
#pragma unroll
for (uint32_t fw = 0; fw < FW; ++fw) {
if (static_cast<uint32_t>(fw + iw) < IW) {
sum += flt_base[fh * FW + fw] *
src_base[fh * IW + fw];
}
}
}
}
} else {
int fhmax = min(int(FH), int(IH - ih)),
fwmax = min(int(FW), int(IW - iw));
for (int fh = max(0, -ih); fh < fhmax; ++fh) {
for (int fw = max(0, -iw); fw < fwmax; ++fw) {
sum += flt_base[fh * FW + fw] * src_base[fh * IW + fw];
}
}
}
dst[(((n * IC + ic) * CHL_MUL + chl_mul) * OH + oh) * OW + ow] = sum;
}
}
} // anonymous namespace
template <typename T>
void chanwise::run_fwd(T* dst, const T* src, const T* flt, const Param& param,
hipStream_t stream) {
void (*kern)(T*, const T*, const T*, Param);
if (param.chl_mul == 1) {
if (param.flt_h == 3 && param.flt_w == 3) {
kern = kern_fwd<T, 1, 3, 3>;
} else if (param.flt_h == 4 && param.flt_w == 4) {
kern = kern_fwd<T, 1, 4, 4>;
} else {
kern = kern_fwd<T, 1, 0, 0>;
}
} else {
kern = kern_fwd<T, 0, 0, 0>;
}
int nr_thread = 256,
nr_out_dimx = param.out_h * param.out_w * param.batch * param.chl_mul;
dim3 nr_block(param.src_chl,
std::min(512, max(nr_out_dimx / (nr_thread * 4), 1)));
uint32_t shared = param.chl_mul * param.flt_h * param.flt_w * sizeof(T);
kern<<<nr_block, nr_thread, shared, stream>>>(dst, src, flt, param);
after_kernel_launch();
}
namespace megdnn {
namespace rocm {
namespace convolution {
namespace chanwise {
#define DO_INST(_ct) \
template void run_fwd(_ct*, const _ct*, const _ct*, const Param&, \
hipStream_t);
#define INST(_dt) DO_INST(DTypeTrait<_dt>::ctype)
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(INST)
#undef INST
#undef DO_INST
} // namespace chanwise
} // namespace convolution
} // namespace rocm
} // namespace megdnn
// vim: syntax=cuda.doxygen
/**
* \file src/rocm/convolution/chanwise/kern.h.hip
*
* This file is part of MegDNN, a deep neural network run-time library
* developed by Megvii.
*
* \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
*/
#pragma once
#include "src/rocm/utils.h.hip"
#include <stdint.h>
#include "hip_header.h"
#if MEGDNN_CC_HOST
#include "src/rocm/convolution/helper.h"
#endif
namespace megdnn {
namespace rocm {
namespace convolution {
namespace chanwise {
struct Param {
uint32_t batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w;
#if MEGDNN_CC_HOST
static Param from_fwd_args(const ForwardSizeArgs& args) {
#define U(v) static_cast<uint32_t>(v)
auto&& src = args.src_layout->shape;
auto&& dst = args.dst_layout->shape;
auto&& fm = args.filter_meta;
size_t c_pos, hw_pos;
if (fm.format == param::Convolution::Format::NCHW) {
c_pos = 1;
hw_pos = 2;
} else {
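            // non-NCHW (channels-last, NHWC-style) layout: channel at dim 3,
            // spatial dims start at dim 1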
c_pos = 3;
hw_pos = 1;
}
return {
U(src[0]), U(src[c_pos]), U(src[hw_pos]),
U(src[hw_pos + 1]), U(fm.ocpg), U(fm.spatial[0]),
U(fm.spatial[1]), U(dst[hw_pos]), U(dst[hw_pos + 1]),
U(fm.padding[0]), U(fm.padding[1]), U(fm.stride[0]),
U(fm.stride[1]), U(fm.dilation[0]), U(fm.dilation[1]),
};
#undef U
}
#endif
};
template <typename T>
void run_fwd(T* dst, const T* src, const T* flt, const Param& param,
hipStream_t stream);
template <typename T>
void run_bwd_data(T* src_grad, const T* dst_grad, const T* flt,
const Param& param, hipStream_t stream);
template <typename T>
void run_bwd_filter(T* filter_grad, const T* src, const T* dst_grad,
const Param& param, hipStream_t stream);
} // namespace chanwise
} // namespace convolution
} // namespace rocm
} // namespace megdnn
// vim: ft=cpp syntax=cpp.doxygen
/**
* \file src/rocm/convolution/chanwise/kern_helper.h.hip
*
* This file is part of MegDNN, a deep neural network run-time library
* developed by Megvii.
*
* \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
*/
#pragma once
#include "megdnn/dtype.h"
#include "src/rocm/utils.h.hip"
#include <stdint.h>
#include <algorithm>
#include "hip_header.h"
namespace megdnn {
namespace rocm {
namespace convolution {
namespace chanwise {
/*!
* \brief return a / b and set mod to a % b
*/
__device__ __forceinline__ uint32_t div_mod(uint32_t a, uint32_t b,
uint32_t& mod) {
uint32_t ret = a / b;
mod = a - ret * b;
return ret;
}
/*!
 * \brief cooperatively copy \p size contiguous elements by all threads in a
 * block; ends with __syncthreads()
 */
template <typename T>
__device__ __forceinline__ void block_memcpy(T* dst, const T* src,
uint32_t size) {
for (uint32_t i = threadIdx.x; i < size; i += blockDim.x) {
dst[i] = src[i];
}
__syncthreads();
}
} // namespace chanwise
} // namespace convolution
} // namespace rocm
} // namespace megdnn
// vim: syntax=cuda.doxygen
/**
* \file dnn/src/rocm/convolution/forward/1x1.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "./algo.h"
#include "src/rocm/handle.h"
#include "src/rocm/utils.h.hip"
using namespace megdnn;
using namespace rocm;
using namespace convolution;
bool ConvolutionForwardImpl::Algo1x1::is_available(const SizeArgs& args) const {
auto&& fm = args.filter_meta;
const size_t MAX_WORKSPACE_SIZE = 2147483648; // 2 * 1024^3
if (!(fm.format == Param::Format::NCHW &&
args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 &&
(fm.dtype.enumv() == DTypeEnum::Float32 ||
fm.dtype.enumv() == DTypeEnum::Float16) &&
fm.spatial_ndim == 2 && fm.group == 1 && fm.dilation[0] == 1 &&
fm.dilation[1] == 1 && fm.spatial[0] == 1 && fm.spatial[1] == 1 &&
fm.padding[0] == 0 && fm.padding[1] == 0 && fm.stride[0] == 1 &&
fm.stride[1] == 1))
return false;
if (get_workspace_in_bytes(args) > MAX_WORKSPACE_SIZE) {
return false;
}
return true;
}
void ConvolutionForwardImpl::Algo1x1::extract_matmul_layouts(
const SizeArgs& args, TensorLayout& A, TensorLayout& B,
TensorLayout& C) {
auto&& fm = args.filter_meta;
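    // a 1x1 convolution is a per-batch GEMM: dst(OC, H*W) = filter(OC, IC) x src(IC, H*W)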
A = {{fm.ocpg, fm.icpg}, fm.dtype};
B.ndim = 2;
B.shape[0] = args.src_layout->shape[1];
B.shape[1] = args.src_layout->shape[2] * args.src_layout->shape[3];
B.stride[0] = args.src_layout->stride[1];
B.stride[1] = 1;
B.dtype = args.src_layout->dtype;
C = {{args.dst_layout->shape[1], B.shape[1]}, args.dst_layout->dtype};
}
size_t ConvolutionForwardImpl::Algo1x1::get_workspace_in_bytes(
const SizeArgs& args) const {
TensorLayout A, B, C;
extract_matmul_layouts(args, A, B, C);
return args.handle->matmul_opr()->get_workspace_in_bytes(A, B, C);
}
void ConvolutionForwardImpl::Algo1x1::exec(const ExecArgs& args) const {
TensorND A, B, C;
extract_matmul_layouts(args, A.layout, B.layout, C.layout);
A.raw_ptr = args.filter_tensor->raw_ptr;
B.raw_ptr = args.src_tensor->raw_ptr;
C.raw_ptr = args.dst_tensor->raw_ptr;
size_t batch = args.src_layout->shape[0];
auto mm = args.handle->matmul_opr();
auto strd_B = args.src_layout->stride[0] * args.src_layout->dtype.size(),
strd_C = args.dst_layout->stride[0] * args.dst_layout->dtype.size();
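    // byte strides of one batch; the same GEMM is replayed per batch with the
    // input/output pointers advanced by these strides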
for (size_t i = 0; i < batch; ++i) {
mm->exec(A, B, C, args.workspace);
incr_voidp(B.raw_ptr, strd_B);
incr_voidp(C.raw_ptr, strd_C);
}
}
/*
 * Functions to handle large batch
*/
bool ConvolutionForwardImpl::Algo1x1LargeBatch::is_available(
const SizeArgs& args) const {
auto&& fm = args.filter_meta;
return fm.format == Param::Format::NCHW &&
args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 &&
(fm.dtype.enumv() == DTypeEnum::Float32 ||
fm.dtype.enumv() == DTypeEnum::Float16) &&
fm.spatial_ndim == 2 && fm.group == 1 && fm.dilation[0] == 1 &&
fm.dilation[1] == 1 && fm.spatial[0] == 1 && fm.spatial[1] == 1 &&
fm.padding[0] == 0 && fm.padding[1] == 0 && fm.stride[0] == 1 &&
fm.stride[1] == 1;
}
void ConvolutionForwardImpl::Algo1x1LargeBatch::extract_matmul_layouts(
const SizeArgs& args, TensorLayout& A, TensorLayout& B,
TensorLayout& C) {
auto&& fm = args.filter_meta;
// A {N, OC, IC}
// B {N, IC, H * W}
// C {N, OC, H * W}
size_t batched = args.src_layout->shape[0];
A = {{batched, fm.ocpg, fm.icpg}, fm.dtype};
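    // a zero batch stride below broadcasts the single filter matrix to every
    // batch entry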
A.stride[0] = 0;
B.ndim = 3;
B.shape[1] = args.src_layout->shape[1];
B.shape[2] = args.src_layout->shape[2] * args.src_layout->shape[3];
B.shape[0] = batched;
B.stride[2] = 1;
B.stride[1] = args.src_layout->stride[1];
B.stride[0] = args.src_layout->stride[0];
B.dtype = args.src_layout->dtype;
C = {{args.dst_layout->shape[0], args.dst_layout->shape[1], B.shape[2]},
args.dst_layout->dtype};
}
size_t ConvolutionForwardImpl::Algo1x1LargeBatch::get_workspace_in_bytes(
const SizeArgs& args) const {
TensorLayout A, B, C;
extract_matmul_layouts(args, A, B, C);
return args.handle->batched_matrix_mul()->get_workspace_in_bytes(A, B, C);
}
void ConvolutionForwardImpl::Algo1x1LargeBatch::exec(
const ExecArgs& args) const {
TensorND A, B, C;
extract_matmul_layouts(args, A.layout, B.layout, C.layout);
A.raw_ptr = args.filter_tensor->raw_ptr;
B.raw_ptr = args.src_tensor->raw_ptr;
C.raw_ptr = args.dst_tensor->raw_ptr;
auto mm = args.handle->batched_matrix_mul();
mm->exec(A, B, C, args.workspace);
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/forward/algo.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "hcc_detail/hcc_defs_prologue.h"
#include "./algo.h"
#include "src/rocm/utils.h"
using namespace megdnn;
using namespace rocm;
ConvolutionForwardImpl::AlgoPack::AlgoPack() {
miopen_algos.push_back(&miopen);
non_miopen_algos.push_back(&matmul);
non_miopen_algos.push_back(&inplace_matmul);
non_miopen_algos.push_back(&a1x1);
non_miopen_algos.push_back(&batched_matrix_mul);
non_miopen_algos.push_back(&chanwise);
all_algos.push_back(&matmul);
all_algos.push_back(&inplace_matmul);
all_algos.push_back(&a1x1);
all_algos.push_back(&batched_matrix_mul);
all_algos.push_back(&chanwise);
all_algos.push_back(&miopen);
}
ConvolutionForwardImpl::AlgoPack ConvolutionForwardImpl::sm_algo_pack;
ConvolutionForwardImpl::AlgoBase::SizeArgs::SizeArgs(ConvolutionForwardImpl* o,
const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst)
: SizeArgs(o, src, o->check_layout_fwd(src, filter, dst), dst) {}
ConvolutionForwardImpl::AlgoBase::SizeArgs::SizeArgs(
ConvolutionForwardImpl* o, const TensorLayout& src,
const CanonizedFilterMeta& filter, const TensorLayout& dst)
: ForwardSizeArgs{concrete_handle(o->handle()), &src, filter, &dst},
opr{o} {}
ConvolutionForwardImpl::AlgoBase::ExecArgs::ExecArgs(
ConvolutionForwardImpl* opr, _megdnn_tensor_in src,
_megdnn_tensor_in filter, _megdnn_tensor_out dst,
_megdnn_workspace workspace)
: SizeArgs(opr, src.layout, filter.layout, dst.layout),
src_tensor{&src},
filter_tensor{&filter},
dst_tensor{&dst},
workspace{workspace} {}
std::string ConvolutionForwardImpl::AlgoBase::SizeArgs::to_string() const {
auto&& fm = filter_meta;
MEGDNN_MARK_USED_VAR(fm);
return megdnn_mangle(ssprintf(
"src=%s, filter=%u{%u,%u,%u,%u}, dst=%s, "
"pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s",
src_layout->to_string().c_str(), fm.group, fm.ocpg, fm.icpg,
fm.spatial[0], fm.spatial[1], dst_layout->to_string().c_str(),
fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1],
fm.dilation[0], fm.dilation[1], !fm.should_flip,
src_layout->dtype.name(), dst_layout->dtype.name()));
}
convolution::MIOpenCacheKey
ConvolutionForwardImpl::AlgoBase::SizeArgs::to_miopen_algo_cache_key() const {
convolution::MIOpenCacheKey res;
res.miopen_handle = reinterpret_cast<intptr_t>(handle->miopen_handle());
res.batch = src_layout->operator[](0);
res.IC = src_layout->operator[](1);
res.IH = src_layout->operator[](2);
res.IW = src_layout->operator[](3);
res.OH = dst_layout->operator[](2);
res.OW = dst_layout->operator[](3);
res.FH = filter_meta.spatial[0];
res.FW = filter_meta.spatial[1];
res.SH = filter_meta.stride[0];
res.SW = filter_meta.stride[1];
res.PH = filter_meta.padding[0];
res.PW = filter_meta.padding[1];
res.DH = filter_meta.dilation[0];
res.DW = filter_meta.dilation[1];
res.group = filter_meta.group;
res.ocpg = filter_meta.ocpg;
res.icpg = filter_meta.icpg;
res.dtype_enum = static_cast<uint32_t>(src_layout->dtype.enumv());
res.exhaustive_search =
static_cast<int32_t>(handle->enable_miopen_algo_search());
res.OC = res.group * res.ocpg;
return res;
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/forward/algo.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "megdnn/oprs.h"
#include "src/common/utils.h"
#include "src/rocm/convolution/helper.h"
#include "src/rocm/convolution/opr_impl.h"
#include "src/rocm/handle.h"
#include <unordered_map>
namespace megdnn {
namespace rocm {
/*!
* \brief base class for convolution algos
*
*/
class ConvolutionForwardImpl::AlgoBase : public Algorithm {
protected:
~AlgoBase() = default;
public:
struct SizeArgs : public convolution::ForwardSizeArgs {
ConvolutionForwardImpl* opr;
std::string to_string() const;
convolution::MIOpenCacheKey to_miopen_algo_cache_key() const;
void init_desc(convolution::MIOpenForwardDescs& desc) const {
desc.set(*src_layout, filter_meta, *dst_layout, opr->param());
}
SizeArgs(ConvolutionForwardImpl* opr, const TensorLayout& src,
const TensorLayout& filter, const TensorLayout& dst);
SizeArgs(ConvolutionForwardImpl* opr, const TensorLayout& src,
const CanonizedFilterMeta& filter, const TensorLayout& dst);
};
struct ExecArgs : public SizeArgs {
const TensorND *src_tensor, *filter_tensor, *dst_tensor;
Workspace workspace;
ExecArgs(ConvolutionForwardImpl* opr, _megdnn_tensor_in src,
_megdnn_tensor_in filter, _megdnn_tensor_out dst,
_megdnn_workspace workspace);
};
virtual bool is_available(const SizeArgs& args) const = 0;
virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
virtual void exec(const ExecArgs& args) const = 0;
bool is_available_wk(const SizeArgs& args, size_t limit) {
return is_available(args) && get_workspace_in_bytes(args) <= limit;
}
bool is_available_reproducible(
const SizeArgs& args, bool reproducible = true,
size_t limit = std::numeric_limits<size_t>::max()) {
return (!reproducible || is_reproducible()) &&
is_available_wk(args, limit);
}
AlgoBase& check_workspace(const SizeArgs& args,
const Workspace& workspace) {
auto req = get_workspace_in_bytes(args);
megdnn_assert(req <= workspace.size,
"conv fwd algo %s: required workspace %zu bytes, got %zu",
name(), req, workspace.size);
return *this;
}
virtual bool is_miopen() const { return false; }
};
class ConvolutionForwardImpl::AlgoMIOpen final : public AlgoBase {
bool m_is_reproducible;
const char* m_name;
miopenConvFwdAlgorithm_t find_best_algo(const ExecArgs& args);
public:
AlgoMIOpen() = delete;
AlgoMIOpen(bool is_reproducible) : m_is_reproducible(is_reproducible) {}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
bool is_reproducible() const override { return m_is_reproducible; }
const char* name() const override { return "MIOpenConvolutionForward"; }
bool is_miopen() const override { return true; }
static convolution::MIOpenCache<SizeArgs, miopenConvFwdAlgorithm_t>
sm_miopen_algo_cache;
static convolution::MIOpenCache<SizeArgs, size_t> sm_miopen_ws_cache;
};
class ConvolutionForwardImpl::AlgoMatmul final : public AlgoBase {
template <typename T>
static void exec_internal(const ExecArgs& args);
public:
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
const char* name() const override { return "MATMUL"; }
bool is_reproducible() const override { return true; }
};
//! compute small matmul in the kernel
class ConvolutionForwardImpl::AlgoInplaceMatmul final : public AlgoBase {
public:
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
const char* name() const override { return "INPLACE_MATMUL"; }
bool is_reproducible() const override { return true; }
};
//! optimized 1x1 conv
class ConvolutionForwardImpl::Algo1x1 final : public AlgoBase {
static void extract_matmul_layouts(const SizeArgs& args, TensorLayout& A,
TensorLayout& B, TensorLayout& C);
public:
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
const char* name() const override { return "1x1"; }
bool is_reproducible() const override { return true; }
};
//! optimized 1x1 conv when the input batch size is larger than 32
class ConvolutionForwardImpl::Algo1x1LargeBatch final : public AlgoBase {
static void extract_matmul_layouts(const SizeArgs& args, TensorLayout& A,
TensorLayout& B, TensorLayout& C);
public:
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
const char* name() const override { return "LARGE_BATCH_1x1"; }
bool is_reproducible() const override { return true; }
};
class ConvolutionForwardImpl::AlgoChanwise final : public AlgoBase {
public:
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
void exec(const ExecArgs& args) const override;
const char* name() const override { return "CHANNEL_WISE"; }
bool is_reproducible() const override { return true; }
};
class ConvolutionForwardImpl::AlgoPack {
// defined in miopen.cpp
void fill_miopen_algos();
AlgoPack(const AlgoPack&) = delete;
AlgoPack& operator=(const AlgoPack&) = delete;
public:
AlgoPack();
AlgoMIOpen miopen{true};
AlgoMatmul matmul;
AlgoInplaceMatmul inplace_matmul;
Algo1x1 a1x1;
Algo1x1LargeBatch batched_matrix_mul;
AlgoChanwise chanwise;
std::vector<AlgoBase*>
//! all algorithms
all_algos, miopen_algos, non_miopen_algos;
};
} // namespace rocm
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/forward/chanwise.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "./algo.h"
#include "src/rocm/utils.h"
#include "src/rocm/convolution/chanwise/kern.h.hip"
using namespace megdnn;
using namespace rocm;
using namespace convolution;
bool ConvolutionForwardImpl::AlgoChanwise::is_available(
const SizeArgs& args) const {
auto&& fm = args.filter_meta;
return args.filter_meta.format == Param::Format::NCHW &&
args.src_layout->dtype.category() == DTypeCategory::FLOAT &&
args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 &&
fm.spatial_ndim == 2 && fm.icpg == 1 && fm.dilation[0] == 1 &&
fm.dilation[1] == 1 && !fm.should_flip;
}
size_t ConvolutionForwardImpl::AlgoChanwise::get_workspace_in_bytes(
const SizeArgs&) const {
return 0;
}
void ConvolutionForwardImpl::AlgoChanwise::exec(const ExecArgs& args) const {
auto kparam = chanwise::Param::from_fwd_args(args);
auto stream = hip_stream(args.handle);
switch (args.src_layout->dtype.enumv()) {
#define cb(_dt) \
case DTypeTrait<_dt>::enumv: { \
using ctype = DTypeTrait<_dt>::ctype; \
return chanwise::run_fwd( \
args.dst_tensor->ptr<ctype>(), args.src_tensor->ptr<ctype>(), \
args.filter_tensor->ptr<ctype>(), kparam, stream); \
}
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
default:
break;
}
megdnn_assert_internal(0);
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/forward/inplace_matmul.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "./algo.h"
#include "./inplace_matmul_impl.h.hip"
using namespace megdnn;
using namespace rocm;
bool ConvolutionForwardImpl::AlgoInplaceMatmul::is_available(
const SizeArgs& args) const {
auto&& fm = args.filter_meta;
return args.filter_meta.format == Param::Format::NCHW &&
args.src_layout->dtype == dtype::Float32() && fm.group == 1 &&
fm.spatial_ndim == 2 && fm.dilation[0] == 1 && fm.dilation[1] == 1;
}
size_t ConvolutionForwardImpl::AlgoInplaceMatmul::get_workspace_in_bytes(
const SizeArgs&) const {
return 0;
}
void ConvolutionForwardImpl::AlgoInplaceMatmul::exec(
const ExecArgs& args) const {
auto&& fm = args.filter_meta;
size_t N = args.src_layout->shape[0], IC = fm.icpg,
IH = args.src_layout->shape[2], IW = args.src_layout->shape[3],
OC = fm.ocpg, OH = args.dst_layout->shape[2],
OW = args.dst_layout->shape[3], FH = fm.spatial[0],
FW = fm.spatial[1];
auto stream = args.handle->stream();
convolution::exec_inplace_matmul_fwd(
args.src_tensor->ptr<dt_float32>(),
args.filter_tensor->ptr<dt_float32>(),
args.dst_tensor->ptr<dt_float32>(), N, args.src_layout->stride[0],
args.dst_layout->stride[0], IC, IH, IW, OC, OH, OW, FH, FW,
fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1],
!fm.should_flip, stream);
}
// vim: syntax=cpp.doxygen
/**
* \file src/rocm/convolution/forward/inplace_matmul_impl.cpp.hip
*
* This file is part of MegDNN, a deep neural network run-time library
* developed by Megvii.
*
* \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
*/
#include "./inplace_matmul_impl.h.hip"
#include "src/rocm/utils.h.hip"
using namespace megdnn;
using namespace rocm;
namespace {
struct BufferFetcherTexture {
hipTextureObject_t tex;
__device__ __forceinline__ float get(uint32_t offset) {
return tex1Dfetch<float>(tex, offset);
}
};
struct BufferFetcherRaw {
const float* ptr;
__device__ __forceinline__ float get(uint32_t offset) {
return ptr[offset];
}
};
struct BufferFetcherTextureHost {
bool init_succ;
BufferFetcherTexture val;
BufferFetcherTextureHost(float* p, const size_t n);
~BufferFetcherTextureHost() { reset(); }
void reset() {
if (init_succ) {
hip_check(hipDestroyTextureObject(val.tex));
init_succ = false;
}
}
};
BufferFetcherTextureHost::BufferFetcherTextureHost(float* p, const size_t n) {
init_succ = false;
hipTextureObject_t tex_obj;
hipResourceDesc res_desc;
memset(&res_desc, 0, sizeof(hipResourceDesc));
res_desc.resType = hipResourceTypeLinear;
res_desc.res.linear.devPtr = static_cast<void*>(p);
res_desc.res.linear.sizeInBytes = n * sizeof(float);
res_desc.res.linear.desc =
hipCreateChannelDesc(32, 0, 0, 0, hipChannelFormatKindFloat);
hipTextureDesc tex_desc;
memset(&tex_desc, 0, sizeof(hipTextureDesc));
if (hipCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL) ==
hipSuccess) {
val.tex = tex_obj;
init_succ = true;
} else {
hipGetLastError(); // reset error
}
}
template <class BufferFetcher>
struct KernelPtr {
typedef void (*type)(BufferFetcher, BufferFetcher, float*, uint32_t,
uint32_t, uint32_t, uint32_t, uint32_t, uint32_t,
uint32_t, uint32_t, uint32_t, uint32_t, uint32_t,
uint32_t, uint32_t, uint32_t);
};
//! 1 -> 0xffffffff, 0 -> 0x00000000
__device__ __forceinline__ uint32_t bool_as_mask(uint32_t cond) {
return (!cond) - 1u;
}
union FloatAndU32 {
float f;
uint32_t u;
};
//! \p mask must be either all 1 or 0 bits
template <class BufferFetcher>
__device__ __forceinline__ float visit_with_mask(BufferFetcher buf,
uint32_t offset,
uint32_t mask) {
FloatAndU32 f;
f.f = buf.get(offset & mask);
f.u &= mask;
return f.f;
}
template <uint32_t BY, uint32_t BX, bool is_xcorr, class BufferFetcher>
__global__ void conv_kernel(BufferFetcher src, BufferFetcher filter, float* dst,
const uint32_t INP_BS, const uint32_t OUT_BS,
const uint32_t IC, const uint32_t IH,
const uint32_t IW, const uint32_t OC,
const uint32_t OH, const uint32_t OW,
const uint32_t FH, const uint32_t FW,
const uint32_t SH, const uint32_t SW,
const uint32_t PH, const uint32_t PW) {
const uint32_t BM = BY < BX ? BY : BX;
const uint32_t n = blockIdx.z;
const uint32_t tidx = threadIdx.x;
const uint32_t tidy = threadIdx.y;
const uint32_t posx = blockIdx.x * blockDim.x + threadIdx.x;
const uint32_t posy = blockIdx.y * blockDim.y + threadIdx.y;
const uint32_t posx2 = posx << 2;
const uint32_t posy2 = posy << 2;
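    // each thread computes a 4x4 output tile rooted at (posy2, posx2)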
const uint32_t heightA = OC;
const uint32_t widthA = IC * FH * FW;
const uint32_t heightB = widthA;
const uint32_t widthB = OH * OW;
const uint32_t oh0 = (posx2 + 0) / OW * SH;
const uint32_t ow0 = (posx2 + 0) % OW * SW;
const uint32_t op0 = oh0 * IW + ow0;
const uint32_t oh1 = (posx2 + 1) / OW * SH;
const uint32_t ow1 = (posx2 + 1) % OW * SW;
const uint32_t op1 = oh1 * IW + ow1;
const uint32_t oh2 = (posx2 + 2) / OW * SH;
const uint32_t ow2 = (posx2 + 2) % OW * SW;
const uint32_t op2 = oh2 * IW + ow2;
const uint32_t oh3 = (posx2 + 3) / OW * SH;
const uint32_t ow3 = (posx2 + 3) % OW * SW;
const uint32_t op3 = oh3 * IW + ow3;
const uint32_t FP = FH * FW;
__shared__ float4 localA[BY][BM];
__shared__ float4 localB[BM][BX];
uint32_t i = 0u;
uint32_t offsetA = posy2 * widthA + tidx;
uint32_t offsetB = n * INP_BS - PH * IW - PW;
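    // batch base offset with the padding pre-subtracted; the masks computed
    // below make out-of-range reads safe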
float4 sum0 = {0.0f, 0.0f, 0.0f, 0.0f}, sum1 = {0.0f, 0.0f, 0.0f, 0.0f},
sum2 = {0.0f, 0.0f, 0.0f, 0.0f}, sum3 = {0.0f, 0.0f, 0.0f, 0.0f};
uint32_t fh = tidy / FW % FH;
uint32_t fw = tidy % FW;
uint32_t ic = tidy / (FH * FW);
uint32_t icm = tidy % (FH * FW);
const uint32_t fhs = BM / FW % FH;
const uint32_t fws = BM % FW;
const uint32_t ics = BM / (FH * FW);
const uint32_t icms = BM % (FH * FW);
for (; i < widthA; i += BM, offsetA += BM) {
// load localA
if (tidx < BM) {
localA[tidy][tidx].x = filter.get(offsetA + 0 * widthA);
localA[tidy][tidx].y = filter.get(offsetA + 1 * widthA);
localA[tidy][tidx].z = filter.get(offsetA + 2 * widthA);
localA[tidy][tidx].w = filter.get(offsetA + 3 * widthA);
}
// load localB
uint32_t fh2, fw2;
if (is_xcorr) {
fh2 = fh;
fw2 = fw;
} else {
fh2 = FH - fh - 1;
fw2 = FW - fw - 1;
}
if (tidy < BM) {
uint32_t tmp = offsetB + (ic * IH + (fh2)) * IW + (fw2),
ok = bool_as_mask(tidy + i < heightB),
p0 = bool_as_mask(fh2 + oh0 >= PH && fh2 + oh0 < IH + PH &&
fw2 + ow0 >= PW && fw2 + ow0 < IW + PW),
p1 = bool_as_mask(fh2 + oh1 >= PH && fh2 + oh1 < IH + PH &&
fw2 + ow1 >= PW && fw2 + ow1 < IW + PW),
p2 = bool_as_mask(fh2 + oh2 >= PH && fh2 + oh2 < IH + PH &&
fw2 + ow2 >= PW && fw2 + ow2 < IW + PW),
p3 = bool_as_mask(fh2 + oh3 >= PH && fh2 + oh3 < IH + PH &&
fw2 + ow3 >= PW && fw2 + ow3 < IW + PW);
localB[tidy][tidx].x = visit_with_mask(src, tmp + op0, ok & p0);
localB[tidy][tidx].y = visit_with_mask(src, tmp + op1, ok & p1);
localB[tidy][tidx].z = visit_with_mask(src, tmp + op2, ok & p2);
localB[tidy][tidx].w = visit_with_mask(src, tmp + op3, ok & p3);
}
__syncthreads();
for (uint32_t j = 0u; j < BM; ++j) {
float4 tmpA = localA[tidy][j];
float4 tmpB = localB[j][tidx];
sum0.x += tmpA.x * tmpB.x;
sum0.y += tmpA.x * tmpB.y;
sum0.z += tmpA.x * tmpB.z;
sum0.w += tmpA.x * tmpB.w;
sum1.x += tmpA.y * tmpB.x;
sum1.y += tmpA.y * tmpB.y;
sum1.z += tmpA.y * tmpB.z;
sum1.w += tmpA.y * tmpB.w;
sum2.x += tmpA.z * tmpB.x;
sum2.y += tmpA.z * tmpB.y;
sum2.z += tmpA.z * tmpB.z;
sum2.w += tmpA.z * tmpB.w;
sum3.x += tmpA.w * tmpB.x;
sum3.y += tmpA.w * tmpB.y;
sum3.z += tmpA.w * tmpB.z;
sum3.w += tmpA.w * tmpB.w;
}
fw += fws;
fh += fhs;
fh += (fw >= FW);
fh -= (fh >= FH) * FH;
fw -= (fw >= FW) * FW;
ic += ics;
icm += icms;
ic += (icm >= FP);
icm -= (icm >= FP) * FP;
__syncthreads();
}
const uint32_t dst_idx = n * OUT_BS + posy2 * widthB + posx2;
bool y0 = (posy2 + 0 < heightA);
bool y1 = (posy2 + 1 < heightA);
bool y2 = (posy2 + 2 < heightA);
bool y3 = (posy2 + 3 < heightA);
bool x0 = (posx2 + 0 < widthB);
bool x1 = (posx2 + 1 < widthB);
bool x2 = (posx2 + 2 < widthB);
bool x3 = (posx2 + 3 < widthB);
if (y0) {
if (x0)
dst[dst_idx + 0 * widthB + 0] = sum0.x;
if (x1)
dst[dst_idx + 0 * widthB + 1] = sum0.y;
if (x2)
dst[dst_idx + 0 * widthB + 2] = sum0.z;
if (x3)
dst[dst_idx + 0 * widthB + 3] = sum0.w;
}
if (y1) {
if (x0)
dst[dst_idx + 1 * widthB + 0] = sum1.x;
if (x1)
dst[dst_idx + 1 * widthB + 1] = sum1.y;
if (x2)
dst[dst_idx + 1 * widthB + 2] = sum1.z;
if (x3)
dst[dst_idx + 1 * widthB + 3] = sum1.w;
}
if (y2) {
if (x0)
dst[dst_idx + 2 * widthB + 0] = sum2.x;
if (x1)
dst[dst_idx + 2 * widthB + 1] = sum2.y;
if (x2)
dst[dst_idx + 2 * widthB + 2] = sum2.z;
if (x3)
dst[dst_idx + 2 * widthB + 3] = sum2.w;
}
if (y3) {
if (x0)
dst[dst_idx + 3 * widthB + 0] = sum3.x;
if (x1)
dst[dst_idx + 3 * widthB + 1] = sum3.y;
if (x2)
dst[dst_idx + 3 * widthB + 2] = sum3.z;
if (x3)
dst[dst_idx + 3 * widthB + 3] = sum3.w;
}
}
} // anonymous namespace
void convolution::exec_inplace_matmul_fwd(
const float* src, const float* filter, float* dst, size_t N,
size_t INP_BS, size_t OUT_BS, size_t IC, size_t IH, size_t IW,
size_t OC, size_t OH, size_t OW, size_t FH, size_t FW, size_t PH,
size_t PW, size_t SH, size_t SW, bool is_xcorr, hipStream_t stream) {
BufferFetcherTextureHost src_tex(const_cast<float*>(src), N * INP_BS),
filter_tex(const_cast<float*>(filter), OC * IC * FH * FW);
BufferFetcherRaw src_buf, filter_buf;
src_buf.ptr = src;
filter_buf.ptr = filter;
if (!src_tex.init_succ || !filter_tex.init_succ) {
src_tex.reset();
filter_tex.reset();
}
int m = OC;
int n = OH * OW;
int BY = 1;
int BX = 1;
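    // choose a 256-thread block shape: when OC is small use just enough rows
    // (BY) to cover it and spend the rest on columns, when OH * OW is small do
    // the converse, otherwise use a 16x16 tile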
if (m <= 64) {
while (BY < 16 && (BY << 2) < m)
BY <<= 1;
BX = 256 / BY;
} else if (n <= 64) {
while (BX < 16 && (BX << 2) < n)
BX <<= 1;
BY = 256 / BX;
} else {
BX = BY = 16;
}
dim3 blocks((OH * OW + BX * 4 - 1) / (BX * 4), (OC + BY * 4 - 1) / (BY * 4),
N);
dim3 threads(BX, BY);
#define DISPATCH_BX_BY(BX, BY) \
do { \
if (src_tex.init_succ) { \
KernelPtr<BufferFetcherTexture>::type kptr; \
if (is_xcorr) { \
kptr = conv_kernel<BY, BX, true, BufferFetcherTexture>; \
} else { \
kptr = conv_kernel<BY, BX, false, BufferFetcherTexture>; \
} \
kptr<<<blocks, threads, 0, stream>>>( \
src_tex.val, filter_tex.val, dst, INP_BS, OUT_BS, IC, IH, \
IW, OC, OH, OW, FH, FW, SH, SW, PH, PW); \
} else { \
KernelPtr<BufferFetcherRaw>::type kptr; \
if (is_xcorr) { \
kptr = conv_kernel<BY, BX, true, BufferFetcherRaw>; \
} else { \
kptr = conv_kernel<BY, BX, false, BufferFetcherRaw>; \
} \
kptr<<<blocks, threads, 0, stream>>>( \
src_buf, filter_buf, dst, INP_BS, OUT_BS, IC, IH, IW, OC, \
OH, OW, FH, FW, SH, SW, PH, PW); \
} \
} while (0)
#define DISPATCH_BX(BX) \
do { \
DISPATCH_BX_BY(BX, 256 / BX); \
} while (0)
#define DISPATCH() \
do { \
switch (BX) { \
case 1: \
DISPATCH_BX(1); \
break; \
case 2: \
DISPATCH_BX(2); \
break; \
case 4: \
DISPATCH_BX(4); \
break; \
case 8: \
DISPATCH_BX(8); \
break; \
case 16: \
DISPATCH_BX(16); \
break; \
case 32: \
DISPATCH_BX(32); \
break; \
case 64: \
DISPATCH_BX(64); \
break; \
case 128: \
DISPATCH_BX(128); \
break; \
case 256: \
DISPATCH_BX(256); \
break; \
default: \
report_error("no usable kernel"); \
} \
} while (0)
DISPATCH();
#undef DISPATCH
#undef DISPATCH_BX
#undef DISPATCH_BX_BY
after_kernel_launch();
}
// vim: syntax=cpp.doxygen
/**
* \file src/rocm/convolution/forward/inplace_matmul_impl.h.hip
*
* This file is part of MegDNN, a deep neural network run-time library
* developed by Megvii.
*
* \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
*/
#pragma once
#include <stddef.h>
#include <stdint.h>
#include "hip_header.h"
namespace megdnn {
namespace rocm {
namespace convolution {
void exec_inplace_matmul_fwd(const float* src, const float* filter, float* dst,
size_t N, size_t INP_BS, size_t OUT_BS, size_t IC,
size_t IH, size_t IW, size_t OC, size_t OH,
size_t OW, size_t FH, size_t FW, size_t PH,
size_t PW, size_t SH, size_t SW, bool is_xcorr,
hipStream_t stream);
} // namespace convolution
} // namespace rocm
} // namespace megdnn
// vim: ft=cpp syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/forward/matmul.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "./algo.h"
#include "src/rocm/utils.h"
#include "src/rocm/utils.h.hip"
#include "src/rocm/convolution/helper.h"
#include "src/rocm/convolution/im2col.h.hip"
using namespace megdnn;
using namespace rocm;
bool ConvolutionForwardImpl::AlgoMatmul::is_available(
const SizeArgs& args) const {
auto&& fm = args.filter_meta;
return args.filter_meta.format == Param::Format::NCHW &&
args.src_layout->dtype.category() == DTypeCategory::FLOAT &&
args.opr->param().compute_mode != Param::ComputeMode::FLOAT32 &&
fm.group == 1 && fm.spatial_ndim == 2;
}
size_t ConvolutionForwardImpl::AlgoMatmul::get_workspace_in_bytes(
const SizeArgs& args) const {
return matmul_get_workspace_bundle(args).total_size_in_bytes();
}
void ConvolutionForwardImpl::AlgoMatmul::exec(const ExecArgs& args) const {
#define cb(DType) \
if (args.src_layout->dtype == DType()) { \
using ctype = typename DTypeTrait<DType>::ctype; \
exec_internal<ctype>(args); \
return; \
}
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb
megdnn_assert_internal(0);
}
template <typename T>
void ConvolutionForwardImpl::AlgoMatmul::exec_internal(const ExecArgs& args) {
auto&& fm = args.filter_meta;
size_t N = args.src_layout->shape[0], IC = fm.icpg,
IH = args.src_layout->shape[2], IW = args.src_layout->shape[3],
OC = fm.ocpg, OH = args.dst_layout->shape[2],
OW = args.dst_layout->shape[3], FH = fm.spatial[0],
FW = fm.spatial[1], PH = fm.padding[0], PW = fm.padding[1],
SH = fm.stride[0], SW = fm.stride[1], DH = fm.dilation[0],
DW = fm.dilation[1];
auto stream = hip_stream(args.handle);
auto wbundle = matmul_get_workspace_bundle(args);
wbundle.set(args.workspace.raw_ptr);
T* dst_t = static_cast<T*>(wbundle.get(0));
T* col = static_cast<T*>(wbundle.get(1));
convolution::im2col<T>(args.src_tensor->ptr<T>(), col, N,
args.src_layout->stride[0], IC, IH, IW, FH, FW, OH,
OW, PH, PW, SH, SW, DH, DW, stream);
TensorLayout Al({OC, IC * FH * FW}, typename DTypeTrait<T>::dtype()),
Bl({IC * FH * FW, OH * OW * N}, typename DTypeTrait<T>::dtype()),
Cl({OC, OH * OW * N}, typename DTypeTrait<T>::dtype());
TensorND A(args.filter_tensor->ptr<T>(), Al), B(col, Bl), C(dst_t, Cl);
if (fm.should_flip) {
convolution::flip_filter(args, wbundle.get_workspace(2), A.raw_ptr);
}
args.handle->matmul_opr()->exec(A, B, C, Workspace());
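    // the GEMM result is laid out as (OC*OH*OW, N); the relayout below
    // transposes it into the batch-major dst tensor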
TensorLayout C2l({OC * OH * OW, N}, typename DTypeTrait<T>::dtype()),
C3l = C2l;
C3l.stride[0] = 1;
C3l.stride[1] = args.dst_tensor->layout.stride[0];
TensorND C2(dst_t, C2l);
TensorND C3(args.dst_tensor->ptr<T>(), C3l);
args.handle->relayout_opr()->exec(C2, C3);
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/forward/miopen.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "hcc_detail/hcc_defs_prologue.h"
#include "./algo.h"
#include <mutex>
#include "src/rocm/convolution/helper.h"
#include "src/rocm/miopen_wrapper.h"
#include "src/rocm/utils.h"
using namespace megdnn;
using namespace rocm;
using namespace convolution;
MIOpenCache<ConvolutionForwardImpl::AlgoBase::SizeArgs,
miopenConvFwdAlgorithm_t>
ConvolutionForwardImpl::AlgoMIOpen::sm_miopen_algo_cache;
MIOpenCache<ConvolutionForwardImpl::AlgoBase::SizeArgs, size_t>
ConvolutionForwardImpl::AlgoMIOpen::sm_miopen_ws_cache;
bool ConvolutionForwardImpl::AlgoMIOpen::is_available(
const SizeArgs& args) const {
if (!is_miopen_supported(args))
return false;
auto got = sm_miopen_ws_cache.get(args);
if (got.first)
return true;
MIOpenForwardDescs D;
args.init_desc(D);
size_t workspace_size;
auto status = miopenConvolutionForwardGetWorkSpaceSize(
args.handle->miopen_handle(), D.filter_desc.desc, D.src_desc.desc,
D.conv_desc.desc, D.dst_desc.desc, &workspace_size);
if (status == miopenStatusSuccess) {
sm_miopen_ws_cache.set(args, workspace_size);
return true;
}
return false;
}
size_t ConvolutionForwardImpl::AlgoMIOpen::get_workspace_in_bytes(
const SizeArgs& args) const {
auto got = sm_miopen_ws_cache.get(args);
if (got.first)
return got.second;
MIOpenForwardDescs D;
args.init_desc(D);
size_t workspace_size;
auto status = miopenConvolutionForwardGetWorkSpaceSize(
args.handle->miopen_handle(), D.filter_desc.desc, D.src_desc.desc,
D.conv_desc.desc, D.dst_desc.desc, &workspace_size);
megdnn_assert(status == miopenStatusSuccess,
"conv fwd get workspace failed: %s; info: %s",
miopenGetErrorString(status), args.to_string().c_str());
sm_miopen_ws_cache.set(args, workspace_size);
return workspace_size;
}
miopenConvFwdAlgorithm_t ConvolutionForwardImpl::AlgoMIOpen::find_best_algo(
const ExecArgs& args) {
auto find_algo = sm_miopen_algo_cache.get(args);
if (find_algo.first)
return find_algo.second;
bool exhaustive_search = args.handle->enable_miopen_algo_search();
MIOpenForwardDescs D;
args.init_desc(D);
const int req_algo_count = 1;
int ret_algo_count;
miopenConvAlgoPerf_t algo_perf;
miopen_check(miopenFindConvolutionForwardAlgorithm(
args.handle->miopen_handle(), D.src_desc.desc,
args.src_tensor->raw_ptr, D.filter_desc.desc,
args.filter_tensor->raw_ptr, D.conv_desc.desc, D.dst_desc.desc,
args.dst_tensor->raw_ptr, req_algo_count, &ret_algo_count,
&algo_perf, args.workspace.raw_ptr, args.workspace.size,
exhaustive_search));
sm_miopen_algo_cache.set(args, algo_perf.fwd_algo);
return algo_perf.fwd_algo;
}
void ConvolutionForwardImpl::AlgoMIOpen::exec(const ExecArgs& args) const {
MIOpenForwardDescs D;
args.init_desc(D);
auto algo = const_cast<ConvolutionForwardImpl::AlgoMIOpen*>(this)
->find_best_algo(args);
float alpha = 1.0f, beta = 0.0f;
auto status = miopenConvolutionForward(
args.handle->miopen_handle(), &alpha, D.src_desc.desc,
args.src_tensor->raw_ptr, D.filter_desc.desc,
args.filter_tensor->raw_ptr, D.conv_desc.desc, algo, &beta,
D.dst_desc.desc, args.dst_tensor->raw_ptr, args.workspace.raw_ptr,
args.workspace.size);
megdnn_assert(status == miopenStatusSuccess,
"conv fwd failed: %s; info: %s", miopenGetErrorString(status),
args.to_string().c_str());
}
void ConvolutionForwardImpl::AlgoPack::fill_miopen_algos() {
megdnn_throw("MIOpen has implemented auto-tuning in the framework, so we do not need to choose algorithms manually");
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/helper.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "hcc_detail/hcc_defs_prologue.h"
#include "./helper.h"
#include "./forward/algo.h"
#include "./backward_data/algo.h"
#include "./backward_filter/algo.h"
using namespace megdnn;
using namespace rocm;
using namespace convolution;
bool convolution::is_miopen_supported(const ForwardSizeArgs& args) {
    //! TODO: only the NCHW format is supported now; MIOpen does not seem to
    //! support NHWC or NCHW4 yet
if (args.filter_meta.format != param::Convolution::Format::NCHW) {
return false;
}
auto& fm = args.filter_meta;
    //! TODO: MIOpen does not seem to support non-xcorr (flipped-filter) convolution
return !fm.should_flip;
}
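// use the raw bytes of the key struct as the hash-map key for the MIOpen cache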
std::string MIOpenCacheKey::to_string_binary() const {
std::string ret(sizeof(MIOpenCacheKey), '\0');
auto ptr = reinterpret_cast<MIOpenCacheKey*>(&ret[0]);
*ptr = *this;
return ret;
}
template <typename Args, typename ValueType>
void MIOpenCache<Args, ValueType>::set(const Args& args, ValueType val) {
std::string key = args.to_miopen_algo_cache_key().to_string_binary();
std::lock_guard<std::mutex> guard{m_mtx};
m_cache[key] = val;
}
template <typename Args, typename ValueType>
std::pair<bool, ValueType> MIOpenCache<Args, ValueType>::get(const Args& args) {
std::string key = args.to_miopen_algo_cache_key().to_string_binary();
std::lock_guard<std::mutex> guard{m_mtx};
auto search = m_cache.find(key);
bool find = search != m_cache.end();
ValueType val = ValueType();
if (find) {
val = search->second;
}
return std::make_pair(find, val);
}
#define INST(_opr, _miopen_algo) \
template class megdnn::rocm::convolution::MIOpenCache< \
_opr::AlgoBase::SizeArgs, _miopen_algo>; \
template class megdnn::rocm::convolution::MIOpenCache< \
_opr::AlgoBase::SizeArgs, size_t>;
INST(ConvolutionForwardImpl, miopenConvFwdAlgorithm_t);
INST(ConvolutionBackwardDataImpl, miopenConvBwdDataAlgorithm_t);
INST(ConvolutionBackwardFilterImpl, miopenConvBwdWeightsAlgorithm_t);
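//! Workspace layout for the matmul-based algorithm: one buffer the size of
//! the output tensor, one buffer for the im2col matrix of shape
//! (IC*FH*FW, OH*OW*N), and, when the filter must be flipped (true
//! convolution rather than cross-correlation), an extra filter-sized buffer.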
WorkspaceBundle convolution::matmul_get_workspace_bundle(
const ForwardSizeArgs& args) {
auto dtype = args.src_layout->dtype;
auto&& fm = args.filter_meta;
megdnn_assert(fm.group == 1);
auto N = args.src_layout->shape[0];
auto OC = fm.ocpg, IC = fm.icpg, FH = fm.spatial[0], FW = fm.spatial[1];
auto OH = args.dst_layout->shape[2], OW = args.dst_layout->shape[3];
SmallVector<size_t> sizes{dtype.size() * args.dst_layout->total_nr_elems(),
dtype.size() * IC * FH * FW * OH * OW * N};
if (args.filter_meta.should_flip) {
sizes.push_back(dtype.size() * OC * IC * FH * FW);
}
return {nullptr, std::move(sizes)};
}
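//! Implementation note: the flip is done by a relayout into the workspace
//! with negated strides on the two spatial dimensions, after which raw_ptr is
//! redirected to the flipped copy.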
void convolution::flip_filter(const ForwardSizeArgs& args,
const Workspace& workspace, void*& raw_ptr) {
auto&& fm = args.filter_meta;
megdnn_assert(fm.group == 1 && fm.spatial_ndim == 2);
auto OC = fm.ocpg, IC = fm.icpg, FH = fm.spatial[0], FW = fm.spatial[1];
auto dtype = fm.dtype;
megdnn_assert(workspace.size >= dtype.size() * OC * IC * FH * FW);
TensorND src{raw_ptr, {{OC, IC, FH, FW}, dtype}},
dst{workspace.raw_ptr + (FH * FW - 1) * dtype.size(), src.layout};
dst.layout.stride[2] = -dst.layout.stride[2];
dst.layout.stride[3] = -dst.layout.stride[3];
args.handle->relayout_opr()->exec(src, dst);
raw_ptr = workspace.raw_ptr;
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/helper.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "./opr_impl.h"
#include "src/rocm/miopen_wrapper.h"
#include "src/rocm/handle.h"
#include "src/common/utils.h"
#include "src/common/algo_chooser.h"
#include <unordered_map>
namespace megdnn {
namespace rocm {
namespace convolution {
struct MIOpenCacheKey {
int64_t miopen_handle;
uint32_t batch, IC, IH, IW, OC, OH, OW, FH, FW, SH, SW, PH, PW, DH, DW,
group, ocpg, icpg, dtype_enum;
int exhaustive_search;
std::string to_string_binary() const;
};
//! FIXME: MIOpenCache is used to avoid calling find() and
//! GetWorkSpaceSize() redundantly
template <typename Args, typename ValueType>
class MIOpenCache {
using HashMap = std::unordered_map<std::string, ValueType>;
HashMap m_cache;
std::mutex m_mtx;
public:
MIOpenCache() = default;
~MIOpenCache() noexcept = default;
void set(const Args& args, ValueType val);
std::pair<bool, ValueType> get(const Args& args);
};
using CanonizedFilterMeta = ConvolutionForward::CanonizedFilterMeta;
//! conv size descriptor in the forward view
struct ForwardSizeArgs {
HandleImpl* handle;
const TensorLayout* src_layout;
CanonizedFilterMeta filter_meta;
const TensorLayout* dst_layout;
};
//! whether miopen is supported for a filter meta
bool is_miopen_supported(const ForwardSizeArgs& args);
//! get workspace bundle for matmul algo
WorkspaceBundle matmul_get_workspace_bundle(const ForwardSizeArgs& args);
/*!
* \brief flip conv filter
*
 * Flip the conv filter pointed to by \p raw_ptr, store the result in the
 * workspace, and redirect \p raw_ptr to the workspace.
* */
void flip_filter(const ForwardSizeArgs& args, const Workspace& workspace,
void*& raw_ptr);
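//! Descriptor bundles passed to MIOpen. Each set() builds the tensor, filter
//! and convolution descriptors from megdnn layouts; a GROUP convolution with
//! icpg == 1 and ocpg == 1 is flagged as depthwise when configuring the
//! convolution descriptor.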
struct MIOpenForwardDescs {
TensorDesc src_desc, filter_desc, dst_desc;
ConvDesc conv_desc;
void set(const TensorLayout& src, const CanonizedFilterMeta& filter,
const TensorLayout& dst, const param::Convolution& param) {
src_desc.set(src, param.format);
auto&& group = filter.group;
auto&& ocpg = filter.ocpg;
auto&& icpg = filter.icpg;
auto&& fh = filter.spatial[0];
auto&& fw = filter.spatial[1];
TensorLayout filter_layout{{group * ocpg, icpg, fh, fw}, filter.dtype};
filter_desc.set(filter_layout, param.format);
dst_desc.set(dst, param.format);
bool is_depthwise = param.sparse == param::Convolution::Sparse::GROUP &&
(icpg == 1) && (ocpg == 1);
conv_desc.set(param, filter.group, is_depthwise);
}
};
struct MIOpenBwdDataDescs {
TensorDesc diff_desc, filter_desc, grad_desc;
ConvDesc conv_desc;
void set(const CanonizedFilterMeta& filter, const TensorLayout& diff,
const TensorLayout& grad, const param::Convolution& param) {
auto&& group = filter.group;
auto&& ocpg = filter.ocpg;
auto&& icpg = filter.icpg;
auto&& fh = filter.spatial[0];
auto&& fw = filter.spatial[1];
TensorLayout filter_layout{{group * ocpg, icpg, fh, fw}, filter.dtype};
filter_desc.set(filter_layout, param.format);
diff_desc.set(diff, param.format);
grad_desc.set(grad, param.format);
bool is_depthwise = param.sparse == param::Convolution::Sparse::GROUP &&
(icpg == 1) && (ocpg == 1);
conv_desc.set(param, filter.group, is_depthwise);
}
};
struct MIOpenBwdFilterDescs {
TensorDesc diff_desc, src_desc, grad_desc;
ConvDesc conv_desc;
void set(const TensorLayout& src, const TensorLayout& diff,
const CanonizedFilterMeta& grad, const param::Convolution& param) {
src_desc.set(src, param.format);
diff_desc.set(diff, param.format);
auto&& group = grad.group;
auto&& ocpg = grad.ocpg;
auto&& icpg = grad.icpg;
auto&& fh = grad.spatial[0];
auto&& fw = grad.spatial[1];
TensorLayout grad_layout{{group * ocpg, icpg, fh, fw}, grad.dtype};
grad_desc.set(grad_layout, param.format);
bool is_depthwise = param.sparse == param::Convolution::Sparse::GROUP &&
(icpg == 1) && (ocpg == 1);
conv_desc.set(param, grad.group, is_depthwise);
}
};
//! TODO: MIOpen does not support non-xcorr convolution for now; support is
//! expected in the future.
} // namespace convolution
} // namespace rocm
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file src/rocm/convolution/im2col.cpp.hip
*
* This file is part of MegDNN, a deep neural network run-time library
* developed by Megvii.
*
* \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
*/
#include "./im2col.h.hip"
#include "megdnn/dtype.h"
#include "src/rocm/utils.h.hip"
using namespace megdnn;
using namespace rocm;
namespace {
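// Grid layout: blockIdx.x encodes (ic, fh, fw, oh) while the y/z block
// dimensions together with the 2D thread block cover the (n, ow) plane. The
// col matrix is laid out as (ic*fh*fw, oh*ow*n) with n varying fastest;
// out-of-range input positions (negative ih/iw wrap around as large unsigned
// values) contribute zero padding.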
template <typename T>
__global__ void im2col_kernel(const T* im, T* col, uint32_t N, uint32_t INP_BS,
uint32_t IC, uint32_t IH, uint32_t IW,
uint32_t FH, uint32_t FW, uint32_t OH,
uint32_t OW, uint32_t PH, uint32_t PW,
uint32_t SH, uint32_t SW, uint32_t DH,
uint32_t DW) {
uint32_t n = threadIdx.x + blockIdx.y * blockDim.x;
uint32_t ow = threadIdx.y + blockIdx.z * blockDim.y;
uint32_t oh = blockIdx.x % OH;
uint32_t fw = blockIdx.x / OH % FW;
uint32_t fh = blockIdx.x / OH / FW % FH;
uint32_t ic = blockIdx.x / OH / FW / FH;
if (n < N && ow < OW) {
uint32_t didx = blockIdx.x * OW * N + ow * N + n;
uint32_t ih = -PH + oh * SH + fh * DH;
uint32_t iw = -PW + ow * SW + fw * DW;
col[didx] = (ih < IH && iw < IW
? im[n * INP_BS + ic * IH * IW + ih * IW + iw]
: T(0.0f));
}
}
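// col2im accumulates, for each input element (n, ic, ih, iw), the
// contributions of every (fh, fw) filter tap whose anchor maps back to a
// valid output position; the anchorh/anchorw range checks again rely on
// unsigned wrap-around to reject negative offsets.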
template <typename T>
__global__ void col2im_kernel(const T* col, T* im, uint32_t N, uint32_t INP_BS,
uint32_t IC, uint32_t IH, uint32_t IW,
uint32_t FH, uint32_t FW, uint32_t OH,
uint32_t OW, uint32_t PH, uint32_t PW,
uint32_t SH, uint32_t SW, uint32_t DH,
uint32_t DW) {
uint32_t iw = threadIdx.x + blockIdx.y * blockDim.x;
uint32_t ih = threadIdx.y + blockIdx.z * blockDim.y;
uint32_t ic = blockIdx.x % IC;
uint32_t n = blockIdx.x / IC;
if (iw < IW && ih < IH) {
T res(0);
for (uint32_t fh = 0; fh < FH; ++fh) {
uint32_t anchorh = ih + PH - fh * DH;
if (anchorh < OH * SH && anchorh % SH == 0) {
uint32_t oh = anchorh / SH;
for (uint32_t fw = 0; fw < FW; ++fw) {
uint32_t anchorw = iw + PW - fw * DW;
if (anchorw < OW * SW && anchorw % SW == 0) {
uint32_t ow = anchorw / SW;
res += col[ic * FH * FW * OH * OW * N +
fh * FW * OH * OW * N + fw * OH * OW * N +
oh * OW * N + ow * N + n];
}
}
}
}
im[n * INP_BS + ic * IH * IW + ih * IW + iw] = res;
}
}
} // anonymous namespace
template <typename T>
void convolution::im2col(const T* im, T* col, size_t N, size_t INP_BS,
size_t IC, size_t IH, size_t IW, size_t FH, size_t FW,
size_t OH, size_t OW, size_t PH, size_t PW, size_t SH,
size_t SW, size_t DH, size_t DW, hipStream_t stream) {
dim3 threads(NR_THREADS_X, NR_THREADS_Y);
dim3 blocks(IC * FH * FW * OH, DIVUP(N, NR_THREADS_X),
DIVUP(OW, NR_THREADS_Y));
hipLaunchKernelGGL(im2col_kernel<T>, blocks, threads, 0, stream, im, col, N,
INP_BS, IC, IH, IW, FH, FW, OH, OW, PH, PW, SH, SW, DH,
DW);
after_kernel_launch();
}
template <typename T>
void convolution::col2im(const T* col, T* im, size_t N, size_t INP_BS,
size_t IC, size_t IH, size_t IW, size_t FH, size_t FW,
size_t OH, size_t OW, size_t PH, size_t PW, size_t SH,
size_t SW, size_t DH, size_t DW, hipStream_t stream) {
dim3 threads(NR_THREADS_X, NR_THREADS_Y);
dim3 blocks(N * IC, DIVUP(IW, NR_THREADS_X), DIVUP(IH, NR_THREADS_Y));
hipLaunchKernelGGL(col2im_kernel<T>, blocks, threads, 0, stream, col, im, N,
INP_BS, IC, IH, IW, FH, FW, OH, OW, PH, PW, SH, SW, DH,
DW);
after_kernel_launch();
}
namespace megdnn {
namespace rocm {
namespace convolution {
#define DO_INST(T) \
template void im2col<T>(const T* im, T* col, size_t N, size_t INP_BS, \
size_t IC, size_t IH, size_t IW, size_t FH, \
size_t FW, size_t OH, size_t OW, size_t PH, \
size_t PW, size_t SH, size_t SW, size_t DH, \
size_t DW, hipStream_t stream); \
template void col2im<T>(const T* col, T* im, size_t N, size_t INP_BS, \
size_t IC, size_t IH, size_t IW, size_t FH, \
size_t FW, size_t OH, size_t OW, size_t PH, \
size_t PW, size_t SH, size_t SW, size_t DH, \
size_t DW, hipStream_t stream);
#define INST(_dt) DO_INST(DTypeTrait<_dt>::ctype)
MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(INST);
#undef DO_INST
#undef INST
} // namespace convolution
} // namespace rocm
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file src/rocm/convolution/im2col.h.hip
*
* This file is part of MegDNN, a deep neural network run-time library
* developed by Megvii.
*
* \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
*/
#pragma once
#include "hip_header.h"
namespace megdnn {
namespace rocm {
namespace convolution {
//! col is of shape (ic*fh*fw, oh*ow*n)
template <typename T>
void im2col(const T* im, T* col, size_t N, size_t INP_BS, size_t IC, size_t IH,
size_t IW, size_t FH, size_t FW, size_t OH, size_t OW, size_t PH,
size_t PW, size_t SH, size_t SW, size_t DH, size_t DW, // dilation
hipStream_t stream);
template <typename T>
void col2im(const T* col, T* im, size_t N, size_t INP_BS, size_t IC, size_t IH,
size_t IW, size_t FH, size_t FW, size_t OH, size_t OW, size_t PH,
size_t PW, size_t SH, size_t SW, size_t DH, size_t DW, // dilation
hipStream_t stream);
} // namespace convolution
} // namespace rocm
} // namespace megdnn
// vim: ft=cpp syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/opr_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "hcc_detail/hcc_defs_prologue.h"
#include "./backward_data/algo.h"
#include "./backward_filter/algo.h"
#include "./forward/algo.h"
#include "./opr_impl.h"
#include "src/common/algo_chooser.h"
#include "src/rocm/utils.h"
using namespace megdnn;
using namespace rocm;
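// Stringify the MIOpen version macros so the version can be embedded in the
// algorithm-set names returned below.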
#define TO_STRING2(v) #v
#define TO_STRING(v) TO_STRING2(v)
#define MIOPEN_VERSION_STR \
TO_STRING(MIOPEN_VERSION_MAJOR) \
"." TO_STRING(MIOPEN_VERSION_MINOR) "." TO_STRING(MIOPEN_VERSION_PATCH)
/* ============== ConvolutionForwardImpl ============== */
ConvolutionForwardImpl::Algorithm*
ConvolutionForwardImpl::get_algorithm_heuristic(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst,
size_t workspace_limit_in_bytes,
bool reproducible) {
auto fm = check_layout_fwd(src, filter, dst);
return get_algorithm_heuristic(src, fm, dst, workspace_limit_in_bytes,
reproducible);
}
ConvolutionForwardImpl::Algorithm*
ConvolutionForwardImpl::get_algorithm_heuristic(
const TensorLayout& src, const CanonizedFilterMeta& filter,
const TensorLayout& dst, size_t workspace_limit_in_bytes,
bool reproducible) {
AlgoBase::SizeArgs args(this, src, filter, dst);
    //! MIOpen auto-tuning needs to run with actual tensors, so we cannot get
    //! the best algorithm here.
if (is_miopen_supported(args)) {
auto algo = megdnn::get_reproducible_algo<ConvolutionForwardImpl>(
sm_algo_pack.miopen_algos[0], reproducible);
if (algo)
return algo;
}
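    //! Fallback order when MIOpen is not applicable: the chanwise kernel for
    //! grouped convolutions, the 1x1 matmul algo for small batches, the
    //! batched matmul algo for large batches, and finally a generic search
    //! over the remaining non-MIOpen algorithms.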
if (args.filter_meta.group > 1) {
if (sm_algo_pack.chanwise.is_available_reproducible(
args, reproducible, workspace_limit_in_bytes)) {
return &sm_algo_pack.chanwise;
}
}
auto prefer_1x1 = [&args, reproducible, workspace_limit_in_bytes]() {
const size_t MAX_BATCH_SIZE_FOR_1x1_MAT_ALGO = 4;
size_t batch_size = args.src_layout->shape[0];
if (batch_size > MAX_BATCH_SIZE_FOR_1x1_MAT_ALGO) {
return false;
}
return sm_algo_pack.a1x1.is_available_reproducible(
args, reproducible, workspace_limit_in_bytes);
};
if (prefer_1x1()) {
return &sm_algo_pack.a1x1;
}
auto prefer_1x1_large_batch = [&args, reproducible,
workspace_limit_in_bytes]() {
const size_t MIN_BATCH_SIZE_FOR_1x1_LARGE_BATCH_ALGO = 32;
size_t batch_size = args.src_layout->shape[0];
if (batch_size < MIN_BATCH_SIZE_FOR_1x1_LARGE_BATCH_ALGO) {
return false;
}
return sm_algo_pack.batched_matrix_mul.is_available_reproducible(
args, reproducible, workspace_limit_in_bytes);
};
if (prefer_1x1_large_batch()) {
return &sm_algo_pack.batched_matrix_mul;
}
if (reproducible) {
return megdnn::get_reproducible_algo<ConvolutionForwardImpl>(
sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes,
"rocm conv fwd");
} else {
return megdnn::get_usable_algo<ConvolutionForwardImpl>(
sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes,
"rocm conv fwd");
}
}
std::vector<ConvolutionForwardImpl::Algorithm*>
ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst) {
return megdnn::get_all_algorithms<ConvolutionForwardImpl>(
{this, src, filter, dst});
}
size_t ConvolutionForwardImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst, const PreprocessedFilter*) {
AlgoBase::SizeArgs args(this, src, filter, dst);
return get_algorithm(this, src, args.filter_meta, dst)
->get_workspace_in_bytes(args);
}
void ConvolutionForwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter*,
_megdnn_workspace workspace) {
AlgoBase::ExecArgs args(this, src, filter, dst, workspace);
auto algo = get_algorithm(this, src.layout, args.filter_meta, dst.layout);
algo->check_workspace(args, workspace).exec(args);
}
const char* ConvolutionForwardImpl::get_algorithm_set_name() const {
return "ROCMCONV0+MIOPEN" MIOPEN_VERSION_STR;
}
/* ============== ConvolutionBackwardDataImpl ============== */
void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter,
_megdnn_tensor_in diff,
_megdnn_tensor_out grad,
_megdnn_workspace workspace) {
AlgoBase::ExecArgs args(this, filter, diff, grad, workspace);
auto algo = get_algorithm(this, args.filter_meta, diff.layout, grad.layout);
algo->check_workspace(args, workspace).exec(args);
}
std::vector<ConvolutionBackwardDataImpl::Algorithm*>
ConvolutionBackwardDataImpl::get_all_algorithms(const TensorLayout& filter,
const TensorLayout& diff,
const TensorLayout& grad) {
return megdnn::get_all_algorithms<ConvolutionBackwardDataImpl>(
{this, filter, diff, grad});
}
ConvolutionBackwardDataImpl::Algorithm*
ConvolutionBackwardDataImpl::get_algorithm_heuristic(
const TensorLayout& filter, const TensorLayout& diff,
const TensorLayout& grad, size_t workspace_limit_in_bytes,
bool reproducible) {
auto fm = check_layout_fwd(grad, filter, diff);
return get_algorithm_heuristic(fm, diff, grad, workspace_limit_in_bytes,
reproducible);
}
ConvolutionBackwardDataImpl::Algorithm*
ConvolutionBackwardDataImpl::get_algorithm_heuristic(
const CanonizedFilterMeta& filter, const TensorLayout& diff,
const TensorLayout& grad, size_t workspace_limit_in_bytes,
bool reproducible) {
AlgoBase::SizeArgs args(this, filter, diff, grad);
if (is_miopen_supported(args.as_fwd_args())) {
auto algo = megdnn::get_reproducible_algo<ConvolutionBackwardDataImpl>(
sm_algo_pack.miopen_algos[0], reproducible);
if (algo)
return algo;
}
if (args.filter_meta.group > 1 &&
sm_algo_pack.chanwise.is_available_reproducible(
args, reproducible, workspace_limit_in_bytes)) {
return &sm_algo_pack.chanwise;
}
if (reproducible) {
return megdnn::get_reproducible_algo<ConvolutionBackwardDataImpl>(
sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes,
"rocm conv bwd_data");
} else {
return megdnn::get_usable_algo<ConvolutionBackwardDataImpl>(
sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes,
"rocm conv bwd_data");
}
}
size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes(
const TensorLayout& filter, const TensorLayout& diff,
const TensorLayout& grad) {
AlgoBase::SizeArgs args(this, filter, diff, grad);
return get_algorithm(this, args.filter_meta, diff, grad)
->get_workspace_in_bytes(args);
}
const char* ConvolutionBackwardDataImpl::get_algorithm_set_name() const {
return "ROCMCONV0+MIOPEN" MIOPEN_VERSION_STR;
}
/* ============== ConvolutionBackwardFilterImpl ============== */
void ConvolutionBackwardFilterImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_in diff,
_megdnn_tensor_out grad,
_megdnn_workspace workspace) {
AlgoBase::ExecArgs args(this, src, diff, grad, workspace);
auto algo =
get_algorithm(this, src.layout, diff.layout, args.grad_filter_meta);
algo->check_workspace(args, workspace).exec(args);
}
std::vector<ConvolutionBackwardFilterImpl::Algorithm*>
ConvolutionBackwardFilterImpl::get_all_algorithms(const TensorLayout& src,
const TensorLayout& diff,
const TensorLayout& grad) {
return megdnn::get_all_algorithms<ConvolutionBackwardFilterImpl>(
{this, src, diff, grad});
}
ConvolutionBackwardFilterImpl::Algorithm*
ConvolutionBackwardFilterImpl::get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& diff,
const TensorLayout& grad, size_t workspace_limit_in_bytes,
bool reproducible) {
auto fm = check_layout_fwd(src, grad, diff);
return get_algorithm_heuristic(src, diff, fm, workspace_limit_in_bytes,
reproducible);
}
ConvolutionBackwardFilterImpl::Algorithm*
ConvolutionBackwardFilterImpl::get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& diff,
const CanonizedFilterMeta& grad, size_t workspace_limit_in_bytes,
bool reproducible) {
AlgoBase::SizeArgs args(this, src, diff, grad);
if (is_miopen_supported(args.as_fwd_args())) {
auto algo =
megdnn::get_reproducible_algo<ConvolutionBackwardFilterImpl>(
sm_algo_pack.miopen_algos[0], reproducible);
if (algo)
return algo;
}
if (args.grad_filter_meta.group > 1 &&
sm_algo_pack.chanwise.is_available_reproducible(
args, reproducible, workspace_limit_in_bytes)) {
// prefer special chanwise impl
return &sm_algo_pack.chanwise;
}
if (reproducible) {
return megdnn::get_reproducible_algo<ConvolutionBackwardFilterImpl>(
sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes,
"rocm conv bwd_filter");
} else {
return megdnn::get_usable_algo<ConvolutionBackwardFilterImpl>(
sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes,
"rocm conv bwd_filter");
}
}
size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& diff,
const TensorLayout& grad) {
AlgoBase::SizeArgs args(this, src, diff, grad);
return get_algorithm(this, src, diff, args.grad_filter_meta)
->get_workspace_in_bytes(args);
}
const char* ConvolutionBackwardFilterImpl::get_algorithm_set_name() const {
return "ROCMCONV0+MIOPEN" MIOPEN_VERSION_STR;
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/convolution/opr_impl.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "megdnn/oprs/nn.h"
#include "src/common/utils.h"
namespace megdnn {
namespace rocm {
class ConvolutionForwardImpl : public ConvolutionForward {
public:
using ConvolutionForward::ConvolutionForward;
void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
_megdnn_tensor_out dst,
const PreprocessedFilter* preprocessed_filter,
_megdnn_workspace workspace) override;
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& filter,
const TensorLayout& dst) override;
Algorithm* get_algorithm_heuristic(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst,
size_t workspace_limit_in_bytes,
bool reproducible) override;
Algorithm* get_algorithm_heuristic(const TensorLayout& src,
const CanonizedFilterMeta& filter,
const TensorLayout& dst,
size_t workspace_limit_in_bytes,
bool reproducible);
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& filter,
const TensorLayout& dst,
const PreprocessedFilter*) override;
size_t get_preprocess_workspace_in_bytes(const TensorLayout&,
const TensorLayout&,
const TensorLayout&) override {
return 0;
}
void exec_preprocess(const TensorLayout&, _megdnn_tensor_in,
const TensorLayout&, PreprocessedFilter*,
_megdnn_workspace) override {
megdnn_throw("convolution exec_preprocess has not implemented yet");
}
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const TensorLayout&, const TensorLayout&,
const TensorLayout&) override {
return {};
}
const char* get_algorithm_set_name() const override;
class AlgoBase;
class AlgoMIOpen;
class AlgoMatmul;
class AlgoInplaceMatmul;
class Algo1x1;
class Algo1x1LargeBatch;
class AlgoChanwise;
class AlgoPack;
static const AlgoPack& algo_pack() { return sm_algo_pack; }
private:
static AlgoPack sm_algo_pack;
};
class ConvolutionBackwardDataImpl : public ConvolutionBackwardData {
public:
using ConvolutionBackwardData::ConvolutionBackwardData;
void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff,
_megdnn_tensor_out grad, _megdnn_workspace workspace) override;
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& filter, const TensorLayout& diff,
const TensorLayout& grad) override;
Algorithm* get_algorithm_heuristic(const TensorLayout& filter,
const TensorLayout& diff,
const TensorLayout& grad,
size_t workspace_limit_in_bytes,
bool reproducible) override;
Algorithm* get_algorithm_heuristic(const CanonizedFilterMeta& filter,
const TensorLayout& diff,
const TensorLayout& grad,
size_t workspace_limit_in_bytes,
bool reproducible);
size_t get_workspace_in_bytes(const TensorLayout& filter,
const TensorLayout& diff,
const TensorLayout& grad) override;
const char* get_algorithm_set_name() const override;
class AlgoBase;
class AlgoMIOpen;
class AlgoMatmul;
class AlgoChanwise;
class AlgoPack;
static const AlgoPack& algo_pack() { return sm_algo_pack; }
private:
static AlgoPack sm_algo_pack;
};
class ConvolutionBackwardFilterImpl : public ConvolutionBackwardFilter {
public:
using ConvolutionBackwardFilter::ConvolutionBackwardFilter;
void exec(_megdnn_tensor_in src, _megdnn_tensor_in diff,
_megdnn_tensor_out grad, _megdnn_workspace workspace) override;
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& diff,
const TensorLayout& grad) override;
Algorithm* get_algorithm_heuristic(const TensorLayout& src,
const TensorLayout& diff,
const TensorLayout& grad,
size_t workspace_limit_in_bytes,
bool reproducible) override;
Algorithm* get_algorithm_heuristic(const TensorLayout& src,
const TensorLayout& diff,
const CanonizedFilterMeta& grad,
size_t workspace_limit_in_bytes,
bool reproducible);
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& diff,
const TensorLayout& grad) override;
const char* get_algorithm_set_name() const override;
class AlgoBase;
class AlgoMIOpen;
class AlgoMatmul;
class AlgoChanwise;
class AlgoPack;
static const AlgoPack& algo_pack() { return sm_algo_pack; }
private:
static AlgoPack sm_algo_pack;
};
} // namespace rocm
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/rocm/elemwise/kern_impl.inl
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#ifndef KERN_IMPL_MODE
#error "KERN_IMPL_MODE, KERN_IMPL_ARITY and KERN_IMPL_CTYPE must be defined"
#endif
#include "src/rocm/elemwise/kern_wrapper.h.hip"
namespace megdnn {
namespace rocm {
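// For every elemwise mode enabled via KERN_IMPL_MODE, instantiate the kernel
// functor for the configured arity/ctype and emit the corresponding
// run_elemwise instantiation through INST_RUN_ELEMWISE.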
#define cb(_mode) \
typedef ElemwiseKern<megcorePlatformROCM, \
param_enumv::Elemwise::Mode::_mode, KERN_IMPL_CTYPE> \
KernImpl##_mode; \
typedef ElemArithKernWrapper<KERN_IMPL_ARITY, KernImpl##_mode> \
Wrapper##_mode; \
INST_RUN_ELEMWISE(Wrapper##_mode, KERN_IMPL_CTYPE, KERN_IMPL_ARITY);
KERN_IMPL_MODE(cb)
} // namespace rocm
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file src/rocm/elemwise/kern_wrapper.h.hip
*
* This file is part of MegDNN, a deep neural network run-time library
* developed by Megvii.
*
* \brief helper for implementing elemwise oprs
*
* \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
*/
#pragma once
#include "src/rocm/elemwise_helper.h.hip"
#include "src/common/elemwise/kern_defs.cuh"
namespace megdnn {
namespace rocm {
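//! Wraps an elemwise kernel functor so the generic elemwise launcher can call
//! it per output index; specialized for arities 1 to 3. The device-side
//! operator() is only compiled when MEGDNN_CC_CUDA is defined, i.e. during
//! device compilation.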
template <int arity, class KernImpl>
struct ElemArithKernWrapper;
template <class KernImpl>
struct ElemArithKernWrapper<1, KernImpl> {
typedef typename KernImpl::ctype ctype;
ctype* dst;
#if MEGDNN_CC_CUDA
__device__ void operator()(uint32_t idx, ctype x) {
dst[idx] = KernImpl::apply(x);
}
#endif
};
template <class KernImpl>
struct ElemArithKernWrapper<2, KernImpl> {
typedef typename KernImpl::ctype ctype;
ctype* dst;
#if MEGDNN_CC_CUDA
__device__ void operator()(uint32_t idx, ctype x, ctype y) {
dst[idx] = KernImpl::apply(x, y);
}
#endif
};
template <class KernImpl>
struct ElemArithKernWrapper<3, KernImpl> {
typedef typename KernImpl::ctype ctype;
ctype* dst;
#if MEGDNN_CC_CUDA
__device__ void operator()(uint32_t idx, ctype x, ctype y, ctype z) {
dst[idx] = KernImpl::apply(x, y, z);
}
#endif
};
} // namespace rocm
} // namespace megdnn
// vim: ft=cpp syntax=cpp.doxygen
// generated by gen_elemwise_kern_impls.py
#if !MEGDNN_DISABLE_FLOAT16
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_float16
#include "../kern_impl.inl"
#endif
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_float32
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_int16
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_int32
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_int8
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS_GRAD, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_uint8
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#if !MEGDNN_DISABLE_FLOAT16
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_float16
#include "../kern_impl.inl"
#endif
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_float32
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_int16
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_int32
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_int8
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_uint8
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#if !MEGDNN_DISABLE_FLOAT16
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ACOS, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_float16
#include "../kern_impl.inl"
#endif
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ACOS, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_float32
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#if !MEGDNN_DISABLE_FLOAT16
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_float16
#include "../kern_impl.inl"
#endif
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_float32
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_int16
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_int32
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_int8
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_uint8
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#if !MEGDNN_DISABLE_FLOAT16
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ASIN, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_float16
#include "../kern_impl.inl"
#endif
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ASIN, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_float32
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#if !MEGDNN_DISABLE_FLOAT16
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ATAN2, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_float16
#include "../kern_impl.inl"
#endif
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ATAN2, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_float32
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#if !MEGDNN_DISABLE_FLOAT16
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(CEIL, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_float16
#include "../kern_impl.inl"
#endif
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(CEIL, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_float32
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#if !MEGDNN_DISABLE_FLOAT16
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb)
#define KERN_IMPL_ARITY 3
#define KERN_IMPL_CTYPE dt_float16
#include "../kern_impl.inl"
#endif
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb)
#define KERN_IMPL_ARITY 3
#define KERN_IMPL_CTYPE dt_float32
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb)
#define KERN_IMPL_ARITY 3
#define KERN_IMPL_CTYPE dt_int16
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb)
#define KERN_IMPL_ARITY 3
#define KERN_IMPL_CTYPE dt_int32
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb)
#define KERN_IMPL_ARITY 3
#define KERN_IMPL_CTYPE dt_int8
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COND_LEQ_MOV, cb)
#define KERN_IMPL_ARITY 3
#define KERN_IMPL_CTYPE dt_uint8
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#if !MEGDNN_DISABLE_FLOAT16
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COS, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_float16
#include "../kern_impl.inl"
#endif
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(COS, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_float32
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#if !MEGDNN_DISABLE_FLOAT16
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_float16
#include "../kern_impl.inl"
#endif
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_float32
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_int16
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_int32
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_int8
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb)
#define KERN_IMPL_ARITY 2
#define KERN_IMPL_CTYPE dt_uint8
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#if !MEGDNN_DISABLE_FLOAT16
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFCINV, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_float16
#include "../kern_impl.inl"
#endif
// generated by gen_elemwise_kern_impls.py
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFCINV, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_float32
#include "../kern_impl.inl"
// generated by gen_elemwise_kern_impls.py
#if !MEGDNN_DISABLE_FLOAT16
#define KERN_IMPL_MODE(cb) MEGDNN_ELEMWISE_MODE_ENABLE(ERFC, cb)
#define KERN_IMPL_ARITY 1
#define KERN_IMPL_CTYPE dt_float16
#include "../kern_impl.inl"
#endif