/** * \file src/jit/impl/mlir/executable_cuda.cpp * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. */ #include "megbrain_build_config.h" #if MGB_JIT && MGB_JIT_MLIR #if MGB_CUDA #include "./executable_cuda.h" #include "./ir/types.h" #include "megbrain/comp_node_env.h" #include "megbrain/jit/mlir/ir/utils.h" #include "megbrain/utils/persistent_cache.h" #include "megbrain/utils/timer.h" #include "megdnn/dtype.h" #include #include #include #include using namespace mgb; using namespace jit; namespace { template void setup_and_launch(const JITExecutor* fusion_opr, CUfunction func, int block_size) { auto&& args = fusion_opr->args(); std::vector> param_holders; std::vector params; auto set_params = [¶m_holders, ¶ms]( void* ptr, const megdnn::TensorLayout& layout) { param_holders.push_back(StridedMemRefType{}); StridedMemRefType& desc = param_holders.back(); desc.basePtr = static_cast(ptr); params.push_back(&(desc.basePtr)); desc.data = static_cast(ptr); params.push_back(&(desc.data)); desc.offset = 0; params.push_back(&(desc.offset)); for (size_t i = 0; i < layout.ndim; i++) { desc.sizes[i] = layout.shape[i]; params.push_back(&(desc.sizes[i])); desc.strides[i] = layout.stride[i]; params.push_back(&(desc.strides[i])); } }; for (const auto& arg : args.inputs) { set_params(arg.from->dev_tensor().raw_ptr(), arg.from->layout()); } int64_t nr_elements = 0; for (const auto& arg : args.outputs) { if (nr_elements == 0) { nr_elements = arg.from->layout().total_nr_elems(); } else { mgb_assert(static_cast(nr_elements) == arg.layout.total_nr_elems(), "The number of elements of outputs mismatch, expected: " "%zu got: %zu(%s)", static_cast(nr_elements), arg.from->layout().total_nr_elems(), arg.from->layout().to_string().c_str()); } set_params(arg.from->dev_tensor().raw_ptr(), arg.from->layout()); } const CompNodeEnv& env = CompNodeEnv::from_comp_node(fusion_opr->comp_node()); int64_t num_block = (nr_elements - 1) / block_size + 1; params.push_back(&nr_elements); MGB_CUDA_CU_CHECK(cuLaunchKernel(func, num_block, 1, 1, block_size, 1, 1, 0, env.cuda_env().stream, params.data(), 0)); } template void setup_and_launch_dim(const megdnn::DType dtype, const JITExecutor* fusion_opr, CUfunction func, int block_size) { switch (dtype.enumv()) { #define cb(_dtype, _type) \ case megdnn::DTypeEnum::_dtype: \ setup_and_launch(fusion_opr, func, block_size); \ return; FOR_EACH_DNN_DTYPE(cb) #undef cb default: mgb_throw(InternalError, "Unsupported dtype: %s", dtype.name()); } return; } } // namespace const std::string MLIRCUDAExecutable::sm_blob_annotation = "nvvm.cubin"; MLIRCUDAExecutable::MLIRCUDAExecutable(mlir::OwningModuleRef& module, const std::string& kernel_name) { m_kernel_name = kernel_name + "_kernel"; auto kernel_module = module->lookupSymbol(m_kernel_name); mgb_assert(kernel_module, "Expected gpu kernel module"); auto binary_attr = kernel_module.getAttrOfType( llvm::StringRef(sm_blob_annotation)); mgb_assert(binary_attr, "Missing %s attribute in gpu kernel module", sm_blob_annotation.c_str()); m_kernel_data = binary_attr.getValue().str(); } void MLIRCUDAExecutable::execute(JITExecutor* fusion_opr) { FuncCache* func; auto cn = fusion_opr->comp_node(); auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; func = &m_func_cache[{prop.major, prop.minor}]; func->kernel_data = m_kernel_data; func->exec(fusion_opr, this); } MLIRCUDAExecutable::~MLIRCUDAExecutable() {} void MLIRCUDAExecutable::FuncCache::exec(const JITExecutor* fusion_opr, const MLIRCUDAExecutable* cuda_exe) { Func* func; { MGB_LOCK_GUARD(mtx); auto ins = cn2func.insert({fusion_opr->comp_node(), {}}); func = &ins.first->second; if (ins.second) { MGB_CUDA_CU_CHECK( cuModuleLoadData(&func->module, kernel_data.data())); MGB_CUDA_CU_CHECK( cuModuleGetFunction(&func->func, func->module, cuda_exe->m_kernel_name.c_str())); int min_grid_size = 0; MGB_CUDA_CU_CHECK(cuOccupancyMaxPotentialBlockSize( &min_grid_size, &func->block_size, func->func, nullptr, 0, 0)); } } mgb_assert(fusion_opr->args().outputs.size() == 1, "Currently only support 1 outputs, got %zu", fusion_opr->args().outputs.size()); int out_dim = fusion_opr->args().outputs[0].from->layout().ndim; DType dtype = fusion_opr->args().outputs[0].from->layout().dtype; switch (out_dim) { #define cb(_ndim) \ case _ndim: \ setup_and_launch_dim<_ndim>(dtype, fusion_opr, func->func, \ func->block_size); \ break; cb(1); cb(2); cb(3); cb(4); #undef cb } } #endif // MGB_CUDA #endif // MGB_JIT && MGB_JIT_MLIR // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}