Commit 11b121a7 authored by Megvii Engine Team

fix(mgb/jit): link libdevice.bc when generate nvvm ir

GitOrigin-RevId: 49289d65c4d627964c0d53a48f5f911db98012f6
Parent aa7f28b8
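The MLIR JIT lowers GPU kernels to NVVM IR, where math ops become calls into NVIDIA's libdevice (e.g. __nv_expf). Previously the bare mlir::translateModuleToNVVMIR callback was passed to createConvertGPUKernelToBlobPass, so those libdevice symbols stayed undefined when the module was compiled to PTX. This commit wraps the translation in translate_module_to_nvvm_ir_and_link_device, which locates libdevice.bc (via CUDA_BIN_PATH, or relative to the directory of libcudart.so) and links it into the module. The core linking step looks roughly like the sketch below; this is an illustration that assumes the libdevice path is already resolved, not the exact code from the diff:

#include <memory>
#include <string>
#include <llvm/IR/Module.h>
#include <llvm/IRReader/IRReader.h>
#include <llvm/Linker/Linker.h>
#include <llvm/Support/SourceMgr.h>

// Sketch: parse libdevice.bc and link only the needed definitions into the
// module produced from NVVM IR. Returns true on success.
bool link_libdevice(llvm::Module& module, const std::string& libdevice_path) {
    llvm::SMDiagnostic err;
    std::unique_ptr<llvm::Module> libdevice =
            llvm::parseIRFile(libdevice_path, err, module.getContext());
    if (!libdevice)
        return false;  // bitcode file missing or unparsable
    // keep triple/data layout consistent with the destination module
    libdevice->setTargetTriple(module.getTargetTriple());
    libdevice->setDataLayout(module.getDataLayout());
    // llvm::Linker::linkModules returns true on error; LinkOnlyNeeded pulls
    // in only the __nv_* functions actually referenced by the kernel
    return !llvm::Linker::linkModules(module, std::move(libdevice),
                                      llvm::Linker::Flags::LinkOnlyNeeded);
}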
@@ -10,6 +10,7 @@
  * implied.
  */
+#include "llvm/Pass.h"
 #include "megbrain_build_config.h"
 #if MGB_JIT && MGB_JIT_MLIR
@@ -21,6 +22,7 @@
 #include "megbrain/comp_node_env.h"
 #include "megbrain/jit/mlir/ir/dialect.h"
 #include "megbrain/jit/mlir/ir/passes.h"
+#include "megbrain/utils/timer.h"
 #include <mlir/Conversion/GPUCommon/GPUCommonPass.h>
 #include <mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h>
@@ -36,6 +38,11 @@
 #include <mlir/Transforms/Passes.h>
 #include <llvm/Support/TargetSelect.h>
+#include <llvm/IRReader/IRReader.h>
+#include <llvm/Linker/Linker.h>
+#include <dlfcn.h>
+#include <dirent.h>
 using namespace mgb;
 using namespace jit;
@@ -59,6 +66,61 @@ mlir::OwnedBlob compile_ptx_to_cubin(const std::string ptx, mlir::Location,
     return result;
 }
+std::unique_ptr<llvm::Module> translate_module_to_nvvm_ir_and_link_device(
+        Operation* m) {
+    std::unique_ptr<llvm::Module> module = mlir::translateModuleToNVVMIR(m);
+    auto get_device_path = []() -> std::string {
+        auto cuda_path = getenv("CUDA_BIN_PATH");
+        std::string device_dir;
+        if (!cuda_path) {
+            char cuda_lib_path[PATH_MAX];
+            auto handle = dlopen("libcudart.so", RTLD_GLOBAL | RTLD_LAZY);
+            mgb_assert(handle != nullptr, "%s", dlerror());
+            mgb_assert(dlinfo(handle, RTLD_DI_ORIGIN, &cuda_lib_path) != -1,
+                       "%s", dlerror());
+            device_dir =
+                    std::string(cuda_lib_path) + "/../../../nvvm/libdevice/";
+            mgb_assert(!dlclose(handle), "fail to dlclose handle");
+        } else {
+            device_dir = std::string(cuda_path) + "/nvvm/libdevice/";
+        }
+        DIR* dirp;
+        struct dirent* directory;
+        dirp = opendir(device_dir.c_str());
+        if (dirp) {
+            while ((directory = readdir(dirp)) != nullptr) {
+                if (!strncmp(directory->d_name, "libdevice", 9)) {
+                    closedir(dirp);
+                    return device_dir + std::string(directory->d_name);
+                }
+            }
+            closedir(dirp);
+        }
+        return {};
+    };
+    //! load libdevice.bc
+    llvm::SMDiagnostic err;
+    auto libdevice_path = get_device_path();
+    std::unique_ptr<llvm::Module> mlib = llvm::parseIRFile(
+            libdevice_path.c_str(), err, module->getContext());
+    if (mlib.get()) {
+        mlib->setTargetTriple(module->getTargetTriple());
+        mlib->setDataLayout(module->getDataLayout());
+        RealTimer timer;
+        mgb_assert(
+                !llvm::Linker::linkModules(*module, std::move(mlib),
+                                           llvm::Linker::Flags::LinkOnlyNeeded),
+                "failed to parse ir file libdevice.bc");
+        mgb_log("MLIR JIT: link libdevice.bc, used: %.3fms", timer.get_msecs());
+    } else {
+        mgb_log_warn("Fail to load bitcode file %s", libdevice_path.c_str());
+    }
+    return module;
+}
 #endif
 void add_cpu_lowering_pass(mlir::PassManager& manager) {
@@ -80,7 +142,8 @@ void add_cpu_lowering_pass(mlir::PassManager& manager) {
 }
 #if MGB_CUDA
-void add_cuda_lowering_pass(mlir::PassManager& manager, CompNode cn) {
+void add_cuda_lowering_pass(mlir::PassManager& manager,
+                            const std::string& target_chip) {
     {
         mlir::OpPassManager& opt_pm = manager.nest<mlir::FuncOp>();
         opt_pm.addPass(mlir::createCanonicalizerPass());
@@ -99,12 +162,10 @@ void add_cuda_lowering_pass(mlir::PassManager& manager, CompNode cn) {
         auto& kernel_pm = manager.nest<gpu::GPUModuleOp>();
         kernel_pm.addPass(mlir::createLowerGpuOpsToNVVMOpsPass());
-        auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
         kernel_pm.addPass(mlir::createConvertGPUKernelToBlobPass(
-                mlir::translateModuleToNVVMIR, compile_ptx_to_cubin,
-                "nvptx64-nvidia-cuda",
-                ssprintf("sm_%d%d", prop.major, prop.minor), "+ptx60",
-                MLIRCUDAExecutable::sm_blob_annotation));
+                translate_module_to_nvvm_ir_and_link_device,
+                compile_ptx_to_cubin, "nvptx64-nvidia-cuda", target_chip,
+                "+ptx60", MLIRCUDAExecutable::sm_blob_annotation));
     }
 }
 #endif
@@ -134,21 +195,29 @@ void MLIRCompiler::run_lowering_pass(mlir::OwningModuleRef& module,
                                      CompNode cn) {
     mgb_assert(cn.device_type() == m_device_type);
     mlir::PassManager manager(module->getContext());
+    std::string target_chip;
     switch (m_device_type) {
         case CompNode::DeviceType::CPU:
             add_cpu_lowering_pass(manager);
             break;
 #if MGB_CUDA
-        case CompNode::DeviceType::CUDA:
-            add_cuda_lowering_pass(manager, cn);
+        case CompNode::DeviceType::CUDA: {
+            auto&& prop =
+                    CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
+            std::string target_chip =
+                    ssprintf("sm_%d%d", prop.major, prop.minor);
+            add_cuda_lowering_pass(manager, target_chip);
             break;
+        }
 #endif
         default:
             mgb_throw(InternalError, "Unsupport device type: %d",
                       static_cast<int>(m_device_type));
             break;
     }
+    RealTimer timer;
     mgb_assert(mlir::succeeded(manager.run(*module)));
+    mgb_log("MLIR JIT: run lowering pass used: %.3f ms", timer.get_msecs());
 }
 std::unique_ptr<Executable> MLIRCompiler::do_compile(
...
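The target chip string built above is just the device's compute capability rendered as sm_<major><minor>; major 7 and minor 5 yield "sm_75". For reference, a standalone sketch of that mapping using the plain CUDA runtime API (get_target_chip is a hypothetical helper, not part of the diff; the commit derives the same values from CompNodeEnv's cached device_prop):

#include <cstdio>
#include <string>
#include <cuda_runtime.h>

// Sketch only: turn a device's compute capability into the PTX target chip
// string, mirroring ssprintf("sm_%d%d", prop.major, prop.minor) above.
std::string get_target_chip(int device) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);
    char buf[16];
    std::snprintf(buf, sizeof(buf), "sm_%d%d", prop.major, prop.minor);
    return buf;  // e.g. "sm_75" on an RTX 2080 (compute capability 7.5)
}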
@@ -66,7 +66,6 @@ mlir::Value ValueBuilderHelper::const_val(float val) {
 }
 cb(neg, NegFOp);
-cb(abs, AbsFOp);
 cb(ceil, CeilFOp);
 cb(cos, CosOp);
 cb(exp, ExpOp);
@@ -79,6 +78,10 @@ cb(sqrt, SqrtOp);
 cb(tanh, TanhOp);
 #undef cb
+mlir::Value ValueBuilderHelper::abs(mlir::Value lhs) {
+    return max(lhs, const_val(0.f));
+}
 mlir::Value ValueBuilderHelper::floor(mlir::Value lhs) {
     //! FIXME use standard floor when upgrade llvm
     return neg(ceil(neg(lhs)));
...
@@ -266,9 +266,6 @@ public:
         target.addLegalDialect<gpu::GPUDialect>();
         target.addIllegalDialect<MgbDialect>();
-        patterns.insert<AddOpLowering, AssignOpLowering, ReturnOpLowering>(
-                &getContext(), &launch_op);
 #define cb(_op, _) _op##Lowering,
         patterns.insert<MLIR_MGB_FOREACH_ELEMWISE_MODE_UNARY(
                 cb) MLIR_MGB_FOREACH_ELEMWISE_MODE_BINARY(cb)
...
@@ -137,7 +137,7 @@ void run_mlir(CompNode cn) {
          b = opr::Host2DeviceCopy::make(*graph, host_x1),
          c = opr::Host2DeviceCopy::make(*graph, host_x2);
-    auto y = a + b + c;
+    auto y = a + b * c;
     auto ig_gen =
             std::make_unique<InternalGraphGenerator>(y.node()->owner_opr());
@@ -273,6 +273,20 @@ TYPED_TEST(TestJITMlirUnaryElemwise, run) {
     run_mlir_mode<TypeParam, 1>(cn);
 }
+#define SKIP_MODE(_mode)                                     \
+    if (TypeParam::mode == opr::Elemwise::Mode::_mode) {     \
+        printf("skip\n");                                    \
+        return;                                              \
+    }
+TYPED_TEST(TestJITMlirUnaryElemwise, runGpu) {
+    REQUIRE_GPU(1);
+    auto cn = CompNode::load("gpu0");
+    SKIP_MODE(SIN);
+    run_mlir_mode<TypeParam, 1>(cn);
+}
 ///////////////////////// binary ///////////////////////////////
 // clang-format off
 #define FOREACH_BINARY_MODE(cb) \
@@ -319,6 +333,12 @@ TYPED_TEST(TestJITMlirBinaryElemwise, run) {
     run_mlir_mode<TypeParam, 2>(cn);
 }
+TYPED_TEST(TestJITMlirBinaryElemwise, runGpu) {
+    REQUIRE_GPU(1);
+    auto cn = CompNode::load("gpu0");
+    run_mlir_mode<TypeParam, 2>(cn);
+}
 ///////////////////////// ternary ///////////////////////////////
 // clang-format off
 #define FOREACH_TERNARY_MODE(cb) \
@@ -345,6 +365,14 @@ TYPED_TEST(TestJITMlirTernaryElemwise, run) {
     run_mlir_mode<TypeParam, 3>(cn);
 }
+TYPED_TEST(TestJITMlirTernaryElemwise, runGpu) {
+    REQUIRE_GPU(1);
+    auto cn = CompNode::load("gpu0");
+    run_mlir_mode<TypeParam, 3>(cn);
+}
+#undef SKIP_MODE
 #endif
 #endif  // MGB_JIT
...