Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
38ea5f1b
MegEngine
项目概览
MegEngine 天元
/
MegEngine
大约 1 年 前同步成功
通知
399
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
38ea5f1b
编写于
12月 22, 2020
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix(mgb/jit): lower FuncOp to GPUFuncOp directly
GitOrigin-RevId: d7c9c0f54805f47dd8133c7432da4f0a3a1aafc7
上级
4cfedc16
变更
4
显示空白变更内容
内联
并排
Showing
4 changed files
with
40 additions
and
372 deletions
+40
-372
src/jit/impl/mlir/compiler.cpp
src/jit/impl/mlir/compiler.cpp
+6
-11
src/jit/impl/mlir/ir/create_gpu_kernel_outlining_pass.cpp
src/jit/impl/mlir/ir/create_gpu_kernel_outlining_pass.cpp
+0
-338
src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp
src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp
+34
-15
src/jit/include/megbrain/jit/mlir/ir/passes.h
src/jit/include/megbrain/jit/mlir/ir/passes.h
+0
-8
未找到文件。
src/jit/impl/mlir/compiler.cpp
浏览文件 @
38ea5f1b
...
...
@@ -149,22 +149,17 @@ void add_cuda_lowering_pass(mlir::PassManager& manager,
mlir
::
OpPassManager
&
opt_pm
=
manager
.
nest
<
mlir
::
FuncOp
>
();
opt_pm
.
addPass
(
mlir
::
createCanonicalizerPass
());
opt_pm
.
addPass
(
mlir
::
createCSEPass
());
opt_pm
.
addPass
(
mlir
::
createLoopFusionPass
());
opt_pm
.
addPass
(
mlir
::
createMemRefDataFlowOptPass
());
}
manager
.
addPass
(
create_lower_to_gpu_pass
());
{
mlir
::
OpPassManager
&
opt_pm
=
manager
.
nest
<
mlir
::
FuncOp
>
();
opt_pm
.
addPass
(
create_lower_to_gpu_pass
());
mlir
::
OpPassManager
&
opt_pm
=
manager
.
nest
<
gpu
::
GPUModuleOp
>
();
opt_pm
.
addPass
(
mlir
::
createLowerToCFGPass
());
opt_pm
.
addPass
(
mlir
::
createCanonicalizerPass
());
opt_pm
.
addPass
(
mlir
::
createCSEPass
());
opt_pm
.
addPass
(
mlir
::
createLoopFusionPass
());
opt_pm
.
addPass
(
mlir
::
createMemRefDataFlowOptPass
());
}
manager
.
addPass
(
create_gpu_kernel_outlining_pass
());
{
auto
&
kernel_pm
=
manager
.
nest
<
gpu
::
GPUModuleOp
>
();
kernel_pm
.
addPass
(
mlir
::
createLowerGpuOpsToNVVMOpsPass
());
kernel_pm
.
addPass
(
mlir
::
createConvertGPUKernelToBlobPass
(
opt_pm
.
addPass
(
mlir
::
createLowerGpuOpsToNVVMOpsPass
());
opt_pm
.
addPass
(
mlir
::
createConvertGPUKernelToBlobPass
(
translate_module_to_nvvm_ir_and_link_device
,
compile_ptx_to_cubin
,
"nvptx64-nvidia-cuda"
,
target_chip
,
"+ptx60"
,
MLIRCUDAExecutable
::
sm_blob_annotation
));
...
...
src/jit/impl/mlir/ir/create_gpu_kernel_outlining_pass.cpp
已删除
100644 → 0
浏览文件 @
4cfedc16
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the GPU dialect kernel outlining pass.
//
//===----------------------------------------------------------------------===//
/**
* \file src/jit/impl/mlir/ir/create_gpu_kernel_outlining_pass.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*
* This file has been modified by Megvii ("Megvii Modifications").
* All Megvii Modifications are Copyright (C) 2014-2020 Megvii Inc. All rights
* reserved.
*
*/
#include "megbrain_build_config.h"
#if MGB_JIT && MGB_JIT_MLIR
#include "megbrain/jit/mlir/ir/passes.h"
#include <mlir/Dialect/GPU/GPUDialect.h>
#include <mlir/Dialect/GPU/Passes.h>
#include <mlir/Dialect/GPU/Utils.h>
#include <mlir/Dialect/StandardOps/IR/Ops.h>
#include <mlir/IR/BlockAndValueMapping.h>
#include <mlir/IR/Builders.h>
#include <mlir/IR/SymbolTable.h>
#include <mlir/Transforms/RegionUtils.h>
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
using
namespace
mlir
;
namespace
{
// Emit one `OpTy` operation per GPU dimension ("x", "y", "z"), in that
// order, and append each produced index value to `values`.
template <typename OpTy>
static void createForAllDimensions(OpBuilder& builder, Location loc,
                                   SmallVectorImpl<Value>& values) {
    static constexpr const char* kDimNames[] = {"x", "y", "z"};
    for (const char* dim_name : kDimNames) {
        // Each op takes the dimension name as a string attribute and yields
        // an index-typed value.
        values.push_back(builder.create<OpTy>(loc, builder.getIndexType(),
                                              builder.getStringAttr(dim_name)));
    }
}
// Add operations generating block/thread ids and grid/block dimensions at the
// beginning of the `launchFuncOpBody` region, and record in `map` the mapping
// from each leading argument of `launchOpBody`'s entry block to the
// corresponding newly created index value.
static void injectGpuIndexOperations(Location loc, Region& launchFuncOpBody,
                                     Region& launchOpBody,
                                     BlockAndValueMapping& map) {
    OpBuilder builder(loc->getContext());
    Block& launchEntry = launchOpBody.front();
    builder.setInsertionPointToStart(&launchFuncOpBody.front());

    // 12 values total: {blockIdx, threadIdx, gridDim, blockDim} x {x, y, z},
    // created in exactly this order to line up with the launch region args.
    SmallVector<Value, 12> indexValues;
    createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexValues);
    createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexValues);
    createForAllDimensions<gpu::GridDimOp>(builder, loc, indexValues);
    createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexValues);

    // Map the i-th launch-region argument to the i-th generated index value.
    for (unsigned i = 0; i < indexValues.size(); ++i)
        map.map(launchEntry.getArgument(i), indexValues[i]);
}
// An operation is worth sinking into the gpu.launch region iff it is a cheap
// value producer that can be rematerialized inside the kernel: constants and
// memref dimension queries.
static bool isSinkingBeneficiary(Operation* op) {
    if (isa<ConstantOp>(op))
        return true;
    return isa<DimOp>(op);
}
// Clone cheap defining operations (see isSinkingBeneficiary) of values used
// inside `launchOp`'s body into the launch region, so that the outlined kernel
// does not need to capture them as arguments. Uses inside the launch op are
// redirected to the clones; the original defs outside are left untouched.
// Returns failure() if a cyclic dependency prevents ordering the clones.
LogicalResult sink_operations_into_launch_op(gpu::LaunchOp launchOp) {
    Region& launchOpBody = launchOp.body();

    // Identify uses from values defined outside of the scope of the launch
    // operation.
    llvm::SetVector<Value> sinkCandidates;
    getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);

    // Select the subset of candidates whose defining op is beneficial to sink
    // and whose operands are themselves all candidates (so sinking does not
    // introduce new outside references).
    llvm::SetVector<Value> sunkValues;
    llvm::SetVector<Operation*> sunkOperations;
    for (Value operand : sinkCandidates) {
        Operation* operandOp = operand.getDefiningOp();
        if (!operandOp || !isSinkingBeneficiary(operandOp))
            continue;
        // Only sink operations that do not create new sinkCandidates.
        if (!llvm::all_of(operandOp->getOperands(),
                          [&sinkCandidates](Value value) {
                              return sinkCandidates.count(value);
                          }))
            continue;
        sunkValues.insert(operand);
        sunkOperations.insert(operandOp);
    }

    // Insert operations so that the defs get cloned before uses: repeatedly
    // sweep the worklist, cloning only ops whose sunk operands already have
    // clones recorded in `map`.
    BlockAndValueMapping map;
    OpBuilder builder(launchOpBody);
    DenseSet<Operation*> processed;
    while (processed.size() != sunkOperations.size()) {
        auto startSize = processed.size();
        for (Operation* sunkOperation : sunkOperations) {
            if (processed.count(sunkOperation))
                continue;

            // Operation can't be cloned yet if any of its operands is also
            // being sunk, but isn't cloned yet.
            if (llvm::any_of(sunkOperation->getOperands(),
                             [&sunkValues, &map](Value value) {
                                 return sunkValues.count(value) &&
                                        !map.lookupOrNull(value);
                             }))
                continue;

            Operation* clonedOp = builder.clone(*sunkOperation, map);
            // Only replace uses within the launch op.
            for (auto result :
                 llvm::enumerate(sunkOperation->getResults())) {
                auto replacement = clonedOp->getResult(result.index());
                for (auto& use : llvm::make_early_inc_range(
                             result.value().getUses()))
                    if (use.getOwner()->getParentOfType<gpu::LaunchOp>() ==
                        launchOp)
                        use.set(replacement);
            }
            processed.insert(sunkOperation);
        }
        // No progress in a full sweep means the remaining ops depend on each
        // other cyclically and can never be ordered.
        if (startSize == processed.size())
            return launchOp.emitError(
                    "found illegal cyclic dependency between operations while "
                    "sinking");
    }
    return success();
}
// Outline the `gpu.launch` operation body into a kernel function. Replace
// `gpu.terminator` operations by `gpu.return` in the generated function.
//
// On return, `operands` holds the values captured from outside the launch
// region, ordered so that captures that are arguments of the enclosing FuncOp
// come first (in the FuncOp's argument order), followed by the remaining
// captures. The created gpu.func takes these as its parameters and carries
// the gpu.kernel attribute. The function is NOT inserted anywhere; the caller
// is responsible for placing it into a symbol table.
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
                                            StringRef kernelFnName,
                                            SmallVector<Value, 4>& operands) {
    Location loc = launchOp.getLoc();
    // Create a builder with no insertion point, insertion will happen
    // separately due to symbol table manipulation.
    OpBuilder builder(launchOp.getContext());
    Region& launchOpBody = launchOp.body();

    llvm::SetVector<Value> operandsSet;
    // Identify uses from values defined outside of the scope of the launch
    // operation.
    getUsedValuesDefinedAbove(launchOpBody, operandsSet);

    // Reorder the operands to match the input (enclosing FuncOp argument)
    // order: captured FuncOp arguments first, then everything else.
    llvm::SetVector<Value> insertedOperands;
    for (auto& item : launchOp.getParentOfType<FuncOp>().getArguments()) {
        if (operandsSet.contains(item)) {
            operands.push_back(item);
            insertedOperands.insert(item);
        }
    }
    // Append the remaining captured values that are not FuncOp arguments.
    for (Value operand : operandsSet) {
        if (!insertedOperands.contains(operand)) {
            operands.push_back(operand);
        }
    }

    // Create the gpu.func operation.
    SmallVector<Type, 4> kernelOperandTypes;
    kernelOperandTypes.reserve(operands.size());
    for (Value operand : operands) {
        kernelOperandTypes.push_back(operand.getType());
    }
    // Kernel returns nothing; results flow out via memref arguments.
    FunctionType type =
            FunctionType::get(kernelOperandTypes, {}, launchOp.getContext());
    auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFnName, type);
    // Mark the function as a GPU kernel entry point.
    outlinedFunc.setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                         builder.getUnitAttr());
    BlockAndValueMapping map;

    // Map the arguments corresponding to the launch parameters like blockIdx,
    // threadIdx, etc.
    Region& outlinedFuncBody = outlinedFunc.body();
    injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);

    // Map arguments from gpu.launch region to the arguments of the gpu.func
    // operation.
    Block& entryBlock = outlinedFuncBody.front();
    for (auto operand : enumerate(operands))
        map.map(operand.value(), entryBlock.getArgument(operand.index()));

    // Clone the region of the gpu.launch operation into the gpu.func
    // operation.
    // TODO: If cloneInto can be modified such that if a mapping for
    // a block exists, that block will be used to clone operations into (at the
    // end of the block), instead of creating a new block, this would be much
    // cleaner.
    launchOpBody.cloneInto(&outlinedFuncBody, map);

    // Branch from entry of the gpu.func operation to the block that is cloned
    // from the entry block of the gpu.launch operation.
    Block& launchOpEntry = launchOpBody.front();
    Block* clonedLaunchOpEntry = map.lookup(&launchOpEntry);
    builder.setInsertionPointToEnd(&entryBlock);
    builder.create<BranchOp>(loc, clonedLaunchOpEntry);

    // gpu.launch bodies end with gpu.terminator; a gpu.func must end with
    // gpu.return, so rewrite every terminator in the cloned body.
    outlinedFunc.walk([](gpu::TerminatorOp op) {
        OpBuilder replacer(op);
        replacer.create<gpu::ReturnOp>(op.getLoc());
        op.erase();
    });
    return outlinedFunc;
}
// Replace a `gpu.launch` operation with a `gpu.launch_func` that invokes
// `kernelFunc` with the given operands, reusing the original launch's grid
// and block size operands. The original launch op is erased.
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
                                  gpu::GPUFuncOp kernelFunc,
                                  ValueRange operands) {
    OpBuilder rewriter(launchOp);
    auto gridSizes = launchOp.getGridSizeOperandValues();
    auto blockSizes = launchOp.getBlockSizeOperandValues();
    rewriter.create<gpu::LaunchFuncOp>(launchOp.getLoc(), kernelFunc,
                                       gridSizes, blockSizes, operands);
    launchOp.erase();
}
/// Pass that moves the kernel of each LaunchOp into its separate nested module.
///
/// This pass moves the kernel code of each LaunchOp into a function created
/// inside a nested module. It also creates an external function of the same
/// name in the parent module.
///
/// The gpu.modules are intended to be compiled to a cubin blob independently in
/// a separate pass. The external functions can then be annotated with the
/// symbol of the cubin accessor function.
class GpuKernelOutliningPass
        : public PassWrapper<GpuKernelOutliningPass,
                             OperationPass<ModuleOp>> {
public:
    // For every gpu.launch inside every FuncOp of the module: sink cheap
    // defs into the launch region, outline the region into a gpu.func placed
    // in a fresh gpu.module next to the FuncOp, and replace the launch with a
    // gpu.launch_func. Fails the pass if sinking detects a cyclic dependency.
    void runOnOperation() override {
        SymbolTable symbolTable(getOperation());
        bool modified = false;
        for (auto func : getOperation().getOps<FuncOp>()) {
            // Insert just after the function.
            Block::iterator insertPt(func.getOperation()->getNextNode());
            auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
                SmallVector<Value, 4> operands;
                // Kernel name derives from the enclosing function's name.
                std::string kernelFnName =
                        Twine(op.getParentOfType<FuncOp>().getName(),
                              "_kernel")
                                .str();

                // Pull in instructions that can be sunk
                if (failed(sink_operations_into_launch_op(op)))
                    return WalkResult::interrupt();
                gpu::GPUFuncOp outlinedFunc =
                        outlineKernelFuncImpl(op, kernelFnName, operands);

                // Create nested module and insert outlinedFunc. The module
                // will originally get the same name as the function, but may
                // be renamed on insertion into the parent module.
                auto kernelModule =
                        createKernelModule(outlinedFunc, symbolTable);
                symbolTable.insert(kernelModule, insertPt);

                // Potentially changes signature, pulling in constants.
                convertToLaunchFuncOp(op, outlinedFunc, operands);
                modified = true;
                return WalkResult::advance();
            });
            if (funcWalkResult.wasInterrupted())
                return signalPassFailure();
        }

        // If any new module was inserted in this module, annotate this module
        // as a container module.
        if (modified)
            getOperation().setAttr(
                    gpu::GPUDialect::getContainerModuleAttrName(),
                    UnitAttr::get(&getContext()));
    }

private:
    // Returns a gpu.module containing kernelFunc and all callees (recursive).
    gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
                                        const SymbolTable& parentSymbolTable) {
        // TODO: This code cannot use an OpBuilder because it must be inserted
        // into a SymbolTable by the caller. SymbolTable needs to be refactored
        // to prevent manual building of Ops with symbols in code using
        // SymbolTables and then this needs to use the OpBuilder.
        auto context = getOperation().getContext();
        OpBuilder builder(context);
        // Build the gpu.module manually via OperationState so it can be
        // inserted by the caller's symbol table rather than a builder.
        OperationState state(kernelFunc.getLoc(),
                             gpu::GPUModuleOp::getOperationName());
        gpu::GPUModuleOp::build(builder, state, kernelFunc.getName());
        auto kernelModule = cast<gpu::GPUModuleOp>(Operation::create(state));
        SymbolTable symbolTable(kernelModule);
        symbolTable.insert(kernelFunc);

        // Transitively clone every symbol the kernel (or an already-cloned
        // callee) references from the parent module into the gpu.module.
        SmallVector<Operation*, 8> symbolDefWorklist = {kernelFunc};
        while (!symbolDefWorklist.empty()) {
            if (Optional<SymbolTable::UseRange> symbolUses =
                        SymbolTable::getSymbolUses(
                                symbolDefWorklist.pop_back_val())) {
                for (SymbolTable::SymbolUse symbolUse : *symbolUses) {
                    StringRef symbolName = symbolUse.getSymbolRef()
                                                   .cast<FlatSymbolRefAttr>()
                                                   .getValue();
                    // Skip symbols already cloned into the kernel module.
                    if (symbolTable.lookup(symbolName))
                        continue;

                    Operation* symbolDefClone =
                            parentSymbolTable.lookup(symbolName)->clone();
                    symbolDefWorklist.push_back(symbolDefClone);
                    symbolTable.insert(symbolDefClone);
                }
            }
        }

        return kernelModule;
    }
};
}
// namespace
// Factory entry point: builds the GPU kernel outlining pass defined above.
std::unique_ptr<mlir::Pass> mgb::jit::create_gpu_kernel_outlining_pass() {
    std::unique_ptr<mlir::Pass> pass =
            std::make_unique<GpuKernelOutliningPass>();
    return pass;
}
#endif // MGB_JIT && MGB_JIT_MLIR
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp
浏览文件 @
38ea5f1b
...
...
@@ -187,7 +187,8 @@ struct ReturnOpLowering : public ConversionPattern {
LogicalResult
matchAndRewrite
(
Operation
*
op
,
ArrayRef
<
Value
>
,
Rewriter
&
rewriter
)
const
final
{
rewriter
.
replaceOpWithNewOp
<
mlir
::
ReturnOp
>
(
op
);
rewriter
.
setInsertionPointToEnd
(
op
->
getBlock
());
rewriter
.
replaceOpWithNewOp
<
gpu
::
ReturnOp
>
(
op
);
return
success
();
}
};
...
...
@@ -214,10 +215,10 @@ struct TypeCvtLowering : public ConversionPattern, public GpuLoweringHelper {
/* ===================== MgbToGpuLoweringPass ===================== */
class
MgbToGpuLoweringPass
:
public
PassWrapper
<
MgbToGpuLoweringPass
,
FunctionPass
>
{
:
public
PassWrapper
<
MgbToGpuLoweringPass
,
OperationPass
<
ModuleOp
>
>
{
public:
void
getDependentDialects
(
DialectRegistry
&
registry
)
const
override
;
void
runOn
Func
tion
()
final
;
void
runOn
Opera
tion
()
final
;
private:
Value
get_idx
(
OpBuilder
&
builder
,
Location
loc
);
...
...
@@ -229,16 +230,19 @@ void MgbToGpuLoweringPass::getDependentDialects(
registry
.
insert
<
gpu
::
GPUDialect
,
scf
::
SCFDialect
,
StandardOpsDialect
>
();
}
void
MgbToGpuLoweringPass
::
runOnFunction
()
{
FuncOp
func_op
=
getFunction
();
Location
loc
=
func_op
.
getLoc
();
OpBuilder
builder
(
func_op
.
getBody
());
void
MgbToGpuLoweringPass
::
runOnOperation
()
{
ModuleOp
module_op
=
getOperation
();
// create gpu::LaunchOp
Value
one
=
builder
.
create
<
ConstantIndexOp
>
(
loc
,
1
);
gpu
::
LaunchOp
launch_op
=
builder
.
create
<
gpu
::
LaunchOp
>
(
loc
,
one
,
one
,
one
,
one
,
one
,
one
);
builder
.
setInsertionPointToEnd
(
&
(
launch_op
.
body
().
front
()));
// find FuncOp
FuncOp
func_op
;
module_op
.
walk
([
&
](
FuncOp
fop
)
{
func_op
=
fop
;
return
WalkResult
::
interrupt
();
});
mgb_assert
(
func_op
,
"FuncOp not found in the body of ModuleOp"
);
Location
loc
=
func_op
.
getLoc
();
OpBuilder
builder
(
&
(
func_op
.
getBody
().
front
().
back
()));
// create scf::ForOp
auto
it
=
func_op
.
getArguments
().
end
();
...
...
@@ -247,10 +251,8 @@ void MgbToGpuLoweringPass::runOnFunction() {
Value
idx
=
get_idx
(
builder
,
loc
);
auto
for_op
=
builder
.
create
<
scf
::
ForOp
>
(
loc
,
idx
,
nr_elements
,
nr_threads
);
builder
.
create
<
gpu
::
TerminatorOp
>
(
loc
);
Layout
dest
=
get_dest_layout
(
func_op
);
Value
for_idx
=
for_op
.
get
LoopBody
().
getArgument
(
0
);
Value
for_idx
=
for_op
.
get
InductionVar
(
);
OwningRewritePatternList
patterns
;
patterns
.
insert
<
AssignOpLowering
,
ConstantScalarOpLowering
,
...
...
@@ -265,6 +267,23 @@ void MgbToGpuLoweringPass::runOnFunction() {
if
(
failed
(
applyPartialConversion
(
func_op
,
target
,
std
::
move
(
patterns
))))
{
signalPassFailure
();
}
// create GPUModuleOp
std
::
string
kernel_name
=
func_op
.
getName
().
str
()
+
"_kernel"
;
builder
.
setInsertionPoint
(
func_op
);
gpu
::
GPUModuleOp
gpu_module_op
=
builder
.
create
<
gpu
::
GPUModuleOp
>
(
loc
,
kernel_name
);
// create GPUFuncOp
builder
.
setInsertionPointToStart
(
&
gpu_module_op
.
body
().
front
());
gpu
::
GPUFuncOp
gpu_func_op
=
builder
.
create
<
gpu
::
GPUFuncOp
>
(
loc
,
kernel_name
,
func_op
.
getType
());
gpu_func_op
.
setAttr
(
gpu
::
GPUDialect
::
getKernelFuncAttrName
(),
builder
.
getUnitAttr
());
// move func body
gpu_func_op
.
body
().
takeBody
(
func_op
.
getBody
());
SymbolTable
(
module_op
).
erase
(
func_op
);
}
//! block_dim * block_idx + thread_idx
...
...
src/jit/include/megbrain/jit/mlir/ir/passes.h
浏览文件 @
38ea5f1b
...
...
@@ -32,14 +32,6 @@ std::unique_ptr<mlir::Pass> create_lower_to_llvm_pass();
std
::
unique_ptr
<
mlir
::
Pass
>
create_lower_to_gpu_pass
();
/**
* \brief Outline gpu.launch bodies to kernel functions
*
* \warning Modified from lib/Dialect/GPU/Transforms/KernelOutlining.cpp, it
* will reorder gpu function args with the args of the emit c interface.
*/
std
::
unique_ptr
<
mlir
::
Pass
>
create_gpu_kernel_outlining_pass
();
}
// namespace jit
}
// namespace mgb
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录