Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
88e918e2
MegEngine
项目概览
MegEngine 天元
/
MegEngine
大约 1 年 前同步成功
通知
396
Star
4704
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
88e918e2
编写于
11月 29, 2020
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(mgb/jit): add scf.ForOp in MgbToGpuLoweringPass
GitOrigin-RevId: 3cdae27c378f7f76c7dc59ecb80b08d6dd5c35fe
上级
7aa54b0e
变更
4
显示空白变更内容
内联
并排
Showing
4 changed file
with
205 addition
and
216 deletion
+205
-216
src/jit/impl/mlir/compiler.cpp
src/jit/impl/mlir/compiler.cpp
+2
-0
src/jit/impl/mlir/executable_cuda.cpp
src/jit/impl/mlir/executable_cuda.cpp
+19
-2
src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp
src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp
+181
-213
src/jit/impl/mlir/mlir_gen.cpp
src/jit/impl/mlir/mlir_gen.cpp
+3
-1
未找到文件。
src/jit/impl/mlir/compiler.cpp
浏览文件 @
88e918e2
...
...
@@ -26,6 +26,7 @@
#include <mlir/Conversion/GPUCommon/GPUCommonPass.h>
#include <mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h>
#include <mlir/Conversion/SCFToStandard/SCFToStandard.h>
#include <mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h>
#include <mlir/Dialect/GPU/Passes.h>
#include <mlir/IR/Dialect.h>
...
...
@@ -152,6 +153,7 @@ void add_cuda_lowering_pass(mlir::PassManager& manager,
{
mlir
::
OpPassManager
&
opt_pm
=
manager
.
nest
<
mlir
::
FuncOp
>
();
opt_pm
.
addPass
(
create_lower_to_gpu_pass
());
opt_pm
.
addPass
(
mlir
::
createLowerToCFGPass
());
opt_pm
.
addPass
(
mlir
::
createCanonicalizerPass
());
opt_pm
.
addPass
(
mlir
::
createCSEPass
());
opt_pm
.
addPass
(
mlir
::
createLoopFusionPass
());
...
...
src/jit/impl/mlir/executable_cuda.cpp
浏览文件 @
88e918e2
...
...
@@ -32,6 +32,14 @@ using namespace mgb;
using
namespace
jit
;
namespace
{
/*!
 * \brief choose the number of blocks used to launch the kernel
 *
 * Returns the larger of
 *   - nr_elements / (block_size * 2), and
 *   - (nr_elements - 1) / (block_size * 3) + 1,
 * both evaluated with integer division.
 *
 * NOTE(review): the factor 3 presumably matches the "unroll three times in the
 * kernel" remark of the original author -- confirm against the kernel.
 *
 * \param nr_elements total number of elements to process
 * \param block_size number of threads per block
 */
int64_t get_grid_size(int64_t nr_elements, int64_t block_size) {
    const int64_t blocks_by_two = nr_elements / (block_size * 2);
    const int64_t blocks_by_three = (nr_elements - 1) / (block_size * 3) + 1;
    // same selection as std::max(blocks_by_two, blocks_by_three)
    return blocks_by_two < blocks_by_three ? blocks_by_three : blocks_by_two;
}
template
<
int
out_dim
,
typename
ctype
>
void
setup_and_launch
(
const
JITExecutor
*
fusion_opr
,
CUfunction
func
,
int
block_size
)
{
...
...
@@ -87,9 +95,18 @@ void setup_and_launch(const JITExecutor* fusion_opr, CUfunction func,
const
CompNodeEnv
&
env
=
CompNodeEnv
::
from_comp_node
(
fusion_opr
->
comp_node
());
int64_t
num_block
=
(
nr_elements
-
1
)
/
block_size
+
1
;
int64_t
grid_size
;
if
(
nr_elements
<=
block_size
)
{
block_size
=
nr_elements
;
grid_size
=
1
;
}
else
{
grid_size
=
get_grid_size
(
nr_elements
,
block_size
);
}
int64_t
nr_threads
=
grid_size
*
block_size
;
params
.
push_back
(
&
nr_elements
);
MGB_CUDA_CU_CHECK
(
cuLaunchKernel
(
func
,
num_block
,
1
,
1
,
block_size
,
1
,
1
,
0
,
params
.
push_back
(
&
nr_threads
);
MGB_CUDA_CU_CHECK
(
cuLaunchKernel
(
func
,
grid_size
,
1
,
1
,
block_size
,
1
,
1
,
0
,
env
.
cuda_env
().
stream
,
params
.
data
(),
0
));
}
...
...
src/jit/impl/mlir/ir/lower_to_gpu_pass.cpp
浏览文件 @
88e918e2
...
...
@@ -21,11 +21,6 @@
#include "megbrain/jit/mlir/ir/passes.h"
#include "megbrain/jit/mlir/ir/utils.h"
#include <llvm/ADT/PointerUnion.h>
#include <llvm/ADT/Sequence.h>
#include <llvm/ADT/SetVector.h>
#include <llvm/ADT/Twine.h>
#include <llvm/IR/Type.h>
#include <mlir/Dialect/GPU/GPUDialect.h>
#include <mlir/Dialect/SCF/SCF.h>
#include <mlir/Dialect/StandardOps/IR/Ops.h>
...
...
@@ -39,124 +34,98 @@ using namespace jit;
namespace
{
//! emit ops computing thread_idx + block_idx * block_dim along the "x"
//! dimension and return the resulting index value
mlir::Value get_tid(ConversionPatternRewriter& rewriter, const Location& loc) {
    auto index_type = rewriter.getIndexType();
    auto dim_x = rewriter.getStringAttr("x");

    // keep the creation order: thread id, block id, block dim
    auto tid_in_block =
            rewriter.create<gpu::ThreadIdOp>(loc, index_type, dim_x);
    auto bid = rewriter.create<gpu::BlockIdOp>(loc, index_type, dim_x);
    auto bdim = rewriter.create<gpu::BlockDimOp>(loc, index_type, dim_x);

    // the product is created before the sum that consumes it
    auto block_offset = rewriter.create<MulIOp>(loc, bid, bdim);
    Value global_tid =
            rewriter.create<AddIOp>(loc, tid_in_block, block_offset);
    return global_tid;
}
using
Rewriter
=
ConversionPatternRewriter
;
using
Layout
=
megdnn
::
TensorLayout
;
megdnn
::
TensorLayout
output_layout
(
gpu
::
LaunchOp
&
launch_op
)
{
auto
func_op
=
launch_op
.
getParentOfType
<
mlir
::
FuncOp
>
();
mgb_assert
(
func_op
,
"Unexpexted launch op."
);
for
(
auto
block_iter
=
func_op
.
rbegin
();
block_iter
!=
func_op
.
rend
();
block_iter
++
)
{
for
(
auto
op_iter
=
block_iter
->
rbegin
();
op_iter
!=
block_iter
->
rend
();
op_iter
++
)
{
auto
op
=
llvm
::
dyn_cast_or_null
<
dialect
::
AssignOp
>
(
&
(
*
op_iter
));
if
(
op
&&
op
.
getNumOperands
()
>
0
)
{
return
mlir_type_to_layout
(
*
(
op
.
operand_type_begin
()));
}
/* ===================== GpuLoweringHelper ===================== */
struct
GpuLoweringHelper
{
GpuLoweringHelper
(
scf
::
ForOp
*
for_op
,
Value
index
,
const
Layout
&
dest
)
:
m_for_op
(
for_op
),
m_index
(
index
),
m_dest
(
dest
)
{}
void
set_insertion_point
(
OpBuilder
&
builder
)
const
{
// insert before the last operation (scf.yield) in the loop body
builder
.
setInsertionPoint
(
&
(
m_for_op
->
getLoopBody
().
front
().
back
()));
}
std
::
vector
<
Value
>
map_indices
(
OpBuilder
&
builder
,
Location
loc
,
Value
value
)
const
{
auto
type
=
value
.
getType
().
dyn_cast_or_null
<
MemRefType
>
();
if
(
!
type
)
{
return
{
m_index
};
}
mgb_throw
(
MegBrainError
,
"Unexpexted launch op."
);
}
std
::
vector
<
mlir
::
Value
>
get_multidim_tid
(
ConversionPatternRewriter
&
rewriter
,
const
Location
&
loc
,
const
mlir
::
Value
&
val
,
const
megdnn
::
TensorLayout
&
dst
)
{
Value
index
=
get_tid
(
rewriter
,
loc
);
auto
type
=
val
.
getType
().
dyn_cast_or_null
<
mlir
::
MemRefType
>
();
if
(
type
)
{
ValueBuilderHelper
helper
(
rewriter
,
loc
);
std
::
vector
<
mlir
::
Value
>
idxs
;
idxs
.
resize
(
dst
.
ndim
);
mlir
::
Value
dim_index
=
index
;
for
(
int
i
=
dst
.
ndim
-
1
;
i
>=
0
;
i
--
)
{
auto
cur_index
=
helper
.
modI
(
dim_index
,
helper
.
const_i32
(
dst
[
i
]));
idxs
[
i
]
=
cur_index
;
dim_index
=
helper
.
divI
(
dim_index
,
helper
.
const_i32
(
dst
[
i
]));
std
::
vector
<
Value
>
indices
(
m_dest
.
ndim
);
ValueBuilderHelper
helper
(
builder
,
loc
);
// map global index to multi-dimensional indices
Value
dim_index
=
m_index
;
for
(
int
i
=
m_dest
.
ndim
-
1
;
i
>=
0
;
i
--
)
{
indices
[
i
]
=
helper
.
modI
(
dim_index
,
helper
.
const_i32
(
m_dest
[
i
]));
dim_index
=
helper
.
divI
(
dim_index
,
helper
.
const_i32
(
m_dest
[
i
]));
}
megdnn
::
TensorLayout
src_layout
=
mlir_type_to_layout
(
type
);
// allow broadcasting
Layout
src_layout
=
mlir_type_to_layout
(
type
);
src_layout
.
init_contiguous_stride
();
for
(
int
i
=
0
;
i
<
type
.
getRank
();
++
i
)
{
if
(
src_layout
[
i
]
==
1
)
{
i
dx
s
[
i
]
=
helper
.
const_i32
(
0
);
i
ndice
s
[
i
]
=
helper
.
const_i32
(
0
);
}
}
return
idxs
;
}
else
{
return
{
index
};
return
indices
;
}
}
struct
ElemwiseLowering
:
public
ConversionPattern
{
ElemwiseLowering
(
MLIRContext
*
ctx
,
gpu
::
LaunchOp
&
launch_op
)
:
ConversionPattern
(
dialect
::
Elemwise
::
getOperationName
(),
1
,
ctx
),
m_launch_op
{
launch_op
}
{}
private:
scf
::
ForOp
*
m_for_op
;
Value
m_index
;
Layout
m_dest
;
};
LogicalResult
matchAndRewrite
(
Operation
*
op
,
ArrayRef
<
Value
>
operands
,
ConversionPatternRewriter
&
rewriter
)
const
final
{
auto
loc
=
op
->
getLoc
();
/* ===================== conversion patterns ===================== */
rewriter
.
setInsertionPointToEnd
(
&
(
m_launch_op
.
body
().
front
()));
struct
AssignOpLowering
:
public
ConversionPattern
,
public
GpuLoweringHelper
{
AssignOpLowering
(
MLIRContext
*
ctx
,
scf
::
ForOp
*
for_op
,
mlir
::
Value
index
,
const
Layout
&
dest
)
:
ConversionPattern
(
dialect
::
AssignOp
::
getOperationName
(),
2
,
ctx
),
GpuLoweringHelper
(
for_op
,
index
,
dest
)
{}
auto
dst_layout
=
output_layout
(
m_launch_op
);
auto
inputs
=
llvm
::
to_vector
<
4
>
(
llvm
::
map_range
(
operands
,
[
&
](
mlir
::
Value
val
)
{
auto
index
=
get_multidim_tid
(
rewriter
,
loc
,
val
,
dst_layout
);
return
get_operand
<
LoadOp
>
(
rewriter
,
loc
,
val
,
index
);
}));
LogicalResult
matchAndRewrite
(
Operation
*
op
,
ArrayRef
<
Value
>
operands
,
Rewriter
&
rewriter
)
const
final
{
auto
loc
=
op
->
getLoc
();
set_insertion_point
(
rewriter
);
rewriter
.
replaceOp
(
op
,
lower_elemwise_to_std
(
op
,
rewriter
,
loc
,
inputs
));
auto
index
=
map_indices
(
rewriter
,
loc
,
operands
[
1
]);
auto
input
=
get_operand
<
LoadOp
>
(
rewriter
,
loc
,
operands
[
0
],
index
);
rewriter
.
create
<
StoreOp
>
(
loc
,
input
,
operands
[
1
],
index
);
rewriter
.
eraseOp
(
op
);
return
success
();
}
private:
gpu
::
LaunchOp
&
m_launch_op
;
};
struct
TypeCvtLowering
:
public
ConversionPattern
{
TypeCvtLowering
(
MLIRContext
*
ctx
,
gpu
::
LaunchOp
&
launch_op
)
:
ConversionPattern
(
dialect
::
TypeCvt
::
getOperationName
(),
1
,
ctx
),
m_launch_op
{
launch_op
}
{}
LogicalResult
matchAndRewrite
(
Operation
*
op
,
ArrayRef
<
Value
>
operands
,
ConversionPatternRewriter
&
rewriter
)
const
final
{
auto
loc
=
op
->
getLoc
();
rewriter
.
setInsertionPointToEnd
(
&
(
m_launch_op
.
body
().
front
()));
auto
dst_layout
=
output_layout
(
m_launch_op
);
auto
index
=
get_multidim_tid
(
rewriter
,
loc
,
operands
[
0
],
dst_layout
);
auto
input
=
get_operand
<
LoadOp
>
(
rewriter
,
loc
,
operands
[
0
],
index
);
struct
ConstantScalarOpLowering
:
public
OpRewritePattern
<
dialect
::
ConstantScalarOp
>
,
public
GpuLoweringHelper
{
ConstantScalarOpLowering
(
MLIRContext
*
ctx
,
scf
::
ForOp
*
for_op
,
Value
index
,
const
Layout
&
dest
)
:
OpRewritePattern
<
dialect
::
ConstantScalarOp
>
(
ctx
),
GpuLoweringHelper
(
for_op
,
index
,
dest
)
{}
rewriter
.
replaceOp
(
op
,
lower_typecvt_to_std
(
op
,
rewriter
,
loc
,
input
));
LogicalResult
matchAndRewrite
(
dialect
::
ConstantScalarOp
op
,
PatternRewriter
&
rewriter
)
const
final
{
set_insertion_point
(
rewriter
);
rewriter
.
replaceOpWithNewOp
<
mlir
::
ConstantOp
>
(
op
,
op
.
value
());
return
success
();
}
private:
gpu
::
LaunchOp
&
m_launch_op
;
};
struct
DimshuffleLowering
:
public
ConversionPattern
{
DimshuffleLowering
(
MLIRContext
*
ctx
,
gpu
::
LaunchOp
&
launch_op
)
struct
DimshuffleLowering
:
public
ConversionPattern
,
public
GpuLoweringHelper
{
DimshuffleLowering
(
MLIRContext
*
ctx
,
scf
::
ForOp
*
for_op
,
Value
index
,
const
Layout
&
dest
)
:
ConversionPattern
(
dialect
::
Dimshuffle
::
getOperationName
(),
1
,
ctx
),
m_launch_op
{
launch_op
}
{}
GpuLoweringHelper
(
for_op
,
index
,
dest
)
{}
static
std
::
vector
<
mlir
::
Value
>
get_index_from_pattern
(
const
std
::
vector
<
int32_t
>&
pattern
,
...
...
@@ -172,163 +141,162 @@ struct DimshuffleLowering : public ConversionPattern {
return
res
;
}
LogicalResult
matchAndRewrite
(
Operation
*
op
,
ArrayRef
<
Value
>
operands
,
ConversionPatternRewriter
&
rewriter
)
const
final
{
LogicalResult
matchAndRewrite
(
Operation
*
op
,
ArrayRef
<
Value
>
operands
,
Rewriter
&
rewriter
)
const
final
{
auto
loc
=
op
->
getLoc
();
set_insertion_point
(
rewriter
);
rewriter
.
setInsertionPointToEnd
(
&
(
m_launch_op
.
body
().
front
()));
auto
dst_layout
=
output_layout
(
m_launch_op
);
auto
index
=
get_multidim_tid
(
rewriter
,
loc
,
operands
[
0
],
dst_layout
);
auto
pattern
=
llvm
::
dyn_cast
<
dialect
::
Dimshuffle
>
(
op
).
pattern
();
auto
index
=
map_indices
(
rewriter
,
loc
,
operands
[
0
]);
auto
shuffled_index
=
get_index_from_pattern
(
pattern
,
index
);
rewriter
.
replaceOp
(
op
,
get_operand
<
LoadOp
>
(
rewriter
,
loc
,
operands
[
0
],
shuffled_index
));
return
success
();
}
private:
gpu
::
LaunchOp
&
m_launch_op
;
};
struct
ReturnOpLowering
:
public
ConversionPattern
{
ReturnOpLowering
(
MLIRContext
*
ctx
,
gpu
::
LaunchOp
&
launch_op
)
:
ConversionPattern
(
dialect
::
ReturnOp
::
getOperationName
(),
1
,
ctx
),
m_launch_op
{
launch_op
}
{}
struct
ElemwiseLowering
:
public
ConversionPattern
,
public
GpuLoweringHelper
{
ElemwiseLowering
(
MLIRContext
*
ctx
,
scf
::
ForOp
*
for_op
,
Value
index
,
const
Layout
&
dest
)
:
ConversionPattern
(
dialect
::
Elemwise
::
getOperationName
(),
1
,
ctx
),
GpuLoweringHelper
(
for_op
,
index
,
dest
)
{}
LogicalResult
matchAndRewrite
(
Operation
*
op
,
ArrayRef
<
Value
>
,
ConversionPatternRewriter
&
rewriter
)
const
final
{
rewriter
.
replaceOpWithNewOp
<
mlir
::
ReturnOp
>
(
op
);
LogicalResult
matchAndRewrite
(
Operation
*
op
,
ArrayRef
<
Value
>
operands
,
Rewriter
&
rewriter
)
const
final
{
auto
loc
=
op
->
getLoc
();
set_insertion_point
(
rewriter
);
//! remove the first gpu.terminator
m_launch_op
.
body
().
front
().
front
().
erase
();
//! if (tid >= nr_tid) {return;} in the begin of the block
rewriter
.
setInsertionPointToStart
(
&
(
m_launch_op
.
body
().
front
()));
Block
*
cond_block
=
rewriter
.
getInsertionBlock
();
Block
::
iterator
op_position
=
rewriter
.
getInsertionPoint
();
Block
*
remaining_ops_block
=
rewriter
.
splitBlock
(
cond_block
,
op_position
);
rewriter
.
setInsertionPointToEnd
(
cond_block
);
auto
index
=
get_tid
(
rewriter
,
loc
);
auto
comparison
=
rewriter
.
create
<
mlir
::
CmpIOp
>
(
loc
,
CmpIPredicate
::
sge
,
index
,
m_launch_op
.
getParentOfType
<
mlir
::
FuncOp
>
()
.
getArguments
()
.
back
());
Block
*
then_block
=
rewriter
.
splitBlock
(
cond_block
,
rewriter
.
getInsertionPoint
());
rewriter
.
setInsertionPointToEnd
(
then_block
);
rewriter
.
create
<
gpu
::
TerminatorOp
>
(
loc
);
rewriter
.
setInsertionPointToEnd
(
cond_block
);
rewriter
.
create
<
mlir
::
CondBranchOp
>
(
loc
,
comparison
,
then_block
,
ArrayRef
<
Value
>
(),
remaining_ops_block
,
ArrayRef
<
Value
>
());
rewriter
.
setInsertionPointToEnd
(
remaining_ops_block
);
rewriter
.
create
<
gpu
::
TerminatorOp
>
(
loc
);
// currently Elemwise handles at most three operands
auto
inputs
=
llvm
::
to_vector
<
4
>
(
llvm
::
map_range
(
operands
,
[
&
](
mlir
::
Value
val
)
{
auto
index
=
map_indices
(
rewriter
,
loc
,
val
);
return
get_operand
<
LoadOp
>
(
rewriter
,
loc
,
val
,
index
);
}));
rewriter
.
replaceOp
(
op
,
lower_elemwise_to_std
(
op
,
rewriter
,
loc
,
inputs
));
return
success
();
}
private:
gpu
::
LaunchOp
&
m_launch_op
;
};
struct
ConstantScalarOpLowering
:
public
OpRewritePattern
<
dialect
::
ConstantScalarOp
>
{
ConstantScalarOpLowering
(
MLIRContext
*
ctx
,
gpu
::
LaunchOp
&
launch_op
)
:
OpRewritePattern
<
dialect
::
ConstantScalarOp
>
(
ctx
),
m_launch_op
{
launch_op
}
{}
LogicalResult
matchAndRewrite
(
dialect
::
ConstantScalarOp
op
,
PatternRewriter
&
rewriter
)
const
final
{
dialect
::
ConstantScalarOpAdaptor
constant_scalar_adaptor
(
op
);
rewriter
.
setInsertionPointToEnd
(
&
(
m_launch_op
.
body
().
front
()));
struct
ReturnOpLowering
:
public
ConversionPattern
{
ReturnOpLowering
(
MLIRContext
*
ctx
,
scf
::
ForOp
*
,
Value
,
const
Layout
&
)
:
ConversionPattern
(
dialect
::
ReturnOp
::
getOperationName
(),
1
,
ctx
)
{
}
rewriter
.
replaceOpWithNewOp
<
mlir
::
ConstantOp
>
(
op
,
constant_scalar_adaptor
.
value
());
LogicalResult
matchAndRewrite
(
Operation
*
op
,
ArrayRef
<
Value
>
,
Rewriter
&
rewriter
)
const
final
{
rewriter
.
replaceOpWithNewOp
<
mlir
::
ReturnOp
>
(
op
);
return
success
();
}
private:
gpu
::
LaunchOp
&
m_launch_op
;
};
struct
AssignOpLowering
:
public
ConversionPattern
{
AssignOpLowering
(
MLIRContext
*
ctx
,
gpu
::
LaunchOp
&
launch_op
)
:
ConversionPattern
(
dialect
::
AssignOp
::
getOperationName
(),
2
,
ctx
),
m_launch_op
{
launch_op
}
{}
struct
TypeCvtLowering
:
public
ConversionPattern
,
public
GpuLoweringHelper
{
TypeCvtLowering
(
MLIRContext
*
ctx
,
scf
::
ForOp
*
for_op
,
Value
index
,
const
Layout
&
dest
)
:
ConversionPattern
(
dialect
::
TypeCvt
::
getOperationName
(),
1
,
ctx
),
GpuLoweringHelper
(
for_op
,
index
,
dest
)
{}
LogicalResult
matchAndRewrite
(
Operation
*
op
,
ArrayRef
<
Value
>
operands
,
ConversionPatternRewriter
&
rewriter
)
const
final
{
LogicalResult
matchAndRewrite
(
Operation
*
op
,
ArrayRef
<
Value
>
operands
,
Rewriter
&
rewriter
)
const
final
{
auto
loc
=
op
->
getLoc
();
set_insertion_point
(
rewriter
);
dialect
::
AssignOpAdaptor
assign_adaptor
(
operands
);
rewriter
.
setInsertionPointToEnd
(
&
(
m_launch_op
.
body
().
front
()));
auto
dst_layout
=
output_layout
(
m_launch_op
);
auto
index
=
get_multidim_tid
(
rewriter
,
loc
,
assign_adaptor
.
rhs
(),
dst_layout
);
auto
loaded_lhs
=
get_operand
<
LoadOp
>
(
rewriter
,
loc
,
assign_adaptor
.
lhs
(),
index
);
rewriter
.
create
<
StoreOp
>
(
loc
,
loaded_lhs
,
assign_adaptor
.
rhs
(),
index
);
auto
index
=
map_indices
(
rewriter
,
loc
,
operands
[
0
]);
auto
input
=
get_operand
<
LoadOp
>
(
rewriter
,
loc
,
operands
[
0
],
index
);
rewriter
.
eraseOp
(
op
);
rewriter
.
replaceOp
(
op
,
lower_typecvt_to_std
(
op
,
rewriter
,
loc
,
input
)
);
return
success
();
}
private:
gpu
::
LaunchOp
&
m_launch_op
;
};
/* ===================== MgbToGpuLoweringPass ===================== */
class
MgbToGpuLoweringPass
:
public
PassWrapper
<
MgbToGpuLoweringPass
,
FunctionPass
>
{
public:
void
getDependentDialects
(
mlir
::
DialectRegistry
&
registry
)
const
override
{
registry
.
insert
<
mlir
::
gpu
::
GPUDialect
>
();
registry
.
insert
<
mlir
::
StandardOpsDialect
>
();
}
void
getDependentDialects
(
DialectRegistry
&
registry
)
const
override
;
void
runOnFunction
()
final
;
private:
Value
get_idx
(
OpBuilder
&
builder
,
Location
loc
);
Layout
get_dest_layout
(
FuncOp
func_op
);
};
void
MgbToGpuLoweringPass
::
getDependentDialects
(
DialectRegistry
&
registry
)
const
{
registry
.
insert
<
gpu
::
GPUDialect
,
scf
::
SCFDialect
,
StandardOpsDialect
>
();
}
void
runOnFunction
()
override
final
{
auto
func_op
=
getFunction
();
void
MgbToGpuLoweringPass
::
runOnFunction
()
{
FuncOp
func_op
=
getFunction
();
Location
loc
=
func_op
.
getLoc
();
OpBuilder
builder
(
&
func_op
.
getBody
());
Value
constantOne
=
builder
.
create
<
ConstantIndexOp
>
(
loc
,
1
);
gpu
::
LaunchOp
launch_op
=
builder
.
create
<
gpu
::
LaunchOp
>
(
loc
,
constantOne
,
constantOne
,
constantOne
,
constantOne
,
constantOne
,
constantOne
);
OpBuilder
builder
(
func_op
.
getBody
());
// create gpu::LaunchOp
Value
one
=
builder
.
create
<
ConstantIndexOp
>
(
loc
,
1
);
gpu
::
LaunchOp
launch_op
=
builder
.
create
<
gpu
::
LaunchOp
>
(
loc
,
one
,
one
,
one
,
one
,
one
,
one
);
builder
.
setInsertionPointToEnd
(
&
(
launch_op
.
body
().
front
()));
// create scf::ForOp
auto
it
=
func_op
.
getArguments
().
end
();
Value
nr_threads
=
*
(
--
it
);
Value
nr_elements
=
*
(
--
it
);
Value
idx
=
get_idx
(
builder
,
loc
);
auto
for_op
=
builder
.
create
<
scf
::
ForOp
>
(
loc
,
idx
,
nr_elements
,
nr_threads
);
builder
.
create
<
gpu
::
TerminatorOp
>
(
loc
);
Layout
dest
=
get_dest_layout
(
func_op
);
Value
for_idx
=
for_op
.
getLoopBody
().
getArgument
(
0
);
OwningRewritePatternList
patterns
;
patterns
.
insert
<
AssignOpLowering
,
ConstantScalarOpLowering
,
DimshuffleLowering
,
ElemwiseLowering
,
ReturnOpLowering
,
TypeCvtLowering
>
(
&
getContext
(),
&
for_op
,
for_idx
,
dest
);
ConversionTarget
target
(
getContext
());
target
.
addLegalDialect
<
StandardOpsDialect
>
();
target
.
addLegalDialect
<
gpu
::
GPU
Dialect
>
();
target
.
addLegalDialect
<
gpu
::
GPUDialect
,
scf
::
SCFDialect
,
StandardOps
Dialect
>
();
target
.
addIllegalDialect
<
MgbDialect
>
();
patterns
.
insert
<
ElemwiseLowering
,
TypeCvtLowering
,
DimshuffleLowering
,
ReturnOpLowering
,
ConstantScalarOpLowering
,
AssignOpLowering
>
(
&
getContext
(),
launch_op
);
if
(
failed
(
applyPartialConversion
(
func_op
,
target
,
std
::
move
(
patterns
))))
{
if
(
failed
(
applyPartialConversion
(
func_op
,
target
,
std
::
move
(
patterns
))))
{
signalPassFailure
();
}
}
};
}
//! emit ops computing block_dim * block_idx + thread_idx along the "x"
//! dimension and return the resulting index value
Value MgbToGpuLoweringPass::get_idx(OpBuilder& builder, Location loc) {
    IndexType index_ty = builder.getIndexType();
    StringAttr dim_x = builder.getStringAttr("x");

    // ops are created in this order: block dim, block id, thread id
    Value bdim = builder.create<gpu::BlockDimOp>(loc, index_ty, dim_x);
    Value bid = builder.create<gpu::BlockIdOp>(loc, index_ty, dim_x);
    Value tid = builder.create<gpu::ThreadIdOp>(loc, index_ty, dim_x);

    Value block_offset = builder.create<MulIOp>(loc, bdim, bid);
    Value global_idx = builder.create<AddIOp>(loc, block_offset, tid);
    return global_idx;
}
//! walk func_op looking for a dialect::AssignOp; the walk is interrupted at
//! the first one visited and the layout converted from the type of its lhs is
//! returned. Asserts if the walk visits no AssignOp.
Layout MgbToGpuLoweringPass::get_dest_layout(FuncOp func_op) {
    bool has_assign = false;
    Layout layout;
    func_op.walk([&](dialect::AssignOp assign_op) {
        has_assign = true;
        layout = mlir_type_to_layout(assign_op.lhs().getType());
        // one AssignOp is enough, stop walking
        return WalkResult::interrupt();
    });
    mgb_assert(has_assign, "AssignOp not found in the body of FuncOp");
    return layout;
}
}
// namespace
/* ===================== create_lower_to_gpu_pass ===================== */
//! factory: construct a new MgbToGpuLoweringPass and hand ownership to the
//! caller as a generic mlir::Pass
std::unique_ptr<mlir::Pass> mgb::jit::create_lower_to_gpu_pass() {
    std::unique_ptr<mlir::Pass> pass =
            std::make_unique<MgbToGpuLoweringPass>();
    return pass;
}
...
...
src/jit/impl/mlir/mlir_gen.cpp
浏览文件 @
88e918e2
...
...
@@ -80,7 +80,9 @@ private:
for
(
auto
&&
arg
:
args
.
outputs
)
{
func_args
.
push_back
(
get_type
(
arg
.
from
->
layout
()));
}
//! the last arg is nr_elements
//! nr_elements
func_args
.
push_back
(
m_builder
.
getIndexType
());
//! nr_threads
func_args
.
push_back
(
m_builder
.
getIndexType
());
auto
func_type
=
m_builder
.
getFunctionType
(
func_args
,
llvm
::
None
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录