Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
9e3433bd
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
9e3433bd
编写于
7月 04, 2022
作者:
B
Bo Zhang
提交者:
GitHub
7月 04, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Merge dimensions && OP performance optimization (#43931)
上级
cf8e86df
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
67 addition
and
31 deletion
+67
-31
paddle/phi/kernels/gpu/cross_grad_kernel.cu
paddle/phi/kernels/gpu/cross_grad_kernel.cu
+32
-14
paddle/phi/kernels/gpu/cross_kernel.cu
paddle/phi/kernels/gpu/cross_kernel.cu
+35
-17
未找到文件。
paddle/phi/kernels/gpu/cross_grad_kernel.cu
浏览文件 @
9e3433bd
...
@@ -22,8 +22,6 @@
...
@@ -22,8 +22,6 @@
namespace
phi
{
namespace
phi
{
using
funcs
::
IndexCalculator
;
template
<
typename
T
>
template
<
typename
T
>
__global__
void
CrossGrad
(
const
T
*
x
,
__global__
void
CrossGrad
(
const
T
*
x
,
const
T
*
y
,
const
T
*
y
,
...
@@ -32,7 +30,7 @@ __global__ void CrossGrad(const T* x,
...
@@ -32,7 +30,7 @@ __global__ void CrossGrad(const T* x,
T
*
out_dy
,
T
*
out_dy
,
const
int
stride
,
const
int
stride
,
const
int
N
,
const
int
N
,
IndexCalculator
index_calculator
)
{
phi
::
funcs
::
IndexCalculator
index_calculator
)
{
CUDA_KERNEL_LOOP
(
i
,
N
)
{
CUDA_KERNEL_LOOP
(
i
,
N
)
{
int
offset
=
index_calculator
(
i
);
int
offset
=
index_calculator
(
i
);
...
@@ -107,32 +105,52 @@ void CrossGradKernel(const Context& dev_ctx,
...
@@ -107,32 +105,52 @@ void CrossGradKernel(const Context& dev_ctx,
std
::
vector
<
int
>
cal_dims
;
std
::
vector
<
int
>
cal_dims
;
std
::
vector
<
int
>
left_strides
;
std
::
vector
<
int
>
left_strides
;
std
::
vector
<
int
>
full_strides
;
std
::
vector
<
int
>
full_strides
;
std
::
vector
<
int
>
merged_dims
;
for
(
int
i
=
0
;
i
<
dim
;
i
++
)
{
if
(
i
==
0
)
{
merged_dims
.
push_back
(
input_x_dims
[
i
]);
}
else
{
merged_dims
[
0
]
*=
input_x_dims
[
i
];
}
}
int
merge_axis
=
merged_dims
.
size
();
merged_dims
.
push_back
(
input_x_dims
[
dim
]);
for
(
int
i
=
dim
+
1
;
i
<
input_x_dims
.
size
();
i
++
)
{
if
(
i
==
dim
+
1
)
{
merged_dims
.
push_back
(
input_x_dims
[
i
]);
}
else
{
merged_dims
[
merge_axis
+
1
]
*=
input_x_dims
[
i
];
}
}
int
full_dim
=
1
;
int
full_dim
=
1
;
int
left_dim
=
1
;
for
(
int
i
=
0
;
i
<
merged_dims
.
size
();
i
++
)
{
for
(
auto
i
=
0
;
i
<
input_x_dims
.
size
();
i
++
)
{
full_strides
.
insert
(
full_strides
.
begin
(),
full_dim
);
full_strides
.
insert
(
full_strides
.
begin
(),
full_dim
);
full_dim
*=
input_x_dims
[
input_x
_dims
.
size
()
-
i
-
1
];
full_dim
*=
merged_dims
[
merged
_dims
.
size
()
-
i
-
1
];
if
(
i
==
dim
)
{
if
(
i
==
merge_axis
)
{
continue
;
continue
;
}
}
cal_dims
.
push_back
(
i
);
cal_dims
.
push_back
(
i
);
}
int
left_dim
=
1
;
for
(
int
i
=
merged_dims
.
size
()
-
1
;
i
>=
0
;
i
--
)
{
if
(
i
==
merge_axis
)
{
continue
;
}
left_strides
.
insert
(
left_strides
.
begin
(),
left_dim
);
left_strides
.
insert
(
left_strides
.
begin
(),
left_dim
);
left_dim
*=
input_x_dims
[
input_x_dims
.
size
()
-
i
-
1
];
left_dim
*=
merged_dims
[
i
];
}
}
const
auto
*
input_x_data
=
input_x
.
data
<
T
>
();
const
auto
*
input_x_data
=
input_x
.
data
<
T
>
();
const
auto
*
input_y_data
=
input_y
.
data
<
T
>
();
const
auto
*
input_y_data
=
input_y
.
data
<
T
>
();
const
auto
*
input_out_grad_data
=
input_out_grad
.
data
<
T
>
();
const
auto
*
input_out_grad_data
=
input_out_grad
.
data
<
T
>
();
auto
*
output_x_grad_data
=
dev_ctx
.
template
Alloc
<
T
>(
x_grad
);
auto
*
output_x_grad_data
=
dev_ctx
.
template
Alloc
<
T
>(
x_grad
);
auto
*
output_y_grad_data
=
dev_ctx
.
template
Alloc
<
T
>(
y_grad
);
auto
*
output_y_grad_data
=
dev_ctx
.
template
Alloc
<
T
>(
y_grad
);
auto
index_calculator
=
phi
::
funcs
::
IndexCalculator
(
auto
index_calculator
=
IndexCalculator
(
merged_dims
.
size
()
-
1
,
cal_dims
,
left_strides
,
full_strides
);
input_x_dims
.
size
()
-
1
,
cal_dims
,
left_strides
,
full_strides
);
int64_t
numel
=
x
.
numel
();
int64_t
numel
=
x
.
numel
();
backends
::
gpu
::
GpuLaunchConfig
config
=
backends
::
gpu
::
GpuLaunchConfig
config
=
backends
::
gpu
::
GetGpuLaunchConfig1D
(
dev_ctx
,
numel
/
3
);
backends
::
gpu
::
GetGpuLaunchConfig1D
(
dev_ctx
,
numel
/
3
);
...
@@ -144,7 +162,7 @@ void CrossGradKernel(const Context& dev_ctx,
...
@@ -144,7 +162,7 @@ void CrossGradKernel(const Context& dev_ctx,
input_out_grad_data
,
input_out_grad_data
,
output_x_grad_data
,
output_x_grad_data
,
output_y_grad_data
,
output_y_grad_data
,
full_strides
[
dim
],
full_strides
[
merge_axis
],
numel
/
3
,
numel
/
3
,
index_calculator
);
index_calculator
);
}
}
...
...
paddle/phi/kernels/gpu/cross_kernel.cu
浏览文件 @
9e3433bd
...
@@ -22,15 +22,13 @@
...
@@ -22,15 +22,13 @@
namespace
phi
{
namespace
phi
{
using
funcs
::
IndexCalculator
;
template
<
typename
T
>
template
<
typename
T
>
__global__
void
Cross
(
const
T
*
x
,
__global__
void
Cross
(
const
T
*
x
,
const
T
*
y
,
const
T
*
y
,
T
*
out
,
T
*
out
,
const
int
stride
,
const
int
stride
,
const
int
N
,
const
int
N
,
IndexCalculator
index_calculator
)
{
phi
::
funcs
::
IndexCalculator
index_calculator
)
{
CUDA_KERNEL_LOOP
(
i
,
N
)
{
CUDA_KERNEL_LOOP
(
i
,
N
)
{
int
offset
=
index_calculator
(
i
);
int
offset
=
index_calculator
(
i
);
...
@@ -96,30 +94,50 @@ void CrossKernel(const Context& dev_ctx,
...
@@ -96,30 +94,50 @@ void CrossKernel(const Context& dev_ctx,
std
::
vector
<
int
>
cal_dims
;
std
::
vector
<
int
>
cal_dims
;
std
::
vector
<
int
>
left_strides
;
std
::
vector
<
int
>
left_strides
;
std
::
vector
<
int
>
full_strides
;
std
::
vector
<
int
>
full_strides
;
std
::
vector
<
int
>
merged_dims
;
for
(
int
i
=
0
;
i
<
dim
;
i
++
)
{
if
(
i
==
0
)
{
merged_dims
.
push_back
(
input_x_dims
[
i
]);
}
else
{
merged_dims
[
0
]
*=
input_x_dims
[
i
];
}
}
int
merge_axis
=
merged_dims
.
size
();
merged_dims
.
push_back
(
input_x_dims
[
dim
]);
for
(
int
i
=
dim
+
1
;
i
<
input_x_dims
.
size
();
i
++
)
{
if
(
i
==
dim
+
1
)
{
merged_dims
.
push_back
(
input_x_dims
[
i
]);
}
else
{
merged_dims
[
merge_axis
+
1
]
*=
input_x_dims
[
i
];
}
}
int
dims0
=
1
;
int
full_dim
=
1
;
int
dims1
=
1
;
for
(
int
i
=
0
;
i
<
merged_dims
.
size
();
i
++
)
{
for
(
auto
i
=
0
;
i
<
input_x_dims
.
size
();
i
++
)
{
full_strides
.
insert
(
full_strides
.
begin
(),
full_dim
);
full_strides
.
insert
(
full_strides
.
begin
(),
dims0
);
full_dim
*=
merged_dims
[
merged_dims
.
size
()
-
i
-
1
];
dims0
*=
input_x_dims
[
input_x_dims
.
size
()
-
i
-
1
];
if
(
i
==
merge_axis
)
{
if
(
i
==
dim
)
{
continue
;
continue
;
}
}
cal_dims
.
push_back
(
i
);
cal_dims
.
push_back
(
i
);
left_strides
.
insert
(
left_strides
.
begin
(),
dims1
);
}
dims1
*=
input_x_dims
[
input_x_dims
.
size
()
-
i
-
1
];
int
left_dim
=
1
;
for
(
int
i
=
merged_dims
.
size
()
-
1
;
i
>=
0
;
i
--
)
{
if
(
i
==
merge_axis
)
{
continue
;
}
left_strides
.
insert
(
left_strides
.
begin
(),
left_dim
);
left_dim
*=
merged_dims
[
i
];
}
}
const
auto
*
input_x_data
=
input_x
.
data
<
T
>
();
const
auto
*
input_x_data
=
input_x
.
data
<
T
>
();
const
auto
*
input_y_data
=
input_y
.
data
<
T
>
();
const
auto
*
input_y_data
=
input_y
.
data
<
T
>
();
auto
*
out_data
=
dev_ctx
.
template
Alloc
<
T
>(
out
);
auto
*
out_data
=
dev_ctx
.
template
Alloc
<
T
>(
out
);
auto
index_calculator
=
phi
::
funcs
::
IndexCalculator
(
auto
index_calculator
=
IndexCalculator
(
merged_dims
.
size
()
-
1
,
cal_dims
,
left_strides
,
full_strides
);
input_x_dims
.
size
()
-
1
,
cal_dims
,
left_strides
,
full_strides
);
int64_t
numel
=
x
.
numel
();
int64_t
numel
=
x
.
numel
();
backends
::
gpu
::
GpuLaunchConfig
config
=
backends
::
gpu
::
GpuLaunchConfig
config
=
backends
::
gpu
::
GetGpuLaunchConfig1D
(
dev_ctx
,
numel
/
3
);
backends
::
gpu
::
GetGpuLaunchConfig1D
(
dev_ctx
,
numel
/
3
);
...
@@ -129,7 +147,7 @@ void CrossKernel(const Context& dev_ctx,
...
@@ -129,7 +147,7 @@ void CrossKernel(const Context& dev_ctx,
dev_ctx
.
stream
()
>>>
(
input_x_data
,
dev_ctx
.
stream
()
>>>
(
input_x_data
,
input_y_data
,
input_y_data
,
out_data
,
out_data
,
full_strides
[
dim
],
full_strides
[
merge_axis
],
numel
/
3
,
numel
/
3
,
index_calculator
);
index_calculator
);
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录