Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
s920243400
PaddleDetection
提交
3dc88342
P
PaddleDetection
项目概览
s920243400
/
PaddleDetection
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleDetection
通知
2
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
3dc88342
编写于
11月 10, 2017
作者:
M
Markus Kliegl
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
conv shift op: change to CamelCase
上级
92b0c699
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
13 addition
and
13 deletion
+13
-13
paddle/operators/conv_shift_op.cu
paddle/operators/conv_shift_op.cu
+13
-13
未找到文件。
paddle/operators/conv_shift_op.cu
浏览文件 @
3dc88342
...
...
@@ -22,7 +22,7 @@ using framework::Tensor;
namespace
{
inline
int
div_u
p
(
int
x
,
int
y
)
{
return
(
x
+
y
-
1
)
/
y
;
}
inline
int
DivU
p
(
int
x
,
int
y
)
{
return
(
x
+
y
-
1
)
/
y
;
}
// Some notes on the design:
//
...
...
@@ -33,9 +33,9 @@ inline int div_up(int x, int y) { return (x + y - 1) / y; }
// y is fairly small. For large y, it would probably be more efficient
// to also tile across y.
template
<
typename
T
>
__global__
void
conv_shift_f
orward
(
const
T
*
x
,
const
T
*
y
,
T
*
out
,
int
x_width
,
int
y_width
,
int
y_half_width
,
int
batch_size
)
{
__global__
void
ConvShiftF
orward
(
const
T
*
x
,
const
T
*
y
,
T
*
out
,
int
x_width
,
int
y_width
,
int
y_half_width
,
int
batch_size
)
{
extern
__shared__
T
mem
[];
int
tx
=
threadIdx
.
x
;
...
...
@@ -79,8 +79,8 @@ __global__ void conv_shift_forward(const T *x, const T *y, T *out, int x_width,
// Compute x gradient - initial naive implementation with atomic add.
template
<
typename
T
>
__global__
void
conv_shift_dx
(
const
T
*
dout
,
const
T
*
y
,
T
*
dx
,
int
x_width
,
int
y_width
,
int
y_half_width
,
int
batch_size
)
{
__global__
void
ConvShiftGradX
(
const
T
*
dout
,
const
T
*
y
,
T
*
dx
,
int
x_width
,
int
y_width
,
int
y_half_width
,
int
batch_size
)
{
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
// x index
int
j
=
blockIdx
.
y
;
// y index
int
k
=
blockIdx
.
z
;
// batch index
...
...
@@ -94,8 +94,8 @@ __global__ void conv_shift_dx(const T *dout, const T *y, T *dx, int x_width,
// Compute y gradient - initial naive implementation with atomic add.
template
<
typename
T
>
__global__
void
conv_shift_d
y
(
const
T
*
x
,
const
T
*
dout
,
T
*
dy
,
int
x_width
,
int
y_width
,
int
y_half_width
,
int
batch_size
)
{
__global__
void
ConvShiftD
y
(
const
T
*
x
,
const
T
*
dout
,
T
*
dy
,
int
x_width
,
int
y_width
,
int
y_half_width
,
int
batch_size
)
{
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
// x index
int
j
=
blockIdx
.
y
;
// y index
int
k
=
blockIdx
.
z
;
// batch index
...
...
@@ -125,14 +125,14 @@ class ConvShiftKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
int
y_half_width
=
(
y_width
-
1
)
/
2
;
const
int
x_per_block
=
256
;
int
num_x_blocks
=
div_u
p
(
x_width
,
x_per_block
);
int
num_x_blocks
=
DivU
p
(
x_width
,
x_per_block
);
int
mem_per_block
=
(
x_per_block
+
2
*
y_width
)
*
sizeof
(
T
);
dim3
grid_dim
(
num_x_blocks
,
batch_size
);
auto
stream
=
context
.
cuda_device_context
().
stream
();
conv_shift_f
orward
<
T
><<<
grid_dim
,
x_per_block
,
mem_per_block
,
stream
>>>
(
ConvShiftF
orward
<
T
><<<
grid_dim
,
x_per_block
,
mem_per_block
,
stream
>>>
(
x_data
,
y_data
,
out_data
,
x_width
,
y_width
,
y_half_width
,
batch_size
);
}
};
...
...
@@ -160,20 +160,20 @@ class ConvShiftGradKernel<platform::GPUPlace, T>
auto
stream
=
context
.
cuda_device_context
().
stream
();
const
int
x_per_block
=
256
;
int
num_x_blocks
=
div_u
p
(
x_width
,
x_per_block
);
int
num_x_blocks
=
DivU
p
(
x_width
,
x_per_block
);
dim3
grid_dim
(
num_x_blocks
,
y_width
,
batch_size
);
if
(
dX
)
{
T
*
dx_data
=
dX
->
mutable_data
<
T
>
(
context
.
GetPlace
());
cudaMemsetAsync
(
dx_data
,
0
,
dX
->
numel
()
*
sizeof
(
T
),
stream
);
conv_shift_dx
<
T
><<<
grid_dim
,
x_per_block
,
0
,
stream
>>>
(
ConvShiftGradX
<
T
><<<
grid_dim
,
x_per_block
,
0
,
stream
>>>
(
dout_data
,
y_data
,
dx_data
,
x_width
,
y_width
,
y_half_width
,
batch_size
);
}
if
(
dY
)
{
T
*
dy_data
=
dY
->
mutable_data
<
T
>
(
context
.
GetPlace
());
cudaMemsetAsync
(
dy_data
,
0
,
dY
->
numel
()
*
sizeof
(
T
),
stream
);
conv_shift_d
y
<
T
><<<
grid_dim
,
x_per_block
,
0
,
stream
>>>
(
ConvShiftD
y
<
T
><<<
grid_dim
,
x_per_block
,
0
,
stream
>>>
(
x_data
,
dout_data
,
dy_data
,
x_width
,
y_width
,
y_half_width
,
batch_size
);
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录