Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
65d418f0
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
65d418f0
编写于
7月 27, 2018
作者:
T
tensor-tang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
complete im2col with padding==1 and speedup filter width==1
上级
52eb86e3
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
113 addition
and
125 deletion
+113
-125
paddle/fluid/operators/math/im2col.cc
paddle/fluid/operators/math/im2col.cc
+5
-3
paddle/fluid/operators/math/im2col_cfo_cpu.h
paddle/fluid/operators/math/im2col_cfo_cpu.h
+99
-119
paddle/fluid/operators/math/im2col_test.cc
paddle/fluid/operators/math/im2col_test.cc
+9
-3
未找到文件。
paddle/fluid/operators/math/im2col.cc
浏览文件 @
65d418f0
...
...
@@ -40,10 +40,12 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
dilation
[
1
]
==
1
)
{
if
(
padding
[
0
]
==
0
&&
padding
[
1
]
==
0
)
{
im2col_sh1sw1dh1dw1ph0pw0
<
T
>
(
im
,
col
);
}
else
{
im2col_sh1sw1dh1dw1
<
T
>
(
im
,
padding
,
col
);
return
;
}
else
if
(
padding
[
0
]
==
1
&&
padding
[
1
]
==
1
)
{
im2col_sh1sw1dh1dw1ph1pw1
<
T
>
(
im
,
col
);
return
;
}
return
;
// TODO(TJ): complete padding >=2
}
im2col_common
<
T
>
(
im
,
dilation
,
stride
,
padding
,
col
);
}
...
...
paddle/fluid/operators/math/im2col_cfo_cpu.h
浏览文件 @
65d418f0
...
...
@@ -21,7 +21,7 @@ namespace paddle {
namespace
operators
{
namespace
math
{
/*
/*
*
* The most common im2col algorithm.
* Support dilation, stride and padding.
*/
...
...
@@ -61,9 +61,9 @@ inline void im2col_common(const framework::Tensor& im,
}
}
/*
/*
*
* im2col algorithm with strides == 1, dilations == 1, paddings == 0
*
*
/
*/
template
<
typename
T
>
inline
void
im2col_sh1sw1dh1dw1ph0pw0
(
const
framework
::
Tensor
&
im
,
framework
::
Tensor
*
col
)
{
...
...
@@ -96,11 +96,13 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im,
}
}
// further optimize: padding == 1 need special
/**
* im2col algorithm with strides == 1, dilations == 1, paddings == 1
* and filter_width == 1 have a special implementation
*/
template
<
typename
T
>
inline
void
im2col_sh1sw1dh1dw1
(
const
framework
::
Tensor
&
im
,
const
std
::
vector
<
int
>&
padding
,
framework
::
Tensor
*
col
)
{
inline
void
im2col_sh1sw1dh1dw1ph1pw1
(
const
framework
::
Tensor
&
im
,
framework
::
Tensor
*
col
)
{
int
im_channels
=
im
.
dims
()[
0
];
int
im_height
=
im
.
dims
()[
1
];
int
im_width
=
im
.
dims
()[
2
];
...
...
@@ -108,119 +110,57 @@ inline void im2col_sh1sw1dh1dw1(const framework::Tensor& im,
int
filter_width
=
col
->
dims
()[
2
];
int
output_height
=
col
->
dims
()[
3
];
int
output_width
=
col
->
dims
()[
4
];
constexpr
int
sh
=
1
;
constexpr
int
sw
=
1
;
constexpr
int
plh
=
1
;
constexpr
int
prh
=
1
;
constexpr
int
plw
=
1
;
constexpr
int
prw
=
1
;
const
T
*
im_data
=
im
.
data
<
T
>
();
T
*
col_data
=
col
->
data
<
T
>
();
int
col_matrix_width
=
output_width
*
output_height
;
int
im_size
=
im_height
*
im_width
;
int
plh
=
padding
[
0
];
int
plw
=
padding
[
1
];
int
prh
=
(
output_height
-
1
)
*
sh
+
filter_height
-
im_height
-
plh
;
int
prw
=
(
output_width
-
1
)
*
sw
+
filter_width
-
im_width
-
plw
;
// fill height padding : 0 ~ plh-1, (oh-prh) ~ (oh-1)
// TODO(TJ): refine ph*xxx
assert
(
plh
==
prh
);
// because stride_h == 1
int
col_matrix_width
=
output_width
*
output_height
;
int
col_block_fh
=
filter_width
*
col_matrix_width
;
// fw*oh*ow
int
col_block_ic
=
filter_height
*
col_block_fh
;
// fh*fw*oh*ow
for
(
int
ph
=
0
;
ph
<
plh
;
++
ph
)
{
int
sz
=
output_width
*
(
plh
-
ph
);
size_t
copy_sz
=
sizeof
(
T
)
*
sz
;
T
*
col_start_l
=
col_data
+
ph
*
col_block_fh
;
T
*
col_start_r
=
col_data
+
(
filter_height
-
ph
-
1
)
*
col_block_fh
+
col_matrix_width
-
sz
;
// fill height padding
{
size_t
copy_size
=
sizeof
(
T
)
*
output_width
;
T
*
col_start_l
=
col_data
;
T
*
col_start_r
=
col_data
+
(
filter_height
-
1
)
*
col_block_fh
+
col_matrix_width
-
output_width
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
// TODO(TJ): move * outside
T
*
dst_data_l
=
col_start_l
+
ic
*
col_block_ic
;
T
*
dst_data_r
=
col_start_r
+
ic
*
col_block_ic
;
for
(
int
kw
=
0
;
kw
<
filter_width
;
++
kw
)
{
std
::
memset
(
dst_data_l
,
0
,
copy_s
z
);
std
::
memset
(
dst_data_r
,
0
,
copy_s
z
);
std
::
memset
(
dst_data_l
,
0
,
copy_s
ize
);
std
::
memset
(
dst_data_r
,
0
,
copy_s
ize
);
dst_data_l
=
dst_data_l
+
col_matrix_width
;
dst_data_r
=
dst_data_r
+
col_matrix_width
;
}
}
}
// fill width padding
assert
(
plw
==
prw
);
// because stride_w == 1
if
(
plw
==
1
)
{
auto
pad
=
static_cast
<
T
>
(
0
);
// padding zero
auto
pad
=
static_cast
<
T
>
(
0
);
if
(
filter_width
==
1
)
{
// fill width padding
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
// TODO(TJ):
use add and reuse str
ide
// TODO(TJ):
move * outs
ide
T
*
dst_data_ic
=
col_data
+
ic
*
col_block_ic
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
T
*
dst_data_kh
=
dst_data_ic
+
kh
*
col_block_fh
;
for
(
T
*
dst_data
:
{
dst_data_kh
,
dst_data_kh
+
(
filter_width
-
prw
)
*
col_matrix_width
+
output_width
-
1
})
{
// TODO(TJ): from plh, saving repeated assignment
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
*
dst_data
=
pad
;
dst_data
=
dst_data
+
output_width
;
}
// TODO(TJ): move * outside
T
*
dst_data
=
dst_data_ic
+
kh
*
col_block_fh
;
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
*
dst_data
=
pad
;
dst_data
=
dst_data
+
output_width
-
1
;
*
dst_data
=
pad
;
++
dst_data
;
}
}
}
}
else
{
// padding_size > 1
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
// TODO(TJ): use add and reuse stride
T
*
dst_data_ic
=
col_data
+
ic
*
col_block_ic
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
T
*
dst_data_kh
=
dst_data_ic
+
kh
*
col_block_fh
;
for
(
int
kw
=
0
;
kw
<
plw
;
++
kw
)
{
// TODO(TJ): reuse array outside this for
size_t
sz
=
sizeof
(
T
)
*
(
plw
-
kw
);
T
*
dst_data
=
dst_data_kh
+
kw
*
col_matrix_width
;
// TODO(TJ): from plh, saving repeated assignment
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
std
::
memset
(
dst_data
,
0
,
sz
);
dst_data
=
dst_data
+
output_width
;
}
}
// TODO(TJ): use reverse to save cache
for
(
int
kw
=
0
;
kw
<
prw
;
++
kw
)
{
// TODO(TJ): reuse array outside this for
auto
num
=
(
prw
-
kw
);
size_t
sz
=
sizeof
(
T
)
*
num
;
T
*
dst_data
=
dst_data_kh
+
(
filter_width
-
1
-
kw
)
*
col_matrix_width
+
output_width
-
num
;
// TODO(TJ): from plh, saving repeated assignment
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
std
::
memset
(
dst_data
,
0
,
sz
);
dst_data
=
dst_data
+
output_width
;
}
}
}
}
}
// fill im_data
// padding covers two cases:
// 1. kw > 2*pw: kw = 3, pw = 1
// 0 x x x x ... x x x x 0
// 1 1 1 1 1 1
// ==>
// 0 x ... x x
// x x ... x x
// x x ... x 0
// 2. kw < 2*pw: kw = 3, pw = 2
// 0 0 x x x ... x x x 0 0
// 1 1 1 1 1 1
// ==>
// 0 0 x ... x x x
// 0 x x ... x x 0
// x x x ... x 0 0
// TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
// (output_width-1)}
// length of copy_size is equal to kw.
if
(
plw
+
prw
<
filter_width
)
{
// fill core
size_t
copy_size
=
sizeof
(
T
)
*
(
output_width
-
plw
-
prw
);
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
const
T
*
im_data_start
=
im_data
+
(
oh
-
plh
>
0
?
oh
-
plh
:
0
)
*
im_width
;
...
...
@@ -230,33 +170,73 @@ inline void im2col_sh1sw1dh1dw1(const framework::Tensor& im,
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
if
((
oh
<
plh
&&
kh
<
plh
)
||
(
oh
>
(
output_height
-
prh
-
1
)
&&
kh
>
(
filter_height
-
prh
-
1
)))
{
dst_data
=
dst_data
+
filter_width
*
col_matrix_width
;
continue
;
}
// TODO(TJ): reuse plw-kw outside this for
// try to unify
for
(
int
kw
=
0
;
kw
<
plw
;
++
kw
)
{
std
::
memcpy
(
dst_data
+
(
plw
-
kw
),
src_data
,
sizeof
(
T
)
*
(
output_width
-
(
plw
-
kw
)));
dst_data
=
dst_data
+
col_matrix_width
;
}
for
(
int
kw
=
plw
;
kw
<
filter_width
-
prw
;
++
kw
)
{
std
::
memcpy
(
dst_data
,
src_data
+
(
kw
-
plw
),
sizeof
(
T
)
*
output_width
);
dst_data
=
dst_data
+
col_matrix_width
;
}
int
i
=
1
;
for
(
int
kw
=
filter_width
-
prw
;
kw
<
filter_width
;
++
kw
,
++
i
)
{
std
::
memcpy
(
dst_data
,
src_data
+
(
kw
-
plw
),
sizeof
(
T
)
*
(
output_width
-
i
));
dst_data
=
dst_data
+
col_matrix_width
;
continue
;
}
std
::
memcpy
(
dst_data
+
plw
,
src_data
,
copy_size
);
dst_data
=
dst_data
+
col_matrix_width
;
src_data
=
src_data
+
im_width
;
}
}
}
}
else
{
LOG
(
FATAL
)
<<
"Not implement yet"
;
return
;
}
// filter_width != 1
// fill width padding
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
// TODO(TJ): move * outside
T
*
dst_data_ic
=
col_data
+
ic
*
col_block_ic
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
// TODO(TJ): move * outside
T
*
dst_data_kh
=
dst_data_ic
+
kh
*
col_block_fh
;
for
(
T
*
dst_data
:
{
dst_data_kh
,
dst_data_kh
+
(
filter_width
-
prw
)
*
col_matrix_width
+
output_width
-
1
})
{
// TODO(TJ): from plh, saving repeated assignment
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
*
dst_data
=
pad
;
dst_data
=
dst_data
+
output_width
;
}
}
}
}
// TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
// (output_width-1)}
// length of copy_size is equal to kw.
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
const
T
*
im_data_start
=
im_data
+
(
oh
-
plh
>
0
?
oh
-
plh
:
0
)
*
im_width
;
T
*
dst_data
=
col_data
+
oh
*
output_width
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
const
T
*
src_data
=
im_data_start
+
ic
*
im_size
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
if
((
oh
<
plh
&&
kh
<
plh
)
||
(
oh
>
(
output_height
-
prh
-
1
)
&&
kh
>
(
filter_height
-
prh
-
1
)))
{
dst_data
=
dst_data
+
filter_width
*
col_matrix_width
;
continue
;
}
// TODO(TJ): reuse plw-kw outside this for
// try to unify
for
(
int
kw
=
0
;
kw
<
plw
;
++
kw
)
{
std
::
memcpy
(
dst_data
+
(
plw
-
kw
),
src_data
,
sizeof
(
T
)
*
(
output_width
-
(
plw
-
kw
)));
dst_data
=
dst_data
+
col_matrix_width
;
}
for
(
int
kw
=
plw
;
kw
<
filter_width
-
prw
;
++
kw
)
{
std
::
memcpy
(
dst_data
,
src_data
+
(
kw
-
plw
),
sizeof
(
T
)
*
output_width
);
dst_data
=
dst_data
+
col_matrix_width
;
}
int
i
=
1
;
for
(
int
kw
=
filter_width
-
prw
;
kw
<
filter_width
;
++
kw
,
++
i
)
{
std
::
memcpy
(
dst_data
,
src_data
+
(
kw
-
plw
),
sizeof
(
T
)
*
(
output_width
-
i
));
dst_data
=
dst_data
+
col_matrix_width
;
}
src_data
=
src_data
+
im_width
;
}
}
}
}
...
...
paddle/fluid/operators/math/im2col_test.cc
浏览文件 @
65d418f0
...
...
@@ -227,7 +227,8 @@ void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
auto
t3
=
GetCurrentMs
();
LOG
(
INFO
)
<<
"before: "
<<
(
t3
-
t2
)
/
repeat
<<
",after: "
<<
(
t2
-
t1
)
/
repeat
;
<<
",after: "
<<
(
t2
-
t1
)
/
repeat
<<
",boost: "
<<
((
t3
-
t2
)
/
(
t2
-
t1
)
-
1
)
*
100
<<
"%"
;
}
TEST
(
math
,
im2col_cputest
)
{
...
...
@@ -244,6 +245,10 @@ TEST(math, im2col_cputest) {
// height != width
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
5
,
/*iw*/
4
,
/*fh*/
2
,
/*fw*/
3
,
/*ph*/
p
,
/*pw*/
p
);
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
5
,
/*iw*/
4
,
/*fh*/
1
,
/*fw*/
3
,
/*ph*/
p
,
/*pw*/
p
);
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
4
,
/*iw*/
5
,
/*fh*/
3
,
/*fw*/
1
,
/*ph*/
p
,
/*pw*/
p
);
// filter == 1
testIm2colCPU
(
/*ic*/
3
,
/*ih*/
4
,
/*iw*/
4
,
/*fh*/
1
,
/*fw*/
1
,
/*ph*/
p
,
...
...
@@ -251,13 +256,14 @@ TEST(math, im2col_cputest) {
testIm2colCPU
(
/*ic*/
3
,
/*ih*/
3
,
/*iw*/
4
,
/*fh*/
1
,
/*fw*/
1
,
/*ph*/
p
,
/*pw*/
p
);
}
// padding_h != padding_w
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
4
,
/*iw*/
4
,
/*fh*/
2
,
/*fw*/
3
,
/*ph*/
1
,
/*pw*/
2
);
// benchmark
for
(
int
p
:
{
0
,
1
,
2
})
{
for
(
int
k
:
{
3
,
5
})
{
for
(
int
p
:
{
0
,
1
})
{
for
(
int
k
:
{
1
,
3
,
5
})
{
LOG
(
INFO
)
<<
"padding == "
<<
p
<<
", filter == "
<<
k
;
benchIm2col
(
/*ic*/
3
,
/*ih*/
224
,
/*iw*/
224
,
/*fh*/
k
,
/*fw*/
k
,
/*ph*/
p
,
/*pw*/
p
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录