Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
292c1951
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
292c1951
编写于
2月 02, 2018
作者:
Q
qingqing01
提交者:
GitHub
2月 02, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #7441 from hedaoyuan/inference
Some optimization of CNN model computation.
上级
148d35fe
784e5940
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
77 addition
and
69 deletion
+77
-69
paddle/function/GemmConvOp.cpp
paddle/function/GemmConvOp.cpp
+35
-28
paddle/function/Im2Col.h
paddle/function/Im2Col.h
+28
-25
paddle/function/Im2ColTest.cpp
paddle/function/Im2ColTest.cpp
+3
-3
paddle/math/Matrix.cpp
paddle/math/Matrix.cpp
+11
-13
未找到文件。
paddle/function/GemmConvOp.cpp
浏览文件 @
292c1951
...
...
@@ -178,19 +178,22 @@ public:
real
*
inputData
=
inputs
[
0
].
data
<
real
>
();
real
*
filterData
=
inputs
[
1
].
data
<
real
>
();
real
*
outputData
=
outputs
[
0
].
data
<
real
>
();
real
*
colData
=
NULL
;
bool
needIm2col
=
isNeedIm2col
(
filter
);
TensorShape
imShape
=
TensorShape
({
inputChannels
/
groups_
,
inputHeight
,
inputWidth
});
TensorShape
colShape
;
real
*
colData
=
NULL
;
size_t
colHeight
=
inputChannels
/
groups_
*
filterHeight
*
filterWidth
;
size_t
colWidth
=
outputHeight
*
outputWidth
;
// Max col matrix height 256, Max col matrix width 1024
size_t
stepColHeight
=
std
::
min
(
colHeight
,
static_cast
<
size_t
>
(
256
));
size_t
stepColWidth
=
std
::
min
(
colWidth
,
static_cast
<
size_t
>
(
2048
));
// Max col matrix width 4096, Max col matrix size 4M.
size_t
outputHeightSteps
=
std
::
min
(
std
::
max
(
4096
/
outputWidth
,
(
size_t
)
1
),
outputHeight
);
size_t
maxColWidth
=
outputHeightSteps
*
outputWidth
;
size_t
channelSteps
=
std
::
min
(
std
::
max
((
1048576
/
maxColWidth
)
/
filterHeight
*
filterWidth
,
(
size_t
)
1
),
inputChannels
/
groups_
);
size_t
maxColHeight
=
channelSteps
*
filterHeight
*
filterWidth
;
if
(
needIm2col
)
{
colShape
=
TensorShape
({
inputChannels
/
groups_
,
...
...
@@ -199,7 +202,7 @@ public:
outputHeight
,
outputWidth
});
resizeBuffer
<
Device
>
(
stepColHeight
*
step
ColWidth
*
sizeof
(
real
));
resizeBuffer
<
Device
>
(
maxColHeight
*
max
ColWidth
*
sizeof
(
real
));
colData
=
reinterpret_cast
<
real
*>
(
memory_
->
getBuf
());
}
...
...
@@ -209,20 +212,24 @@ public:
(
outputChannels
/
groups_
)
*
outputHeight
*
outputWidth
;
size_t
filterOffset
=
filter
.
getElements
()
/
groups_
;
int
nStride
=
col
Width
;
int
kStride
=
colHeight
;
int
nStride
=
outputHeight
*
output
Width
;
int
kStride
=
inputChannels
/
groups_
*
filterHeight
*
filterWidth
;
for
(
size_t
i
=
0
;
i
<
batchSize
;
i
++
)
{
filterData
=
inputs
[
1
].
data
<
real
>
();
for
(
size_t
g
=
0
;
g
<
groups_
;
g
++
)
{
if
(
needIm2col
)
{
real
beta_
=
beta
;
for
(
size_t
colHeightStart
=
0
;
colHeightStart
<
colHeight
;
colHeightStart
+=
stepColHeight
)
{
for
(
size_t
colWidthStart
=
0
;
colWidthStart
<
colWidth
;
colWidthStart
+=
stepColWidth
)
{
int
N
=
std
::
min
(
colWidth
-
colWidthStart
,
stepColWidth
);
int
K
=
std
::
min
(
colHeight
-
colHeightStart
,
stepColHeight
);
for
(
size_t
ic
=
0
;
ic
<
inputChannels
/
groups_
;
ic
+=
channelSteps
)
{
int
channels
=
std
::
min
(
inputChannels
/
groups_
-
ic
,
channelSteps
);
for
(
size_t
oh
=
0
;
oh
<
outputHeight
;
oh
+=
outputHeightSteps
)
{
int
height
=
std
::
min
(
outputHeight
-
oh
,
outputHeightSteps
);
int
M
=
outputChannels
/
groups_
;
int
N
=
height
*
outputWidth
;
int
K
=
channels
*
filterHeight
*
filterWidth
;
// im2col
im2col
(
inputData
+
g
*
inputOffset
,
im2col
(
inputData
,
imShape
,
colData
,
colShape
,
...
...
@@ -232,13 +239,12 @@ public:
paddingW
(),
dilationH
(),
dilationW
(),
c
olHeightStart
,
K
,
colWidthStar
t
,
c
hannels
,
oh
,
heigh
t
,
N
);
// gemm
int
M
=
outputChannels
/
groups_
;
BlasGemm
<
Device
,
real
>::
compute
(
false
,
false
,
...
...
@@ -246,12 +252,12 @@ public:
N
,
K
,
1.0
f
,
filterData
+
g
*
filterOffset
+
colHeightStart
,
filterData
+
ic
*
filterHeight
*
filterWidth
,
kStride
,
colData
,
N
,
beta_
,
outputData
+
g
*
outputOffset
+
colWidthStart
,
outputData
+
oh
*
outputWidth
,
nStride
);
}
beta_
=
1.0
;
...
...
@@ -266,17 +272,18 @@ public:
N
,
K
,
1.0
f
,
filterData
+
g
*
filterOffset
,
filterData
,
K
,
inputData
+
g
*
inputOffset
,
inputData
,
N
,
beta
,
outputData
+
g
*
outputOffset
,
outputData
,
N
);
}
inputData
+=
inputOffset
;
outputData
+=
outputOffset
;
filterData
+=
filterOffset
;
}
inputData
+=
inputChannels
*
inputHeight
*
inputWidth
;
outputData
+=
outputChannels
*
outputHeight
*
outputWidth
;
}
memory_
.
reset
();
...
...
paddle/function/Im2Col.h
浏览文件 @
292c1951
...
...
@@ -111,39 +111,42 @@ public:
int
paddingWidth
,
int
dilationHeight
,
int
dilationWidth
,
int
colHeightStart
,
int
col
HeightSize
,
int
col
WidthStar
t
,
int
colWidth
Size
)
{
int
inputChannels
,
int
col
Offset
,
int
col
OutputHeigh
t
,
int
colWidth
)
{
int
inputHeight
=
imShape
[
1
];
int
inputWidth
=
imShape
[
2
];
int
filterHeight
=
colShape
[
1
];
int
filterWidth
=
colShape
[
2
];
int
outputWidth
=
colShape
[
4
];
for
(
int
colh
=
0
;
colh
<
colHeightSize
;
colh
++
)
{
int
wOffset
=
(
colHeightStart
+
colh
)
%
filterWidth
;
int
hOffset
=
((
colHeightStart
+
colh
)
/
filterWidth
)
%
filterHeight
;
int
c_im
=
(
colHeightStart
+
colh
)
/
filterWidth
/
filterHeight
;
for
(
int
colw
=
0
;
colw
<
colWidthSize
;
colw
++
)
{
int
h
=
(
colWidthStart
+
colw
)
/
outputWidth
;
int
w
=
(
colWidthStart
+
colw
)
%
outputWidth
;
int
imRowIdx
=
h
*
strideHeight
+
hOffset
*
dilationHeight
;
int
imColIdx
=
w
*
strideWidth
+
wOffset
*
dilationWidth
;
if
((
imRowIdx
-
paddingHeight
)
<
0
||
(
imRowIdx
-
paddingHeight
)
>=
inputHeight
||
(
imColIdx
-
paddingWidth
)
<
0
||
(
imColIdx
-
paddingWidth
)
>=
inputWidth
)
{
colData
[
colh
*
colWidthSize
+
colw
]
=
static_cast
<
T
>
(
0
);
}
else
{
imRowIdx
+=
c_im
*
inputHeight
-
paddingHeight
;
imColIdx
-=
paddingWidth
;
colData
[
colh
*
colWidthSize
+
colw
]
=
imData
[
imRowIdx
*
inputWidth
+
imColIdx
];
for
(
int
ic
=
0
;
ic
<
inputChannels
;
ic
++
)
{
for
(
int
oh
=
0
;
oh
<
colOutputHeight
;
oh
++
)
{
T
*
dstData
=
colData
+
oh
*
outputWidth
;
for
(
int
fh
=
0
;
fh
<
filterHeight
;
fh
++
)
{
for
(
int
fw
=
0
;
fw
<
filterWidth
;
fw
++
)
{
int
imRowIdx
=
(
oh
+
colOffset
)
*
strideHeight
+
fh
*
dilationHeight
-
paddingHeight
;
if
(
imRowIdx
<
0
||
imRowIdx
>=
inputHeight
)
{
memset
(
dstData
,
0
,
outputWidth
*
sizeof
(
T
));
}
else
{
for
(
int
ow
=
0
;
ow
<
outputWidth
;
ow
++
)
{
int
imColIdx
=
ow
*
strideWidth
+
fw
*
dilationWidth
-
paddingWidth
;
if
(
imColIdx
<
0
||
imColIdx
>=
inputWidth
)
{
dstData
[
ow
]
=
T
(
0
);
}
else
{
dstData
[
ow
]
=
imData
[
imRowIdx
*
inputWidth
+
imColIdx
];
}
}
}
dstData
+=
colWidth
;
}
}
}
colData
+=
filterHeight
*
filterWidth
*
colWidth
;
imData
+=
inputHeight
*
inputWidth
;
}
}
};
...
...
paddle/function/Im2ColTest.cpp
浏览文件 @
292c1951
...
...
@@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() {
padding
,
dilation
,
dilation
,
channels
,
0
,
height
,
0
,
width
);
outputHeight
,
outputHeight
*
outputWidth
);
autotest
::
TensorCheckEqual
(
*
output1
,
*
output2
);
}
...
...
paddle/math/Matrix.cpp
浏览文件 @
292c1951
...
...
@@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
CHECK_EQ
(
channels
*
outLength
,
maskMatP
->
getWidth
());
}
/* initialize the data_ */
for
(
size_t
i
=
0
;
i
<
height_
;
i
++
)
{
for
(
size_t
j
=
0
;
j
<
width_
;
j
++
)
{
outData
[
i
*
outStride
+
j
]
=
-
(
real
)
FLT_MAX
;
}
}
/* pool max one by one */
for
(
size_t
n
=
0
;
n
<
num
;
++
n
)
{
// frame by frame
if
(
!
isContiguous
())
{
...
...
@@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
for
(
size_t
c
=
0
;
c
<
channels
;
++
c
)
{
// channel by channel
for
(
size_t
ph
=
0
;
ph
<
outputH
;
++
ph
)
{
int
hstart
=
ph
*
strideH
-
paddingH
;
int
hend
=
std
::
min
(
hstart
+
sizeY
,
imgSizeH
);
hstart
=
std
::
max
(
hstart
,
0
);
int
hend
=
hstart
+
sizeY
;
hstart
=
hstart
<
0
?
0
:
hstart
;
hend
=
hend
<
(
int
)
imgSizeH
?
hend
:
(
int
)
imgSizeH
;
for
(
size_t
pw
=
0
;
pw
<
outputW
;
++
pw
)
{
int
wstart
=
pw
*
strideW
-
paddingW
;
int
wend
=
std
::
min
(
wstart
+
sizeX
,
imgSizeW
);
wstart
=
std
::
max
(
wstart
,
0
);
int
wend
=
wstart
+
sizeX
;
wstart
=
wstart
<
0
?
0
:
wstart
;
wend
=
wend
<
(
int
)
imgSizeW
?
wend
:
(
int
)
imgSizeW
;
if
(
maskData
==
NULL
)
{
real
tmp
=
-
(
real
)
FLT_MAX
;
for
(
int
h
=
hstart
;
h
<
hend
;
++
h
)
{
for
(
int
w
=
wstart
;
w
<
wend
;
++
w
)
{
outData
[
ph
*
outputW
+
pw
]
=
std
::
max
(
outData
[
ph
*
outputW
+
pw
],
inputData
[
h
*
imgSizeW
+
w
]);
tmp
=
tmp
<
inputData
[
h
*
imgSizeW
+
w
]
?
inputData
[
h
*
imgSizeW
+
w
]
:
tmp
;
}
}
outData
[
ph
*
outputW
+
pw
]
=
tmp
;
}
else
{
for
(
int
h
=
hstart
;
h
<
hend
;
++
h
)
{
for
(
int
w
=
wstart
;
w
<
wend
;
++
w
)
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录