Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
efae51ce
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
efae51ce
编写于
7月 07, 2017
作者:
X
xzl
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add the mobilenet gpu acceleration, cpu is in the process
上级
eeb17c26
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
130 addition
and
102 deletion
+130
-102
paddle/function/DepthwiseConvOp.cpp
paddle/function/DepthwiseConvOp.cpp
+9
-10
paddle/function/DepthwiseConvOp.h
paddle/function/DepthwiseConvOp.h
+2
-5
paddle/function/DepthwiseConvOpGpu.cu
paddle/function/DepthwiseConvOpGpu.cu
+115
-86
paddle/gserver/layers/ConvBaseLayer.cpp
paddle/gserver/layers/ConvBaseLayer.cpp
+2
-1
paddle/gserver/layers/DepthwiseConvLayer.cpp
paddle/gserver/layers/DepthwiseConvLayer.cpp
+2
-0
未找到文件。
paddle/function/DepthwiseConvOp.cpp
浏览文件 @
efae51ce
...
...
@@ -18,11 +18,6 @@ limitations under the License. */
namespace
paddle
{
/*
* imData = [input_channels, input_height, input_width]
* colData = [input_channels, filter_height, filter_width,
* output_height, output_width]
*/
template
<
class
T
>
class
DepthwiseConvFunctor
<
DEVICE_TYPE_CPU
,
T
>
{
public:
...
...
@@ -33,6 +28,8 @@ public:
int
outputChannels
,
int
outputHeight
,
int
outputWidth
,
int
inputHeight
,
int
inputWidth
,
int
filterHeight
,
int
filterWidth
,
int
strideH
,
...
...
@@ -40,7 +37,7 @@ public:
int
paddingH
,
int
paddingW
,
T
*
outputData
)
{
//
NO_IMPLEMENTATION
//
TODO(zhaolong) : cpu implementation of depthwise convolution
}
};
...
...
@@ -118,8 +115,8 @@ public:
size_t
batchSize
=
input
[
0
];
// size_t inputChannels = input[1];
//
size_t inputHeight = input[2];
//
size_t inputWidth = input[3];
size_t
inputHeight
=
input
[
2
];
size_t
inputWidth
=
input
[
3
];
size_t
filterHeight
=
getFilterHeight
(
filter
);
size_t
filterWidth
=
getFilterWidth
(
filter
);
size_t
outputChannels
=
output
[
1
];
...
...
@@ -139,6 +136,8 @@ public:
outputChannels
,
outputHeight
,
outputWidth
,
inputHeight
,
inputWidth
,
filterHeight
,
filterWidth
,
strideH
(),
...
...
@@ -233,8 +232,8 @@ public:
}
void
calc
(
const
BufferArgs
&
inputs
,
const
BufferArgs
&
outputs
)
override
{
CHECK_EQ
(
numInputs_
,
inputs
.
size
());
CHECK_EQ
(
numOutputs_
,
outputs
.
size
());
//
CHECK_EQ(numInputs_, inputs.size());
//
CHECK_EQ(numOutputs_, outputs.size());
check
(
inputs
,
outputs
);
const
TensorShape
&
output
=
inputs
[
0
].
shape
();
const
TensorShape
&
input
=
inputs
[
1
].
shape
();
...
...
paddle/function/DepthwiseConvOp.h
浏览文件 @
efae51ce
...
...
@@ -18,11 +18,6 @@ limitations under the License. */
namespace
paddle
{
/*
* imData = [input_channels, input_height, input_width]
* colData = [input_channels, filter_height, filter_width,
* output_height, output_width]
*/
template
<
DeviceType
Device
,
class
T
>
class
DepthwiseConvFunctor
{
public:
...
...
@@ -33,6 +28,8 @@ public:
int
outputChannels
,
int
outputHeight
,
int
outputWidth
,
int
inputHeight
,
int
intputWidth
,
int
filterHeight
,
int
filterWidth
,
int
strideH
,
...
...
paddle/function/DepthwiseConvOpGpu.cu
浏览文件 @
efae51ce
...
...
@@ -14,73 +14,95 @@ limitations under the License. */
#include "ConvOp.h"
#include "DepthwiseConvOp.h"
#include "GemmFunctor.h"
#include "paddle/math/MemoryHandle.h"
namespace
paddle
{
template
<
class
T
>
__global__
void
ConvolutionDepthwiseWeightForward
(
const
int
nthreads
,
const
T
*
const
bottom_data
,
const
T
*
const
weight_data
,
const
int
num
,
const
int
channels
,
const
int
top_height
,
const
int
top_width
,
const
int
bottom_height
,
const
int
bottom_width
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
T
*
const
top_data
)
{
__global__
void
ConvolutionDepthwiseForward
(
const
int
nthreads
,
const
T
*
const
inputData
,
const
T
*
const
filterData
,
const
int
batchSize
,
const
int
outputChannels
,
const
int
outputHeight
,
const
int
outputWidth
,
const
int
inputHeight
,
const
int
inputWidth
,
const
int
filterHeight
,
const
int
filterWidth
,
const
int
strideH
,
const
int
strideW
,
const
int
paddingH
,
const
int
paddingW
,
T
*
const
outputData
)
{
int
index
=
(
blockIdx
.
x
*
gridDim
.
y
+
blockIdx
.
y
)
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
index
<
nthreads
)
{
const
int
n
=
index
/
channels
/
top_height
/
top_w
idth
;
const
int
c
=
(
index
/
top_height
/
top_width
)
%
c
hannels
;
const
int
h
=
(
index
/
top_width
)
%
top_h
eight
;
const
int
w
=
index
%
top_w
idth
;
const
T
*
weight
=
weight_data
+
c
*
kernel_h
*
kernel_w
;
const
int
n
=
index
/
outputChannels
/
outputHeight
/
outputW
idth
;
const
int
c
=
(
index
/
outputHeight
/
outputWidth
)
%
outputC
hannels
;
const
int
h
=
(
index
/
outputWidth
)
%
outputH
eight
;
const
int
w
=
index
%
outputW
idth
;
const
T
*
weight
=
filterData
+
c
*
filterHeight
*
filterWidth
;
T
value
=
0
;
for
(
int
kh
=
0
;
kh
<
kernel_h
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
kernel_w
;
++
kw
)
{
const
int
h_in
=
-
pad_h
+
h
*
stride_h
+
kh
*
dilation_h
;
const
int
w_in
=
-
pad_w
+
w
*
stride_w
+
kw
*
dilation_w
;
if
((
h_in
>=
0
)
&&
(
h_in
<
bottom_height
)
&&
(
w_in
>=
0
)
&&
(
w_in
<
bottom_width
))
{
const
int
offset
=
((
n
*
channels
+
c
)
*
bottom_height
+
h_in
)
*
bottom_width
+
w_in
;
value
+=
(
*
weight
)
*
bottom_data
[
offset
];
}
++
weight
;
}
}
top_data
[
index
]
=
value
;
const
int
h_in_start
=
-
paddingH
+
h
*
strideH
;
const
int
w_in_start
=
-
paddingW
+
w
*
strideW
;
const
int
h_in_end
=
-
paddingH
+
h
*
strideH
+
filterHeight
-
1
;
const
int
w_in_end
=
-
paddingW
+
w
*
strideW
+
filterWidth
-
1
;
if
((
h_in_start
>=
0
)
&&
(
h_in_end
<
inputHeight
)
&&
(
w_in_start
>=
0
)
&&
(
w_in_end
<
inputWidth
))
{
for
(
int
kh
=
0
;
kh
<
filterHeight
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
filterWidth
;
++
kw
)
{
const
int
h_in
=
-
paddingH
+
h
*
strideH
+
kh
;
const
int
w_in
=
-
paddingW
+
w
*
strideW
+
kw
;
const
int
offset
=
((
n
*
outputChannels
+
c
)
*
inputHeight
+
h_in
)
*
inputWidth
+
w_in
;
value
+=
(
*
weight
)
*
inputData
[
offset
];
++
weight
;
}
}
}
else
{
for
(
int
kh
=
0
;
kh
<
filterHeight
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
filterWidth
;
++
kw
)
{
const
int
h_in
=
-
paddingH
+
h
*
strideH
+
kh
;
const
int
w_in
=
-
paddingW
+
w
*
strideW
+
kw
;
if
((
h_in
>=
0
)
&&
(
h_in
<
inputHeight
)
&&
(
w_in
>=
0
)
&&
(
w_in
<
inputWidth
))
{
const
int
offset
=
((
n
*
outputChannels
+
c
)
*
inputHeight
+
h_in
)
*
inputWidth
+
w_in
;
value
+=
(
*
weight
)
*
inputData
[
offset
];
}
++
weight
;
}
}
}
outputData
[
index
]
=
value
;
}
}
template
<
class
T
>
__global__
void
ConvolutionDepthwiseBottomBackward
(
const
int
nthreads
,
__global__
void
ConvolutionDepthwiseInputBackward
(
const
int
nthreads
,
const
T
*
const
top_diff
,
const
T
*
const
weight_data
,
const
int
num
,
const
int
channels
,
const
int
top_h
eight
,
const
int
top_width
,
const
int
bottom_height
,
const
int
bottom_w
idth
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride
_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
T
*
const
bottom_diff
)
{
const
int
num
,
const
int
outputChannels
,
const
int
outputH
eight
,
const
int
outputWidth
,
const
int
inputHeight
,
const
int
inputW
idth
,
const
int
filterHeight
,
const
int
filterWidth
,
const
int
strideH
,
const
int
stride
W
,
const
int
paddingH
,
const
int
paddingW
,
T
*
const
bottom_diff
)
{
int
index
=
(
blockIdx
.
x
*
gridDim
.
y
+
blockIdx
.
y
)
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
index
<
nthreads
)
{
const
int
n
=
index
/
channels
/
bottom_height
/
bottom_w
idth
;
const
int
c
=
(
index
/
bottom_height
/
bottom_width
)
%
c
hannels
;
const
int
h
=
(
index
/
bottom_width
)
%
bottom_h
eight
;
const
int
w
=
index
%
bottom_w
idth
;
const
T
*
weight
=
weight_data
+
c
*
kernel_h
*
kernel_w
;
const
int
n
=
index
/
outputChannels
/
inputHeight
/
inputW
idth
;
const
int
c
=
(
index
/
inputHeight
/
inputWidth
)
%
outputC
hannels
;
const
int
h
=
(
index
/
inputWidth
)
%
inputH
eight
;
const
int
w
=
index
%
inputW
idth
;
const
T
*
weight
=
weight_data
+
c
*
filterHeight
*
filterWidth
;
T
value
=
0
;
for
(
int
kh
=
0
;
kh
<
kernel_h
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
kernel_w
;
++
kw
)
{
const
int
h_out_s
=
h
+
pad
_h
-
kh
*
dilation_
h
;
const
int
w_out_s
=
w
+
pad
_w
-
kw
*
dilation_
w
;
if
(((
h_out_s
%
stride
_h
)
==
0
)
&&
((
w_out_s
%
stride_w
)
==
0
))
{
const
int
h_out
=
h_out_s
/
stride
_h
;
const
int
w_out
=
w_out_s
/
stride
_w
;
//it affect the effectives
if
((
h_out
>=
0
)
&&
(
h_out
<
top_h
eight
)
&&
(
w_out
>=
0
)
&&
(
w_out
<
top_w
idth
))
{
const
int
offset
=
((
n
*
channels
+
c
)
*
top_h
eight
+
h_out
)
*
top_w
idth
+
w_out
;
for
(
int
kh
=
0
;
kh
<
filterHeight
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
filterWidth
;
++
kw
)
{
const
int
h_out_s
=
h
+
pad
dingH
-
k
h
;
const
int
w_out_s
=
w
+
pad
dingW
-
k
w
;
if
(((
h_out_s
%
stride
H
)
==
0
)
&&
((
w_out_s
%
strideW
)
==
0
))
{
const
int
h_out
=
h_out_s
/
stride
H
;
const
int
w_out
=
w_out_s
/
stride
W
;
// TODO(zhaolong) : the 'if' affect the effectiveness, it needs to optimize
if
((
h_out
>=
0
)
&&
(
h_out
<
outputH
eight
)
&&
(
w_out
>=
0
)
&&
(
w_out
<
outputW
idth
))
{
const
int
offset
=
((
n
*
outputChannels
+
c
)
*
outputH
eight
+
h_out
)
*
outputW
idth
+
w_out
;
value
+=
(
*
weight
)
*
top_diff
[
offset
];
}
}
...
...
@@ -92,32 +114,33 @@ __global__ void ConvolutionDepthwiseBottomBackward(const int nthreads,
}
template
<
class
T
>
__global__
void
ConvolutionDepthwiseWeightBackward
(
const
int
num_i
,
const
int
nthreads
,
const
T
*
const
top_diff
,
const
T
*
const
bottom_data
,
const
int
num
,
const
int
channels
,
const
int
top_height
,
const
int
top_width
,
const
int
bottom_height
,
const
int
bottom_width
,
const
int
kernel_h
,
const
int
kernel_w
,
const
int
stride_h
,
const
int
stride_w
,
const
int
pad_h
,
const
int
pad_w
,
const
int
dilation_h
,
const
int
dilation_w
,
T
*
const
buffer_data
)
{
__global__
void
ConvolutionDepthwiseFilterBackward
(
const
int
num_i
,
const
int
nthreads
,
const
T
*
const
top_diff
,
const
T
*
const
inputData
,
const
int
num
,
const
int
outputChannels
,
const
int
outputHeight
,
const
int
outputWidth
,
const
int
inputHeight
,
const
int
inputWidth
,
const
int
filterHeight
,
const
int
filterWidth
,
const
int
strideH
,
const
int
strideW
,
const
int
paddingH
,
const
int
paddingW
,
T
*
const
buffer_data
)
{
int
index
=
(
blockIdx
.
x
*
gridDim
.
y
+
blockIdx
.
y
)
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
index
<
nthreads
)
{
const
int
h
=
(
index
/
top_width
)
%
top_h
eight
;
const
int
w
=
index
%
top_w
idth
;
const
int
kh
=
(
index
/
kernel_w
/
top_height
/
top_w
idth
)
%
kernel_h
;
const
int
kw
=
(
index
/
top_height
/
top_width
)
%
kernel_w
;
const
int
h_in
=
-
pad
_h
+
h
*
stride_h
+
kh
*
dilation_
h
;
const
int
w_in
=
-
pad
_w
+
w
*
stride_w
+
kw
*
dilation_
w
;
if
((
h_in
>=
0
)
&&
(
h_in
<
bottom_h
eight
)
&&
(
w_in
>=
0
)
&&
(
w_in
<
bottom_w
idth
))
{
const
int
c
=
index
/
kernel_h
/
kernel_w
/
top_height
/
top_w
idth
;
const
int
h
=
(
index
/
outputWidth
)
%
outputH
eight
;
const
int
w
=
index
%
outputW
idth
;
const
int
kh
=
(
index
/
filterWidth
/
outputHeight
/
outputW
idth
)
%
filterHeight
;
const
int
kw
=
(
index
/
outputHeight
/
outputWidth
)
%
filterWidth
;
const
int
h_in
=
-
pad
dingH
+
h
*
strideH
+
k
h
;
const
int
w_in
=
-
pad
dingW
+
w
*
strideW
+
k
w
;
if
((
h_in
>=
0
)
&&
(
h_in
<
inputH
eight
)
&&
(
w_in
>=
0
)
&&
(
w_in
<
inputW
idth
))
{
const
int
c
=
index
/
filterHeight
/
filterWidth
/
outputHeight
/
outputW
idth
;
const
int
n
=
num_i
;
const
int
top_offset
=
((
n
*
channels
+
c
)
*
top_h
eight
+
h
)
*
top_w
idth
+
w
;
const
int
bottom_offset
=
((
n
*
channels
+
c
)
*
bottom_h
eight
+
h_in
)
*
bottom_w
idth
+
w_in
;
buffer_data
[
index
]
=
top_diff
[
top_offset
]
*
bottom_d
ata
[
bottom_offset
];
const
int
top_offset
=
((
n
*
outputChannels
+
c
)
*
outputH
eight
+
h
)
*
outputW
idth
+
w
;
const
int
bottom_offset
=
((
n
*
outputChannels
+
c
)
*
inputH
eight
+
h_in
)
*
inputW
idth
+
w_in
;
buffer_data
[
index
]
=
top_diff
[
top_offset
]
*
inputD
ata
[
bottom_offset
];
}
else
{
buffer_data
[
index
]
=
0
;
}
...
...
@@ -134,6 +157,8 @@ public:
int
outputChannels
,
int
outputHeight
,
int
outputWidth
,
int
inputHeight
,
int
inputWidth
,
int
filterHeight
,
int
filterWidth
,
int
strideH
,
...
...
@@ -148,7 +173,7 @@ public:
dim3
threads
(
1024
,
1
);
dim3
grid
(
blockX
,
blockY
);
ConvolutionDepthwise
Weight
Forward
<
T
>
ConvolutionDepthwiseForward
<
T
>
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
outputSize
,
inputData
,
...
...
@@ -157,6 +182,8 @@ public:
outputChannels
,
outputHeight
,
outputWidth
,
inputHeight
,
inputWidth
,
filterHeight
,
filterWidth
,
strideH
,
...
...
@@ -193,7 +220,7 @@ public:
dim3
threads
(
1024
,
1
);
dim3
grid
(
blockX
,
blockY
);
ConvolutionDepthwise
Bottom
Backward
<
T
>
ConvolutionDepthwise
Input
Backward
<
T
>
// NOLINT_NEXT_LINE(whitespace/operators)
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
inputSize
,
...
...
@@ -244,10 +271,10 @@ public:
dim3
threads
(
1024
,
1
);
dim3
grid
(
blockX
,
blockY
);
ConvolutionDepthwise
Weight
Backward
<
T
>
ConvolutionDepthwise
Filter
Backward
<
T
>
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
i
,
s
ize
,
num_
i
,
colDataS
ize
,
outputGrad
,
inputData
,
batchSize
,
...
...
@@ -264,8 +291,8 @@ public:
paddingW
,
colData
);
GemmFunctor
<
D
evice
,
real
>
gemm
;
int
M
=
s
ize
/
outputHeight
/
outputWidth
;
GemmFunctor
<
D
EVICE_TYPE_GPU
,
real
>
gemm
;
int
M
=
colDataS
ize
/
outputHeight
/
outputWidth
;
int
N
=
1
;
int
K
=
outputHeight
*
outputWidth
;
gemm
(
CblasNoTrans
,
...
...
@@ -273,23 +300,25 @@ public:
M
,
N
,
K
,
1.0
f
,
(
T
)
1.0
,
colData
,
K
,
multiplierData
,
N
,
1.0
f
,
(
T
)
1.0
,
filterGrad
,
N
);
//gemv
}
};
template
class
DepthwiseConvGradInputFunctor
<
DEVICE_TYPE_GPU
,
float
>;
template
class
DepthwiseConvGradInputFunctor
<
DEVICE_TYPE_GPU
,
double
>;
template
class
DepthwiseConvFunctor
<
DEVICE_TYPE_GPU
,
float
>;
template
class
DepthwiseConvFunctor
<
DEVICE_TYPE_GPU
,
double
>;
template
class
DepthwiseConvGradFilterFunctor
<
DEVICE_TYPE_GPU
,
float
>;
template
class
DepthwiseConvGradFilterFunctor
<
DEVICE_TYPE_GPU
,
double
>;
#ifdef PADDLE_TYPE_DOUBLE
using
real
=
double
;
#else
using
real
=
float
;
#endif
template
class
DepthwiseConvGradInputFunctor
<
DEVICE_TYPE_GPU
,
real
>;
template
class
DepthwiseConvFunctor
<
DEVICE_TYPE_GPU
,
real
>;
template
class
DepthwiseConvGradFilterFunctor
<
DEVICE_TYPE_GPU
,
real
>;
}
// namespace paddle
paddle/gserver/layers/ConvBaseLayer.cpp
浏览文件 @
efae51ce
...
...
@@ -21,7 +21,8 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
const
ParameterMap
&
parameterMap
)
{
/* Initialize the basic parent class */
Layer
::
init
(
layerMap
,
parameterMap
);
isDeconv_
=
(
config_
.
type
()
==
"exconv"
||
config_
.
type
()
==
"cudnn_conv"
)
isDeconv_
=
(
config_
.
type
()
==
"exconv"
||
config_
.
type
()
==
"cudnn_conv"
||
config_
.
type
()
==
"depthwise_conv"
)
?
false
:
true
;
...
...
paddle/gserver/layers/DepthwiseConvLayer.cpp
浏览文件 @
efae51ce
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#include "DepthwiseConvLayer.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
#include <iostream>
namespace
paddle
{
...
...
@@ -79,6 +80,7 @@ void DepthwiseConvLayer::forward(PassType passType) {
Layer
::
forward
(
passType
);
size_t
batchSize
=
inputLayers_
[
0
]
->
getOutputValue
()
->
getHeight
();
// std::cout << "outputSize" << getOutputSize() <<std::endl;
resetOutput
(
batchSize
,
getOutputSize
());
// Calculate the shape of the input, output, and filter.
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录