Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
18cd1f25
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
18cd1f25
编写于
6月 04, 2017
作者:
D
dangqingqing
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix bug and Python API.
上级
b783e08e
变更
11
显示空白变更内容
内联
并排
Showing
11 changed file
with
295 addition
and
107 deletion
+295
-107
paddle/function/RowConvOp.cpp
paddle/function/RowConvOp.cpp
+59
-34
paddle/function/RowConvOp.h
paddle/function/RowConvOp.h
+16
-2
paddle/function/RowConvOpGpu.cu
paddle/function/RowConvOpGpu.cu
+64
-49
paddle/function/RowConvOpTest.cpp
paddle/function/RowConvOpTest.cpp
+10
-17
paddle/gserver/layers/RowConvLayer.cpp
paddle/gserver/layers/RowConvLayer.cpp
+1
-1
paddle/gserver/layers/RowConvLayer.h
paddle/gserver/layers/RowConvLayer.h
+1
-3
python/paddle/trainer/config_parser.py
python/paddle/trainer/config_parser.py
+17
-0
python/paddle/trainer_config_helpers/layers.py
python/paddle/trainer_config_helpers/layers.py
+76
-0
python/paddle/trainer_config_helpers/tests/configs/file_list.sh
.../paddle/trainer_config_helpers/tests/configs/file_list.sh
+1
-1
python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
...fig_helpers/tests/configs/protostr/test_row_conv.protostr
+41
-0
python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
...dle/trainer_config_helpers/tests/configs/test_row_conv.py
+9
-0
未找到文件。
paddle/function/RowConvOp.cpp
浏览文件 @
18cd1f25
...
@@ -61,7 +61,7 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
...
@@ -61,7 +61,7 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
size_t
begin
=
starts
[
i
];
size_t
begin
=
starts
[
i
];
size_t
end
=
starts
[
i
+
1
];
size_t
end
=
starts
[
i
+
1
];
size_t
steps
=
end
-
begin
;
size_t
steps
=
end
-
begin
;
for
(
size_t
j
=
0
;
j
<
contextLength
;
++
j
)
{
for
(
size_t
j
=
0
;
j
<
contextLength
&&
(
begin
+
j
)
<
end
;
++
j
)
{
MatrixPtr
x
=
MatrixPtr
x
=
(
const_cast
<
CpuMatrix
&>
(
in
)).
subMatrix
(
begin
+
j
,
steps
-
j
);
(
const_cast
<
CpuMatrix
&>
(
in
)).
subMatrix
(
begin
+
j
,
steps
-
j
);
MatrixPtr
dy
=
MatrixPtr
dy
=
...
@@ -81,7 +81,7 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
...
@@ -81,7 +81,7 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
for
(
size_t
j
=
0
;
j
<
steps
;
++
j
)
{
for
(
size_t
j
=
0
;
j
<
steps
;
++
j
)
{
MatrixPtr
dx
=
inG
.
subMatrix
(
begin
+
j
,
1
);
MatrixPtr
dx
=
inG
.
subMatrix
(
begin
+
j
,
1
);
for
(
size_t
t
=
0
;
t
<
contextLength
;
++
t
)
{
for
(
size_t
t
=
0
;
t
<
contextLength
;
++
t
)
{
if
(
(
int
(
j
)
-
int
(
t
)
)
>=
0
)
{
if
(
int
(
j
-
t
)
>=
0
)
{
MatrixPtr
dy
=
MatrixPtr
dy
=
(
const_cast
<
CpuMatrix
&>
(
outG
)).
subMatrix
(
begin
+
j
-
t
,
1
);
(
const_cast
<
CpuMatrix
&>
(
outG
)).
subMatrix
(
begin
+
j
-
t
,
1
);
MatrixPtr
w
=
(
const_cast
<
CpuMatrix
&>
(
filter
)).
subMatrix
(
t
,
1
);
MatrixPtr
w
=
(
const_cast
<
CpuMatrix
&>
(
filter
)).
subMatrix
(
t
,
1
);
...
@@ -94,8 +94,37 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
...
@@ -94,8 +94,37 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
}
}
/**
/**
* \brief TODO(qingqing)
* \brief The row convolution is also called lookahead convolution. It was first
* introduced in the Deep Speech 2 system. A bidirectional RNN learns a
* representation for a sequence by performing a forward and a backward pass
* through the entire sequence. However, unlike unidirectional RNNs,
* bidirectional RNNs are challenging to deploy in an online and low-latency
* setting. The lookahead convolution incorporates information from future
* subsequences in a computationally efficient manner to improve unidirectional
* recurrent neural networks.
*
*
* The connection of row convolution is different from the 1D sequence
* convolution. Assume that the future context-length is k, that is to say,
* it can get the output at timestep t by using the input feature from t-th
* timestep to (t+k)-th timestep. Assume that the hidden dim of input
* activations is d, the activations r_t for the new layer at time-step t are:
*
*
* r(t,i) = \sum_{j=1}^{k+1} W(i,j) * h(t+j-1, i), for (1 <= i <= d)
*
*
* The weight shape is: (k + 1) x d
* Function Arguments:
*
* \param inputs[0] The input activations.
* \param inputs[1] The filter (or weight) and shape is (k+1) x d.
* \param outputs[0] The output activations.
*
* [1] Dario Amodei, et al. Deep Speech 2: End-to-End Speech Recognition in
* English and Mandarin. https://arxiv.org/abs/1512.02595
*/
*/
template
<
DeviceType
Device
>
template
<
DeviceType
Device
>
...
@@ -128,10 +157,21 @@ public:
...
@@ -128,10 +157,21 @@ public:
RowConv
<
Device
>
(
outMat
,
inMat
,
wMat
,
seqId
);
RowConv
<
Device
>
(
outMat
,
inMat
,
wMat
,
seqId
);
}
}
};
};
/**
/**
* \brief TODO(qingqing)
* \brief The backward of row convolution function. This function calculates
* the gradient w.r.t filter and the gradient w.r.t input activations (or data).
*
*
* Argument in this Function:
* Argument in this Function:
*
* \param inputs[0] The gradient w.r.t output activations.
* \param inputs[1] The input activations.
* \param inputs[2] The filter (or weight) and shape is (k+1) x d.
* \param outputs[0] The gradient w.r.t input activations.
* \param outputs[1] The gradient w.r.t filter.
*
* Abbreviation:
* w.r.t: with respect to.
*/
*/
template
<
DeviceType
Device
>
template
<
DeviceType
Device
>
...
@@ -140,12 +180,27 @@ public:
...
@@ -140,12 +180,27 @@ public:
void
init
(
const
FuncConfig
&
config
)
override
{}
void
init
(
const
FuncConfig
&
config
)
override
{}
void
calc
(
const
BufferArgs
&
inputs
,
const
BufferArgs
&
outputs
)
override
{
void
calc
(
const
BufferArgs
&
inputs
,
const
BufferArgs
&
outputs
)
override
{
// check
CHECK_EQ
(
3UL
,
inputs
.
size
());
CHECK_EQ
(
2UL
,
outputs
.
size
());
CHECK_EQ
(
outputs
[
0
].
getArgType
(),
ADD_TO
);
CHECK_EQ
(
outputs
[
1
].
getArgType
(),
ADD_TO
);
CHECK
(
inputs
[
0
].
isSequenceArg
()
&&
inputs
[
1
].
isSequenceArg
()
&&
outputs
[
0
].
isSequenceArg
())
<<
"SequenceArg required here."
;
const
auto
outGrad
=
dynamic_cast
<
const
SequenceArg
&>
(
inputs
[
0
]);
const
auto
outGrad
=
dynamic_cast
<
const
SequenceArg
&>
(
inputs
[
0
]);
const
auto
in
=
dynamic_cast
<
const
SequenceArg
&>
(
inputs
[
1
]);
const
auto
in
=
dynamic_cast
<
const
SequenceArg
&>
(
inputs
[
1
]);
const
auto
w
=
inputs
[
2
];
const
auto
w
=
inputs
[
2
];
auto
inGrad
=
dynamic_cast
<
const
SequenceArg
&>
(
outputs
[
0
]);
auto
inGrad
=
dynamic_cast
<
const
SequenceArg
&>
(
outputs
[
0
]);
auto
wGrad
=
outputs
[
1
];
auto
wGrad
=
outputs
[
1
];
CHECK_EQ
(
in
.
shape
().
ndims
(),
2UL
);
CHECK_EQ
(
outGrad
.
shape
().
ndims
(),
2UL
);
CHECK_EQ
(
in
.
shape
()[
1
],
outGrad
.
shape
()[
1
]);
CHECK_EQ
(
in
.
shape
()[
0
],
outGrad
.
shape
()[
0
]);
CHECK_EQ
(
wGrad
.
shape
()[
1
],
in
.
shape
()[
1
]);
const
auto
outGMat
=
outGrad
.
matrix
<
Device
>
();
const
auto
outGMat
=
outGrad
.
matrix
<
Device
>
();
const
auto
inMat
=
in
.
matrix
<
Device
>
();
const
auto
inMat
=
in
.
matrix
<
Device
>
();
const
auto
wMat
=
w
.
matrix
<
Device
>
();
const
auto
wMat
=
w
.
matrix
<
Device
>
();
...
@@ -157,37 +212,7 @@ public:
...
@@ -157,37 +212,7 @@ public:
:
typename
Tensor
<
real
,
Device
>::
Matrix
(
nullptr
,
0
,
0
);
:
typename
Tensor
<
real
,
Device
>::
Matrix
(
nullptr
,
0
,
0
);
const
auto
seqId
=
in
.
getSequenceId
().
vector
<
int
,
Device
>
();
const
auto
seqId
=
in
.
getSequenceId
().
vector
<
int
,
Device
>
();
std
::
cout
<<
"in:"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
inMat
.
getHeight
();
++
i
)
{
for
(
int
j
=
0
;
j
<
inMat
.
getWidth
();
++
j
)
{
std
::
cout
<<
outGMat
.
getElement
(
i
,
j
)
<<
" "
;
}
std
::
cout
<<
std
::
endl
;
}
std
::
cout
<<
"w:"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
wMat
.
getHeight
();
++
i
)
{
for
(
int
j
=
0
;
j
<
wMat
.
getWidth
();
++
j
)
{
std
::
cout
<<
wMat
.
getElement
(
i
,
j
)
<<
" "
;
}
std
::
cout
<<
std
::
endl
;
}
std
::
cout
<<
"w:"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
seqId
.
getSize
();
++
i
)
{
std
::
cout
<<
seqId
.
getElement
(
i
)
<<
" "
;
}
std
::
cout
<<
std
::
endl
;
RowConvGrad
<
Device
>
(
outGMat
,
inMat
,
wMat
,
inGMat
,
wGMat
,
seqId
);
RowConvGrad
<
Device
>
(
outGMat
,
inMat
,
wMat
,
inGMat
,
wGMat
,
seqId
);
std
::
cout
<<
std
::
endl
<<
"out:"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
inGMat
.
getHeight
();
++
i
)
{
for
(
int
j
=
0
;
j
<
inGMat
.
getWidth
();
++
j
)
{
std
::
cout
<<
inGMat
.
getElement
(
i
,
j
)
<<
" "
;
}
std
::
cout
<<
std
::
endl
;
}
}
}
};
};
...
...
paddle/function/RowConvOp.h
浏览文件 @
18cd1f25
...
@@ -19,7 +19,14 @@ limitations under the License. */
...
@@ -19,7 +19,14 @@ limitations under the License. */
namespace
paddle
{
namespace
paddle
{
/**
/**
* \brief TODO(qingqing)
* \brief The forward of row convolution.
*
* \param[out] out The output data and shape is h x d. h is the sum of
* time steps of all samples in one mini-batch.
* \param[in] in The input data and shape is h x d.
* \param[in] filter The filter and shape is k x d. The lookahead step
* number plus one equals k.
* \param[in] seq The sequence start positions.
*
*
*/
*/
template
<
DeviceType
DType
>
template
<
DeviceType
DType
>
...
@@ -29,7 +36,14 @@ void RowConv(typename Tensor<real, DType>::Matrix& out,
...
@@ -29,7 +36,14 @@ void RowConv(typename Tensor<real, DType>::Matrix& out,
const
typename
Tensor
<
int
,
DType
>::
Vector
&
seq
);
const
typename
Tensor
<
int
,
DType
>::
Vector
&
seq
);
/**
/**
* \brief TODO(qingqing)
* \brief The backward of row convolution.
*
* \param[in] outG The gradient w.r.t output data.
* \param[in] in The input data.
* \param[in] filter The filter.
* \param[out] inG The gradient w.r.t input data.
* \param[out] filterG The gradient w.r.t filter.
* \param[in] seq The sequence start positions.
*
*
*/
*/
template
<
DeviceType
DType
>
template
<
DeviceType
DType
>
...
...
paddle/function/RowConvOpGpu.cu
浏览文件 @
18cd1f25
...
@@ -96,11 +96,6 @@ void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
...
@@ -96,11 +96,6 @@ void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
const
size_t
height
=
in
.
getHeight
();
const
size_t
height
=
in
.
getHeight
();
const
size_t
width
=
in
.
getWidth
();
const
size_t
width
=
in
.
getWidth
();
LOG
(
INFO
)
<<
numSeq
;
LOG
(
INFO
)
<<
contextLength
;
LOG
(
INFO
)
<<
height
;
LOG
(
INFO
)
<<
width
;
real
*
y
=
out
.
getData
();
real
*
y
=
out
.
getData
();
const
real
*
x
=
in
.
getData
();
const
real
*
x
=
in
.
getData
();
const
real
*
w
=
filter
.
getData
();
const
real
*
w
=
filter
.
getData
();
...
@@ -108,7 +103,6 @@ void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
...
@@ -108,7 +103,6 @@ void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
dim3
dimBlock
(
32
,
32
);
dim3
dimBlock
(
32
,
32
);
dim3
dimGrid
(
DIVUP
(
width
,
dimBlock
.
x
),
1
);
dim3
dimGrid
(
DIVUP
(
width
,
dimBlock
.
x
),
1
);
LOG
(
INFO
)
<<
dimGrid
.
x
;
if
(
contextLength
<=
32
)
{
if
(
contextLength
<=
32
)
{
KeRowConv
<
32
,
32
><<<
dimGrid
,
dimBlock
,
0
,
STREAM_DEFAULT
>>>
KeRowConv
<
32
,
32
><<<
dimGrid
,
dimBlock
,
0
,
STREAM_DEFAULT
>>>
...
@@ -131,12 +125,12 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
...
@@ -131,12 +125,12 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
const
int
blky
=
blockDim
.
y
;
const
int
blky
=
blockDim
.
y
;
const
int
gidx
=
blockIdx
.
x
*
blockDim
.
x
;
const
int
gidx
=
blockIdx
.
x
*
blockDim
.
x
;
__shared__
real
sh_x
[
BLOCK_
H
][
BLOCK_W
];
__shared__
real
sh_x
[
BLOCK_
W
][
BLOCK_H
];
__shared__
real
sh_dy
[
BLOCK_
H
][
BLOCK_W
];
__shared__
real
sh_dy
[
BLOCK_
W
][
BLOCK_H
+
CONTEXT
-
1
];
__shared__
real
sh_dw
[
CONTEXT
][
BLOCK_W
];
__shared__
real
sh_dw
[
CONTEXT
][
BLOCK_W
];
for
(
int
t
=
tidy
;
t
<
context
;
t
+=
blky
)
{
if
(
tidy
<
context
)
{
sh_dw
[
t
][
tidx
]
=
0.0
;
sh_dw
[
t
idy
][
tidx
]
=
0.0
;
}
}
__syncthreads
();
__syncthreads
();
...
@@ -144,21 +138,31 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
...
@@ -144,21 +138,31 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
const
int
start
=
starts
[
i
];
const
int
start
=
starts
[
i
];
const
int
end
=
starts
[
i
+
1
];
const
int
end
=
starts
[
i
+
1
];
const
int
steps
=
end
-
start
;
const
int
steps
=
end
-
start
;
for
(
int
j
=
tidy
;
j
<
steps
;
j
+=
BLOCK_H
)
{
const
int
size
=
((
steps
+
BLOCK_H
-
1
)
/
BLOCK_H
)
*
BLOCK_H
;
for
(
int
j
=
tidy
;
j
<
size
;
j
+=
BLOCK_H
)
{
int
xoff
=
gidx
+
tidx
;
int
xoff
=
gidx
+
tidx
;
int
yoff
=
start
+
j
;
int
yoff
=
start
+
j
;
// transpose
// transpose
sh_x
[
tidx
][
tidy
]
=
xoff
<
width
&&
yoff
<
end
?
x
[
yoff
*
width
+
xoff
]
:
0.0
;
sh_x
[
tidx
][
tidy
]
=
(
xoff
<
width
&&
yoff
<
end
)
?
x
[
yoff
*
width
+
xoff
]
:
0.0
;
sh_dy
[
tidx
][
tidy
]
=
xoff
<
width
&&
yoff
<
end
?
dy
[
yoff
*
width
+
xoff
]
:
0.0
;
sh_dy
[
tidx
][
tidy
+
context
-
1
]
=
(
xoff
<
width
&&
yoff
<
end
)
?
dy
[
yoff
*
width
+
xoff
]
:
0.0
;
__syncthreads
();
if
(
tidy
<
(
context
-
1
))
{
yoff
=
yoff
-
context
+
1
;
sh_dy
[
tidx
][
tidy
]
=
(
xoff
<
width
&&
yoff
>=
start
)
?
dy
[
yoff
*
width
+
xoff
]
:
0.0
;
}
__syncthreads
();
__syncthreads
();
for
(
int
t
=
0
;
t
<
context
;
t
++
)
{
for
(
int
t
=
0
;
t
<
context
;
t
++
)
{
real
val
=
tidx
+
t
<
blockDim
.
x
?
sh_x
[
tidy
][
tidx
+
t
]
*
sh_dy
[
tidy
][
tidx
]
:
0.0
;
real
val
=
sh_x
[
tidy
][
tidx
]
*
sh_dy
[
tidy
][
tidx
+
context
-
1
-
t
];
__syncthreads
();
// warp size and blockDim.x is 32.
// warp size and blockDim.x is 32.
for
(
int
offset
=
16
;
offset
>
0
;
offset
/=
2
)
{
val
+=
__shfl_down
(
val
,
16
);
val
+=
__shfl_down
(
val
,
offset
);
val
+=
__shfl_down
(
val
,
8
);
}
val
+=
__shfl_down
(
val
,
4
);
val
+=
__shfl_down
(
val
,
2
);
val
+=
__shfl_down
(
val
,
1
);
__syncthreads
();
if
(
tidx
==
0
)
{
if
(
tidx
==
0
)
{
sh_dw
[
t
][
tidy
]
+=
val
;
sh_dw
[
t
][
tidy
]
+=
val
;
}
}
...
@@ -167,7 +171,7 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
...
@@ -167,7 +171,7 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
}
}
}
}
for
(
int
t
=
tidy
;
t
<
context
&&
(
gidx
+
tidx
)
<
width
;
t
+=
blky
)
{
for
(
int
t
=
tidy
;
(
t
<
context
)
&&
((
gidx
+
tidx
)
<
width
)
;
t
+=
blky
)
{
dw
[
t
*
width
+
gidx
+
tidx
]
+=
sh_dw
[
t
][
tidx
];
dw
[
t
*
width
+
gidx
+
tidx
]
+=
sh_dw
[
t
][
tidx
];
}
}
}
}
...
@@ -188,21 +192,30 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
...
@@ -188,21 +192,30 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
const
int
start
=
starts
[
i
];
const
int
start
=
starts
[
i
];
const
int
end
=
starts
[
i
+
1
];
const
int
end
=
starts
[
i
+
1
];
const
int
steps
=
end
-
start
;
const
int
steps
=
end
-
start
;
for
(
int
j
=
0
;
j
<
steps
;
j
+=
BLOCK_H
)
{
const
int
size
=
((
steps
+
BLOCK_H
-
1
)
/
BLOCK_H
)
*
BLOCK_H
;
for
(
int
j
=
tidy
;
j
<
size
;
j
+=
BLOCK_H
)
{
int
xoff
=
gidx
+
tidx
;
int
xoff
=
gidx
+
tidx
;
int
yoff
=
start
+
j
;
int
yoff
=
start
+
j
;
// transpose
// transpose
sh_x
[
tidx
][
tidy
]
=
xoff
<
width
&&
yoff
<
end
?
x
[
yoff
*
width
+
xoff
]
:
0.0
;
sh_x
[
tidx
][
tidy
]
=
(
xoff
<
width
&&
yoff
<
end
)
?
x
[
yoff
*
width
+
xoff
]
:
0.0
;
sh_dy
[
tidx
][
tidy
]
=
xoff
<
width
&&
yoff
<
end
?
dy
[
yoff
*
width
+
xoff
]
:
0.0
;
__syncthreads
();
__syncthreads
();
for
(
int
t
=
0
;
t
<
context
;
t
++
)
{
for
(
int
t
=
0
;
t
<
context
;
t
++
)
{
real
val
=
tidx
+
t
<
blockDim
.
x
?
sh_x
[
tidy
][
tidx
+
t
]
*
sh_dy
[
tidy
][
tidx
]
:
0.0
;
sh_dy
[
tidx
][
tidy
]
=
(
xoff
<
width
&&
(
yoff
-
t
)
>=
start
&&
yoff
-
t
<
end
)
?
dy
[(
yoff
-
t
)
*
width
+
xoff
]
:
0.0
;
__syncthreads
();
real
val
=
sh_x
[
tidy
][
tidx
]
*
sh_dy
[
tidy
][
tidx
];
__syncthreads
();
// warp size and blockDim.x is 32.
// warp size and blockDim.x is 32.
for
(
int
offset
=
16
;
offset
>
0
;
offset
/=
2
)
{
val
+=
__shfl_down
(
val
,
16
);
val
+=
__shfl_down
(
val
,
offset
);
val
+=
__shfl_down
(
val
,
8
);
}
val
+=
__shfl_down
(
val
,
4
);
val
+=
__shfl_down
(
val
,
2
);
val
+=
__shfl_down
(
val
,
1
);
__syncthreads
();
if
(
tidx
==
0
&&
(
gidx
+
tidy
)
<
width
)
{
if
(
tidx
==
0
&&
(
gidx
+
tidy
)
<
width
)
{
dw
[
t
*
width
+
gidx
+
tidy
]
+=
val
;
dw
[
t
*
width
+
gidx
+
tidy
]
+=
val
;
}
}
...
@@ -293,13 +306,12 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
...
@@ -293,13 +306,12 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
const
real
*
dy
=
outG
.
getData
();
const
real
*
dy
=
outG
.
getData
();
const
real
*
x
=
in
.
getData
();
const
real
*
x
=
in
.
getData
();
const
real
*
w
=
filter
.
getData
();
const
real
*
w
=
filter
.
getData
();
real
*
dx
=
inG
.
getData
();
real
*
dw
=
filterG
.
getData
();
const
int
*
starts
=
seq
.
getData
();
const
int
*
starts
=
seq
.
getData
();
if
(
filterG
)
{
dim3
dimBlock
(
32
,
32
);
dim3
dimBlock
(
32
,
32
);
dim3
dimGrid
(
DIVUP
(
width
,
dimBlock
.
x
),
1
);
dim3
dimGrid
(
DIVUP
(
width
,
dimBlock
.
x
),
1
);
real
*
dw
=
filterG
.
getData
();
if
(
contextLength
<=
16
)
{
if
(
contextLength
<=
16
)
{
KeRowConvBwWeight
<
32
,
32
,
16
>
KeRowConvBwWeight
<
32
,
32
,
16
>
<<<
dimGrid
,
dimBlock
,
0
,
STREAM_DEFAULT
>>>
<<<
dimGrid
,
dimBlock
,
0
,
STREAM_DEFAULT
>>>
...
@@ -309,8 +321,10 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
...
@@ -309,8 +321,10 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
<<<
dimGrid
,
dimBlock
,
0
,
STREAM_DEFAULT
>>>
<<<
dimGrid
,
dimBlock
,
0
,
STREAM_DEFAULT
>>>
(
dw
,
x
,
dy
,
starts
,
height
,
width
,
numSeq
,
contextLength
);
(
dw
,
x
,
dy
,
starts
,
height
,
width
,
numSeq
,
contextLength
);
}
}
}
if
(
inG
)
{
real
*
dx
=
inG
.
getData
();
dim3
dimBlock2
(
32
,
32
);
dim3
dimBlock2
(
32
,
32
);
dim3
dimGrid2
(
DIVUP
(
width
,
dimBlock2
.
x
),
1
);
dim3
dimGrid2
(
DIVUP
(
width
,
dimBlock2
.
x
),
1
);
if
(
contextLength
<=
64
)
{
if
(
contextLength
<=
64
)
{
...
@@ -322,6 +336,7 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
...
@@ -322,6 +336,7 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
<<<
dimGrid2
,
dimBlock2
,
0
,
STREAM_DEFAULT
>>>
<<<
dimGrid2
,
dimBlock2
,
0
,
STREAM_DEFAULT
>>>
(
dx
,
w
,
dy
,
starts
,
height
,
width
,
numSeq
,
contextLength
);
(
dx
,
w
,
dy
,
starts
,
height
,
width
,
numSeq
,
contextLength
);
}
}
}
CHECK_SYNC
(
"RowConvGrad"
);
CHECK_SYNC
(
"RowConvGrad"
);
}
}
...
...
paddle/function/RowConvOpTest.cpp
浏览文件 @
18cd1f25
...
@@ -47,23 +47,16 @@ void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) {
...
@@ -47,23 +47,16 @@ void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) {
}
}
TEST
(
RowConv
,
real
)
{
TEST
(
RowConv
,
real
)
{
// for (size_t numSamples : {17, 129}) {
for
(
size_t
numSamples
:
{
17
,
129
,
2020
})
{
// for (size_t dim : {16, 248}) {
for
(
size_t
dim
:
{
16
,
512
,
2560
})
{
// for (size_t context: {3, 7, 65}) {
for
(
size_t
context
:
{
3
,
19
,
65
})
{
LOG
(
INFO
)
<<
"==========="
;
VLOG
(
3
)
<<
" numSamples="
<<
numSamples
<<
" dim="
<<
dim
// for (size_t numSamples : {17}) {
// for (size_t dim : {16}) {
// for (size_t context: {3}) {
size_t
numSamples
=
17
;
size_t
dim
=
16
;
size_t
context
=
3
;
LOG
(
INFO
)
<<
" numSamples="
<<
numSamples
<<
" dim="
<<
dim
<<
" context length="
<<
context
;
<<
" context length="
<<
context
;
testRowConvFw
(
numSamples
,
dim
,
context
);
testRowConvFw
(
numSamples
,
dim
,
context
);
//
testRowConvBw(numSamples, dim, context);
testRowConvBw
(
numSamples
,
dim
,
context
);
//
}
}
//
}
}
//
}
}
}
}
}
// namespace paddle
}
// namespace paddle
paddle/gserver/layers/RowConvLayer.cpp
浏览文件 @
18cd1f25
...
@@ -75,7 +75,7 @@ void RowConvLayer::backward(const UpdateCallback& callback) {
...
@@ -75,7 +75,7 @@ void RowConvLayer::backward(const UpdateCallback& callback) {
BufferArgs
outputs
;
BufferArgs
outputs
;
inputs
.
addArg
(
*
getOutputGrad
(),
*
startPos
);
inputs
.
addArg
(
*
getOutputGrad
(),
*
startPos
);
inputs
.
addArg
(
*
getInputValue
(
0
),
*
startPos
);
inputs
.
addArg
(
*
getInputValue
(
0
),
*
startPos
);
inputs
.
addArg
(
*
weight_
->
getW
(),
*
startPos
);
inputs
.
addArg
(
*
weight_
->
getW
(),
wDims_
);
MatrixPtr
inGrad
=
getInputGrad
(
0
);
MatrixPtr
inGrad
=
getInputGrad
(
0
);
MatrixPtr
wGrad
=
weight_
->
getWGrad
();
MatrixPtr
wGrad
=
weight_
->
getWGrad
();
...
...
paddle/gserver/layers/RowConvLayer.h
浏览文件 @
18cd1f25
...
@@ -37,9 +37,7 @@ protected:
...
@@ -37,9 +37,7 @@ protected:
// fan_out is the size of output feature.
// fan_out is the size of output feature.
std
::
unique_ptr
<
Weight
>
weight_
;
std
::
unique_ptr
<
Weight
>
weight_
;
// std::unique_ptr<Weight> biases_;
// The step number to look ahead plus one equals contexLength_.
// how many steps to look ahead
size_t
contexLength_
;
size_t
contexLength_
;
TensorShape
wDims_
;
TensorShape
wDims_
;
};
};
...
...
python/paddle/trainer/config_parser.py
浏览文件 @
18cd1f25
...
@@ -2081,6 +2081,23 @@ class MaxOutLayer(LayerBase):
...
@@ -2081,6 +2081,23 @@ class MaxOutLayer(LayerBase):
g_layer_map
[
input_layer
.
name
].
width
,
out_channels
)
g_layer_map
[
input_layer
.
name
].
width
,
out_channels
)
@config_layer('row_conv')
class RowConvLayer(LayerBase):
    """
    Config-parser class registered under the layer type 'row_conv'.

    It accepts exactly one input, stores ``context_length`` in that input's
    ``row_conv_conf``, sets the layer size to the input layer's size and
    creates one input parameter with dims ``[context_length, input size]``.

    :param name: Name of the layer.
    :param inputs: Inputs of the layer; exactly one is required.
    :param context_length: Number of rows of the filter parameter.
    :param xargs: Extra keyword arguments forwarded to LayerBase.
    """

    def __init__(self, name, inputs, context_length, **xargs):
        # NOTE(review): the type string passed to LayerBase is 'maxout' even
        # though this class is registered as 'row_conv'. It looks like a
        # copy-paste from MaxOutLayer. The expected protostr of test_row_conv
        # records type "maxout" as well, so the two have to be changed
        # together -- confirm and fix in one commit.
        super(RowConvLayer, self).__init__(
            name, 'maxout', 0, inputs=inputs, **xargs)
        config_assert(
            len(self.inputs) == 1,
            'RowConvLayer must have one and only one input')
        input_layer = self.get_input_layer(0)

        # Record the context length on the (single) input's config.
        row_conv_conf = self.config.inputs[0].row_conv_conf
        row_conv_conf.context_length = context_length

        # The output has the same size as the input.
        self.set_layer_size(input_layer.size)

        # One filter row per context step, one column per input unit.
        psize = context_length * input_layer.size
        dims = [context_length, input_layer.size]
        self.create_input_parameter(0, psize, dims)
# key: cost type
# key: cost type
# value: cost class
# value: cost class
g_cost_map
=
{}
g_cost_map
=
{}
...
...
python/paddle/trainer_config_helpers/layers.py
浏览文件 @
18cd1f25
...
@@ -120,6 +120,7 @@ __all__ = [
...
@@ -120,6 +120,7 @@ __all__ = [
'smooth_l1_cost'
,
'smooth_l1_cost'
,
'layer_support'
,
'layer_support'
,
'multiplex_layer'
,
'multiplex_layer'
,
'row_conv_layer'
,
]
]
...
@@ -187,6 +188,7 @@ class LayerType(object):
...
@@ -187,6 +188,7 @@ class LayerType(object):
SPP_LAYER
=
"spp"
SPP_LAYER
=
"spp"
PAD_LAYER
=
"pad"
PAD_LAYER
=
"pad"
MULTIPLEX_LAYER
=
"multiplex"
MULTIPLEX_LAYER
=
"multiplex"
ROW_CONV_LAYER
=
"row_conv"
PRINT_LAYER
=
"print"
PRINT_LAYER
=
"print"
PRIORBOX_LAYER
=
"priorbox"
PRIORBOX_LAYER
=
"priorbox"
...
@@ -5528,3 +5530,77 @@ def multiplex_layer(input, name=None, layer_attr=None):
...
@@ -5528,3 +5530,77 @@ def multiplex_layer(input, name=None, layer_attr=None):
layer_type
=
LayerType
.
MULTIPLEX_LAYER
,
layer_type
=
LayerType
.
MULTIPLEX_LAYER
,
parents
=
input
,
parents
=
input
,
size
=
l
.
config
.
size
)
size
=
l
.
config
.
size
)
@wrap_name_default()
@wrap_act_default(act=LinearActivation())
@wrap_param_attr_default()
@layer_support(DROPOUT)
def row_conv_layer(input,
                   context_len,
                   act=None,
                   name=None,
                   param_attr=None,
                   layer_attr=None):
    r"""
    The row convolution is also called lookahead convolution. It was first
    introduced in the paper `Deep Speech 2: End-to-End Speech Recognition
    in English and Mandarin <https://arxiv.org/pdf/1512.02595v1.pdf>`_ .

    A bidirectional RNN learns a representation for a sequence by
    performing a forward and a backward pass through the entire sequence.
    However, unlike unidirectional RNNs, bidirectional RNNs are challenging
    to deploy in an online and low-latency setting. The lookahead convolution
    incorporates information from future subsequences in a computationally
    efficient manner to improve unidirectional recurrent neural networks.

    The connection of row convolution is different from the 1D sequence
    convolution. Assume that the future context-length is k, that is to say,
    it can get the output at timestep t by using the input feature from t-th
    timestep to (t+k)-th timestep. Assume that the hidden dim of input
    activations is d, the activations r_t for the new layer at time-step t are:

    .. math::

        r_{t,i} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}}
                  \quad \text{for} \quad  (1 \leq i \leq d)

    Note:
        The `context_len` is `k + 1`. That is to say, the lookahead step
        number plus one equals context_len.


    .. code-block:: python

       row_conv = row_conv_layer(input=input_layer, context_len=3)


    :param input: The input layer.
    :type input: LayerOutput
    :param context_len: The context length equals the lookahead step number
                        plus one.
    :type context_len: int
    :param act: Activation Type. Default is linear activation.
    :type act: BaseActivation
    :param name: The name of this layer. A default name is supplied by the
                 wrap_name_default decorator when it is omitted.
    :type name: basestring
    :param param_attr: The Parameter Attribute. If None, the parameter will be
                       initialized smartly. It's better to set it by yourself.
    :type param_attr: ParameterAttribute
    :param layer_attr: Extra Layer config.
    :type layer_attr: ExtraLayerAttribute|None
    :return: LayerOutput object.
    :rtype: LayerOutput

    """
    assert isinstance(input, LayerOutput)
    assert context_len > 0, "the context_len must be greater than 0."

    # Emit the layer config; the filter parameter is attached to the input.
    Layer(
        inputs=[Input(input.name, **param_attr.attr)],
        name=name,
        context_length=context_len,
        type=LayerType.ROW_CONV_LAYER,
        active_type=act.name,
        **ExtraLayerAttribute.to_kwargs(layer_attr))
    # The output keeps the size of the input layer.
    return LayerOutput(
        name, LayerType.ROW_CONV_LAYER, input, activation=act, size=input.size)
python/paddle/trainer_config_helpers/tests/configs/file_list.sh
浏览文件 @
18cd1f25
...
@@ -5,6 +5,6 @@ last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
...
@@ -5,6 +5,6 @@ last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
)
test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
test_row_conv
)
export
whole_configs
=(
test_split_datasource
)
export
whole_configs
=(
test_split_datasource
)
python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
0 → 100644
浏览文件 @
18cd1f25
type: "nn"
layers {
name: "data"
type: "data"
size: 2560
active_type: ""
}
layers {
name: "__row_conv_layer_0__"
type: "maxout"
size: 2560
active_type: "relu"
inputs {
input_layer_name: "data"
input_parameter_name: "___row_conv_layer_0__.w0"
row_conv_conf {
context_length: 19
}
}
}
parameters {
name: "___row_conv_layer_0__.w0"
size: 48640
initial_mean: 0.0
initial_std: 0.229415733871
dims: 19
dims: 2560
initial_strategy: 0
initial_smart: true
}
input_layer_names: "data"
output_layer_names: "__row_conv_layer_0__"
sub_models {
name: "root"
layer_names: "data"
layer_names: "__row_conv_layer_0__"
input_layer_names: "data"
output_layer_names: "__row_conv_layer_0__"
is_recurrent_layer_group: false
}
python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
0 → 100644
浏览文件 @
18cd1f25
# Trainer config used to generate the expected protostr for row_conv_layer.
from paddle.trainer_config_helpers import *

settings(learning_rate=1e-5, batch_size=1000)

# Input layer named 'data' with 2560 units.
features = data_layer(size=2560, name='data')

# Row convolution with a context length of 19 and a ReLU activation.
lookahead = row_conv_layer(
    act=ReluActivation(), context_len=19, input=features)

outputs(lookahead)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录