PaddlePaddle / Paddle
Commit 18cd1f25
Authored Jun 04, 2017 by dangqingqing

Fix bug and Python API.

Parent commit: b783e08e
Showing 11 changed files with 295 additions and 107 deletions (+295 −107)
paddle/function/RowConvOp.cpp                                                          +59 −34
paddle/function/RowConvOp.h                                                            +16 −2
paddle/function/RowConvOpGpu.cu                                                        +64 −49
paddle/function/RowConvOpTest.cpp                                                      +10 −17
paddle/gserver/layers/RowConvLayer.cpp                                                 +1 −1
paddle/gserver/layers/RowConvLayer.h                                                   +1 −3
python/paddle/trainer/config_parser.py                                                 +17 −0
python/paddle/trainer_config_helpers/layers.py                                         +76 −0
python/paddle/trainer_config_helpers/tests/configs/file_list.sh                        +1 −1
python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr     +41 −0
python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py                    +9 −0
paddle/function/RowConvOp.cpp

@@ -61,7 +61,7 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
     size_t begin = starts[i];
     size_t end = starts[i + 1];
     size_t steps = end - begin;
-    for (size_t j = 0; j < contextLength; ++j) {
+    for (size_t j = 0; j < contextLength && (begin + j) < end; ++j) {
       MatrixPtr x =
           (const_cast<CpuMatrix&>(in)).subMatrix(begin + j, steps - j);
       MatrixPtr dy =
@@ -81,7 +81,7 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
     for (size_t j = 0; j < steps; ++j) {
       MatrixPtr dx = inG.subMatrix(begin + j, 1);
       for (size_t t = 0; t < contextLength; ++t) {
-        if ((int(j) - int(t)) >= 0) {
+        if (int(j - t) >= 0) {
           MatrixPtr dy =
               (const_cast<CpuMatrix&>(outG)).subMatrix(begin + j - t, 1);
           MatrixPtr w = (const_cast<CpuMatrix&>(filter)).subMatrix(t, 1);
@@ -94,8 +94,37 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
   }
 }

 /**
- * \brief TODO(qingqing)
+ * \brief The row convolution is called lookahead convolution. It was first
+ * introduced in the DeepSpeech2 system. A bidirectional RNN learns a
+ * representation for a sequence by performing a forward and a backward pass
+ * through the entire sequence. However, unlike unidirectional RNNs,
+ * bidirectional RNNs are challenging to deploy in an online, low-latency
+ * setting. The lookahead convolution incorporates information from future
+ * subsequences in a computationally efficient manner to improve unidirectional
+ * recurrent neural networks.
+ *
+ * The connection pattern of row convolution differs from 1D sequence
+ * convolution. Assume the future context length is k, i.e. the output at
+ * timestep t is computed from the input features of timesteps t through
+ * (t + k). Assume the hidden dimension of the input activations is d; the
+ * activations r(t, i) of the new layer at timestep t are:
+ *
+ *             -- k + 1
+ *  r(t, i) =  >        W(i, j) * h(t + j - 1, i),   for (1 <= i <= d)
+ *             -- j = 1
+ *
+ * The weight shape is (k + 1) x d.
+ *
+ * Function Arguments:
+ *
+ * \param inputs[0]  The input activations.
+ * \param inputs[1]  The filter (or weight), with shape (k + 1) x d.
+ * \param outputs[0] The output activations.
+ *
+ * [1] Dario Amodei, et al. Deep Speech 2: End-to-End Speech Recognition in
+ *     English and Mandarin. https://arxiv.org/abs/1512.02595
  */

 template <DeviceType Device>
@@ -128,10 +157,21 @@ public:
     RowConv<Device>(outMat, inMat, wMat, seqId);
   }
 };

 /**
- * \brief TODO(qingqing)
+ * \brief The backward of the row convolution function. This function computes
+ * the gradient w.r.t. the filter and the gradient w.r.t. the input
+ * activations (or data).
  *
  * Argument in this Function:
  *
+ * \param inputs[0]  The gradient w.r.t. the output activations.
+ * \param inputs[1]  The input activations.
+ * \param inputs[2]  The filter (or weight), with shape (k + 1) x d.
+ * \param outputs[0] The gradient w.r.t. the input activations.
+ * \param outputs[1] The gradient w.r.t. the filter.
+ *
+ * Abbreviation:
+ *     w.r.t.: with respect to.
  */

 template <DeviceType Device>
@@ -140,12 +180,27 @@ public:
   void init(const FuncConfig& config) override {}

   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    // check
+    CHECK_EQ(3UL, inputs.size());
+    CHECK_EQ(2UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+    CHECK(inputs[0].isSequenceArg() && inputs[1].isSequenceArg() &&
+          outputs[0].isSequenceArg())
+        << "SequenceArg required here.";
+
     const auto outGrad = dynamic_cast<const SequenceArg&>(inputs[0]);
     const auto in = dynamic_cast<const SequenceArg&>(inputs[1]);
     const auto w = inputs[2];
     auto inGrad = dynamic_cast<const SequenceArg&>(outputs[0]);
     auto wGrad = outputs[1];

+    CHECK_EQ(in.shape().ndims(), 2UL);
+    CHECK_EQ(outGrad.shape().ndims(), 2UL);
+    CHECK_EQ(in.shape()[1], outGrad.shape()[1]);
+    CHECK_EQ(in.shape()[0], outGrad.shape()[0]);
+    CHECK_EQ(wGrad.shape()[1], in.shape()[1]);
+
     const auto outGMat = outGrad.matrix<Device>();
     const auto inMat = in.matrix<Device>();
     const auto wMat = w.matrix<Device>();
@@ -157,37 +212,7 @@ public:
             : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);

     const auto seqId = in.getSequenceId().vector<int, Device>();
-    std::cout << "in:" << std::endl;
-    for (int i = 0; i < inMat.getHeight(); ++i) {
-      for (int j = 0; j < inMat.getWidth(); ++j) {
-        std::cout << outGMat.getElement(i, j) << " ";
-      }
-      std::cout << std::endl;
-    }
-    std::cout << "w:" << std::endl;
-    for (int i = 0; i < wMat.getHeight(); ++i) {
-      for (int j = 0; j < wMat.getWidth(); ++j) {
-        std::cout << wMat.getElement(i, j) << " ";
-      }
-      std::cout << std::endl;
-    }
-    std::cout << "w:" << std::endl;
-    for (int i = 0; i < seqId.getSize(); ++i) {
-      std::cout << seqId.getElement(i) << " ";
-    }
-    std::cout << std::endl;
-
     RowConvGrad<Device>(outGMat, inMat, wMat, inGMat, wGMat, seqId);
-    std::cout << std::endl << "out:" << std::endl;
-    for (int i = 0; i < inGMat.getHeight(); ++i) {
-      for (int j = 0; j < inGMat.getWidth(); ++j) {
-        std::cout << inGMat.getElement(i, j) << " ";
-      }
-      std::cout << std::endl;
-    }
   }
 };
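To make the forward rule in the new doc comment concrete, here is a minimal NumPy sketch of the same computation (a hypothetical reference implementation written for this page, not code from the commit). It assumes the h x d input layout and the sequence start offsets described above:

    import numpy as np

    def row_conv_forward(x, w, starts):
        # x: (h, d) input activations, h = total time steps over all sequences
        # w: (k + 1, d) filter, k = number of lookahead steps
        # starts: sequence start offsets, e.g. [0, len0, len0 + len1, ...]
        out = np.zeros_like(x)
        context = w.shape[0]
        for begin, end in zip(starts[:-1], starts[1:]):
            for t in range(begin, end):
                for j in range(context):
                    if t + j < end:                # stay inside the sequence,
                        out[t] += w[j] * x[t + j]  # as the fixed CPU loop does
        return out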
paddle/function/RowConvOp.h

@@ -19,7 +19,14 @@ limitations under the License. */
 namespace paddle {

 /**
- * \brief TODO(qingqing)
+ * \brief The forward of row convolution.
+ *
+ * \param[out] out     The output data, with shape h x d. h is the sum of the
+ *                     time steps of all samples in one mini-batch.
+ * \param[in]  in      The input data, with shape h x d.
+ * \param[in]  filter  The filter, with shape k x d; the number of lookahead
+ *                     steps plus one equals k.
+ * \param[in]  seq     The sequence start positions.
  *
  */
 template <DeviceType DType>
@@ -29,7 +36,14 @@ void RowConv(typename Tensor<real, DType>::Matrix& out,
              const typename Tensor<int, DType>::Vector& seq);

 /**
- * \brief TODO(qingqing)
+ * \brief The backward of row convolution.
+ *
+ * \param[in]  outG     The gradient w.r.t. the output data.
+ * \param[in]  in       The input data.
+ * \param[in]  filter   The filter.
+ * \param[out] inG      The gradient w.r.t. the input data.
+ * \param[out] filterG  The gradient w.r.t. the filter.
+ * \param[in]  seq      The sequence start positions.
  *
  */
 template <DeviceType DType>
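The backward signature above pairs naturally with the forward sketch given earlier. Below is a hypothetical NumPy transcription of those gradients (same assumed layout; illustration only, not the registered function):

    import numpy as np

    def row_conv_backward(dy, x, w, starts):
        # dy: (h, d) gradient w.r.t. the output
        # x:  (h, d) input activations
        # w:  (k + 1, d) filter
        # returns (dx, dw), the gradients w.r.t. the input and the filter
        dx = np.zeros_like(x)
        dw = np.zeros_like(w)
        context = w.shape[0]
        for begin, end in zip(starts[:-1], starts[1:]):
            for t in range(begin, end):
                for j in range(context):
                    if t + j < end:
                        # forward: out[t] += w[j] * x[t + j], hence:
                        dw[j] += dy[t] * x[t + j]
                        dx[t + j] += dy[t] * w[j]
        return dx, dw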
paddle/function/RowConvOpGpu.cu

@@ -96,11 +96,6 @@ void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
   const size_t height = in.getHeight();
   const size_t width = in.getWidth();
-  LOG(INFO) << numSeq;
-  LOG(INFO) << contextLength;
-  LOG(INFO) << height;
-  LOG(INFO) << width;
   real* y = out.getData();
   const real* x = in.getData();
   const real* w = filter.getData();
@@ -108,7 +103,6 @@ void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
   dim3 dimBlock(32, 32);
   dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
-  LOG(INFO) << dimGrid.x;
   if (contextLength <= 32) {
     KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
@@ -131,12 +125,12 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
   const int blky = blockDim.y;
   const int gidx = blockIdx.x * blockDim.x;

-  __shared__ real sh_x[BLOCK_H][BLOCK_W];
-  __shared__ real sh_dy[BLOCK_H][BLOCK_W];
+  __shared__ real sh_x[BLOCK_W][BLOCK_H];
+  __shared__ real sh_dy[BLOCK_W][BLOCK_H + CONTEXT - 1];
   __shared__ real sh_dw[CONTEXT][BLOCK_W];

-  for (int t = tidy; t < context; t += blky) {
-    sh_dw[t][tidx] = 0.0;
+  if (tidy < context) {
+    sh_dw[tidy][tidx] = 0.0;
   }
   __syncthreads();
@@ -144,21 +138,31 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
     const int start = starts[i];
     const int end = starts[i + 1];
     const int steps = end - start;
-    for (int j = tidy; j < steps; j += BLOCK_H) {
+    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
+    for (int j = tidy; j < size; j += BLOCK_H) {
       int xoff = gidx + tidx;
       int yoff = start + j;

       // transpose
-      sh_x[tidx][tidy] = xoff < width && yoff < end ? x[yoff * width + xoff] : 0.0;
-      sh_dy[tidx][tidy] = xoff < width && yoff < end ? dy[yoff * width + xoff] : 0.0;
+      sh_x[tidx][tidy] =
+          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      sh_dy[tidx][tidy + context - 1] =
+          (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
+      __syncthreads();
+      if (tidy < (context - 1)) {
+        yoff = yoff - context + 1;
+        sh_dy[tidx][tidy] =
+            (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
+      }
       __syncthreads();

       for (int t = 0; t < context; t++) {
-        real val = tidx + t < blockDim.x ? sh_x[tidy][tidx + t] * sh_dy[tidy][tidx] : 0.0;
+        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t];
+        __syncthreads();
         // warp size and blockDim.x is 32.
-        val += __shfl_down(val, 16);
-        val += __shfl_down(val, 8);
-        val += __shfl_down(val, 4);
-        val += __shfl_down(val, 2);
-        val += __shfl_down(val, 1);
+        for (int offset = 16; offset > 0; offset /= 2) {
+          val += __shfl_down(val, offset);
+        }
+        __syncthreads();
         if (tidx == 0) {
           sh_dw[t][tidy] += val;
         }
@@ -167,7 +171,7 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
     }
   }

-  for (int t = tidy; t < context && (gidx + tidx) < width; t += blky) {
+  for (int t = tidy; (t < context) && ((gidx + tidx) < width); t += blky) {
     dw[t * width + gidx + tidx] += sh_dw[t][tidx];
   }
 }
@@ -188,21 +192,30 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
     const int start = starts[i];
     const int end = starts[i + 1];
     const int steps = end - start;
-    for (int j = 0; j < steps; j += BLOCK_H) {
+    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
+    for (int j = tidy; j < size; j += BLOCK_H) {
       int xoff = gidx + tidx;
       int yoff = start + j;

       // transpose
-      sh_x[tidx][tidy] = xoff < width && yoff < end ? x[yoff * width + xoff] : 0.0;
-      sh_dy[tidx][tidy] = xoff < width && yoff < end ? dy[yoff * width + xoff] : 0.0;
+      sh_x[tidx][tidy] =
+          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
       __syncthreads();

       for (int t = 0; t < context; t++) {
-        real val = tidx + t < blockDim.x ? sh_x[tidy][tidx + t] * sh_dy[tidy][tidx] : 0.0;
+        sh_dy[tidx][tidy] =
+            (xoff < width && (yoff - t) >= start && yoff - t < end)
+                ? dy[(yoff - t) * width + xoff]
+                : 0.0;
+        __syncthreads();
+
+        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
+        __syncthreads();
         // warp size and blockDim.x is 32.
-        val += __shfl_down(val, 16);
-        val += __shfl_down(val, 8);
-        val += __shfl_down(val, 4);
-        val += __shfl_down(val, 2);
-        val += __shfl_down(val, 1);
+        for (int offset = 16; offset > 0; offset /= 2) {
+          val += __shfl_down(val, offset);
+        }
+        __syncthreads();
         if (tidx == 0 && (gidx + tidy) < width) {
           dw[t * width + gidx + tidy] += val;
         }
@@ -293,34 +306,36 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
   const real* dy = outG.getData();
   const real* x = in.getData();
   const real* w = filter.getData();
-  real* dx = inG.getData();
-  real* dw = filterG.getData();
   const int* starts = seq.getData();

-  dim3 dimBlock(32, 32);
-  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
-  if (contextLength <= 16) {
-    KeRowConvBwWeight<32, 32, 16>
-        <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
-        (dw, x, dy, starts, height, width, numSeq, contextLength);
-  } else {
-    KeRowConvBwWeight2<32, 32>
-        <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
-        (dw, x, dy, starts, height, width, numSeq, contextLength);
+  if (filterG) {
+    dim3 dimBlock(32, 32);
+    dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
+    real* dw = filterG.getData();
+    if (contextLength <= 16) {
+      KeRowConvBwWeight<32, 32, 16>
+          <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+          (dw, x, dy, starts, height, width, numSeq, contextLength);
+    } else {
+      KeRowConvBwWeight2<32, 32>
+          <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+          (dw, x, dy, starts, height, width, numSeq, contextLength);
+    }
   }

-  dim3 dimBlock2(32, 32);
-  dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
-  if (contextLength <= 64) {
-    KeRowConvBwData<32, 64>
-        <<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
-        (dx, w, dy, starts, height, width, numSeq, contextLength);
-  } else {
-    KeRowConvBwData2
-        <<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
-        (dx, w, dy, starts, height, width, numSeq, contextLength);
+  if (inG) {
+    real* dx = inG.getData();
+    dim3 dimBlock2(32, 32);
+    dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
+    if (contextLength <= 64) {
+      KeRowConvBwData<32, 64>
+          <<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
+          (dx, w, dy, starts, height, width, numSeq, contextLength);
+    } else {
+      KeRowConvBwData2
+          <<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
+          (dx, w, dy, starts, height, width, numSeq, contextLength);
+    }
   }
   CHECK_SYNC("RowConvGrad");
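The weight-gradient kernels above replace the hardcoded sequence __shfl_down(val, 16) ... __shfl_down(val, 1) with a halving-offset loop. As a rough illustration of the reduction that loop performs (plain Python standing in for a 32-lane warp; this is not CUDA semantics and not code from the commit):

    def warp_reduce_sum(vals, width=32):
        # Mimics: for (int offset = 16; offset > 0; offset /= 2)
        #             val += __shfl_down(val, offset);
        # Each lane adds the value held by the lane `offset` positions to its
        # right; after log2(width) rounds, lane 0 holds the sum of all lanes.
        vals = list(vals)
        offset = width // 2
        while offset > 0:
            vals = [v + (vals[i + offset] if i + offset < width else v)
                    for i, v in enumerate(vals)]
            offset //= 2
        return vals[0]

    # warp_reduce_sum(range(32)) == sum(range(32)) == 496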
paddle/function/RowConvOpTest.cpp

@@ -47,23 +47,16 @@ void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) {
 }

 TEST(RowConv, real) {
-  // for (size_t numSamples : {17, 129}) {
-  // for (size_t dim : {16, 248}) {
-  // for (size_t context: {3, 7, 65}) {
-  LOG(INFO) << "===========";
-  // for (size_t numSamples : {17}) {
-  // for (size_t dim : {16}) {
-  // for (size_t context: {3}) {
-  size_t numSamples = 17;
-  size_t dim = 16;
-  size_t context = 3;
-  LOG(INFO) << " numSamples=" << numSamples << " dim=" << dim
-            << " context length=" << context;
-  testRowConvFw(numSamples, dim, context);
-  // testRowConvBw(numSamples, dim, context);
-  // }
-  // }
-  // }
+  for (size_t numSamples : {17, 129, 2020}) {
+    for (size_t dim : {16, 512, 2560}) {
+      for (size_t context : {3, 19, 65}) {
+        VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
+                << " context length=" << context;
+        testRowConvFw(numSamples, dim, context);
+        testRowConvBw(numSamples, dim, context);
+      }
+    }
+  }
 }

 }  // namespace paddle
paddle/gserver/layers/RowConvLayer.cpp

@@ -75,7 +75,7 @@ void RowConvLayer::backward(const UpdateCallback& callback) {
   BufferArgs outputs;
   inputs.addArg(*getOutputGrad(), *startPos);
   inputs.addArg(*getInputValue(0), *startPos);
-  inputs.addArg(*weight_->getW(), *startPos);
+  inputs.addArg(*weight_->getW(), wDims_);
   MatrixPtr inGrad = getInputGrad(0);
   MatrixPtr wGrad = weight_->getWGrad();
paddle/gserver/layers/RowConvLayer.h

@@ -37,9 +37,7 @@ protected:
   // fan_out is the size of output feature.
   std::unique_ptr<Weight> weight_;
-  // std::unique_ptr<Weight> biases_;
-
-  // how many steps to look ahead
+  // The step number to look ahead plus one equals contexLength_.
   size_t contexLength_;
   TensorShape wDims_;
 };
python/paddle/trainer/config_parser.py

@@ -2081,6 +2081,23 @@ class MaxOutLayer(LayerBase):
             g_layer_map[input_layer.name].width, out_channels)


+@config_layer('row_conv')
+class RowConvLayer(LayerBase):
+    def __init__(self, name, inputs, context_length, **xargs):
+        super(RowConvLayer, self).__init__(
+            name, 'maxout', 0, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'TransLayer must have one and only one input')
+        input_layer = self.get_input_layer(0)
+        row_conv_conf = self.config.inputs[0].row_conv_conf
+        row_conv_conf.context_length = context_length
+        self.set_layer_size(input_layer.size)
+        psize = context_length * input_layer.size
+        dims = [context_length, input_layer.size]
+        self.create_input_parameter(0, psize, dims)
+
+
 # key: cost type
 # value: cost class
 g_cost_map = {}
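As a quick check of the parameter bookkeeping added above: RowConvLayer creates its weight with dims = [context_length, input_layer.size], so the parameter size is context_length * input_size. For the test configuration added later in this commit (context_length 19, input size 2560) that gives 19 * 2560 = 48640, matching the size: 48640 recorded in test_row_conv.protostr.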
python/paddle/trainer_config_helpers/layers.py

@@ -120,6 +120,7 @@ __all__ = [
     'smooth_l1_cost',
     'layer_support',
     'multiplex_layer',
+    'row_conv_layer',
 ]
@@ -187,6 +188,7 @@ class LayerType(object):
     SPP_LAYER = "spp"
     PAD_LAYER = "pad"
     MULTIPLEX_LAYER = "multiplex"
+    ROW_CONV_LAYER = "row_conv"

     PRINT_LAYER = "print"
     PRIORBOX_LAYER = "priorbox"
@@ -5528,3 +5530,77 @@ def multiplex_layer(input, name=None, layer_attr=None):
         layer_type=LayerType.MULTIPLEX_LAYER,
         parents=input,
         size=l.config.size)
+
+
+@wrap_name_default()
+@wrap_act_default(act=LinearActivation())
+@wrap_param_attr_default()
+@layer_support(DROPOUT)
+def row_conv_layer(input,
+                   context_len,
+                   act=None,
+                   name=None,
+                   param_attr=None,
+                   layer_attr=None):
+    """
+    The row convolution is called lookahead convolution. It was first
+    introduced in the paper `Deep Speech 2: End-to-End Speech Recognition
+    in English and Mandarin <https://arxiv.org/pdf/1512.02595v1.pdf>`_ .
+
+    A bidirectional RNN learns a representation for a sequence by performing
+    a forward and a backward pass through the entire sequence. However,
+    unlike unidirectional RNNs, bidirectional RNNs are challenging to deploy
+    in an online, low-latency setting. The lookahead convolution incorporates
+    information from future subsequences in a computationally efficient
+    manner to improve unidirectional recurrent neural networks.
+
+    The connection pattern of row convolution differs from 1D sequence
+    convolution. Assume the future context length is k, i.e. the output at
+    timestep t is computed from the input features of timesteps t through
+    (t + k). Assume the hidden dimension of the input activations is d; the
+    activations r_t of the new layer at timestep t are:
+
+    .. math::
+        r_{t,i} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}}
+                  \quad \text{for} \quad (1 \leq i \leq d)
+
+    Note:
+        The `context_len` is `k + 1`, that is, the lookahead step number
+        plus one equals context_len.
+
+    .. code-block:: python
+
+       row_conv = row_conv_layer(input=input_layer, context_len=3)
+
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param context_len: The context length equals the lookahead step number
+                        plus one.
+    :type context_len: int
+    :param act: Activation Type. Default is linear activation.
+    :type act: BaseActivation
+    :param param_attr: The Parameter Attribute. If None, the parameter will be
+                       initialized smartly. It's better to set it yourself.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert context_len > 0, "the context_len must be greater than 0."
+
+    Layer(
+        inputs=[Input(input.name, **param_attr.attr)],
+        name=name,
+        context_length=context_len,
+        type=LayerType.ROW_CONV_LAYER,
+        active_type=act.name,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.ROW_CONV_LAYER, input, activation=act, size=input.size)
python/paddle/trainer_config_helpers/tests/configs/file_list.sh

@@ -5,6 +5,6 @@ last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
-test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer)
+test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer test_row_conv)
 export whole_configs=(test_split_datasource)
python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr (new file, mode 100644)

type: "nn"
layers {
  name: "data"
  type: "data"
  size: 2560
  active_type: ""
}
layers {
  name: "__row_conv_layer_0__"
  type: "maxout"
  size: 2560
  active_type: "relu"
  inputs {
    input_layer_name: "data"
    input_parameter_name: "___row_conv_layer_0__.w0"
    row_conv_conf {
      context_length: 19
    }
  }
}
parameters {
  name: "___row_conv_layer_0__.w0"
  size: 48640
  initial_mean: 0.0
  initial_std: 0.229415733871
  dims: 19
  dims: 2560
  initial_strategy: 0
  initial_smart: true
}
input_layer_names: "data"
output_layer_names: "__row_conv_layer_0__"
sub_models {
  name: "root"
  layer_names: "data"
  layer_names: "__row_conv_layer_0__"
  input_layer_names: "data"
  output_layer_names: "__row_conv_layer_0__"
  is_recurrent_layer_group: false
}
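A small aside on this generated protostr: initial_std 0.229415733871 equals 1 / sqrt(19), i.e. 1 / sqrt(context_length), which suggests the "smart" initialization here uses the first weight dimension (the context length) as the fan-in.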
python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py (new file, mode 100644)

from paddle.trainer_config_helpers import *

settings(batch_size=1000, learning_rate=1e-5)

data = data_layer(name='data', size=2560)

row_conv = row_conv_layer(input=data, context_len=19, act=ReluActivation())

outputs(row_conv)