Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
92890ac2
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
92890ac2
编写于
8月 16, 2018
作者:
T
tensor-tang
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'ups/develop' into feature/op/fusion_lstm
上级
b599bd8d
ff92b6ba
变更
10
隐藏空白更改
内联
并排
Showing
10 changed file
with
404 addition
and
106 deletion
+404
-106
paddle/fluid/framework/attribute.h
paddle/fluid/framework/attribute.h
+4
-3
paddle/fluid/operators/gru_op.cc
paddle/fluid/operators/gru_op.cc
+159
-3
paddle/fluid/operators/gru_op.cu.cc
paddle/fluid/operators/gru_op.cu.cc
+90
-0
paddle/fluid/operators/gru_op.h
paddle/fluid/operators/gru_op.h
+0
-84
paddle/fluid/operators/math/blas.h
paddle/fluid/operators/math/blas.h
+41
-0
paddle/fluid/operators/math/blas_impl.h
paddle/fluid/operators/math/blas_impl.h
+75
-0
paddle/fluid/platform/dynload/mklml.h
paddle/fluid/platform/dynload/mklml.h
+8
-0
paddle/fluid/platform/enforce.h
paddle/fluid/platform/enforce.h
+6
-4
paddle/fluid/platform/enforce_test.cc
paddle/fluid/platform/enforce_test.cc
+19
-11
paddle/fluid/platform/gpu_info.cc
paddle/fluid/platform/gpu_info.cc
+2
-1
未找到文件。
paddle/fluid/framework/attribute.h
浏览文件 @
92890ac2
...
@@ -128,7 +128,8 @@ struct ExtractAttribute {
...
@@ -128,7 +128,8 @@ struct ExtractAttribute {
attr_value
=
&
boost
::
get
<
T
>
(
attr
);
attr_value
=
&
boost
::
get
<
T
>
(
attr
);
}
catch
(
boost
::
bad_get
&
bad_get
)
{
}
catch
(
boost
::
bad_get
&
bad_get
)
{
PADDLE_THROW
(
"Cannot get attribute %s by type %s, its type is %s"
,
PADDLE_THROW
(
"Cannot get attribute %s by type %s, its type is %s"
,
attr_name_
,
typeid
(
T
).
name
(),
attr
.
type
().
name
());
attr_name_
,
paddle
::
platform
::
demangle
(
typeid
(
T
).
name
()),
paddle
::
platform
::
demangle
(
attr
.
type
().
name
()));
}
}
return
attr_value
;
return
attr_value
;
}
}
...
@@ -160,7 +161,7 @@ struct ExtractAttribute<bool> {
...
@@ -160,7 +161,7 @@ struct ExtractAttribute<bool> {
attr_value
=
&
boost
::
get
<
bool
>
(
attr
);
attr_value
=
&
boost
::
get
<
bool
>
(
attr
);
}
catch
(
boost
::
bad_get
&
bad_get
)
{
}
catch
(
boost
::
bad_get
&
bad_get
)
{
PADDLE_THROW
(
"Cannot get attribute %s by type bool, its type is %s"
,
PADDLE_THROW
(
"Cannot get attribute %s by type bool, its type is %s"
,
attr_name_
,
attr
.
type
().
name
(
));
attr_name_
,
paddle
::
platform
::
demangle
(
attr
.
type
().
name
()
));
}
}
return
attr_value
;
return
attr_value
;
}
}
...
@@ -186,7 +187,7 @@ struct ExtractAttribute<int64_t> {
...
@@ -186,7 +187,7 @@ struct ExtractAttribute<int64_t> {
attr_value
=
&
boost
::
get
<
int64_t
>
(
attr
);
attr_value
=
&
boost
::
get
<
int64_t
>
(
attr
);
}
catch
(
boost
::
bad_get
&
bad_get
)
{
}
catch
(
boost
::
bad_get
&
bad_get
)
{
PADDLE_THROW
(
"Cannot get attribute %s by type int64_t, its type is %s"
,
PADDLE_THROW
(
"Cannot get attribute %s by type int64_t, its type is %s"
,
attr_name_
,
attr
.
type
().
name
(
));
attr_name_
,
paddle
::
platform
::
demangle
(
attr
.
type
().
name
()
));
}
}
return
attr_value
;
return
attr_value
;
}
}
...
...
paddle/fluid/operators/gru_op.cc
浏览文件 @
92890ac2
...
@@ -14,6 +14,11 @@ limitations under the License. */
...
@@ -14,6 +14,11 @@ limitations under the License. */
#include "paddle/fluid/operators/gru_op.h"
#include "paddle/fluid/operators/gru_op.h"
#include <string>
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -211,6 +216,158 @@ class GRUGradOp : public framework::OperatorWithKernel {
...
@@ -211,6 +216,158 @@ class GRUGradOp : public framework::OperatorWithKernel {
}
}
};
};
template
<
typename
T
>
class
GRUCPUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
BatchCompute
(
const
framework
::
ExecutionContext
&
context
)
const
{
using
DeviceContext
=
paddle
::
platform
::
CPUDeviceContext
;
auto
*
input
=
context
.
Input
<
LoDTensor
>
(
"Input"
);
auto
*
h0
=
context
.
Input
<
Tensor
>
(
"H0"
);
auto
*
weight
=
context
.
Input
<
Tensor
>
(
"Weight"
);
const
T
*
weight_data
=
weight
->
data
<
T
>
();
auto
*
bias
=
context
.
Input
<
Tensor
>
(
"Bias"
);
auto
*
batch_gate
=
context
.
Output
<
LoDTensor
>
(
"BatchGate"
);
batch_gate
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
batch_reset_hidden_prev
=
context
.
Output
<
LoDTensor
>
(
"BatchResetHiddenPrev"
);
batch_reset_hidden_prev
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
batch_hidden
=
context
.
Output
<
LoDTensor
>
(
"BatchHidden"
);
batch_hidden
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
hidden
=
context
.
Output
<
LoDTensor
>
(
"Hidden"
);
hidden
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
hidden_dims
=
hidden
->
dims
();
bool
is_reverse
=
context
.
Attr
<
bool
>
(
"is_reverse"
);
math
::
LoDTensor2BatchFunctor
<
DeviceContext
,
T
>
to_batch
;
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
to_batch
(
dev_ctx
,
*
input
,
batch_gate
,
true
,
is_reverse
);
if
(
bias
)
{
math
::
RowwiseAdd
<
DeviceContext
,
T
>
add_bias
;
add_bias
(
dev_ctx
,
*
batch_gate
,
*
bias
,
batch_gate
);
}
int
frame_size
=
hidden_dims
[
1
];
math
::
GRUMetaValue
<
T
>
gru_value
;
gru_value
.
gate_weight
=
const_cast
<
T
*>
(
weight_data
);
gru_value
.
state_weight
=
const_cast
<
T
*>
(
weight_data
+
2
*
frame_size
*
frame_size
);
Tensor
ordered_h0
;
framework
::
Vector
<
size_t
>
order
(
batch_gate
->
lod
()[
2
]);
if
(
h0
)
{
// Since the batch computing for GRU reorders the input sequences
// according to their length. The initialized cell state also needs
// to reorder.
ReorderInitState
<
DeviceContext
,
T
>
(
context
.
template
device_context
<
DeviceContext
>(),
*
h0
,
order
,
&
ordered_h0
,
true
);
gru_value
.
prev_out_value
=
ordered_h0
.
data
<
T
>
();
}
else
{
gru_value
.
prev_out_value
=
nullptr
;
}
auto
batch_starts
=
batch_gate
->
lod
()[
0
];
size_t
seq_len
=
batch_starts
.
size
()
-
1
;
auto
active_node
=
math
::
detail
::
GetActivationType
(
context
.
Attr
<
std
::
string
>
(
"activation"
));
auto
active_gate
=
math
::
detail
::
GetActivationType
(
context
.
Attr
<
std
::
string
>
(
"gate_activation"
));
#ifdef PADDLE_WITH_MKLML
// use MKL packed to speedup GEMM
if
(
FLAGS_paddle_num_threads
>=
4
)
{
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
T
*
packed_gate
=
blas
.
GEMM_ALLOC
(
CblasBMatrix
,
1
/*height of C*/
,
frame_size
*
2
/*width of weight*/
,
frame_size
/*height of height*/
);
PADDLE_ENFORCE
(
packed_gate
);
blas
.
GEMM_PACK
(
CblasBMatrix
,
CblasNoTrans
,
1
/*cur bs?*/
,
frame_size
*
2
,
frame_size
,
T
(
1.0
),
gru_value
.
gate_weight
,
frame_size
*
2
,
packed_gate
);
T
*
packed_state
=
blas
.
GEMM_ALLOC
(
CblasBMatrix
,
1
/*height of C*/
,
frame_size
/*width of weight*/
,
frame_size
/*height of height*/
);
PADDLE_ENFORCE
(
packed_state
);
blas
.
GEMM_PACK
(
CblasBMatrix
,
CblasNoTrans
,
1
/*cur bs?*/
,
frame_size
,
frame_size
,
T
(
1.0
),
gru_value
.
state_weight
,
frame_size
,
packed_state
);
for
(
size_t
n
=
0
;
n
<
seq_len
;
n
++
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
int
cur_batch_size
=
bend
-
bstart
;
Tensor
gate_t
=
batch_gate
->
Slice
(
bstart
,
bend
);
Tensor
reset_hidden_prev_t
=
batch_reset_hidden_prev
->
Slice
(
bstart
,
bend
);
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
gru_value
.
output_value
=
hidden_t
.
data
<
T
>
();
gru_value
.
gate_value
=
gate_t
.
data
<
T
>
();
gru_value
.
reset_output_value
=
reset_hidden_prev_t
.
data
<
T
>
();
if
(
gru_value
.
prev_out_value
)
{
blas
.
GEMM_COMPUTE
(
CblasNoTrans
,
CblasPacked
,
cur_batch_size
,
frame_size
*
2
,
frame_size
,
gru_value
.
prev_out_value
,
frame_size
,
packed_gate
,
frame_size
*
2
,
T
(
1
),
gru_value
.
gate_value
,
frame_size
*
3
);
}
math
::
detail
::
forward_reset_output
(
math
::
detail
::
forward
::
gru_resetOutput
<
T
>
(),
gru_value
,
frame_size
,
cur_batch_size
,
active_gate
);
if
(
gru_value
.
prev_out_value
)
{
blas
.
GEMM_COMPUTE
(
CblasNoTrans
,
CblasPacked
,
cur_batch_size
,
frame_size
,
frame_size
,
gru_value
.
reset_output_value
,
frame_size
,
packed_state
,
frame_size
,
T
(
1
),
gru_value
.
gate_value
+
frame_size
*
2
,
frame_size
*
3
);
}
math
::
detail
::
forward_final_output
(
math
::
detail
::
forward
::
gru_finalOutput
<
T
>
(),
gru_value
,
frame_size
,
cur_batch_size
,
active_node
);
gru_value
.
prev_out_value
=
gru_value
.
output_value
;
}
blas
.
GEMM_FREE
(
packed_gate
);
blas
.
GEMM_FREE
(
packed_state
);
}
else
{
#endif
for
(
size_t
n
=
0
;
n
<
seq_len
;
n
++
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
int
cur_batch_size
=
bend
-
bstart
;
Tensor
gate_t
=
batch_gate
->
Slice
(
bstart
,
bend
);
Tensor
reset_hidden_prev_t
=
batch_reset_hidden_prev
->
Slice
(
bstart
,
bend
);
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
gru_value
.
output_value
=
hidden_t
.
data
<
T
>
();
gru_value
.
gate_value
=
gate_t
.
data
<
T
>
();
gru_value
.
reset_output_value
=
reset_hidden_prev_t
.
data
<
T
>
();
math
::
GRUUnitFunctor
<
DeviceContext
,
T
>::
compute
(
dev_ctx
,
gru_value
,
frame_size
,
cur_batch_size
,
active_node
,
active_gate
);
gru_value
.
prev_out_value
=
gru_value
.
output_value
;
}
#ifdef PADDLE_WITH_MKLML
}
#endif
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
batch_hidden
->
set_lod
(
batch_gate
->
lod
());
to_seq
(
dev_ctx
,
*
batch_hidden
,
hidden
);
}
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
BatchCompute
(
context
);
}
};
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
...
@@ -218,9 +375,8 @@ namespace ops = paddle::operators;
...
@@ -218,9 +375,8 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR
(
gru
,
ops
::
GRUOp
,
ops
::
GRUOpMaker
,
REGISTER_OPERATOR
(
gru
,
ops
::
GRUOp
,
ops
::
GRUOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
gru_grad
,
ops
::
GRUGradOp
);
REGISTER_OPERATOR
(
gru_grad
,
ops
::
GRUGradOp
);
REGISTER_OP_CPU_KERNEL
(
REGISTER_OP_CPU_KERNEL
(
gru
,
ops
::
GRUCPUKernel
<
float
>
,
gru
,
ops
::
GRUKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
GRUCPUKernel
<
double
>
);
ops
::
GRUKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
REGISTER_OP_CPU_KERNEL
(
REGISTER_OP_CPU_KERNEL
(
gru_grad
,
ops
::
GRUGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
gru_grad
,
ops
::
GRUGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
GRUGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
ops
::
GRUGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
paddle/fluid/operators/gru_op.cu.cc
浏览文件 @
92890ac2
...
@@ -14,6 +14,96 @@ limitations under the License. */
...
@@ -14,6 +14,96 @@ limitations under the License. */
#include "paddle/fluid/operators/gru_op.h"
#include "paddle/fluid/operators/gru_op.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
DeviceContext
,
typename
T
>
class
GRUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
BatchCompute
(
const
framework
::
ExecutionContext
&
context
)
const
{
auto
*
input
=
context
.
Input
<
LoDTensor
>
(
"Input"
);
auto
*
h0
=
context
.
Input
<
Tensor
>
(
"H0"
);
auto
*
weight
=
context
.
Input
<
Tensor
>
(
"Weight"
);
const
T
*
weight_data
=
weight
->
data
<
T
>
();
auto
*
bias
=
context
.
Input
<
Tensor
>
(
"Bias"
);
auto
*
batch_gate
=
context
.
Output
<
LoDTensor
>
(
"BatchGate"
);
batch_gate
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
batch_reset_hidden_prev
=
context
.
Output
<
LoDTensor
>
(
"BatchResetHiddenPrev"
);
batch_reset_hidden_prev
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
batch_hidden
=
context
.
Output
<
LoDTensor
>
(
"BatchHidden"
);
batch_hidden
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
hidden
=
context
.
Output
<
LoDTensor
>
(
"Hidden"
);
hidden
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
hidden_dims
=
hidden
->
dims
();
bool
is_reverse
=
context
.
Attr
<
bool
>
(
"is_reverse"
);
math
::
LoDTensor2BatchFunctor
<
DeviceContext
,
T
>
to_batch
;
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
to_batch
(
dev_ctx
,
*
input
,
batch_gate
,
true
,
is_reverse
);
if
(
bias
)
{
math
::
RowwiseAdd
<
DeviceContext
,
T
>
add_bias
;
add_bias
(
dev_ctx
,
*
batch_gate
,
*
bias
,
batch_gate
);
}
int
frame_size
=
hidden_dims
[
1
];
math
::
GRUMetaValue
<
T
>
gru_value
;
gru_value
.
gate_weight
=
const_cast
<
T
*>
(
weight_data
);
gru_value
.
state_weight
=
const_cast
<
T
*>
(
weight_data
+
2
*
frame_size
*
frame_size
);
Tensor
ordered_h0
;
framework
::
Vector
<
size_t
>
order
(
batch_gate
->
lod
()[
2
]);
if
(
h0
)
{
// Since the batch computing for GRU reorders the input sequences
// according to their length. The initialized cell state also needs
// to reorder.
ReorderInitState
<
DeviceContext
,
T
>
(
context
.
template
device_context
<
DeviceContext
>(),
*
h0
,
order
,
&
ordered_h0
,
true
);
gru_value
.
prev_out_value
=
ordered_h0
.
data
<
T
>
();
}
else
{
gru_value
.
prev_out_value
=
nullptr
;
}
auto
batch_starts
=
batch_gate
->
lod
()[
0
];
size_t
num_batch
=
batch_starts
.
size
()
-
1
;
auto
active_node
=
math
::
detail
::
GetActivationType
(
context
.
Attr
<
std
::
string
>
(
"activation"
));
auto
active_gate
=
math
::
detail
::
GetActivationType
(
context
.
Attr
<
std
::
string
>
(
"gate_activation"
));
for
(
size_t
n
=
0
;
n
<
num_batch
;
n
++
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
int
cur_batch_size
=
bend
-
bstart
;
Tensor
gate_t
=
batch_gate
->
Slice
(
bstart
,
bend
);
Tensor
reset_hidden_prev_t
=
batch_reset_hidden_prev
->
Slice
(
bstart
,
bend
);
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
gru_value
.
output_value
=
hidden_t
.
data
<
T
>
();
gru_value
.
gate_value
=
gate_t
.
data
<
T
>
();
gru_value
.
reset_output_value
=
reset_hidden_prev_t
.
data
<
T
>
();
math
::
GRUUnitFunctor
<
DeviceContext
,
T
>::
compute
(
dev_ctx
,
gru_value
,
frame_size
,
cur_batch_size
,
active_node
,
active_gate
);
gru_value
.
prev_out_value
=
gru_value
.
output_value
;
}
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
batch_hidden
->
set_lod
(
batch_gate
->
lod
());
to_seq
(
dev_ctx
,
*
batch_hidden
,
hidden
);
}
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
BatchCompute
(
context
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
REGISTER_OP_CUDA_KERNEL
(
gru
,
ops
::
GRUKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
gru
,
ops
::
GRUKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
...
...
paddle/fluid/operators/gru_op.h
浏览文件 @
92890ac2
...
@@ -37,90 +37,6 @@ inline void ReorderInitState(const DeviceContext& ctx,
...
@@ -37,90 +37,6 @@ inline void ReorderInitState(const DeviceContext& ctx,
row_shuffle
(
ctx
,
src
,
index_lod
,
dst
,
indexed_src
);
row_shuffle
(
ctx
,
src
,
index_lod
,
dst
,
indexed_src
);
}
}
template
<
typename
DeviceContext
,
typename
T
>
class
GRUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
BatchCompute
(
const
framework
::
ExecutionContext
&
context
)
const
{
auto
*
input
=
context
.
Input
<
LoDTensor
>
(
"Input"
);
auto
*
h0
=
context
.
Input
<
Tensor
>
(
"H0"
);
auto
*
weight
=
context
.
Input
<
Tensor
>
(
"Weight"
);
const
T
*
weight_data
=
weight
->
data
<
T
>
();
auto
*
bias
=
context
.
Input
<
Tensor
>
(
"Bias"
);
auto
*
batch_gate
=
context
.
Output
<
LoDTensor
>
(
"BatchGate"
);
batch_gate
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
batch_reset_hidden_prev
=
context
.
Output
<
LoDTensor
>
(
"BatchResetHiddenPrev"
);
batch_reset_hidden_prev
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
batch_hidden
=
context
.
Output
<
LoDTensor
>
(
"BatchHidden"
);
batch_hidden
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
hidden
=
context
.
Output
<
LoDTensor
>
(
"Hidden"
);
hidden
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
hidden_dims
=
hidden
->
dims
();
bool
is_reverse
=
context
.
Attr
<
bool
>
(
"is_reverse"
);
math
::
LoDTensor2BatchFunctor
<
DeviceContext
,
T
>
to_batch
;
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
to_batch
(
dev_ctx
,
*
input
,
batch_gate
,
true
,
is_reverse
);
if
(
bias
)
{
math
::
RowwiseAdd
<
DeviceContext
,
T
>
add_bias
;
add_bias
(
dev_ctx
,
*
batch_gate
,
*
bias
,
batch_gate
);
}
int
frame_size
=
hidden_dims
[
1
];
math
::
GRUMetaValue
<
T
>
gru_value
;
gru_value
.
gate_weight
=
const_cast
<
T
*>
(
weight_data
);
gru_value
.
state_weight
=
const_cast
<
T
*>
(
weight_data
+
2
*
frame_size
*
frame_size
);
Tensor
ordered_h0
;
framework
::
Vector
<
size_t
>
order
(
batch_gate
->
lod
()[
2
]);
if
(
h0
)
{
// Since the batch computing for GRU reorders the input sequences
// according to their length. The initialized cell state also needs
// to reorder.
ReorderInitState
<
DeviceContext
,
T
>
(
context
.
template
device_context
<
DeviceContext
>(),
*
h0
,
order
,
&
ordered_h0
,
true
);
gru_value
.
prev_out_value
=
ordered_h0
.
data
<
T
>
();
}
else
{
gru_value
.
prev_out_value
=
nullptr
;
}
auto
batch_starts
=
batch_gate
->
lod
()[
0
];
size_t
num_batch
=
batch_starts
.
size
()
-
1
;
auto
active_node
=
math
::
detail
::
GetActivationType
(
context
.
Attr
<
std
::
string
>
(
"activation"
));
auto
active_gate
=
math
::
detail
::
GetActivationType
(
context
.
Attr
<
std
::
string
>
(
"gate_activation"
));
for
(
size_t
n
=
0
;
n
<
num_batch
;
n
++
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
int
cur_batch_size
=
bend
-
bstart
;
Tensor
gate_t
=
batch_gate
->
Slice
(
bstart
,
bend
);
Tensor
reset_hidden_prev_t
=
batch_reset_hidden_prev
->
Slice
(
bstart
,
bend
);
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
gru_value
.
output_value
=
hidden_t
.
data
<
T
>
();
gru_value
.
gate_value
=
gate_t
.
data
<
T
>
();
gru_value
.
reset_output_value
=
reset_hidden_prev_t
.
data
<
T
>
();
math
::
GRUUnitFunctor
<
DeviceContext
,
T
>::
compute
(
dev_ctx
,
gru_value
,
frame_size
,
cur_batch_size
,
active_node
,
active_gate
);
gru_value
.
prev_out_value
=
gru_value
.
output_value
;
}
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
batch_hidden
->
set_lod
(
batch_gate
->
lod
());
to_seq
(
dev_ctx
,
*
batch_hidden
,
hidden
);
}
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
BatchCompute
(
context
);
}
};
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
DeviceContext
,
typename
T
>
class
GRUGradKernel
:
public
framework
::
OpKernel
<
T
>
{
class
GRUGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
...
...
paddle/fluid/operators/math/blas.h
浏览文件 @
92890ac2
...
@@ -90,6 +90,25 @@ class Blas {
...
@@ -90,6 +90,25 @@ class Blas {
void
GEMM
(
bool
transA
,
bool
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
void
GEMM
(
bool
transA
,
bool
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
int
lda
,
const
T
*
B
,
int
ldb
,
T
beta
,
T
*
C
,
int
ldc
)
const
;
int
lda
,
const
T
*
B
,
int
ldb
,
T
beta
,
T
*
C
,
int
ldc
)
const
;
#ifdef PADDLE_WITH_MKLML
template
<
typename
T
>
T
*
GEMM_ALLOC
(
const
CBLAS_IDENTIFIER
id
,
const
int
M
,
const
int
N
,
const
int
K
)
const
;
template
<
typename
T
>
void
GEMM_PACK
(
const
CBLAS_IDENTIFIER
id
,
const
CBLAS_TRANSPOSE
trans
,
int
M
,
int
N
,
int
K
,
const
T
alpha
,
const
T
*
src
,
const
int
ld
,
T
*
dst
)
const
;
template
<
typename
T
>
void
GEMM_COMPUTE
(
int
transA
,
int
transB
,
int
M
,
int
N
,
int
K
,
const
T
*
A
,
const
int
lda
,
const
T
*
B
,
const
int
ldb
,
T
beta
,
T
*
C
,
const
int
ldc
)
const
;
template
<
typename
T
>
void
GEMM_FREE
(
T
*
data
)
const
;
#endif
template
<
typename
T
>
template
<
typename
T
>
void
MatMul
(
const
framework
::
Tensor
&
mat_a
,
bool
trans_a
,
void
MatMul
(
const
framework
::
Tensor
&
mat_a
,
bool
trans_a
,
const
framework
::
Tensor
&
mat_b
,
bool
trans_b
,
T
alpha
,
const
framework
::
Tensor
&
mat_b
,
bool
trans_b
,
T
alpha
,
...
@@ -146,6 +165,28 @@ class BlasT : private Blas<DeviceContext> {
...
@@ -146,6 +165,28 @@ class BlasT : private Blas<DeviceContext> {
Base
()
->
template
GEMM
<
T
>(
args
...);
Base
()
->
template
GEMM
<
T
>(
args
...);
}
}
#ifdef PADDLE_WITH_MKLML
template
<
typename
...
ARGS
>
T
*
GEMM_ALLOC
(
ARGS
...
args
)
const
{
return
Base
()
->
template
GEMM_ALLOC
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
GEMM_PACK
(
ARGS
...
args
)
const
{
Base
()
->
template
GEMM_PACK
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
GEMM_COMPUTE
(
ARGS
...
args
)
const
{
Base
()
->
template
GEMM_COMPUTE
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
GEMM_FREE
(
ARGS
...
args
)
const
{
Base
()
->
template
GEMM_FREE
<
T
>(
args
...);
}
#endif
template
<
typename
...
ARGS
>
template
<
typename
...
ARGS
>
void
MatMul
(
ARGS
...
args
)
const
{
void
MatMul
(
ARGS
...
args
)
const
{
Base
()
->
template
MatMul
<
T
>(
args
...);
Base
()
->
template
MatMul
<
T
>(
args
...);
...
...
paddle/fluid/operators/math/blas_impl.h
浏览文件 @
92890ac2
...
@@ -31,6 +31,26 @@ struct CBlas<float> {
...
@@ -31,6 +31,26 @@ struct CBlas<float> {
platform
::
dynload
::
cblas_sgemm
(
args
...);
platform
::
dynload
::
cblas_sgemm
(
args
...);
}
}
template
<
typename
...
ARGS
>
static
float
*
GEMM_ALLOC
(
ARGS
...
args
)
{
return
platform
::
dynload
::
cblas_sgemm_alloc
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
GEMM_PACK
(
ARGS
...
args
)
{
platform
::
dynload
::
cblas_sgemm_pack
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
GEMM_COMPUTE
(
ARGS
...
args
)
{
platform
::
dynload
::
cblas_sgemm_compute
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
GEMM_FREE
(
ARGS
...
args
)
{
platform
::
dynload
::
cblas_sgemm_free
(
args
...);
}
#ifdef PADDLE_WITH_LIBXSMM
#ifdef PADDLE_WITH_LIBXSMM
template
<
typename
...
ARGS
>
template
<
typename
...
ARGS
>
static
void
SMM_GEMM
(
ARGS
...
args
)
{
static
void
SMM_GEMM
(
ARGS
...
args
)
{
...
@@ -71,6 +91,26 @@ struct CBlas<double> {
...
@@ -71,6 +91,26 @@ struct CBlas<double> {
platform
::
dynload
::
cblas_dgemm
(
args
...);
platform
::
dynload
::
cblas_dgemm
(
args
...);
}
}
template
<
typename
...
ARGS
>
static
double
*
GEMM_ALLOC
(
ARGS
...
args
)
{
return
platform
::
dynload
::
cblas_dgemm_alloc
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
GEMM_PACK
(
ARGS
...
args
)
{
platform
::
dynload
::
cblas_dgemm_pack
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
GEMM_COMPUTE
(
ARGS
...
args
)
{
platform
::
dynload
::
cblas_dgemm_compute
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
GEMM_FREE
(
ARGS
...
args
)
{
platform
::
dynload
::
cblas_dgemm_free
(
args
...);
}
#ifdef PADDLE_WITH_LIBXSMM
#ifdef PADDLE_WITH_LIBXSMM
template
<
typename
...
ARGS
>
template
<
typename
...
ARGS
>
static
void
SMM_GEMM
(
ARGS
...
args
)
{
static
void
SMM_GEMM
(
ARGS
...
args
)
{
...
@@ -224,6 +264,41 @@ inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA,
...
@@ -224,6 +264,41 @@ inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA,
beta
,
C
,
ldc
);
beta
,
C
,
ldc
);
}
}
#ifdef PADDLE_WITH_MKLML
template
<
>
template
<
typename
T
>
T
*
Blas
<
platform
::
CPUDeviceContext
>::
GEMM_ALLOC
(
const
CBLAS_IDENTIFIER
id
,
const
int
M
,
const
int
N
,
const
int
K
)
const
{
return
CBlas
<
T
>::
GEMM_ALLOC
(
id
,
M
,
N
,
K
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMM_PACK
(
const
CBLAS_IDENTIFIER
id
,
const
CBLAS_TRANSPOSE
trans
,
int
M
,
int
N
,
int
K
,
const
T
alpha
,
const
T
*
src
,
const
int
ld
,
T
*
dst
)
const
{
CBlas
<
T
>::
GEMM_PACK
(
CblasRowMajor
,
id
,
trans
,
M
,
N
,
K
,
alpha
,
src
,
ld
,
dst
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMM_COMPUTE
(
int
transA
,
int
transB
,
int
M
,
int
N
,
int
K
,
const
T
*
A
,
const
int
lda
,
const
T
*
B
,
const
int
ldb
,
T
beta
,
T
*
C
,
const
int
ldc
)
const
{
CBlas
<
T
>::
GEMM_COMPUTE
(
CblasRowMajor
,
transA
,
transB
,
M
,
N
,
K
,
A
,
lda
,
B
,
ldb
,
beta
,
C
,
ldc
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMM_FREE
(
T
*
data
)
const
{
CBlas
<
T
>::
GEMM_FREE
(
data
);
}
#endif
template
<
>
template
<
>
template
<
typename
T
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
...
...
paddle/fluid/platform/dynload/mklml.h
浏览文件 @
92890ac2
...
@@ -60,6 +60,14 @@ extern void* mklml_dso_handle;
...
@@ -60,6 +60,14 @@ extern void* mklml_dso_handle;
__macro(cblas_dgemm_batch); \
__macro(cblas_dgemm_batch); \
__macro(vsAdd); \
__macro(vsAdd); \
__macro(vdAdd); \
__macro(vdAdd); \
__macro(cblas_sgemm_alloc); \
__macro(cblas_sgemm_pack); \
__macro(cblas_sgemm_compute); \
__macro(cblas_sgemm_free); \
__macro(cblas_dgemm_alloc); \
__macro(cblas_dgemm_pack); \
__macro(cblas_dgemm_compute); \
__macro(cblas_dgemm_free); \
__macro(MKL_Set_Num_Threads)
__macro(MKL_Set_Num_Threads)
MKLML_ROUTINE_EACH
(
DECLARE_DYNAMIC_LOAD_MKLML_WRAP
);
MKLML_ROUTINE_EACH
(
DECLARE_DYNAMIC_LOAD_MKLML_WRAP
);
...
...
paddle/fluid/platform/enforce.h
浏览文件 @
92890ac2
...
@@ -263,7 +263,8 @@ inline void throw_on_error(T e) {
...
@@ -263,7 +263,8 @@ inline void throw_on_error(T e) {
* PADDLE_ENFORCE_EQ(a, b);
* PADDLE_ENFORCE_EQ(a, b);
*
*
* will raise an expression described as follows:
* will raise an expression described as follows:
* "enforce a == b failed, 1 != 2" with detailed stack information.
* "Enforce failed. Expected input a == b, but received a(1) != b(2)."
* with detailed stack information.
*
*
* extra messages is also supported, for example:
* extra messages is also supported, for example:
* PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
* PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
...
@@ -292,9 +293,10 @@ inline void throw_on_error(T e) {
...
@@ -292,9 +293,10 @@ inline void throw_on_error(T e) {
#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
do { \
do { \
if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) { \
if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) { \
PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP \
PADDLE_THROW("Enforce failed. Expected %s " #__CMP \
" %s\n%s", \
" %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \
#__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \
#__VAL0, #__VAL1, #__VAL0, \
paddle::string::to_string(__VAL0), #__VAL1, \
paddle::string::to_string(__VAL1), \
paddle::string::to_string(__VAL1), \
paddle::string::Sprintf("" __VA_ARGS__)); \
paddle::string::Sprintf("" __VA_ARGS__)); \
} \
} \
...
...
paddle/fluid/platform/enforce_test.cc
浏览文件 @
92890ac2
...
@@ -54,7 +54,9 @@ TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) {
...
@@ -54,7 +54,9 @@ TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) {
PADDLE_ENFORCE_EQ
(
a
,
1
+
3
);
PADDLE_ENFORCE_EQ
(
a
,
1
+
3
);
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
caught_exception
=
true
;
caught_exception
=
true
;
HasPrefix
(
StringPiece
(
error
.
what
()),
"enforce a == 1 + 3 failed, 2 != 4"
);
HasPrefix
(
StringPiece
(
error
.
what
()),
"Enforce failed. Expected a == 1 + 3, but received a:2 != 1 + 3:4."
);
}
}
EXPECT_TRUE
(
caught_exception
);
EXPECT_TRUE
(
caught_exception
);
}
}
...
@@ -67,7 +69,8 @@ TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) {
...
@@ -67,7 +69,8 @@ TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) {
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
caught_exception
=
true
;
caught_exception
=
true
;
HasPrefix
(
StringPiece
(
error
.
what
()),
HasPrefix
(
StringPiece
(
error
.
what
()),
"enforce a == 1 + 3 failed, 2 != 4
\n
their size not match"
);
"Enforce failed. Expected a == 1 + 3, but received a:2 != 1 + "
"3:4.
\n
their size not match"
);
}
}
EXPECT_TRUE
(
caught_exception
);
EXPECT_TRUE
(
caught_exception
);
}
}
...
@@ -84,8 +87,9 @@ TEST(ENFORCE_NE, FAIL) {
...
@@ -84,8 +87,9 @@ TEST(ENFORCE_NE, FAIL) {
PADDLE_ENFORCE_NE
(
1.0
,
1UL
);
PADDLE_ENFORCE_NE
(
1.0
,
1UL
);
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
caught_exception
=
true
;
caught_exception
=
true
;
EXPECT_TRUE
(
HasPrefix
(
StringPiece
(
error
.
what
()),
EXPECT_TRUE
(
HasPrefix
(
"enforce 1.0 != 1UL failed, 1 == 1"
))
StringPiece
(
error
.
what
()),
"Enforce failed. Expected 1.0 != 1UL, but received 1.0:1 == 1UL:1."
))
<<
error
.
what
()
<<
" does not have expected prefix"
;
<<
error
.
what
()
<<
" does not have expected prefix"
;
}
}
EXPECT_TRUE
(
caught_exception
);
EXPECT_TRUE
(
caught_exception
);
...
@@ -98,8 +102,9 @@ TEST(ENFORCE_GT, FAIL) {
...
@@ -98,8 +102,9 @@ TEST(ENFORCE_GT, FAIL) {
PADDLE_ENFORCE_GT
(
1
,
2UL
);
PADDLE_ENFORCE_GT
(
1
,
2UL
);
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
caught_exception
=
true
;
caught_exception
=
true
;
EXPECT_TRUE
(
EXPECT_TRUE
(
HasPrefix
(
HasPrefix
(
StringPiece
(
error
.
what
()),
"enforce 1 > 2UL failed, 1 <= 2"
));
StringPiece
(
error
.
what
()),
"Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2."
));
}
}
EXPECT_TRUE
(
caught_exception
);
EXPECT_TRUE
(
caught_exception
);
}
}
...
@@ -116,8 +121,9 @@ TEST(ENFORCE_GE, FAIL) {
...
@@ -116,8 +121,9 @@ TEST(ENFORCE_GE, FAIL) {
PADDLE_ENFORCE_GE
(
1
,
2UL
);
PADDLE_ENFORCE_GE
(
1
,
2UL
);
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
caught_exception
=
true
;
caught_exception
=
true
;
EXPECT_TRUE
(
EXPECT_TRUE
(
HasPrefix
(
HasPrefix
(
StringPiece
(
error
.
what
()),
"enforce 1 >= 2UL failed, 1 < 2"
));
StringPiece
(
error
.
what
()),
"Enforce failed. Expected 1 >= 2UL, but received 1:1 < 2UL:2."
));
}
}
EXPECT_TRUE
(
caught_exception
);
EXPECT_TRUE
(
caught_exception
);
}
}
...
@@ -135,8 +141,9 @@ TEST(ENFORCE_LE, FAIL) {
...
@@ -135,8 +141,9 @@ TEST(ENFORCE_LE, FAIL) {
PADDLE_ENFORCE_GT
(
1
,
2UL
);
PADDLE_ENFORCE_GT
(
1
,
2UL
);
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
caught_exception
=
true
;
caught_exception
=
true
;
EXPECT_TRUE
(
EXPECT_TRUE
(
HasPrefix
(
HasPrefix
(
StringPiece
(
error
.
what
()),
"enforce 1 > 2UL failed, 1 <= 2"
));
StringPiece
(
error
.
what
()),
"Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2."
));
}
}
EXPECT_TRUE
(
caught_exception
);
EXPECT_TRUE
(
caught_exception
);
}
}
...
@@ -153,7 +160,8 @@ TEST(ENFORCE_LT, FAIL) {
...
@@ -153,7 +160,8 @@ TEST(ENFORCE_LT, FAIL) {
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
caught_exception
=
true
;
caught_exception
=
true
;
EXPECT_TRUE
(
HasPrefix
(
StringPiece
(
error
.
what
()),
EXPECT_TRUE
(
HasPrefix
(
StringPiece
(
error
.
what
()),
"enforce 1UL < 0.12 failed, 1 >= 0.12"
));
"Enforce failed. Expected 1UL < 0.12, but "
"received 1UL:1 >= 0.12:0.12."
));
}
}
EXPECT_TRUE
(
caught_exception
);
EXPECT_TRUE
(
caught_exception
);
}
}
...
...
paddle/fluid/platform/gpu_info.cc
浏览文件 @
92890ac2
...
@@ -116,7 +116,8 @@ size_t GpuMaxChunkSize() {
...
@@ -116,7 +116,8 @@ size_t GpuMaxChunkSize() {
size_t
allocating
=
static_cast
<
size_t
>
(
FLAGS_fraction_of_gpu_memory_to_use
*
size_t
allocating
=
static_cast
<
size_t
>
(
FLAGS_fraction_of_gpu_memory_to_use
*
(
total
-
reserving
));
(
total
-
reserving
));
PADDLE_ENFORCE_LE
(
allocating
,
available
);
PADDLE_ENFORCE_LE
(
allocating
,
available
,
"Insufficient GPU memory to allocation."
);
return
allocating
;
return
allocating
;
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录