Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
8314412b
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
8314412b
编写于
1月 24, 2018
作者:
Y
yangyaming
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into fix-7717
上级
f3fe4107
02fee882
变更
38
展开全部
隐藏空白更改
内联
并排
Showing
38 changed file
with
761 addition
and
237 deletion
+761
-237
doc/api/v2/fluid/nets.rst
doc/api/v2/fluid/nets.rst
+3
-3
doc/design/dist_refactor/distributed_architecture.md
doc/design/dist_refactor/distributed_architecture.md
+2
-2
doc/design/dist_refactor/src/remote_executor.graffle
doc/design/dist_refactor/src/remote_executor.graffle
+0
-0
doc/design/dist_refactor/src/remote_executor.png
doc/design/dist_refactor/src/remote_executor.png
+0
-0
paddle/framework/CMakeLists.txt
paddle/framework/CMakeLists.txt
+2
-1
paddle/framework/executor.cc
paddle/framework/executor.cc
+5
-0
paddle/gserver/tests/test_LayerGrad.cpp
paddle/gserver/tests/test_LayerGrad.cpp
+26
-18
paddle/operators/detail/grpc_client.cc
paddle/operators/detail/grpc_client.cc
+71
-47
paddle/operators/im2sequence_op.h
paddle/operators/im2sequence_op.h
+1
-1
paddle/operators/reshape_op.cc
paddle/operators/reshape_op.cc
+2
-6
paddle/platform/profiler.cc
paddle/platform/profiler.cc
+36
-19
paddle/platform/profiler.h
paddle/platform/profiler.h
+21
-13
paddle/platform/profiler_test.cc
paddle/platform/profiler_test.cc
+4
-6
paddle/pybind/CMakeLists.txt
paddle/pybind/CMakeLists.txt
+1
-1
paddle/pybind/protobuf.h
paddle/pybind/protobuf.h
+1
-0
paddle/pybind/pybind.cc
paddle/pybind/pybind.cc
+21
-2
python/paddle/v2/fluid/__init__.py
python/paddle/v2/fluid/__init__.py
+1
-1
python/paddle/v2/fluid/backward.py
python/paddle/v2/fluid/backward.py
+1
-1
python/paddle/v2/fluid/io.py
python/paddle/v2/fluid/io.py
+9
-2
python/paddle/v2/fluid/layer_helper.py
python/paddle/v2/fluid/layer_helper.py
+1
-0
python/paddle/v2/fluid/layers/nn.py
python/paddle/v2/fluid/layers/nn.py
+162
-73
python/paddle/v2/fluid/nets.py
python/paddle/v2/fluid/nets.py
+150
-24
python/paddle/v2/fluid/profiler.py
python/paddle/v2/fluid/profiler.py
+55
-0
python/paddle/v2/fluid/tests/op_test.py
python/paddle/v2/fluid/tests/op_test.py
+2
-2
python/paddle/v2/fluid/tests/test_adagrad_op.py
python/paddle/v2/fluid/tests/test_adagrad_op.py
+1
-1
python/paddle/v2/fluid/tests/test_adam_op.py
python/paddle/v2/fluid/tests/test_adam_op.py
+1
-1
python/paddle/v2/fluid/tests/test_batch_norm_op.py
python/paddle/v2/fluid/tests/test_batch_norm_op.py
+1
-1
python/paddle/v2/fluid/tests/test_gaussian_random_op.py
python/paddle/v2/fluid/tests/test_gaussian_random_op.py
+1
-1
python/paddle/v2/fluid/tests/test_iou_similarity_op.py
python/paddle/v2/fluid/tests/test_iou_similarity_op.py
+0
-0
python/paddle/v2/fluid/tests/test_multihead_attention.py
python/paddle/v2/fluid/tests/test_multihead_attention.py
+98
-0
python/paddle/v2/fluid/tests/test_normalization_wrapper.py
python/paddle/v2/fluid/tests/test_normalization_wrapper.py
+1
-1
python/paddle/v2/fluid/tests/test_op_support_gpu.py
python/paddle/v2/fluid/tests/test_op_support_gpu.py
+2
-1
python/paddle/v2/fluid/tests/test_parallel_op.py
python/paddle/v2/fluid/tests/test_parallel_op.py
+28
-3
python/paddle/v2/fluid/tests/test_profiler.py
python/paddle/v2/fluid/tests/test_profiler.py
+47
-2
python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
+1
-1
python/paddle/v2/fluid/tests/test_sgd_op.py
python/paddle/v2/fluid/tests/test_sgd_op.py
+1
-1
python/paddle/v2/fluid/tests/test_split_selected_rows_op.py
python/paddle/v2/fluid/tests/test_split_selected_rows_op.py
+1
-1
python/paddle/v2/fluid/tests/test_uniform_random_op.py
python/paddle/v2/fluid/tests/test_uniform_random_op.py
+1
-1
未找到文件。
doc/api/v2/fluid/nets.rst
浏览文件 @
8314412b
...
...
@@ -26,8 +26,8 @@ glu
:noindex:
dot_product_attention
---------------------
.. autofunction:: paddle.v2.fluid.nets.dot_product_attention
scaled_
dot_product_attention
---------------------
-------
.. autofunction:: paddle.v2.fluid.nets.
scaled_
dot_product_attention
:noindex:
doc/design/dist_refactor/distributed_architecture.md
浏览文件 @
8314412b
...
...
@@ -152,12 +152,12 @@ for data in train_reader():
`JobDesc`
object describes the distributed job resource specification to run on
Cluster environment.
<img
src=
"src/remote_executor.png"
/>
<img
src=
"src/remote_executor.png"
width=
"500"
align=
"center"
/>
`RemoteExecutor.run`
sends the
`ProgramDesc`
and
[
TrainingJob
](
https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource
)
to a server in the cluster which executes
`RemoteExecutor.listen`
. This server is responsible
for starting the final Kubernetes Jobs that run the different roles of
`ProgramDesc`
.
for starting the final Kubernetes Jobs that run the different roles of
`ProgramDesc`
from
`ConfigMap`
.
### Placement Algorithm
...
...
doc/design/dist_refactor/src/remote_executor.graffle
浏览文件 @
8314412b
无法预览此类型文件
doc/design/dist_refactor/src/remote_executor.png
查看替换文件 @
f3fe4107
浏览文件 @
8314412b
134.5 KB
|
W:
|
H:
118.0 KB
|
W:
|
H:
2-up
Swipe
Onion skin
paddle/framework/CMakeLists.txt
浏览文件 @
8314412b
...
...
@@ -74,7 +74,8 @@ cc_library(backward SRCS backward.cc DEPS net_op)
cc_test
(
backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op
)
cc_library
(
lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor
)
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table
)
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope
framework_proto backward glog lod_rank_table profiler
)
cc_library
(
prune SRCS prune.cc DEPS framework_proto
)
cc_test
(
prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context
)
...
...
paddle/framework/executor.cc
浏览文件 @
8314412b
...
...
@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
#include "paddle/platform/place.h"
#include "paddle/platform/profiler.h"
DECLARE_bool
(
do_memory_benchmark
);
DEFINE_bool
(
check_nan_inf
,
false
,
...
...
@@ -117,6 +118,10 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
for
(
auto
&
op_desc
:
block
.
AllOps
())
{
auto
op
=
paddle
::
framework
::
OpRegistry
::
CreateOp
(
*
op_desc
);
VLOG
(
4
)
<<
op
->
DebugStringEx
(
local_scope
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
platform
::
RecordEvent
record_event
(
op
->
Type
(),
pool
.
Get
(
place_
));
op
->
Run
(
*
local_scope
,
place_
);
VLOG
(
3
)
<<
op
->
DebugStringEx
(
local_scope
);
if
(
FLAGS_do_memory_benchmark
)
{
...
...
paddle/gserver/tests/test_LayerGrad.cpp
浏览文件 @
8314412b
...
...
@@ -991,8 +991,10 @@ TEST(Layer, SequenceLastInstanceLayer) {
"seqlastins"
,
"non-seq"
,
-
1
);
// hasSubseq seqlastins to non-seq
testDegradeLayer
(
true
,
"seqlastins"
,
"seq"
,
-
1
);
// hasSubseq seqlastins to seq
testDegradeLayer
(
true
,
"seqlastins"
,
"seq"
,
-
1
);
// hasSubseq seqlastins to seq
}
TEST
(
Layer
,
AverageLayer
)
{
...
...
@@ -1001,8 +1003,10 @@ TEST(Layer, AverageLayer) {
"average"
,
"non-seq"
,
5
);
// seq average to a shorten seq, stride window = 5
testDegradeLayer
(
true
,
"average"
,
"non-seq"
,
-
1
);
// hasSubseq average to non-seq
testDegradeLayer
(
true
,
"average"
,
"non-seq"
,
-
1
);
// hasSubseq average to non-seq
testDegradeLayer
(
true
,
"average"
,
"seq"
,
-
1
);
// hasSubseq average to seq
}
...
...
@@ -1287,8 +1291,9 @@ TEST(Layer, PoolLayer) {
testPoolLayer
(
"cudnn-avg-pool"
,
/* trans= */
false
,
/* useGpu= */
true
);
testPoolLayer2
(
"cudnn-max-pool"
,
/* trans= */
false
,
/* useGpu= */
true
);
testPoolLayer2
(
"cudnn-avg-pool"
,
/* trans= */
false
,
/* useGpu= */
true
);
testPoolLayer2
(
"cudnn-avg-incl-pad-pool"
,
/* trans= */
false
,
/* useGpu= */
true
);
testPoolLayer2
(
"cudnn-avg-incl-pad-pool"
,
/* trans= */
false
,
/* useGpu= */
true
);
testPoolLayer
(
"max-pool-with-mask"
,
/* trans= */
false
,
/* useGpu= */
true
);
#endif
}
...
...
@@ -2431,18 +2436,21 @@ TEST(Layer, test3DDeConvLayer) {
}
TEST
(
Layer
,
ScaleShiftLayer
)
{
const
size_t
batchSize
=
16
;
const
size_t
size
=
32
;
TestConfig
config
;
config
.
layerConfig
.
set_type
(
"scale_shift"
);
config
.
layerConfig
.
set_size
(
size
);
config
.
biasSize
=
1
;
config
.
inputDefs
.
push_back
(
{
INPUT_DATA
,
"input"
,
/* dim= */
size
,
/* paraSize= */
1
});
config
.
layerConfig
.
add_inputs
();
for
(
auto
useGpu
:
{
false
,
true
})
{
testLayerGrad
(
config
,
"scale_shift"
,
batchSize
,
false
,
useGpu
,
false
);
}
// FIXME: Disable ScaleShiftLayer because it is not stable.
// https://github.com/PaddlePaddle/Paddle/issues/7781
return
;
// const size_t batchSize = 16;
// const size_t size = 32;
// TestConfig config;
// config.layerConfig.set_type("scale_shift");
// config.layerConfig.set_size(size);
// config.biasSize = 1;
// config.inputDefs.push_back(
// {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
// config.layerConfig.add_inputs();
// for (auto useGpu : {false, true}) {
// testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
// }
}
TEST
(
Layer
,
ScaleSubRegionLayer
)
{
...
...
paddle/operators/detail/grpc_client.cc
浏览文件 @
8314412b
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "grpc_client.h"
#include "paddle/framework/threadpool.h"
namespace
paddle
{
namespace
operators
{
namespace
detail
{
...
...
@@ -22,25 +23,32 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
const
framework
::
Scope
&
scope
,
const
std
::
string
&
var_name
,
int64_t
time_out
)
{
sendrecv
::
VariableMessage
req
;
auto
*
var
=
scope
.
FindVar
(
var_name
);
SerializeToMessage
(
var_name
,
var
,
ctx
,
&
req
);
// varhandle
VarHandle
var_h
;
var_h
.
ep
=
ep
;
var_h
.
scope
=
&
scope
;
var_h
.
name
=
var_name
;
var_h
.
ctx
=
&
ctx
;
// stub context
auto
ch
=
GetChannel
(
ep
);
SendProcessor
*
s
=
new
SendProcessor
(
ch
);
s
->
Prepare
(
var_h
,
time_out
);
s
->
response_call_back_
=
NULL
;
auto
rpc
=
s
->
stub_
->
AsyncSendVariable
(
s
->
context_
.
get
(),
req
,
&
cq_
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
(
void
*
)
s
);
const
platform
::
DeviceContext
*
p_ctx
=
&
ctx
;
const
std
::
string
ep_val
=
ep
;
const
std
::
string
var_name_val
=
var_name
;
const
framework
::
Scope
*
p_scope
=
&
scope
;
const
auto
ch
=
GetChannel
(
ep_val
);
framework
::
Async
([
var_name_val
,
p_ctx
,
ep_val
,
p_scope
,
time_out
,
ch
,
this
]
{
auto
*
var
=
p_scope
->
FindVar
(
var_name_val
);
sendrecv
::
VariableMessage
req
;
SerializeToMessage
(
var_name_val
,
var
,
*
p_ctx
,
&
req
);
// varhandle
VarHandle
var_h
;
var_h
.
ep
=
ep_val
;
var_h
.
scope
=
p_scope
;
var_h
.
name
=
var_name_val
;
var_h
.
ctx
=
p_ctx
;
// stub context
SendProcessor
*
s
=
new
SendProcessor
(
ch
);
s
->
Prepare
(
var_h
,
time_out
);
s
->
response_call_back_
=
NULL
;
auto
rpc
=
s
->
stub_
->
AsyncSendVariable
(
s
->
context_
.
get
(),
req
,
&
cq_
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
(
void
*
)
s
);
});
req_count_
++
;
...
...
@@ -50,8 +58,6 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
void
ProcGetResponse
(
const
VarHandle
&
var_h
,
const
sendrecv
::
VariableMessage
&
ret_msg
)
{
auto
*
outvar
=
var_h
.
scope
->
FindVar
(
var_h
.
name
);
std
::
istringstream
iss
(
ret_msg
.
serialized
());
DeserializeFromMessage
(
ret_msg
,
*
var_h
.
ctx
,
outvar
);
}
...
...
@@ -60,24 +66,31 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
const
framework
::
Scope
&
scope
,
const
std
::
string
&
var_name
,
int64_t
time_out
)
{
sendrecv
::
VariableMessage
req
;
req
.
set_varname
(
var_name
);
// varhandle
VarHandle
var_h
;
var_h
.
ep
=
ep
;
var_h
.
scope
=
&
scope
;
var_h
.
name
=
var_name
;
var_h
.
ctx
=
&
ctx
;
// stub context
auto
ch
=
GetChannel
(
ep
);
GetProcessor
*
s
=
new
GetProcessor
(
ch
);
s
->
Prepare
(
var_h
,
time_out
);
s
->
response_call_back_
=
ProcGetResponse
;
auto
rpc
=
s
->
stub_
->
AsyncGetVariable
(
s
->
context_
.
get
(),
req
,
&
cq_
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
(
void
*
)
s
);
const
platform
::
DeviceContext
*
p_ctx
=
&
ctx
;
const
std
::
string
ep_val
=
ep
;
const
std
::
string
var_name_val
=
var_name
;
const
framework
::
Scope
*
p_scope
=
&
scope
;
const
auto
ch
=
GetChannel
(
ep_val
);
framework
::
Async
([
var_name_val
,
ep_val
,
p_scope
,
p_ctx
,
time_out
,
ch
,
this
]
{
sendrecv
::
VariableMessage
req
;
req
.
set_varname
(
var_name_val
);
// varhandle
VarHandle
var_h
;
var_h
.
ep
=
ep_val
;
var_h
.
scope
=
p_scope
;
var_h
.
name
=
var_name_val
;
var_h
.
ctx
=
p_ctx
;
// stub context
GetProcessor
*
s
=
new
GetProcessor
(
ch
);
s
->
Prepare
(
var_h
,
time_out
);
s
->
response_call_back_
=
ProcGetResponse
;
auto
rpc
=
s
->
stub_
->
AsyncGetVariable
(
s
->
context_
.
get
(),
req
,
&
cq_
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
(
void
*
)
s
);
});
req_count_
++
;
...
...
@@ -85,19 +98,31 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
}
bool
RPCClient
::
Wait
()
{
bool
ok
=
true
;
if
(
req_count_
<=
0
)
{
return
true
;
}
while
(
true
)
{
if
(
req_count_
<=
0
)
{
break
;
}
std
::
vector
<
bool
>
a
(
req_count_
);
std
::
vector
<
std
::
future
<
void
>>
waits
(
req_count_
);
if
(
!
Proceed
())
{
for
(
int
i
=
0
;
i
<
req_count_
;
i
++
)
{
waits
[
i
]
=
framework
::
Async
([
i
,
&
a
,
this
]
{
a
[
i
]
=
Proceed
();
});
}
for
(
int
i
=
0
;
i
<
req_count_
;
i
++
)
{
waits
[
i
].
wait
();
}
int
last_req_count
=
req_count_
;
req_count_
=
0
;
for
(
int
i
=
0
;
i
<
last_req_count
;
i
++
)
{
if
(
!
a
[
i
])
{
return
false
;
}
}
return
ok
;
return
true
;
}
bool
RPCClient
::
Proceed
()
{
...
...
@@ -124,7 +149,6 @@ bool RPCClient::Proceed() {
c
->
Process
();
delete
c
;
req_count_
--
;
return
true
;
}
...
...
paddle/operators/im2sequence_op.h
浏览文件 @
8314412b
...
...
@@ -79,7 +79,7 @@ class Im2SequenceKernel : public framework::OpKernel<T> {
framework
::
LoD
lod
(
1
);
lod
[
0
].
reserve
(
batch_size
+
1
);
for
(
int
i
=
0
,
offset
=
0
;
i
<
batch_size
+
1
;
++
i
)
{
lod
[
0
]
[
i
]
=
offset
;
lod
[
0
]
.
push_back
(
offset
)
;
offset
+=
output_height
*
output_width
;
}
out
->
set_lod
(
lod
);
...
...
paddle/operators/reshape_op.cc
浏览文件 @
8314412b
...
...
@@ -90,14 +90,10 @@ Reshape Operator.
Reshape Input(X) into the shape specified by Attr(shape).
An example:
Given a 2-D tensor X with 2 rows and 2 columns
[[1, 2], [3, 4]]
Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]]
and target shape = [1, 4], the reshape operator will transform
the tensor X into a 2-D tensor:
[[1, 2, 3, 4]]
the tensor X into a 2-D tensor: [[1, 2, 3, 4]]
One dimension in the target shape can be set -1, representing that its
size is unknown. In this case, the real dimension will be infered from
...
...
paddle/platform/profiler.cc
浏览文件 @
8314412b
...
...
@@ -47,16 +47,16 @@ inline uint64_t GetTimeInNsec() {
}
Event
::
Event
(
EventKind
kind
,
std
::
string
name
,
uint32_t
thread_id
,
DeviceContext
*
dev_ctx
)
const
DeviceContext
*
dev_ctx
)
:
kind_
(
kind
),
name_
(
name
),
thread_id_
(
thread_id
),
has_cuda_
(
false
)
{
#ifdef PADDLE_WITH_CUDA
auto
*
cuda_dev_ctx
=
static_cast
<
const
CUDADeviceContext
*>
(
dev_ctx
);
if
(
cuda_dev_ctx
)
{
has_cuda_
=
dev_ctx
?
platform
::
is_gpu_place
(
dev_ctx
->
GetPlace
())
:
false
;
if
(
has_cuda_
)
{
auto
*
cuda_dev_ctx
=
static_cast
<
const
CUDADeviceContext
*>
(
dev_ctx
);
PADDLE_ENFORCE
(
cudaGetDevice
(
&
device_
));
PADDLE_ENFORCE
(
cudaEventCreate
(
&
event_
));
auto
stream
=
cuda_dev_ctx
->
stream
();
PADDLE_ENFORCE
(
cudaEventRecord
(
event_
,
stream
));
has_cuda_
=
true
;
}
#endif
cpu_ns_
=
GetTimeInNsec
();
...
...
@@ -114,19 +114,20 @@ inline EventList& GetEventList() {
return
*
g_event_list
;
}
void
Mark
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
)
{
void
Mark
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
)
{
GetEventList
().
Record
(
EventKind
::
kMark
,
name
,
g_thread_id
,
dev_ctx
);
}
void
PushEvent
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
)
{
void
PushEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
)
{
GetEventList
().
Record
(
EventKind
::
kPushRange
,
name
,
g_thread_id
,
dev_ctx
);
}
void
PopEvent
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
)
{
void
PopEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
)
{
GetEventList
().
Record
(
EventKind
::
kPopRange
,
name
,
g_thread_id
,
dev_ctx
);
}
RecordEvent
::
RecordEvent
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
)
{
RecordEvent
::
RecordEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
)
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
dev_ctx_
=
dev_ctx
;
name_
=
name
;
...
...
@@ -155,6 +156,7 @@ void EnableProfiler(ProfilerState state) {
DeviceContext
*
dev_ctx
=
new
CUDADeviceContext
(
CUDAPlace
(
d
));
Mark
(
"_cuda_startup_"
,
dev_ctx
);
dev_ctx
->
Wait
();
delete
dev_ctx
;
});
}
}
...
...
@@ -163,14 +165,17 @@ void EnableProfiler(ProfilerState state) {
Mark
(
"_start_profiler_"
,
nullptr
);
}
std
::
vector
<
std
::
vector
<
Event
>>
DisableProfiler
()
{
PADDLE_ENFORCE
(
g_state
!=
ProfilerState
::
kDisabled
,
"Can't disable profiling, since it's not starting."
);
// Mark the profiling stop.
Mark
(
"_stop_profiler_"
,
nullptr
);
g_state
=
ProfilerState
::
kDisabled
;
std
::
vector
<
std
::
vector
<
Event
>>
result
;
void
ResetProfiler
()
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
g_all_event_lists_mutex
);
for
(
auto
it
=
g_all_event_lists
.
begin
();
it
!=
g_all_event_lists
.
end
();
++
it
)
{
(
*
it
)
->
Clear
();
}
}
std
::
vector
<
std
::
vector
<
Event
>>
GetAllEvents
()
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
g_all_event_lists_mutex
);
std
::
vector
<
std
::
vector
<
Event
>>
result
;
for
(
auto
it
=
g_all_event_lists
.
begin
();
it
!=
g_all_event_lists
.
end
();
++
it
)
{
result
.
emplace_back
((
*
it
)
->
Reduce
());
...
...
@@ -178,6 +183,18 @@ std::vector<std::vector<Event>> DisableProfiler() {
return
result
;
}
void
DisableProfiler
(
EventSortingKey
sorted_key
)
{
PADDLE_ENFORCE
(
g_state
!=
ProfilerState
::
kDisabled
,
"Can't disable profiling, since it's not starting."
);
// Mark the profiling stop.
Mark
(
"_stop_profiler_"
,
nullptr
);
g_state
=
ProfilerState
::
kDisabled
;
std
::
vector
<
std
::
vector
<
Event
>>
all_events
=
GetAllEvents
();
ParseEvents
(
all_events
,
sorted_key
);
ResetProfiler
();
}
void
ParseEvents
(
std
::
vector
<
std
::
vector
<
Event
>>&
events
,
EventSortingKey
sorted_by
)
{
if
(
g_profiler_place
==
""
)
return
;
...
...
@@ -291,12 +308,12 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
}
// Print report
PrintProfil
ingReport
(
events_table
,
sorted_domain
,
max_name_width
+
4
,
12
);
PrintProfil
er
(
events_table
,
sorted_domain
,
max_name_width
+
4
,
12
);
}
void
PrintProfil
ingReport
(
std
::
vector
<
std
::
vector
<
EventItem
>>&
events_table
,
std
::
string
&
sorted_domain
,
const
size_t
name_width
,
const
size_t
data_width
)
{
void
PrintProfil
er
(
std
::
vector
<
std
::
vector
<
EventItem
>>&
events_table
,
std
::
string
&
sorted_domain
,
const
size_t
name_width
,
const
size_t
data_width
)
{
// Output header information
std
::
cout
<<
"
\n
------------------------->"
<<
" Profiling Report "
...
...
paddle/platform/profiler.h
浏览文件 @
8314412b
...
...
@@ -29,7 +29,7 @@ class Event {
// The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr.
Event
(
EventKind
kind
,
std
::
string
name
,
uint32_t
thread_id
,
DeviceContext
*
dev_ctx
);
const
DeviceContext
*
dev_ctx
);
std
::
string
kind
()
const
;
std
::
string
name
()
const
{
return
name_
;
}
...
...
@@ -84,6 +84,8 @@ struct EventList {
return
result
;
}
void
Clear
()
{
event_blocks
.
clear
();
}
std
::
forward_list
<
std
::
vector
<
Event
>>
event_blocks
;
};
...
...
@@ -93,29 +95,26 @@ enum ProfilerState {
kCUDA
,
// GPU profiling state
};
void
Mark
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
);
void
Mark
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
void
PushEvent
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
);
void
PushEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
void
PopEvent
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
);
void
PopEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
struct
RecordEvent
{
explicit
RecordEvent
(
const
std
::
string
&
name
,
DeviceContext
*
dev_ctx
);
explicit
RecordEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
~
RecordEvent
();
// The device context is used by Event to get the current cuda stream.
DeviceContext
*
dev_ctx_
;
const
DeviceContext
*
dev_ctx_
;
// Event name
std
::
string
name_
;
};
// Enable the profiling function.
void
EnableProfiler
(
ProfilerState
state
);
// Return the event list of all threads. Assume the returned value is called
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std
::
vector
<
std
::
vector
<
Event
>>
DisableProfiler
();
std
::
vector
<
std
::
vector
<
Event
>>
GetAllEvents
();
// The information of each event given in the profiling report
struct
EventItem
{
...
...
@@ -130,13 +129,22 @@ struct EventItem {
// Candidate keys to sort the profiling report
enum
EventSortingKey
{
kDefault
,
kCalls
,
kTotal
,
kMin
,
kMax
,
kAve
};
// Enable the profiling function.
void
EnableProfiler
(
ProfilerState
state
);
// Clear the g_all_event_lists, which is total event lists of all threads.
void
ResetProfiler
();
void
DisableProfiler
(
EventSortingKey
sorted_key
);
// Parse the event list and output the profiling report
void
ParseEvents
(
std
::
vector
<
std
::
vector
<
Event
>>&
,
EventSortingKey
sorted_by
=
EventSortingKey
::
kDefault
);
// Print results
void
PrintProfilingReport
(
std
::
vector
<
std
::
vector
<
EventItem
>>&
events_table
,
std
::
string
&
sorted_domain
,
const
size_t
name_width
,
const
size_t
data_width
);
void
PrintProfiler
(
std
::
vector
<
std
::
vector
<
EventItem
>>&
events_table
,
std
::
string
&
sorted_domain
,
const
size_t
name_width
,
const
size_t
data_width
);
}
// namespace platform
}
// namespace paddle
paddle/platform/profiler_test.cc
浏览文件 @
8314412b
...
...
@@ -103,18 +103,14 @@ TEST(RecordEvent, RecordEvent) {
// Bad Usage:
PushEvent
(
"event_without_pop"
,
dev_ctx
);
PopEvent
(
"event_without_push"
,
dev_ctx
);
std
::
vector
<
std
::
vector
<
Event
>>
events
=
paddle
::
platform
::
DisableProfiler
();
// Will remove parsing-related code from test later
ParseEvents
(
events
,
EventSortingKey
::
kTotal
);
std
::
vector
<
std
::
vector
<
Event
>>
events
=
paddle
::
platform
::
GetAllEvents
();
int
cuda_startup_count
=
0
;
int
start_profiler_count
=
0
;
int
stop_profiler_count
=
0
;
for
(
size_t
i
=
0
;
i
<
events
.
size
();
++
i
)
{
for
(
size_t
j
=
0
;
j
<
events
[
i
].
size
();
++
j
)
{
if
(
events
[
i
][
j
].
name
()
==
"_cuda_startup_"
)
++
cuda_startup_count
;
if
(
events
[
i
][
j
].
name
()
==
"_start_profiler_"
)
++
start_profiler_count
;
if
(
events
[
i
][
j
].
name
()
==
"_stop_profiler_"
)
++
stop_profiler_count
;
if
(
events
[
i
][
j
].
name
()
==
"push"
)
{
EXPECT_EQ
(
events
[
i
][
j
+
1
].
name
(),
"pop"
);
#ifdef PADDLE_WITH_CUDA
...
...
@@ -127,5 +123,7 @@ TEST(RecordEvent, RecordEvent) {
}
EXPECT_EQ
(
cuda_startup_count
%
5
,
0
);
EXPECT_EQ
(
start_profiler_count
,
1
);
EXPECT_EQ
(
stop_profiler_count
,
1
);
// Will remove parsing-related code from test later
DisableProfiler
(
EventSortingKey
::
kTotal
);
}
paddle/pybind/CMakeLists.txt
浏览文件 @
8314412b
if
(
WITH_PYTHON
)
cc_library
(
paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init
DEPS pybind python backward proto_desc paddle_memory executor prune init
profiler
${
GLOB_OP_LIB
}
)
if
(
NOT APPLE AND NOT ANDROID
)
target_link_libraries
(
paddle_pybind rt
)
...
...
paddle/pybind/protobuf.h
浏览文件 @
8314412b
...
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include <Python.h>
#include <fstream>
#include <vector>
#include "paddle/platform/variant.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
...
...
paddle/pybind/pybind.cc
浏览文件 @
8314412b
...
...
@@ -30,6 +30,7 @@ limitations under the License. */
#include "paddle/operators/net_op.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/place.h"
#include "paddle/platform/profiler.h"
#include "paddle/pybind/const_value.h"
#include "paddle/pybind/exception.h"
#include "paddle/pybind/pybind.h"
...
...
@@ -52,7 +53,7 @@ static size_t UniqueIntegerGenerator(const std::string &prefix) {
return
generators
[
prefix
].
fetch_add
(
1
);
}
bool
IsCompile
GPU
()
{
bool
IsCompile
dWithCUDA
()
{
#ifndef PADDLE_WITH_CUDA
return
false
;
#else
...
...
@@ -430,7 +431,7 @@ All parameter, weight, gradient are variables in Paddle.
m
.
def
(
"init_glog"
,
framework
::
InitGLOG
);
m
.
def
(
"init_devices"
,
&
framework
::
InitDevices
);
m
.
def
(
"is_compile
_gpu"
,
IsCompileGPU
);
m
.
def
(
"is_compile
d_with_cuda"
,
IsCompiledWithCUDA
);
m
.
def
(
"set_feed_variable"
,
framework
::
SetFeedVariable
);
m
.
def
(
"get_fetch_variable"
,
framework
::
GetFetchVariable
);
...
...
@@ -476,6 +477,24 @@ All parameter, weight, gradient are variables in Paddle.
m
.
def
(
"nvprof_stop"
,
platform
::
CudaProfilerStop
);
#endif
py
::
enum_
<
platform
::
ProfilerState
>
(
m
,
"ProfilerState"
,
py
::
arithmetic
())
.
value
(
"kDisabled"
,
platform
::
ProfilerState
::
kDisabled
)
.
value
(
"kCPU"
,
platform
::
ProfilerState
::
kCPU
)
.
value
(
"kCUDA"
,
platform
::
ProfilerState
::
kCUDA
)
.
export_values
();
py
::
enum_
<
platform
::
EventSortingKey
>
(
m
,
"EventSortingKey"
,
py
::
arithmetic
())
.
value
(
"kDefault"
,
platform
::
EventSortingKey
::
kDefault
)
.
value
(
"kCalls"
,
platform
::
EventSortingKey
::
kCalls
)
.
value
(
"kTotal"
,
platform
::
EventSortingKey
::
kTotal
)
.
value
(
"kMin"
,
platform
::
EventSortingKey
::
kMin
)
.
value
(
"kMax"
,
platform
::
EventSortingKey
::
kMax
)
.
value
(
"kAve"
,
platform
::
EventSortingKey
::
kAve
)
.
export_values
();
m
.
def
(
"enable_profiler"
,
platform
::
EnableProfiler
);
m
.
def
(
"disable_profiler"
,
platform
::
DisableProfiler
);
m
.
def
(
"reset_profiler"
,
platform
::
ResetProfiler
);
return
m
.
ptr
();
}
}
// namespace pybind
...
...
python/paddle/v2/fluid/__init__.py
浏览文件 @
8314412b
...
...
@@ -89,7 +89,7 @@ def __bootstrap__():
read_env_flags
=
[
'use_pinned_memory'
,
'check_nan_inf'
,
'do_memory_benchmark'
]
if
core
.
is_compile
_gpu
():
if
core
.
is_compile
d_with_cuda
():
read_env_flags
+=
[
'fraction_of_gpu_memory_to_use'
,
'op_sync'
]
core
.
init_gflags
([
sys
.
argv
[
0
]]
+
[
"--tryfromenv="
+
","
.
join
(
read_env_flags
)])
...
...
python/paddle/v2/fluid/backward.py
浏览文件 @
8314412b
...
...
@@ -178,7 +178,7 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
if
_all_in_set_
(
filter
(
lambda
name
:
name
.
find
(
core
.
grad_var_suffix
())
!=
-
1
,
op_desc
.
input_arg_names
()),
no_grad_set
):
no_grad_set
.
u
nion
(
out_arg_names
)
no_grad_set
.
u
pdate
(
out_arg_names
)
return
True
return
False
...
...
python/paddle/v2/fluid/io.py
浏览文件 @
8314412b
...
...
@@ -15,6 +15,7 @@
import
os
import
cPickle
as
pickle
from
paddle.v2.fluid.evaluator
import
Evaluator
from
paddle.v2.fluid.framework
import
Program
,
Parameter
,
default_main_program
,
Variable
from
.
import
core
...
...
@@ -187,8 +188,14 @@ def get_inference_program(target_vars, main_program=None):
main_program
=
default_main_program
()
if
not
isinstance
(
target_vars
,
list
):
target_vars
=
[
target_vars
]
pruned_program
=
main_program
.
prune
(
targets
=
target_vars
)
vars
=
[]
for
var
in
target_vars
:
if
isinstance
(
var
,
Evaluator
):
vars
.
append
(
var
.
states
)
vars
.
append
(
var
.
metrics
)
else
:
vars
.
append
(
var
)
pruned_program
=
main_program
.
prune
(
targets
=
vars
)
inference_program
=
pruned_program
.
inference_optimize
()
return
inference_program
...
...
python/paddle/v2/fluid/layer_helper.py
浏览文件 @
8314412b
...
...
@@ -111,6 +111,7 @@ class LayerHelper(object):
is_bias
=
False
,
default_initializer
=
None
):
# Deepcopy the attr so that parameters can be shared in program
attr
=
copy
.
deepcopy
(
attr
)
assert
isinstance
(
attr
,
ParamAttr
)
suffix
=
'b'
if
is_bias
else
'w'
...
...
python/paddle/v2/fluid/layers/nn.py
浏览文件 @
8314412b
此差异已折叠。
点击以展开。
python/paddle/v2/fluid/nets.py
浏览文件 @
8314412b
...
...
@@ -11,14 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
layers
__all__
=
[
"simple_img_conv_pool"
,
"sequence_conv_pool"
,
"glu"
,
"dot_product_attention"
,
"
scaled_
dot_product_attention"
,
]
...
...
@@ -160,7 +159,11 @@ def glu(input, dim=-1):
return
out
def
dot_product_attention
(
querys
,
keys
,
values
):
def
scaled_dot_product_attention
(
queries
,
keys
,
values
,
num_heads
=
1
,
dropout_rate
=
0.
):
"""
The dot-product attention.
...
...
@@ -174,39 +177,162 @@ def dot_product_attention(querys, keys, values):
.. math::
Attention(Q, K, V)=
softmax(QK^\mathrm{T})V
Attention(Q, K, V)= softmax(QK^\mathrm{T})V
Refer to `Attention Is All You Need
<https://arxiv.org/pdf/1706.03762.pdf>`_.
Note that batch data containing sequences with different lengths is not
supported by this because of the (batch) matrix multipication.
Args:
query (Variable): The input variable which is a Tensor or LoDTensor.
key (Variable): The input variable which is a Tensor or LoDTensor.
value (Variable): The input variable which is a Tensor or LoDTensor.
queries (Variable): The input variable which should be a 3-D Tensor.
keys (Variable): The input variable which should be a 3-D Tensor.
values (Variable): The input variable which should be a 3-D Tensor.
num_heads (int): Head number to compute the scaled dot product
attention. Default value is 1.
dropout_rate (float): The dropout rate to drop the attention weight.
Default value is 0.
Returns:
tuple: The Tensor variables representing the output and attention scores.
Variable: A 3-D Tensor computed by multi-head scaled dot product
attention.
Raises:
ValueError: If input queries, keys, values are not 3-D Tensors.
NOTE:
1. When num_heads > 1, three linear projections are learned respectively
to map input queries, keys and values into queries', keys' and values'.
queries', keys' and values' have the same shapes with queries, keys
and values.
1. When num_heads == 1, scaled_dot_product_attention has no learnable
parameters.
Examples:
.. code-block:: python
# Suppose q, k, v are
tensor variable
s with the following shape:
# Suppose q, k, v are
Tensor
s with the following shape:
# q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
out, attn_scores = fluid.nets.dot_product_attention(q, k, v)
out.shape # [3, 5, 10]
attn_scores.shape # [3, 5, 6
]
contexts = fluid.nets.scaled_dot_product_attention(q, k, v)
contexts.shape # [3, 5, 10
]
"""
assert
keys
.
shape
[
-
2
]
==
values
.
shape
[
-
2
],
'The shapes of keys and values mismatch.'
assert
querys
.
shape
[
-
1
]
==
keys
.
shape
[
-
1
],
'The shapes of querys and keys mismatch.'
product
=
layers
.
matmul
(
x
=
querys
,
y
=
keys
,
transpose_y
=
True
)
attn_scores
=
layers
.
reshape
(
# Validate the inputs up front so shape problems surface as clear errors
# instead of failures deep inside the matmul / reshape operators.
if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
    raise ValueError(
        "Inputs queries, keys and values should all be 3-D tensors.")

# Q and K are multiplied along their last dimension.
if queries.shape[-1] != keys.shape[-1]:
    raise ValueError(
        "The hidden size of queries and keys should be the same.")

# Every key position needs exactly one matching value position.
if keys.shape[-2] != values.shape[-2]:
    raise ValueError(
        "The max sequence length in key batch and in value batch "
        "should be the same.")

# The hidden dimensions are split evenly across the attention heads.
if keys.shape[-1] % num_heads != 0:
    raise ValueError("The hidden size of keys (%d) must be divisible "
                     "by the number of attention heads (%d)." %
                     (keys.shape[-1], num_heads))
if values.shape[-1] % num_heads != 0:
    raise ValueError("The hidden size of values (%d) must be divisible "
                     "by the number of attention heads (%d)." %
                     (values.shape[-1], num_heads))
def __compute_qkv(queries, keys, values, num_heads):
    """
    Optionally apply a learned linear projection to queries, keys and values.

    Args:
        queries(Tensor): a 3-D input Tensor.
        keys(Tensor): a 3-D input Tensor.
        values(Tensor): a 3-D input Tensor.
        num_heads(int): The number of heads. The inputs are projected
                        ONLY when num_heads > 1.

    Returns:
        tuple: (queries', keys', values'). Each projection keeps the size
               of the last dimension of its input; with num_heads == 1
               the inputs are returned untouched.
    """

    def _project(tensor):
        # The fc output size equals the input's last dimension.
        return layers.fc(input=tensor,
                         size=tensor.shape[-1],
                         num_flatten_dims=2)

    if num_heads == 1:
        return queries, keys, values

    # Projection order (q, k, v) fixes the order of parameter creation.
    projected_q = _project(queries)
    projected_k = _project(keys)
    projected_v = _project(values)
    return projected_q, projected_k, projected_v
def __split_heads(x, num_heads):
    """
    Split the last dimension of input tensor x into two dimensions and
    move the new head dimension in front of the sequence dimension.

    Args:
        x(Tensor): a 3-D input Tensor.
        num_heads(int): The number of heads.

    Returns:
        Tensor: x itself when num_heads == 1; otherwise the result of
                reshaping x to [..., num_heads, m / num_heads] (m being
                the size of the last dimension of x) and then swapping
                dimensions 1 and 2.
    """
    if num_heads == 1:
        return x

    last_dim = x.shape[-1]
    # [batch_size, max_sequence_length, hidden_dim] ->
    # [batch_size, max_sequence_length, num_heads, hidden_size_per_head]
    split_shape = list(x.shape[:-1]) + [num_heads, last_dim // num_heads]
    split = layers.reshape(x=x, shape=split_shape)

    # Permute the dimensions into:
    # [batch_size, num_heads, max_sequence_length, hidden_size_per_head]
    return layers.transpose(x=split, perm=[0, 2, 1, 3])
def __combine_heads(x):
    """
    Merge the last two dimensions of input tensor x into one dimension.

    Args:
        x(Tensor): a 4-D input Tensor with shape
                   [bs, num_heads, max_sequence_length, hidden_dim].
                   A 3-D input is returned unchanged.

    Returns:
        Tensor: a Tensor with shape
                [bs, max_sequence_length, num_heads * hidden_dim].

    Raises:
        ValueError: If x is neither a 3-D nor a 4-D Tensor.
    """
    rank = len(x.shape)
    if rank == 3:
        # Nothing was split, so there is nothing to merge.
        return x
    if rank != 4:
        raise ValueError("Input(x) should be a 4-D Tensor.")

    # Swap dimensions 1 and 2 so the two dimensions to merge are adjacent.
    trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
    # Build a concrete list of ints; `map` would hand the reshape layer a
    # lazy iterator on Python 3.
    merged_shape = [
        int(trans_x.shape[0]),
        int(trans_x.shape[1]),
        int(trans_x.shape[2] * trans_x.shape[3]),
    ]
    return layers.reshape(x=trans_x, shape=merged_shape)
q
,
k
,
v
=
__compute_qkv
(
queries
,
keys
,
values
,
num_heads
)
q
=
__split_heads
(
q
,
num_heads
)
k
=
__split_heads
(
k
,
num_heads
)
v
=
__split_heads
(
v
,
num_heads
)
key_dim_per_head
=
keys
.
shape
[
-
1
]
//
num_heads
scaled_q
=
layers
.
scale
(
x
=
q
,
scale
=
key_dim_per_head
**-
0.5
)
# Attention logits are Q * K^T: the scaled queries must be the left operand
# so that the last dimension of `product` indexes key positions, which is
# the dimension the following softmax normalizes over and the one the
# later matmul with the values contracts.
product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
weights
=
layers
.
reshape
(
x
=
layers
.
reshape
(
x
=
product
,
shape
=
[
-
1
,
product
.
shape
[
-
1
]],
act
=
'softmax'
),
x
=
product
,
shape
=
[
-
1
,
product
.
shape
[
-
1
]],
act
=
"softmax"
),
shape
=
product
.
shape
)
out
=
layers
.
matmul
(
attn_scores
,
values
)
return
out
,
attn_scores
if dropout_rate:
    # Dropout is applied to the attention weights; the previous argument
    # `x` is not defined in this scope and raised a NameError.
    weights = layers.dropout(
        weights, dropout_prob=dropout_rate, is_test=False)
ctx_multiheads
=
layers
.
matmul
(
weights
,
v
)
return
__combine_heads
(
ctx_multiheads
)
python/paddle/v2/fluid/profiler.py
浏览文件 @
8314412b
...
...
@@ -63,3 +63,58 @@ def cuda_profiler(output_file, output_mode=None, config=None):
# Disables profiler collection.
core
.
nvprof_stop
()
os
.
remove
(
config_file
)
def reset_profiler():
    """Clear the profiler's records.

    Forwards to ``core.reset_profiler()`` so that the time records
    collected so far are discarded.
    """
    core.reset_profiler()
@contextmanager
def profiler(state, sorted_key=None):
    """The profiler interface.

    Different from cuda_profiler, this profiler can be used to profile both
    CPU and GPU programs. By default, it records the CPU and GPU operator
    kernels; if you want to profile other programs, refer to the profiling
    tutorial to add more records.

    Args:
        state (string) : The profiling state, which should be 'CPU' or 'GPU',
            telling the profiler to use the CPU timer or the GPU timer for
            profiling. Although users may have already specified the
            execution place (CPUPlace/CUDAPlace) in the beginning, for
            flexibility the profiler does not inherit this place.
        sorted_key (string) : If None, the profiling results will be printed
            in the order of first end time of events. Otherwise, the
            profiling results will be sorted by this flag, which should be
            one of 'calls', 'total', 'max', 'min' or 'ave'.
            The `calls` means sorting by the number of calls.
            The `total` means sorting by the total execution time.
            The `max` means sorting by the maximum execution time.
            The `min` means sorting by the minimum execution time.
            The `ave` means sorting by the average execution time.

    Raises:
        ValueError: If `state` or `sorted_key` is not one of the values
            listed above.
    """
    if state not in ['CPU', 'GPU']:
        raise ValueError("The state must be 'CPU' or 'GPU'.")

    # None is the documented default and selects the default event order.
    key_map = {
        None: core.EventSortingKey.kDefault,
        'calls': core.EventSortingKey.kCalls,
        'total': core.EventSortingKey.kTotal,
        'max': core.EventSortingKey.kMax,
        'min': core.EventSortingKey.kMin,
        'ave': core.EventSortingKey.kAve,
    }
    # Validate before enabling so a bad key cannot leave the profiler on.
    if sorted_key not in key_map:
        raise ValueError("The sorted_key must be None or in 'calls', "
                         "'total', 'max', 'min', 'ave'")

    prof_state = (core.ProfilerState.kCUDA
                  if state == "GPU" else core.ProfilerState.kCPU)
    core.enable_profiler(prof_state)
    try:
        yield
    finally:
        # Always switch the profiler off, even if the profiled body raised.
        # TODO(qingqing) : redirect C++ ostream to Python stream.
        # with core.ostream_redirect(stdout=True, stderr=True):
        core.disable_profiler(key_map[sorted_key])
python/paddle/v2/fluid/tests/op_test.py
浏览文件 @
8314412b
...
...
@@ -334,7 +334,7 @@ class OpTest(unittest.TestCase):
def
check_output
(
self
,
atol
=
1e-5
):
places
=
[
core
.
CPUPlace
()]
if
core
.
is_compile
_gpu
()
and
core
.
op_support_gpu
(
self
.
op_type
):
if
core
.
is_compile
d_with_cuda
()
and
core
.
op_support_gpu
(
self
.
op_type
):
places
.
append
(
core
.
CUDAPlace
(
0
))
for
place
in
places
:
self
.
check_output_with_place
(
place
,
atol
)
...
...
@@ -367,7 +367,7 @@ class OpTest(unittest.TestCase):
max_relative_error
=
0.005
,
user_defined_grads
=
None
):
places
=
[
core
.
CPUPlace
()]
if
core
.
is_compile
_gpu
()
and
core
.
op_support_gpu
(
self
.
op_type
):
if
core
.
is_compile
d_with_cuda
()
and
core
.
op_support_gpu
(
self
.
op_type
):
places
.
append
(
core
.
CUDAPlace
(
0
))
for
place
in
places
:
self
.
check_grad_with_place
(
place
,
inputs_to_check
,
output_names
,
...
...
python/paddle/v2/fluid/tests/test_adagrad_op.py
浏览文件 @
8314412b
...
...
@@ -180,7 +180,7 @@ class TestSparseAdagradOp(unittest.TestCase):
def
test_sparse_adagrad
(
self
):
places
=
[
core
.
CPUPlace
()]
if
core
.
is_compile
_gpu
():
if
core
.
is_compile
d_with_cuda
():
places
.
append
(
core
.
CUDAPlace
(
0
))
for
place
in
places
:
self
.
check_with_place
(
place
)
...
...
python/paddle/v2/fluid/tests/test_adam_op.py
浏览文件 @
8314412b
...
...
@@ -305,7 +305,7 @@ class TestSparseAdamOp(unittest.TestCase):
def
test_sparse_sgd
(
self
):
places
=
[
core
.
CPUPlace
()]
if
core
.
is_compile
_gpu
():
if
core
.
is_compile
d_with_cuda
():
places
.
append
(
core
.
CUDAPlace
(
0
))
for
place
in
places
:
self
.
check_with_place
(
place
)
...
...
python/paddle/v2/fluid/tests/test_batch_norm_op.py
浏览文件 @
8314412b
...
...
@@ -352,7 +352,7 @@ class TestBatchNormOp(OpTest):
print
"op test backward passed: "
,
str
(
place
),
data_layout
places
=
[
core
.
CPUPlace
()]
if
core
.
is_compile
_gpu
()
and
core
.
op_support_gpu
(
"batch_norm"
):
if
core
.
is_compile
d_with_cuda
()
and
core
.
op_support_gpu
(
"batch_norm"
):
places
.
append
(
core
.
CUDAPlace
(
0
))
for
place
in
places
:
...
...
python/paddle/v2/fluid/tests/test_gaussian_random_op.py
浏览文件 @
8314412b
...
...
@@ -33,7 +33,7 @@ class TestGaussianRandomOp(unittest.TestCase):
self
.
gaussian_random_test
(
place
=
fluid
.
CPUPlace
())
def
test_gpu
(
self
):
if
core
.
is_compile
_gpu
():
if
core
.
is_compile
d_with_cuda
():
self
.
gaussian_random_test
(
place
=
fluid
.
CUDAPlace
(
0
))
def
gaussian_random_test
(
self
,
place
):
...
...
python/paddle/v2/fluid/tests/test_iou_similarity_op.py
100755 → 100644
浏览文件 @
8314412b
文件模式从 100755 更改为 100644
python/paddle/v2/fluid/tests/test_multihead_attention.py
0 → 100644
浏览文件 @
8314412b
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
paddle.v2.fluid
as
fluid
import
paddle.v2.fluid.core
as
core
import
numpy
as
np
class TestMultiheadAttention(unittest.TestCase):
    """Smoke test: build and run multi-head scaled dot-product attention."""

    def gen_random_input(self):
        """Generate random input data.
        """
        # batch_size, max_sequence_length, hidden dimension
        self.input_shape = (3, 13, 16)
        self.queries = np.random.random(size=self.input_shape).astype("float32")
        self.keys = np.random.random(size=self.input_shape).astype("float32")

    def set_program(self):
        """Build the test program.
        """
        queries = fluid.layers.data(
            name="queries",
            shape=self.input_shape,
            dtype="float32",
            append_batch_size=False)
        queries.stop_gradient = False
        keys = fluid.layers.data(
            name="keys",
            shape=self.input_shape,
            dtype="float32",
            append_batch_size=False)
        keys.stop_gradient = False

        # The keys double as the values.
        contexts = fluid.nets.scaled_dot_product_attention(
            queries=queries,
            keys=keys,
            values=keys,
            num_heads=8,
            dropout_rate=0.)
        out = fluid.layers.reduce_sum(contexts, dim=None)
        fluid.backward.append_backward(loss=out)

        self.fetch_list = [contexts]

    def run_program(self):
        """Run the test program.
        """
        places = [core.CPUPlace()]
        # `is_compile_gpu` was renamed to `is_compiled_with_cuda`; use the
        # new name like the other tests touched by this change.
        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))

        for place in places:
            self.set_inputs(place)
            exe = fluid.Executor(place)

            exe.run(fluid.default_startup_program())
            output = exe.run(fluid.default_main_program(),
                             feed=self.inputs,
                             fetch_list=self.fetch_list,
                             return_numpy=True)
            self.op_output = output

    def set_inputs(self, place):
        """Set the randomly generated data to the test program.
        """
        self.inputs = {}
        queries = fluid.Tensor()
        queries.set(self.queries, place)

        keys = fluid.Tensor()
        keys.set(self.keys, place)

        self.inputs["keys"] = keys
        self.inputs["queries"] = queries

    def test_multihead_attention(self):
        self.gen_random_input()

        self.set_program()
        self.run_program()

        #fixme(caoying) add more meaningful unittest.
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/v2/fluid/tests/test_normalization_wrapper.py
浏览文件 @
8314412b
...
...
@@ -46,7 +46,7 @@ class TestNormalization(unittest.TestCase):
"""Run the test program.
"""
places
=
[
core
.
CPUPlace
()]
if
core
.
is_compile
_gpu
():
if
core
.
is_compile
d_with_cuda
():
places
.
append
(
core
.
CUDAPlace
(
0
))
for
place
in
places
:
...
...
python/paddle/v2/fluid/tests/test_op_support_gpu.py
浏览文件 @
8314412b
...
...
@@ -18,7 +18,8 @@ import paddle.v2.fluid.core as core
class
TestOpSupportGPU
(
unittest
.
TestCase
):
def
test_case
(
self
):
self
.
assertEqual
(
core
.
is_compile_gpu
(),
core
.
op_support_gpu
(
"sum"
))
self
.
assertEqual
(
core
.
is_compiled_with_cuda
(),
core
.
op_support_gpu
(
"sum"
))
if
__name__
==
'__main__'
:
...
...
python/paddle/v2/fluid/tests/test_parallel_op.py
浏览文件 @
8314412b
...
...
@@ -53,7 +53,7 @@ class BaseParallelForTest(unittest.TestCase):
fetch
=
fetch
,
place
=
cpu
,
use_parallel
=
True
)
if
fluid
.
core
.
is_compile
_gpu
():
if
fluid
.
core
.
is_compile
d_with_cuda
():
gpu
=
fluid
.
CUDAPlace
(
0
)
result_gpu
=
self
.
_run_test_impl_
(
callback
=
callback
,
...
...
@@ -159,7 +159,7 @@ class ParallelOpTest(BaseParallelForTest):
def
test_simple_fc
(
self
):
self
.
run_test
(
callback
=
ParallelOpTest
.
__network__
,
callback
=
self
.
__network__
,
feed
=
{
'img'
:
numpy
.
random
.
random
(
size
=
(
51
,
784
)).
astype
(
'float32'
)
},
...
...
@@ -167,10 +167,35 @@ class ParallelOpTest(BaseParallelForTest):
def
test_fc_with_tiny_data
(
self
):
self
.
run_test
(
callback
=
ParallelOpTest
.
__network__
,
callback
=
self
.
__network__
,
feed
=
{
'img'
:
numpy
.
random
.
random
(
size
=
(
1
,
784
)).
astype
(
'float32'
)},
fetch
=
[
'fc1.w@GRAD'
])
class ParallelOpTestMultipleInput(BaseParallelForTest):
    """Parallel-op test whose network consumes two input images."""

    @staticmethod
    def __network__():
        """Yield the two data layers first, then the scalar loss."""
        img1 = fluid.layers.data(
            shape=[784], dtype='float32', name='img1', stop_gradient=False)
        img2 = fluid.layers.data(
            shape=[784], dtype='float32', name='img2', stop_gradient=False)
        yield [img1, img2]

        # Sum the two inputs, then stack three fc layers of width 200 whose
        # weights carry the names fetched by the test below.
        hidden = img1 + img2
        for weight_name in ('fc1.w', 'fc2.w', 'fc3.w'):
            hidden = fluid.layers.fc(
                input=hidden, size=200, param_attr=weight_name)

        yield fluid.layers.mean(x=hidden)

    def test_simple_fc(self):
        """Run the network on random data and fetch all fc weight gradients."""
        first_batch = numpy.random.random(size=(51, 784)).astype('float32')
        second_batch = numpy.random.random(size=(51, 784)).astype('float32')
        self.run_test(
            callback=self.__network__,
            feed={'img1': first_batch,
                  'img2': second_batch},
            fetch=['fc1.w@GRAD', 'fc2.w@GRAD', 'fc3.w@GRAD'])
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/v2/fluid/tests/test_profiler.py
浏览文件 @
8314412b
...
...
@@ -13,16 +13,17 @@
# limitations under the License.
import
unittest
import
os
import
numpy
as
np
import
paddle.v2.fluid
as
fluid
import
paddle.v2.fluid.profiler
as
profiler
import
paddle.v2.fluid.layers
as
layers
import
os
import
paddle.v2.fluid.core
as
core
class
TestProfiler
(
unittest
.
TestCase
):
def
test_nvprof
(
self
):
if
not
fluid
.
core
.
is_compile
_gpu
():
if
not
fluid
.
core
.
is_compile
d_with_cuda
():
return
epoc
=
8
dshape
=
[
4
,
3
,
28
,
28
]
...
...
@@ -40,6 +41,50 @@ class TestProfiler(unittest.TestCase):
exe
.
run
(
fluid
.
default_main_program
(),
feed
=
{
'data'
:
input
})
os
.
remove
(
output_file
)
def net_profiler(self, state):
    """Train a small fc network for ten iterations under the profiler.

    Args:
        state (str): 'CPU' or 'GPU'. The 'GPU' case returns immediately
            when core.is_compiled_with_cuda() is false.
    """
    gpu_requested = state == 'GPU'
    if gpu_requested and not core.is_compiled_with_cuda():
        return

    startup_program = fluid.Program()
    main_program = fluid.Program()

    with fluid.program_guard(main_program, startup_program):
        image = fluid.layers.data(name='x', shape=[784], dtype='float32')
        hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
        hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
        predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
        label = fluid.layers.data(name='y', shape=[1], dtype='int64')
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(x=cost)
        accuracy = fluid.evaluator.Accuracy(input=predict, label=label)

    optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
    opts = optimizer.minimize(avg_cost, startup_program=startup_program)

    if state == 'CPU':
        place = fluid.CPUPlace()
    else:
        place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(startup_program)

    accuracy.reset(exe)
    fetch_targets = [avg_cost] + accuracy.metrics
    with profiler.profiler(state, 'total') as prof:
        for step in range(10):
            # Discard the records of the first two iterations.
            if step == 2:
                profiler.reset_profiler()
            x = np.random.random((32, 784)).astype("float32")
            y = np.random.randint(0, 10, (32, 1)).astype("int64")

            outs = exe.run(main_program,
                           feed={'x': x,
                                 'y': y},
                           fetch_list=fetch_targets)
            acc = np.array(outs[1])
            pass_acc = accuracy.eval(exe)
def test_cpu_profiler(self):
    """Run the profiled network with the 'CPU' profiler state."""
    state = 'CPU'
    self.net_profiler(state)
def test_cuda_profiler(self):
    """Run the profiled network with the 'GPU' profiler state."""
    state = 'GPU'
    self.net_profiler(state)
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
浏览文件 @
8314412b
...
...
@@ -45,7 +45,7 @@ class TestReorderLoDTensor(unittest.TestCase):
outputs
=
[]
input_grads
=
[]
places
=
[
core
.
CPUPlace
()]
if
core
.
is_compile
_gpu
():
if
core
.
is_compile
d_with_cuda
():
places
.
append
(
core
.
CUDAPlace
(
0
))
for
place
in
places
:
self
.
set_inputs
(
place
)
...
...
python/paddle/v2/fluid/tests/test_sgd_op.py
浏览文件 @
8314412b
...
...
@@ -91,7 +91,7 @@ class TestSparseSGDOp(unittest.TestCase):
def
test_sparse_sgd
(
self
):
places
=
[
core
.
CPUPlace
()]
if
core
.
is_compile
_gpu
():
if
core
.
is_compile
d_with_cuda
():
places
.
append
(
core
.
CUDAPlace
(
0
))
for
place
in
places
:
self
.
check_with_place
(
place
)
...
...
python/paddle/v2/fluid/tests/test_split_selected_rows_op.py
浏览文件 @
8314412b
...
...
@@ -21,7 +21,7 @@ from paddle.v2.fluid.op import Operator
class
TestSpliteSelectedRows
(
unittest
.
TestCase
):
def
get_places
(
self
):
places
=
[
core
.
CPUPlace
()]
if
core
.
is_compile
_gpu
():
if
core
.
is_compile
d_with_cuda
():
places
.
append
(
core
.
CUDAPlace
(
0
))
return
places
...
...
python/paddle/v2/fluid/tests/test_uniform_random_op.py
浏览文件 @
8314412b
...
...
@@ -36,7 +36,7 @@ class TestUniformRandomOp(unittest.TestCase):
self
.
uniform_random_test
(
place
=
core
.
CPUPlace
())
def
test_gpu
(
self
):
if
core
.
is_compile
_gpu
():
if
core
.
is_compile
d_with_cuda
():
self
.
uniform_random_test
(
place
=
core
.
CUDAPlace
(
0
))
def
uniform_random_test
(
self
,
place
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录