Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
b2898c0f
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
b2898c0f
编写于
3月 14, 2019
作者:
L
luotao1
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into runtime_context
test=develop
上级
1510b866
4ef6f738
变更
33
展开全部
隐藏空白更改
内联
并排
Showing
33 changed file
with
1268 addition
and
417 deletion
+1268
-417
paddle/fluid/API.spec
paddle/fluid/API.spec
+3
-0
paddle/fluid/framework/executor.cc
paddle/fluid/framework/executor.cc
+7
-4
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+4
-2
paddle/fluid/framework/operator.h
paddle/fluid/framework/operator.h
+9
-0
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+26
-11
paddle/fluid/framework/parallel_executor.h
paddle/fluid/framework/parallel_executor.h
+7
-2
paddle/fluid/memory/allocation/CMakeLists.txt
paddle/fluid/memory/allocation/CMakeLists.txt
+1
-1
paddle/fluid/memory/allocation/legacy_allocator.cc
paddle/fluid/memory/allocation/legacy_allocator.cc
+8
-4
paddle/fluid/operators/activation_op.cc
paddle/fluid/operators/activation_op.cc
+47
-0
paddle/fluid/operators/activation_op.h
paddle/fluid/operators/activation_op.h
+100
-3
paddle/fluid/operators/detection/box_coder_op.h
paddle/fluid/operators/detection/box_coder_op.h
+52
-37
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
+2
-3
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+2
-0
paddle/fluid/operators/hash_op.cc
paddle/fluid/operators/hash_op.cc
+2
-3
paddle/fluid/operators/ngraph/ngraph_engine.cc
paddle/fluid/operators/ngraph/ngraph_engine.cc
+362
-242
paddle/fluid/operators/ngraph/ngraph_engine.h
paddle/fluid/operators/ngraph/ngraph_engine.h
+45
-16
paddle/fluid/operators/ngraph/ngraph_engine_op.cc
paddle/fluid/operators/ngraph/ngraph_engine_op.cc
+1
-0
paddle/fluid/operators/ngraph/ngraph_engine_op.h
paddle/fluid/operators/ngraph/ngraph_engine_op.h
+1
-3
paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
+2
-3
paddle/fluid/platform/device_tracer.cc
paddle/fluid/platform/device_tracer.cc
+55
-4
paddle/fluid/platform/device_tracer.h
paddle/fluid/platform/device_tracer.h
+21
-0
paddle/fluid/platform/event.h
paddle/fluid/platform/event.h
+33
-0
paddle/fluid/platform/profiler.cc
paddle/fluid/platform/profiler.cc
+190
-67
paddle/fluid/platform/profiler.h
paddle/fluid/platform/profiler.h
+76
-1
paddle/fluid/platform/profiler.proto
paddle/fluid/platform/profiler.proto
+17
-0
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+10
-1
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+6
-2
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+4
-1
python/paddle/fluid/compiler.py
python/paddle/fluid/compiler.py
+11
-7
python/paddle/fluid/layers/ops.py
python/paddle/fluid/layers/ops.py
+3
-0
python/paddle/fluid/tests/unittests/test_activation_op.py
python/paddle/fluid/tests/unittests/test_activation_op.py
+54
-0
tools/print_signatures.py
tools/print_signatures.py
+3
-0
tools/timeline.py
tools/timeline.py
+104
-0
未找到文件。
paddle/fluid/API.spec
浏览文件 @
b2898c0f
...
...
@@ -293,6 +293,7 @@ paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=
paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3'))
paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b'))
paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9d586a0b5bd05f67ee78048f9d503b6'))
paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a46e0b5f9ce82348406478e610f14c9'))
paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7'))
paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13'))
paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27'))
...
...
@@ -300,6 +301,8 @@ paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None
paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad'))
paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973'))
paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2'))
paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '920a47734482276c069ba24c61c26b25'))
paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cf4ee2c9b9d7293556f8c5173dfb5d2c'))
paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4'))
paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2'))
paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26'))
...
...
paddle/fluid/framework/executor.cc
浏览文件 @
b2898c0f
...
...
@@ -34,11 +34,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#include "paddle/fluid/operators/ngraph/ngraph_engine.h"
DEFINE_bool
(
use_ngraph
,
false
,
"Use NGRAPH to run"
);
#endif
DECLARE_bool
(
benchmark
);
DEFINE_bool
(
use_mkldnn
,
false
,
"Use MKLDNN to run"
);
DEFINE_bool
(
use_ngraph
,
false
,
"Use NGRAPH to run"
);
namespace
paddle
{
namespace
framework
{
...
...
@@ -194,9 +194,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool
force_disable_gc
)
{
platform
::
RecordBlock
b
(
block_id
);
if
(
FLAGS_use_mkldnn
)
EnableMKLDNN
(
pdesc
);
#ifdef PADDLE_WITH_NGRAPH
if
(
FLAGS_use_ngraph
)
operators
::
NgraphEngine
::
EnableNgraph
(
pdesc
);
#endif
auto
ctx
=
Prepare
(
pdesc
,
block_id
,
skip_ref_cnt_vars
,
force_disable_gc
);
RunPreparedContext
(
ctx
.
get
(),
scope
,
create_local_scope
,
create_vars
);
}
...
...
@@ -372,6 +369,12 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
for
(
auto
&
op_desc
:
block
.
AllOps
())
{
ctx
->
ops_
.
push_back
(
OpRegistry
::
CreateOp
(
*
op_desc
));
}
#ifdef PADDLE_WITH_NGRAPH
if
(
FLAGS_use_ngraph
)
{
paddle
::
operators
::
NgraphEngine
::
FuseNgraphOps
(
ctx
->
prog_
.
Block
(
ctx
->
block_id_
),
&
ctx
->
ops_
);
}
#endif
return
ctx
;
}
...
...
paddle/fluid/framework/operator.cc
浏览文件 @
b2898c0f
...
...
@@ -934,8 +934,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
dev_ctx
=
pool
.
Get
(
expected_kernel_key
.
place_
);
}
RuntimeInferShapeContext
infer_shape_ctx
(
*
this
,
exec_scope
,
*
runtime_ctx_
);
this
->
InferShape
(
&
infer_shape_ctx
);
if
(
!
HasAttr
(
kAllKernelsMustComputeRuntimeShape
))
{
RuntimeInferShapeContext
infer_shape_ctx
(
*
this
,
exec_scope
,
*
runtime_ctx_
);
this
->
InferShape
(
&
infer_shape_ctx
);
}
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only pass inputs and get outputs.
kernel_iter
->
second
(
ExecutionContext
(
*
this
,
exec_scope
,
*
dev_ctx
,
...
...
paddle/fluid/framework/operator.h
浏览文件 @
b2898c0f
...
...
@@ -70,6 +70,15 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@";
/// execution to save the elapsed time.
constexpr
char
kEnableRuntimeContext
[]
=
"@ENABLE_RUNTIME_CONTEXT@"
;
/// If an Op has this attribute, all its kernels should calculate output
/// variable's shape in the corresponding Compute() function. And
/// OperatorWithKernel::RunImpl() would skip call this Op's InferShape()
/// function in its runtime for speedup.
/// TODO(luotao): Note that this temporal attribute would be deleted after all
/// ops contain it.
constexpr
char
kAllKernelsMustComputeRuntimeShape
[]
=
"@ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE@"
;
// define some kernel priority
/* Define multiple kernel type fallback order*/
extern
std
::
vector
<
std
::
tuple
<
platform
::
Place
,
LibraryType
>>
kKernelPriority
;
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
b2898c0f
...
...
@@ -181,13 +181,14 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
return
member_
->
local_scopes_
;
}
ParallelExecutor
::
ParallelExecutor
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
unordered_set
<
std
::
string
>
&
bcast_vars
,
const
std
::
string
&
loss_var_name
,
Scope
*
scope
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
ExecutionStrategy
&
exec_strategy
,
const
BuildStrategy
&
build_strategy
,
ir
::
Graph
*
graph
)
ParallelExecutor
::
ParallelExecutor
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
vector
<
std
::
string
>
&
bcast_vars
,
const
std
::
string
&
loss_var_name
,
Scope
*
scope
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
ExecutionStrategy
&
exec_strategy
,
const
BuildStrategy
&
build_strategy
,
ir
::
Graph
*
graph
)
:
member_
(
new
ParallelExecutorPrivate
(
places
))
{
member_
->
global_scope_
=
scope
;
member_
->
use_cuda_
=
exec_strategy
.
use_cuda_
;
...
...
@@ -254,9 +255,23 @@ ParallelExecutor::ParallelExecutor(
PADDLE_THROW
(
"Not compiled with CUDA"
);
#endif
}
if
(
member_
->
local_scopes_
.
size
()
!=
1
&&
local_scopes
.
empty
())
{
BCastParamsToDevices
(
bcast_vars
);
// broadcast parameters from the 0th device to others:
auto
need_broadcast
=
[
&
]()
->
bool
{
if
(
build_strategy
.
num_trainers_
>
1
)
{
// 1. num_tariners would be grater than 1 for nccl distributed training.
return
true
;
}
else
if
(
member_
->
local_scopes_
.
size
()
!=
1
&&
local_scopes
.
empty
())
{
// 2. Only one trainer process, but ParallelExecutor hold multiple
// devices.
return
true
;
}
return
false
;
};
if
(
need_broadcast
())
{
BCastParamsToDevices
(
bcast_vars
,
build_strategy
.
trainer_id_
);
}
// Startup Program has been run. All local scopes has correct parameters.
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
...
...
@@ -338,7 +353,7 @@ ParallelExecutor::ParallelExecutor(
}
void
ParallelExecutor
::
BCastParamsToDevices
(
const
std
::
unordered_set
<
std
::
string
>
&
vars
)
const
{
const
std
::
vector
<
std
::
string
>
&
vars
,
int
trainer_id
)
const
{
// the initializing bcast, all vars would be bcast from device(0).
for
(
auto
&
var
:
vars
)
{
framework
::
Variable
*
main_var
=
member_
->
local_scopes_
[
0
]
->
FindVar
(
var
);
...
...
@@ -362,7 +377,7 @@ void ParallelExecutor::BCastParamsToDevices(
auto
place
=
member_
->
places_
[
i
];
void
*
buffer
;
if
(
i
==
0
)
{
if
(
i
==
0
&&
trainer_id
==
0
)
{
buffer
=
const_cast
<
void
*>
(
main_tensor
.
data
<
void
>
());
}
else
{
auto
local_scope
=
member_
->
local_scopes_
[
i
];
...
...
paddle/fluid/framework/parallel_executor.h
浏览文件 @
b2898c0f
...
...
@@ -14,9 +14,11 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
...
...
@@ -45,7 +47,7 @@ class ParallelExecutor {
public:
explicit
ParallelExecutor
(
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
std
::
unordered_set
<
std
::
string
>
&
bcast_vars
,
const
std
::
vector
<
std
::
string
>
&
bcast_vars
,
const
std
::
string
&
loss_var_name
,
Scope
*
scope
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
ExecutionStrategy
&
exec_strategy
,
...
...
@@ -70,7 +72,10 @@ class ParallelExecutor {
const
std
::
string
&
fetched_var_name
);
private:
void
BCastParamsToDevices
(
const
std
::
unordered_set
<
std
::
string
>
&
vars
)
const
;
// broadcast the parameters from the 0th device.
// trainer_id the trainer index in nccl distributed training.
void
BCastParamsToDevices
(
const
std
::
vector
<
std
::
string
>
&
vars
,
int
trainer_id
=
0
)
const
;
bool
EnableParallelGraphExecution
(
const
ir
::
Graph
&
graph
,
const
ExecutionStrategy
&
exec_strategy
,
const
BuildStrategy
&
build_strategy
)
const
;
...
...
paddle/fluid/memory/allocation/CMakeLists.txt
浏览文件 @
b2898c0f
...
...
@@ -3,7 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library
(
best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator
)
cc_library
(
locked_allocator SRCS locked_allocator.cc DEPS allocator
)
cc_library
(
buffered_allocator SRCS buffered_allocator.cc DEPS allocator
)
cc_library
(
legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator
)
cc_library
(
legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator
profiler
)
cc_test
(
buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator
)
if
(
WITH_GPU
)
...
...
paddle/fluid/memory/allocation/legacy_allocator.cc
浏览文件 @
b2898c0f
...
...
@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include <memory>
#include <string>
#include <utility>
...
...
@@ -24,9 +22,11 @@
#endif
#include "glog/logging.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
...
...
@@ -329,18 +329,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
}
// namespace legacy
namespace
allocation
{
LegacyMemMonitor
GPUMemMonitor
;
Allocation
*
LegacyAllocator
::
AllocateImpl
(
size_t
size
,
Allocator
::
Attr
attr
)
{
void
*
ptr
=
boost
::
apply_visitor
(
legacy
::
AllocVisitor
(
size
),
place_
);
return
new
Allocation
(
ptr
,
size
,
place_
);
auto
*
tmp_alloc
=
new
Allocation
(
ptr
,
size
,
place_
);
platform
::
MemEvenRecorder
::
Instance
().
PushMemRecord
(
static_cast
<
void
*>
(
tmp_alloc
),
place_
,
size
);
return
tmp_alloc
;
}
void
LegacyAllocator
::
Free
(
Allocation
*
allocation
)
{
boost
::
apply_visitor
(
legacy
::
FreeVisitor
(
allocation
->
ptr
(),
allocation
->
size
()),
allocation
->
place
());
platform
::
MemEvenRecorder
::
Instance
().
PopMemRecord
(
static_cast
<
void
*>
(
allocation
),
place_
);
delete
allocation
;
}
...
...
paddle/fluid/operators/activation_op.cc
浏览文件 @
b2898c0f
...
...
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/activation_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h"
#ifdef PADDLE_WITH_CUDA
...
...
@@ -269,6 +271,48 @@ $$out = \\frac{x}{1 + \|x\|}$$
)DOC"
;
class
AcosOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"Input of acos operator"
);
AddOutput
(
"Out"
,
"Output of acos operator"
);
AddComment
(
R"DOC(
Arccosine Activation Operator.
$$out = \cos^{-1}(x)$$
)DOC"
);
}
};
class
AsinOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"Input of asin operator"
);
AddOutput
(
"Out"
,
"Output of asin operator"
);
AddComment
(
R"DOC(
Arcsine Activation Operator.
$$out = \sin^{-1}(x)$$
)DOC"
);
}
};
class
AtanOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"Input of atan operator"
);
AddOutput
(
"Out"
,
"Output of atan operator"
);
AddComment
(
R"DOC(
Arctanh Activation Operator.
$$out = \tanh^{-1}(x)$$
)DOC"
);
}
};
class
LeakyReluOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
...
...
@@ -543,7 +587,10 @@ namespace ops = paddle::operators;
__macro(SoftShrink, softshrink); \
__macro(Abs, abs); \
__macro(Cos, cos); \
__macro(Acos, acos); \
__macro(Sin, sin); \
__macro(Asin, asin); \
__macro(Atan, atan); \
__macro(Round, round); \
__macro(Log, log); \
__macro(Square, square); \
...
...
paddle/fluid/operators/activation_op.h
浏览文件 @
b2898c0f
...
...
@@ -39,9 +39,8 @@ namespace operators {
Please refer to the layer_helper.py and get the details.
*/
static
std
::
unordered_set
<
std
::
string
>
InplaceOpSet
=
{
"sigmoid"
,
"exp"
,
"relu"
,
"tanh"
,
"sqrt"
,
"ceil"
,
"floor"
,
"reciprocal"
,
"relu6"
,
"soft_relu"
,
"hard_sigmoid"
,
};
"sigmoid"
,
"exp"
,
"relu"
,
"tanh"
,
"sqrt"
,
"ceil"
,
"floor"
,
"reciprocal"
,
"relu6"
,
"soft_relu"
,
"hard_sigmoid"
};
static
bool
IsInplace
(
const
std
::
string
&
op
)
{
bool
inplace
=
InplaceOpSet
.
count
(
op
);
...
...
@@ -553,6 +552,101 @@ struct SinFunctor : public BaseActivationFunctor<T> {
}
};
template
<
typename
T
>
struct
Acos
{
HOSTDEVICE
T
operator
()(
const
T
&
val
)
const
{
return
acos
(
val
);
}
};
template
<
>
struct
Acos
<
platform
::
float16
>
{
HOSTDEVICE
platform
::
float16
operator
()(
const
platform
::
float16
&
val
)
const
{
return
platform
::
float16
(
acos
(
static_cast
<
float
>
(
val
)));
}
};
// Acos(x) = acos(x)
template
<
typename
T
>
struct
AcosFunctor
:
public
BaseActivationFunctor
<
T
>
{
template
<
typename
Device
,
typename
X
,
typename
Out
>
void
operator
()(
Device
d
,
X
x
,
Out
out
)
const
{
out
.
device
(
d
)
=
x
.
unaryExpr
(
Acos
<
T
>
());
}
};
// acos'(x) = -1/sqrt(1-x^2)
template
<
typename
T
>
struct
AcosGradFunctor
:
public
BaseActivationFunctor
<
T
>
{
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
dx
.
device
(
d
)
=
-
dout
*
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
-
x
.
square
()).
sqrt
();
}
};
template
<
typename
T
>
struct
Asin
{
HOSTDEVICE
T
operator
()(
const
T
&
val
)
const
{
return
asin
(
val
);
}
};
template
<
>
struct
Asin
<
platform
::
float16
>
{
HOSTDEVICE
platform
::
float16
operator
()(
const
platform
::
float16
&
val
)
const
{
return
platform
::
float16
(
asin
(
static_cast
<
float
>
(
val
)));
}
};
// Asin(x) = asin(x)
template
<
typename
T
>
struct
AsinFunctor
:
public
BaseActivationFunctor
<
T
>
{
template
<
typename
Device
,
typename
X
,
typename
Out
>
void
operator
()(
Device
d
,
X
x
,
Out
out
)
const
{
out
.
device
(
d
)
=
x
.
unaryExpr
(
Asin
<
T
>
());
}
};
// asin'(x) = 1/sqrt(1-x^2)
template
<
typename
T
>
struct
AsinGradFunctor
:
public
BaseActivationFunctor
<
T
>
{
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
dx
.
device
(
d
)
=
dout
*
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
-
x
.
square
()).
sqrt
();
}
};
template
<
typename
T
>
struct
Atan
{
HOSTDEVICE
T
operator
()(
const
T
&
val
)
const
{
return
atan
(
val
);
}
};
template
<
>
struct
Atan
<
platform
::
float16
>
{
HOSTDEVICE
platform
::
float16
operator
()(
const
platform
::
float16
&
val
)
const
{
return
platform
::
float16
(
atan
(
static_cast
<
float
>
(
val
)));
}
};
// Atan(x) = atan(x)
template
<
typename
T
>
struct
AtanFunctor
:
public
BaseActivationFunctor
<
T
>
{
template
<
typename
Device
,
typename
X
,
typename
Out
>
void
operator
()(
Device
d
,
X
x
,
Out
out
)
const
{
out
.
device
(
d
)
=
x
.
unaryExpr
(
Atan
<
T
>
());
}
};
// atan'(x) = 1 / (1 + x^2)
template
<
typename
T
>
struct
AtanGradFunctor
:
public
BaseActivationFunctor
<
T
>
{
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
dx
.
device
(
d
)
=
dout
*
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
x
.
square
());
}
};
// round(x) = [x]
template
<
typename
T
>
struct
RoundFunctor
:
public
BaseActivationFunctor
<
T
>
{
...
...
@@ -1001,13 +1095,16 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
__macro(relu, ReluFunctor, ReluGradFunctor); \
__macro(gelu, GeluFunctor, GeluGradFunctor); \
__macro(tanh, TanhFunctor, TanhGradFunctor); \
__macro(atan, AtanFunctor, AtanGradFunctor); \
__macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
__macro(sqrt, SqrtFunctor, SqrtGradFunctor); \
__macro(abs, AbsFunctor, AbsGradFunctor); \
__macro(ceil, CeilFunctor, ZeroGradFunctor); \
__macro(floor, FloorFunctor, ZeroGradFunctor); \
__macro(cos, CosFunctor, CosGradFunctor); \
__macro(acos, AcosFunctor, AcosGradFunctor); \
__macro(sin, SinFunctor, SinGradFunctor); \
__macro(asin, AsinFunctor, AsinGradFunctor); \
__macro(round, RoundFunctor, ZeroGradFunctor); \
__macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
__macro(log, LogFunctor, LogGradFunctor); \
...
...
paddle/fluid/operators/detection/box_coder_op.h
浏览文件 @
b2898c0f
...
...
@@ -20,7 +20,7 @@ namespace operators {
enum
class
BoxCodeType
{
kEncodeCenterSize
=
0
,
kDecodeCenterSize
=
1
};
inline
BoxCodeType
GetBoxCodeType
(
const
std
::
string
&
type
)
{
inline
BoxCodeType
GetBoxCodeType
(
const
std
::
string
&
type
)
{
if
(
type
==
"encode_center_size"
)
{
return
BoxCodeType
::
kEncodeCenterSize
;
}
else
if
(
type
==
"decode_center_size"
)
{
...
...
@@ -32,24 +32,23 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) {
template
<
typename
DeviceContext
,
typename
T
>
class
BoxCoderKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
EncodeCenterSize
(
const
framework
::
Tensor
*
target_box
,
const
framework
::
Tensor
*
prior_box
,
const
framework
::
Tensor
*
prior_box_var
,
void
EncodeCenterSize
(
const
framework
::
Tensor
*
target_box
,
const
framework
::
Tensor
*
prior_box
,
const
framework
::
Tensor
*
prior_box_var
,
const
bool
normalized
,
const
std
::
vector
<
float
>
variance
,
T
*
output
)
const
{
const
std
::
vector
<
float
>
variance
,
T
*
output
)
const
{
int64_t
row
=
target_box
->
dims
()[
0
];
int64_t
col
=
prior_box
->
dims
()[
0
];
int64_t
len
=
prior_box
->
dims
()[
1
];
auto
*
target_box_data
=
target_box
->
data
<
T
>
();
auto
*
prior_box_data
=
prior_box
->
data
<
T
>
();
const
T
*
prior_box_var_data
=
nullptr
;
if
(
prior_box_var
)
prior_box_var_data
=
prior_box_var
->
data
<
T
>
();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for
(
int64_t
i
=
0
;
i
<
row
;
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
col
;
++
j
)
{
auto
*
target_box_data
=
target_box
->
data
<
T
>
();
auto
*
prior_box_data
=
prior_box
->
data
<
T
>
();
size_t
offset
=
i
*
col
*
len
+
j
*
len
;
T
prior_box_width
=
prior_box_data
[
j
*
len
+
2
]
-
prior_box_data
[
j
*
len
]
+
(
normalized
==
false
);
T
prior_box_height
=
prior_box_data
[
j
*
len
+
3
]
-
...
...
@@ -69,7 +68,6 @@ class BoxCoderKernel : public framework::OpKernel<T> {
target_box_data
[
i
*
len
+
1
]
+
(
normalized
==
false
);
size_t
offset
=
i
*
col
*
len
+
j
*
len
;
output
[
offset
]
=
(
target_box_center_x
-
prior_box_center_x
)
/
prior_box_width
;
output
[
offset
+
1
]
=
...
...
@@ -78,44 +76,61 @@ class BoxCoderKernel : public framework::OpKernel<T> {
std
::
log
(
std
::
fabs
(
target_box_width
/
prior_box_width
));
output
[
offset
+
3
]
=
std
::
log
(
std
::
fabs
(
target_box_height
/
prior_box_height
));
if
(
prior_box_var
)
{
int
prior_var_offset
=
j
*
len
;
output
[
offset
]
/=
prior_box_var_data
[
prior_var_offset
];
output
[
offset
+
1
]
/=
prior_box_var_data
[
prior_var_offset
+
1
];
output
[
offset
+
2
]
/=
prior_box_var_data
[
prior_var_offset
+
2
];
output
[
offset
+
3
]
/=
prior_box_var_data
[
prior_var_offset
+
3
];
}
else
if
(
!
(
variance
.
empty
()))
{
}
}
if
(
prior_box_var
)
{
const
T
*
prior_box_var_data
=
prior_box_var
->
data
<
T
>
();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for
(
int64_t
i
=
0
;
i
<
row
;
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
col
;
++
j
)
{
for
(
int
k
=
0
;
k
<
4
;
++
k
)
{
size_t
offset
=
i
*
col
*
len
+
j
*
len
;
int
prior_var_offset
=
j
*
len
;
output
[
offset
+
k
]
/=
prior_box_var_data
[
prior_var_offset
+
k
];
}
}
}
}
else
if
(
!
(
variance
.
empty
()))
{
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for
(
int64_t
i
=
0
;
i
<
row
;
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
col
;
++
j
)
{
for
(
int
k
=
0
;
k
<
4
;
++
k
)
{
size_t
offset
=
i
*
col
*
len
+
j
*
len
;
output
[
offset
+
k
]
/=
static_cast
<
T
>
(
variance
[
k
]);
}
}
}
}
}
template
<
int
axis
,
int
var_size
>
void
DecodeCenterSize
(
const
framework
::
Tensor
*
target_box
,
const
framework
::
Tensor
*
prior_box
,
const
framework
::
Tensor
*
prior_box_var
,
void
DecodeCenterSize
(
const
framework
::
Tensor
*
target_box
,
const
framework
::
Tensor
*
prior_box
,
const
framework
::
Tensor
*
prior_box_var
,
const
bool
normalized
,
std
::
vector
<
float
>
variance
,
T
*
output
)
const
{
T
*
output
)
const
{
int64_t
row
=
target_box
->
dims
()[
0
];
int64_t
col
=
target_box
->
dims
()[
1
];
int64_t
len
=
target_box
->
dims
()[
2
];
auto
*
target_box_data
=
target_box
->
data
<
T
>
();
auto
*
prior_box_data
=
prior_box
->
data
<
T
>
();
const
T
*
prior_box_var_data
=
nullptr
;
if
(
var_size
==
2
)
prior_box_var_data
=
prior_box_var
->
data
<
T
>
();
int
prior_box_offset
=
0
;
T
var_data
[
4
]
=
{
1.
,
1.
,
1.
,
1.
};
T
*
var_ptr
=
var_data
;
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for
(
int64_t
i
=
0
;
i
<
row
;
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
col
;
++
j
)
{
auto
*
target_box_data
=
target_box
->
data
<
T
>
();
auto
*
prior_box_data
=
prior_box
->
data
<
T
>
();
T
var_data
[
4
]
=
{
1.
,
1.
,
1.
,
1.
};
T
*
var_ptr
=
var_data
;
size_t
offset
=
i
*
col
*
len
+
j
*
len
;
prior_box_offset
=
axis
==
0
?
j
*
len
:
i
*
len
;
int
prior_box_offset
=
axis
==
0
?
j
*
len
:
i
*
len
;
T
prior_box_width
=
prior_box_data
[
prior_box_offset
+
2
]
-
prior_box_data
[
prior_box_offset
]
+
(
normalized
==
false
);
...
...
@@ -131,10 +146,10 @@ class BoxCoderKernel : public framework::OpKernel<T> {
T
target_box_width
=
0
,
target_box_height
=
0
;
int
prior_var_offset
=
axis
==
0
?
j
*
len
:
i
*
len
;
if
(
var_size
==
2
)
{
std
::
memcpy
(
var_ptr
,
prior_box_var
_data
+
prior_var_offset
,
std
::
memcpy
(
var_ptr
,
prior_box_var
->
data
<
T
>
()
+
prior_var_offset
,
4
*
sizeof
(
T
));
}
else
if
(
var_size
==
1
)
{
var_ptr
=
reinterpret_cast
<
T
*>
(
variance
.
data
());
var_ptr
=
reinterpret_cast
<
T
*>
(
variance
.
data
());
}
T
box_var_x
=
*
var_ptr
;
T
box_var_y
=
*
(
var_ptr
+
1
);
...
...
@@ -162,11 +177,11 @@ class BoxCoderKernel : public framework::OpKernel<T> {
}
}
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
prior_box
=
context
.
Input
<
framework
::
Tensor
>
(
"PriorBox"
);
auto
*
prior_box_var
=
context
.
Input
<
framework
::
Tensor
>
(
"PriorBoxVar"
);
auto
*
target_box
=
context
.
Input
<
framework
::
LoDTensor
>
(
"TargetBox"
);
auto
*
output_box
=
context
.
Output
<
framework
::
Tensor
>
(
"OutputBox"
);
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
prior_box
=
context
.
Input
<
framework
::
Tensor
>
(
"PriorBox"
);
auto
*
prior_box_var
=
context
.
Input
<
framework
::
Tensor
>
(
"PriorBoxVar"
);
auto
*
target_box
=
context
.
Input
<
framework
::
LoDTensor
>
(
"TargetBox"
);
auto
*
output_box
=
context
.
Output
<
framework
::
Tensor
>
(
"OutputBox"
);
std
::
vector
<
float
>
variance
=
context
.
Attr
<
std
::
vector
<
float
>>
(
"variance"
);
const
int
axis
=
context
.
Attr
<
int
>
(
"axis"
);
if
(
target_box
->
lod
().
size
())
{
...
...
@@ -194,7 +209,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
output_box
->
mutable_data
<
T
>
({
row
,
col
,
len
},
context
.
GetPlace
());
T
*
output
=
output_box
->
data
<
T
>
();
T
*
output
=
output_box
->
data
<
T
>
();
if
(
code_type
==
BoxCodeType
::
kEncodeCenterSize
)
{
EncodeCenterSize
(
target_box
,
prior_box
,
prior_box_var
,
normalized
,
variance
,
output
);
...
...
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
浏览文件 @
b2898c0f
...
...
@@ -23,9 +23,6 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
if
(
ctx
->
IsRuntime
())
{
return
;
}
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"W"
),
"Input W of FusedEmbeddingSeqPoolOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Ids"
),
...
...
@@ -91,6 +88,8 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
"(boolean, default false) "
"Sparse update."
)
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
framework
::
kAllKernelsMustComputeRuntimeShape
,
""
)
.
SetDefault
(
true
);
AddComment
(
R"DOC(
FusedEmbeddingSeqPool Operator.
...
...
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
浏览文件 @
b2898c0f
...
...
@@ -121,6 +121,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
auto
*
ids
=
context
.
Input
<
LoDTensor
>
(
"Ids"
);
auto
*
d_output
=
context
.
Input
<
LoDTensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
d_table
=
context
.
Output
<
SelectedRows
>
(
framework
::
GradVarName
(
"W"
));
// runtime shape
d_table
->
set_height
(
table_dim
[
0
]);
auto
*
ids_data
=
ids
->
data
<
int64_t
>
();
int64_t
ids_num
=
ids
->
numel
();
...
...
paddle/fluid/operators/hash_op.cc
浏览文件 @
b2898c0f
...
...
@@ -26,9 +26,6 @@ class HashOp : public framework::OperatorWithKernel {
:
OperatorWithKernel
(
type
,
inputs
,
outputs
,
attrs
)
{}
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
if
(
ctx
->
IsRuntime
())
{
return
;
}
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) of HashOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
...
...
@@ -57,6 +54,8 @@ $$Out = scale * X$$
)DOC"
);
AddAttr
<
int
>
(
"num_hash"
,
""
).
SetDefault
(
1
);
AddAttr
<
int
>
(
"mod_by"
,
""
).
SetDefault
(
100000
);
AddAttr
<
bool
>
(
framework
::
kAllKernelsMustComputeRuntimeShape
,
""
)
.
SetDefault
(
true
);
}
};
...
...
paddle/fluid/operators/ngraph/ngraph_engine.cc
浏览文件 @
b2898c0f
此差异已折叠。
点击以展开。
paddle/fluid/operators/ngraph/ngraph_engine.h
浏览文件 @
b2898c0f
...
...
@@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
#define PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "ngraph/ngraph.hpp"
...
...
@@ -33,29 +39,47 @@ enum class OpState { /* nGraph support state on ops */
UNKNOWN
/* Output all for debug purpose */
};
// cache engine repetitives
struct
EngineCache
{
std
::
shared_ptr
<
ngraph
::
Function
>
ngraph_function
;
std
::
set
<
std
::
string
>
persistables
;
std
::
vector
<
std
::
string
>
var_in
;
std
::
vector
<
std
::
string
>
var_out
;
std
::
vector
<
size_t
>
var_in_updates
;
bool
is_test
=
true
;
};
// perform graph build through bridge and execute computation
class
NgraphEngine
{
public:
explicit
NgraphEngine
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
place
,
const
std
::
string
&
serialized_graph
,
const
std
::
vector
<
int
>&
interval
);
const
framework
::
ExecutionContext
&
ctx
);
void
Run
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
;
static
void
EnableNgraph
(
const
framework
::
ProgramDesc
&
program
);
static
const
framework
::
BlockDesc
*
p_bdesc
;
static
std
::
vector
<
std
::
string
>
feed_vars
,
fetch_vars
;
static
void
FuseNgraphOps
(
const
framework
::
BlockDesc
&
prog
,
std
::
vector
<
std
::
unique_ptr
<
framework
::
OperatorBase
>>*
ops
);
private:
static
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Function
>>
func_cache_
;
static
std
::
unordered_map
<
std
::
string
,
EngineCache
>
engine_cache
;
static
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
std
::
shared_ptr
<
ngraph
::
runtime
::
Tensor
>>>
t_in_cache_
;
static
framework
::
Variable
*
pre_var_ptr
;
const
framework
::
Scope
&
scope_
;
const
platform
::
Place
&
place_
;
std
::
vector
<
std
::
shared_ptr
<
framework
::
OperatorBase
>>
fused_ops_
;
std
::
unordered_map
<
std
::
string
,
ngraph
::
element
::
Type
>
var_type_map_
;
std
::
unordered_set
<
std
::
string
>
persistables_
;
std
::
unordered_set
<
std
::
string
>
fetches_
;
std
::
set
<
std
::
string
>
persistables_
;
std
::
unordered_set
<
std
::
string
>
post_op_inputs_
;
OpState
ng_op_state_
=
OpState
::
UNKNOWN
;
OpState
op_state_
=
OpState
::
UNKNOWN
;
bool
is_test_
{
true
};
std
::
string
func_cache_key_
;
// ngraph backend eg. CPU
...
...
@@ -66,6 +90,8 @@ class NgraphEngine {
std
::
vector
<
std
::
string
>
var_in_
;
// var_name of outputs from fetch in order
std
::
vector
<
std
::
string
>
var_out_
;
// non-persitable var_in
std
::
vector
<
size_t
>
var_in_updates_
;
// map input vars to nodes
std
::
shared_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
...
...
@@ -74,20 +100,23 @@ class NgraphEngine {
std
::
shared_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
ngraph
::
Node
>>>
var_node_map_
;
// prepare info for nraph engine
void
Prepare
(
const
framework
::
BlockDesc
&
block
,
const
std
::
vector
<
int
>&
interval
);
// prepare info for ngraph engine need
void
Prepare
(
const
std
::
vector
<
int
>&
interval
);
// get ngraph engine input and output list
void
BuildNgIO
(
const
std
::
vector
<
framework
::
OpDesc
*>&
op_descs
,
const
std
::
vector
<
int
>&
interval
);
// get ngraph input and define ngraph input parameters
void
GetNgInputShape
(
std
::
shared_ptr
<
framework
::
OperatorBase
>
op
);
void
GetNgInputShape
();
// Call ngraph bridge to map ops
void
BuildNgNodes
();
//
get the ngraph input and output var list
void
BuildNgIO
();
//
run paddle RuntimeInferShape to get the tensor shape
void
RunInferShape
();
// build ngraph function call
void
BuildNgFunction
();
void
BuildNgFunction
(
const
std
::
vector
<
int
>&
interval
);
// Check cache for ngraph function or otherwise build the function
void
GetNgFunction
();
void
GetNgFunction
(
std
::
string
engine_key
,
const
std
::
vector
<
int
>&
interval
);
};
}
// namespace operators
}
// namespace paddle
#endif // PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
paddle/fluid/operators/ngraph/ngraph_engine_op.cc
浏览文件 @
b2898c0f
...
...
@@ -29,6 +29,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput
(
"Xs"
,
"A list of inputs."
).
AsDispensable
();
AddOutput
(
"Ys"
,
"A list of outputs"
).
AsDispensable
();
AddAttr
<
std
::
string
>
(
"graph"
,
"the graph."
);
AddAttr
<
std
::
string
>
(
"engine_key"
,
"the engine hash key."
);
AddAttr
<
std
::
vector
<
int
>>
(
"interval"
,
"op interval supported by ngraph"
);
AddComment
(
"ngraph engine operator."
);
}
...
...
paddle/fluid/operators/ngraph/ngraph_engine_op.h
浏览文件 @
b2898c0f
...
...
@@ -46,10 +46,8 @@ class NgraphEngineKernel : public framework::OpKernel<T> {
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
&
scope
=
ctx
.
scope
();
auto
place
=
ctx
.
GetPlace
();
std
::
string
serialized_graph
=
ctx
.
Attr
<
std
::
string
>
(
"graph"
);
auto
interval
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"interval"
);
NgraphEngine
ngraph_engine
(
scope
,
place
,
serialized_graph
,
interval
);
NgraphEngine
ngraph_engine
(
scope
,
place
,
ctx
);
ngraph_engine
.
Run
(
scope
,
place
);
}
};
...
...
paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
浏览文件 @
b2898c0f
...
...
@@ -22,9 +22,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel {
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
if
(
ctx
->
IsRuntime
())
{
return
;
}
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) of SequecceEnumerate operator should not be null."
);
...
...
@@ -62,6 +59,8 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
});
AddAttr
<
int
>
(
"pad_value"
,
"(int) The enumerate sequence padding value."
)
.
SetDefault
(
0
);
AddAttr
<
bool
>
(
framework
::
kAllKernelsMustComputeRuntimeShape
,
""
)
.
SetDefault
(
true
);
AddComment
(
R"DOC(
Sequence Enumerate Operator.
...
...
paddle/fluid/platform/device_tracer.cc
浏览文件 @
b2898c0f
...
...
@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h"
#include <deque>
#include <forward_list>
...
...
@@ -30,6 +29,8 @@ limitations under the License. */
#include "glog/logging.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
namespace
paddle
{
...
...
@@ -317,6 +318,24 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id
,
correlation_id
,
bytes
});
}
void
AddMemInfoRecord
(
uint64_t
start_ns
,
uint64_t
end_ns
,
size_t
bytes
,
const
Place
&
place
,
const
std
::
string
&
alloc_in
,
const
std
::
string
&
free_in
,
int64_t
thread_id
)
{
if
(
0
==
start_ns
||
0
==
end_ns
)
{
VLOG
(
3
)
<<
alloc_in
<<
", "
<<
free_in
<<
" Cannot be traced."
;
return
;
}
thread_local
std
::
forward_list
<
MemInfoRecord
>
*
local_mem_info_record
=
nullptr
;
if
(
local_mem_info_record
==
nullptr
)
{
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
mem_info_record_
.
emplace_front
();
local_mem_info_record
=
&
mem_info_record_
.
front
();
}
local_mem_info_record
->
emplace_front
(
MemInfoRecord
{
start_ns
,
end_ns
,
bytes
,
place
,
thread_id
,
alloc_in
,
free_in
});
}
void
AddActiveKindRecords
(
const
std
::
string
&
anno
,
uint64_t
start_ns
,
uint64_t
end_ns
,
int64_t
device_id
,
int64_t
thread_id
,
uint32_t
correlation_id
)
{
...
...
@@ -409,6 +428,7 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_
.
clear
();
for
(
auto
&
tmp
:
correlations_pairs
)
tmp
.
clear
();
for
(
auto
&
tmp
:
cpu_records_
)
tmp
.
clear
();
for
(
auto
&
tmp
:
mem_info_record_
)
tmp
.
clear
();
for
(
auto
&
tmp
:
active_kind_records_
)
tmp
.
clear
();
}
...
...
@@ -440,9 +460,12 @@ class DeviceTracerImpl : public DeviceTracer {
proto
::
Profile
profile_pb
;
profile_pb
.
set_start_ns
(
start_ns_
);
profile_pb
.
set_end_ns
(
end_ns_
);
if
(
correlations_
.
empty
())
for
(
auto
&
tmp
:
correlations_pairs
)
if
(
correlations_
.
empty
())
{
for
(
auto
&
tmp
:
correlations_pairs
)
{
for
(
auto
&
pair
:
tmp
)
correlations_
[
pair
.
first
]
=
pair
.
second
;
}
}
for
(
const
KernelRecord
&
r
:
kernel_records_
)
{
auto
*
event
=
profile_pb
.
add_events
();
event
->
set_type
(
proto
::
Event
::
GPUKernel
);
...
...
@@ -462,6 +485,7 @@ class DeviceTracerImpl : public DeviceTracer {
event
->
set_device_id
(
r
.
device_id
);
}
VLOG
(
1
)
<<
"KernelRecord event miss: "
<<
miss
<<
" find: "
<<
find
;
for
(
auto
&
tmp
:
cpu_records_
)
{
for
(
const
CPURecord
&
r
:
tmp
)
{
auto
*
event
=
profile_pb
.
add_events
();
...
...
@@ -473,6 +497,7 @@ class DeviceTracerImpl : public DeviceTracer {
event
->
set_device_id
(
r
.
device_id
);
}
}
for
(
auto
&
tmp
:
active_kind_records_
)
{
for
(
const
ActiveKindRecord
&
r
:
tmp
)
{
auto
*
event
=
profile_pb
.
add_events
();
...
...
@@ -510,6 +535,31 @@ class DeviceTracerImpl : public DeviceTracer {
event
->
mutable_memcopy
()
->
set_bytes
(
r
.
bytes
);
}
VLOG
(
1
)
<<
"MemRecord event miss: "
<<
miss
<<
" find: "
<<
find
;
for
(
auto
&
tmp
:
mem_info_record_
)
{
for
(
const
auto
&
r
:
tmp
)
{
auto
*
event
=
profile_pb
.
add_mem_events
();
event
->
set_device_id
(
0
);
if
(
platform
::
is_cpu_place
(
r
.
place
))
{
event
->
set_place
(
proto
::
MemEvent
::
CPUPlace
);
}
else
if
(
platform
::
is_gpu_place
(
r
.
place
))
{
event
->
set_place
(
proto
::
MemEvent
::
CUDAPlace
);
event
->
set_device_id
(
boost
::
get
<
platform
::
CUDAPlace
>
(
r
.
place
).
GetDeviceId
());
}
else
if
(
platform
::
is_cuda_pinned_place
(
r
.
place
))
{
event
->
set_place
(
proto
::
MemEvent
::
CUDAPinnedPlace
);
}
else
{
PADDLE_THROW
(
"The current place is not supported."
);
}
event
->
set_alloc_in
(
r
.
alloc_in
);
event
->
set_free_in
(
r
.
free_in
);
event
->
set_start_ns
(
r
.
start_ns
);
event
->
set_end_ns
(
r
.
end_ns
);
event
->
set_bytes
(
r
.
bytes
);
event
->
set_thread_id
(
r
.
thread_id
);
}
}
std
::
ofstream
profile_f
;
profile_f
.
open
(
profile_path
,
std
::
ios
::
out
|
std
::
ios
::
trunc
|
std
::
ios
::
binary
);
...
...
@@ -553,6 +603,7 @@ class DeviceTracerImpl : public DeviceTracer {
std
::
forward_list
<
KernelRecord
>
kernel_records_
;
std
::
forward_list
<
MemRecord
>
mem_records_
;
std
::
forward_list
<
std
::
forward_list
<
CPURecord
>>
cpu_records_
;
std
::
forward_list
<
std
::
forward_list
<
MemInfoRecord
>>
mem_info_record_
;
std
::
forward_list
<
std
::
forward_list
<
ActiveKindRecord
>>
active_kind_records_
;
std
::
forward_list
<
std
::
forward_list
<
std
::
pair
<
uint32_t
,
Event
*>>>
correlations_pairs
;
...
...
@@ -575,7 +626,7 @@ Event *CurAnnotation() {
return
annotation_stack
.
back
();
}
std
::
string
CurAnnotationName
()
{
if
(
annotation_stack
.
empty
())
return
""
;
if
(
annotation_stack
.
empty
())
return
"
Unknown
"
;
return
annotation_stack
.
back
()
->
name
();
}
...
...
paddle/fluid/platform/device_tracer.h
浏览文件 @
b2898c0f
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.pb.h"
...
...
@@ -47,6 +48,7 @@ class DeviceTracer {
int64_t
stream_id
;
uint32_t
correlation_id
;
};
struct
CPURecord
{
std
::
string
name
;
uint64_t
start_ns
;
...
...
@@ -54,6 +56,7 @@ class DeviceTracer {
int64_t
device_id
;
int64_t
thread_id
;
};
struct
MemRecord
{
std
::
string
name
;
uint64_t
start_ns
;
...
...
@@ -63,6 +66,17 @@ class DeviceTracer {
uint32_t
correlation_id
;
uint64_t
bytes
;
};
struct
MemInfoRecord
{
uint64_t
start_ns
;
uint64_t
end_ns
;
size_t
bytes
;
Place
place
;
int64_t
thread_id
;
std
::
string
alloc_in
;
std
::
string
free_in
;
};
struct
ActiveKindRecord
{
std
::
string
name
;
uint64_t
start_ns
;
...
...
@@ -71,6 +85,7 @@ class DeviceTracer {
int64_t
thread_id
;
uint32_t
correlation_id
;
};
virtual
~
DeviceTracer
()
{}
// Needs to be called once before use.
virtual
void
Enable
()
=
0
;
...
...
@@ -97,6 +112,12 @@ class DeviceTracer {
int64_t
thread_id
,
uint32_t
correlation_id
)
=
0
;
virtual
void
AddMemInfoRecord
(
uint64_t
start_ns
,
uint64_t
end_ns
,
size_t
bytes
,
const
Place
&
place
,
const
std
::
string
&
alloc_in
,
const
std
::
string
&
free_in
,
int64_t
thread_id
)
=
0
;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability.
virtual
void
AddKernelRecords
(
std
::
string
name
,
uint64_t
start
,
uint64_t
end
,
...
...
paddle/fluid/platform/event.h
浏览文件 @
b2898c0f
...
...
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
#include "paddle/fluid/platform/place.h"
namespace
paddle
{
namespace
platform
{
...
...
@@ -64,5 +66,36 @@ class Event {
#endif
#endif
};
class
MemEvent
{
public:
MemEvent
(
EventType
type
,
uint64_t
start_ns
,
uint64_t
end_ns
,
size_t
bytes
,
Place
place
,
int64_t
thread_id
,
const
std
::
string
&
annotation
)
:
type_
(
type
),
start_ns_
(
start_ns
),
end_ns_
(
end_ns
),
bytes_
(
bytes
),
place_
(
place
),
thread_id_
(
thread_id
),
annotation_
(
annotation
)
{}
const
EventType
&
type
()
const
{
return
type_
;
}
uint64_t
start_ns
()
const
{
return
start_ns_
;
}
uint64_t
end_ns
()
const
{
return
end_ns_
;
}
size_t
bytes
()
const
{
return
bytes_
;
}
Place
place
()
const
{
return
place_
;
}
int64_t
thread_id
()
const
{
return
thread_id_
;
}
const
std
::
string
&
annotation
()
const
{
return
annotation_
;
}
private:
EventType
type_
;
uint64_t
start_ns_
=
0
;
uint64_t
end_ns_
=
0
;
size_t
bytes_
;
Place
place_
;
int64_t
thread_id_
;
std
::
string
annotation_
;
};
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/profiler.cc
浏览文件 @
b2898c0f
...
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <algorithm>
#include <iomanip>
#include <limits>
...
...
@@ -21,6 +20,8 @@ limitations under the License. */
#include <mutex> // NOLINT
#include <random>
#include <string>
#include <vector>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif // PADDLE_WITH_CUDA
...
...
@@ -36,8 +37,6 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
namespace
paddle
{
namespace
platform
{
struct
EventList
;
static
int64_t
profiler_lister_id
=
0
;
static
bool
should_send_profile_state
=
false
;
std
::
mutex
profiler_mu
;
...
...
@@ -53,43 +52,15 @@ static uint32_t g_next_thread_id = 0;
// The global mutex
static
std
::
mutex
g_all_event_lists_mutex
;
// The total event lists of all threads
static
std
::
list
<
std
::
shared_ptr
<
EventList
>>
g_all_event_lists
;
static
std
::
list
<
std
::
shared_ptr
<
EventList
<
Event
>
>>
g_all_event_lists
;
// The thread local event list only can be accessed by the specific thread
static
thread_local
std
::
shared_ptr
<
EventList
>
g_event_list
;
struct
EventList
{
constexpr
static
size_t
kMB
=
1024
*
1024
;
constexpr
static
size_t
kEventBlockSize
=
16
*
kMB
;
constexpr
static
size_t
kEventSize
=
sizeof
(
Event
);
constexpr
static
size_t
kEventAlign
=
alignof
(
Event
);
constexpr
static
size_t
kNumBlock
=
kEventBlockSize
/
((
kEventSize
+
kEventAlign
-
1
)
/
kEventAlign
*
kEventAlign
);
template
<
typename
...
Args
>
Event
*
Record
(
Args
&&
...
args
)
{
if
(
event_blocks
.
empty
()
||
event_blocks
.
front
().
size
()
==
kNumBlock
)
{
event_blocks
.
emplace_front
();
event_blocks
.
front
().
reserve
(
kNumBlock
);
}
event_blocks
.
front
().
emplace_back
(
std
::
forward
<
Args
>
(
args
)...);
return
&
event_blocks
.
front
().
back
();
}
std
::
vector
<
Event
>
Reduce
()
{
std
::
vector
<
Event
>
result
;
for
(
auto
&
block
:
event_blocks
)
{
result
.
insert
(
result
.
begin
(),
std
::
make_move_iterator
(
block
.
begin
()),
std
::
make_move_iterator
(
block
.
end
()));
}
event_blocks
.
clear
();
return
result
;
}
static
thread_local
std
::
shared_ptr
<
EventList
<
Event
>>
g_event_list
;
void
Clear
()
{
event_blocks
.
clear
();
}
std
::
forward_list
<
std
::
vector
<
Event
>>
event_blocks
;
};
static
std
::
list
<
std
::
shared_ptr
<
EventList
<
MemEvent
>>>
g_all_mem_event_lists
;
static
thread_local
std
::
shared_ptr
<
EventList
<
MemEvent
>>
g_mem_event_list
;
static
std
::
mutex
g_all_mem_event_lists_mutex
;
static
thread_local
int32_t
g_mem_thread_id
;
static
uint32_t
g_mem_next_thread_id
=
0
;
inline
uint64_t
GetTimeInNsec
()
{
using
clock
=
std
::
conditional
<
std
::
chrono
::
high_resolution_clock
::
is_steady
,
...
...
@@ -105,13 +76,13 @@ Event::Event(EventType type, std::string name, uint32_t thread_id)
cpu_ns_
=
GetTimeInNsec
();
}
const
EventType
&
Event
::
type
()
const
{
return
type_
;
}
const
EventType
&
Event
::
type
()
const
{
return
type_
;
}
double
Event
::
CpuElapsedMs
(
const
Event
&
e
)
const
{
double
Event
::
CpuElapsedMs
(
const
Event
&
e
)
const
{
return
(
e
.
cpu_ns_
-
cpu_ns_
)
/
(
1000000.0
);
}
double
Event
::
CudaElapsedMs
(
const
Event
&
e
)
const
{
double
Event
::
CudaElapsedMs
(
const
Event
&
e
)
const
{
#ifdef PADDLE_WITH_CUPTI
return
gpu_ns_
/
1000000.0
;
#else
...
...
@@ -120,10 +91,32 @@ double Event::CudaElapsedMs(const Event& e) const {
#endif
}
inline
EventList
&
GetEventList
()
{
inline
EventList
<
MemEvent
>
&
GetMemEventList
()
{
if
(
!
g_mem_event_list
)
{
g_mem_event_list
=
std
::
make_shared
<
EventList
<
MemEvent
>>
();
std
::
lock_guard
<
std
::
mutex
>
guard
(
g_all_mem_event_lists_mutex
);
g_mem_thread_id
=
g_mem_next_thread_id
++
;
g_all_mem_event_lists
.
emplace_front
(
g_mem_event_list
);
}
return
*
g_mem_event_list
;
}
void
PushMemEvent
(
uint64_t
start_ns
,
uint64_t
end_ns
,
size_t
bytes
,
const
Place
&
place
,
const
std
::
string
&
annotation
)
{
GetMemEventList
().
Record
(
EventType
::
kPushRange
,
start_ns
,
end_ns
,
bytes
,
place
,
g_mem_thread_id
,
annotation
);
}
void
PopMemEvent
(
uint64_t
start_ns
,
uint64_t
end_ns
,
size_t
bytes
,
const
Place
&
place
,
const
std
::
string
&
annotation
)
{
GetMemEventList
().
Record
(
EventType
::
kPopRange
,
start_ns
,
end_ns
,
bytes
,
place
,
g_mem_thread_id
,
annotation
);
}
inline
EventList
<
Event
>
&
GetEventList
()
{
if
(
!
g_event_list
)
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
g_all_event_lists_mutex
);
g_event_list
=
std
::
make_shared
<
EventList
>
();
g_event_list
=
std
::
make_shared
<
EventList
<
Event
>
>
();
g_thread_id
=
g_next_thread_id
++
;
g_all_event_lists
.
emplace_front
(
g_event_list
);
RecoreCurThreadId
(
g_thread_id
);
...
...
@@ -131,26 +124,26 @@ inline EventList& GetEventList() {
return
*
g_event_list
;
}
void
Mark
(
const
std
::
string
&
name
)
{
void
Mark
(
const
std
::
string
&
name
)
{
GetEventList
().
Record
(
EventType
::
kMark
,
name
,
g_thread_id
);
}
Event
*
PushEvent
(
const
std
::
string
&
name
)
{
Event
*
PushEvent
(
const
std
::
string
&
name
)
{
return
GetEventList
().
Record
(
EventType
::
kPushRange
,
name
,
g_thread_id
);
}
void
PopEvent
(
const
std
::
string
&
name
)
{
void
PopEvent
(
const
std
::
string
&
name
)
{
GetEventList
().
Record
(
EventType
::
kPopRange
,
name
,
g_thread_id
);
}
RecordEvent
::
RecordEvent
(
const
std
::
string
&
name
)
RecordEvent
::
RecordEvent
(
const
std
::
string
&
name
)
:
is_enabled_
(
false
),
start_ns_
(
PosixInNsec
())
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
// lock is not needed, the code below is thread-safe
is_enabled_
=
true
;
name_
=
name
;
Event
*
e
=
PushEvent
(
name_
);
Event
*
e
=
PushEvent
(
name_
);
// Maybe need the same push/pop behavior.
SetCurAnnotation
(
e
);
}
...
...
@@ -158,7 +151,7 @@ RecordEvent::RecordEvent(const std::string& name)
RecordEvent
::~
RecordEvent
()
{
if
(
g_state
==
ProfilerState
::
kDisabled
||
!
is_enabled_
)
return
;
// lock is not needed, the code below is thread-safe
DeviceTracer
*
tracer
=
GetDeviceTracer
();
DeviceTracer
*
tracer
=
GetDeviceTracer
();
if
(
tracer
)
{
tracer
->
AddCPURecords
(
CurAnnotationName
(),
start_ns_
,
PosixInNsec
(),
BlockDepth
(),
g_thread_id
);
...
...
@@ -167,7 +160,56 @@ RecordEvent::~RecordEvent() {
PopEvent
(
name_
);
}
RecordRPCEvent
::
RecordRPCEvent
(
const
std
::
string
&
name
)
{
MemEvenRecorder
MemEvenRecorder
::
recorder
;
void
MemEvenRecorder
::
PushMemRecord
(
const
void
*
ptr
,
const
Place
&
place
,
size_t
size
)
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
std
::
lock_guard
<
std
::
mutex
>
guard
(
mtx_
);
auto
&
events
=
address_memevent_
[
place
];
PADDLE_ENFORCE
(
events
.
count
(
ptr
)
==
0
,
""
);
events
.
emplace
(
ptr
,
std
::
unique_ptr
<
RecordMemEvent
>
(
new
MemEvenRecorder
::
RecordMemEvent
(
place
,
size
)));
}
void
MemEvenRecorder
::
PopMemRecord
(
const
void
*
ptr
,
const
Place
&
place
)
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
std
::
lock_guard
<
std
::
mutex
>
guard
(
mtx_
);
auto
&
events
=
address_memevent_
[
place
];
auto
iter
=
events
.
find
(
ptr
);
// The ptr maybe not in address_memevent
if
(
iter
!=
events
.
end
())
{
events
.
erase
(
iter
);
}
}
void
MemEvenRecorder
::
Flush
()
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
mtx_
);
address_memevent_
.
clear
();
}
MemEvenRecorder
::
RecordMemEvent
::
RecordMemEvent
(
const
Place
&
place
,
size_t
bytes
)
:
place_
(
place
),
bytes_
(
bytes
),
start_ns_
(
PosixInNsec
()),
alloc_in_
(
CurAnnotationName
())
{
PushMemEvent
(
start_ns_
,
end_ns_
,
bytes_
,
place_
,
alloc_in_
);
}
MemEvenRecorder
::
RecordMemEvent
::~
RecordMemEvent
()
{
DeviceTracer
*
tracer
=
GetDeviceTracer
();
end_ns_
=
PosixInNsec
();
auto
annotation_free
=
CurAnnotationName
();
if
(
tracer
)
{
tracer
->
AddMemInfoRecord
(
start_ns_
,
end_ns_
,
bytes_
,
place_
,
alloc_in_
,
annotation_free
,
g_mem_thread_id
);
}
PopMemEvent
(
start_ns_
,
end_ns_
,
bytes_
,
place_
,
annotation_free
);
}
RecordRPCEvent
::
RecordRPCEvent
(
const
std
::
string
&
name
)
{
if
(
FLAGS_enable_rpc_profiler
)
{
event_
.
reset
(
new
platform
::
RecordEvent
(
name
));
}
...
...
@@ -185,7 +227,7 @@ RecordBlock::RecordBlock(int block_id)
RecordBlock
::~
RecordBlock
()
{
// lock is not needed, the code below is thread-safe
if
(
g_state
==
ProfilerState
::
kDisabled
||
!
is_enabled_
)
return
;
DeviceTracer
*
tracer
=
GetDeviceTracer
();
DeviceTracer
*
tracer
=
GetDeviceTracer
();
if
(
tracer
)
{
// We try to put all blocks at the same nested depth in the
// same timeline lane. and distinguish the using thread_id.
...
...
@@ -232,11 +274,16 @@ void EnableProfiler(ProfilerState state) {
void
ResetProfiler
()
{
SynchronizeAllDevice
();
GetDeviceTracer
()
->
Reset
();
MemEvenRecorder
::
Instance
().
Flush
();
std
::
lock_guard
<
std
::
mutex
>
guard
(
g_all_event_lists_mutex
);
for
(
auto
it
=
g_all_event_lists
.
begin
();
it
!=
g_all_event_lists
.
end
();
++
it
)
{
(
*
it
)
->
Clear
();
}
for
(
auto
it
=
g_all_mem_event_lists
.
begin
();
it
!=
g_all_mem_event_lists
.
end
();
++
it
)
{
(
*
it
)
->
Clear
();
}
}
std
::
vector
<
std
::
vector
<
Event
>>
GetAllEvents
()
{
...
...
@@ -249,6 +296,15 @@ std::vector<std::vector<Event>> GetAllEvents() {
return
result
;
}
std
::
vector
<
std
::
vector
<
MemEvent
>>
GetMemEvents
()
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
g_all_mem_event_lists_mutex
);
std
::
vector
<
std
::
vector
<
MemEvent
>>
result
;
for
(
auto
&
it
:
g_all_mem_event_lists
)
{
result
.
emplace_back
((
*
it
).
Reduce
());
}
return
result
;
}
// The information of each event given in the profiling report
struct
EventItem
{
std
::
string
name
;
...
...
@@ -263,8 +319,8 @@ struct EventItem {
};
// Print results
void
PrintProfiler
(
const
std
::
vector
<
std
::
vector
<
EventItem
>>
&
events_table
,
const
std
::
string
&
sorted_domain
,
const
size_t
name_width
,
void
PrintProfiler
(
const
std
::
vector
<
std
::
vector
<
EventItem
>>
&
events_table
,
const
std
::
string
&
sorted_domain
,
const
size_t
name_width
,
const
size_t
data_width
,
bool
merge_thread
)
{
// Output header information
std
::
cout
<<
"
\n
------------------------->"
...
...
@@ -302,7 +358,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
<<
std
::
setw
(
data_width
)
<<
"Ratio."
<<
std
::
endl
;
for
(
size_t
i
=
0
;
i
<
events_table
.
size
();
++
i
)
{
for
(
size_t
j
=
0
;
j
<
events_table
[
i
].
size
();
++
j
)
{
const
EventItem
&
event_item
=
events_table
[
i
][
j
];
const
EventItem
&
event_item
=
events_table
[
i
][
j
];
std
::
cout
<<
std
::
setw
(
name_width
)
<<
event_item
.
name
<<
std
::
setw
(
data_width
)
<<
event_item
.
calls
<<
std
::
setw
(
data_width
)
<<
event_item
.
total_time
;
...
...
@@ -326,54 +382,54 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
}
// Parse the event list and output the profiling report
void
ParseEvents
(
const
std
::
vector
<
std
::
vector
<
Event
>>
&
events
,
void
ParseEvents
(
const
std
::
vector
<
std
::
vector
<
Event
>>
&
events
,
bool
merge_thread
,
EventSortingKey
sorted_by
=
EventSortingKey
::
kDefault
)
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
if
(
merge_thread
&&
events
.
size
()
<
2
)
return
;
std
::
string
sorted_domain
;
std
::
function
<
bool
(
const
EventItem
&
,
const
EventItem
&
)
>
sorted_func
;
std
::
function
<
bool
(
const
EventItem
&
,
const
EventItem
&
)
>
sorted_func
;
switch
(
sorted_by
)
{
case
EventSortingKey
::
kCalls
:
sorted_domain
=
"number of calls"
;
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
return
a
.
calls
>
b
.
calls
;
};
break
;
case
EventSortingKey
::
kTotal
:
sorted_domain
=
"total time"
;
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
return
a
.
total_time
>
b
.
total_time
;
};
break
;
case
EventSortingKey
::
kMin
:
sorted_domain
=
"minimum time"
;
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
return
a
.
min_time
>
b
.
min_time
;
};
break
;
case
EventSortingKey
::
kMax
:
sorted_domain
=
"maximum time"
;
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
return
a
.
max_time
>
b
.
max_time
;
};
break
;
case
EventSortingKey
::
kAve
:
sorted_domain
=
"average time"
;
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
return
a
.
ave_time
>
b
.
ave_time
;
};
break
;
case
EventSortingKey
::
kGPUTime
:
sorted_domain
=
"average time"
;
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
return
a
.
gpu_time
>
b
.
gpu_time
;
};
break
;
case
EventSortingKey
::
kCPUTime
:
sorted_domain
=
"average time"
;
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
sorted_func
=
[](
const
EventItem
&
a
,
const
EventItem
&
b
)
{
return
a
.
cpu_time
>
b
.
cpu_time
;
};
break
;
...
...
@@ -381,7 +437,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
sorted_domain
=
"event first end time"
;
}
const
std
::
vector
<
std
::
vector
<
Event
>>
*
analyze_events
;
const
std
::
vector
<
std
::
vector
<
Event
>>
*
analyze_events
;
std
::
vector
<
std
::
vector
<
Event
>>
merged_events_list
;
if
(
merge_thread
)
{
std
::
vector
<
Event
>
merged_events
;
...
...
@@ -469,7 +525,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
}
}
// average time
for
(
auto
&
item
:
event_items
)
{
for
(
auto
&
item
:
event_items
)
{
item
.
ave_time
=
item
.
total_time
/
item
.
calls
;
item
.
ratio
=
item
.
total_time
/
total
;
}
...
...
@@ -493,15 +549,77 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
merge_thread
);
}
struct
MemoryProfierReport
{
size_t
alloc_times
{
0
};
size_t
alloc_size
{
0
};
size_t
free_times
{
0
};
size_t
free_size
{
0
};
};
// Print results
void
PrintMemProfiler
(
const
std
::
map
<
Place
,
std
::
unordered_map
<
std
::
string
,
MemoryProfierReport
>>
&
annotation_report
,
const
size_t
name_width
,
const
size_t
data_width
)
{
// Output header information
std
::
cout
<<
"
\n
------------------------->"
<<
" Memory Profiling Report "
<<
"<-------------------------
\n\n
"
;
// Output events table
std
::
cout
.
setf
(
std
::
ios
::
left
);
std
::
cout
<<
std
::
setw
(
name_width
)
<<
"Event"
<<
std
::
setw
(
data_width
)
<<
"Alloc Calls"
<<
std
::
setw
(
data_width
)
<<
"Size(MB)"
<<
std
::
setw
(
data_width
)
<<
"Free Calls"
<<
std
::
setw
(
data_width
)
<<
"Size(MB)"
<<
std
::
endl
;
for
(
auto
&
tmp
:
annotation_report
)
{
for
(
auto
&
e
:
tmp
.
second
)
{
auto
event_name
=
string
::
Sprintf
(
"%s:%s"
,
tmp
.
first
,
e
.
first
);
std
::
cout
<<
std
::
setw
(
name_width
)
<<
event_name
;
std
::
cout
<<
std
::
setw
(
data_width
)
<<
e
.
second
.
alloc_times
;
std
::
cout
<<
std
::
setw
(
data_width
)
<<
e
.
second
.
alloc_size
/
(
1024.0
*
1024.0
);
std
::
cout
<<
std
::
setw
(
data_width
)
<<
e
.
second
.
free_times
;
std
::
cout
<<
std
::
setw
(
data_width
)
<<
e
.
second
.
free_size
/
(
1024.0
*
1024.0
)
<<
std
::
endl
;
}
}
std
::
cout
<<
std
::
endl
;
}
// parse memory events
void
ParseMemEvents
(
const
std
::
vector
<
std
::
vector
<
MemEvent
>>
&
events
)
{
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
// place, annotation, alloc times, alloc size
std
::
map
<
Place
,
std
::
unordered_map
<
std
::
string
,
MemoryProfierReport
>>
annotation_report
;
for
(
auto
&
tmp
:
events
)
{
for
(
auto
&
e
:
tmp
)
{
if
(
e
.
type
()
==
EventType
::
kPushRange
)
{
annotation_report
[
e
.
place
()][
e
.
annotation
()].
alloc_times
+=
1
;
annotation_report
[
e
.
place
()][
e
.
annotation
()].
alloc_size
+=
e
.
bytes
();
}
else
if
(
e
.
type
()
==
EventType
::
kPopRange
)
{
annotation_report
[
e
.
place
()][
e
.
annotation
()].
free_times
+=
1
;
annotation_report
[
e
.
place
()][
e
.
annotation
()].
free_size
+=
e
.
bytes
();
}
}
}
PrintMemProfiler
(
annotation_report
,
55
,
18
);
}
void
DisableProfiler
(
EventSortingKey
sorted_key
,
const
std
::
string
&
profile_path
)
{
const
std
::
string
&
profile_path
)
{
SynchronizeAllDevice
();
MemEvenRecorder
::
Instance
().
Flush
();
std
::
lock_guard
<
std
::
mutex
>
l
(
profiler_mu
);
if
(
g_state
==
ProfilerState
::
kDisabled
)
return
;
// Mark the profiling stop.
Mark
(
"_stop_profiler_"
);
DeviceTracer
*
tracer
=
GetDeviceTracer
();
DeviceTracer
*
tracer
=
GetDeviceTracer
();
if
(
tracer
->
IsEnabled
())
{
tracer
->
Disable
();
tracer
->
GenProfile
(
profile_path
);
...
...
@@ -511,6 +629,11 @@ void DisableProfiler(EventSortingKey sorted_key,
std
::
vector
<
std
::
vector
<
Event
>>
all_events
=
GetAllEvents
();
ParseEvents
(
all_events
,
true
,
sorted_key
);
ParseEvents
(
all_events
,
false
,
sorted_key
);
if
(
VLOG_IS_ON
(
5
))
{
std
::
vector
<
std
::
vector
<
MemEvent
>>
all_mem_events
=
GetMemEvents
();
ParseMemEvents
(
all_mem_events
);
}
ResetProfiler
();
g_state
=
ProfilerState
::
kDisabled
;
should_send_profile_state
=
true
;
...
...
paddle/fluid/platform/profiler.h
浏览文件 @
b2898c0f
...
...
@@ -15,10 +15,17 @@ limitations under the License. */
#pragma once
#include <forward_list>
#include <list>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#endif
...
...
@@ -34,8 +41,41 @@ enum ProfilerState {
void
Mark
(
const
std
::
string
&
name
);
Event
*
PushEvent
(
const
std
::
string
&
name
);
void
PushMemEvent
(
uint64_t
start_ns
,
uint64_t
end_ns
,
size_t
bytes
,
const
Place
&
place
);
void
PopMemEvent
(
uint64_t
start_ns
,
uint64_t
end_ns
,
size_t
bytes
,
const
Place
&
place
);
struct
MemEvenRecorder
{
public:
void
PushMemRecord
(
const
void
*
ptr
,
const
Place
&
place
,
size_t
size
);
void
PopMemRecord
(
const
void
*
ptr
,
const
Place
&
place
);
void
Flush
();
static
MemEvenRecorder
&
Instance
()
{
return
recorder
;
}
private:
struct
RecordMemEvent
{
RecordMemEvent
(
const
Place
&
place
,
size_t
bytes
);
~
RecordMemEvent
();
Place
place_
;
size_t
bytes_
;
uint64_t
start_ns_
;
uint64_t
end_ns_
;
std
::
string
alloc_in_
;
std
::
string
free_in_
;
};
static
MemEvenRecorder
recorder
;
std
::
map
<
Place
,
std
::
unordered_map
<
const
void
*
,
std
::
unique_ptr
<
RecordMemEvent
>>>
address_memevent_
;
std
::
mutex
mtx_
;
MemEvenRecorder
()
{}
DISABLE_COPY_AND_ASSIGN
(
MemEvenRecorder
);
};
Event
*
PushEvent
(
const
std
::
string
&
name
);
void
PopEvent
(
const
std
::
string
&
name
);
struct
RecordEvent
{
...
...
@@ -87,6 +127,41 @@ enum EventSortingKey {
kGPUTime
};
template
<
typename
T
>
struct
EventList
{
constexpr
static
size_t
kMB
=
1024
*
1024
;
constexpr
static
size_t
kEventBlockSize
=
16
*
kMB
;
constexpr
static
size_t
kEventSize
=
sizeof
(
T
);
constexpr
static
size_t
kEventAlign
=
alignof
(
T
);
constexpr
static
size_t
kNumBlock
=
kEventBlockSize
/
((
kEventSize
+
kEventAlign
-
1
)
/
kEventAlign
*
kEventAlign
);
template
<
typename
...
Args
>
T
*
Record
(
Args
&&
...
args
)
{
if
(
event_blocks
.
empty
()
||
event_blocks
.
front
().
size
()
==
kNumBlock
)
{
event_blocks
.
emplace_front
();
event_blocks
.
front
().
reserve
(
kNumBlock
);
}
event_blocks
.
front
().
emplace_back
(
std
::
forward
<
Args
>
(
args
)...);
return
&
event_blocks
.
front
().
back
();
}
std
::
vector
<
T
>
Reduce
()
{
std
::
vector
<
T
>
result
;
for
(
auto
&
block
:
event_blocks
)
{
result
.
insert
(
result
.
begin
(),
std
::
make_move_iterator
(
block
.
begin
()),
std
::
make_move_iterator
(
block
.
end
()));
}
event_blocks
.
clear
();
return
result
;
}
void
Clear
()
{
event_blocks
.
clear
();
}
std
::
forward_list
<
std
::
vector
<
T
>>
event_blocks
;
};
// Enable the profiling function.
void
EnableProfiler
(
ProfilerState
state
);
...
...
paddle/fluid/platform/profiler.proto
浏览文件 @
b2898c0f
...
...
@@ -34,8 +34,25 @@ message Event {
optional
string
detail_info
=
9
;
}
message
MemEvent
{
enum
Place
{
CUDAPlace
=
0
;
CPUPlace
=
1
;
CUDAPinnedPlace
=
2
;
}
optional
uint64
start_ns
=
1
;
optional
uint64
end_ns
=
2
;
optional
uint64
bytes
=
3
;
optional
Place
place
=
4
;
optional
uint64
thread_id
=
5
;
optional
uint32
device_id
=
6
;
optional
string
alloc_in
=
7
;
optional
string
free_in
=
8
;
}
message
Profile
{
repeated
Event
events
=
1
;
optional
uint64
start_ns
=
2
;
optional
uint64
end_ns
=
3
;
repeated
MemEvent
mem_events
=
4
;
}
\ No newline at end of file
paddle/fluid/pybind/pybind.cc
浏览文件 @
b2898c0f
...
...
@@ -94,6 +94,14 @@ bool IsCompiledWithMKLDNN() {
#endif
}
bool
IsCompiledWithNGRAPH
()
{
#ifndef PADDLE_WITH_NGRAPH
return
false
;
#else
return
true
;
#endif
}
bool
IsCompiledWithBrpc
()
{
#ifndef PADDLE_WITH_DISTRIBUTE
return
false
;
...
...
@@ -874,6 +882,7 @@ All parameter, weight, gradient are variables in Paddle.
m
.
def
(
"init_devices"
,
[](
bool
init_p2p
)
{
framework
::
InitDevices
(
init_p2p
);
});
m
.
def
(
"is_compiled_with_ngraph"
,
IsCompiledWithNGRAPH
);
m
.
def
(
"is_compiled_with_cuda"
,
IsCompiledWithCUDA
);
m
.
def
(
"is_compiled_with_mkldnn"
,
IsCompiledWithMKLDNN
);
m
.
def
(
"is_compiled_with_brpc"
,
IsCompiledWithBrpc
);
...
...
@@ -1242,7 +1251,7 @@ All parameter, weight, gradient are variables in Paddle.
cannot be updated after being finalized.)DOC"
);
pe
.
def
(
py
::
init
<
const
std
::
vector
<
platform
::
Place
>
&
,
const
std
::
unordered_set
<
std
::
string
>
&
,
const
std
::
string
&
,
const
std
::
vector
<
std
::
string
>
&
,
const
std
::
string
&
,
Scope
*
,
std
::
vector
<
Scope
*>
&
,
const
ExecutionStrategy
&
,
const
BuildStrategy
&
,
ir
::
Graph
*>
())
// NOTE: even we return a vec<Scope*>* to Python use reference policy.
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
b2898c0f
...
...
@@ -455,7 +455,11 @@ function assert_api_spec_approvals() {
# NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
if
[
"
$API_FILE
"
==
"paddle/fluid/API.spec"
]
;
then
APPROVALS
=
`
curl
-H
"Authorization: token
${
GITHUB_API_TOKEN
}
"
https://api.github.com/repos/PaddlePaddle/Paddle/pulls/
${
GIT_PR_ID
}
/reviews?per_page
=
10000 |
\
python
${
PADDLE_ROOT
}
/tools/check_pr_approval.py 2 2887803 35982308
`
python
${
PADDLE_ROOT
}
/tools/check_pr_approval.py 2 2887803 35982308 46782768 30176695
`
if
[
"
${
APPROVALS
}
"
==
"TRUE"
]
;
then
APPROVALS
=
`
curl
-H
"Authorization: token
${
GITHUB_API_TOKEN
}
"
https://api.github.com/repos/PaddlePaddle/Paddle/pulls/
${
GIT_PR_ID
}
/reviews?per_page
=
10000 |
\
python
${
PADDLE_ROOT
}
/tools/check_pr_approval.py 1 35982308
`
fi
else
APPROVALS
=
`
curl
-H
"Authorization: token
${
GITHUB_API_TOKEN
}
"
https://api.github.com/repos/PaddlePaddle/Paddle/pulls/
${
GIT_PR_ID
}
/reviews?per_page
=
10000 |
\
python
${
PADDLE_ROOT
}
/tools/check_pr_approval.py 1 2887803
`
...
...
@@ -463,7 +467,7 @@ function assert_api_spec_approvals() {
echo
"current pr
${
GIT_PR_ID
}
got approvals:
${
APPROVALS
}
"
if
[
"
${
APPROVALS
}
"
==
"FALSE"
]
;
then
if
[
"
$API_FILE
"
==
"paddle/fluid/API.spec"
]
;
then
echo
"You must have
panyx0718 and shanyi15
approval for the api change!
${
API_FILE
}
"
echo
"You must have
one RD (panyx0718 or chengduoZH or XiaoguangHu01) and one PM (shanyi15)
approval for the api change!
${
API_FILE
}
"
else
echo
"You must have panyx0718 approval for the api change!
${
API_FILE
}
"
fi
...
...
python/paddle/fluid/__init__.py
浏览文件 @
b2898c0f
...
...
@@ -125,7 +125,7 @@ def __bootstrap__():
os
.
environ
[
'OMP_NUM_THREADS'
]
=
str
(
num_threads
)
sysstr
=
platform
.
system
()
read_env_flags
=
[
'check_nan_inf'
,
'benchmark'
,
'eager_delete_scope'
,
'use_ngraph'
,
'check_nan_inf'
,
'benchmark'
,
'eager_delete_scope'
,
'initial_cpu_memory_in_mb'
,
'init_allocated_mem'
,
'free_idle_memory'
,
'paddle_num_threads'
,
"dist_threadpool_size"
,
'eager_delete_tensor_gb'
,
'fast_eager_deletion_mode'
,
'memory_fraction_of_eager_deletion'
,
...
...
@@ -143,6 +143,9 @@ def __bootstrap__():
if
core
.
is_compiled_with_mkldnn
():
read_env_flags
.
append
(
'use_mkldnn'
)
if
core
.
is_compiled_with_ngraph
():
read_env_flags
.
append
(
'use_ngraph'
)
if
core
.
is_compiled_with_dist
():
read_env_flags
.
append
(
'rpc_deadline'
)
read_env_flags
.
append
(
'rpc_server_profile_path'
)
...
...
python/paddle/fluid/compiler.py
浏览文件 @
b2898c0f
...
...
@@ -230,13 +230,17 @@ class CompiledProgram(object):
self
.
_persistable_vars
.
append
(
cpt
.
to_text
(
node
.
name
()))
places
=
list
(
map
(
_place_obj
,
self
.
_places
))
return
core
.
ParallelExecutor
(
places
,
set
(
self
.
_persistable_vars
),
cpt
.
to_text
(
self
.
_loss_name
)
if
self
.
_loss_name
else
six
.
u
(
''
),
scope
,
self
.
_local_scopes
,
self
.
_exec_strategy
,
self
.
_build_strategy
,
self
.
_graph
)
# ParallelExecutor would broadcast all the parameters during initializing.
# The parameters of each process should be in the same ordered for the data-parallelism
# distributed training to keep the broadcast correct.
self
.
_persistable_vars
=
list
(
set
(
self
.
_persistable_vars
))
self
.
_persistable_vars
.
sort
()
return
core
.
ParallelExecutor
(
places
,
self
.
_persistable_vars
,
cpt
.
to_text
(
self
.
_loss_name
)
if
self
.
_loss_name
else
six
.
u
(
''
),
self
.
_scope
,
self
.
_local_scopes
,
self
.
_exec_strategy
,
self
.
_build_strategy
,
self
.
_graph
)
def
_compile_inference
(
self
):
return
core
.
create_paddle_predictor
(
self
.
_infer_config
)
...
...
python/paddle/fluid/layers/ops.py
浏览文件 @
b2898c0f
...
...
@@ -23,6 +23,7 @@ __activations_noattr__ = [
'logsigmoid'
,
'exp'
,
'tanh'
,
'atan'
,
'tanh_shrink'
,
'softshrink'
,
'sqrt'
,
...
...
@@ -30,6 +31,8 @@ __activations_noattr__ = [
'ceil'
,
'floor'
,
'cos'
,
'acos'
,
'asin'
,
'sin'
,
'round'
,
'reciprocal'
,
...
...
python/paddle/fluid/tests/unittests/test_activation_op.py
浏览文件 @
b2898c0f
...
...
@@ -100,6 +100,23 @@ class TestTanh(TestActivation):
self
.
check_grad
([
'X'
],
'Out'
,
max_relative_error
=
0.007
)
class
TestAtan
(
TestActivation
):
def
setUp
(
self
):
self
.
op_type
=
"atan"
self
.
init_dtype
()
x
=
np
.
random
.
uniform
(
0.1
,
1
,
[
11
,
17
]).
astype
(
self
.
dtype
)
out
=
np
.
arctan
(
x
)
self
.
inputs
=
{
'X'
:
OpTest
.
np_dtype_to_fluid_dtype
(
x
)}
self
.
outputs
=
{
'Out'
:
out
}
def
test_check_grad
(
self
):
if
self
.
dtype
==
np
.
float16
:
return
self
.
check_grad
([
'X'
],
'Out'
,
max_relative_error
=
0.007
)
class
TestTanhShrink
(
TestActivation
):
def
setUp
(
self
):
self
.
op_type
=
"tanh_shrink"
...
...
@@ -248,6 +265,23 @@ class TestCos(TestActivation):
self
.
check_grad
([
'X'
],
'Out'
,
max_relative_error
=
0.007
)
class
TestAcos
(
TestActivation
):
def
setUp
(
self
):
self
.
op_type
=
"acos"
self
.
init_dtype
()
x
=
np
.
random
.
uniform
(
-
1
,
1
,
[
4
,
4
]).
astype
(
self
.
dtype
)
out
=
np
.
arccos
(
x
)
self
.
inputs
=
{
'X'
:
OpTest
.
np_dtype_to_fluid_dtype
(
x
)}
self
.
outputs
=
{
'Out'
:
out
}
def
test_check_grad
(
self
):
if
self
.
dtype
==
np
.
float16
:
return
self
.
check_grad
([
'X'
],
'Out'
,
max_relative_error
=
0.007
)
class
TestSin
(
TestActivation
):
def
setUp
(
self
):
self
.
op_type
=
"sin"
...
...
@@ -265,6 +299,23 @@ class TestSin(TestActivation):
self
.
check_grad
([
'X'
],
'Out'
,
max_relative_error
=
0.007
)
class
TestAsin
(
TestActivation
):
def
setUp
(
self
):
self
.
op_type
=
"asin"
self
.
init_dtype
()
x
=
np
.
random
.
uniform
(
-
1
,
1
,
[
4
,
4
]).
astype
(
self
.
dtype
)
out
=
np
.
arcsin
(
x
)
self
.
inputs
=
{
'X'
:
OpTest
.
np_dtype_to_fluid_dtype
(
x
)}
self
.
outputs
=
{
'Out'
:
out
}
def
test_check_grad
(
self
):
if
self
.
dtype
==
np
.
float16
:
return
self
.
check_grad
([
'X'
],
'Out'
,
max_relative_error
=
0.007
)
class
TestRound
(
TestActivation
):
def
setUp
(
self
):
self
.
op_type
=
"round"
...
...
@@ -665,7 +716,10 @@ create_test_act_fp16_class(TestAbs)
create_test_act_fp16_class
(
TestCeil
,
grad_check
=
False
)
create_test_act_fp16_class
(
TestFloor
,
grad_check
=
False
)
create_test_act_fp16_class
(
TestCos
,
grad_atol
=
0.85
)
create_test_act_fp16_class
(
TestAcos
,
grad_atol
=
0.85
)
create_test_act_fp16_class
(
TestSin
)
create_test_act_fp16_class
(
TestAsin
)
create_test_act_fp16_class
(
TestAtan
)
create_test_act_fp16_class
(
TestRound
,
grad_check
=
False
)
create_test_act_fp16_class
(
TestRelu
)
create_test_act_fp16_class
(
TestGelu
)
...
...
tools/print_signatures.py
浏览文件 @
b2898c0f
...
...
@@ -51,6 +51,8 @@ def visit_member(parent_name, member):
all
=
(
args
,
doc
)
member_dict
[
cur_name
]
=
all
except
TypeError
:
# special for PyBind method
if
cur_name
in
check_modules_list
:
return
member_dict
[
cur_name
]
=
" "
.
join
([
line
.
strip
()
for
line
in
pydoc
.
render_doc
(
member
).
split
(
'
\n
'
)
if
"->"
in
line
...
...
@@ -78,6 +80,7 @@ def visit_all_module(mod):
visit_member
(
mod
.
__name__
,
instance
)
check_modules_list
=
[
"paddle.reader.ComposeNotAligned.__init__"
]
modules
=
sys
.
argv
[
1
].
split
(
","
)
for
m
in
modules
:
visit_all_module
(
importlib
.
import_module
(
m
))
...
...
tools/timeline.py
浏览文件 @
b2898c0f
...
...
@@ -95,6 +95,22 @@ class _ChromeTraceFormatter(object):
event
[
'args'
]
=
args
self
.
_events
.
append
(
event
)
def
emit_counter
(
self
,
category
,
name
,
pid
,
timestamp
,
counter
,
value
):
"""Emits a record for a single counter.
Args:
category: The event category as string
name: The event name as string
pid: Identifier of the process generating this event as integer
timestamp: The timestamps of this event as long integer
counter: Name of the counter as string
value: Value of the counter as integer
tid: Thread id of the allocation as integer
"""
event
=
self
.
_create_event
(
'C'
,
category
,
name
,
pid
,
0
,
timestamp
)
event
[
'args'
]
=
{
counter
:
value
}
self
.
_events
.
append
(
event
)
def
format_to_string
(
self
,
pretty
=
False
):
"""Formats the chrome trace to a string.
...
...
@@ -117,6 +133,7 @@ class Timeline(object):
self
.
_profile_dict
=
profile_dict
self
.
_pid
=
0
self
.
_devices
=
dict
()
self
.
_mem_devices
=
dict
()
self
.
_chrome_trace
=
_ChromeTraceFormatter
()
def
_allocate_pid
(
self
):
...
...
@@ -143,6 +160,45 @@ class Timeline(object):
self
.
_devices
[(
k
,
event
.
device_id
,
"GPUKernel"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"%s:gpu:%d"
%
(
k
,
event
.
device_id
),
pid
)
for
mevent
in
profile_pb
.
mem_events
:
if
mevent
.
place
==
profiler_pb2
.
MemEvent
.
CUDAPlace
:
if
(
k
,
mevent
.
device_id
,
"GPU"
)
not
in
self
.
_mem_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_mem_devices
[(
k
,
mevent
.
device_id
,
"GPU"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"memory usage on %s:gpu:%d"
%
(
k
,
mevent
.
device_id
),
pid
)
elif
mevent
.
place
==
profiler_pb2
.
MemEvent
.
CPUPlace
:
if
(
k
,
mevent
.
device_id
,
"CPU"
)
not
in
self
.
_mem_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_mem_devices
[(
k
,
mevent
.
device_id
,
"CPU"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"memory usage on %s:cpu:%d"
%
(
k
,
mevent
.
device_id
),
pid
)
elif
mevent
.
place
==
profiler_pb2
.
MemEvent
.
CUDAPinnedPlace
:
if
(
k
,
mevent
.
device_id
,
"CUDAPinnedPlace"
)
not
in
self
.
_mem_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_mem_devices
[(
k
,
mevent
.
device_id
,
"CUDAPinnedPlace"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"memory usage on %s:cudapinnedplace:%d"
%
(
k
,
mevent
.
device_id
),
pid
)
if
(
k
,
0
,
"CPU"
)
not
in
self
.
_mem_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_mem_devices
[(
k
,
0
,
"CPU"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"memory usage on %s:cpu:%d"
%
(
k
,
0
),
pid
)
if
(
k
,
0
,
"GPU"
)
not
in
self
.
_mem_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_mem_devices
[(
k
,
0
,
"GPU"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"memory usage on %s:gpu:%d"
%
(
k
,
0
),
pid
)
if
(
k
,
0
,
"CUDAPinnedPlace"
)
not
in
self
.
_mem_devices
:
pid
=
self
.
_allocate_pid
()
self
.
_mem_devices
[(
k
,
0
,
"CUDAPinnedPlace"
)]
=
pid
self
.
_chrome_trace
.
emit_pid
(
"memory usage on %s:cudapinnedplace:%d"
%
(
k
,
0
),
pid
)
def
_allocate_events
(
self
):
for
k
,
profile_pb
in
six
.
iteritems
(
self
.
_profile_dict
):
...
...
@@ -163,9 +219,57 @@ class Timeline(object):
event
.
start_ns
,
(
event
.
end_ns
-
event
.
start_ns
)
/
1.0
,
pid
,
event
.
sub_device_id
,
'Op'
,
event
.
name
,
args
)
def
_allocate_memory_event
(
self
):
place_to_str
=
{
profiler_pb2
.
MemEvent
.
CPUPlace
:
"CPU"
,
profiler_pb2
.
MemEvent
.
CUDAPlace
:
"GPU"
,
profiler_pb2
.
MemEvent
.
CUDAPinnedPlace
:
"CUDAPinnedPlace"
}
for
k
,
profile_pb
in
six
.
iteritems
(
self
.
_profile_dict
):
mem_list
=
[]
end_profiler
=
0
for
mevent
in
profile_pb
.
mem_events
:
crt_info
=
dict
()
crt_info
[
'time'
]
=
mevent
.
start_ns
crt_info
[
'size'
]
=
mevent
.
bytes
if
mevent
.
place
in
place_to_str
:
place
=
place_to_str
[
mevent
.
place
]
else
:
place
=
"UnDefine"
crt_info
[
'place'
]
=
place
pid
=
self
.
_mem_devices
[(
k
,
mevent
.
device_id
,
place
)]
crt_info
[
'pid'
]
=
pid
crt_info
[
'thread_id'
]
=
mevent
.
thread_id
crt_info
[
'device_id'
]
=
mevent
.
device_id
mem_list
.
append
(
crt_info
)
crt_info
=
dict
()
crt_info
[
'place'
]
=
place
crt_info
[
'pid'
]
=
pid
crt_info
[
'thread_id'
]
=
mevent
.
thread_id
crt_info
[
'device_id'
]
=
mevent
.
device_id
crt_info
[
'time'
]
=
mevent
.
end_ns
crt_info
[
'size'
]
=
-
mevent
.
bytes
mem_list
.
append
(
crt_info
)
end_profiler
=
max
(
end_profiler
,
crt_info
[
'time'
])
mem_list
.
sort
(
key
=
lambda
tmp
:
(
tmp
.
get
(
'time'
,
0
)))
i
=
0
total_size
=
0
while
i
<
len
(
mem_list
):
total_size
+=
mem_list
[
i
][
'size'
]
while
i
<
len
(
mem_list
)
-
1
and
mem_list
[
i
][
'time'
]
==
mem_list
[
i
+
1
][
'time'
]:
total_size
+=
mem_list
[
i
+
1
][
'size'
]
i
+=
1
self
.
_chrome_trace
.
emit_counter
(
"Memory"
,
"Memory"
,
mem_list
[
i
][
'pid'
],
mem_list
[
i
][
'time'
],
0
,
total_size
)
i
+=
1
def
generate_chrome_trace
(
self
):
self
.
_allocate_pids
()
self
.
_allocate_events
()
self
.
_allocate_memory_event
()
return
self
.
_chrome_trace
.
format_to_string
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录