Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
309b37e5
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
309b37e5
编写于
7月 05, 2017
作者:
S
Superjom
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of github.com:PaddlePaddle/Paddle into network
上级
5cf9a23f
98378968
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
0 addition
and
1157 deletion
+0
-1157
paddle/parameter/tests/test_common.cpp
paddle/parameter/tests/test_common.cpp
+0
-50
paddle/pserver/ParameterServer2.cpp
paddle/pserver/ParameterServer2.cpp
+0
-215
paddle/pserver/ParameterServer2.h
paddle/pserver/ParameterServer2.h
+0
-49
paddle/utils/BarrierStat.cpp
paddle/utils/BarrierStat.cpp
+0
-340
paddle/utils/BarrierStat.h
paddle/utils/BarrierStat.h
+0
-425
paddle/utils/Stat.cpp
paddle/utils/Stat.cpp
+0
-61
paddle/utils/Stat.h
paddle/utils/Stat.h
+0
-17
未找到文件。
paddle/parameter/tests/test_common.cpp
浏览文件 @
309b37e5
...
@@ -172,53 +172,3 @@ TEST_F(CommonTest, syncThreadPool) {
...
@@ -172,53 +172,3 @@ TEST_F(CommonTest, syncThreadPool) {
EXPECT_EQ
((
int
)
0
,
nums
[
i
]);
EXPECT_EQ
((
int
)
0
,
nums
[
i
]);
}
}
}
}
// Drives the barrier statistics in globalStat from a pool of threads:
// two phases where every thread sleeps a random time before reporting,
// then one phase where thread `tid` sleeps exactly tid * 10ms so the gap
// between arrivals is known in advance.
TEST_F(CommonTest, barrierStat) {
  const int threadNum = 10;
  SyncThreadPool pool(threadNum);

// Each pool thread sleeps up to 100ms (seeded from the current time) and
// then registers itself on the named barrier probe.
#define TEST_BARRIER_RANDOM(statName, numConnThreads, ...)       \
  pool.exec([&](int tid, size_t numThreads) {                    \
    struct timeval time;                                         \
    gettimeofday(&time, nullptr);                                \
    uint64_t usec = timeToMicroSecond(time);                     \
    std::srand(usec);                                            \
    auto value = std::rand() % 100000;                           \
    usleep(value);                                               \
    REGISTER_SLOW_NODES_PROBE(                                   \
        globalStat, statName, numConnThreads, tid, __VA_ARGS__); \
  });

  // Phase 1: random arrival order, no extra tag.
  for (int round = 0; round < 10; ++round) {
    TEST_BARRIER_RANDOM("synThreadBarrier1", threadNum);
    TEST_BARRIER_RANDOM("synThreadBarrier2", threadNum);
  }
  globalStat.printAllStatus();
  globalStat.reset();

  // Phase 2: random arrival order, tagged probes.
  for (int round = 0; round < 10; ++round) {
    TEST_BARRIER_RANDOM("synThreadBarrier3", threadNum, "tag0");
    TEST_BARRIER_RANDOM("synThreadBarrier4", threadNum, "tag1");
  }
  globalStat.printAllStatus();
  globalStat.reset();

// use it to test accurate barrier gap
#define TEST_BARRIER(statName, numConnThreads, ...)              \
  pool.exec([&](int tid, size_t numThreads) {                    \
    usleep(tid * 10000);                                         \
    REGISTER_SLOW_NODES_PROBE(                                   \
        globalStat, statName, numConnThreads, tid, __VA_ARGS__); \
  });

  // Phase 3: deterministic arrival order (thread tid waits tid * 10ms).
  for (int round = 0; round < 10; ++round) {
    TEST_BARRIER("synThreadBarrier3", threadNum, "tag0");
    TEST_BARRIER("synThreadBarrier4", threadNum, "tag1");
  }
  globalStat.printAllStatus();
  globalStat.reset();
}
paddle/pserver/ParameterServer2.cpp
浏览文件 @
309b37e5
...
@@ -217,10 +217,6 @@ void ParameterServer2::setConfig(const SetConfigRequest& request,
...
@@ -217,10 +217,6 @@ void ParameterServer2::setConfig(const SetConfigRequest& request,
SetConfigResponse
response
;
SetConfigResponse
response
;
callback
(
response
);
callback
(
response
);
/// always defined, barrier slowest node function need it.
statSet_
.
reset
(
new
StatSet
(
"ParameterServer"
+
str
::
to_string
(
static_cast
<
int
>
(
serverId_
))));
}
}
real
bufferSum
(
const
std
::
vector
<
ParameterServer2
::
Buffer
>&
buffers
)
{
real
bufferSum
(
const
std
::
vector
<
ParameterServer2
::
Buffer
>&
buffers
)
{
...
@@ -369,50 +365,7 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
...
@@ -369,50 +365,7 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
std
::
vector
<
Buffer
>*
outputBuffers
)
{
std
::
vector
<
Buffer
>*
outputBuffers
)
{
VLOG
(
1
)
<<
"pserver: addGradient"
;
VLOG
(
1
)
<<
"pserver: addGradient"
;
// forwardbackward delta from all trainers
// indicate the fluctuation caused by forwardbackward.
if
(
!
numPassFinishClients_
)
{
REGISTER_BARRIER_DELTA_SERVER_SET
(
*
statSet_
,
"forwardbackwardDelta"
,
FLAGS_num_gradient_servers
,
request
.
trainer_id
(),
request
.
forwardbackward_time
(),
isSparseServer_
?
"_sparseUpdater"
:
"_denseUpdater"
);
}
{
{
/// approximately pure network overhead
REGISTER_TIMER_DYNAMIC_SET
(
"pushRecv"
,
timeToMicroSecond
(
*
handleRequestBegin_
),
-
1
,
*
statSet_
);
}
#ifndef PADDLE_DISABLE_TIMER
gettimeofday
(
&
(
*
addGradBegin_
),
nullptr
);
#endif
/// barrier fluctuation caused by network and previous forwardbackward
if
(
!
numPassFinishClients_
)
{
REGISTER_BARRIER_TIMER_SERVER_SET
(
*
statSet_
,
"handleReqBegin"
,
FLAGS_num_gradient_servers
,
request
.
trainer_id
(),
(
*
handleRequestBegin_
),
isSparseServer_
?
"_sparseUpdater"
:
"_denseUpdater"
);
}
if
(
!
numPassFinishClients_
)
{
REGISTER_BARRIER_TIMER_SERVER
(
*
statSet_
,
"addGradBegin"
,
FLAGS_num_gradient_servers
,
request
.
trainer_id
(),
isSparseServer_
?
"_sparseUpdater"
:
"_denseUpdater"
);
}
{
REGISTER_TIMER_DYNAMIC
(
"addGradCore"
,
-
1
,
*
statSet_
);
ReadLockGuard
guard
(
parameterMutex_
);
ReadLockGuard
guard
(
parameterMutex_
);
int
bufferIndex
=
0
;
int
bufferIndex
=
0
;
for
(
const
auto
&
block
:
request
.
blocks
())
{
for
(
const
auto
&
block
:
request
.
blocks
())
{
...
@@ -444,15 +397,6 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
...
@@ -444,15 +397,6 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
std
::
lock_guard
<
std
::
mutex
>
guard
(
*
info
.
lock
);
std
::
lock_guard
<
std
::
mutex
>
guard
(
*
info
.
lock
);
simd
::
addTo
(
gradientSumBuffer
,
gradientBuffer
,
size
);
simd
::
addTo
(
gradientSumBuffer
,
gradientBuffer
,
size
);
}
}
if
(
!
numPassFinishClients_
)
{
REGISTER_BARRIER_TIMER_SERVER
(
*
statSet_
,
"addGradCoreFinish"
,
FLAGS_num_gradient_servers
,
request
.
trainer_id
(),
isSparseServer_
?
"_sparseUpdater"
:
"_denseUpdater"
);
}
}
}
if
(
request
.
batch_status
()
==
BATCH_FINISH
||
if
(
request
.
batch_status
()
==
BATCH_FINISH
||
request
.
batch_status
()
==
BATCH_START_AND_FINISH
)
{
request
.
batch_status
()
==
BATCH_START_AND_FINISH
)
{
...
@@ -461,47 +405,12 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
...
@@ -461,47 +405,12 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
VLOG
(
1
)
<<
"num samples: "
<<
numSamplesProcessed_
VLOG
(
1
)
<<
"num samples: "
<<
numSamplesProcessed_
<<
", new cost:"
<<
cost_
;
<<
", new cost:"
<<
cost_
;
/// numPassFinishClients_ means some trainer has entered finishPass
if
(
!
numPassFinishClients_
)
{
REGISTER_SLOW_NODES_PROBE
(
*
statSet_
,
"SLOW_NODES"
,
FLAGS_num_gradient_servers
,
request
.
trainer_id
(),
isSparseServer_
?
"_sparseUpdater"
:
"_denseUpdater"
);
}
/// notify doOperation gradient ready
/// notify doOperation gradient ready
gradientReadyBarrier_
.
wait
();
gradientReadyBarrier_
.
wait
();
/// if wait pass finish does not start, do check
if
(
!
numPassFinishClients_
)
{
CHECK_BARRIER_TIMER
(
*
statSet_
,
"SLOW_NODES"
,
FLAGS_num_gradient_servers
,
isSparseServer_
?
"_sparseUpdater"
:
"_denseUpdater"
);
}
/// barrier performance while all parameter add is finished
/// can indicate the fluctation caused by computation at pserver.
if
(
!
numPassFinishClients_
)
{
REGISTER_BARRIER_TIMER_SERVER
(
*
statSet_
,
"paraReady"
,
FLAGS_num_gradient_servers
,
request
.
trainer_id
(),
isSparseServer_
?
"_sparseUpdater"
:
"_denseUpdater"
);
}
/// wait doOperation finish
/// wait doOperation finish
parameterReadyBarrier_
.
wait
();
parameterReadyBarrier_
.
wait
();
VLOG
(
1
)
<<
"start send back"
;
VLOG
(
1
)
<<
"start send back"
;
{
/// total time except overhead of network.
REGISTER_TIMER_DYNAMIC_SET
(
"sendParaNoRecvNoSend"
,
timeToMicroSecond
(
*
addGradBegin_
),
-
1
,
*
statSet_
);
}
}
}
}
}
...
@@ -543,57 +452,6 @@ bool ParameterServer2::asyncGrdientCommitCheckAndStat(
...
@@ -543,57 +452,6 @@ bool ParameterServer2::asyncGrdientCommitCheckAndStat(
return
commitGradient
;
return
commitGradient
;
}
}
void
ParameterServer2
::
printAsyncGradientCommitStatAndReset
()
{
std
::
stringstream
statFormat
;
if
(
asyncUpdateSteps_
)
{
statFormat
<<
"async discard gradients stat: "
<<
std
::
endl
;
statFormat
<<
"serverId: "
<<
serverId_
<<
" serverType: "
<<
isSparseServer_
<<
" total updates: "
<<
asyncUpdateSteps_
<<
" discard updates: "
<<
asyncLaggedGradientsNum_
<<
" discard ratio: "
<<
(
real
)
asyncLaggedGradientsNum_
/
(
real
)
asyncUpdateSteps_
;
statFormat
<<
std
::
endl
;
statFormat
<<
std
::
endl
;
statFormat
<<
"Async Gradient Update Steps distribution: "
<<
std
::
endl
<<
"Sample: 1:1912(0.00284449) means "
<<
"the updates step=1 count 1912 times "
<<
"and account for 0.284449% of total updates"
<<
std
::
endl
;
size_t
index
=
0
;
for
(
const
auto
&
stat
:
asyncUpdateStat_
)
{
statFormat
<<
index
<<
":"
<<
stat
<<
"("
<<
(
real
)
stat
/
(
real
)
asyncUpdateSteps_
<<
") "
;
index
++
;
}
statFormat
<<
std
::
endl
;
statFormat
<<
std
::
endl
;
statFormat
<<
"Async Gradient Discard based on trainer_id: "
<<
std
::
endl
<<
"Sample: 2:22(0.0016363) means "
<<
"total discarded updates from trainer_id=2 count 22 "
<<
"and account for 0.16363% of all updates from trainer_id=2"
<<
std
::
endl
;
for
(
auto
i
=
0
;
i
<
FLAGS_num_gradient_servers
;
i
++
)
{
real
ratio
=
(
real
)
asyncTrainerDiscardStat_
[
i
]
/
(
real
)(
asyncTrainerCommitStat_
[
i
]
+
asyncTrainerDiscardStat_
[
i
]);
statFormat
<<
i
<<
":"
<<
asyncTrainerDiscardStat_
[
i
]
<<
"("
<<
ratio
<<
")"
<<
" "
;
}
LOG
(
INFO
)
<<
statFormat
.
str
();
/// reset stat
asyncUpdateSteps_
=
0
;
asyncTrainerSteps_
.
assign
(
asyncTrainerSteps_
.
size
(),
0
);
asyncLaggedGradientsNum_
=
0
;
asyncUpdateStat_
.
assign
(
asyncUpdateStat_
.
size
(),
0
);
asyncTrainerDiscardStat_
.
assign
(
asyncTrainerDiscardStat_
.
size
(),
0
);
asyncTrainerCommitStat_
.
assign
(
asyncTrainerCommitStat_
.
size
(),
0
);
}
}
static
ThreadLocal
<
std
::
vector
<
bool
>>
localBlockBitset_
;
static
ThreadLocal
<
std
::
vector
<
bool
>>
localBlockBitset_
;
void
ParameterServer2
::
asyncSGD
(
const
SendParameterRequest
&
request
,
void
ParameterServer2
::
asyncSGD
(
const
SendParameterRequest
&
request
,
...
@@ -695,7 +553,6 @@ void ParameterServer2::asyncSGD(const SendParameterRequest& request,
...
@@ -695,7 +553,6 @@ void ParameterServer2::asyncSGD(const SendParameterRequest& request,
if
(
request
.
trainer_id
()
==
0
)
{
if
(
request
.
trainer_id
()
==
0
)
{
/// batchId_ is approximately equal to "real batchId_"
/// batchId_ is approximately equal to "real batchId_"
batchId_
++
;
batchId_
++
;
tuningAsyncsgdMidOutput
();
}
}
}
}
...
@@ -881,34 +738,6 @@ void ParameterServer2::sendParameter(const SendParameterRequest& request,
...
@@ -881,34 +738,6 @@ void ParameterServer2::sendParameter(const SendParameterRequest& request,
}
}
(
*
requestVec_
).
clear
();
(
*
requestVec_
).
clear
();
(
*
callbackVec_
).
clear
();
(
*
callbackVec_
).
clear
();
/// barrier perfromance while all data are send finished.
/// indicates network flucatuation for big message.
if
(
!
numPassFinishClients_
)
{
REGISTER_BARRIER_TIMER_SERVER
(
*
statSet_
,
"sendParamFinish"
,
FLAGS_num_gradient_servers
,
request
.
trainer_id
(),
isSparseServer_
?
"_sparseUpdater"
:
"_denseUpdater"
);
}
/// all time exhausted in parameterServer for big message.
/// it contains network and computation at pserver.
{
/// total time including overhead of network.
REGISTER_TIMER_DYNAMIC_SET
(
"sendParaTotal"
,
timeToMicroSecond
(
*
handleRequestBegin_
),
-
1
,
*
statSet_
);
}
/// all time exhausted in pserverServer except recieve network.
{
/// total time except overhead of network receive
REGISTER_TIMER_DYNAMIC_SET
(
"sendParaNoRecv"
,
timeToMicroSecond
(
*
addGradBegin_
),
-
1
,
*
statSet_
);
}
}
}
break
;
break
;
case
PSERVER_UPDATE_MODE_SET_PARAM
:
case
PSERVER_UPDATE_MODE_SET_PARAM
:
...
@@ -1088,8 +917,6 @@ void ParameterServer2::op_SGD(const Operation& operation,
...
@@ -1088,8 +917,6 @@ void ParameterServer2::op_SGD(const Operation& operation,
}
}
{
{
REGISTER_TIMER_DYNAMIC
(
"op_SGD"
,
-
1
,
*
statSet_
);
parallelExecForEachBlock
([
&
](
int64_t
blockId
,
const
VectorPtr
vecs
[])
{
parallelExecForEachBlock
([
&
](
int64_t
blockId
,
const
VectorPtr
vecs
[])
{
BlockInfo
&
info
=
blockInfos_
[
blockId
];
BlockInfo
&
info
=
blockInfos_
[
blockId
];
const
ParameterConfig
&
config
=
getParameterConfig
(
blockId
);
const
ParameterConfig
&
config
=
getParameterConfig
(
blockId
);
...
@@ -1113,7 +940,6 @@ void ParameterServer2::op_SGD(const Operation& operation,
...
@@ -1113,7 +940,6 @@ void ParameterServer2::op_SGD(const Operation& operation,
}
}
batchId_
++
;
batchId_
++
;
tuningSgdMidOutput
();
}
}
void
ParameterServer2
::
op_start_pass
(
const
Operation
&
operation
,
void
ParameterServer2
::
op_start_pass
(
const
Operation
&
operation
,
...
@@ -1146,8 +972,6 @@ void ParameterServer2::op_finish_pass(const Operation& operation,
...
@@ -1146,8 +972,6 @@ void ParameterServer2::op_finish_pass(const Operation& operation,
/// finish pass
/// finish pass
info
.
optimizer
->
finishPass
();
info
.
optimizer
->
finishPass
();
});
});
tuningSgdFinished
();
batchId_
=
0
;
batchId_
=
0
;
}
}
...
@@ -1515,7 +1339,6 @@ void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request,
...
@@ -1515,7 +1339,6 @@ void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request,
callback
(
SynchronizeResponse
());
callback
(
SynchronizeResponse
());
if
(
request
.
trainer_id
()
==
0
)
{
if
(
request
.
trainer_id
()
==
0
)
{
tuningAsyncsgdFinished
();
batchId_
=
0
;
batchId_
=
0
;
}
}
}
}
...
@@ -1574,42 +1397,4 @@ void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request,
...
@@ -1574,42 +1397,4 @@ void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request,
callback
(
response
);
callback
(
response
);
}
}
void
ParameterServer2
::
tuningSgdMidOutput
()
{
if
(
batchId_
&&
batchId_
%
FLAGS_log_period_server
==
0
)
{
LOG
(
INFO
)
<<
"======== Batch="
<<
batchId_
<<
"======="
;
statSet_
->
setThreadInfo
(
true
);
statSet_
->
printAllStatus
();
/// not reset raw data for reducing the overhead of performance tuning
statSet_
->
reset
(
false
);
}
}
/**
 * End-of-pass report: log a banner with the current batch id, print every
 * stat in statSet_ and then reset the set with its default arguments.
 */
void ParameterServer2::tuningSgdFinished() {
  LOG(INFO) << "======== Batch=" << batchId_ << " pass END"
            << "=======";

  statSet_->setThreadInfo(true);
  statSet_->printAllStatus();

  // reset raw data at end of pass since some raw data could be not
  // complete. Otherwise the raw data will pollute next pass performance
  // tuning
  statSet_->reset();
}
void
ParameterServer2
::
tuningAsyncsgdMidOutput
()
{
#ifndef PADDLE_DISABLE_TIMER
if
(
batchId_
&&
batchId_
%
FLAGS_log_period_server
==
0
)
{
LOG
(
INFO
)
<<
"======== [not accurate] Batch="
<<
batchId_
<<
"======="
;
printAsyncGradientCommitStatAndReset
();
}
#endif
}
/**
 * End-of-pass report for async-SGD: log a banner with the current batch id
 * and then print and reset the async gradient commit statistics.
 */
void ParameterServer2::tuningAsyncsgdFinished() {
  LOG(INFO) << "======== [not accurate] Batch=" << batchId_ << " pass END"
            << "=======";

  printAsyncGradientCommitStatAndReset();
}
}
// namespace paddle
}
// namespace paddle
paddle/pserver/ParameterServer2.h
浏览文件 @
309b37e5
...
@@ -298,24 +298,6 @@ protected:
...
@@ -298,24 +298,6 @@ protected:
/// barrier performance tuning sync-sgd required
/// barrier performance tuning sync-sgd required
std
::
atomic
<
int64_t
>
batchId_
;
std
::
atomic
<
int64_t
>
batchId_
;
/// the beginning of addGradient without network overhead
ThreadLocal
<
struct
timeval
>
addGradBegin_
;
/**
* tuning barrier performance
* to better control log for sparse and dense parameter,
* we use different log entities for different parameterServer
* objects.
* it will output lots of performance stats to perceive the
* overhead of network, fluctuation of computation from
* forwardbackward and network, computation from optimization
* at pserver end, barrier overhead, etc. to understand tuning
* data, focus on the synchronization between addGradient and
* doOperation which indirectly call op_SGD operation controlled
* by remote updater controller
*/
std
::
unique_ptr
<
StatSet
>
statSet_
;
public:
public:
struct
Buffer
{
struct
Buffer
{
real
*
base
;
real
*
base
;
...
@@ -325,7 +307,6 @@ public:
...
@@ -325,7 +307,6 @@ public:
protected:
protected:
/// async gradient commit control
/// async gradient commit control
bool
asyncGrdientCommitCheckAndStat
(
const
SendParameterRequest
&
request
);
bool
asyncGrdientCommitCheckAndStat
(
const
SendParameterRequest
&
request
);
void
printAsyncGradientCommitStatAndReset
();
public:
public:
/// disable default parameter for overloading
/// disable default parameter for overloading
...
@@ -710,36 +691,6 @@ public:
...
@@ -710,36 +691,6 @@ public:
void
op_load
(
const
Operation
&
operation
,
OperationResult
*
result
);
void
op_load
(
const
Operation
&
operation
,
OperationResult
*
result
);
void
op_save
(
const
Operation
&
operation
,
OperationResult
*
result
);
void
op_save
(
const
Operation
&
operation
,
OperationResult
*
result
);
/**
* @brief output log in at the middle stage of training
*
* @note flush log histroy and state at the end for sgd
*/
void
tuningSgdMidOutput
();
/**
* @brief output log in at the end stage of training
*
* @note flush log histroy and state at the end for sgd. it will also
* flush some stateful stat for next pass.
*/
void
tuningSgdFinished
();
/**
* @brief output log in at the middle stage of training
*
* @note flush log histroy and state at the end for async-sgd.
* it will log some performance log if some lagged node are found
*/
void
tuningAsyncsgdMidOutput
();
/**
* @brief output log in at the end stage of training
*
* @note flush log histroy and state at the end for async-sgd.
*/
void
tuningAsyncsgdFinished
();
};
};
}
// namespace paddle
}
// namespace paddle
paddle/utils/BarrierStat.cpp
已删除
100644 → 0
浏览文件 @
5cf9a23f
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/utils/BarrierStat.h"
#include <string.h>
#include <sys/types.h>
#include <algorithm>
#include <iomanip>
#include "paddle/utils/Flags.h"
#include "paddle/utils/Stat.h"
// Master switch: when false, streaming a barrier stat writes nothing.
DEFINE_bool(log_barrier_abstract,
            true,
            "if true, show abstract of barrier performance");

// Upper bound on the number of per-trainer rows in one barrier summary.
DEFINE_int32(log_barrier_lowest_nodes,
             5,
             "how many lowest node will be logged");

// for performance tuning insight: sets the warning-rate threshold to 0
DEFINE_bool(log_barrier_show_log,
            false,
            "if true, always show barrier abstract even with little gap");
namespace
paddle
{
/**
 * Stream the summary of a barrier stat.
 *
 * Writes nothing when FLAGS_log_barrier_abstract is off. Otherwise
 * stat.lock_ is held for the duration of stat.showAbstract(output).
 *
 * @return the same stream that was passed in.
 */
std::ostream &operator<<(std::ostream &output, const BarrierStatBase &stat) {
  if (!FLAGS_log_barrier_abstract) {
    return output;
  }

  std::lock_guard<std::mutex> guard(stat.lock_);
  stat.showAbstract(output);
  return output;
}
/**
 * @param numConnThreads number of participants per barrier round; also the
 *                       number of per-trainer entries allocated.
 * @param name           label printed at the top of the summary.
 *
 * Also fixes the warning-rate threshold used by showAbstract():
 * 0 when FLAGS_log_barrier_show_log is set, otherwise
 *   p = 1/n + (n/8)/(n+1), n = numConnThreads
 * which is the probability bound for an abnormal node; a summary is only
 * printed when the most frequent trainer reaches that rate.
 */
BarrierStatBase::BarrierStatBase(uint16_t numConnThreads,
                                 const std::string &name)
    : totSamples_(0), numConnThreads_(numConnThreads), name_(name) {
  abstract_.resize(numConnThreads_);

  const float n = static_cast<float>(numConnThreads);
  // The formula is only evaluated when the flag is off.
  rateThreshold_ =
      FLAGS_log_barrier_show_log ? 0.0 : 1.0 / n + (n / 8.0) / (n + 1.0);
}
/**
 * Create an end-time barrier stat for numConnThreads participants and log
 * the warning rate that was chosen by the base class.
 */
BarrierEndStat::BarrierEndStat(uint16_t numConnThreads,
                               const std::string &name)
    : BarrierStatBase(numConnThreads, name) {
  // One timestamp slot per participant.
  timeVector_.reset(new TimeVectorEnd(numConnThreads_));

  // Start from cleared counters and cleared raw samples.
  reset(true);

  LOG(INFO) << " create barrierEndStat: " << name
            << " endBarrier warning rate: " << rateThreshold_;
}
/*
* Note:
* the design different pserver entity owns different statSet to obey
* the background that different pserver runs separately.
*/
void
BarrierEndStat
::
updateStat
(
struct
timeval
&
cur
,
int32_t
trainerId
)
{
CHECK_LT
(
trainerId
,
numConnThreads_
)
<<
"trainerId is invalid in barrier"
;
std
::
lock_guard
<
std
::
mutex
>
guard
(
lock_
);
timeVector_
->
addTimeval
(
cur
,
trainerId
);
if
(
timeVector_
->
full
())
{
std
::
lock_guard
<
std
::
mutex
>
abstractGuard
(
abstractLock_
);
auto
id
=
timeVector_
->
getLastTrainerId
();
auto
delta
=
timeToMicroSecond
(
timeVector_
->
getDelta
());
auto
secondDelta
=
timeToMicroSecond
(
timeVector_
->
get1NDelta
());
auto
lastTwoDelta
=
timeToMicroSecond
(
timeVector_
->
getMinus1NDelta
());
auto
midDelta
=
timeToMicroSecond
(
timeVector_
->
getMidNDelta
());
// discard first sample, since first sample probably is abnormal.
if
(
totSamples_
)
{
abstract_
[
id
].
freq
++
;
if
(
delta
<
abstract_
[
id
].
minDelta
)
{
abstract_
[
id
].
minDelta
=
delta
;
}
if
(
delta
>
abstract_
[
id
].
maxDelta
)
{
abstract_
[
id
].
maxDelta
=
delta
;
}
abstract_
[
id
].
totDelta
+=
delta
;
abstract_
[
id
].
totSecondDelta
+=
secondDelta
;
abstract_
[
id
].
totLastTwoDelta
+=
lastTwoDelta
;
abstract_
[
id
].
totMidDelta
+=
midDelta
;
// update totAbstract_
totAbstract_
.
freq
++
;
if
(
delta
<
totAbstract_
.
minDelta
)
{
totAbstract_
.
minDelta
=
delta
;
}
if
(
delta
>
totAbstract_
.
maxDelta
)
{
totAbstract_
.
maxDelta
=
delta
;
}
totAbstract_
.
totDelta
+=
delta
;
totAbstract_
.
totSecondDelta
+=
secondDelta
;
totAbstract_
.
totLastTwoDelta
+=
lastTwoDelta
;
totAbstract_
.
totMidDelta
+=
midDelta
;
}
totSamples_
++
;
timeVector_
->
reset
();
}
}
void
BarrierEndStat
::
reset
(
bool
clearRawData
)
{
int32_t
i
=
0
;
totSamples_
=
0
;
std
::
lock_guard
<
std
::
mutex
>
guard
(
abstractLock_
);
if
(
clearRawData
)
{
timeVector_
->
reset
();
}
for
(
auto
&
abstract
:
abstract_
)
{
memset
((
void
*
)
&
abstract
,
0
,
sizeof
(
abstract
));
abstract
.
minDelta
=
UINT64_MAX
;
abstract
.
trainerId
=
i
++
;
}
memset
((
void
*
)
&
totAbstract_
,
0
,
sizeof
(
Abstract
));
totAbstract_
.
minDelta
=
UINT64_MAX
;
}
void
BarrierEndStat
::
showAbstract
(
std
::
ostream
&
output
)
const
{
// do not support the case "<=2 pserver"
if
(
numConnThreads_
<=
2
||
!
totSamples_
)
{
return
;
}
// duplicate freq info
std
::
vector
<
struct
Abstract
>
outputAbstract
=
abstract_
;
std
::
sort
(
outputAbstract
.
begin
(),
outputAbstract
.
end
(),
[](
const
struct
Abstract
&
a
,
const
struct
Abstract
&
b
)
{
return
a
.
freq
>
b
.
freq
;
});
auto
rate
=
(
float
)
outputAbstract
[
0
].
freq
/
(
float
)
totSamples_
;
if
(
rate
<
rateThreshold_
)
{
return
;
}
output
<<
std
::
setw
(
20
)
<<
name_
<<
std
::
endl
;
/*
* Note:
* avgGap: the average delta between 1 -- n arriving trainers
* avgSecondGap: the average delta between 2 -- n arriving trainers
* avgLastTwoGap: the average delta between n-1 -- n arriving trainers
* avgMidGap: the average delta between n/2 -- n arriving trainers
* rato: samples / totSamples
*
* the stat is based on per trainer if trainer_id is set, totAbstract is
* stat based on all trainers scope.
*/
output
<<
std
::
setw
(
42
)
<<
" "
<<
std
::
setw
(
15
)
<<
"trainerId"
<<
std
::
setw
(
15
)
<<
"avgGap"
<<
std
::
setw
(
15
)
<<
"avgSecondGap"
<<
std
::
setw
(
15
)
<<
"avgLastTwoGap"
<<
std
::
setw
(
15
)
<<
"avgMidGap"
<<
std
::
setw
(
10
)
<<
"rate"
<<
std
::
setw
(
10
)
<<
"samples"
<<
std
::
setw
(
10
)
<<
"totSamples"
<<
std
::
endl
;
// show totAbstract, it's valuable when lastTrainerId is even-distributed'
if
(
!
totAbstract_
.
freq
)
return
;
output
<<
std
::
setw
(
42
)
<<
" "
<<
std
::
setw
(
15
)
<<
"totAbstract"
<<
std
::
setw
(
15
)
<<
(
totAbstract_
.
totDelta
/
totAbstract_
.
freq
)
*
0.001
<<
std
::
setw
(
15
)
<<
(
totAbstract_
.
totSecondDelta
/
totAbstract_
.
freq
)
*
0.001
<<
std
::
setw
(
15
)
<<
(
totAbstract_
.
totLastTwoDelta
/
totAbstract_
.
freq
)
*
0.001
<<
std
::
setw
(
15
)
<<
(
totAbstract_
.
totMidDelta
/
totAbstract_
.
freq
)
*
0.001
<<
std
::
setw
(
10
)
<<
(
float
)
totAbstract_
.
freq
/
(
float
)
totSamples_
<<
std
::
setw
(
10
)
<<
(
float
)
totAbstract_
.
freq
<<
std
::
setw
(
10
)
<<
(
float
)
totSamples_
<<
std
::
endl
;
// show lastTrainerId abstract
int
count
=
0
;
for
(
auto
&
abstract
:
outputAbstract
)
{
if
(
!
abstract
.
freq
||
count
++
>=
FLAGS_log_barrier_lowest_nodes
)
{
break
;
}
// output format control
output
<<
std
::
setw
(
42
)
<<
" "
<<
std
::
setw
(
15
)
<<
abstract
.
trainerId
<<
std
::
setw
(
15
)
<<
(
abstract
.
totDelta
/
abstract
.
freq
)
*
0.001
<<
std
::
setw
(
15
)
<<
(
abstract
.
totSecondDelta
/
abstract
.
freq
)
*
0.001
<<
std
::
setw
(
15
)
<<
(
abstract
.
totLastTwoDelta
/
abstract
.
freq
)
*
0.001
<<
std
::
setw
(
15
)
<<
(
abstract
.
totMidDelta
/
abstract
.
freq
)
*
0.001
<<
std
::
setw
(
10
)
<<
(
float
)
abstract
.
freq
/
(
float
)
totSamples_
<<
std
::
setw
(
10
)
<<
(
float
)
abstract
.
freq
<<
std
::
setw
(
10
)
<<
(
float
)
totSamples_
<<
std
::
endl
;
}
}
/**
 * Create a delta-based barrier stat for numConnThreads participants and
 * log the warning rate that was chosen by the base class.
 */
BarrierDeltaStat::BarrierDeltaStat(uint16_t numConnThreads,
                                   const std::string &name)
    : BarrierStatBase(numConnThreads, name) {
  // One delta slot per participant.
  timeVector_.reset(new TimeVectorDelta(numConnThreads_));

  // Start from cleared counters and cleared raw samples.
  reset(true);

  LOG(INFO) << " create barrierDeltaStat: " << name
            << " barrierDelta warning rate: " << rateThreshold_;
}
void
BarrierDeltaStat
::
updateStat
(
uint64_t
delta
,
int32_t
trainerId
)
{
CHECK_LT
(
trainerId
,
numConnThreads_
)
<<
"trainerId is invalid in barrier"
;
std
::
lock_guard
<
std
::
mutex
>
guard
(
lock_
);
timeVector_
->
addTimeval
(
delta
,
trainerId
);
if
(
timeVector_
->
full
())
{
std
::
lock_guard
<
std
::
mutex
>
abstractGuard
(
abstractLock_
);
auto
id
=
timeVector_
->
getMaxTrainerId
();
auto
delta
=
timeVector_
->
getDelta
();
// discard first sample, since first sample probably is abnormal.
if
(
totSamples_
)
{
abstract_
[
id
].
freq
++
;
if
(
delta
<
abstract_
[
id
].
minDelta
)
{
abstract_
[
id
].
minDelta
=
delta
;
}
if
(
delta
>
abstract_
[
id
].
maxDelta
)
{
abstract_
[
id
].
maxDelta
=
delta
;
}
abstract_
[
id
].
totDelta
+=
delta
;
// update totAbstract_
totAbstract_
.
freq
++
;
if
(
delta
<
totAbstract_
.
minDelta
)
{
totAbstract_
.
minDelta
=
delta
;
}
if
(
delta
>
totAbstract_
.
maxDelta
)
{
totAbstract_
.
maxDelta
=
delta
;
}
totAbstract_
.
totDelta
+=
delta
;
}
totSamples_
++
;
timeVector_
->
reset
();
}
}
void
BarrierDeltaStat
::
reset
(
bool
clearRawData
)
{
int32_t
i
=
0
;
totSamples_
=
0
;
std
::
lock_guard
<
std
::
mutex
>
guard
(
abstractLock_
);
if
(
clearRawData
)
{
timeVector_
->
reset
();
}
for
(
auto
&
abstract
:
abstract_
)
{
memset
((
void
*
)
&
abstract
,
0
,
sizeof
(
abstract
));
abstract
.
minDelta
=
UINT64_MAX
;
abstract
.
trainerId
=
i
++
;
}
memset
((
void
*
)
&
totAbstract_
,
0
,
sizeof
(
Abstract
));
totAbstract_
.
minDelta
=
UINT64_MAX
;
}
/**
 * Write the delta barrier summary table to `output`.
 *
 * Nothing is written when there are two or fewer participants, when no
 * round has completed, or when the most frequent trainer stays below
 * rateThreshold_.
 *
 * Columns (values are printed multiplied by 0.001):
 *   Gap means the delta from all trainers' forwardbackward
 *   avgGap:    average Gap in log_period batches
 *   minGap:    min Gap in log_period batches
 *   maxGap:    max Gap in log_period batches
 *   trainerId: the slowest trainer_id
 *
 * The "totAbstract" row aggregates all trainers; the rows after it are the
 * most frequent trainers, at most FLAGS_log_barrier_lowest_nodes of them.
 */
void BarrierDeltaStat::showAbstract(std::ostream &output) const {
  // do not support the case "<=2 pserver"
  if (numConnThreads_ <= 2 || !totSamples_) {
    return;
  }

  // Sort a copy by descending frequency; abstract_ itself is not reordered.
  std::vector<struct Abstract> ranked = abstract_;
  std::sort(ranked.begin(),
            ranked.end(),
            [](const struct Abstract &lhs, const struct Abstract &rhs) {
              return lhs.freq > rhs.freq;
            });

  const float sampleCount = static_cast<float>(totSamples_);
  const float topRate = static_cast<float>(ranked[0].freq) / sampleCount;
  if (topRate < rateThreshold_) {
    return;
  }

  output << std::setw(20) << name_ << std::endl;

  // Table header.
  output << std::setw(42) << " " << std::setw(15) << "trainerId"
         << std::setw(15) << "avgGap" << std::setw(10) << "minGap"
         << std::setw(10) << "maxGap" << std::setw(10) << "rate"
         << std::setw(10) << "samples" << std::setw(10) << "totSamples"
         << std::endl;

  // show totAbstract, it's valuable when lastTrainerId is evenly distributed
  if (totAbstract_.freq == 0) {
    return;
  }

  // Everything to the right of the label column, shared by all rows.
  auto writeColumns = [&output, sampleCount](const struct Abstract &row) {
    output << std::setw(15) << (row.totDelta / row.freq) * 0.001
           << std::setw(10) << row.minDelta * 0.001
           << std::setw(10) << row.maxDelta * 0.001
           << std::setw(10) << static_cast<float>(row.freq) / sampleCount
           << std::setw(10) << static_cast<float>(row.freq)
           << std::setw(10) << sampleCount << std::endl;
  };

  output << std::setw(42) << " " << std::setw(15) << "totAbstract";
  writeColumns(totAbstract_);

  // show lastTrainerId abstract
  int shown = 0;
  for (const struct Abstract &row : ranked) {
    // Entries are sorted, so the first zero frequency ends the table.
    if (row.freq == 0 || shown >= FLAGS_log_barrier_lowest_nodes) {
      break;
    }
    ++shown;
    output << std::setw(42) << " " << std::setw(15) << row.trainerId;
    writeColumns(row);
  }
}
}
// namespace paddle
paddle/utils/BarrierStat.h
已删除
100644 → 0
浏览文件 @
5cf9a23f
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <sys/time.h>
#include <iostream>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include "Locks.h"
#include "Logging.h"
#include "ThreadLocal.h"
namespace
paddle
{
/**
 * Convert a timeval to a single microsecond count.
 *
 * The seconds are widened to 64 bits before the multiplication, so the
 * result does not depend on the width of `long` on the target platform.
 *
 * @param time value to convert (tv_sec seconds plus tv_usec microseconds).
 * @return tv_sec * 1,000,000 + tv_usec.
 */
inline uint64_t timeToMicroSecond(struct timeval time) {
  return static_cast<uint64_t>(time.tv_sec) * 1000000ULL +
         static_cast<uint64_t>(time.tv_usec);
}
/**
 * Helper for gathering barrier performance data that has "time point"
 * character: it stores, in arrival order, one timestamp and one trainer id
 * per participant of a barrier round. Used when looking for the slowest
 * node in sync-sgd training.
 *
 * The class does no locking of its own. The *Delta() getters read fixed
 * slots relative to size_, so they are only meaningful once full() is true.
 */
class TimeVectorEnd {
public:
  /// @param size number of samples that make up one complete round.
  explicit TimeVectorEnd(uint16_t size)
      : size_(size), index_(0), timeArray_(size), trainerIds_(size) {}

  /// Capacity of one round.
  uint16_t size() const { return size_; }
  /// True once size() samples have been added since the last reset().
  bool full() const { return index_ == size_; }
  /// True when no sample has been added since the last reset().
  bool empty() const { return index_ == 0; }
  /// Start a new round; stored values are simply overwritten later.
  void reset() { index_ = 0; }

  /**
   * Append one arrival. Adding to a full vector is a fatal error instead
   * of a write past the end of the arrays.
   */
  void addTimeval(struct timeval time, int32_t trainerId) {
    CHECK_GT(size_, index_) << "time vector is already full";
    timeArray_[index_] = time;
    trainerIds_[index_] = trainerId;
    index_++;
  }

  /// Time between the first and the last arrival (1, n delta).
  struct timeval getDelta() const {
    CHECK_GT(size_, 1) << "not support with 1 pserver";
    return sinceSlot(0);
  }

  /* 2, n delta */
  struct timeval get1NDelta() const {
    CHECK_GT(size_, 2) << "not support with less than 2 pservers";
    return sinceSlot(1);
  }

  /* n-1, n delta */
  struct timeval getMinus1NDelta() const {
    CHECK_GT(size_, 2) << "not support with less than 2 pservers";
    return sinceSlot(size_ - 2);
  }

  /* n/2, n delta */
  struct timeval getMidNDelta() const {
    CHECK_GT(size_, 2) << "not support with less than 2 pservers";
    return sinceSlot(size_ / 2);
  }

  /// Trainer id of the most recently added sample; fatal when empty.
  int32_t getLastTrainerId() const {
    CHECK_GT(index_, 0) << "no sample has been added yet";
    return trainerIds_[index_ - 1];
  }

private:
  /// timeArray_[size_ - 1] minus timeArray_[from].
  struct timeval sinceSlot(size_t from) const {
    struct timeval delta;
    timersub(&timeArray_[size_ - 1], &timeArray_[from], &delta);
    return delta;
  }

  uint16_t size_;
  uint16_t index_;  // next free slot
  std::vector<struct timeval> timeArray_;
  std::vector<int32_t> trainerIds_;
};
/**
 * Helper for gathering performance data that has "time delta" character,
 * such as the distribution of forwardBackward time over all cluster nodes.
 * It stores one duration per participant of a round and tracks the
 * smallest and largest duration plus the trainer that reported the
 * largest one.
 *
 * The class does no locking of its own.
 */
class TimeVectorDelta {
public:
  /// @param size number of samples that make up one complete round.
  explicit TimeVectorDelta(uint16_t size)
      : size_(size),
        index_(0),
        timeArray_(size),
        min_(UINT64_MAX),
        max_(0),
        maxTrainerId_(-1) {}

  /// Capacity of one round.
  uint16_t size() const { return size_; }
  /// True once size() samples have been added since the last reset().
  bool full() const { return index_ == size_; }
  /// True when no sample has been added since the last reset().
  bool empty() const { return index_ == 0; }

  /// Start a new round and forget the extremes of the previous one.
  void reset() {
    index_ = 0;
    min_ = UINT64_MAX;
    max_ = 0;
    maxTrainerId_ = -1;
  }

  /**
   * Append one duration. Adding to a full vector is a fatal error instead
   * of a write past the end of the array.
   *
   * The first sample of a round always becomes the current maximum, so
   * getMaxTrainerId() names a real trainer even when every duration in
   * the round is 0.
   */
  void addTimeval(uint64_t delta, int32_t trainerId) {
    CHECK_GT(size_, index_) << "time vector is already full";
    const bool firstSample = (index_ == 0);
    timeArray_[index_] = delta;
    index_++;
    if (delta < min_) {
      min_ = delta;
    }
    if (firstSample || delta > max_) {
      max_ = delta;
      maxTrainerId_ = trainerId;
    }
  }

  /// Largest minus smallest duration of the current round.
  uint64_t getDelta() const {
    CHECK_GT(size_, 1) << "not support with 1 pserver";
    return max_ - min_;
  }

  /* 2, n delta */
  uint64_t get1NDelta() const {
    CHECK_GT(size_, 2) << "not support with less than 2 pservers";
    LOG(FATAL) << "Not implemented";
    return 0;  // keeps the function well-formed if LOG(FATAL) returns
  }

  /* n-1, n delta */
  uint64_t getMinus1NDelta() const {
    CHECK_GT(size_, 2) << "not support with less than 2 pservers";
    LOG(FATAL) << "Not implemented";
    return 0;  // keeps the function well-formed if LOG(FATAL) returns
  }

  /* n/2, n delta */
  uint64_t getMidNDelta() const {
    CHECK_GT(size_, 2) << "not support with less than 2 pservers";
    LOG(FATAL) << "Not implemented";
    return 0;  // keeps the function well-formed if LOG(FATAL) returns
  }

  /// Trainer that reported the largest duration; -1 before any sample.
  int32_t getMaxTrainerId() const { return maxTrainerId_; }

private:
  uint16_t size_;
  uint16_t index_;  // next free slot
  std::vector<uint64_t> timeArray_;

  uint64_t min_;
  uint64_t max_;
  int32_t maxTrainerId_;
};
/*
 * Statistics accumulated over all samples; times are in microseconds (us).
 * Plain aggregate: the field order below is part of its layout.
 */
struct Abstract {
  // last trainerId for barrier end, maxDelta trainerId for barrier delta
  int32_t trainerId;

  uint64_t minDelta;
  uint64_t maxDelta;
  uint64_t totDelta;

  // first one is probably itself, so discard it.
  uint64_t totSecondDelta;

  // to confirm whether the last node destroys barrier performance.
  uint64_t totLastTwoDelta;

  // n/2-n delta
  uint64_t totMidDelta;

  uint64_t freq;
};
/*
 * Base class of the barrier performance tuning stats.
 *
 * Concrete stats implement the two updateStat() overloads; reset(),
 * checkPassBarrier() and showAbstract() have default implementations that
 * subclasses may replace.
 */
class BarrierStatBase {
public:
  BarrierStatBase(uint16_t numConnThreads, const std::string& name);

  virtual ~BarrierStatBase() {}

  // if called at pserver end, then trainerId means the trainer's id.
  // by default a trainer does not use trainerId, so it is set to -1.
  virtual void updateStat(struct timeval& cur, int32_t trainerId = -1) = 0;
  virtual void updateStat(uint64_t delta, int32_t trainerId = -1) = 0;

  // Name of this stat. const so that it can also be used through a
  // const BarrierStatBase&, e.g. from operator<<.
  const std::string& getName() const { return name_; }

  // Default implementation does nothing.
  virtual void reset(bool clearRawData = true) {}

  // since the timeVector_ is not stateful, it is not clear whether the
  // barrier delta is correct. if one timestamp was lost, all data from the
  // barrier stat becomes rubbish. -_-
  // The default implementation only logs and reports failure.
  virtual bool checkPassBarrier() {
    LOG(INFO) << "bug implementation found";
    return false;
  }

protected:
  // Writes the summary of this stat to output; default writes nothing.
  virtual void showAbstract(std::ostream& output) const {}

  friend std::ostream& operator<<(std::ostream& output,
                                  const BarrierStatBase& stat);

protected:
  // mutable, so it can also be taken inside const member functions.
  mutable std::mutex lock_;

  std::mutex abstractLock_;  // see note on updaterStat

  // each frequency for each barrier trainer
  std::vector<struct Abstract> abstract_;

  // it is valuable when doing perf-tuning, if lastTrainerId acts as a
  // uniform distribution
  struct Abstract totAbstract_;
  uint64_t totSamples_;

protected:
  uint16_t numConnThreads_;  // total updates needed
  float rateThreshold_;
  std::string name_;
};
// the end-time of arriving real/forged barrier position
class
BarrierEndStat
:
public
BarrierStatBase
{
public:
BarrierEndStat
(
uint16_t
numConnThreads
,
const
std
::
string
&
name
);
~
BarrierEndStat
()
{}
virtual
void
updateStat
(
struct
timeval
&
cur
,
int32_t
trainerId
=
-
1
);
virtual
void
updateStat
(
uint64_t
delta
,
int32_t
trainerId
=
-
1
)
{
LOG
(
INFO
)
<<
"have no delta updateStat in BarrierEndStat"
;
}
virtual
void
reset
(
bool
clearRawData
=
true
);
virtual
bool
checkPassBarrier
()
{
return
timeVector_
->
empty
();
}
protected:
/*
* LOG:
* readAllBlocks_denseUpdater
* trainerId avgGap avgSecondGap avgLastTwoGap avgMidGap rate
* 44 86.702 81.022 9.984 50.472 0.144737
* 46 87.723 82.939 8.737 50.019 0.118421
* 35 100.923 96.752 14.305 61.979
* 0.0657895
* log_barrier_abstract, log_barrier_lowest_nodes, log_barrier_threshold
* control details.
*/
virtual
void
showAbstract
(
std
::
ostream
&
output
)
const
;
private:
std
::
unique_ptr
<
TimeVectorEnd
>
timeVector_
;
};
// the delta-time from different trainers,
// eg, find the degree of imbalance of BP time at pserver end
// the entry value in timerVector_ is BP delta, do evaluation to BP delta.
class
BarrierDeltaStat
:
public
BarrierStatBase
{
public:
BarrierDeltaStat
(
uint16_t
numConnThreads
,
const
std
::
string
&
name
);
~
BarrierDeltaStat
()
{}
virtual
void
updateStat
(
uint64_t
delta
,
int32_t
trainerId
=
-
1
);
virtual
void
updateStat
(
struct
timeval
&
cur
,
int32_t
trainerId
=
-
1
)
{
LOG
(
INFO
)
<<
"have no timeval updateStat in BarrierDeltaStat"
;
}
virtual
void
reset
(
bool
clearRawData
=
true
);
virtual
bool
checkPassBarrier
()
{
return
timeVector_
->
empty
();
}
protected:
virtual
void
showAbstract
(
std
::
ostream
&
outPut
)
const
;
private:
// store delta time in uint64_t, eg BP time of all trainers
std
::
unique_ptr
<
TimeVectorDelta
>
timeVector_
;
};
// To distinguish different contexts for the same parallel threads, and
// different threads with the same code segment, just use tagName to tag the
// run-time position.
// In Sparse, sendParallel threads can not only run in the stage of push&pull
// with the same thread group, but also run in the stage of pull&push with a
// different thread group; the tag is used to distinguish the different
// run-time barrier positions.
// trainerId in REGISTER_BARRIER_TIMER_SERVER is used to retrieve the lowest
// trainer nodes.
//
// end barrier:
// when numConnThreads > 2, looks up the BARRIER_END stat named
// statName + <optional tag> in `set` and updates it with the current time
// (gettimeofday) and trainerId. Does nothing otherwise.
// numConnThreads is parenthesized in the comparison so that an expression
// argument (e.g. `a ? b : c`) is compared as a whole.
// The trailing ';' after `while (0)` is kept on purpose: existing call sites
// may rely on it.
#define __REGISTER_BARRIER_TIMER_SERVER(                             \
    set, statName, numConnThreads, trainerId, ...)                   \
  do {                                                               \
    if ((numConnThreads) > 2) {                                      \
      std::string internalName =                                     \
          std::string(statName) + std::string(__VA_ARGS__);          \
      BarrierStatPtr __stat =                                        \
          (set).getStat(numConnThreads, internalName, BARRIER_END);  \
      struct timeval cur;                                            \
      gettimeofday(&cur, nullptr);                                   \
      __stat->updateStat(cur, trainerId);                            \
    }                                                                \
  } while (0);
// end barrier with user-defined timer:
// same as __REGISTER_BARRIER_TIMER_SERVER, but the time point `cur` is
// supplied by the caller instead of being read with gettimeofday.
// numConnThreads is parenthesized in the comparison so that an expression
// argument is compared as a whole. The trailing ';' after `while (0)` is
// kept on purpose: existing call sites may rely on it.
#define __REGISTER_BARRIER_TIMER_SERVER_SET(                         \
    set, statName, numConnThreads, trainerId, cur, ...)              \
  do {                                                               \
    if ((numConnThreads) > 2) {                                      \
      std::string internalName =                                     \
          std::string(statName) + std::string(__VA_ARGS__);          \
      BarrierStatPtr __stat =                                        \
          (set).getStat(numConnThreads, internalName, BARRIER_END);  \
      __stat->updateStat(cur, trainerId);                            \
    }                                                                \
  } while (0);
// delta barrier:
// when numConnThreads > 2, looks up the BARRIER_DELTA stat named
// statName + <optional tag> in `set` and updates it with the caller-supplied
// `delta` and trainerId. Does nothing otherwise.
// numConnThreads is parenthesized in the comparison so that an expression
// argument is compared as a whole. The trailing ';' after `while (0)` is
// kept on purpose: existing call sites may rely on it.
#define __REGISTER_BARRIER_DELTA_SERVER_SET(                           \
    set, statName, numConnThreads, trainerId, delta, ...)              \
  do {                                                                 \
    if ((numConnThreads) > 2) {                                        \
      std::string internalName =                                       \
          std::string(statName) + std::string(__VA_ARGS__);            \
      BarrierStatPtr __stat =                                          \
          (set).getStat(numConnThreads, internalName, BARRIER_DELTA);  \
      __stat->updateStat(delta, trainerId);                            \
    }                                                                  \
  } while (0);
// check end barrier:
// looks up the BARRIER_END stat named statName + <optional tag> in `set`
// and PCHECKs that its checkPassBarrier() holds; on failure the stat name
// followed by ": invalid barrier data" is streamed into the check message.
// Unlike the __REGISTER_* macros there is no numConnThreads > 2 guard here.
#define __CHECK_BARRIER_TIMER(set, statName, numConnThreads, ...)   \
  do {                                                              \
    std::string internalName = std::string(statName);               \
    internalName.append(std::string(__VA_ARGS__));                  \
    BarrierStatPtr __stat =                                         \
        (set).getStat(numConnThreads, internalName, BARRIER_END);   \
    PCHECK(__stat->checkPassBarrier())                              \
        << internalName << ": invalid barrier data";                \
  } while (0);
/*
* Note:
* With the sync-SGD algorithm in cluster mode, many synchronization actions
* exist at the pserver end. These synchronization actions have an impact on
* the efficiency of parameter exchange. The synchronization (barrier) gap is
* composed of many factors, such as the forwardBackward variance and network
* fluctuation. We try to have a quantitative analysis of these factors, so we
* designed a number of barrier timers to capture this performance data. These
* barriers can also be placed at implicit barrier positions.
*
* Example:
* In the sync-SGD algorithm, each parameter server waits for all gradients
* from all trainers; thus, an explicit barrier point exists before doing the
* optimization. A barrier timer located before that point can sense the
* barrier condition.
*
*/
// Try to capture which trainer is the slowest node in sync-SGD at the pserver.
// Forwards all of its arguments to __REGISTER_BARRIER_TIMER_SERVER.
// This definition sits outside the PADDLE_DISABLE_TIMER conditional, so it
// is defined the same way whether or not that flag is set.
#define REGISTER_SLOW_NODES_PROBE( \
set, statName, numConnThreads, trainerId, ...) \
__REGISTER_BARRIER_TIMER_SERVER( \
(set), statName, numConnThreads, trainerId, __VA_ARGS__)
// Try to check whether all threads or trainers have passed the barriers, for
// data accuracy.
// Forwards all of its arguments to __CHECK_BARRIER_TIMER.
#define CHECK_BARRIER_TIMER(set, statName, numConnThreads, ...) \
__CHECK_BARRIER_TIMER((set), statName, numConnThreads, __VA_ARGS__)
// With PADDLE_DISABLE_TIMER defined, the three REGISTER_BARRIER_* macros
// below expand to nothing; otherwise each one forwards to its
// double-underscore counterpart.
#ifdef PADDLE_DISABLE_TIMER
#define REGISTER_BARRIER_TIMER_SERVER( \
set, statName, numConnThreads, trainerId, ...)
#define REGISTER_BARRIER_TIMER_SERVER_SET( \
set, statName, numConnThreads, trainerId, cur, ...)
// NOTE(review): the fifth parameter is called `cur` here but `delta` in the
// enabled variant; harmless since the body is empty.
#define REGISTER_BARRIER_DELTA_SERVER_SET( \
set, statName, numConnThreads, trainerId, cur, ...)
#else
/*
* Senses the barrier time distribution for all parallelization threads.
* It provides the low-level API for the slow node check
* (REGISTER_SLOW_NODES_PROBE).
*/
#define REGISTER_BARRIER_TIMER_SERVER( \
set, statName, numConnThreads, trainerId, ...) \
__REGISTER_BARRIER_TIMER_SERVER( \
(set), statName, numConnThreads, trainerId, __VA_ARGS__)
/*
* Senses the barrier time distribution for all parallelization threads,
* but the time point for barrier performance is set by the user.
* E.g. with this API you can capture an implicit barrier point, such as the
* distribution of the time at which receiving data begins.
*/
#define REGISTER_BARRIER_TIMER_SERVER_SET( \
set, statName, numConnThreads, trainerId, cur, ...) \
__REGISTER_BARRIER_TIMER_SERVER_SET( \
(set), statName, numConnThreads, trainerId, cur, __VA_ARGS__)
// Try to capture a time delta from all trainers, such as the forwardBackward
// time, which implies computation fluctuation.
#define REGISTER_BARRIER_DELTA_SERVER_SET( \
set, statName, numConnThreads, trainerId, delta, ...) \
__REGISTER_BARRIER_DELTA_SERVER_SET( \
(set), statName, numConnThreads, trainerId, delta, __VA_ARGS__)
#endif // PADDLE_DISABLE_TIMER
}
// namespace paddle
paddle/utils/Stat.cpp
浏览文件 @
309b37e5
...
@@ -97,34 +97,6 @@ std::ostream& operator<<(std::ostream& outPut, const Stat& stat) {
...
@@ -97,34 +97,6 @@ std::ostream& operator<<(std::ostream& outPut, const Stat& stat) {
return
outPut
;
return
outPut
;
}
}
/**
 * Returns the barrier stat registered under `name`, creating and
 * registering it on first use.
 *
 * If a stat with that name already exists it is returned as is;
 * numConnThreads and bType are only used when a new stat is created.
 *
 * @param numConnThreads passed to the constructor of the new stat.
 * @param name           key of the stat in barrierStatSet_.
 * @param bType          BARRIER_END creates a BarrierEndStat,
 *                       BARRIER_DELTA creates a BarrierDeltaStat.
 * @return the registered stat; never null (an unknown bType aborts through
 *         CHECK instead of registering a null pointer).
 */
BarrierStatPtr StatSet::getStat(uint16_t numConnThreads,
                                const std::string& name,
                                BarrierStatType bType) {
  {
    // first lookup under the read guard only
    ReadLockGuard guard(lock_);
    auto it = barrierStatSet_.find(name);
    if (it != barrierStatSet_.end()) {
      return it->second;
    }
  }

  std::lock_guard<RWLock> guard(lock_);
  // test again with lock_guard: the stat may have been inserted between
  // releasing the read guard above and acquiring this one.
  auto it = barrierStatSet_.find(name);
  if (it != barrierStatSet_.end()) {
    return it->second;
  }

  BarrierStatPtr stat;
  switch (bType) {
    case BARRIER_END:
      stat = std::make_shared<BarrierEndStat>(numConnThreads, name);
      break;
    case BARRIER_DELTA:
      stat = std::make_shared<BarrierDeltaStat>(numConnThreads, name);
      break;
  }
  // a value outside the enum would otherwise be registered as a null stat
  // and crash later at the first use.
  CHECK(stat != nullptr) << "unknown barrier stat type "
                         << static_cast<int>(bType) << " for " << name;

  auto ret = barrierStatSet_.insert(std::make_pair(name, stat));
  return ret.first->second;
}
void
StatSet
::
printSegTimerStatus
()
{
void
StatSet
::
printSegTimerStatus
()
{
ReadLockGuard
guard
(
lock_
);
ReadLockGuard
guard
(
lock_
);
LOG
(
INFO
)
<<
std
::
setiosflags
(
std
::
ios
::
left
)
<<
std
::
setfill
(
' '
)
LOG
(
INFO
)
<<
std
::
setiosflags
(
std
::
ios
::
left
)
<<
std
::
setfill
(
' '
)
...
@@ -135,46 +107,20 @@ void StatSet::printSegTimerStatus() {
...
@@ -135,46 +107,20 @@ void StatSet::printSegTimerStatus() {
}
}
}
}
/**
 * Logs every registered barrier stat to LOG(INFO), preceded by a header
 * line. Logs nothing when no barrier stat is registered.
 * Holds the read guard on lock_ for the whole call.
 */
void StatSet::printBarrierTimerStatus() {
  ReadLockGuard guard(lock_);
  if (!barrierStatSet_.empty()) {
    // control barrierAbstact in runtime, so enable compliation
    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
              << "======= BarrierStatSet status ======" << std::endl;
    for (auto entry = barrierStatSet_.begin(); entry != barrierStatSet_.end();
         ++entry) {
      LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
                << *(entry->second);
    }
  }
}
void
StatSet
::
printAllStatus
()
{
void
StatSet
::
printAllStatus
()
{
#ifndef PADDLE_DISABLE_TIMER
#ifndef PADDLE_DISABLE_TIMER
printSegTimerStatus
();
printSegTimerStatus
();
#endif
#endif
printBarrierTimerStatus
();
LOG
(
INFO
)
<<
std
::
setiosflags
(
std
::
ios
::
left
)
LOG
(
INFO
)
<<
std
::
setiosflags
(
std
::
ios
::
left
)
<<
"--------------------------------------------------"
<<
"--------------------------------------------------"
<<
std
::
endl
;
<<
std
::
endl
;
}
}
/**
 * Logs the stat registered under `name` to LOG(INFO).
 * Aborts through CHECK when no such stat exists in statSet_.
 * Holds the read guard on lock_ for the whole call.
 */
void StatSet::printStatus(const std::string& name) {
  ReadLockGuard guard(lock_);
  auto iter = statSet_.find(name);
  CHECK(iter != statSet_.end()) << name << " is not registed in " << name_;
  const auto& found = iter->second;
  LOG(INFO) << *found;
}
void
StatSet
::
reset
(
bool
clearRawData
)
{
void
StatSet
::
reset
(
bool
clearRawData
)
{
ReadLockGuard
guard
(
lock_
);
ReadLockGuard
guard
(
lock_
);
for
(
auto
&
stat
:
statSet_
)
{
for
(
auto
&
stat
:
statSet_
)
{
stat
.
second
->
reset
();
stat
.
second
->
reset
();
}
}
// reset barrierStat
for
(
auto
&
stat
:
barrierStatSet_
)
{
stat
.
second
->
reset
(
clearRawData
);
}
}
}
void
StatSet
::
setThreadInfo
(
const
std
::
string
&
name
,
bool
flag
)
{
void
StatSet
::
setThreadInfo
(
const
std
::
string
&
name
,
bool
flag
)
{
...
@@ -184,13 +130,6 @@ void StatSet::setThreadInfo(const std::string& name, bool flag) {
...
@@ -184,13 +130,6 @@ void StatSet::setThreadInfo(const std::string& name, bool flag) {
iter
->
second
->
setThreadInfo
(
flag
);
iter
->
second
->
setThreadInfo
(
flag
);
}
}
/**
 * Removes the stat registered under `name` from statSet_.
 * Aborts through CHECK when no such stat exists.
 * Takes lock_ through std::lock_guard for the whole call.
 */
void StatSet::deleteStat(const std::string& name) {
  std::lock_guard<RWLock> guard(lock_);

  auto iter = statSet_.find(name);
  CHECK(iter != statSet_.end()) << name << " is not registed in " << name_;

  statSet_.erase(iter);
}
StatInfo
::~
StatInfo
()
{
StatInfo
::~
StatInfo
()
{
if
(
stat_
)
{
if
(
stat_
)
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
stat_
->
lock_
);
std
::
lock_guard
<
std
::
mutex
>
guard
(
stat_
->
lock_
);
...
...
paddle/utils/Stat.h
浏览文件 @
309b37e5
...
@@ -23,7 +23,6 @@ limitations under the License. */
...
@@ -23,7 +23,6 @@ limitations under the License. */
#include <string>
#include <string>
#include <unordered_map>
#include <unordered_map>
#include "BarrierStat.h"
#include "Locks.h"
#include "Locks.h"
#include "Logging.h"
#include "Logging.h"
#include "ThreadLocal.h"
#include "ThreadLocal.h"
...
@@ -60,12 +59,6 @@ public:
...
@@ -60,12 +59,6 @@ public:
class
Stat
;
class
Stat
;
typedef
std
::
shared_ptr
<
Stat
>
StatPtr
;
typedef
std
::
shared_ptr
<
Stat
>
StatPtr
;
// Shared-ownership handle to a barrier stat.
using BarrierStatPtr = std::shared_ptr<BarrierStatBase>;
// Kind of barrier stat requested from StatSet::getStat.
enum BarrierStatType {
  BARRIER_END = 0,   // selects the end-time stat
  BARRIER_DELTA = 1  // selects the delta-time stat
};
class
StatSet
{
class
StatSet
{
public:
public:
...
@@ -74,11 +67,8 @@ public:
...
@@ -74,11 +67,8 @@ public:
// print to LOG(INFO)
// print to LOG(INFO)
void
printSegTimerStatus
();
void
printSegTimerStatus
();
void
printBarrierTimerStatus
();
void
printAllStatus
();
void
printAllStatus
();
void
printStatus
(
const
std
::
string
&
name
);
StatPtr
getStat
(
const
std
::
string
&
name
)
{
StatPtr
getStat
(
const
std
::
string
&
name
)
{
{
{
ReadLockGuard
guard
(
lock_
);
ReadLockGuard
guard
(
lock_
);
...
@@ -93,12 +83,6 @@ public:
...
@@ -93,12 +83,6 @@ public:
return
ret
.
first
->
second
;
return
ret
.
first
->
second
;
}
}
// Returns the barrier stat registered under `name`, creating it first if it
// does not exist yet. numConnThreads and bType are only used for creation:
// BARRIER_END yields a BarrierEndStat, BARRIER_DELTA a BarrierDeltaStat.
BarrierStatPtr
getStat
(
uint16_t
numConnThreads
,
const
std
::
string
&
name
,
BarrierStatType
bType
);
// Removes the stat registered under `name` from statSet_; the definition
// CHECK-fails when no such stat is registered.
void
deleteStat
(
const
std
::
string
&
name
);
// true for showing stats for each thread
// true for showing stats for each thread
// false for showing stats aggragated over threads
// false for showing stats aggragated over threads
void
setThreadInfo
(
const
std
::
string
&
name
,
bool
flag
);
void
setThreadInfo
(
const
std
::
string
&
name
,
bool
flag
);
...
@@ -120,7 +104,6 @@ public:
...
@@ -120,7 +104,6 @@ public:
private:
private:
std
::
unordered_map
<
std
::
string
,
StatPtr
>
statSet_
;
std
::
unordered_map
<
std
::
string
,
StatPtr
>
statSet_
;
// Barrier stats keyed by name; filled by getStat(numConnThreads, name, bType).
std
::
unordered_map
<
std
::
string
,
BarrierStatPtr
>
barrierStatSet_
;
const
std
::
string
name_
;
const
std
::
string
name_
;
RWLock
lock_
;
RWLock
lock_
;
};
};
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录