Paddle commit 0589ed21 (unverified)
Authored Apr 01, 2021 by tangwei12; committed via GitHub on Apr 01, 2021
Parent: b807e408

LOG CLEAN (#31819)

* upgrade vlog
* train from dataset fetch optimize
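The heart of the change is swapping unconditional LOG(INFO)/LOG(WARNING) calls for leveled VLOG calls, so routine status lines appear only when a verbosity threshold is requested. A minimal standalone sketch of that gating, assuming plain glog (Paddle wraps these macros, so this is illustrative rather than the commit's code):

// vlog_demo.cc -- illustrative only; build with: g++ vlog_demo.cc -lglog
#include <glog/logging.h>

int main(int argc, char* argv[]) {
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = true;

  LOG(INFO) << "always emitted";                                 // the old style
  VLOG(0) << "emitted at the default verbosity";                 // important status
  VLOG(3) << "emitted only with --v=3 (or GLOG_v=3) or higher";  // routine detail
  return 0;
}

Run with GLOG_v=3 to see the third line; by default only the first two appear.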
Showing 17 changed files with 97 additions and 59 deletions (+97 / -59).
cmake/external/brpc.cmake  (+1 / -1)
paddle/fluid/distributed/service/brpc_ps_server.cc  (+3 / -2)
paddle/fluid/distributed/service/brpc_utils.cc  (+1 / -1)
paddle/fluid/distributed/service/env.h  (+4 / -6)
paddle/fluid/distributed/service/ps_client.cc  (+1 / -2)
paddle/fluid/distributed/service/service.cc  (+1 / -1)
paddle/fluid/distributed/table/depends/dense.h  (+0 / -2)
paddle/fluid/distributed/table/depends/sparse.h  (+0 / -2)
paddle/fluid/framework/details/build_strategy.cc  (+12 / -16)
paddle/fluid/framework/device_worker.h  (+1 / -1)
paddle/fluid/framework/hogwild_worker.cc  (+25 / -6)
paddle/fluid/platform/lodtensor_printer.cc  (+22 / -8)
paddle/fluid/platform/lodtensor_printer.h  (+1 / -1)
paddle/fluid/platform/lodtensor_printer_test.cc  (+2 / -1)
python/paddle/distributed/fleet/base/fleet_base.py  (+7 / -6)
python/paddle/distributed/fleet/runtime/the_one_ps.py  (+1 / -1)
python/paddle/fluid/tests/unittests/test_monitor.py  (+15 / -2)
cmake/external/brpc.cmake
@@ -41,7 +41,7 @@ ExternalProject_Add(
     ${EXTERNAL_PROJECT_LOG_ARGS}
     # TODO(gongwb): change to de newst repo when they changed.
     GIT_REPOSITORY  "https://github.com/wangjiawei04/brpc"
-    GIT_TAG         "6d79e0b17f25107c35b705ea58d888083f59ff47"
+    GIT_TAG         "e203afb794caf027da0f1e0776443e7d20c0c28e"
     PREFIX          ${BRPC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
paddle/fluid/distributed/service/brpc_ps_server.cc
@@ -60,7 +60,8 @@ uint64_t BrpcPsServer::start(const std::string &ip, uint32_t port) {
   std::unique_lock<std::mutex> lock(mutex_);
   std::string ip_port = ip + ":" + std::to_string(port);
-  VLOG(3) << "server of rank " << _rank << " starts at " << ip_port;
+  VLOG(0) << "running server with rank id: " << _rank
+          << ", endpoint: " << ip_port;
   brpc::ServerOptions options;
   int num_threads = std::thread::hardware_concurrency();
@@ -538,7 +539,7 @@ int32_t BrpcPsService::stop_server(Table *table,
   auto *p_server = _server;
   std::thread t_stop([p_server]() {
     p_server->stop();
-    LOG(INFO) << "Server Stoped";
+    VLOG(3) << "Server Stoped";
   });
   t_stop.detach();
   return 0;
paddle/fluid/distributed/service/brpc_utils.cc
@@ -324,7 +324,7 @@ std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port) {
   while (hp->h_addr_list[i] != NULL) {
     int_ip = inet_ntoa(*(struct in_addr *)hp->h_addr_list[i]);
-    VLOG(0) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip;
+    VLOG(3) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip;
     break;
   }
paddle/fluid/distributed/service/env.h
@@ -39,7 +39,7 @@ struct PSHost {
   // |---ip---|---port---|--rank--|
   // |-32bit--|--20bit---|--12bit-|
   // for pslib
   uint64_t serialize_to_uint64() {
     uint64_t host_label = 0;
     host_label = inet_addr(ip.c_str());
@@ -175,14 +175,12 @@ class PSEnvironment {
     host.ip = ip;
     host.port = port;
     host.rank = rank;
-    if (sign_set.count(rank) > 0) {
-      LOG(WARNING) << "ps-host :" << host.ip << ":" << host.port
-                   << ", rank:" << host.rank
-                   << " already register, ignore register";
-    } else {
+    if (sign_set.count(rank) == 0) {
       host_list.push_back(host);
       sign_set.insert(rank);
     }
     return 0;
   }
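The comment above serialize_to_uint64 documents a packed layout of |ip:32|port:20|rank:12| in one 64-bit label. A hedged, self-contained sketch of that packing for clarity only: PackHost and the sample values are hypothetical, and this is not Paddle's actual implementation (the diff shows only the inet_addr(ip.c_str()) step).

// Illustrative sketch of the |ip(32)|port(20)|rank(12)| layout from the comment.
#include <cstdint>
#include <cstdio>

uint64_t PackHost(uint32_t ip, uint32_t port, uint32_t rank) {
  return (static_cast<uint64_t>(ip) << 32) |
         (static_cast<uint64_t>(port & 0xFFFFFu) << 12) |
         static_cast<uint64_t>(rank & 0xFFFu);
}

int main() {
  // 0x7F000001 is 127.0.0.1 in host byte order, used here only as sample data.
  uint64_t label = PackHost(0x7F000001u, 8500, 3);
  std::printf("ip=%#x port=%u rank=%u\n",
              static_cast<uint32_t>(label >> 32),
              static_cast<uint32_t>((label >> 12) & 0xFFFFFu),
              static_cast<uint32_t>(label & 0xFFFu));
  return 0;
}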
paddle/fluid/distributed/service/ps_client.cc
@@ -78,8 +78,7 @@ PSClient *PSClientFactory::create(const PSParameter &ps_config) {
   }
   TableManager::instance().initialize();
-  LOG(INFO) << "Create PSClient[" << service_param.client_class()
-            << "] success";
+  VLOG(3) << "Create PSClient[" << service_param.client_class() << "] success";
   return client;
 }
 }  // namespace distributed
paddle/fluid/distributed/service/service.cc
@@ -47,7 +47,7 @@ paddle::distributed::PSParameter load_from_prototxt(
 }
 void PSCore::init_gflag(const std::string& gflags) {
-  LOG(INFO) << "Init With Gflags:" << gflags;
+  VLOG(3) << "Init With Gflags:" << gflags;
   std::vector<std::string> flags = paddle::string::split_string(gflags);
   if (flags.size() < 1) {
     flags.push_back("-max_body_size=314217728");
paddle/fluid/distributed/table/depends/dense.h
@@ -89,7 +89,6 @@ class DSGD : public DenseOptimizer {
     auto blas = GetBlas<float>();
     float lr = *(global_learning_rate_) * (*learning_rate);
-    VLOG(4) << "DSGD LearningRate: " << lr;
     blas.VCOPY(update_numel, update_values + begin, grads.data());
     blas.SCAL(update_numel, lr, grads.data());
     blas.VSUB(update_numel, param + begin, grads.data(), param + begin);
@@ -157,7 +156,6 @@ class DAdam : public DenseOptimizer {
     beta2_pow[0] = beta2_pow[0] * beta2;
     float lr_ = *(global_learning_rate_) * learning_rate[0];
-    VLOG(4) << "DAdam LearningRate: " << lr_;
     lr_ *= sqrt(1 - beta2_pow[0]) / (1 - beta1_pow[0]);
     float* tmp_ = tmp.data();
paddle/fluid/distributed/table/depends/sparse.h
@@ -110,7 +110,6 @@ class SSGD : public SparseOptimizer {
       auto* value = block->Get(id);
       float learning_rate = *(global_learning_rate_) * (value + lr_offset)[0];
-      VLOG(4) << "SSGD LearningRate: " << learning_rate;
       float* param = value + param_offset;
       std::vector<float> grads;
@@ -166,7 +165,6 @@ class SAdam : public SparseOptimizer {
       if (!block->GetEntry(id)) continue;
       auto* values = block->Get(id);
       float lr_ = *(global_learning_rate_) * (values + lr_offset)[0];
-      VLOG(4) << "SAdam LearningRate: " << lr_;
       float* param = values + param_offset;
       float* moment1 = values + m1_offset;
       float* moment2 = values + m2_offset;
paddle/fluid/framework/details/build_strategy.cc
@@ -161,9 +161,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
 #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
     !defined(_WIN32) && !defined(__APPLE__)
     AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass");
-#else
-    LOG(WARNING) << "fusion_group is not enabled for Windows/MacOS now, and "
-                    "only effective when running with CUDA GPU.";
 #endif
     AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_,
                         "fuse_elewise_add_act_pass");
@@ -265,12 +262,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     if (FLAGS_use_mkldnn) {
       AppendPass(pass_name);
     } else if (!strategy_.mkldnn_enabled_op_types_.empty()) {
-      LOG(WARNING)
-          << "mkldnn_enabled_op_types specify the operator type list to "
-             "use MKLDNN acceleration. It is null in default, means "
-             "that all the operators supported by MKLDNN will be "
-             "accelerated. And it should not be set when "
-             "FLAGS_use_mkldnn=false.";
+      VLOG(1) << "mkldnn_enabled_op_types specify the operator type list to "
+                 "use MKLDNN acceleration. It is null in default, means "
+                 "that all the operators supported by MKLDNN will be "
+                 "accelerated. And it should not be set when "
+                 "FLAGS_use_mkldnn=false.";
     }
 #else
     PADDLE_ENFORCE_NE(FLAGS_use_mkldnn, true,
@@ -403,26 +399,26 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
               << ", num_trainers:" << num_trainers_;
     } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
       if (use_device != p::kCUDA) {
-        LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on "
-                        "GPU, skipped.";
+        VLOG(1) << "fuse_relu_depthwise_conv_pass is only supported on "
+                   "GPU, skipped.";
         continue;
       }
     } else if (pass->Type() == "fusion_group_pass") {
       pass->Set<bool>("use_gpu", new bool((use_device == p::kCUDA)));
       if (use_device != p::kCUDA) {
-        LOG(WARNING) << "fusion_group_pass is only supported on GPU, skipped.";
+        VLOG(1) << "fusion_group_pass is only supported on GPU, skipped.";
         continue;
       }
     } else if (pass->Type() == "fuse_bn_act_pass") {
       if (use_device != p::kCUDA) {
-        LOG(WARNING) << "fuse_bn_act_pass is only supported on "
-                        "GPU, skipped.";
+        VLOG(1) << "fuse_bn_act_pass is only supported on "
+                   "GPU, skipped.";
         continue;
       }
     } else if (pass->Type() == "fuse_bn_add_act_pass") {
       if (use_device != p::kCUDA) {
-        LOG(WARNING) << "fuse_bn_add_act_pass is only supported on "
-                        "GPU, skipped.";
+        VLOG(1) << "fuse_bn_add_act_pass is only supported on "
+                   "GPU, skipped.";
         continue;
       }
     } else if (pass->Type() == "mkldnn_placement_pass") {
paddle/fluid/framework/device_worker.h
@@ -205,7 +205,7 @@ class DeviceWorker {
   Scope* root_scope_ = nullptr;
   Scope* thread_scope_;
   paddle::platform::Place place_;
-  int64_t batch_num_;
+  int64_t batch_num_ = 0;
   FetchConfig fetch_config_;
   bool use_cvm_;
   bool no_cvm_;
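This one-line change adds a default member initializer so the batch counter starts at zero rather than an indeterminate value, which the reworked PrintFetchVars below presumably relies on for its batch_num_ % print_period check. A tiny illustrative sketch (the Counter struct is hypothetical, not Paddle code):

// Illustrative only: a default member initializer keeps the counter well-defined.
#include <cstdint>
#include <iostream>

struct Counter {
  int64_t batch_num_ = 0;  // without "= 0", a default-constructed Counter reads garbage here
};

int main() {
  Counter c;
  ++c.batch_num_;
  std::cout << c.batch_num_ << std::endl;  // reliably prints 1
  return 0;
}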
paddle/fluid/framework/hogwild_worker.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <ctime>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
@@ -226,14 +227,32 @@ void HogwildWorker::PrintFetchVars() {
   // call count
   batch_num_++;
   int batch_per_print = fetch_config_.print_period();
-  if (thread_id_ == 0) {
-    if (batch_num_ % batch_per_print == 0) {
-      int fetch_var_num = fetch_config_.fetch_var_names_size();
-      for (int i = 0; i < fetch_var_num; ++i) {
-        platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i),
-                           fetch_config_.fetch_var_str_format(i));
-      }
-    }
-  }
+  int fetch_var_num = fetch_config_.fetch_var_names_size();
+  if (fetch_var_num == 0) {
+    return;
+  }
+  if (thread_id_ == 0 && batch_num_ % batch_per_print == 0) {
+    time_t curtime;
+    time(&curtime);
+    char mbstr[80];
+    std::strftime(mbstr, sizeof(mbstr), "%Y-%m-%d %H:%M:%S",
+                  std::localtime(&curtime));
+    std::stringstream ss;
+    ss << "time: [" << mbstr << "], ";
+    ss << "batch: [" << batch_num_ << "], ";
+    for (int i = 0; i < fetch_var_num; ++i) {
+      platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i),
+                         fetch_config_.fetch_var_str_format(i), &ss);
+      if (i < fetch_var_num - 1) {
+        ss << ", ";
+      }
+    }
+    std::cout << ss.str() << std::endl;
+  }
 }
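The rewritten PrintFetchVars prefixes each fetch report with a wall-clock timestamp and the batch count before delegating to platform::PrintVar. Below is that formatting step in isolation as a runnable sketch; only the timestamp and stringstream handling is reproduced, and the fetch loop is omitted.

// Standalone sketch of the timestamp prefix built in PrintFetchVars.
#include <ctime>
#include <iostream>
#include <sstream>

int main() {
  time_t curtime;
  time(&curtime);
  char mbstr[80];
  std::strftime(mbstr, sizeof(mbstr), "%Y-%m-%d %H:%M:%S",
                std::localtime(&curtime));

  std::stringstream ss;
  ss << "time: [" << mbstr << "], ";
  ss << "batch: [" << 42 << "], ";     // 42 stands in for batch_num_
  std::cout << ss.str() << std::endl;  // e.g. "time: [2021-04-01 12:00:00], batch: [42], "
  return 0;
}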
paddle/fluid/platform/lodtensor_printer.cc
@@ -27,24 +27,38 @@ namespace paddle {
 namespace platform {
 void PrintVar(framework::Scope* scope, const std::string& var_name,
-              const std::string& print_info) {
+              const std::string& print_info, std::stringstream* sstream) {
   framework::Variable* var = scope->FindVar(var_name);
   if (var == nullptr) {
-    VLOG(1) << "Variable Name " << var_name << " does not exist in your scope";
+    VLOG(0) << "Variable Name " << var_name << " does not exist in your scope";
     return;
   }
   framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
   if (tensor == nullptr) {
-    VLOG(1) << "tensor of variable " << var_name
+    VLOG(0) << "tensor of variable " << var_name
             << " does not exist in your scope";
     return;
   }
-  std::ostringstream sstream;
-  sstream << print_info << "\t";
-  sstream << var_name << "\t";
-  sstream << *tensor << "\t";
-  std::cout << sstream.str() << std::endl;
+  *sstream << print_info << ": ";
+#define PrintTensorCallback(cpp_type, proto_type) \
+  do {                                            \
+    if (tensor->type() == proto_type) {           \
+      *sstream << "[";                            \
+      auto* data = tensor->data<cpp_type>();      \
+      auto element_num = tensor->numel();         \
+      if (element_num > 0) {                      \
+        *sstream << data[0];                      \
+        for (int j = 1; j < element_num; ++j) {   \
+          *sstream << " " << data[j];             \
+        }                                         \
+      }                                           \
+      *sstream << "]";                            \
+    }                                             \
+  } while (0)
+  _ForEachDataType_(PrintTensorCallback);
 }
 }  // end namespace platform
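PrintVar now streams tensor contents through a PrintTensorCallback macro that Paddle's _ForEachDataType_ helper expands once per supported (cpp_type, proto_type) pair. Below is a simplified, self-contained sketch of that X-macro dispatch pattern; MiniTensor, DataType, and FOR_EACH_TYPE are illustrative stand-ins rather than Paddle's definitions.

// Simplified X-macro type dispatch in the spirit of PrintTensorCallback /
// _ForEachDataType_; all names here are illustrative, not Paddle's.
#include <cstdint>
#include <iostream>
#include <sstream>
#include <vector>

enum class DataType { kFloat32, kInt64 };

struct MiniTensor {
  DataType type;
  std::vector<float> f32;
  std::vector<int64_t> i64;
  const void* data() const {
    return type == DataType::kFloat32 ? static_cast<const void*>(f32.data())
                                      : static_cast<const void*>(i64.data());
  }
  size_t numel() const {
    return type == DataType::kFloat32 ? f32.size() : i64.size();
  }
};

// The per-type callback, expanded once for every (cpp_type, enum_value) pair.
#define PRINT_TENSOR_CALLBACK(cpp_type, enum_value)             \
  do {                                                          \
    if (tensor.type == enum_value) {                            \
      auto* data = static_cast<const cpp_type*>(tensor.data()); \
      ss << "[";                                                \
      for (size_t j = 0; j < tensor.numel(); ++j) {             \
        ss << (j ? " " : "") << data[j];                        \
      }                                                         \
      ss << "]";                                                \
    }                                                           \
  } while (0)

// The type list a helper like _ForEachDataType_ would iterate over.
#define FOR_EACH_TYPE(callback)        \
  callback(float, DataType::kFloat32); \
  callback(int64_t, DataType::kInt64)

int main() {
  MiniTensor tensor{DataType::kFloat32, {1.5f, 2.5f}, {}};
  std::stringstream ss;
  ss << "emb0: ";
  FOR_EACH_TYPE(PRINT_TENSOR_CALLBACK);
  std::cout << ss.str() << std::endl;  // prints "emb0: [1.5 2.5]"
  return 0;
}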
paddle/fluid/platform/lodtensor_printer.h
@@ -26,6 +26,6 @@ class Scope;
 namespace paddle {
 namespace platform {
 void PrintVar(framework::Scope* scope, const std::string& var_name,
-              const std::string& print_info);
+              const std::string& print_info, std::stringstream* out);
 }  // end namespace platform
 }  // end namespace paddle
paddle/fluid/platform/lodtensor_printer_test.cc
@@ -18,5 +18,6 @@
 TEST(LodTensorPrinter, PrintVar) {
   paddle::framework::Scope scope;
-  paddle::platform::PrintVar(&scope, "NotAVar", "We don't have var");
+  std::stringstream ss;
+  paddle::platform::PrintVar(&scope, "NotAVar", "We don't have var", &ss);
 }
python/paddle/distributed/fleet/base/fleet_base.py
@@ -628,12 +628,13 @@ class Fleet(object):
         self.user_defined_optimizer = optimizer
         if strategy is not None:
-            warnings.warn(
-                "It is recommended to use DistributedStrategy "
-                "in fleet.init(). The strategy here is only for compatibility. "
-                "If the strategy in fleet.distributed_optimizer() is "
-                "not None, then it will overwrite the DistributedStrategy in fleet.init(), "
-                "which will take effect in distributed training.")
+            if self._is_collective:
+                warnings.warn(
+                    "It is recommended to use DistributedStrategy "
+                    "in fleet.init(). The strategy here is only for compatibility. "
+                    "If the strategy in fleet.distributed_optimizer() is "
+                    "not None, then it will overwrite the DistributedStrategy in fleet.init(), "
+                    "which will take effect in distributed training.")
             self._user_defined_strategy = copy.deepcopy(strategy)
         self._context = {}
python/paddle/distributed/fleet/runtime/the_one_ps.py
@@ -768,7 +768,7 @@ class TheOnePSRuntime(RuntimeBase):
         server = self._get_fleet_proto(is_server=True, is_sync=is_sync)
         proto_txt = str(server)
-        debug = bool(os.getenv("PSERVER_DEBUG", "0"))
+        debug = bool(int(os.getenv("PSERVER_DEBUG", "0")))
         if debug:
             print("server: \n{}".format(proto_txt))
python/paddle/fluid/tests/unittests/test_monitor.py
@@ -17,6 +17,8 @@ TestCases for Monitor
 from __future__ import print_function
+import paddle
+paddle.enable_static()
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import numpy as np
@@ -52,6 +54,11 @@ class TestDatasetWithStat(unittest.TestCase):
                 name=slot, shape=[1], dtype="int64", lod_level=1)
             slots_vars.append(var)
+        embs = []
+        for x in slots_vars:
+            emb = fluid.layers.embedding(x, is_sparse=True, size=[100001, 4])
+            embs.append(emb)
         dataset = paddle.distributed.InMemoryDataset()
         dataset._set_batch_size(32)
         dataset._set_thread(3)
@@ -74,11 +81,17 @@ class TestDatasetWithStat(unittest.TestCase):
             for i in range(self.epoch_num):
                 for data in data_loader():
                     exe.run(fluid.default_main_program(), feed=data)
         else:
             for i in range(self.epoch_num):
                 try:
-                    exe.train_from_dataset(fluid.default_main_program(),
-                                           dataset)
+                    exe.train_from_dataset(
+                        fluid.default_main_program(),
+                        dataset,
+                        fetch_list=[embs[0], embs[1]],
+                        fetch_info=["emb0", "emb1"],
+                        print_period=1)
                 except Exception as e:
                     self.assertTrue(False)