Commit 192682ad
Authored Mar 23, 2020 by xiexionghang
Parent: 520c7780

    add gloo support and fix some barrier bug

4 changed files, +59 −34 (whitespace-only changes are hidden in the hunks below):
kagle/kagle_fs.py                  +2   −3
kagle/kagle_metric.py              +11  −0
kagle/kagle_util.py                +11  −15
kagle/trainer/abacus_trainer.py    +35  −16
kagle/kagle_fs.py (view file @ 192682ad)

@@ -133,9 +133,7 @@ class FileHandler(object):
         """R
         """
         if is_afs_path(path):
-            print("xxh go cat " + path)
             hdfs_cat = self._hdfs_client.cat(path)
-            print(hdfs_cat)
             return hdfs_cat
         else:
             return self._local_fs_client.cat(path)

@@ -146,9 +144,10 @@ class FileHandler(object):
         files = []
         if is_afs_path(path):
             files = self._hdfs_client.ls(path)
+            files = [path + '/' + self.get_file_name(fi) for fi in files]  # absulte path
         else:
             files = self._local_fs_client.ls(path)
-            files = [path + '/' + fi for fi in files]
+            files = [path + '/' + fi for fi in files]  # absulte path
         return files

     def cp(self, org_path, dest_path):
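After this hunk, both branches of FileHandler.ls return entries already joined to the queried directory. A minimal sketch of the resulting contract for the local branch only, using os.listdir as a stand-in for the local fs client (illustration, not the project's code):

import os

def ls_local(path):
    # Mirrors the local branch above: every entry comes back joined to the
    # queried directory, so callers always receive usable full paths.
    files = os.listdir(path)
    files = [path + '/' + fi for fi in files]
    return files

print(ls_local('.'))  # e.g. ['./kagle', './README.md', ...]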
kagle/kagle_metric.py (view file @ 192682ad)

@@ -199,6 +199,17 @@ class PaddleAUCMetric(Metric):
         self._metric_dict = params['metric_dict']
         fleet._role_maker._barrier_worker()
         result = self.get_global_metrics(scope, self._metric_dict)
+        if result['total_ins_num'] == 0:
+            self._result = result
+            self._result['auc'] = 0
+            self._result['bucket_error'] = 0
+            self._result['actual_ctr'] = 0
+            self._result['predict_ctr'] = 0
+            self._result['mae'] = 0
+            self._result['rmse'] = 0
+            self._result['copc'] = 0
+            self._result['mean_q'] = 0
+            return self._result
         if 'stat_pos' in result and 'stat_neg' in result:
             result['auc'] = self.calculate_auc(result['stat_pos'], result['stat_neg'])
             result['bucket_error'] = self.calculate_auc(result['stat_pos'], result['stat_neg'])
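The new early return guards the degenerate case where no worker processed any instance this pass; without it, ratio metrics derived from the aggregated counters would divide by zero. A minimal sketch of the idea ('click_num' is a hypothetical counter used only to show the division; names otherwise follow the diff):

def summarize(result):
    # `result` mimics the dict returned by get_global_metrics.
    if result['total_ins_num'] == 0:
        return dict(result, auc=0, actual_ctr=0)  # zero everything, skip ratios
    return dict(result, actual_ctr=result['click_num'] / result['total_ins_num'])

print(summarize({'total_ins_num': 0}))                  # zeroed, no ZeroDivisionError
print(summarize({'total_ins_num': 4, 'click_num': 1}))  # actual_ctr == 0.25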
kagle/kagle_util.py (view file @ 192682ad)

@@ -53,46 +53,43 @@ def make_datetime(date_str, fmt=None):
     return datetime.datetime.strptime(date_str, fmt)

-def wroker_numric_opt(value, opt):
+def wroker_numric_opt(value, env, opt):
     """
     numric count opt for workers
     Args:
         value: value for count
+        env: mpi/gloo
         opt: count operator, SUM/MAX/MIN/AVG
     Return:
         count result
     """
     local_value = np.array([value])
     global_value = np.copy(local_value) * 0
-    fleet._role_maker._node_type_comm.Allreduce(local_value, global_value, op=opt)
+    fleet._role_maker.all_reduce_worker(local_value, global_value, opt)
     return global_value[0]

-def worker_numric_sum(value):
+def worker_numric_sum(value, env="mpi"):
     """R
     """
-    from mpi4py import MPI
-    return wroker_numric_opt(value, MPI.SUM)
+    return wroker_numric_opt(value, env, "sum")

-def worker_numric_avg(value):
+def worker_numric_avg(value, env="mpi"):
     """R
     """
-    return worker_numric_sum(value) / fleet.worker_num()
+    return worker_numric_sum(value, env) / fleet.worker_num()

-def worker_numric_min(value):
+def worker_numric_min(value, env="mpi"):
     """R
     """
-    from mpi4py import MPI
-    return wroker_numric_opt(value, MPI.MIN)
+    return wroker_numric_opt(value, env, "min")

-def worker_numric_max(value):
+def worker_numric_max(value, env="mpi"):
     """R
     """
-    from mpi4py import MPI
-    return wroker_numric_opt(value, MPI.MAX)
+    return wroker_numric_opt(value, env, "max")

 def rank0_print(log_str):

@@ -267,7 +264,6 @@ class TimeTrainPass(object):
             self._pass_id = pass_id
             mins = self._interval_per_pass * (pass_id - 1)
             self._current_train_time = date_time + datetime.timedelta(minutes=mins)
-            print(self._current_train_time)

     def init_pass_by_time(self, datetime_str):
         """
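The refactor routes every reduction through fleet._role_maker.all_reduce_worker, so the helpers no longer import mpi4py directly and the backend (MPI or Gloo) is selected by the role maker from the env string. A single-process stub of the call's in/out contract (illustration only; the real call reduces across all workers):

import numpy as np

def all_reduce_worker_stub(local_value, global_value, op):
    # Single-worker stand-in: the reduction of one value is the value itself.
    # A real role maker combines `local_value` from every worker under the
    # given op ("sum"/"max"/"min") and writes the result into `global_value`.
    np.copyto(global_value, local_value)

def worker_numric_sum_stub(value, env="mpi"):
    local_value = np.array([value])
    global_value = np.copy(local_value) * 0
    all_reduce_worker_stub(local_value, global_value, "sum")
    return global_value[0]

assert worker_numric_sum_stub(3.0, env="gloo") == 3.0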
kagle/trainer/abacus_trainer.py (view file @ 192682ad)

@@ -16,6 +16,7 @@ import kagle.kagle_metric as kagle_metric
 import kagle.kagle_dataset as kagle_dataset
 import kagle.trainer.kagle_trainer as kagle_trainer
 from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker

 class AbacusPaddleTrainer(kagle_trainer.Trainer):
     """R
@@ -52,7 +53,14 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
     def init(self, context):
         """R
         """
-        fleet.init(self._exe)
+        role_maker = None
+        if self.global_config.get('process_mode', 'mpi') == 'brilliant_cpu':
+            afs_config = self.global_config['io']['afs']
+            role_maker = fluid.incubate.fleet.base.role_maker.GeneralRoleMaker(
+                hdfs_name=afs_config['fs_name'], hdfs_ugi=afs_config['fs_ugi'],
+                path=self.global_config['output_path'] + "/gloo",
+                init_timeout_seconds=1200, run_timeout_seconds=1200)
+        fleet.init(role_maker)
         data_var_list = []
         data_var_name_dict = {}
         runnnable_scope = []
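For reference, a hypothetical global_config fragment that would take the new Gloo branch above; the key layout is implied by the diff, all values are placeholders:

# Placeholders only; the real values come from the job's configuration.
global_config = {
    'process_mode': 'brilliant_cpu',  # any other value keeps role_maker=None (MPI path)
    'output_path': 'afs://example-cluster/output',  # "/gloo" is appended for rendezvous files
    'io': {
        'afs': {
            'fs_name': 'afs://example-cluster',
            'fs_ugi': 'user,password',
        },
    },
}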
@@ -136,6 +144,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
             save_mode = 3  # unseen_day++, save all
         kagle_util.rank0_print("going to save_model %s" % model_path)
         fleet.save_persistables(None, model_path, mode=save_mode)
+        if fleet._role_maker.is_first_worker():
             self._train_pass.save_train_progress(day, pass_index, base_key, model_path, is_checkpoint=True)
         cost_printer.done()
         return model_path

@@ -180,6 +189,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
         }
         cost_printer = kagle_util.CostPrinter(kagle_util.print_cost,
                                               {'master': True, 'log_format': 'save dense model cost %s sec', 'stdout': stdout_str})
+        if fleet._role_maker.is_first_worker():
             for executor in self.global_config['executor']:
                 if 'layer_for_inference' not in executor:
                     continue

@@ -191,6 +201,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
                 for dnn_layer in executor['layer_for_inference']:
                     model_file_handler.cp(dnn_layer['save_file_name'],
                                           model_path + '/dnn_plugin/' + dnn_layer['save_file_name'])
+        fleet._role_maker._barrier_worker()
         cost_printer.done()
         xbox_done_info = {

@@ -206,9 +217,11 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
             "job_id": kagle_util.get_env_value("JOB_ID"),
             "job_name": kagle_util.get_env_value("JOB_NAME")
         }
+        if fleet._role_maker.is_first_worker():
             model_file_handler.write(json.dumps(xbox_done_info) + "\n", xbox_model_donefile, 'a')
             if pass_index > 0:
                 self._train_pass.save_train_progress(day, pass_index, xbox_base_key, model_path, is_checkpoint=False)
+        fleet._role_maker._barrier_worker()
         return stdout_str

     def run_executor(self, executor_config, dataset, stdout_str):
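The hunks above all follow one idiom, which is the "barrier bug" fix in the commit message: side-effectful work (donefile writes, progress saves) runs only on the first worker, and every worker then meets at a barrier so none races ahead while rank 0 is still writing. A minimal sketch of the idiom with a stub role maker (the real calls are fleet._role_maker.is_first_worker() and _barrier_worker()):

class RoleMakerStub(object):
    # Single-process stand-in for fleet._role_maker (illustration only).
    def is_first_worker(self):
        return True   # a real role maker returns True on exactly one worker

    def _barrier_worker(self):
        pass          # a real role maker blocks until every worker arrives

role_maker = RoleMakerStub()
if role_maker.is_first_worker():
    print("write donefile / save progress")  # runs on rank 0 only
role_maker._barrier_worker()                 # all ranks sync before continuing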
@@ -239,6 +252,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
         kagle_util.rank0_print("End " + executor_name + " pass")
         if self._train_pass.need_dump_inference(pass_id) and executor_config['dump_inference_model']:
             stdout_str += self.save_xbox_model(day, pass_id, xbox_base_key, monitor_data)
+        fleet._role_maker._barrier_worker()

     def startup(self, context):
         """R

@@ -305,6 +319,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
             kagle_util.rank0_print("going to save batch model")
             self.save_model(next_date, 0, xbox_base_key)
             self._train_pass._base_key = xbox_base_key
+        fleet._role_maker._barrier_worker()

     def train_pass(self, context):
         """R

@@ -325,6 +340,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
                 'node_num': fleet.worker_num(),
                 'node_idx': fleet.worker_index(),
                 'begin_time': pass_time,
                 'time_window_min': self._train_pass._interval_per_pass})
+        fleet._role_maker._barrier_worker()
         cost_printer.done()
         kagle_util.rank0_print("going to global shuffle")

@@ -335,6 +351,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
             current_dataset[name].global_shuffle(fleet, self.global_config['dataset']['shuffle_thread'])
         cost_printer.done()
         # str(dataset.get_shuffle_data_size(fleet))
+        fleet._role_maker._barrier_worker()
         if self.global_config['prefetch_data']:
             next_pass_time = (self._train_pass._current_train_time +

@@ -345,6 +362,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
                 'begin_time': next_pass_time,
                 'time_window_min': self._train_pass._interval_per_pass})
+        fleet._role_maker._barrier_worker()
         pure_train_begin = time.time()
         for executor in self.global_config['executor']:
             self.run_executor(executor, current_dataset[executor['dataset_name']], stdout_str)

@@ -367,6 +385,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
         kagle_util.rank0_print(log_str)
         stdout_str += kagle_util.now_time_str() + log_str
         sys.stdout.write(stdout_str)
+        fleet._role_maker._barrier_worker()
         stdout_str = ""
         if pass_id == self._train_pass.max_pass_num_day():
             context['status'] = 'end_day'