BaiXuePrincess / PaddleRec (forked from PaddlePaddle / PaddleRec)
Commit 192682ad
Authored Mar 23, 2020 by xiexionghang
add gloo support and fix some barrier bug
Parent: 520c7780
Showing 4 changed files with 59 additions and 34 deletions (+59 -34):

    kagle/kagle_fs.py                  +2   -3
    kagle/kagle_metric.py              +11  -0
    kagle/kagle_util.py                +11  -15
    kagle/trainer/abacus_trainer.py    +35  -16
kagle/kagle_fs.py
@@ -133,9 +133,7 @@ class FileHandler(object):
         """R
         """
         if is_afs_path(path):
-            print("xxh go cat " + path)
             hdfs_cat = self._hdfs_client.cat(path)
-            print(hdfs_cat)
             return hdfs_cat
         else:
             return self._local_fs_client.cat(path)

@@ -146,9 +144,10 @@ class FileHandler(object):
         files = []
         if is_afs_path(path):
             files = self._hdfs_client.ls(path)
+            files = [path + '/' + self.get_file_name(fi) for fi in files] # absulte path
         else:
             files = self._local_fs_client.ls(path)
-        files = [path + '/' + fi for fi in files]
+            files = [path + '/' + fi for fi in files] # absulte path
         return files

     def cp(self, org_path, dest_path):
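Note: the reshaped ls() now joins the directory onto each entry inside the branch that produced it, since the AFS client returns full entries that need get_file_name() trimming while the local client returns bare names. A toy sketch of that behavior, assuming hypothetical stand-ins for the two clients (posixpath.basename stands in for get_file_name):

import posixpath

# Toy model of FileHandler.ls() after this change; the 'entries' lists stand
# in for self._hdfs_client.ls() / self._local_fs_client.ls() results.
def ls(path, entries, is_afs):
    if is_afs:
        return [path + '/' + posixpath.basename(e) for e in entries]  # absolute path
    return [path + '/' + e for e in entries]  # absolute path

print(ls('afs://cluster/data', ['afs://cluster/data/part-00000'], is_afs=True))
print(ls('/home/work/data', ['part-00000'], is_afs=False))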
kagle/kagle_metric.py
@@ -199,6 +199,17 @@ class PaddleAUCMetric(Metric):
         self._metric_dict = params['metric_dict']
         fleet._role_maker._barrier_worker()
         result = self.get_global_metrics(scope, self._metric_dict)
+        if result['total_ins_num'] == 0:
+            self._result = result
+            self._result['auc'] = 0
+            self._result['bucket_error'] = 0
+            self._result['actual_ctr'] = 0
+            self._result['predict_ctr'] = 0
+            self._result['mae'] = 0
+            self._result['rmse'] = 0
+            self._result['copc'] = 0
+            self._result['mean_q'] = 0
+            return self._result
         if 'stat_pos' in result and 'stat_neg' in result:
             result['auc'] = self.calculate_auc(result['stat_pos'], result['stat_neg'])
             result['bucket_error'] = self.calculate_auc(result['stat_pos'], result['stat_neg'])
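Note: the new early return guards the degenerate pass where no instances were aggregated globally, pinning every derived metric to 0 instead of dividing by total_ins_num. A self-contained sketch of the same guard (the dict shape is assumed from the diff; the sample input is illustrative):

# Sketch of the zero-instance guard added above; the key list mirrors the
# assignments in PaddleAUCMetric.
ZERO_KEYS = ('auc', 'bucket_error', 'actual_ctr', 'predict_ctr',
             'mae', 'rmse', 'copc', 'mean_q')

def guard_empty_result(result):
    """Zero all derived metrics when no instances were counted this pass."""
    if result['total_ins_num'] == 0:
        for key in ZERO_KEYS:
            result[key] = 0
    return result

print(guard_empty_result({'total_ins_num': 0}))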
kagle/kagle_util.py
@@ -53,46 +53,43 @@ def make_datetime(date_str, fmt=None):
     return datetime.datetime.strptime(date_str, fmt)


-def wroker_numric_opt(value, opt):
+def wroker_numric_opt(value, env, opt):
     """
     numric count opt for workers
     Args:
         value: value for count
+        env: mpi/gloo
         opt: count operator, SUM/MAX/MIN/AVG
     Return:
         count result
     """
     local_value = np.array([value])
     global_value = np.copy(local_value) * 0
-    fleet._role_maker._node_type_comm.Allreduce(local_value, global_value, op=opt)
+    fleet._role_maker.all_reduce_worker(local_value, global_value, opt)
     return global_value[0]


-def worker_numric_sum(value):
+def worker_numric_sum(value, env="mpi"):
     """R
     """
-    from mpi4py import MPI
-    return wroker_numric_opt(value, MPI.SUM)
+    return wroker_numric_opt(value, env, "sum")


-def worker_numric_avg(value):
+def worker_numric_avg(value, env="mpi"):
     """R
     """
-    return worker_numric_sum(value) / fleet.worker_num()
+    return worker_numric_sum(value, env) / fleet.worker_num()


-def worker_numric_min(value):
+def worker_numric_min(value, env="mpi"):
     """R
     """
-    from mpi4py import MPI
-    return wroker_numric_opt(value, MPI.MIN)
+    return wroker_numric_opt(value, env, "min")


-def worker_numric_max(value):
+def worker_numric_max(value, env="mpi"):
     """R
     """
-    from mpi4py import MPI
-    return wroker_numric_opt(value, MPI.MAX)
+    return wroker_numric_opt(value, env, "max")


 def rank0_print(log_str):

@@ -267,7 +264,6 @@ class TimeTrainPass(object):
             self._pass_id = pass_id
             mins = self._interval_per_pass * (pass_id - 1)
             self._current_train_time = date_time + datetime.timedelta(minutes=mins)
-            print(self._current_train_time)

     def init_pass_by_time(self, datetime_str):
         """
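Note: the helpers now take an env flag and funnel every reduction through fleet._role_maker.all_reduce_worker with string op names, instead of importing mpi4py and calling Allreduce directly, so the same code path serves both MPI and gloo workers. A toy, communicator-free stand-in for the reduction (hypothetical; the real call reduces across distributed workers):

import numpy as np

# Toy stand-in for fleet._role_maker.all_reduce_worker: reduces a list of
# per-worker arrays with the same string-keyed ops used above.
def all_reduce_worker(worker_values, opt):
    stacked = np.stack(worker_values)
    ops = {"sum": stacked.sum, "min": stacked.min, "max": stacked.max}
    return ops[opt](axis=0)

# Each worker contributes np.array([value]), as wroker_numric_opt does.
per_worker = [np.array([3.0]), np.array([5.0]), np.array([4.0])]
print(all_reduce_worker(per_worker, "sum")[0])                    # worker_numric_sum -> 12.0
print(all_reduce_worker(per_worker, "sum")[0] / len(per_worker))  # worker_numric_avg -> 4.0
print(all_reduce_worker(per_worker, "min")[0])                    # worker_numric_min -> 3.0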
kagle/trainer/abacus_trainer.py
@@ -16,6 +16,7 @@ import kagle.kagle_metric as kagle_metric
 import kagle.kagle_dataset as kagle_dataset
 import kagle.trainer.kagle_trainer as kagle_trainer
 from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
+from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker

 class AbacusPaddleTrainer(kagle_trainer.Trainer):
     """R
@@ -52,7 +53,14 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
     def init(self, context):
         """R
         """
-        fleet.init(self._exe)
+        role_maker = None
+        if self.global_config.get('process_mode', 'mpi') == 'brilliant_cpu':
+            afs_config = self.global_config['io']['afs']
+            role_maker = fluid.incubate.fleet.base.role_maker.GeneralRoleMaker(
+                hdfs_name=afs_config['fs_name'], hdfs_ugi=afs_config['fs_ugi'],
+                path=self.global_config['output_path'] + "/gloo",
+                init_timeout_seconds=1200, run_timeout_seconds=1200)
+        fleet.init(role_maker)
         data_var_list = []
         data_var_name_dict = {}
         runnnable_scope = []
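Note: under process_mode 'brilliant_cpu' the trainer rendezvouses workers over gloo through a shared AFS path rather than the default MPI role maker. A minimal sketch of the same wiring with an illustrative config (the fs_name/fs_ugi/output_path values are placeholders; the keyword arguments come straight from the diff):

# Sketch of the gloo-mode init; config values are placeholders.
from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet

global_config = {
    'process_mode': 'brilliant_cpu',   # anything else keeps the MPI default
    'io': {'afs': {'fs_name': 'afs://example.afs.com:9902',
                   'fs_ugi': 'user,password'}},
    'output_path': 'afs://example.afs.com:9902/app/output',
}

role_maker = None
if global_config.get('process_mode', 'mpi') == 'brilliant_cpu':
    afs = global_config['io']['afs']
    role_maker = GeneralRoleMaker(            # gloo rendezvous via shared files
        hdfs_name=afs['fs_name'], hdfs_ugi=afs['fs_ugi'],
        path=global_config['output_path'] + "/gloo",
        init_timeout_seconds=1200, run_timeout_seconds=1200)
fleet.init(role_maker)  # None -> fleet falls back to its default role maker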
@@ -136,6 +144,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
             save_mode = 3 # unseen_day++, save all
         kagle_util.rank0_print("going to save_model %s" % model_path)
         fleet.save_persistables(None, model_path, mode=save_mode)
+        if fleet._role_maker.is_first_worker():
             self._train_pass.save_train_progress(day, pass_index, base_key, model_path, is_checkpoint=True)
         cost_printer.done()
         return model_path

@@ -180,6 +189,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
         }
         cost_printer = kagle_util.CostPrinter(kagle_util.print_cost,
                                               {'master': True, 'log_format': 'save dense model cost %s sec', 'stdout': stdout_str})
+        if fleet._role_maker.is_first_worker():
             for executor in self.global_config['executor']:
                 if 'layer_for_inference' not in executor:
                     continue

@@ -191,6 +201,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
                 for dnn_layer in executor['layer_for_inference']:
                     model_file_handler.cp(dnn_layer['save_file_name'],
                                           model_path + '/dnn_plugin/' + dnn_layer['save_file_name'])
+        fleet._role_maker._barrier_worker()
         cost_printer.done()

         xbox_done_info = {
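Note: the recurring pattern in this hunk and the ones that follow is rank-0-only side effects (is_first_worker()) fenced by a _barrier_worker(), so no worker proceeds until the write is visible; this is the "barrier bug" the commit message refers to. A toy thread-based illustration of why the fence matters (threading.Barrier stands in for fleet's worker barrier):

import threading

# Two "workers": rank 0 writes the donefile, everyone barriers, then reads.
barrier = threading.Barrier(2)        # stand-in for _barrier_worker()
donefile_written = threading.Event()  # stand-in for the donefile on AFS

def worker(rank):
    if rank == 0:                     # is_first_worker(): only rank 0 writes
        donefile_written.set()
    barrier.wait()                    # the added barrier: wait for the write
    assert donefile_written.is_set()  # safe on every rank after the barrier

threads = [threading.Thread(target=worker, args=(r,)) for r in (0, 1)]
for t in threads: t.start()
for t in threads: t.join()
print("all workers saw the donefile")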
@@ -206,9 +217,11 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
             "job_id": kagle_util.get_env_value("JOB_ID"),
             "job_name": kagle_util.get_env_value("JOB_NAME")
         }
+        if fleet._role_maker.is_first_worker():
             model_file_handler.write(json.dumps(xbox_done_info) + "\n", xbox_model_donefile, 'a')
             if pass_index > 0:
                 self._train_pass.save_train_progress(day, pass_index, xbox_base_key, model_path, is_checkpoint=False)
+        fleet._role_maker._barrier_worker()
         return stdout_str

     def run_executor(self, executor_config, dataset, stdout_str):
@@ -239,6 +252,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
         kagle_util.rank0_print("End " + executor_name + " pass")
         if self._train_pass.need_dump_inference(pass_id) and executor_config['dump_inference_model']:
             stdout_str += self.save_xbox_model(day, pass_id, xbox_base_key, monitor_data)
+        fleet._role_maker._barrier_worker()

     def startup(self, context):
         """R

@@ -305,6 +319,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
             kagle_util.rank0_print("going to save batch model")
             self.save_model(next_date, 0, xbox_base_key)
             self._train_pass._base_key = xbox_base_key
+        fleet._role_maker._barrier_worker()

     def train_pass(self, context):
         """R

@@ -325,6 +340,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
             'node_num': fleet.worker_num(), 'node_idx': fleet.worker_index(),
             'begin_time': pass_time, 'time_window_min': self._train_pass._interval_per_pass})
+        fleet._role_maker._barrier_worker()
         cost_printer.done()
         kagle_util.rank0_print("going to global shuffle")

@@ -335,6 +351,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
             current_dataset[name].global_shuffle(fleet, self.global_config['dataset']['shuffle_thread'])
         cost_printer.done()
         # str(dataset.get_shuffle_data_size(fleet))
+        fleet._role_maker._barrier_worker()
         if self.global_config['prefetch_data']:
             next_pass_time = (self._train_pass._current_train_time +

@@ -345,6 +362,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
                 'begin_time': next_pass_time, 'time_window_min': self._train_pass._interval_per_pass})
+            fleet._role_maker._barrier_worker()
         pure_train_begin = time.time()
         for executor in self.global_config['executor']:
             self.run_executor(executor, current_dataset[executor['dataset_name']], stdout_str)

@@ -367,6 +385,7 @@ class AbacusPaddleTrainer(kagle_trainer.Trainer):
             kagle_util.rank0_print(log_str)
             stdout_str += kagle_util.now_time_str() + log_str
             sys.stdout.write(stdout_str)
+            fleet._role_maker._barrier_worker()
             stdout_str = ""
             if pass_id == self._train_pass.max_pass_num_day():
                 context['status'] = 'end_day'