Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
7bd16e3a
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
7bd16e3a
编写于
12月 12, 2018
作者:
H
heqiaozhi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix some bug & add log
上级
5d3ecbfd
变更
5
显示空白变更内容
内联
并排
Showing
5 changed file
with
27 addition
and
13 deletion
+27
-13
paddle/fluid/framework/async_executor.cc
paddle/fluid/framework/async_executor.cc
+1
-1
paddle/fluid/framework/executor_thread_worker.cc
paddle/fluid/framework/executor_thread_worker.cc
+19
-9
paddle/fluid/framework/executor_thread_worker.h
paddle/fluid/framework/executor_thread_worker.h
+1
-1
python/paddle/fluid/async_executor.py
python/paddle/fluid/async_executor.py
+2
-1
python/paddle/fluid/contrib/utils/hdfs_utils.py
python/paddle/fluid/contrib/utils/hdfs_utils.py
+4
-1
未找到文件。
paddle/fluid/framework/async_executor.cc
浏览文件 @
7bd16e3a
...
...
@@ -111,7 +111,7 @@ void AsyncExecutor::InitParamConfig() {
std
::
vector
<
std
::
string
>
tmp_sparse_variable_name
;
for
(
int
i
=
0u
;
i
<
table
.
slot_value_size
();
++
i
)
{
tmp_sparse_variable_name
.
push_back
(
table
.
slot_value
(
i
));
_param_config
.
slot_alias_to_table
[
table
.
slot_
value
(
i
)]
=
table
.
table_id
();
_param_config
.
slot_alias_to_table
[
table
.
slot_
key
(
i
)]
=
table
.
table_id
();
}
std
::
vector
<
std
::
string
>
tmp_sparse_gradient_variable_name
;
for
(
auto
i
=
0u
;
i
<
table
.
slot_gradient_size
();
++
i
)
{
...
...
paddle/fluid/framework/executor_thread_worker.cc
浏览文件 @
7bd16e3a
...
...
@@ -330,6 +330,7 @@ void AsyncExecutorThreadWorker::TrainFiles() {
print_fetch_var
(
thread_scope_
,
fetch_var_names_
[
i
]);
}
// end for (int i = 0...)
}
// end while ()
LOG
(
ERROR
)
<<
"TRAIN DONE"
;
}
void
AsyncExecutorThreadWorker
::
SetPSlibPtr
(
std
::
shared_ptr
<
paddle
::
distributed
::
PSlib
>
pslib_ptr
)
{
...
...
@@ -572,24 +573,29 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) {
auto
slot_dim
=
_param_config
->
slot_dim
;
//TODO
auto
fea_dim
=
_param_config
->
fea_dim
;
//_current_train_job.fea_dim();TODO
auto
&
features
=
_features
[
table_id
];
CHECK
(
features
.
size
()
<
1000000
)
<<
"features size:"
<<
features
.
size
();
//std::vector<std::string> gradient_var;
//auto& gradient_var = GlobalConfig::instance().input_gradient_variable_name; //TODO
auto
&
push_g
=
_feature_push_value
[
table_id
];
check_pull_push_memory
(
features
,
push_g
,
fea_dim
);
CHECK
(
push_g
.
size
()
==
features
.
size
()
+
1
)
<<
"push_g size:"
<<
push_g
.
size
()
<<
" features size:"
<<
features
.
size
();
uint64_t
fea_idx
=
0u
;
auto
&
fea_info
=
_fea_info
[
table_id
];
//TODO
auto
&
fea_info
=
_fea_info
[
table_id
];
int
offset
=
0
;
//if (!_current_train_job.use_cvm_feature()) { //TODO
offset
=
2
;
//}
const
std
::
vector
<
std
::
string
>&
feed_vec
=
thread_reader_
->
GetUseSlotAlias
();
// slot_idx = 0 is label TODO
for
(
auto
slot_idx
=
1u
;
slot_idx
<
feed_vec
.
size
();
++
slot_idx
)
{
if
(
_param_config
->
slot_alias_to_table
[
feed_vec
[
slot_idx
]]
!=
table_id
)
{
if
(
_param_config
->
slot_alias_to_table
.
find
(
feed_vec
[
slot_idx
])
==
_param_config
->
slot_alias_to_table
.
end
())
{
LOG
(
ERROR
)
<<
"ERROR slot_idx:"
<<
slot_idx
<<
" name:"
<<
feed_vec
[
slot_idx
];
}
else
if
(
_param_config
->
slot_alias_to_table
[
feed_vec
[
slot_idx
]]
!=
table_id
)
{
LOG
(
ERROR
)
<<
"ERROR continue"
;
continue
;
}
Variable
*
g_var
=
thread_scope_
->
FindVar
(
_param_config
->
gradient_var
[
table_id
][
slot_idx
-
1
]);
CHECK
(
g_var
!=
nullptr
)
<<
"var["
<<
_param_config
->
gradient_var
[
table_id
][
slot_idx
-
1
]
<<
"] not found"
;
LoDTensor
*
g_tensor
=
g_var
->
GetMutable
<
LoDTensor
>
();
if
(
g_tensor
==
NULL
)
{
LOG
(
ERROR
)
<<
"var["
<<
_param_config
->
gradient_var
[
table_id
][
slot_idx
-
1
]
<<
"] not found"
;
...
...
@@ -598,13 +604,16 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) {
float
*
g
=
g_tensor
->
data
<
float
>
();
Variable
*
var
=
thread_scope_
->
FindVar
(
feed_vec
[
slot_idx
]);
CHECK
(
var
!=
nullptr
)
<<
"var["
<<
feed_vec
[
slot_idx
]
<<
"] not found"
;
LoDTensor
*
tensor
=
var
->
GetMutable
<
LoDTensor
>
();
if
(
tensor
==
NULL
)
{
LOG
(
ERROR
)
<<
"var["
<<
feed_vec
[
slot_idx
]
<<
"] not found"
;
exit
(
-
1
);
}
int
len
=
tensor
->
lod
()[
0
].
back
();
assert
(
slot_dim
*
len
==
g_tensor
->
numel
());
//int len = tensor->lod()[0].back();
int
len
=
tensor
->
numel
();
CHECK
(
slot_dim
*
len
==
g_tensor
->
numel
())
<<
"len:"
<<
len
<<
" g_numel:"
<<
g_tensor
->
numel
();
CHECK
(
len
==
tensor
->
numel
())
<<
"len:"
<<
len
<<
"t_numel:"
<<
tensor
->
numel
();
int64_t
*
ids
=
tensor
->
data
<
int64_t
>
();
for
(
auto
id_idx
=
0u
;
id_idx
<
len
;
++
id_idx
){
if
(
ids
[
id_idx
]
==
0
)
{
...
...
@@ -613,12 +622,13 @@ void AsyncExecutorThreadWorker::PushSparse(int table_id) {
}
memcpy
(
push_g
[
fea_idx
].
data
()
+
offset
,
g
,
sizeof
(
float
)
*
slot_dim
);
push_g
[
fea_idx
][
0
]
=
1.0
f
;
CHECK
(
fea_idx
<
fea_info
.
size
())
<<
"fea_idx:"
<<
fea_idx
<<
" size:"
<<
fea_info
.
size
();
push_g
[
fea_idx
][
1
]
=
static_cast
<
float
>
(
fea_info
[
fea_idx
].
label
);
g
+=
slot_dim
;
fea_idx
++
;
}
}
assert
(
fea_idx
==
features
.
size
()
);
CHECK
(
fea_idx
==
features
.
size
())
<<
"fea_idx:"
<<
fea_idx
<<
" features size:"
<<
features
.
size
(
);
CHECK
(
features
.
size
()
>
0
);
std
::
vector
<
float
*>
push_g_vec
;
...
...
paddle/fluid/framework/executor_thread_worker.h
浏览文件 @
7bd16e3a
...
...
@@ -49,7 +49,7 @@ struct AsyncWorkerParamConfig {
std
::
vector
<
int
>
sparse_table_id
;
std
::
map
<
uint64_t
,
std
::
vector
<
std
::
string
>>
slot_input_vec
;
//6048slot 6050slot //name
std
::
map
<
uint64_t
,
std
::
vector
<
std
::
string
>>
gradient_var
;
//6048slot_embed
std
::
unordered_
map
<
std
::
string
,
uint64_t
>
slot_alias_to_table
;
//TODO done
std
::
map
<
std
::
string
,
uint64_t
>
slot_alias_to_table
;
//TODO done
};
struct
DensePullThreadParam
{
...
...
python/paddle/fluid/async_executor.py
浏览文件 @
7bd16e3a
...
...
@@ -153,7 +153,7 @@ class AsyncExecutor(object):
data_feed
.
desc
(),
filelist
,
thread_num
,
fetch_var_names
,
mode
,
debug
)
def
download_data
(
self
,
afs_path
,
local_path
,
fs_default_name
,
ugi
,
hadoop_home
=
"$HADOOP_HOME"
,
process_num
=
12
):
def
download_data
(
self
,
afs_path
,
local_path
,
fs_default_name
,
ugi
,
file_cnt
,
hadoop_home
=
"$HADOOP_HOME"
,
process_num
=
12
):
if
self
.
instance
is
None
:
raise
ValueError
(
'instance is None, please run config_distributed_nodes init instance'
)
...
...
@@ -169,6 +169,7 @@ class AsyncExecutor(object):
local_path
,
self
.
instance
.
get_worker_index
(),
self
.
instance
.
get_node_cnt
()
/
2
,
file_cnt
,
multi_processes
=
process_num
)
#self.instance.barrier_all() #wait for download_data #TODO only barriere worker
self
.
instance
.
barrier_worker
()
#wait for download_data #TODO only barriere worker
...
...
python/paddle/fluid/contrib/utils/hdfs_utils.py
浏览文件 @
7bd16e3a
...
...
@@ -427,6 +427,7 @@ def multi_download(client,
local_path
,
trainer_id
,
trainers
,
file_cnt
,
multi_processes
=
5
):
"""
multi_download
...
...
@@ -435,6 +436,7 @@ def multi_download(client,
:param local_path: path on local
:param trainer_id: current trainer id
:param trainers: all trainers number
:param file_cnt: all file number
:param multi_processes: the download data process at the same time, default=5
:return: None
"""
...
...
@@ -450,7 +452,7 @@ def multi_download(client,
client
.
make_local_dirs
(
local_path
)
_logger
.
info
(
"Make local dir {} successfully"
.
format
(
local_path
))
all_need_download
=
client
.
lsr
(
hdfs_path
,
sort
=
True
)
all_need_download
=
client
.
lsr
(
hdfs_path
,
sort
=
True
)
[:
file_cnt
]
need_download
=
all_need_download
[
trainer_id
::
trainers
]
_logger
.
info
(
"Get {} files From all {} files need to be download from {}"
.
format
(
len
(
need_download
),
len
(
all_need_download
),
hdfs_path
))
...
...
@@ -501,6 +503,7 @@ if __name__ == "__main__":
"/home/xx/data1"
,
1
,
5
,
100
,
multi_processes
=
5
)
multi_upload
(
client
,
"/user/com/train-25/model"
,
"/home/xx/data1"
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录