PaddlePaddle / PaddleRec

Commit b4063f51
Authored August 20, 2020 by wangjiawei04
Parent commit: 12fc8c82

add inmemory dataset
Showing 3 changed files, with 156 additions and 5 deletions (+156 / -5):

  core/trainers/framework/dataset.py   +67  -2
  core/trainers/framework/network.py   +21  -3
  models/rank/dnn/hdfs_config.yaml     +68  -0
core/trainers/framework/dataset.py

@@ -22,9 +22,9 @@ from paddlerec.core.utils import dataloader_instance
 from paddlerec.core.reader import SlotReader
 from paddlerec.core.trainer import EngineMode
 from paddlerec.core.utils.util import split_files
+from paddle.fluid.contrib.utils.hdfs_utils import HDFSClient

-__all__ = ["DatasetBase", "DataLoader", "QueueDataset"]
+__all__ = ["DatasetBase", "DataLoader", "QueueDataset", "InMemoryDataset"]


 class DatasetBase(object):
     """R
@@ -151,3 +151,68 @@ class QueueDataset(DatasetBase):
                 dataset.set_use_var(inputs)
                 break
         return dataset
+
+
+class InMemoryDataset(QueueDataset):
+    def _get_dataset(self, dataset_name, context):
+        with open("context.txt", "w+") as fout:
+            fout.write(str(context))
+        name = "dataset." + dataset_name + "."
+        reader_class = envs.get_global_env(name + "data_converter")
+        reader_class_name = envs.get_global_env(name + "reader_class_name",
+                                                "Reader")
+        abs_dir = os.path.dirname(os.path.abspath(__file__))
+        reader = os.path.join(abs_dir, '../../utils', 'dataset_instance.py')
+        sparse_slots = envs.get_global_env(name + "sparse_slots", "").strip()
+        dense_slots = envs.get_global_env(name + "dense_slots", "").strip()
+        for dataset_config in context["env"]["dataset"]:
+            if dataset_config["type"] == "InMemoryDataset":
+                hdfs_addr = dataset_config["hdfs_addr"]
+                hdfs_ugi = dataset_config["hdfs_ugi"]
+                hadoop_home = dataset_config["hadoop_home"]
+        if hdfs_addr is None or hdfs_ugi is None:
+            raise ValueError("hdfs_addr and hdfs_ugi not set")
+        if sparse_slots == "" and dense_slots == "":
+            pipe_cmd = "python {} {} {} {}".format(reader, reader_class,
+                                                   reader_class_name,
+                                                   context["config_yaml"])
+        else:
+            if sparse_slots == "":
+                sparse_slots = "?"
+            if dense_slots == "":
+                dense_slots = "?"
+            padding = envs.get_global_env(name + "padding", 0)
+            pipe_cmd = "python {} {} {} {} {} {} {} {}".format(
+                reader, "slot", "slot", context["config_yaml"], "fake",
+                sparse_slots.replace(" ", "?"),
+                dense_slots.replace(" ", "?"), str(padding))
+        batch_size = envs.get_global_env(name + "batch_size")
+        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+        dataset.set_batch_size(batch_size)
+        dataset.set_pipe_command(pipe_cmd)
+        dataset.set_hdfs_config(hdfs_addr, hdfs_ugi)
+        train_data_path = envs.get_global_env(name + "data_path")
+        hdfs_configs = {
+            "fs.default.name": hdfs_addr,
+            "hadoop.job.ugi": hdfs_ugi
+        }
+        hdfs_client = HDFSClient(hadoop_home, hdfs_configs)
+        file_list = [
+            "{}/{}".format(hdfs_addr, x)
+            for x in hdfs_client.lsr(train_data_path)
+        ]
+        if context["engine"] == EngineMode.LOCAL_CLUSTER:
+            file_list = split_files(file_list,
+                                    context["fleet"].worker_index(),
+                                    context["fleet"].worker_num())
+        dataset.set_filelist(file_list)
+        for model_dict in context["phases"]:
+            if model_dict["dataset_name"] == dataset_name:
+                model = context["model"][model_dict["name"]]["model"]
+                thread_num = int(model_dict["thread_num"])
+                dataset.set_thread(thread_num)
+                if context["is_infer"]:
+                    inputs = model._infer_data_var
+                else:
+                    inputs = model._data_var
+                dataset.set_use_var(inputs)
+                dataset.load_into_memory()
+                break
+        return dataset
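
For orientation: `_get_dataset` above only builds the pipeline and pulls the HDFS files into memory via load_into_memory(); a trainer then drives the returned dataset through the fluid executor. Below is a minimal consumption sketch, assuming the Paddle 1.x fluid API; `run_epochs`, `main_program`, and `epochs` are illustrative stand-ins, not names from this commit.

# Sketch only: how a trainer might consume the dataset returned above.
import paddle.fluid as fluid

def run_epochs(dataset, main_program, epochs=4):
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    for _ in range(epochs):
        # the data was fetched from HDFS once by load_into_memory();
        # local_shuffle() merely reorders the buffered examples
        dataset.local_shuffle()
        exe.train_from_dataset(program=main_program, dataset=dataset)
    dataset.release_memory()  # free the in-memory buffer afterwards

This ability to shuffle between epochs without re-reading from HDFS is the main practical difference from QueueDataset, which streams each pass from the pipe command.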
core/trainers/framework/network.py

@@ -19,7 +19,7 @@ import warnings
 import paddle.fluid as fluid

 from paddlerec.core.utils import envs
-from paddlerec.core.trainers.framework.dataset import DataLoader, QueueDataset
+from paddlerec.core.trainers.framework.dataset import DataLoader, QueueDataset, InMemoryDataset

 __all__ = [
     "NetworkBase", "SingleNetwork", "PSNetwork", "PslibNetwork",
@@ -105,7 +105,11 @@ class SingleNetwork(NetworkBase):
                 context["dataset"][dataset["name"]] = dataset_class.create_dataset(
                     dataset["name"], context)
+            elif type == "InMemoryDataset":
+                dataset_class = InMemoryDataset(context)
+                context["dataset"][dataset["name"]] = dataset_class.create_dataset(
+                    dataset["name"], context)

         context["status"] = "startup_pass"
@@ -187,7 +191,11 @@ class FineTuningNetwork(NetworkBase):
                 context["dataset"][dataset["name"]] = dataset_class.create_dataset(
                     dataset["name"], context)
+            elif type == "InMemoryDataset":
+                dataset_class = InMemoryDataset(context)
+                context["dataset"][dataset["name"]] = dataset_class.create_dataset(
+                    dataset["name"], context)

         context["status"] = "startup_pass"
@@ -250,6 +258,11 @@ class PSNetwork(NetworkBase):
                 context["dataset"][dataset["name"]] = dataset_class.create_dataset(
                     dataset["name"], context)
+            elif type == "InMemoryDataset":
+                dataset_class = InMemoryDataset(context)
+                context["dataset"][dataset["name"]] = dataset_class.create_dataset(
+                    dataset["name"], context)

             context["status"] = "startup_pass"

     def _build_strategy(self, context):
@@ -348,6 +361,11 @@ class PslibNetwork(NetworkBase):
                 context["dataset"][dataset["name"]] = dataset_class.create_dataset(
                     dataset["name"], context)
+            elif type == "InMemoryDataset":
+                dataset_class = InMemoryDataset(context)
+                context["dataset"][dataset["name"]] = dataset_class.create_dataset(
+                    dataset["name"], context)

             context["status"] = "startup_pass"

     def _server(self, context):
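
All four network classes gain the same branch, so the change reads as one dispatch rule: the `type` field of each entry in the YAML `dataset:` list now accepts "InMemoryDataset" alongside the existing values. Condensed into a sketch (the `DATASET_TYPES` map and `build_dataset` helper are illustrative only; the class names are from this commit):

# Condensed view of the branch added to SingleNetwork, FineTuningNetwork,
# PSNetwork, and PslibNetwork above.
from paddlerec.core.trainers.framework.dataset import QueueDataset, InMemoryDataset

DATASET_TYPES = {
    "QueueDataset": QueueDataset,
    "InMemoryDataset": InMemoryDataset,  # new in this commit
}

def build_dataset(dataset_cfg, context):
    # dataset_cfg is one entry of the YAML `dataset:` list
    dataset_class = DATASET_TYPES[dataset_cfg["type"]](context)
    return dataset_class.create_dataset(dataset_cfg["name"], context)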
models/rank/dnn/hdfs_config.yaml (new file, mode 100755)

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# workspace
workspace: "models/rank/dnn"

# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
  batch_size: 2
  type: InMemoryDataset # or DataLoader
  data_path: "/user/paddle/wangjiawei04/paddlerec/dnn"
  hdfs_addr: "afs://yinglong.afs.baidu.com:9902"
  hdfs_ugi: "paddle,paddle"
  hadoop_home: "~/.ndt/software/hadoop-xingtian/hadoop/"
  sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
  dense_slots: "dense_var:13"

# hyper parameters of user-defined network
hyper_parameters:
  # optimizer config
  optimizer:
    class: Adam
    learning_rate: 0.001
    strategy: async
  # user-defined <key, value> pairs
  sparse_inputs_slots: 27
  sparse_feature_number: 1000001
  sparse_feature_dim: 9
  dense_input_dim: 13
  fc_sizes: [512, 256, 128, 32]

# select runner by name
mode: [single_cpu_train]
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: single_cpu_train
  class: train
  # num of epochs
  epochs: 4
  # device to run training or infer
  device: cpu
  save_checkpoint_interval: 2 # save model interval of epochs
  save_inference_interval: 4 # save inference
  save_checkpoint_path: "increment_dnn" # save checkpoint path
  save_inference_path: "inference" # save inference path
  save_inference_feed_varnames: [] # feed vars of save inference
  save_inference_fetch_varnames: [] # fetch vars of save inference
  print_interval: 10
  phases: [phase1]

# runner will run all the phase in each epoch
phase:
- name: phase1
  model: "{workspace}/model.py" # user-defined model
  dataset_name: dataset_train # select dataset by name
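
With this file in place, training would be launched the usual way, e.g. `python -m paddlerec.run -m models/rank/dnn/hdfs_config.yaml` (assuming the standard paddlerec.run entry point). Note that the hdfs_addr, hdfs_ugi, hadoop_home, and data_path values above are site-specific examples and must be replaced with your own cluster settings.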