Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
e95cafd9
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e95cafd9
编写于
3月 25, 2019
作者:
X
xjqbest
提交者:
dongdaxiang
3月 29, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix code style & add dataset testcase
test=develop
上级
39362a84
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
186 addition
and
5 deletion
+186
-5
paddle/fluid/framework/data_set.cc
paddle/fluid/framework/data_set.cc
+2
-0
paddle/fluid/framework/data_set.h
paddle/fluid/framework/data_set.h
+8
-0
paddle/fluid/framework/executor_thread_worker.cc
paddle/fluid/framework/executor_thread_worker.cc
+1
-0
paddle/fluid/framework/io/fs.cc
paddle/fluid/framework/io/fs.cc
+1
-0
paddle/fluid/framework/io/fs.h
paddle/fluid/framework/io/fs.h
+1
-0
paddle/fluid/pybind/data_set_py.cc
paddle/fluid/pybind/data_set_py.cc
+5
-0
paddle/fluid/string/string_helper.h
paddle/fluid/string/string_helper.h
+1
-0
python/paddle/fluid/async_executor.py
python/paddle/fluid/async_executor.py
+23
-5
python/paddle/fluid/tests/unittests/test_dataset.py
python/paddle/fluid/tests/unittests/test_dataset.py
+144
-0
未找到文件。
paddle/fluid/framework/data_set.cc
浏览文件 @
e95cafd9
...
...
@@ -62,6 +62,8 @@ void DatasetImpl<T>::SetTrainerNum(int trainer_num) {
template
<
typename
T
>
void
DatasetImpl
<
T
>::
SetHdfsConfig
(
const
std
::
string
&
fs_name
,
const
std
::
string
&
fs_ugi
)
{
fs_name_
=
fs_name
;
fs_ugi_
=
fs_ugi
;
std
::
string
cmd
=
std
::
string
(
"hadoop fs"
);
cmd
+=
" -D fs.default.name="
+
fs_name
;
cmd
+=
" -D hadoop.job.ugi="
+
fs_ugi
;
...
...
paddle/fluid/framework/data_set.h
浏览文件 @
e95cafd9
...
...
@@ -20,6 +20,7 @@
#include <string>
#include <thread> // NOLINT
#include <vector>
#include <utility>
#include "paddle/fluid/framework/data_feed.h"
...
...
@@ -58,6 +59,8 @@ class Dataset {
virtual
int
GetThreadNum
()
=
0
;
// get worker num
virtual
int
GetTrainerNum
()
=
0
;
// get hdfs config
virtual
std
::
pair
<
std
::
string
,
std
::
string
>
GetHdfsConfig
()
=
0
;
// get data fedd desc
virtual
const
paddle
::
framework
::
DataFeedDesc
&
GetDataFeedDesc
()
=
0
;
// get readers, the reader num depend both on thread num
...
...
@@ -102,6 +105,9 @@ class DatasetImpl : public Dataset {
virtual
const
std
::
vector
<
std
::
string
>&
GetFileList
()
{
return
filelist_
;
}
virtual
int
GetThreadNum
()
{
return
thread_num_
;
}
virtual
int
GetTrainerNum
()
{
return
trainer_num_
;
}
virtual
std
::
pair
<
std
::
string
,
std
::
string
>
GetHdfsConfig
()
{
return
std
::
make_pair
(
fs_name_
,
fs_ugi_
);
}
virtual
const
paddle
::
framework
::
DataFeedDesc
&
GetDataFeedDesc
()
{
return
data_feed_desc_
;
}
...
...
@@ -128,6 +134,8 @@ class DatasetImpl : public Dataset {
std
::
vector
<
std
::
string
>
filelist_
;
size_t
file_idx_
;
std
::
mutex
mutex_for_pick_file_
;
std
::
string
fs_name_
;
std
::
string
fs_ugi_
;
};
// use std::vector<MultiSlotType> as data type
...
...
paddle/fluid/framework/executor_thread_worker.cc
浏览文件 @
e95cafd9
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/framework/executor_thread_worker.h"
#include <algorithm>
#include <utility>
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
...
...
paddle/fluid/framework/io/fs.cc
浏览文件 @
e95cafd9
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/io/fs.h"
#include <memory>
namespace
paddle
{
namespace
framework
{
...
...
paddle/fluid/framework/io/fs.h
浏览文件 @
e95cafd9
...
...
@@ -17,6 +17,7 @@
#include <stdio.h>
#include <string>
#include <vector>
#include <memory>
#include "glog/logging.h"
#include "paddle/fluid/framework/io/shell.h"
#include "paddle/fluid/string/string_helper.h"
...
...
paddle/fluid/pybind/data_set_py.cc
浏览文件 @
e95cafd9
...
...
@@ -52,6 +52,11 @@ void BindDataset(py::module* m) {
.
def
(
"set_trainer_num"
,
&
framework
::
Dataset
::
SetTrainerNum
)
.
def
(
"set_hdfs_config"
,
&
framework
::
Dataset
::
SetHdfsConfig
)
.
def
(
"set_data_feed_desc"
,
&
framework
::
Dataset
::
SetDataFeedDesc
)
.
def
(
"get_filelist"
,
&
framework
::
Dataset
::
GetFileList
)
.
def
(
"get_thread_num"
,
&
framework
::
Dataset
::
GetThreadNum
)
.
def
(
"get_trainer_num"
,
&
framework
::
Dataset
::
GetTrainerNum
)
.
def
(
"get_hdfs_config"
,
&
framework
::
Dataset
::
GetHdfsConfig
)
.
def
(
"get_data_feed_desc"
,
&
framework
::
Dataset
::
GetDataFeedDesc
)
.
def
(
"register_client2client_msg_handler"
,
&
framework
::
Dataset
::
RegisterClientToClientMsgHandler
)
.
def
(
"load_into_memory"
,
&
framework
::
Dataset
::
LoadIntoMemory
)
...
...
paddle/fluid/string/string_helper.h
浏览文件 @
e95cafd9
...
...
@@ -19,6 +19,7 @@
#include <cstring>
#include <string>
#include <vector>
#include <utility>
#include "boost/lexical_cast.hpp"
#include "glog/logging.h"
...
...
python/paddle/fluid/async_executor.py
浏览文件 @
e95cafd9
...
...
@@ -78,6 +78,12 @@ class AsyncExecutor(object):
"""
def
__init__
(
self
,
place
=
None
,
run_mode
=
""
):
"""
Init.
Args:
place(Place): CPUPlace or GPUPlace.
run_mode(str): default is empty string.
"""
if
place
is
None
:
place
=
core
.
CPUPlace
()
if
not
isinstance
(
place
,
core
.
CPUPlace
):
...
...
@@ -91,6 +97,18 @@ class AsyncExecutor(object):
self
.
instance
=
None
def
run
(
self
,
program
,
data_feed
,
filelist
,
thread_num
,
fetch
,
debug
=
False
):
"""
Run program by this AsyncExecutor.
Args:
program(Program): the program that need to run, if not provied,
then default_main_program will be used.
data_feed(DataFeedDesc): A DataFeedDesc object
filelist(str|list): a file or a list of files
thread_num(int): number of concurrent training threads.
fetch(str|list): the var name or a list of var names to inspect
debug(bool): When set to True, fetch vars will be printed to
standard output after each minibatch
"""
if
program
is
None
:
program
=
default_main_program
()
program_desc
=
program
.
desc
...
...
@@ -211,12 +229,12 @@ class AsyncExecutor(object):
"""
download_data is a default download method for distributed training
a user download data without this method
Example:
>>> exe = fluid.AsyncExecutor()
>>> exe.download_data("/xxx/xxx/xx/",
>>> "./data", "afs://
>>> xxx.xxx.xxx.xxx:9901", "xxx,yyy")
>>> "./data", "afs://
>>> xxx.xxx.xxx.xxx:9901", "xxx,yyy")
Args:
afs_path(str): afs_path defined by users
local_path(str): download data path
...
...
@@ -256,7 +274,7 @@ class AsyncExecutor(object):
def
config_distributed_nodes
(
self
):
"""
if a user needs to run distributed async executor
he or she needs to do a global configuration so that
he or she needs to do a global configuration so that
information of current process can be obtained
"""
self
.
instance
=
ps_instance
.
PaddlePSInstance
(
1
,
2
)
...
...
@@ -282,7 +300,7 @@ class AsyncExecutor(object):
"""
initialize server of current node if current process is a server
Args:
dist_desc(str): a protobuf string that describes
dist_desc(str): a protobuf string that describes
how to init a worker and a server
"""
if
self
.
instance
is
None
:
...
...
python/paddle/fluid/tests/unittests/test_dataset.py
0 → 100644
浏览文件 @
e95cafd9
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
paddle.fluid
as
fluid
import
numpy
as
np
import
os
import
shutil
import
unittest
class
TestDataset
(
unittest
.
TestCase
):
def
test_dataset_create
(
self
):
try
:
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
"InMemoryDataset"
)
except
:
self
.
assertTrue
(
False
)
try
:
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
"QueueDataset"
)
except
:
self
.
assertTrue
(
False
)
try
:
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
"MyOwnDataset"
)
self
.
assertTrue
(
False
)
except
:
self
.
assertTrue
(
True
)
def
test_dataset_config
(
self
):
dataset
=
fluid
.
core
.
Dataset
(
"MultiSlotDataset"
)
dataset
.
set_thread_num
(
12
)
dataset
.
set_filelist
([
"a.txt"
,
"b.txt"
,
"c.txt"
])
dataset
.
set_trainer_num
(
4
)
dataset
.
set_hdfs_config
(
"my_fs_name"
,
"my_fs_ugi"
)
thread_num
=
dataset
.
get_thread_num
()
self
.
assertEqual
(
thread_num
,
12
)
filelist
=
dataset
.
get_filelist
()
self
.
assertEqual
(
len
(
filelist
),
3
)
self
.
assertEqual
(
filelist
[
0
],
"a.txt"
)
self
.
assertEqual
(
filelist
[
1
],
"b.txt"
)
self
.
assertEqual
(
filelist
[
2
],
"c.txt"
)
trainer_num
=
dataset
.
get_trainer_num
()
self
.
assertEqual
(
trainer_num
,
4
)
name
,
ugi
=
dataset
.
get_hdfs_config
()
self
.
assertEqual
(
name
,
"my_fs_name"
)
self
.
assertEqual
(
ugi
,
"my_fs_ugi"
)
def
test_in_memory_dataset_run
(
self
):
with
open
(
"test_dataset_a.txt"
,
"w"
)
as
f
:
data
=
"1 1 2 3 3 4 5 5 5 5 1 1
\n
"
data
+=
"1 2 2 3 4 4 6 6 6 6 1 2
\n
"
data
+=
"1 3 2 3 5 4 7 7 7 7 1 3
\n
"
f
.
write
(
data
)
with
open
(
"test_dataset_b.txt"
,
"w"
)
as
f
:
data
=
"1 4 2 3 3 4 5 5 5 5 1 4
\n
"
data
+=
"1 5 2 3 4 4 6 6 6 6 1 5
\n
"
data
+=
"1 6 2 3 5 4 7 7 7 7 1 6
\n
"
data
+=
"1 7 2 3 6 4 8 8 8 8 1 7
\n
"
f
.
write
(
data
)
slots
=
[
"slot1"
,
"slot2"
,
"slot3"
,
"slot4"
]
slots_vars
=
[]
for
slot
in
slots
:
var
=
fluid
.
layers
.
data
(
name
=
slot
,
shape
=
[
1
],
dtype
=
"int64"
,
lod_level
=
1
)
slots_vars
.
append
(
var
)
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
"InMemoryDataset"
)
dataset
.
set_batch_size
(
32
)
dataset
.
set_thread
(
3
)
dataset
.
set_filelist
([
"test_dataset_a.txt"
,
"test_dataset_b.txt"
])
dataset
.
set_pipe_command
(
"cat"
)
dataset
.
set_use_var
(
slots_vars
)
dataset
.
load_into_memory
()
dataset
.
local_shuffle
()
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
exe
.
run
(
fluid
.
default_startup_program
())
for
i
in
range
(
2
):
try
:
exe
.
train_from_dataset
(
fluid
.
default_main_program
(),
dataset
)
except
:
self
.
assertTrue
(
False
)
os
.
remove
(
"./test_dataset_a.txt"
)
os
.
remove
(
"./test_dataset_b.txt"
)
def
test_queue_dataset_run
(
self
):
with
open
(
"test_dataset_a.txt"
,
"w"
)
as
f
:
data
=
"1 1 2 3 3 4 5 5 5 5 1 1
\n
"
data
+=
"1 2 2 3 4 4 6 6 6 6 1 2
\n
"
data
+=
"1 3 2 3 5 4 7 7 7 7 1 3
\n
"
f
.
write
(
data
)
with
open
(
"test_dataset_b.txt"
,
"w"
)
as
f
:
data
=
"1 4 2 3 3 4 5 5 5 5 1 4
\n
"
data
+=
"1 5 2 3 4 4 6 6 6 6 1 5
\n
"
data
+=
"1 6 2 3 5 4 7 7 7 7 1 6
\n
"
data
+=
"1 7 2 3 6 4 8 8 8 8 1 7
\n
"
f
.
write
(
data
)
slots
=
[
"slot1"
,
"slot2"
,
"slot3"
,
"slot4"
]
slots_vars
=
[]
for
slot
in
slots
:
var
=
fluid
.
layers
.
data
(
name
=
slot
,
shape
=
[
1
],
dtype
=
"int64"
,
lod_level
=
1
)
slots_vars
.
append
(
var
)
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
"QueueDataset"
)
dataset
.
set_batch_size
(
32
)
dataset
.
set_thread
(
3
)
dataset
.
set_filelist
([
"test_dataset_a.txt"
,
"test_dataset_b.txt"
])
dataset
.
set_pipe_command
(
"cat"
)
dataset
.
set_use_var
(
slots_vars
)
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
exe
.
run
(
fluid
.
default_startup_program
())
for
i
in
range
(
2
):
try
:
exe
.
train_from_dataset
(
fluid
.
default_main_program
(),
dataset
)
except
:
self
.
assertTrue
(
False
)
os
.
remove
(
"./test_dataset_a.txt"
)
os
.
remove
(
"./test_dataset_b.txt"
)
if
__name__
==
'__main__'
:
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录