Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
b66f0074
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b66f0074
编写于
3月 10, 2019
作者:
D
dongdaxiang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix data reading bugs in api, add VLOG(3) log for setup
上级
71aa307e
变更
9
显示空白变更内容
内联
并排
Showing
9 changed file
with
46 addition
and
9 deletion
+46
-9
paddle/fluid/framework/data_feed.cc
paddle/fluid/framework/data_feed.cc
+4
-0
paddle/fluid/framework/data_feed_factory.cc
paddle/fluid/framework/data_feed_factory.cc
+3
-0
paddle/fluid/framework/data_set.cc
paddle/fluid/framework/data_set.cc
+11
-3
paddle/fluid/framework/dist_multi_trainer.cc
paddle/fluid/framework/dist_multi_trainer.cc
+3
-2
paddle/fluid/framework/executor.cc
paddle/fluid/framework/executor.cc
+2
-1
paddle/fluid/framework/hogwild_worker.cc
paddle/fluid/framework/hogwild_worker.cc
+1
-0
paddle/fluid/framework/multi_trainer.cc
paddle/fluid/framework/multi_trainer.cc
+5
-0
python/paddle/fluid/dataset.py
python/paddle/fluid/dataset.py
+7
-2
python/paddle/fluid/executor.py
python/paddle/fluid/executor.py
+10
-1
未找到文件。
paddle/fluid/framework/data_feed.cc
浏览文件 @
b66f0074
...
...
@@ -44,10 +44,14 @@ void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
bool
DataFeed
::
SetFileList
(
const
std
::
vector
<
std
::
string
>&
files
)
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_for_pick_file_
);
CheckInit
();
// Do not set finish_set_filelist_ flag,
// since a user may set file many times after init reader
/*
if (finish_set_filelist_) {
VLOG(3) << "info: you have set the filelist.";
return false;
}
*/
PADDLE_ENFORCE
(
files
.
size
(),
"You have set an empty filelist."
);
filelist_
.
assign
(
files
.
begin
(),
files
.
end
());
file_idx_
=
0
;
...
...
paddle/fluid/framework/data_feed_factory.cc
浏览文件 @
b66f0074
...
...
@@ -54,6 +54,9 @@ std::string DataFeedFactory::DataFeedTypeList() {
std
::
shared_ptr
<
DataFeed
>
DataFeedFactory
::
CreateDataFeed
(
std
::
string
data_feed_class
)
{
if
(
g_data_feed_map
.
count
(
data_feed_class
)
<
1
)
{
LOG
(
WARNING
)
<<
"Your DataFeed "
<<
data_feed_class
<<
"is not supported currently"
;
LOG
(
WARNING
)
<<
"Supported DataFeed: "
<<
DataFeedTypeList
();
exit
(
-
1
);
}
return
g_data_feed_map
[
data_feed_class
]();
...
...
paddle/fluid/framework/data_set.cc
浏览文件 @
b66f0074
...
...
@@ -12,10 +12,10 @@
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/framework/data_set.h"
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/data_feed_factory.h"
namespace
paddle
{
...
...
@@ -24,6 +24,7 @@ namespace framework {
Dataset
::
Dataset
()
{
thread_num_
=
1
;
}
void
Dataset
::
SetFileList
(
const
std
::
vector
<
std
::
string
>&
filelist
)
{
VLOG
(
3
)
<<
"filelist size: "
<<
filelist
.
size
();
filelist_
=
filelist
;
int
file_cnt
=
filelist_
.
size
();
if
(
thread_num_
>
file_cnt
)
{
...
...
@@ -34,6 +35,8 @@ void Dataset::SetFileList(const std::vector<std::string>& filelist) {
}
}
// buggy here, a user should set filelist first before this function
// not user friendly
void
Dataset
::
SetThreadNum
(
int
thread_num
)
{
int
file_cnt
=
filelist_
.
size
();
if
(
file_cnt
!=
0
&&
thread_num
>
file_cnt
)
{
...
...
@@ -48,8 +51,8 @@ void Dataset::SetThreadNum(int thread_num) {
void
Dataset
::
SetTrainerNum
(
int
trainer_num
)
{
trainer_num_
=
trainer_num
;
}
void
Dataset
::
SetDataFeedDesc
(
const
std
::
string
&
data_feed_desc_str
)
{
google
::
protobuf
::
TextFormat
::
ParseFromString
(
data_feed_desc_str
,
&
data_feed_desc_
);
google
::
protobuf
::
TextFormat
::
ParseFromString
(
data_feed_desc_str
,
&
data_feed_desc_
);
}
const
std
::
vector
<
std
::
shared_ptr
<
paddle
::
framework
::
DataFeed
>>&
...
...
@@ -107,14 +110,19 @@ void Dataset::GlobalShuffle() {
}
void
Dataset
::
CreateReaders
()
{
VLOG
(
3
)
<<
"Calling CreateReaders()"
;
CHECK
(
thread_num_
>
0
)
<<
"thread_num should > 0"
;
VLOG
(
3
)
<<
"thread_num in Readers: "
<<
thread_num_
;
VLOG
(
3
)
<<
"readers size: "
<<
readers_
.
size
();
if
(
readers_
.
size
()
!=
0
)
{
return
;
}
VLOG
(
3
)
<<
"data feed class name: "
<<
data_feed_desc_
.
name
();
for
(
int64_t
i
=
0
;
i
<
thread_num_
;
++
i
)
{
readers_
.
push_back
(
DataFeedFactory
::
CreateDataFeed
(
data_feed_desc_
.
name
()));
readers_
.
back
()
->
Init
(
data_feed_desc_
);
}
VLOG
(
3
)
<<
"Filelist size in readers: "
<<
filelist_
.
size
();
readers_
[
0
]
->
SetFileList
(
filelist_
);
}
...
...
paddle/fluid/framework/dist_multi_trainer.cc
浏览文件 @
b66f0074
...
...
@@ -23,12 +23,13 @@ namespace paddle {
namespace
framework
{
void
DistMultiTrainer
::
Initialize
(
const
TrainerDesc
&
trainer_desc
,
Dataset
*
data
_
set
)
{
Dataset
*
dataset
)
{
thread_num_
=
trainer_desc
.
thread_num
();
workers_
.
resize
(
thread_num_
);
dataset
->
CreateReaders
();
const
std
::
vector
<
std
::
shared_ptr
<
paddle
::
framework
::
DataFeed
>>
readers
=
data
_
set
->
GetReaders
();
dataset
->
GetReaders
();
for
(
int
i
=
0
;
i
<
thread_num_
;
++
i
)
{
workers_
[
i
]
=
DeviceWorkerFactory
::
CreateDeviceWorker
(
...
...
paddle/fluid/framework/executor.cc
浏览文件 @
b66f0074
...
...
@@ -14,8 +14,9 @@ limitations under the License. */
#include "paddle/fluid/framework/executor.h"
#include <deque>
#include <
unordered_set
>
#include <
memory
>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
...
...
paddle/fluid/framework/hogwild_worker.cc
浏览文件 @
b66f0074
...
...
@@ -90,6 +90,7 @@ void HogwildWorker::TrainFilesWithProfiler() {
int
batch_cnt
=
0
;
timeline
.
Start
();
while
((
cur_batch
=
device_reader_
->
Next
())
>
0
)
{
LOG
(
WARNING
)
<<
"read a batch in thread "
<<
thread_id_
;
timeline
.
Pause
();
read_time
+=
timeline
.
ElapsedSec
();
total_time
+=
timeline
.
ElapsedSec
();
...
...
paddle/fluid/framework/multi_trainer.cc
浏览文件 @
b66f0074
...
...
@@ -26,8 +26,12 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,
thread_num_
=
trainer_desc
.
thread_num
();
// get filelist from trainer_desc here
workers_
.
resize
(
thread_num_
);
VLOG
(
3
)
<<
"worker thread num: "
<<
thread_num_
;
dataset
->
CreateReaders
();
VLOG
(
3
)
<<
"readers created"
;
const
std
::
vector
<
std
::
shared_ptr
<
paddle
::
framework
::
DataFeed
>>
readers
=
dataset
->
GetReaders
();
VLOG
(
3
)
<<
"readers num: "
<<
readers
.
size
();
for
(
int
i
=
0
;
i
<
thread_num_
;
++
i
)
{
workers_
[
i
]
=
DeviceWorkerFactory
::
CreateDeviceWorker
(
trainer_desc
.
device_worker_name
());
...
...
@@ -50,6 +54,7 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program,
}
void
MultiTrainer
::
Run
()
{
VLOG
(
3
)
<<
"Going to run"
;
for
(
int
thidx
=
0
;
thidx
<
thread_num_
;
++
thidx
)
{
threads_
.
push_back
(
std
::
thread
(
&
DeviceWorker
::
TrainFiles
,
workers_
[
thidx
].
get
()));
...
...
python/paddle/fluid/dataset.py
浏览文件 @
b66f0074
...
...
@@ -22,7 +22,7 @@ class DatasetFactory(object):
def
__init__
(
self
):
pass
def
create_dataset
(
self
,
datafeed_class
):
def
create_dataset
(
self
,
datafeed_class
=
"QueueDataset"
):
try
:
dataset
=
globals
()[
datafeed_class
]()
return
dataset
...
...
@@ -38,6 +38,7 @@ class DatasetBase(object):
self
.
proto_desc
=
data_feed_pb2
.
DataFeedDesc
()
self
.
proto_desc
.
pipe_command
=
"cat"
self
.
dataset
=
core
.
Dataset
()
self
.
thread_num
=
0
def
set_pipe_command
(
self
,
pipe_command
):
"""
...
...
@@ -63,6 +64,7 @@ class DatasetBase(object):
def
set_thread
(
self
,
thread_num
):
self
.
dataset
.
set_thread_num
(
thread_num
)
self
.
thread_num
=
thread_num
def
set_filelist
(
self
,
filelist
):
self
.
dataset
.
set_filelist
(
filelist
)
...
...
@@ -84,6 +86,9 @@ class DatasetBase(object):
"Currently, fluid.dataset only supports dtype=float32 and dtype=int64"
)
def
_prepare_to_run
(
self
):
self
.
dataset
.
set_data_feed_desc
(
self
.
desc
())
def
desc
(
self
):
"""
Returns a protobuf message for this DataFeedDesc
...
...
@@ -104,7 +109,7 @@ class InMemoryDataset(DatasetBase):
self
.
proto_desc
.
name
=
"MultiSlotInMemoryDataFeed"
def
load_into_memory
(
self
):
self
.
dataset
.
set_data_feed_desc
(
self
.
desc
()
)
_prepare_to_run
(
)
self
.
dataset
.
load_into_memory
()
def
local_shuffle
(
self
):
...
...
python/paddle/fluid/executor.py
浏览文件 @
b66f0074
...
...
@@ -23,6 +23,7 @@ from .framework import Program, default_main_program, Variable
from
.
import
core
from
.
import
compiler
from
..
import
compat
as
cpt
from
.trainer_factory
import
TrainerFactory
__all__
=
[
'Executor'
,
'global_scope'
,
'scope_guard'
]
...
...
@@ -616,6 +617,7 @@ class Executor(object):
dataset
=
None
,
fetch_list
=
None
,
scope
=
None
,
thread
=
0
,
opt_info
=
None
):
if
scope
is
None
:
scope
=
global_scope
()
...
...
@@ -624,7 +626,14 @@ class Executor(object):
compiled
=
isinstance
(
program
,
compiler
.
CompiledProgram
)
if
not
compiled
:
trainer
=
TrainerFactory
().
create_trainer
(
opt_info
)
self
.
_default_executor
.
run_from_dataset
(
program_desc
,
if
thread
<=
0
:
trainer
.
set_thread
(
dataset
.
thread_num
)
else
:
trainer
.
set_thread
(
thread
)
dataset
.
_prepare_to_run
()
print
(
"run_from_dataset called"
)
self
.
_default_executor
.
run_from_dataset
(
program
.
desc
,
scope
,
dataset
.
dataset
,
trainer
.
_desc
())
else
:
# For compiled program, more runtime should be implemented
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录