Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
a05a948d
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
a05a948d
编写于
12月 02, 2018
作者:
Q
Qiao Longfei
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
update readthread
上级
2cd25794
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
77 addition
and
44 deletion
+77
-44
paddle/fluid/operators/reader/create_ctr_reader_op.cc
paddle/fluid/operators/reader/create_ctr_reader_op.cc
+16
-8
paddle/fluid/operators/reader/ctr_reader.cc
paddle/fluid/operators/reader/ctr_reader.cc
+31
-22
paddle/fluid/operators/reader/ctr_reader.h
paddle/fluid/operators/reader/ctr_reader.h
+21
-8
paddle/fluid/operators/reader/ctr_reader_test.cc
paddle/fluid/operators/reader/ctr_reader_test.cc
+9
-6
未找到文件。
paddle/fluid/operators/reader/create_ctr_reader_op.cc
浏览文件 @
a05a948d
...
...
@@ -41,13 +41,16 @@ class CreateCTRReaderOp : public framework::OperatorBase {
auto
*
queue_holder
=
queue_holder_var
->
template
GetMutable
<
LoDTensorBlockingQueueHolder
>();
int
thread_num
=
Attr
<
int
>
(
"thread_num"
);
std
::
vector
<
std
::
string
>
slots
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"slots"
);
int
batch_size
=
Attr
<
int
>
(
"batch_size"
);
std
::
vector
<
std
::
string
>
file_list
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"file_list"
);
out
->
Reset
(
std
::
make_shared
<
CTRReader
>
(
queue_holder
->
GetQueue
(),
batch_size
,
thread_num
,
slots
,
file_list
));
auto
thread_num
=
Attr
<
int
>
(
"thread_num"
);
auto
sparse_slots
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"sparse_slots"
);
auto
dense_slots
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"dense_slots"
);
auto
batch_size
=
Attr
<
int
>
(
"batch_size"
);
auto
file_type
=
Attr
<
std
::
string
>
(
"file_type"
);
auto
file_format
=
Attr
<
std
::
string
>
(
"file_format"
);
auto
file_list
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"file_list"
);
out
->
Reset
(
std
::
make_shared
<
CTRReader
>
(
queue_holder
->
GetQueue
(),
batch_size
,
thread_num
,
file_type
,
file_format
,
dense_slots
,
sparse_slots
,
file_list
));
}
};
...
...
@@ -58,10 +61,15 @@ class CreateCTRReaderOpMaker : public FileReaderMakerBase {
"Name of the `LoDTensorBlockingQueueHolder` variable"
);
AddAttr
<
int
>
(
"thread_num"
,
"the thread num to read data"
);
AddAttr
<
int
>
(
"batch_size"
,
"the batch size of read data"
);
AddAttr
<
std
::
string
>
(
"file_type"
,
"plain or gzip"
).
SetDefault
(
"plain"
);
AddAttr
<
std
::
string
>
(
"file_format"
,
"svm or csv"
).
SetDefault
(
"csv"
);
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"file_list"
,
"The list of files that need to read"
);
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"slots"
,
"the slots that should be extract from file"
);
"dense_slots"
,
"the sparse slots id that should be extract from file"
)
.
SetDefault
({});
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"sparse_slots"
,
"the sparse slots id that should be extract from file"
);
AddComment
(
R"DOC(
Create CTRReader to support read ctr data with cpp.
...
...
paddle/fluid/operators/reader/ctr_reader.cc
浏览文件 @
a05a948d
...
...
@@ -141,40 +141,42 @@ class MultiFileReader : public Reader {
void
MonitorThread
(
std
::
vector
<
ReaderThreadStatus
>*
thread_status
,
std
::
shared_ptr
<
LoDTensorBlockingQueue
>
queue
)
{
VLOG
(
3
0
)
<<
"monitor thread in"
;
VLOG
(
3
)
<<
"monitor thread in"
;
bool
reader_thread_is_running
=
true
;
while
(
reader_thread_is_running
)
{
VLOG
(
3
0
)
<<
"reader_thread_is_running"
;
VLOG
(
3
)
<<
"reader_thread_is_running"
;
reader_thread_is_running
=
false
;
for
(
size_t
i
=
0
;
i
<
(
*
thread_status
).
size
();
++
i
)
{
if
((
*
thread_status
)[
i
]
==
Running
)
{
VLOG
(
3
0
)
<<
"reader is running!"
;
VLOG
(
3
)
<<
"reader is running!"
;
reader_thread_is_running
=
true
;
}
}
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
milliseconds
(
1000
));
}
VLOG
(
3
0
)
<<
"all reader thread is stopped, push empty data into queue"
;
VLOG
(
3
)
<<
"all reader thread is stopped, push empty data into queue"
;
queue
->
Push
({});
VLOG
(
3
0
)
<<
"monitor thread exited"
;
VLOG
(
3
)
<<
"monitor thread exited"
;
}
void
ReadThread
(
const
std
::
vector
<
std
::
string
>&
file_list
,
const
std
::
vector
<
std
::
string
>&
slots
,
int
batch_size
,
const
std
::
string
&
file_type
,
const
std
::
string
&
file_format
,
const
std
::
vector
<
std
::
string
>&
dense_slots
,
const
std
::
vector
<
std
::
string
>&
sparse_slots
,
int
batch_size
,
int
thread_id
,
std
::
vector
<
ReaderThreadStatus
>*
thread_status
,
std
::
shared_ptr
<
LoDTensorBlockingQueue
>
queue
)
{
VLOG
(
3
0
)
<<
"["
<<
thread_id
<<
"]"
<<
" reader thread start! thread_id = "
<<
thread_id
;
VLOG
(
3
)
<<
"["
<<
thread_id
<<
"]"
<<
" reader thread start! thread_id = "
<<
thread_id
;
for
(
auto
&
file
:
file_list
)
{
VLOG
(
3
0
)
<<
"["
<<
thread_id
<<
"]"
<<
" file "
<<
file
;
VLOG
(
3
)
<<
"["
<<
thread_id
<<
"]"
<<
" file "
<<
file
;
}
(
*
thread_status
)[
thread_id
]
=
Running
;
VLOG
(
3
0
)
<<
"set status to running"
;
VLOG
(
3
)
<<
"set status to running"
;
std
::
unordered_map
<
std
::
string
,
size_t
>
slot_to_index
;
for
(
size_t
i
=
0
;
i
<
slots
.
size
();
++
i
)
{
slot_to_index
[
slots
[
i
]]
=
i
;
for
(
size_t
i
=
0
;
i
<
s
parse_s
lots
.
size
();
++
i
)
{
slot_to_index
[
s
parse_s
lots
[
i
]]
=
i
;
}
std
::
string
line
;
...
...
@@ -182,11 +184,18 @@ void ReadThread(const std::vector<std::string>& file_list,
std
::
vector
<
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
int64_t
>>>
batch_data
;
std
::
vector
<
int64_t
>
batch_label
;
MultiFileReader
<
GzipReader
>
reader
(
file_list
);
std
::
unique_ptr
<
Reader
>
reader
;
if
(
file_type
==
"gzip"
)
{
reader
.
reset
(
new
MultiFileReader
<
GzipReader
>
(
file_list
));
}
else
if
(
file_type
==
"plain"
)
{
reader
.
reset
(
new
MultiFileReader
<
PlainFileReader
>
(
file_list
));
}
else
{
PADDLE_THROW
(
"do not support file format %s"
,
file_type
);
}
VLOG
(
3
0
)
<<
"reader inited"
;
VLOG
(
3
)
<<
"reader inited"
;
while
(
reader
.
HasNext
())
{
while
(
reader
->
HasNext
())
{
batch_data
.
clear
();
batch_data
.
reserve
(
batch_size
);
...
...
@@ -195,8 +204,8 @@ void ReadThread(const std::vector<std::string>& file_list,
// read batch_size data
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
if
(
reader
.
HasNext
())
{
reader
.
NextLine
(
&
line
);
if
(
reader
->
HasNext
())
{
reader
->
NextLine
(
&
line
);
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
int64_t
>>
slot_to_data
;
int64_t
label
;
parse_line
(
line
,
slot_to_index
,
&
label
,
&
slot_to_data
);
...
...
@@ -209,8 +218,8 @@ void ReadThread(const std::vector<std::string>& file_list,
std
::
vector
<
framework
::
LoDTensor
>
lod_datas
;
// first insert tensor for each slots
for
(
auto
&
slot
:
slots
)
{
// first insert tensor for each s
parse_s
lots
for
(
auto
&
slot
:
s
parse_s
lots
)
{
std
::
vector
<
size_t
>
lod_data
{
0
};
std
::
vector
<
int64_t
>
batch_feasign
;
...
...
@@ -242,11 +251,11 @@ void ReadThread(const std::vector<std::string>& file_list,
lod_datas
.
push_back
(
label_tensor
);
queue
->
Push
(
lod_datas
);
VLOG
(
4
0
)
<<
"push one data, queue_size="
<<
queue
->
Size
();
VLOG
(
4
)
<<
"push one data, queue_size="
<<
queue
->
Size
();
}
(
*
thread_status
)[
thread_id
]
=
Stopped
;
VLOG
(
3
0
)
<<
"set status to stopped, thread "
<<
thread_id
<<
" exited"
;
VLOG
(
3
)
<<
"set status to stopped, thread "
<<
thread_id
<<
" exited"
;
}
}
// namespace reader
...
...
paddle/fluid/operators/reader/ctr_reader.h
浏览文件 @
a05a948d
...
...
@@ -36,7 +36,9 @@ namespace reader {
enum
ReaderThreadStatus
{
Running
,
Stopped
};
void
ReadThread
(
const
std
::
vector
<
std
::
string
>&
file_list
,
const
std
::
vector
<
std
::
string
>&
slots
,
int
batch_size
,
const
std
::
string
&
file_type
,
const
std
::
string
&
file_format
,
const
std
::
vector
<
std
::
string
>&
dense_slots
,
const
std
::
vector
<
std
::
string
>&
sparse_slots
,
int
batch_size
,
int
thread_id
,
std
::
vector
<
ReaderThreadStatus
>*
thread_status
,
std
::
shared_ptr
<
LoDTensorBlockingQueue
>
queue
);
...
...
@@ -47,11 +49,18 @@ void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
class
CTRReader
:
public
framework
::
FileReader
{
public:
explicit
CTRReader
(
const
std
::
shared_ptr
<
LoDTensorBlockingQueue
>&
queue
,
int
batch_size
,
int
thread_num
,
const
std
::
vector
<
std
::
string
>&
slots
,
const
std
::
vector
<
std
::
string
>&
file_list
)
:
batch_size_
(
batch_size
),
slots_
(
slots
),
file_list_
(
file_list
)
{
CTRReader
(
const
std
::
shared_ptr
<
LoDTensorBlockingQueue
>&
queue
,
int
batch_size
,
int
thread_num
,
const
std
::
string
&
file_type
,
const
std
::
string
&
file_format
,
const
std
::
vector
<
std
::
string
>&
dense_slots
,
const
std
::
vector
<
std
::
string
>&
sparse_slots
,
const
std
::
vector
<
std
::
string
>&
file_list
)
:
batch_size_
(
batch_size
),
file_type_
(
file_type
),
file_format_
(
file_format
),
dense_slots_
(
dense_slots
),
sparse_slots_
(
sparse_slots
),
file_list_
(
file_list
)
{
PADDLE_ENFORCE_GT
(
thread_num
,
0
,
"thread num should be larger then 0!"
);
PADDLE_ENFORCE
(
queue
!=
nullptr
,
"LoDTensorBlockingQueue must not be null"
);
PADDLE_ENFORCE_GT
(
file_list
.
size
(),
0
,
"file list should not be empty"
);
...
...
@@ -97,7 +106,8 @@ class CTRReader : public framework::FileReader {
VLOG
(
3
)
<<
"thread_num "
<<
thread_num_
;
for
(
int
thread_id
=
0
;
thread_id
<
thread_num_
;
thread_id
++
)
{
read_threads_
.
emplace_back
(
new
std
::
thread
(
std
::
bind
(
&
ReadThread
,
file_groups_
[
thread_id
],
slots_
,
batch_size_
,
std
::
bind
(
&
ReadThread
,
file_groups_
[
thread_id
],
file_type_
,
file_format_
,
dense_slots_
,
sparse_slots_
,
batch_size_
,
thread_id
,
&
read_thread_status_
,
queue_
)));
}
monitor_thread_
.
reset
(
new
std
::
thread
(
...
...
@@ -119,7 +129,10 @@ class CTRReader : public framework::FileReader {
private:
size_t
thread_num_
;
const
int
batch_size_
;
const
std
::
vector
<
std
::
string
>
slots_
;
const
std
::
string
file_type_
;
const
std
::
string
file_format_
;
const
std
::
vector
<
std
::
string
>
dense_slots_
;
const
std
::
vector
<
std
::
string
>
sparse_slots_
;
const
std
::
vector
<
std
::
string
>
file_list_
;
std
::
shared_ptr
<
LoDTensorBlockingQueue
>
queue_
;
std
::
vector
<
std
::
unique_ptr
<
std
::
thread
>>
read_threads_
;
...
...
paddle/fluid/operators/reader/ctr_reader_test.cc
浏览文件 @
a05a948d
...
...
@@ -132,24 +132,27 @@ TEST(CTR_READER, read_data) {
int
batch_size
=
3
;
int
thread_num
=
1
;
std
::
vector
<
std
::
string
>
slots
=
{
"6002"
,
"6003"
};
std
::
vector
<
std
::
string
>
s
parse_s
lots
=
{
"6002"
,
"6003"
};
std
::
vector
<
std
::
string
>
file_list
;
for
(
int
i
=
0
;
i
<
thread_num
;
++
i
)
{
file_list
.
push_back
(
gz_file_name
);
}
CTRReader
reader
(
queue
,
batch_size
,
thread_num
,
slots
,
file_list
);
CTRReader
reader
(
queue
,
batch_size
,
thread_num
,
"gzip"
,
"plain"
,
{},
sparse_slots
,
file_list
);
reader
.
Start
();
size_t
batch_num
=
std
::
ceil
(
static_cast
<
float
>
(
ctr_data
.
size
())
/
batch_size
)
*
thread_num
;
check_all_data
(
ctr_data
,
slots
,
label_dims
,
label_value
,
data_slot_6002
,
data_slot_6003
,
batch_num
,
batch_size
,
queue
,
&
reader
);
check_all_data
(
ctr_data
,
sparse_slots
,
label_dims
,
label_value
,
data_slot_6002
,
data_slot_6003
,
batch_num
,
batch_size
,
queue
,
&
reader
);
reader
.
Shutdown
();
reader
.
Start
();
check_all_data
(
ctr_data
,
slots
,
label_dims
,
label_value
,
data_slot_6002
,
data_slot_6003
,
batch_num
,
batch_size
,
queue
,
&
reader
);
check_all_data
(
ctr_data
,
sparse_slots
,
label_dims
,
label_value
,
data_slot_6002
,
data_slot_6003
,
batch_num
,
batch_size
,
queue
,
&
reader
);
reader
.
Shutdown
();
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录