Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
c5855506
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
大约 1 年 前同步成功
通知
695
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
c5855506
编写于
1月 25, 2019
作者:
乔
乔龙飞 Qiao Longfei
提交者:
GitHub
1月 25, 2019
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #14731 from jacquesqiao/optimize-cpp-reader
Optimize cpp reader
上级
d54494ba
119a3d4d
变更
15
隐藏空白更改
内联
并排
Showing
15 changed file
with
533 addition
and
140 deletion
+533
-140
paddle/fluid/API.spec
paddle/fluid/API.spec
+1
-0
paddle/fluid/operators/reader/create_ctr_reader_op.cc
paddle/fluid/operators/reader/create_ctr_reader_op.cc
+27
-9
paddle/fluid/operators/reader/ctr_reader.cc
paddle/fluid/operators/reader/ctr_reader.cc
+199
-39
paddle/fluid/operators/reader/ctr_reader.h
paddle/fluid/operators/reader/ctr_reader.h
+73
-18
paddle/fluid/operators/reader/ctr_reader_test.cc
paddle/fluid/operators/reader/ctr_reader_test.cc
+81
-7
paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
+4
-8
paddle/fluid/operators/reader/read_op.cc
paddle/fluid/operators/reader/read_op.cc
+24
-16
paddle/fluid/operators/reader/reader_op_registry.cc
paddle/fluid/operators/reader/reader_op_registry.cc
+21
-13
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+7
-13
python/paddle/fluid/contrib/__init__.py
python/paddle/fluid/contrib/__init__.py
+3
-0
python/paddle/fluid/contrib/reader/README.md
python/paddle/fluid/contrib/reader/README.md
+15
-0
python/paddle/fluid/contrib/reader/__init__.py
python/paddle/fluid/contrib/reader/__init__.py
+19
-0
python/paddle/fluid/contrib/reader/ctr_reader.py
python/paddle/fluid/contrib/reader/ctr_reader.py
+57
-16
python/paddle/fluid/layers/io.py
python/paddle/fluid/layers/io.py
+1
-1
python/setup.py.in
python/setup.py.in
+1
-0
未找到文件。
paddle/fluid/API.spec
浏览文件 @
c5855506
...
...
@@ -359,6 +359,7 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None)
...
...
paddle/fluid/operators/reader/create_ctr_reader_op.cc
浏览文件 @
c5855506
...
...
@@ -41,13 +41,19 @@ class CreateCTRReaderOp : public framework::OperatorBase {
auto
*
queue_holder
=
queue_holder_var
->
template
GetMutable
<
LoDTensorBlockingQueueHolder
>();
int
thread_num
=
Attr
<
int
>
(
"thread_num"
);
std
::
vector
<
std
::
string
>
slots
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"slots"
);
int
batch_size
=
Attr
<
int
>
(
"batch_size"
);
std
::
vector
<
std
::
string
>
file_list
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"file_list"
);
out
->
Reset
(
std
::
make_shared
<
CTRReader
>
(
queue_holder
->
GetQueue
(),
batch_size
,
thread_num
,
slots
,
file_list
));
auto
thread_num
=
Attr
<
int
>
(
"thread_num"
);
auto
sparse_slots
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"sparse_slots"
);
auto
dense_slot_index
=
Attr
<
std
::
vector
<
int
>>
(
"dense_slot_index"
);
auto
sparse_slot_index
=
Attr
<
std
::
vector
<
int
>>
(
"sparse_slot_index"
);
auto
batch_size
=
Attr
<
int
>
(
"batch_size"
);
auto
file_type
=
Attr
<
std
::
string
>
(
"file_type"
);
auto
file_format
=
Attr
<
std
::
string
>
(
"file_format"
);
auto
file_list
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"file_list"
);
DataDesc
data_desc
(
batch_size
,
file_list
,
file_type
,
file_format
,
dense_slot_index
,
sparse_slot_index
,
sparse_slots
);
VLOG
(
1
)
<<
data_desc
;
out
->
Reset
(
std
::
make_shared
<
CTRReader
>
(
queue_holder
->
GetQueue
(),
thread_num
,
data_desc
));
}
};
...
...
@@ -58,10 +64,22 @@ class CreateCTRReaderOpMaker : public FileReaderMakerBase {
"Name of the `LoDTensorBlockingQueueHolder` variable"
);
AddAttr
<
int
>
(
"thread_num"
,
"the thread num to read data"
);
AddAttr
<
int
>
(
"batch_size"
,
"the batch size of read data"
);
AddAttr
<
std
::
string
>
(
"file_type"
,
"plain or gzip"
).
SetDefault
(
"plain"
);
AddAttr
<
std
::
string
>
(
"file_format"
,
"svm or csv"
).
SetDefault
(
"csv"
);
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"file_list"
,
"The list of files that need to read"
);
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"slots"
,
"the slots that should be extract from file"
);
AddAttr
<
std
::
vector
<
int
>>
(
"dense_slot_index"
,
"the dense slots id that should be extract from file"
)
.
SetDefault
({});
AddAttr
<
std
::
vector
<
int
>>
(
"sparse_slot_index"
,
"the sparse slots id that should be extract from file"
)
.
SetDefault
({});
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"sparse_slots"
,
"the sparse slots id that should be "
"extract from file, used when file "
"format is svm"
);
AddComment
(
R"DOC(
Create CTRReader to support read ctr data with cpp.
...
...
paddle/fluid/operators/reader/ctr_reader.cc
浏览文件 @
c5855506
...
...
@@ -73,6 +73,9 @@ static inline void parse_line(
}
}
// Placeholder for parsing one svm-format line, which is laid out as:
//   label slot1:fea_sign slot2:fea_sign slot1:fea_sign
// The body is empty, so calling this function currently has no effect and
// `line` is ignored.
static inline void parse_svm_line(const std::string& line) {}
class
Reader
{
public:
virtual
~
Reader
()
{}
...
...
@@ -95,11 +98,27 @@ class GzipReader : public Reader {
igzstream
gzstream_
;
};
class
MultiGzip
Reader
:
public
Reader
{
class
PlainFile
Reader
:
public
Reader
{
public:
explicit
MultiGzipReader
(
const
std
::
vector
<
std
::
string
>&
file_list
)
{
explicit
PlainFileReader
(
const
std
::
string
&
file_name
)
:
stream_
(
file_name
.
c_str
())
{}
~
PlainFileReader
()
{}
bool
HasNext
()
override
{
return
stream_
.
peek
()
!=
EOF
;
}
void
NextLine
(
std
::
string
*
line
)
override
{
std
::
getline
(
stream_
,
*
line
);
}
private:
std
::
ifstream
stream_
;
};
template
<
typename
SingleFileReader
>
class
MultiFileReader
:
public
Reader
{
public:
explicit
MultiFileReader
(
const
std
::
vector
<
std
::
string
>&
file_list
)
{
for
(
auto
&
file
:
file_list
)
{
readers_
.
emplace_back
(
std
::
make_shared
<
Gzip
Reader
>
(
file
));
readers_
.
emplace_back
(
std
::
make_shared
<
SingleFile
Reader
>
(
file
));
}
}
...
...
@@ -119,46 +138,35 @@ class MultiGzipReader : public Reader {
}
private:
std
::
vector
<
std
::
shared_ptr
<
Gzip
Reader
>>
readers_
;
std
::
vector
<
std
::
shared_ptr
<
SingleFile
Reader
>>
readers_
;
size_t
current_reader_index_
=
0
;
};
void
MonitorThread
(
std
::
vector
<
ReaderThreadStatus
>*
thread_status
,
std
::
shared_ptr
<
LoDTensorBlockingQueue
>
queue
)
{
VLOG
(
3
0
)
<<
"monitor thread in"
;
VLOG
(
3
)
<<
"monitor thread in"
;
bool
reader_thread_is_running
=
true
;
while
(
reader_thread_is_running
)
{
VLOG
(
3
0
)
<<
"reader_thread_is_running"
;
VLOG
(
3
)
<<
"reader_thread_is_running"
;
reader_thread_is_running
=
false
;
for
(
size_t
i
=
0
;
i
<
(
*
thread_status
).
size
();
++
i
)
{
if
((
*
thread_status
)[
i
]
==
Running
)
{
VLOG
(
3
0
)
<<
"reader is running!"
;
VLOG
(
3
)
<<
"reader is running!"
;
reader_thread_is_running
=
true
;
}
}
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
milliseconds
(
1000
));
}
VLOG
(
3
0
)
<<
"all reader thread is stopped, push empty data into
queue"
;
queue
->
Push
({}
);
VLOG
(
3
0
)
<<
"monitor thread exited"
;
VLOG
(
3
)
<<
"all reader thread is stopped, close the
queue"
;
queue
->
Close
(
);
VLOG
(
3
)
<<
"monitor thread exited"
;
}
void
ReadThread
(
const
std
::
vector
<
std
::
string
>&
file_list
,
const
std
::
vector
<
std
::
string
>&
slots
,
int
batch_size
,
int
thread_id
,
std
::
vector
<
ReaderThreadStatus
>*
thread_status
,
std
::
shared_ptr
<
LoDTensorBlockingQueue
>
queue
)
{
VLOG
(
30
)
<<
"["
<<
thread_id
<<
"]"
<<
" reader thread start! thread_id = "
<<
thread_id
;
for
(
auto
&
file
:
file_list
)
{
VLOG
(
30
)
<<
"["
<<
thread_id
<<
"]"
<<
" file "
<<
file
;
}
(
*
thread_status
)[
thread_id
]
=
Running
;
VLOG
(
30
)
<<
"set status to running"
;
void
ReadSvmData
(
const
DataDesc
&
data_desc
,
std
::
shared_ptr
<
Reader
>
reader
,
std
::
shared_ptr
<
LoDTensorBlockingQueue
>
queue
)
{
std
::
unordered_map
<
std
::
string
,
size_t
>
slot_to_index
;
for
(
size_t
i
=
0
;
i
<
slots
.
size
();
++
i
)
{
slot_to_index
[
slots
[
i
]]
=
i
;
for
(
size_t
i
=
0
;
i
<
data_desc
.
sparse_slot_ids_
.
size
();
++
i
)
{
slot_to_index
[
data_desc
.
sparse_slot_ids_
[
i
]]
=
i
;
}
std
::
string
line
;
...
...
@@ -166,21 +174,17 @@ void ReadThread(const std::vector<std::string>& file_list,
std
::
vector
<
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
int64_t
>>>
batch_data
;
std
::
vector
<
int64_t
>
batch_label
;
MultiGzipReader
reader
(
file_list
);
VLOG
(
30
)
<<
"reader inited"
;
while
(
reader
.
HasNext
())
{
while
(
reader
->
HasNext
())
{
batch_data
.
clear
();
batch_data
.
reserve
(
batch_size
);
batch_data
.
reserve
(
data_desc
.
batch_size_
);
batch_label
.
clear
();
batch_label
.
reserve
(
batch_size
);
batch_label
.
reserve
(
data_desc
.
batch_size_
);
// read batch_size data
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
if
(
reader
.
HasNext
())
{
reader
.
NextLine
(
&
line
);
for
(
int
i
=
0
;
i
<
data_desc
.
batch_size_
;
++
i
)
{
if
(
reader
->
HasNext
())
{
reader
->
NextLine
(
&
line
);
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
int64_t
>>
slot_to_data
;
int64_t
label
;
parse_line
(
line
,
slot_to_index
,
&
label
,
&
slot_to_data
);
...
...
@@ -193,8 +197,8 @@ void ReadThread(const std::vector<std::string>& file_list,
std
::
vector
<
framework
::
LoDTensor
>
lod_datas
;
// first insert tensor for each slots
for
(
auto
&
slot
:
slots
)
{
// first insert tensor for each s
parse_s
lots
for
(
auto
&
slot
:
data_desc
.
sparse_slot_ids_
)
{
std
::
vector
<
size_t
>
lod_data
{
0
};
std
::
vector
<
int64_t
>
batch_feasign
;
...
...
@@ -226,11 +230,167 @@ void ReadThread(const std::vector<std::string>& file_list,
lod_datas
.
push_back
(
label_tensor
);
queue
->
Push
(
lod_datas
);
VLOG
(
40
)
<<
"push one data, queue_size="
<<
queue
->
Size
();
VLOG
(
4
)
<<
"push one data, queue_size="
<<
queue
->
Size
();
}
}
// Parses one csv-format line made of space-separated columns:
//   label dense_fea,dense_fea sparse_fea,sparse_fea
//
// Column 0 is the label. The columns listed in data_desc.dense_slot_index_
// are split on ',' and parsed as floats; the columns listed in
// data_desc.sparse_slot_index_ are split on ',' and parsed as 64-bit ids.
//
// Outputs:
//   *label        - integer value of column 0.
//   *dense_datas  - resized to one entry per dense slot; parsed values are
//                   appended to entry i for dense_slot_index_[i].
//   *sparse_datas - resized to one entry per sparse slot; parsed ids are
//                   appended to entry i for sparse_slot_index_[i].
//
// Throws std::out_of_range when the line has no columns or a configured slot
// index does not exist in the line, and whatever std::stoll / std::stof throw
// for text that is not a number.
static inline void parse_csv_line(
    const std::string& line, const DataDesc& data_desc, int64_t* label,
    std::vector<std::vector<float>>* dense_datas,
    std::vector<std::vector<int64_t>>* sparse_datas) {
  std::vector<std::string> columns;
  string_split(line, ' ', &columns);

  // The line comes from a data file, so use bounds-checked access: a short or
  // empty line must raise an exception instead of reading out of bounds.
  // stoll (not stol) keeps the full 64-bit range where `long` is 32 bits.
  *label = std::stoll(columns.at(0));

  const std::vector<int>& dense_index = data_desc.dense_slot_index_;
  dense_datas->resize(dense_index.size());
  for (size_t i = 0; i < dense_index.size(); ++i) {
    // A negative index converts to a huge size_t and is rejected by at().
    const std::string& slot_data =
        columns.at(static_cast<size_t>(dense_index[i]));
    std::vector<std::string> values;
    string_split(slot_data, ',', &values);
    std::vector<float>& dense_slot = (*dense_datas)[i];
    for (const auto& value : values) {
      dense_slot.push_back(std::stof(value));
    }
  }

  const std::vector<int>& sparse_index = data_desc.sparse_slot_index_;
  sparse_datas->resize(sparse_index.size());
  for (size_t i = 0; i < sparse_index.size(); ++i) {
    const std::string& slot_data =
        columns.at(static_cast<size_t>(sparse_index[i]));
    std::vector<std::string> values;
    string_split(slot_data, ',', &values);
    std::vector<int64_t>& sparse_slot = (*sparse_datas)[i];
    for (const auto& value : values) {
      sparse_slot.push_back(std::stoll(value));
    }
  }
}
// Reads csv-format lines from `reader` until it is exhausted, groups them
// into batches of at most data_desc.batch_size_ rows, converts each batch to
// LoDTensors and pushes it into `queue`.
//
// Each pushed vector is ordered as: label tensor, one tensor per dense slot,
// one tensor per sparse slot.
//   label  - int64 tensor of shape [rows, 1].
//   dense  - float tensor of shape [rows, width], where width is the number
//            of values the slot holds in the first row of the batch.
//   sparse - int64 tensor of shape [total_ids, 1] with a one-level LoD that
//            records the row boundaries.
//
// Every row of a batch must carry the same number of values in each dense
// slot; a mismatch is reported through PADDLE_ENFORCE_EQ.
void ReadCsvData(const DataDesc& data_desc, std::shared_ptr<Reader> reader,
                 std::shared_ptr<LoDTensorBlockingQueue> queue) {
  std::string line;
  while (reader->HasNext()) {
    std::vector<int64_t> batch_label;
    batch_label.reserve(data_desc.batch_size_);

    std::vector<std::vector<std::vector<float>>> batch_dense_data;
    batch_dense_data.reserve(data_desc.batch_size_);

    std::vector<std::vector<std::vector<int64_t>>> batch_sparse_data;
    batch_sparse_data.reserve(data_desc.batch_size_);

    // read batch_size data
    for (int i = 0; i < data_desc.batch_size_; ++i) {
      if (!reader->HasNext()) {
        break;
      }
      reader->NextLine(&line);

      // Parse straight into the batch storage so a row is not deep-copied
      // after it has been parsed.
      int64_t label;
      batch_dense_data.emplace_back();
      batch_sparse_data.emplace_back();
      parse_csv_line(line, data_desc, &label, &batch_dense_data.back(),
                     &batch_sparse_data.back());
      batch_label.push_back(label);

      // The dense tensors below are filled with `width` floats per row, with
      // the width taken from the first row. Check every slot of the new row
      // against the first row so that copy never reads past a shorter row.
      const std::vector<std::vector<float>>& first_row =
          batch_dense_data.front();
      const std::vector<std::vector<float>>& new_row = batch_dense_data.back();
      PADDLE_ENFORCE_EQ(first_row.size(), new_row.size(),
                        "dense data should have the same shape");
      for (size_t slot = 0; slot < new_row.size(); ++slot) {
        PADDLE_ENFORCE_EQ(first_row[slot].size(), new_row[slot].size(),
                          "dense data should have the same shape");
      }
    }

    // the order of output data is label, dense_datas, sparse_datas
    std::vector<framework::LoDTensor> lod_datas;

    // insert label tensor
    framework::LoDTensor label_tensor;
    auto* label_tensor_data = label_tensor.mutable_data<int64_t>(
        framework::make_ddim({static_cast<int64_t>(batch_label.size()), 1}),
        platform::CPUPlace());
    memcpy(label_tensor_data, batch_label.data(),
           batch_label.size() * sizeof(int64_t));
    lod_datas.push_back(label_tensor);

    // insert tensor for each dense_slots
    for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) {
      framework::LoDTensor lod_tensor;
      size_t width = batch_dense_data[0][i].size();
      auto* tensor_data = lod_tensor.mutable_data<float>(
          framework::make_ddim(
              {static_cast<int64_t>(batch_dense_data.size()),  // batch_size
               static_cast<int64_t>(width)}),
          platform::CPUPlace());

      // Rows are laid out back to back: row j starts at offset j * width.
      for (size_t j = 0; j < batch_dense_data.size(); ++j) {
        auto& dense_data_row = batch_dense_data[j][i];
        memcpy(tensor_data + j * width, dense_data_row.data(),
               width * sizeof(float));
      }

      lod_datas.push_back(lod_tensor);
    }

    // insert tensor for each sparse_slots
    for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) {
      // lod_data holds cumulative id counts, so entries r and r + 1 delimit
      // the ids that belong to row r.
      std::vector<size_t> lod_data{0};
      std::vector<int64_t> batch_feasign;

      for (size_t row_idx = 0; row_idx < batch_sparse_data.size(); ++row_idx) {
        auto& sparse_ids = batch_sparse_data[row_idx][i];
        lod_data.push_back(lod_data.back() + sparse_ids.size());
        batch_feasign.insert(batch_feasign.end(), sparse_ids.begin(),
                             sparse_ids.end());
      }

      framework::LoDTensor lod_tensor;
      framework::LoD lod{lod_data};
      lod_tensor.set_lod(lod);
      int64_t* tensor_data = lod_tensor.mutable_data<int64_t>(
          framework::make_ddim({static_cast<int64_t>(batch_feasign.size()), 1}),
          platform::CPUPlace());
      memcpy(tensor_data, batch_feasign.data(),
             batch_feasign.size() * sizeof(int64_t));
      lod_datas.push_back(lod_tensor);
    }

    queue->Push(lod_datas);
    VLOG(4) << "push one data, queue_size=" << queue->Size();
  }
}
// Body of one reader thread: reads every file in `file_list` with the reader
// selected by data_desc.file_type_ ("gzip" or "plain"), parses it according
// to data_desc.file_format_ ("svm" or "csv") and pushes the resulting batches
// into `queue`.
//
// (*thread_status)[thread_id] is set to Running before reading starts and to
// Stopped once all data has been read.
//
// An unsupported file type or file format is reported through PADDLE_THROW.
void ReadThread(const std::vector<std::string>& file_list,
                const DataDesc& data_desc, int thread_id,
                std::vector<ReaderThreadStatus>* thread_status,
                std::shared_ptr<LoDTensorBlockingQueue> queue) {
  VLOG(3) << "[" << thread_id << "]"
          << " reader thread start! thread_id = " << thread_id;
  for (auto& file : file_list) {
    VLOG(3) << "[" << thread_id << "]"
            << " file " << file;
  }
  (*thread_status)[thread_id] = Running;
  VLOG(3) << "set status to running";

  std::shared_ptr<Reader> reader;
  if (data_desc.file_type_ == "gzip") {
    reader.reset(new MultiFileReader<GzipReader>(file_list));
  } else if (data_desc.file_type_ == "plain") {
    reader.reset(new MultiFileReader<PlainFileReader>(file_list));
  } else {
    // This branch rejects the file *type* (gzip/plain), not the format.
    PADDLE_THROW("do not support file type %s", data_desc.file_type_);
  }

  VLOG(3) << "reader inited";

  if (data_desc.file_format_ == "svm") {
    ReadSvmData(data_desc, reader, queue);
  } else if (data_desc.file_format_ == "csv") {
    ReadCsvData(data_desc, reader, queue);
  } else {
    // Without this branch an unknown format would read nothing and the
    // thread would report Stopped as if the files had been empty.
    PADDLE_THROW("do not support file format %s", data_desc.file_format_);
  }

  (*thread_status)[thread_id] = Stopped;
  VLOG(3) << "set status to stopped, thread " << thread_id << " exited";
}
}
// namespace reader
...
...
paddle/fluid/operators/reader/ctr_reader.h
浏览文件 @
c5855506
...
...
@@ -36,9 +36,63 @@ namespace reader {
enum
ReaderThreadStatus
{
Running
,
Stopped
};
// Describes the input of a CTR reader: which files to read, how they are
// stored and which slots to extract. All members are const and are copied
// from the constructor arguments, so a DataDesc cannot be modified after
// construction.
struct DataDesc {
  // Stores a copy of every argument in the member of the same name.
  DataDesc(int batch_size, const std::vector<std::string>& file_names,
           const std::string& file_type, const std::string& file_format,
           const std::vector<int>& dense_slot_index,
           const std::vector<int>& sparse_slot_index,
           const std::vector<std::string>& sparse_slot_ids)
      : batch_size_(batch_size),
        file_names_(file_names),
        file_type_(file_type),
        file_format_(file_format),
        dense_slot_index_(dense_slot_index),
        sparse_slot_index_(sparse_slot_index),
        sparse_slot_ids_(sparse_slot_ids) {}

  // Batch size passed to the constructor.
  const int batch_size_;
  // Files to read.
  const std::vector<std::string> file_names_;
  const std::string file_type_;    // gzip or plain
  const std::string file_format_;  // csv or svm
  // used for csv data format
  const std::vector<int> dense_slot_index_;
  const std::vector<int> sparse_slot_index_;
  // used for svm data format
  const std::vector<std::string> sparse_slot_ids_;
};
inline
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
const
DataDesc
&
data_desc
)
{
os
<<
"data_desc:
\n
"
;
os
<<
"
\t
batch_size -> "
<<
data_desc
.
batch_size_
<<
"
\n
"
;
os
<<
"
\t
file_type -> "
<<
data_desc
.
file_type_
<<
"
\n
"
;
os
<<
"
\t
file_format -> "
<<
data_desc
.
file_format_
<<
"
\n
"
;
os
<<
"
\t
file_names -> {"
;
for
(
auto
&
file_name
:
data_desc
.
file_names_
)
{
os
<<
file_name
<<
","
;
}
os
<<
"}
\n
"
;
os
<<
"
\t
dense_slot_index -> {"
;
for
(
auto
&
slot
:
data_desc
.
dense_slot_index_
)
{
os
<<
slot
<<
","
;
}
os
<<
"}
\n
"
;
os
<<
"
\t
sparse_slot_index_ -> {"
;
for
(
auto
&
slot
:
data_desc
.
sparse_slot_index_
)
{
os
<<
slot
<<
","
;
}
os
<<
"}
\n
"
;
os
<<
"
\t
sparse_slot_ids_ -> {"
;
for
(
auto
&
slot
:
data_desc
.
sparse_slot_ids_
)
{
os
<<
slot
<<
","
;
}
os
<<
"}
\n
"
;
return
os
;
}
void
ReadThread
(
const
std
::
vector
<
std
::
string
>&
file_list
,
const
std
::
vector
<
std
::
string
>&
slots
,
int
batch_size
,
int
thread_id
,
std
::
vector
<
ReaderThreadStatus
>*
thread_status
,
const
DataDesc
&
data_desc
,
int
thread_id
,
std
::
vector
<
ReaderThreadStatus
>*
thread_status
,
std
::
shared_ptr
<
LoDTensorBlockingQueue
>
queue
);
// monitor all running thread, if they are all stopped,
...
...
@@ -48,15 +102,15 @@ void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
class
CTRReader
:
public
framework
::
FileReader
{
public:
explicit
CTRReader
(
const
std
::
shared_ptr
<
LoDTensorBlockingQueue
>&
queue
,
int
batch_size
,
size_t
thread_num
,
const
std
::
vector
<
std
::
string
>&
slots
,
const
std
::
vector
<
std
::
string
>&
file_list
)
:
batch_size_
(
batch_size
),
slots_
(
slots
),
file_list_
(
file_list
)
{
CTRReader
(
const
std
::
shared_ptr
<
LoDTensorBlockingQueue
>&
queue
,
int
thread_num
,
const
DataDesc
&
data_desc
)
:
data_desc_
(
data_desc
)
{
PADDLE_ENFORCE_GT
(
thread_num
,
0
,
"thread num should be larger then 0!"
);
PADDLE_ENFORCE
(
queue
!=
nullptr
,
"LoDTensorBlockingQueue must not be null"
);
PADDLE_ENFORCE_GT
(
file_list
.
size
(),
0
,
"file list should not be empty"
);
thread_num_
=
std
::
min
<
size_t
>
(
file_list_
.
size
(),
thread_num
);
PADDLE_ENFORCE_GT
(
data_desc_
.
file_names_
.
size
(),
0
,
"file list should not be empty"
);
thread_num_
=
std
::
min
<
size_t
>
(
data_desc_
.
file_names_
.
size
(),
thread_num
);
queue_
=
queue
;
SplitFiles
();
for
(
size_t
i
=
0
;
i
<
thread_num_
;
++
i
)
{
...
...
@@ -64,7 +118,7 @@ class CTRReader : public framework::FileReader {
}
}
~
CTRReader
()
{}
~
CTRReader
()
{
Shutdown
();
}
void
ReadNext
(
std
::
vector
<
framework
::
LoDTensor
>*
out
)
override
{
bool
success
;
...
...
@@ -81,7 +135,10 @@ class CTRReader : public framework::FileReader {
for
(
auto
&
read_thread
:
read_threads_
)
{
read_thread
->
join
();
}
monitor_thread_
->
join
();
if
(
monitor_thread_
)
{
monitor_thread_
->
join
();
}
read_threads_
.
clear
();
monitor_thread_
.
reset
(
nullptr
);
...
...
@@ -95,9 +152,9 @@ class CTRReader : public framework::FileReader {
queue_
->
ReOpen
();
VLOG
(
3
)
<<
"reopen success"
;
VLOG
(
3
)
<<
"thread_num "
<<
thread_num_
;
for
(
size_
t
thread_id
=
0
;
thread_id
<
thread_num_
;
thread_id
++
)
{
for
(
in
t
thread_id
=
0
;
thread_id
<
thread_num_
;
thread_id
++
)
{
read_threads_
.
emplace_back
(
new
std
::
thread
(
std
::
bind
(
&
ReadThread
,
file_groups_
[
thread_id
],
slots_
,
batch_size
_
,
&
ReadThread
,
file_groups_
[
thread_id
],
data_desc
_
,
static_cast
<
int
>
(
thread_id
),
&
read_thread_status_
,
queue_
)));
}
monitor_thread_
.
reset
(
new
std
::
thread
(
...
...
@@ -108,8 +165,8 @@ class CTRReader : public framework::FileReader {
private:
void
SplitFiles
()
{
file_groups_
.
resize
(
thread_num_
);
for
(
size_t
i
=
0
;
i
<
file_list
_
.
size
();
++
i
)
{
auto
&
file_name
=
file_list
_
[
i
];
for
(
size_t
i
=
0
;
i
<
data_desc_
.
file_names
_
.
size
();
++
i
)
{
auto
&
file_name
=
data_desc_
.
file_names
_
[
i
];
std
::
ifstream
f
(
file_name
.
c_str
());
PADDLE_ENFORCE
(
f
.
good
(),
"file %s not exist!"
,
file_name
);
file_groups_
[
i
%
thread_num_
].
push_back
(
file_name
);
...
...
@@ -118,9 +175,7 @@ class CTRReader : public framework::FileReader {
private:
size_t
thread_num_
;
const
int
batch_size_
;
const
std
::
vector
<
std
::
string
>
slots_
;
const
std
::
vector
<
std
::
string
>
file_list_
;
const
DataDesc
data_desc_
;
std
::
shared_ptr
<
LoDTensorBlockingQueue
>
queue_
;
std
::
vector
<
std
::
unique_ptr
<
std
::
thread
>>
read_threads_
;
std
::
unique_ptr
<
std
::
thread
>
monitor_thread_
;
...
...
paddle/fluid/operators/reader/ctr_reader_test.cc
浏览文件 @
c5855506
...
...
@@ -36,6 +36,7 @@ using paddle::framework::LoD;
using
paddle
::
framework
::
DDim
;
using
paddle
::
platform
::
CPUPlace
;
using
paddle
::
framework
::
make_ddim
;
using
paddle
::
operators
::
reader
::
DataDesc
;
static
void
generatedata
(
const
std
::
vector
<
std
::
string
>&
data
,
const
std
::
string
&
file_name
)
{
...
...
@@ -126,30 +127,103 @@ TEST(CTR_READER, read_data) {
LoDTensorBlockingQueueHolder
queue_holder
;
int
capacity
=
64
;
queue_holder
.
InitOnce
(
capacity
,
{},
false
);
queue_holder
.
InitOnce
(
capacity
,
false
);
std
::
shared_ptr
<
LoDTensorBlockingQueue
>
queue
=
queue_holder
.
GetQueue
();
int
batch_size
=
3
;
int
thread_num
=
1
;
std
::
vector
<
std
::
string
>
slots
=
{
"6002"
,
"6003"
};
std
::
vector
<
std
::
string
>
s
parse_s
lots
=
{
"6002"
,
"6003"
};
std
::
vector
<
std
::
string
>
file_list
;
for
(
int
i
=
0
;
i
<
thread_num
;
++
i
)
{
file_list
.
push_back
(
gz_file_name
);
}
CTRReader
reader
(
queue
,
batch_size
,
thread_num
,
slots
,
file_list
);
DataDesc
data_desc
(
batch_size
,
file_list
,
"gzip"
,
"svm"
,
{},
{},
sparse_slots
);
CTRReader
reader
(
queue
,
thread_num
,
data_desc
);
reader
.
Start
();
size_t
batch_num
=
std
::
ceil
(
static_cast
<
float
>
(
ctr_data
.
size
())
/
batch_size
)
*
thread_num
;
check_all_data
(
ctr_data
,
slots
,
label_dims
,
label_value
,
data_slot_6002
,
data_slot_6003
,
batch_num
,
batch_size
,
queue
,
&
reader
);
check_all_data
(
ctr_data
,
sparse_slots
,
label_dims
,
label_value
,
data_slot_6002
,
data_slot_6003
,
batch_num
,
batch_size
,
queue
,
&
reader
);
reader
.
Shutdown
();
reader
.
Start
();
check_all_data
(
ctr_data
,
slots
,
label_dims
,
label_value
,
data_slot_6002
,
data_slot_6003
,
batch_num
,
batch_size
,
queue
,
&
reader
);
check_all_data
(
ctr_data
,
sparse_slots
,
label_dims
,
label_value
,
data_slot_6002
,
data_slot_6003
,
batch_num
,
batch_size
,
queue
,
&
reader
);
reader
.
Shutdown
();
}
// Test helper: writes every string of `data`, in order and without adding
// separators, to the file `file_name`. Opening and saving the file are both
// checked with PADDLE_ENFORCE.
static void GenereteCsvData(const std::string& file_name,
                            const std::vector<std::string>& data) {
  std::ofstream csv_file(file_name.c_str());
  PADDLE_ENFORCE(csv_file.good(), "open file %s failed!", file_name);

  for (size_t row = 0; row < data.size(); ++row) {
    csv_file << data[row];
  }

  // Close before the final check so that write errors are visible in good().
  csv_file.close();
  PADDLE_ENFORCE(csv_file.good(), "save file %s failed!", file_name);
}
static
void
CheckReadCsvOut
(
const
std
::
vector
<
LoDTensor
>&
out
)
{
ASSERT_EQ
(
out
.
size
(),
3
);
ASSERT_EQ
(
out
[
0
].
dims
()[
1
],
1
);
ASSERT_EQ
(
out
[
1
].
dims
()[
1
],
2
);
ASSERT_EQ
(
out
[
2
].
dims
()[
1
],
1
);
for
(
size_t
i
=
0
;
i
<
out
[
0
].
numel
();
++
i
)
{
int64_t
label
=
out
[
0
].
data
<
int64_t
>
()[
i
];
auto
&
dense_dim
=
out
[
1
].
dims
();
for
(
size_t
j
=
0
;
j
<
dense_dim
[
1
];
++
j
)
{
ASSERT_EQ
(
out
[
1
].
data
<
float
>
()[
i
*
dense_dim
[
1
]
+
j
],
static_cast
<
float
>
(
label
+
0.1
));
}
auto
&
sparse_lod
=
out
[
2
].
lod
();
for
(
size_t
j
=
sparse_lod
[
0
][
i
];
j
<
sparse_lod
[
0
][
i
+
1
];
++
j
)
{
ASSERT_EQ
(
out
[
2
].
data
<
int64_t
>
()[
j
],
label
);
}
}
}
// Writes a small plain-text csv file, reads it back through CTRReader and
// checks every batch with CheckReadCsvOut. The read is performed twice on the
// same reader to cover a Start() after a Shutdown().
TEST(CTR_READER, read_csv_data) {
  std::string file_name = "test_ctr_reader_data.csv";
  // Each line is "label dense,dense sparse,sparse,sparse,sparse": the dense
  // values are label + 0.1 and the sparse ids equal the label, which is what
  // CheckReadCsvOut verifies.
  const std::vector<std::string> csv_data = {
      "0 0.1,0.1 0,0,0,0\n", "1 1.1,1.1 1,1,1,1\n", "2 2.1,2.1 2,2,2,2\n",
      "3 3.1,3.1 3,3,3,3\n",
  };
  GenereteCsvData(file_name, csv_data);

  LoDTensorBlockingQueueHolder queue_holder;
  int capacity = 64;
  queue_holder.InitOnce(capacity, false);

  std::shared_ptr<LoDTensorBlockingQueue> queue = queue_holder.GetQueue();

  int batch_size = 3;
  int thread_num = 1;
  // One copy of the file per reader thread.
  std::vector<std::string> file_list;
  for (int i = 0; i < thread_num; ++i) {
    file_list.push_back(file_name);
  }
  // Column 1 is the dense slot, column 2 the sparse slot; no svm slot ids.
  DataDesc data_desc(batch_size, file_list, "plain", "csv", {1}, {2}, {});

  CTRReader reader(queue, thread_num, data_desc);

  // Two full passes over the data with the same reader instance.
  for (size_t i = 0; i < 2; ++i) {
    reader.Start();
    std::vector<LoDTensor> out;
    while (true) {
      reader.ReadNext(&out);
      // An empty result ends the pass.
      if (out.empty()) {
        break;
      }
      CheckReadCsvOut(out);
    }
    reader.Shutdown();
  }
}
paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
浏览文件 @
c5855506
...
...
@@ -32,10 +32,8 @@ class LoDTensorBlockingQueue {
friend
class
LoDTensorBlockingQueueHolder
;
private:
LoDTensorBlockingQueue
(
size_t
capacity
,
const
std
::
vector
<
framework
::
DDim
>&
dims
,
bool
speed_test_mode
=
false
)
:
queue_
(
capacity
,
speed_test_mode
),
dims_
(
dims
)
{}
explicit
LoDTensorBlockingQueue
(
size_t
capacity
,
bool
speed_test_mode
=
false
)
:
queue_
(
capacity
,
speed_test_mode
)
{}
public:
bool
Push
(
const
std
::
vector
<
framework
::
LoDTensor
>&
lod_tensor_vec
)
{
...
...
@@ -65,17 +63,15 @@ class LoDTensorBlockingQueue {
private:
BlockingQueue
<
std
::
vector
<
framework
::
LoDTensor
>>
queue_
;
std
::
vector
<
framework
::
DDim
>
dims_
;
};
class
LoDTensorBlockingQueueHolder
{
public:
void
InitOnce
(
size_t
capacity
,
const
std
::
vector
<
framework
::
DDim
>&
dims
,
bool
speed_test_mode
=
false
)
{
void
InitOnce
(
size_t
capacity
,
bool
speed_test_mode
=
false
)
{
PADDLE_ENFORCE
(
queue_
==
nullptr
,
"LoDTensorBlockingQueueHolder::InitOnce() can only be called once"
);
queue_
.
reset
(
new
LoDTensorBlockingQueue
(
capacity
,
dims
,
speed_test_mode
));
queue_
.
reset
(
new
LoDTensorBlockingQueue
(
capacity
,
speed_test_mode
));
}
inline
const
std
::
shared_ptr
<
LoDTensorBlockingQueue
>&
GetQueue
()
const
{
...
...
paddle/fluid/operators/reader/read_op.cc
浏览文件 @
c5855506
...
...
@@ -27,13 +27,13 @@ class ReadInferShape : public framework::InferShapeBase {
"The ReadOp must take a reader as input."
);
PADDLE_ENFORCE
(
ctx
->
HasOutputs
(
"Out"
),
"The ReadOp should be assigned with output."
);
std
::
vector
<
framework
::
DDim
>
reader_dims
=
ctx
->
GetReaderDims
(
"Reader"
);
std
::
vector
<
std
::
string
>
out_names
=
ctx
->
Outputs
(
"Out
"
);
PADDLE_ENFORCE_EQ
(
reader_dims
.
size
(),
out_names
.
size
(),
"The reader's dim number doesn't match the output number."
);
ctx
->
SetOutputsDim
(
"Out"
,
reader_dims
);
if
(
!
ctx
->
IsRuntime
())
{
if
(
!
ctx
->
IsRuntime
()
&&
ctx
->
Attrs
().
Get
<
bool
>
(
"infer_out"
))
{
std
::
vector
<
framework
::
DDim
>
reader_dims
=
ctx
->
GetReaderDims
(
"Reader
"
);
std
::
vector
<
std
::
string
>
out_names
=
ctx
->
Outputs
(
"Out"
);
PADDLE_ENFORCE_EQ
(
reader_dims
.
size
(),
out_names
.
size
(),
"The reader's dim number doesn't match the output number."
);
ctx
->
SetOutputsDim
(
"Out"
,
reader_dims
);
auto
in_desc
=
boost
::
get
<
framework
::
VarDesc
*>
(
ctx
->
GetInputVarPtrs
(
"Reader"
)[
0
]);
auto
in_lod_levels
=
in_desc
->
GetLoDLevels
();
...
...
@@ -53,15 +53,18 @@ class ReadInferVarType : public framework::VarTypeInference {
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{
std
::
string
reader_name
=
op_desc
.
Input
(
"Reader"
)[
0
];
std
::
vector
<
std
::
string
>
out_names
=
op_desc
.
Output
(
"Out"
);
framework
::
VarDesc
*
reader
=
block
->
FindVarRecursive
(
reader_name
);
auto
dtypes
=
reader
->
GetDataTypes
();
PADDLE_ENFORCE_EQ
(
dtypes
.
size
(),
out_names
.
size
());
for
(
size_t
i
=
0
;
i
<
dtypes
.
size
();
++
i
)
{
framework
::
VarDesc
&
out
=
block
->
FindRecursiveOrCreateVar
(
out_names
[
i
]);
out
.
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
out
.
SetDataType
(
dtypes
[
i
]);
bool
infer_out
=
boost
::
get
<
bool
>
(
op_desc
.
GetAttr
(
"infer_out"
));
if
(
infer_out
)
{
std
::
string
reader_name
=
op_desc
.
Input
(
"Reader"
)[
0
];
std
::
vector
<
std
::
string
>
out_names
=
op_desc
.
Output
(
"Out"
);
framework
::
VarDesc
*
reader
=
block
->
FindVarRecursive
(
reader_name
);
auto
dtypes
=
reader
->
GetDataTypes
();
PADDLE_ENFORCE_EQ
(
dtypes
.
size
(),
out_names
.
size
());
for
(
size_t
i
=
0
;
i
<
dtypes
.
size
();
++
i
)
{
framework
::
VarDesc
&
out
=
block
->
FindRecursiveOrCreateVar
(
out_names
[
i
]);
out
.
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
out
.
SetDataType
(
dtypes
[
i
]);
}
}
}
};
...
...
@@ -73,6 +76,7 @@ class ReadOp : public framework::OperatorBase {
private:
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
)
const
override
{
VLOG
(
3
)
<<
"read op in"
;
framework
::
ReaderHolder
*
reader
=
detail
::
Ref
(
scope
.
FindVar
(
Input
(
"Reader"
)),
"Cannot find reader variable %s"
,
Input
(
"Reader"
))
...
...
@@ -87,7 +91,9 @@ class ReadOp : public framework::OperatorBase {
reader
->
ReadNext
(
&
ins
);
if
(
ins
.
empty
())
{
VLOG
(
3
)
<<
"read empty data in"
;
if
(
Attr
<
bool
>
(
"throw_eof_exp"
))
{
VLOG
(
3
)
<<
"throw_eof_exp"
;
PADDLE_THROW_EOF
();
}
else
{
ins
.
resize
(
out_arg_names
.
size
());
...
...
@@ -96,6 +102,7 @@ class ReadOp : public framework::OperatorBase {
tensor
.
mutable_data
<
float
>
(
framework
::
make_ddim
({
0
}),
dev_place
);
}
}
VLOG
(
3
)
<<
"read empty data out"
;
}
PADDLE_ENFORCE_EQ
(
ins
.
size
(),
out_arg_names
.
size
());
for
(
size_t
i
=
0
;
i
<
out_arg_names
.
size
();
++
i
)
{
...
...
@@ -120,6 +127,7 @@ class ReadOpMaker : public framework::OpProtoAndCheckerMaker {
" only when the data-balance is enabled in ParallelExecutor"
" and it is set by ParallelExecutor instance, not users."
)
.
SetDefault
(
true
);
AddAttr
<
bool
>
(
"infer_out"
,
""
).
SetDefault
(
true
);
AddComment
(
R"DOC(
Read Operator
...
...
paddle/fluid/operators/reader/reader_op_registry.cc
浏览文件 @
c5855506
...
...
@@ -65,6 +65,10 @@ void FileReaderMakerBase::Make() {
"It means the reader will generate two data each time,"
"whose shapes are [2,3,4] and [5,6] respectively."
);
AddAttr
<
std
::
vector
<
int
>>
(
"lod_levels"
,
"The LoD levels of each data."
);
AddAttr
<
bool
>
(
"use_data_config"
,
"Use the config of all datas like shape_concat/ranks/lod_levels"
)
.
SetDefault
(
true
);
Apply
();
}
...
...
@@ -75,19 +79,23 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"The output file reader should not be null."
);
const
auto
shape_concat
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"shape_concat"
);
const
auto
ranks
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"ranks"
);
std
::
vector
<
framework
::
DDim
>
shapes
=
RestoreShapes
(
shape_concat
,
ranks
);
ctx
->
SetReaderDims
(
"Out"
,
shapes
);
const
auto
lod_levels
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"lod_levels"
);
PADDLE_ENFORCE_EQ
(
lod_levels
.
size
(),
shapes
.
size
(),
"The number of 'lod_levels'(%d) doesn't match the number "
"of 'shapes'(%d)."
,
lod_levels
.
size
(),
shapes
.
size
());
framework
::
VarDesc
*
reader
=
boost
::
get
<
framework
::
VarDesc
*>
(
ctx
->
GetOutputVarPtrs
(
"Out"
)[
0
]);
reader
->
SetLoDLevels
(
lod_levels
);
bool
use_data_config
=
ctx
->
Attrs
().
Get
<
bool
>
(
"use_data_config"
);
if
(
use_data_config
)
{
const
auto
shape_concat
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"shape_concat"
);
const
auto
ranks
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"ranks"
);
std
::
vector
<
framework
::
DDim
>
shapes
=
RestoreShapes
(
shape_concat
,
ranks
);
ctx
->
SetReaderDims
(
"Out"
,
shapes
);
const
auto
lod_levels
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"lod_levels"
);
PADDLE_ENFORCE_EQ
(
lod_levels
.
size
(),
shapes
.
size
(),
"The number of 'lod_levels'(%d) doesn't match the number "
"of 'shapes'(%d)."
,
lod_levels
.
size
(),
shapes
.
size
());
framework
::
VarDesc
*
reader
=
boost
::
get
<
framework
::
VarDesc
*>
(
ctx
->
GetOutputVarPtrs
(
"Out"
)[
0
]);
reader
->
SetLoDLevels
(
lod_levels
);
}
}
void
FileReaderInferVarType
::
operator
()(
const
framework
::
OpDesc
&
op_desc
,
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
c5855506
...
...
@@ -485,6 +485,7 @@ All parameter, weight, gradient are variables in Paddle.
py
::
return_value_policy
::
reference
);
py
::
class_
<
framework
::
ReaderHolder
>
(
m
,
"Reader"
,
""
)
.
def
(
"start"
,
&
framework
::
ReaderHolder
::
Start
)
.
def
(
"reset"
,
&
framework
::
ReaderHolder
::
ResetAll
);
using
LoDTensorBlockingQueue
=
...
...
@@ -505,19 +506,12 @@ All parameter, weight, gradient are variables in Paddle.
.
def
(
"is_closed"
,
&
LoDTensorBlockingQueue
::
IsClosed
);
m
.
def
(
"init_lod_tensor_blocking_queue"
,
[](
Variable
&
var
,
size_t
capacity
,
const
std
::
vector
<
std
::
vector
<
int64_t
>>
&
shapes
)
->
std
::
shared_ptr
<
LoDTensorBlockingQueue
>
{
std
::
vector
<
DDim
>
dims
(
shapes
.
size
());
std
::
transform
(
shapes
.
begin
(),
shapes
.
end
(),
dims
.
begin
(),
[](
const
std
::
vector
<
int64_t
>
&
shape
)
{
return
make_ddim
(
shape
);
});
auto
*
holder
=
var
.
GetMutable
<
LoDTensorBlockingQueueHolder
>
();
holder
->
InitOnce
(
capacity
,
dims
,
FLAGS_reader_queue_speed_test_mode
);
return
holder
->
GetQueue
();
},
[](
Variable
&
var
,
size_t
capacity
)
->
std
::
shared_ptr
<
LoDTensorBlockingQueue
>
{
auto
*
holder
=
var
.
GetMutable
<
LoDTensorBlockingQueueHolder
>
();
holder
->
InitOnce
(
capacity
,
FLAGS_reader_queue_speed_test_mode
);
return
holder
->
GetQueue
();
},
py
::
return_value_policy
::
copy
);
py
::
class_
<
Scope
>
(
m
,
"_Scope"
,
R"DOC(
...
...
python/paddle/fluid/contrib/__init__.py
浏览文件 @
c5855506
...
...
@@ -22,6 +22,8 @@ from . import op_frequence
from
.op_frequence
import
*
from
.
import
quantize
from
.quantize
import
*
from
.
import
reader
from
.reader
import
*
from
.
import
slim
from
.slim
import
*
from
.
import
utils
...
...
@@ -32,5 +34,6 @@ __all__ += decoder.__all__
__all__
+=
memory_usage_calc
.
__all__
__all__
+=
op_frequence
.
__all__
__all__
+=
quantize
.
__all__
__all__
+=
reader
.
__all__
__all__
+=
slim
.
__all__
__all__
+=
utils
.
__all__
python/paddle/fluid/contrib/reader/README.md
0 → 100644
浏览文件 @
c5855506
## CTR READER
A multi-threaded C++ reader that has the same interface as py_reader. It
uses multiple C++ threads to read files and is much faster than the Python read
thread in py_reader.
Currently, it supports two types of file:
-
gzip
-
plain text file
and two types of data format:
-
the csv data format is :
*
label dense_fea,dense_fea sparse_fea,sparse_fea
-
the svm data format is :
*
label slot1:fea_sign slot2:fea_sign slot1:fea_sign
python/paddle/fluid/contrib/reader/__init__.py
0 → 100644
浏览文件 @
c5855506
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
from
.
import
ctr_reader
__all__
=
ctr_reader
.
__all__
python/paddle/fluid/contrib/reader/ctr_reader.py
浏览文件 @
c5855506
...
...
@@ -20,6 +20,8 @@ from paddle.fluid.framework import default_main_program, \
default_startup_program
,
Variable
from
paddle.fluid.unique_name
import
generate
as
unique_name
__all__
=
[
'ctr_reader'
]
def
monkey_patch_reader_methods
(
reader
):
def
__get_reader__
():
...
...
@@ -30,7 +32,11 @@ def monkey_patch_reader_methods(reader):
def
reset
():
return
__get_reader__
().
reset
()
def
start
():
return
__get_reader__
().
start
()
reader
.
reset
=
reset
reader
.
start
=
start
reader
.
stop_gradient
=
True
reader
.
persistable
=
True
return
reader
...
...
@@ -44,13 +50,18 @@ def _copy_reader_var_(block, var):
return
new_var
def
ctr_reader
(
feed_data
,
capacity
,
thread_num
,
batch_size
,
file_list
,
slots
,
name
=
None
):
def
ctr_reader
(
feed_dict
,
file_type
,
# gzip or plain
file_format
,
# csv or svm
dense_slot_index
,
sparse_slot_index
,
capacity
,
thread_num
,
batch_size
,
file_list
,
slots
,
name
=
None
):
"""
Create a CTR reader for data feeding in Python
...
...
@@ -67,12 +78,21 @@ def ctr_reader(feed_data,
Note that :code:`Program.clone()` method cannot clone :code:`py_reader`.
Args:
feed_dict(list(variable)): a list of data variable.
file_type('gzip'|'plain'): the type of the data file
file_format('csv'|'svm'): csv data or svm data format.
the csv data format is :
label dense_fea,dense_fea sparse_fea,sparse_fea
the svm data format is :
label slot1:fea_sign slot2:fea_sign slot1:fea_sign
dense_slot_index(list(int)): the index of dense slots
sparse_slot_index(list(int)): the index of sparse slots
capacity(int): The buffer capacity maintained by :code:`py_reader`.
thread_num(
list|tuple): List of tuples which declaring data shapes
.
batch_size(
list|tuple): List of strs which declaring data type
.
file_list(list
|tuple): List of ints which declaring data lod_level
.
slots(
bool): Whether use double buffer or not
.
name(
base
string): The prefix Python queue name and Reader name. None will
thread_num(
int): the thread num to read files by cpp reader
.
batch_size(
int): batch size of data
.
file_list(list
(str)): List of file names that need to read
.
slots(
list(int64)): list of slot id
.
name(string): The prefix Python queue name and Reader name. None will
be generated automatically.
Returns:
...
...
@@ -80,7 +100,15 @@ def ctr_reader(feed_data,
Examples:
1. The basic usage of :code:`py_reader` is as follows:
1. The basic usage of :code:`ctr_reader` is as follows:
.. code-block:: python
py_reader = fluid.contrib.ctr_reader.ctr_reader(
feed_dict=datas, file_type='plain', file_format='csv',
file_list=file_list, dense_slot_index=[1, 2, 3, 4], sparse_slot_index=[],
capacity=64, thread_num=20, batch_size=1000, slots=[], name='ctr_reader')
"""
if
name
is
None
:
queue_name
=
unique_name
(
'lod_tensor_blocking_queue'
)
...
...
@@ -90,7 +118,7 @@ def ctr_reader(feed_data,
reader_name
=
"_"
.
join
([
name
,
"reader"
])
var
=
global_scope
().
var
(
queue_name
)
feed_queue
=
core
.
init_lod_tensor_blocking_queue
(
var
,
capacity
,
shapes
)
feed_queue
=
core
.
init_lod_tensor_blocking_queue
(
var
,
capacity
)
startup_blk
=
default_startup_program
().
current_block
()
reader_var
=
startup_blk
.
create_var
(
name
=
reader_name
)
...
...
@@ -99,12 +127,22 @@ def ctr_reader(feed_data,
inputs
=
{
'blocking_queue'
:
[
queue_name
]},
outputs
=
{
'Out'
:
[
reader_var
]},
attrs
=
{
'use_data_config'
:
False
,
'thread_num'
:
thread_num
,
'batch_size'
:
batch_size
,
'file_list'
:
file_list
,
'slots'
:
slots
,
'file_type'
:
file_type
,
'file_format'
:
file_format
,
'dense_slot_index'
:
dense_slot_index
,
'sparse_slot_index'
:
sparse_slot_index
,
'sparse_slots'
:
slots
,
'ranks'
:
[],
'lod_levels'
:
[],
'shape_concat'
:
[]
})
dtypes
=
[
data
.
dtype
for
data
in
feed_dict
]
reader_var
.
desc
.
set_dtypes
(
dtypes
)
reader_var
.
persistable
=
True
main_prog_reader_var
=
_copy_reader_var_
(
...
...
@@ -118,6 +156,9 @@ def ctr_reader(feed_data,
main_blk
=
default_main_program
().
current_block
()
main_blk
.
append_op
(
type
=
'read'
,
inputs
=
{
'Reader'
:
[
reader
]},
outputs
=
{
'Out'
:
feed_data
})
type
=
'read'
,
inputs
=
{
'Reader'
:
[
reader
]},
attrs
=
{
'infer_out'
:
False
},
outputs
=
{
'Out'
:
feed_dict
})
return
reader
python/paddle/fluid/layers/io.py
浏览文件 @
c5855506
...
...
@@ -523,7 +523,7 @@ def _py_reader(capacity,
double_buffer_name
=
"_"
.
join
([
name
,
"double_buffer"
])
var
=
global_scope
().
var
(
queue_name
)
feed_queue
=
core
.
init_lod_tensor_blocking_queue
(
var
,
capacity
,
shapes
)
feed_queue
=
core
.
init_lod_tensor_blocking_queue
(
var
,
capacity
)
startup_blk
=
default_startup_program
().
current_block
()
startup_var
=
startup_blk
.
create_var
(
name
=
reader_name
)
...
...
python/setup.py.in
浏览文件 @
c5855506
...
...
@@ -109,6 +109,7 @@ packages=['paddle',
'paddle.fluid.contrib',
'paddle.fluid.contrib.decoder',
'paddle.fluid.contrib.quantize',
'paddle.fluid.contrib.reader',
'paddle.fluid.contrib.slim',
'paddle.fluid.contrib.slim.core',
'paddle.fluid.contrib.slim.graph',
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录