Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
84b423a8
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
84b423a8
编写于
2月 23, 2017
作者:
D
dangqingqing
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
refine data feeder and add unit test
上级
823b6352
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
264 addition
and
35 deletion
+264
-35
python/CMakeLists.txt
python/CMakeLists.txt
+1
-0
python/paddle/v2/data_feeder.py
python/paddle/v2/data_feeder.py
+75
-35
python/paddle/v2/tests/CMakeLists.txt
python/paddle/v2/tests/CMakeLists.txt
+2
-0
python/paddle/v2/tests/run_tests.sh
python/paddle/v2/tests/run_tests.sh
+36
-0
python/paddle/v2/tests/test_data_feeder.py
python/paddle/v2/tests/test_data_feeder.py
+150
-0
未找到文件。
python/CMakeLists.txt
浏览文件 @
84b423a8
...
...
@@ -25,6 +25,7 @@ add_custom_target(paddle_python ALL DEPENDS
# Pull the Python test directories into the build, one sub-project per line.
add_subdirectory(paddle/trainer_config_helpers/tests)
add_subdirectory(paddle/reader/tests)
add_subdirectory(paddle/v2/tests)
install
(
DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
/dist/
DESTINATION opt/paddle/share/wheels
...
...
python/paddle/v2/data_feeder.py
浏览文件 @
84b423a8
...
...
@@ -12,49 +12,89 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from
py_paddle
import
swig_paddle
from
py_paddle
import
DataProviderConverter
import
data_type
__all__
=
[
'DataFeeder'
]
"""
DataFeeder converts the data returned by paddle.reader into a data structure
of Arguments which is defined in the API. The paddle.reader usually returns
a list of mini-batch data. Each item in the list is a tuple or list, which is
one sample with multiple features. DataFeeder converts this mini-batch data
into Arguments in order to feed it to C++ interface.
The example usage:
data_types = [paddle.data_type.dense_vector(784),
paddle.data_type.integer_value(10)]
feeder = DataFeeder(input_types=data_types)
minibatch_data = [
( [1.0,2.0,3.0,4.0], 5, [6,7,8] ), # first sample
( [1.0,2.0,3.0,4.0], 5, [6,7,8] ) # second sample
]
# or
# minibatch_data = [
# [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ], # first sample
# [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ] # second sample
# ]
arg = feeder(minibatch_data)
Args:
input_types: A list of input data types. Its length is equal to the length
of data returned by paddle.reader. Each item specifies the type
of each feature.
minibatch_data: A list of mini-batch data. Each item is a list or tuple,
class DataFeeder(DataProviderConverter):
    """
    DataFeeder converts the data returned by paddle.reader into a data structure
    of Arguments which is defined in the API. The paddle.reader usually returns
    a list of mini-batch data. Each item in the list is a list or a tuple,
    which is one sample with one or multiple features. DataFeeder converts this
    mini-batch data into Arguments in order to feed it to C++ interface.

    The example usage:

        data_types = [('image', paddle.data_type.dense_vector(4)),
                      ('label', paddle.data_type.integer_value(10))]
        reader_dict = {'image': 0, 'label': 1}
        feeder = DataFeeder(data_types=data_types, reader_dict=reader_dict)
        minibatch_data = [
            ([1.0, 2.0, 3.0, 4.0], 5, [6, 7, 8]),  # first sample
            ([1.0, 2.0, 3.0, 4.0], 5, [6, 7, 8]),  # second sample
        ]
        # Position 2 of every sample is not named in reader_dict, so it is
        # never read.
        arg = feeder(minibatch_data)
    """

    def __init__(self, data_types, reader_dict):
        """
        :param data_types: A list to specify data name and type. Each item is
                           a tuple of (data_name, data_type). For example:
                           [('image', paddle.data_type.dense_vector(784)),
                            ('label', paddle.data_type.integer_value(10))]
        :type data_types: A list of tuple
        :param reader_dict: A dictionary to specify the position of each data
                            in the input data.
        :type reader_dict: dict()
        """
        self.input_names = []
        self.input_types = []
        self.reader_dict = reader_dict
        for each in data_types:
            # Check the type first so an invalid entry is never recorded.
            assert isinstance(each[1], data_type.InputType)
            self.input_names.append(each[0])
            self.input_types.append(each[1])
        DataProviderConverter.__init__(self, self.input_types)

    def convert(self, dat, argument=None):
        """
        :param dat: A list of mini-batch data. Each item is a list or tuple,
                    for example:
                    [
                        (feature_0, feature_1, feature_2, ...), # first sample
                        (feature_0, feature_1, feature_2, ...), # second sample
                        ...
                    ]
        :type dat: List
        :param argument: An Arguments object contains this mini-batch data with
                         one or multiple features. The Arguments definition is
                         in the API. A new one is created when it is None.
        :type argument: swig_paddle.Arguments
        :return: The Arguments object filled with this mini-batch data, one
                 slot per entry of data_types.
        :rtype: swig_paddle.Arguments
        :raises KeyError: if a data name has no position in reader_dict.
        """
        if argument is None:
            argument = swig_paddle.Arguments.createArguments(0)
        assert isinstance(argument, swig_paddle.Arguments)
        argument.resize(len(self.input_types))

        scanners = [
            DataProviderConverter.create_scanner(i, each_type)
            for i, each_type in enumerate(self.input_types)
        ]
        # The name -> position mapping is the same for every sample, so it is
        # resolved once here instead of once per sample per feature.
        positions = [self.reader_dict[name] for name in self.input_names]

        for each_sample in dat:
            for position, scanner in zip(positions, scanners):
                scanner.scan(each_sample[position])

        for scanner in scanners:
            scanner.finish_scan(argument)

        return argument

    def __call__(self, dat, argument=None):
        """Shorthand for convert(dat, argument)."""
        return self.convert(dat, argument)
python/paddle/v2/tests/CMakeLists.txt
0 → 100644
浏览文件 @
84b423a8
# Register the v2 API tests: run_tests.sh is launched with bash and receives
# the Python interpreter as its first argument.
add_test(NAME test_v2_api
         COMMAND bash ${PROJ_ROOT}/python/paddle/v2/tests/run_tests.sh
                 ${PYTHON_EXECUTABLE})
python/paddle/v2/tests/run_tests.sh
0 → 100755
浏览文件 @
84b423a8
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Usage: run_tests.sh <python-executable>
# $1 is the interpreter used both for pip and for running the tests.
if [ -z "$1" ]; then
  echo "usage: $0 <python-executable>" >&2
  exit 1
fi

# Work from the directory that holds this script so the relative paths below
# do not depend on the caller's working directory. Everything is quoted so
# paths containing spaces keep working.
SCRIPTPATH="$(cd "$(dirname "$0")" && pwd)"
cd "$SCRIPTPATH" || exit 1

# Install the built wheel(s). The glob is deliberately left unquoted so the
# shell expands it.
"$1" -m pip install ../../../../paddle/dist/*.whl

test_list="test_data_feeder.py"

export PYTHONPATH="$PWD/../../../../python/"

# $test_list is deliberately unquoted: it is a whitespace-separated list.
for fn in $test_list
do
  echo "test $fn"
  # Stop at the first failing test file and report failure to ctest.
  if ! "$1" "$fn"; then
    exit 1
  fi
done
python/paddle/v2/tests/test_data_feeder.py
0 → 100644
浏览文件 @
84b423a8
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
py_paddle.swig_paddle
as
api
import
numpy
as
np
from
paddle.v2
import
data_type
from
paddle.v2.data_feeder
import
DataFeeder
class DataFeederTest(unittest.TestCase):
    """Checks that DataFeeder copies mini-batch data into Arguments slots."""

    def dense_reader(self, size):
        """Return `size` random floats as produced by np.random.random."""
        return np.random.random(size)

    def sparse_binary_reader(self, high, size_limit, non_empty=False):
        """Return a random-length list of random ints in [0, high).

        The length is drawn from [0, size_limit); with non_empty=True it is
        redrawn until it is not zero.
        """
        # NOTE(review): randint samples with replacement, so the returned list
        # may contain duplicate ids -- confirm the sparse matrix keeps them,
        # otherwise the row comparisons below can be flaky.
        num = np.random.randint(size_limit)  # num could be 0
        while non_empty and num == 0:
            num = np.random.randint(size_limit)
        return np.random.randint(high, size=num).tolist()

    def test_dense_vector(self):
        """Dense data must come back unchanged (up to float32 rounding)."""
        batch_size = 32
        dim = 784

        def compare(batch):
            feeder = DataFeeder([('image', data_type.dense_vector(dim))],
                                {'image': 0})
            arg = feeder([batch])
            output = arg.getSlotValue(0).copyToNumpyMat()
            expected = np.array(batch, dtype='float32')
            # Compare the values themselves. The previous check compared
            # expected.all() with output.all(), i.e. two booleans, which
            # passes for almost any pair of arrays. Both sides are flattened
            # so the check does not depend on the shape of the returned matrix.
            self.assertTrue(np.allclose(expected.ravel(), output.ravel()))

        # test numpy array
        compare([self.dense_reader(dim) for _ in range(batch_size)])

        # test list
        compare([self.dense_reader(dim).tolist() for _ in range(batch_size)])

    def test_sparse_binary(self):
        """Each sparse-binary row must report the column ids that were fed."""
        dim = 10000
        batch_size = 32
        data = [[self.sparse_binary_reader(dim, 50)]
                for _ in range(batch_size)]
        feeder = DataFeeder([('input', data_type.sparse_binary_vector(dim))],
                            {'input': 0})
        arg = feeder(data)
        output = arg.getSlotValue(0)
        self.assertIsInstance(output, api.Matrix)
        for i in range(batch_size):
            self.assertEqual(output.getSparseRowCols(i), data[i][0])

    def test_sparse(self):
        """Each sparse row must report the column ids that were fed.

        Only the column ids are checked; the stored values are not verified.
        """
        dim = 10000
        batch_size = 32
        cols = []
        data = []
        for _ in range(batch_size):
            a = self.sparse_binary_reader(dim, 40, non_empty=True)
            b = self.dense_reader(len(a)).tolist()
            cols.append(a)
            # One feature per sample: a list of (column id, value) pairs.
            data.append([list(zip(a, b))])

        feeder = DataFeeder([('input', data_type.sparse_vector(dim))],
                            {'input': 0})
        arg = feeder(data)
        output = arg.getSlotValue(0)
        self.assertIsInstance(output, api.Matrix)
        for i in range(batch_size):
            self.assertEqual(output.getSparseRowCols(i), cols[i])

    def test_integer(self):
        """Integer ids must come back in the order they were fed."""
        dim = 100
        batch_size = 32
        index = [[np.random.randint(dim)] for _ in range(batch_size)]
        feeder = DataFeeder([('input', data_type.integer_value(dim))],
                            {'input': 0})
        arg = feeder(index)
        output = arg.getSlotIds(0).copyToNumpyArray()
        expected = np.array(index, dtype='int').flatten()
        # Element-wise comparison; comparing .all() of both sides would only
        # compare two booleans.
        self.assertTrue(np.array_equal(output, expected))

    def test_multiple_slots(self):
        """reader_dict must route each sample position to the right slot."""
        batch_size = 2
        data = []
        for _ in range(batch_size):
            each_sample = []
            each_sample.append(np.random.randint(10))  # size of feature 2: 10
            each_sample.append(
                self.sparse_binary_reader(
                    20000, 40, non_empty=True))  # size of feature 1: 20000
            each_sample.append(self.dense_reader(100))  # size of feature 0: 100
            data.append(each_sample)

        # test multiple features
        data_types = [('fea0', data_type.dense_vector(100)),
                      ('fea1', data_type.sparse_binary_vector(20000)),
                      ('fea2', data_type.integer_value(10))]
        feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0})
        arg = feeder(data)
        output_dense = arg.getSlotValue(0).copyToNumpyMat()
        output_sparse = arg.getSlotValue(1)
        output_index = arg.getSlotIds(2).copyToNumpyArray()
        for i in range(batch_size):
            # allclose absorbs the float64 -> float32 rounding of the input.
            self.assertTrue(np.allclose(output_dense[i], data[i][2]))
            self.assertEqual(output_sparse.getSparseRowCols(i), data[i][1])
            self.assertEqual(output_index[i], data[i][0])

        # reader returns 3 features, but only use 2 features
        data_types = [('fea0', data_type.dense_vector(100)),
                      ('fea2', data_type.integer_value(10))]
        feeder = DataFeeder(data_types, {'fea0': 2, 'fea2': 0})
        arg = feeder(data)
        output_dense = arg.getSlotValue(0).copyToNumpyMat()
        output_index = arg.getSlotIds(1).copyToNumpyArray()
        for i in range(batch_size):
            self.assertTrue(np.allclose(output_dense[i], data[i][2]))
            self.assertEqual(output_index[i], data[i][0])
if __name__ == '__main__':
    # Initialise Paddle with the GPU flag switched off before running the
    # tests. A single guard is enough: unittest.main() exits the process, so
    # a second identical guard after it could never run.
    api.initPaddle("--use_gpu=0")
    unittest.main()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录