PaddlePaddle / book · Commit 13660f17
Authored Mar 09, 2017 by Yu Yang

Removed deprecated

Parent: ba0149b3
Showing 13 changed files with 0 additions and 1024 deletions (+0 −1024)
recommender_system/deprecated/common_utils.py            +0 −30
recommender_system/deprecated/data/config.json            +0 −16
recommender_system/deprecated/data/config_generator.py    +0 −127
recommender_system/deprecated/data/getdata.sh             +0 −23
recommender_system/deprecated/data/meta_generator.py      +0 −430
recommender_system/deprecated/data/requirements.txt       +0 −2
recommender_system/deprecated/data/split.py               +0 −66
recommender_system/deprecated/dataprovider.py             +0 −87
recommender_system/deprecated/evaluate.py                 +0 −37
recommender_system/deprecated/prediction.py               +0 −50
recommender_system/deprecated/preprocess.sh               +0 −40
recommender_system/deprecated/train.sh                    +0 −24
recommender_system/deprecated/trainer_config.py           +0 −92

recommender_system/deprecated/common_utils.py (deleted; mode 100755 → 0)

# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import *


def meta_to_header(meta, name):
    metas = meta[name]['__meta__']['raw_meta']
    for each_meta in metas:
        slot_name = each_meta.get('name', '%s_id' % name)
        if each_meta['type'] == 'id':
            yield slot_name, integer_value(each_meta['max'])
        elif each_meta['type'] == 'embedding':
            is_seq = each_meta['seq'] == 'sequence'
            yield slot_name, integer_value(
                len(each_meta['dict']),
                seq_type=SequenceType.SEQUENCE
                if is_seq else SequenceType.NO_SEQUENCE)
        elif each_meta['type'] == 'one_hot_dense':
            yield slot_name, dense_vector(len(each_meta['dict']))
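
For illustration, here is a made-up example (field sizes are hypothetical) of the meta dict shape that meta_to_header() above expects: meta[name]['__meta__']['raw_meta'] is a list of per-field descriptors, keyed the same way config_generator.py and meta_generator.py below build them.

meta = {
    'movie': {
        '__meta__': {
            'raw_meta': [
                # 'id' field: no 'name' key, so the slot is named 'movie_id'
                {'type': 'id', 'max': 4000},
                # 'embedding' field: dict size becomes the vocabulary size
                {'type': 'embedding', 'name': 'title',
                 'seq': 'sequence', 'dict': ['a', 'b', 'c']},
                # 'one_hot_dense' field: dict size becomes the vector dim
                {'type': 'one_hot_dense', 'name': 'genres',
                 'dict': ['Comedy', 'Drama']},
            ]
        }
    }
}
# meta_to_header(meta, 'movie') would then yield, in order:
#   ('movie_id', integer_value(4000))
#   ('title',    integer_value(3, seq_type=SequenceType.SEQUENCE))
#   ('genres',   dense_vector(2))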

recommender_system/deprecated/data/config.json (deleted; mode 100644 → 0)

{
  "user": {
    "file": {
      "name": "users.dat",
      "delimiter": "::"
    },
    "fields": ["id", "gender", "age", "occupation"]
  },
  "movie": {
    "file": {
      "name": "movies.dat",
      "delimiter": "::"
    },
    "fields": ["id", "title", "genres"]
  }
}

recommender_system/deprecated/data/config_generator.py (deleted; mode 100644 → 0)

#!/bin/env python2
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
config_generator.py
Usage:
./config_generator.py <config_file> [--output_format=<output_format>]
./config_generator.py -h | --help
Options:
-h --help Show this screen.
--output_format=<output_format> Output Config format(json or yaml) [default: json].
"""
import
json
import
docopt
import
copy
DEFAULT_FILE
=
{
"type"
:
"split"
,
"delimiter"
:
","
}
DEFAULT_FIELD
=
{
"id"
:
{
"type"
:
"id"
},
"gender"
:
{
"name"
:
"gender"
,
"type"
:
"embedding"
,
"dict"
:
{
"type"
:
"char_based"
}
},
"age"
:
{
"name"
:
"age"
,
"type"
:
"embedding"
,
"dict"
:
{
"type"
:
"whole_content"
,
"sort"
:
True
}
},
"occupation"
:
{
"name"
:
"occupation"
,
"type"
:
"embedding"
,
"dict"
:
{
"type"
:
"whole_content"
,
"sort"
:
"true"
}
},
"title"
:
{
"regex"
:
{
"pattern"
:
r
"^(.*)\((\d+)\)$"
,
"group_id"
:
1
,
"strip"
:
True
},
"name"
:
"title"
,
"type"
:
{
"name"
:
"embedding"
,
"seq_type"
:
"sequence"
,
},
"dict"
:
{
"type"
:
"char_based"
}
},
"genres"
:
{
"type"
:
"one_hot_dense"
,
"dict"
:
{
"type"
:
"split"
,
"delimiter"
:
"|"
},
"name"
:
"genres"
}
}
def
merge_dict
(
master_dict
,
slave_dict
):
return
dict
(((
k
,
master_dict
.
get
(
k
)
or
slave_dict
.
get
(
k
))
for
k
in
set
(
slave_dict
)
|
set
(
master_dict
)))
def
main
(
filename
,
fmt
):
with
open
(
filename
,
'r'
)
as
f
:
conf
=
json
.
load
(
f
)
obj
=
dict
()
for
k
in
conf
:
val
=
conf
[
k
]
file_dict
=
val
[
'file'
]
file_dict
=
merge_dict
(
file_dict
,
DEFAULT_FILE
)
fields
=
[]
for
pos
,
field_key
in
enumerate
(
val
[
'fields'
]):
assert
isinstance
(
field_key
,
basestring
)
field
=
copy
.
deepcopy
(
DEFAULT_FIELD
[
field_key
])
field
[
'pos'
]
=
pos
fields
.
append
(
field
)
obj
[
k
]
=
{
"file"
:
file_dict
,
"fields"
:
fields
}
meta
=
{
"meta"
:
obj
}
# print meta
if
fmt
==
'json'
:
def
formatter
(
x
):
import
json
return
json
.
dumps
(
x
,
indent
=
2
)
elif
fmt
==
'yaml'
:
def
formatter
(
x
):
import
yaml
return
yaml
.
safe_dump
(
x
,
default_flow_style
=
False
)
else
:
raise
NotImplementedError
(
"Dump format %s is not implemented"
%
fmt
)
print
formatter
(
meta
)
if
__name__
==
'__main__'
:
args
=
docopt
.
docopt
(
__doc__
,
version
=
"0.1.0"
)
main
(
args
[
"<config_file>"
],
args
[
"--output_format"
])
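
As a quick worked example (not part of the commit) of the merge_dict semantics above: keys present in the master dict win, and anything missing or falsy falls back to the slave dict, which is how a per-file config inherits DEFAULT_FILE.

def merge_dict(master_dict, slave_dict):
    # identical to the helper above
    return dict(((k, master_dict.get(k) or slave_dict.get(k))
                 for k in set(slave_dict) | set(master_dict)))

print(merge_dict({"name": "users.dat", "delimiter": "::"},
                 {"type": "split", "delimiter": ","}))
# -> {'name': 'users.dat', 'delimiter': '::', 'type': 'split'}
#    (key order may vary)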

recommender_system/deprecated/data/getdata.sh (deleted; mode 100755 → 0)

#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -ex
cd "$(dirname "$0")"

# download the dataset
wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
# unzip the dataset
unzip ml-1m.zip
# remove the unused zip file
rm ml-1m.zip

recommender_system/deprecated/data/meta_generator.py (deleted; mode 100644 → 0)

#!/bin/env python2
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess Movielens dataset, to get movie/user object.
Usage:
./preprocess.py <dataset_dir> <binary_filename> [--config=<config_file>]
./preprocess.py -h | --help
Options:
-h --help Show this screen.
--version Show version.
--config=<config_file> Get MetaData config file [default: config.json].
"""
import
docopt
import
os
import
sys
import
re
import
collections
try
:
import
cPickle
as
pickle
except
ImportError
:
import
pickle
class
UniqueIDGenerator
(
object
):
def
__init__
(
self
):
self
.
pool
=
collections
.
defaultdict
(
self
.
__next_id__
)
self
.
next_id
=
0
def
__next_id__
(
self
):
tmp
=
self
.
next_id
self
.
next_id
+=
1
return
tmp
def
__call__
(
self
,
k
):
return
self
.
pool
[
k
]
def
to_list
(
self
):
ret_val
=
[
None
]
*
len
(
self
.
pool
)
for
k
in
self
.
pool
.
keys
():
ret_val
[
self
.
pool
[
k
]]
=
k
return
ret_val
class
SortedIDGenerator
(
object
):
def
__init__
(
self
):
self
.
__key_set__
=
set
()
self
.
dict
=
None
def
scan
(
self
,
key
):
self
.
__key_set__
.
add
(
key
)
def
finish_scan
(
self
,
compare
=
None
,
key
=
None
,
reverse
=
False
):
self
.
__key_set__
=
sorted
(
list
(
self
.
__key_set__
),
cmp
=
compare
,
key
=
key
,
reverse
=
reverse
)
self
.
dict
=
dict
()
for
idx
,
each_key
in
enumerate
(
self
.
__key_set__
):
self
.
dict
[
each_key
]
=
idx
def
__call__
(
self
,
key
):
return
self
.
dict
[
key
]
def
to_list
(
self
):
return
self
.
__key_set__
class
SplitFileReader
(
object
):
def
__init__
(
self
,
work_dir
,
config
):
assert
isinstance
(
config
,
dict
)
self
.
filename
=
config
[
'name'
]
self
.
delimiter
=
config
.
get
(
'delimiter'
,
','
)
self
.
work_dir
=
work_dir
def
read
(
self
):
with
open
(
os
.
path
.
join
(
self
.
work_dir
,
self
.
filename
),
'r'
)
as
f
:
for
line
in
f
:
line
=
line
.
strip
()
if
isinstance
(
self
.
delimiter
,
unicode
):
self
.
delimiter
=
str
(
self
.
delimiter
)
yield
line
.
split
(
self
.
delimiter
)
@
staticmethod
def
create
(
work_dir
,
config
):
assert
isinstance
(
config
,
dict
)
if
config
[
'type'
]
==
'split'
:
return
SplitFileReader
(
work_dir
,
config
)
class
IFileReader
(
object
):
READERS
=
[
SplitFileReader
]
def
read
(
self
):
raise
NotImplementedError
()
@
staticmethod
def
create
(
work_dir
,
config
):
for
reader_cls
in
IFileReader
.
READERS
:
val
=
reader_cls
.
create
(
work_dir
,
config
)
if
val
is
not
None
:
return
val
class
IDFieldParser
(
object
):
TYPE
=
'id'
def
__init__
(
self
,
config
):
self
.
__max_id__
=
-
sys
.
maxint
-
1
self
.
__min_id__
=
sys
.
maxint
self
.
__id_count__
=
0
def
scan
(
self
,
line
):
idx
=
int
(
line
)
self
.
__max_id__
=
max
(
self
.
__max_id__
,
idx
)
self
.
__min_id__
=
min
(
self
.
__min_id__
,
idx
)
self
.
__id_count__
+=
1
def
parse
(
self
,
line
):
return
int
(
line
)
def
meta_field
(
self
):
return
{
"is_key"
:
True
,
'max'
:
self
.
__max_id__
,
'min'
:
self
.
__min_id__
,
'count'
:
self
.
__id_count__
,
'type'
:
'id'
}
class
SplitEmbeddingDict
(
object
):
def
__init__
(
self
,
delimiter
):
self
.
__id__
=
UniqueIDGenerator
()
self
.
delimiter
=
delimiter
def
scan
(
self
,
multi
):
for
val
in
multi
.
split
(
self
.
delimiter
):
self
.
__id__
(
val
)
def
parse
(
self
,
multi
):
return
map
(
self
.
__id__
,
multi
.
split
(
self
.
delimiter
))
def
meta_field
(
self
):
return
self
.
__id__
.
to_list
()
class
EmbeddingFieldParser
(
object
):
TYPE
=
'embedding'
NO_SEQUENCE
=
"no_sequence"
SEQUENCE
=
"sequence"
class
CharBasedEmbeddingDict
(
object
):
def
__init__
(
self
,
is_seq
=
True
):
self
.
__id__
=
UniqueIDGenerator
()
self
.
is_seq
=
is_seq
def
scan
(
self
,
s
):
for
ch
in
s
:
self
.
__id__
(
ch
)
def
parse
(
self
,
s
):
return
map
(
self
.
__id__
,
s
)
if
self
.
is_seq
else
self
.
__id__
(
s
[
0
])
def
meta_field
(
self
):
return
self
.
__id__
.
to_list
()
class
WholeContentDict
(
object
):
def
__init__
(
self
,
need_sort
=
True
):
assert
need_sort
self
.
__id__
=
SortedIDGenerator
()
self
.
__has_finished__
=
False
def
scan
(
self
,
txt
):
self
.
__id__
.
scan
(
txt
)
def
meta_field
(
self
):
if
not
self
.
__has_finished__
:
self
.
__id__
.
finish_scan
()
self
.
__has_finished__
=
True
return
self
.
__id__
.
to_list
()
def
parse
(
self
,
txt
):
return
self
.
__id__
(
txt
)
def
__init__
(
self
,
config
):
try
:
self
.
seq_type
=
config
[
'type'
][
'seq_type'
]
except
TypeError
:
self
.
seq_type
=
EmbeddingFieldParser
.
NO_SEQUENCE
if
config
[
'dict'
][
'type'
]
==
'char_based'
:
self
.
dict
=
EmbeddingFieldParser
.
CharBasedEmbeddingDict
(
self
.
seq_type
==
EmbeddingFieldParser
.
SEQUENCE
)
elif
config
[
'dict'
][
'type'
]
==
'split'
:
self
.
dict
=
SplitEmbeddingDict
(
config
[
'dict'
].
get
(
'delimiter'
,
','
))
elif
config
[
'dict'
][
'type'
]
==
'whole_content'
:
self
.
dict
=
EmbeddingFieldParser
.
WholeContentDict
(
config
[
'dict'
][
'sort'
])
else
:
print
config
assert
False
self
.
name
=
config
[
'name'
]
def
scan
(
self
,
s
):
self
.
dict
.
scan
(
s
)
def
meta_field
(
self
):
return
{
'name'
:
self
.
name
,
'dict'
:
self
.
dict
.
meta_field
(),
'type'
:
'embedding'
,
'seq'
:
self
.
seq_type
}
def
parse
(
self
,
s
):
return
self
.
dict
.
parse
(
s
)
class
OneHotDenseFieldParser
(
object
):
TYPE
=
'one_hot_dense'
def
__init__
(
self
,
config
):
if
config
[
'dict'
][
'type'
]
==
'split'
:
self
.
dict
=
SplitEmbeddingDict
(
config
[
'dict'
][
'delimiter'
])
self
.
name
=
config
[
'name'
]
def
scan
(
self
,
s
):
self
.
dict
.
scan
(
s
)
def
meta_field
(
self
):
# print self.dict.meta_field()
return
{
'dict'
:
self
.
dict
.
meta_field
(),
'name'
:
self
.
name
,
'type'
:
'one_hot_dense'
}
def
parse
(
self
,
s
):
ids
=
self
.
dict
.
parse
(
s
)
retv
=
[
0.0
]
*
len
(
self
.
dict
.
meta_field
())
for
idx
in
ids
:
retv
[
idx
]
=
1.0
# print retv
return
retv
class
FieldParserFactory
(
object
):
PARSERS
=
[
IDFieldParser
,
EmbeddingFieldParser
,
OneHotDenseFieldParser
]
@
staticmethod
def
create
(
config
):
if
isinstance
(
config
[
'type'
],
basestring
):
config_type
=
config
[
'type'
]
elif
isinstance
(
config
[
'type'
],
dict
):
config_type
=
config
[
'type'
][
'name'
]
assert
config_type
is
not
None
for
each_parser_cls
in
FieldParserFactory
.
PARSERS
:
if
config_type
==
each_parser_cls
.
TYPE
:
return
each_parser_cls
(
config
)
print
config
class
CompositeFieldParser
(
object
):
def
__init__
(
self
,
parser
,
extractor
):
self
.
extractor
=
extractor
self
.
parser
=
parser
def
scan
(
self
,
*
args
,
**
kwargs
):
self
.
parser
.
scan
(
self
.
extractor
.
extract
(
*
args
,
**
kwargs
))
def
parse
(
self
,
*
args
,
**
kwargs
):
return
self
.
parser
.
parse
(
self
.
extractor
.
extract
(
*
args
,
**
kwargs
))
def
meta_field
(
self
):
return
self
.
parser
.
meta_field
()
class
PositionContentExtractor
(
object
):
def
__init__
(
self
,
pos
):
self
.
pos
=
pos
def
extract
(
self
,
line
):
assert
isinstance
(
line
,
list
)
return
line
[
self
.
pos
]
class
RegexPositionContentExtractor
(
PositionContentExtractor
):
def
__init__
(
self
,
pos
,
pattern
,
group_id
,
strip
=
True
):
PositionContentExtractor
.
__init__
(
self
,
pos
)
pattern
=
pattern
.
strip
()
self
.
pattern
=
re
.
compile
(
pattern
)
self
.
group_id
=
group_id
self
.
strip
=
strip
def
extract
(
self
,
line
):
line
=
PositionContentExtractor
.
extract
(
self
,
line
)
match
=
self
.
pattern
.
match
(
line
)
# print line, self.pattern.pattern, match
assert
match
is
not
None
txt
=
match
.
group
(
self
.
group_id
)
if
self
.
strip
:
txt
.
strip
()
return
txt
class
ContentExtractorFactory
(
object
):
def
extract
(
self
,
line
):
pass
@
staticmethod
def
create
(
config
):
if
'pos'
in
config
:
if
'regex'
not
in
config
:
return
PositionContentExtractor
(
config
[
'pos'
])
else
:
extra_args
=
config
[
'regex'
]
return
RegexPositionContentExtractor
(
pos
=
config
[
'pos'
],
**
extra_args
)
class
MetaFile
(
object
):
def
__init__
(
self
,
work_dir
):
self
.
work_dir
=
work_dir
self
.
obj
=
dict
()
def
parse
(
self
,
config
):
config
=
config
[
'meta'
]
ret_obj
=
dict
()
for
key
in
config
.
keys
():
val
=
config
[
key
]
assert
'file'
in
val
reader
=
IFileReader
.
create
(
self
.
work_dir
,
val
[
'file'
])
assert
reader
is
not
None
assert
'fields'
in
val
and
isinstance
(
val
[
'fields'
],
list
)
fields_config
=
val
[
'fields'
]
field_parsers
=
map
(
MetaFile
.
__field_config_mapper__
,
fields_config
)
for
each_parser
in
field_parsers
:
assert
each_parser
is
not
None
for
each_block
in
reader
.
read
():
for
each_parser
in
field_parsers
:
each_parser
.
scan
(
each_block
)
metas
=
map
(
lambda
x
:
x
.
meta_field
(),
field_parsers
)
# print metas
key_index
=
filter
(
lambda
x
:
x
is
not
None
,
map
(
lambda
(
idx
,
meta
):
idx
if
'is_key'
in
meta
and
meta
[
'is_key'
]
else
None
,
enumerate
(
metas
)))[
0
]
key_map
=
[]
for
i
in
range
(
min
(
key_index
,
len
(
metas
))):
key_map
.
append
(
i
)
for
i
in
range
(
key_index
+
1
,
len
(
metas
)):
key_map
.
append
(
i
)
obj
=
{
'__meta__'
:
{
'raw_meta'
:
metas
,
'feature_map'
:
key_map
}}
for
each_block
in
reader
.
read
():
idx
=
field_parsers
[
key_index
].
parse
(
each_block
)
val
=
[]
for
i
,
each_parser
in
enumerate
(
field_parsers
):
if
i
!=
key_index
:
val
.
append
(
each_parser
.
parse
(
each_block
))
obj
[
idx
]
=
val
ret_obj
[
key
]
=
obj
self
.
obj
=
ret_obj
return
ret_obj
@
staticmethod
def
__field_config_mapper__
(
conf
):
assert
isinstance
(
conf
,
dict
)
extrator
=
ContentExtractorFactory
.
create
(
conf
)
field_parser
=
FieldParserFactory
.
create
(
conf
)
assert
extrator
is
not
None
assert
field_parser
is
not
None
return
CompositeFieldParser
(
field_parser
,
extrator
)
def
dump
(
self
,
fp
):
pickle
.
dump
(
self
.
obj
,
fp
,
pickle
.
HIGHEST_PROTOCOL
)
def
preprocess
(
binary_filename
,
dataset_dir
,
config
,
**
kwargs
):
assert
isinstance
(
config
,
str
)
with
open
(
config
,
'r'
)
as
config_file
:
file_loader
=
None
if
config
.
lower
().
endswith
(
'.yaml'
):
import
yaml
file_loader
=
yaml
elif
config
.
lower
().
endswith
(
'.json'
):
import
json
file_loader
=
json
config
=
file_loader
.
load
(
config_file
)
meta
=
MetaFile
(
dataset_dir
)
meta
.
parse
(
config
)
with
open
(
binary_filename
,
'wb'
)
as
outf
:
meta
.
dump
(
outf
)
if
__name__
==
'__main__'
:
args
=
docopt
.
docopt
(
__doc__
,
version
=
'0.1.0'
)
kwargs
=
dict
()
for
key
in
args
.
keys
():
if
key
!=
'--help'
:
param_name
=
key
assert
isinstance
(
param_name
,
str
)
param_name
=
param_name
.
replace
(
'<'
,
''
)
param_name
=
param_name
.
replace
(
'>'
,
''
)
param_name
=
param_name
.
replace
(
'--'
,
''
)
kwargs
[
param_name
]
=
args
[
key
]
preprocess
(
**
kwargs
)
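
To make the two dictionary builders above concrete, a short sketch (Python 2, like the rest of these scripts; the sample keys are invented, and it assumes meta_generator.py plus its docopt dependency are importable): UniqueIDGenerator hands out ids in first-seen order, while SortedIDGenerator only assigns ids after finish_scan() sorts everything it has seen.

from meta_generator import UniqueIDGenerator, SortedIDGenerator

gen = UniqueIDGenerator()
print(gen('M'), gen('F'), gen('M'))  # (0, 1, 0): ids in first-seen order
print(gen.to_list())                 # ['M', 'F']: index -> original key

sorted_gen = SortedIDGenerator()
for age in ['25', '1', '56']:
    sorted_gen.scan(age)
sorted_gen.finish_scan()             # lexicographic sort: ['1', '25', '56']
print(sorted_gen('25'))              # 1
print(sorted_gen.to_list())          # ['1', '25', '56']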

recommender_system/deprecated/data/requirements.txt (deleted; mode 100644 → 0)

PyYAML
docopt

recommender_system/deprecated/data/split.py (deleted; mode 100644 → 0)

#!/bin/env python2
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Separate movielens 1m dataset to train/test file.
Usage:
./separate.py <input_file> [--test_ratio=<test_ratio>] [--delimiter=<delimiter>]
./separate.py -h | --help
Options:
-h --help Show this screen.
--version Show version.
--test_ratio=<test_ratio> Test ratio for separate [default: 0.1].
--delimiter=<delimiter> File delimiter [default: ,].
"""
import
docopt
import
collections
import
random
def
process
(
test_ratio
,
input_file
,
delimiter
,
**
kwargs
):
test_ratio
=
float
(
test_ratio
)
rating_dict
=
collections
.
defaultdict
(
list
)
with
open
(
input_file
,
'r'
)
as
f
:
for
line
in
f
:
user_id
=
int
(
line
.
split
(
delimiter
)[
0
])
rating_dict
[
user_id
].
append
(
line
.
strip
())
with
open
(
input_file
+
".train"
,
'w'
)
as
train_file
:
with
open
(
input_file
+
".test"
,
'w'
)
as
test_file
:
for
k
in
rating_dict
.
keys
():
lines
=
rating_dict
[
k
]
assert
isinstance
(
lines
,
list
)
random
.
shuffle
(
lines
)
test_len
=
int
(
len
(
lines
)
*
test_ratio
)
for
line
in
lines
[:
test_len
]:
print
>>
test_file
,
line
for
line
in
lines
[
test_len
:]:
print
>>
train_file
,
line
if
__name__
==
'__main__'
:
args
=
docopt
.
docopt
(
__doc__
,
version
=
'0.1.0'
)
kwargs
=
dict
()
for
key
in
args
.
keys
():
if
key
!=
'--help'
:
param_name
=
key
assert
isinstance
(
param_name
,
str
)
param_name
=
param_name
.
replace
(
'<'
,
''
)
param_name
=
param_name
.
replace
(
'>'
,
''
)
param_name
=
param_name
.
replace
(
'--'
,
''
)
kwargs
[
param_name
]
=
args
[
key
]
process
(
**
kwargs
)
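
Note that process() splits per user rather than globally: each user's ratings are shuffled and the first test_ratio fraction of them goes to the test file. A tiny invented example of the same logic:

import collections, random

random.seed(0)                              # invented seed, for a repeatable demo
rating_dict = collections.defaultdict(list)
for line in ["1::10::5", "1::11::3", "1::12::4", "2::10::2"]:
    rating_dict[int(line.split("::")[0])].append(line)

for user, lines in rating_dict.items():
    random.shuffle(lines)
    test_len = int(len(lines) * 0.5)        # test_ratio=0.5 for the demo
    print(user, lines[:test_len], lines[test_len:])
# user 1: 1 test line, 2 train lines; user 2: 0 test lines, 1 train line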

recommender_system/deprecated/dataprovider.py (deleted; mode 100755 → 0)

# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import *
from common_utils import meta_to_header


def __list_to_map__(lst):
    ret_val = dict()
    for each in lst:
        k, v = each
        ret_val[k] = v
    return ret_val


def hook(settings, meta, **kwargs):
    """
    Init hook is invoked before process data. It will set obj.slots and store
    data meta.

    :param obj: global object. It will passed to process routine.
    :type obj: object
    :param meta: the meta file object, which passed from trainer_config. Meta
                 file record movie/user features.
    :param kwargs: unused other arguments.
    """
    # Header define slots that used for paddle.
    #  first part is movie features.
    #  second part is user features.
    #  final part is rating score.
    # header is a list of [USE_SEQ_OR_NOT?, SlotType]
    movie_headers = list(meta_to_header(meta, 'movie'))
    settings.movie_names = [h[0] for h in movie_headers]
    headers = movie_headers
    user_headers = list(meta_to_header(meta, 'user'))
    settings.user_names = [h[0] for h in user_headers]
    headers.extend(user_headers)
    headers.append(("rating", dense_vector(1)))  # Score

    # slot types.
    settings.input_types = __list_to_map__(headers)
    settings.meta = meta


@provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, filename):
    with open(filename, 'r') as f:
        for line in f:
            # Get a rating from file.
            user_id, movie_id, score = map(int, line.split('::')[:-1])

            # Scale score to [-2, +2]
            score = float(score - 3)

            # Get movie/user features by movie_id, user_id
            movie_meta = settings.meta['movie'][movie_id]
            user_meta = settings.meta['user'][user_id]

            outputs = [('movie_id', movie_id - 1)]

            # Then add movie features
            for i, each_meta in enumerate(movie_meta):
                outputs.append((settings.movie_names[i + 1], each_meta))

            # Then add user id.
            outputs.append(('user_id', user_id - 1))

            # Then add user features.
            for i, each_meta in enumerate(user_meta):
                outputs.append((settings.user_names[i + 1], each_meta))

            # Finally, add score
            outputs.append(('rating', [score]))

            # Return data to paddle
            yield __list_to_map__(outputs)
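
One detail of process() worth making explicit (my gloss, not in the commit): the 1-5 star rating is shifted to [-2, +2], which presumably matches the output range of cos_sim(..., scale=2) in trainer_config.py, and prediction.py undoes the shift by adding 3 back.

for raw_score in [1, 2, 3, 4, 5]:
    scaled = float(raw_score - 3)        # what process() feeds the 'rating' slot
    restored = scaled + 3                # what prediction.py prints
    print(raw_score, scaled, restored)   # e.g. (5, 2.0, 5.0)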

recommender_system/deprecated/evaluate.py (deleted; mode 100755 → 0)

#!/usr/bin/python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import re
import math


def get_best_pass(log_filename):
    with open(log_filename, 'r') as f:
        text = f.read()
        pattern = re.compile('Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)',
                             re.S)
        results = re.findall(pattern, text)
        sorted_results = sorted(results, key=lambda result: float(result[0]))
        return sorted_results[0]


log_filename = sys.argv[1]
log = get_best_pass(log_filename)
predict_error = math.sqrt(float(log[0])) / 2
print 'Best pass is %s, error is %s, which means predict get error as %f' % (
    log[1], log[0], predict_error)

evaluate_pass = "output/pass-%s" % log[1]
print "evaluating from pass %s" % evaluate_pass
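
For reference, a hypothetical log line showing what the regex above extracts; the real PaddlePaddle trainer log format may differ slightly, so treat this line as invented:

import re

text = "Test samples=1000 cost=0.8731 Eval:  pass-00012"  # invented log line
pattern = re.compile('Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)', re.S)
print(re.findall(pattern, text))  # [('0.8731', '00012')]
# get_best_pass() sorts these (cost, pass) pairs by cost and keeps the lowest.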

recommender_system/deprecated/prediction.py (deleted; mode 100755 → 0)

#!/bin/env python2
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from py_paddle import swig_paddle, DataProviderConverter

from common_utils import *
from paddle.trainer.config_parser import parse_config

try:
    import cPickle as pickle
except ImportError:
    import pickle
import sys

if __name__ == '__main__':
    model_path = sys.argv[1]
    swig_paddle.initPaddle('--use_gpu=0')
    conf = parse_config("trainer_config.py", "is_predict=1")
    network = swig_paddle.GradientMachine.createFromConfigProto(
        conf.model_config)
    assert isinstance(network, swig_paddle.GradientMachine)
    network.loadParameters(model_path)
    with open('./data/meta.bin', 'rb') as f:
        meta = pickle.load(f)
        headers = [h[1] for h in meta_to_header(meta, 'movie')]
        headers.extend([h[1] for h in meta_to_header(meta, 'user')])
        cvt = DataProviderConverter(headers)
        while True:
            movie_id = int(raw_input("Input movie_id: "))
            user_id = int(raw_input("Input user_id: "))
            movie_meta = meta['movie'][movie_id]  # Query Data From Meta.
            user_meta = meta['user'][user_id]
            data = [movie_id - 1]
            data.extend(movie_meta)
            data.append(user_id - 1)
            data.extend(user_meta)
            print "Prediction Score is %.2f" % (
                network.forwardTest(cvt.convert([data]))[0]['value'][0][0] + 3)

recommender_system/deprecated/preprocess.sh (deleted; mode 100755 → 0)

#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e

UNAME_STR=`uname`

if [[ ${UNAME_STR} == 'Linux' ]]; then
  SHUF_PROG='shuf'
else
  SHUF_PROG='gshuf'
fi

cd "$(dirname "$0")"
delimiter='::'
dir=ml-1m
cd data
echo 'generate meta config file'
python config_generator.py config.json > meta_config.json
echo 'generate meta file'
python meta_generator.py $dir meta.bin --config=meta_config.json
echo 'split train/test file'
python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1
echo 'shuffle train file'
${SHUF_PROG} $dir/ratings.dat.train > ratings.dat.train
cp $dir/ratings.dat.test .
echo "./data/ratings.dat.train" > train.list
echo "./data/ratings.dat.test" > test.list

recommender_system/deprecated/train.sh (deleted; mode 100755 → 0)

#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
paddle train \
    --config=trainer_config.py \
    --save_dir=./output \
    --use_gpu=false \
    --trainer_count=4 \
    --test_all_data_in_one_period=true \
    --log_period=100 \
    --dot_period=1 \
    --num_passes=50 2>&1 | tee 'log.txt'

recommender_system/deprecated/trainer_config.py (deleted; mode 100755 → 0)

# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *

try:
    import cPickle as pickle
except ImportError:
    import pickle

is_predict = get_config_arg('is_predict', bool, False)

META_FILE = 'data/meta.bin'

with open(META_FILE, 'rb') as f:
    # load meta file
    meta = pickle.load(f)

if not is_predict:
    define_py_data_sources2(
        'data/train.list',
        'data/test.list',
        module='dataprovider',
        obj='process',
        args={'meta': meta})

settings(
    batch_size=1600, learning_rate=1e-3, learning_method=RMSPropOptimizer())

movie_meta = meta['movie']['__meta__']['raw_meta']
user_meta = meta['user']['__meta__']['raw_meta']

movie_id = data_layer('movie_id', size=movie_meta[0]['max'])
title = data_layer('title', size=len(movie_meta[1]['dict']))
genres = data_layer('genres', size=len(movie_meta[2]['dict']))
user_id = data_layer('user_id', size=user_meta[0]['max'])
gender = data_layer('gender', size=len(user_meta[1]['dict']))
age = data_layer('age', size=len(user_meta[2]['dict']))
occupation = data_layer('occupation', size=len(user_meta[3]['dict']))

embsize = 256

# construct movie feature
movie_id_emb = embedding_layer(input=movie_id, size=embsize)
movie_id_hidden = fc_layer(input=movie_id_emb, size=embsize)

genres_emb = fc_layer(input=genres, size=embsize)

title_emb = embedding_layer(input=title, size=embsize)
title_hidden = text_conv_pool(
    input=title_emb, context_len=5, hidden_size=embsize)

movie_feature = fc_layer(
    input=[movie_id_hidden, title_hidden, genres_emb], size=embsize)

# construct user feature
user_id_emb = embedding_layer(input=user_id, size=embsize)
user_id_hidden = fc_layer(input=user_id_emb, size=embsize)

gender_emb = embedding_layer(input=gender, size=embsize)
gender_hidden = fc_layer(input=gender_emb, size=embsize)

age_emb = embedding_layer(input=age, size=embsize)
age_hidden = fc_layer(input=age_emb, size=embsize)

occup_emb = embedding_layer(input=occupation, size=embsize)
occup_hidden = fc_layer(input=occup_emb, size=embsize)

user_feature = fc_layer(
    input=[user_id_hidden, gender_hidden, age_hidden, occup_hidden],
    size=embsize)

similarity = cos_sim(a=movie_feature, b=user_feature, scale=2)

if not is_predict:
    lbl = data_layer('rating', size=1)
    cost = regression_cost(input=similarity, label=lbl)
    outputs(cost)
else:
    outputs(similarity)