Commit 77381441 (unverified)
BaiXuePrincess / PaddleRec (fork of PaddlePaddle / PaddleRec)
Authored by wuzhihua and committed via GitHub on Jun 06, 2020

Merge pull request #42 from 123malin/bug_fix

add movie_recommand_demo

Parents: 3cd0fd19, d0ff1534
Showing 20 changed files with 901 additions and 1 deletion (+901, -1)
core/trainers/single_infer.py                             +16    -1
models/demo/__init__.py                                    +13    -0
models/demo/movie_recommand/__init__.py                    +13    -0
models/demo/movie_recommand/data/online_user/users.dat      +2    -0
models/demo/movie_recommand/data/process_ml_1m.py         +146    -0
models/demo/movie_recommand/data/split.py                  +51    -0
models/demo/movie_recommand/data/test/data.txt              +0    -0
models/demo/movie_recommand/data/train/data.txt             +0    -0
models/demo/movie_recommand/data_prepare.sh                +18    -0
models/demo/movie_recommand/offline_test.sh                +12    -0
models/demo/movie_recommand/online_rank.sh                  +8    -0
models/demo/movie_recommand/online_recall.sh                +9    -0
models/demo/movie_recommand/parse.py                      +176    -0
models/demo/movie_recommand/rank/__init__.py               +13    -0
models/demo/movie_recommand/rank/config.yaml               +93    -0
models/demo/movie_recommand/rank/model.py                 +120    -0
models/demo/movie_recommand/recall/__init__.py             +13    -0
models/demo/movie_recommand/recall/config.yaml             +93    -0
models/demo/movie_recommand/recall/model.py               +100    -0
models/demo/movie_recommand/train.sh                        +5    -0
core/trainers/single_infer.py

@@ -20,6 +20,8 @@ from __future__ import print_function

 import time
 import logging
 import os
+import json
+import numpy as np

 import paddle.fluid as fluid
 from paddlerec.core.trainers.transpiler_trainer import TranspileTrainer

@@ -263,8 +265,10 @@ class SingleInfer(TranspileTrainer):

             envs.get_global_env("runner." + self._runner_name +
                                 ".print_interval", 20))
         metrics_format.append("{}: {{}}".format("batch"))
+        metrics_indexes = dict()
         for name, var in metrics.items():
             metrics_varnames.append(var.name)
+            metrics_indexes[var.name] = len(metrics_varnames) - 1
             metrics_format.append("{}: {{}}".format(name))
         metrics_format = ", ".join(metrics_format)

@@ -272,19 +276,30 @@ class SingleInfer(TranspileTrainer):

         reader.start()
         batch_id = 0
         scope = self._model[model_name][2]
+        infer_results = []
         with fluid.scope_guard(scope):
             try:
                 while True:
                     metrics_rets = self._exe.run(program=program,
-                                                 fetch_list=metrics_varnames)
+                                                 fetch_list=metrics_varnames,
+                                                 return_numpy=False)

                     metrics = [batch_id]
                     metrics.extend(metrics_rets)
+                    batch_infer_result = {}
+                    for k, v in metrics_indexes.items():
+                        batch_infer_result[k] = np.array(metrics_rets[v]).tolist()
+                    infer_results.append(batch_infer_result)

                     if batch_id % fetch_period == 0 and batch_id != 0:
                         print(metrics_format.format(*metrics))
                     batch_id += 1
             except fluid.core.EOFException:
                 reader.reset()
+        with open(model_dict['save_path'], 'w') as fout:
+            json.dump(infer_results, fout)

     def terminal(self, context):
         context['is_exit'] = True
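With this change, SingleInfer collects every fetched batch into infer_results and serializes it as JSON to the phase's save_path. The following is a minimal sketch of reading that file back; it is an editor illustration, not part of the commit, and it assumes the demo's "./infer_result" save_path and the variable names this demo fetches (see parse.py below).

    import json

    with open("./infer_result") as f:      # save_path configured in the phase section
        batches = json.load(f)             # one dict per batch, keyed by fetched variable name

    first = batches[0]
    print(sorted(first.keys()))            # e.g. ['label', 'movieid', 'scale_0.tmp_0', 'userid'] in this demo
    print(first["label"][:2])              # nested lists, as produced by np.array(tensor).tolist()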

models/demo/__init__.py    0 → 100755

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

models/demo/movie_recommand/__init__.py    0 → 100755

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

models/demo/movie_recommand/data/online_user/users.dat    0 → 100644

2181::M::25::0
2073::F::18::4

models/demo/movie_recommand/data/process_ml_1m.py    0 → 100644

#coding=utf8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import random
import json

user_fea = ["userid", "gender", "age", "occupation"]
movie_fea = ["movieid", "title", "genres"]
rating_fea = ["userid", "movieid", "rating", "time"]

dict_size = 60000000
hash_dict = dict()

data_path = "ml-1m"
test_user_path = "online_user"


def process(path):
    user_dict = parse_data(data_path + "/users.dat", user_fea)
    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)

    for line in open(path):
        line = line.strip()
        arr = line.split("::")
        userid = arr[0]
        movieid = arr[1]
        out_str = "time:%s\t%s\t%s\tlabel:%s" % (arr[3], user_dict[userid],
                                                 movie_dict[movieid], arr[2])
        log_id = hash(out_str) % 1000000000
        print "%s\t%s" % (log_id, out_str)


def parse_data(file_name, feas):
    dict = {}
    for line in open(file_name):
        line = line.strip()
        arr = line.split("::")
        out_str = ""
        for i in range(0, len(feas)):
            out_str += "%s:%s\t" % (feas[i], arr[i])
        dict[arr[0]] = out_str.strip()
    return dict


def parse_movie_data(file_name, feas):
    dict = {}
    for line in open(file_name):
        line = line.strip()
        arr = line.split("::")
        title_str = ""
        genres_str = ""
        for term in arr[1].split(" "):
            term = term.strip()
            if term != "":
                title_str += "%s " % (term)
        for term in arr[2].split("|"):
            term = term.strip()
            if term != "":
                genres_str += "%s " % (term)
        out_str = "movieid:%s\ttitle:%s\tgenres:%s" % (
            arr[0], title_str.strip(), genres_str.strip())
        dict[arr[0]] = out_str.strip()
    return dict


def to_hash(in_str):
    feas = in_str.split(":")[0]
    arr = in_str.split(":")[1]
    out_str = "%s:%s" % (feas, (arr + arr[::-1] + arr[::-2] + arr[::-3]))
    hash_id = hash(out_str) % dict_size
    if hash_id in hash_dict and hash_dict[hash_id] != out_str:
        print(hash_id, out_str, hash(out_str))
        print("conflict")
        exit(-1)
    return "%s:%s" % (feas, hash_id)


def to_hash_list(in_str):
    arr = in_str.split(":")
    tmp_arr = arr[1].split(" ")
    out_str = ""
    for item in tmp_arr:
        item = item.strip()
        if item != "":
            key = "%s:%s" % (arr[0], item)
            out_str += "%s " % (to_hash(key))
    return out_str.strip()


def get_hash(path):
    #0-34831 1-time:974673057 2-userid:2021 3-gender:M 4-age:25 5-occupation:0 6-movieid:1345 7-title:Carrie (1976) 8-genres:Horror 9-label:2
    for line in open(path):
        arr = line.strip().split("\t")
        out_str = "logid:%s %s %s %s %s %s %s %s %s %s" % \
            (arr[0], arr[1], to_hash(arr[2]), to_hash(arr[3]), to_hash(arr[4]), to_hash(arr[5]), \
             to_hash(arr[6]), to_hash_list(arr[7]), to_hash_list(arr[8]), arr[9])
        print out_str


def generate_online_user():
    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)
    with open(test_user_path + "/movies.dat", 'w') as f:
        for line in open(test_user_path + "/users.dat"):
            line = line.strip()
            arr = line.split("::")
            userid = arr[0]
            for item in movie_dict:
                f.write(userid + "::" + item + "::1")
                f.write("\n")


def generate_online_data(path):
    user_dict = parse_data(data_path + "/users.dat", user_fea)
    movie_dict = parse_movie_data(data_path + "/movies.dat", movie_fea)
    for line in open(path):
        line = line.strip()
        arr = line.split("::")
        userid = arr[0]
        movieid = arr[1]
        label = arr[2]
        out_str = "time:%s\t%s\t%s\tlabel:%s" % ("1", user_dict[userid],
                                                 movie_dict[movieid], label)
        log_id = hash(out_str) % 1000000000
        res = "%s\t%s" % (log_id, out_str)
        arr = res.strip().split("\t")
        out_str = "logid:%s %s %s %s %s %s %s %s %s %s" % \
            (arr[0], arr[1], to_hash(arr[2]), to_hash(arr[3]), to_hash(arr[4]), to_hash(arr[5]), \
             to_hash(arr[6]), to_hash_list(arr[7]), to_hash_list(arr[8]), arr[9])
        print(out_str)


if __name__ == "__main__":
    random.seed(1111111)
    if sys.argv[1] == "process_raw":
        process(sys.argv[2])
    elif sys.argv[1] == "hash":
        get_hash(sys.argv[2])
    elif sys.argv[1] == "data_recall":
        generate_online_user()
        generate_online_data(test_user_path + "/movies.dat")
    elif sys.argv[1] == "data_rank":
        generate_online_data(test_user_path + "/movies.dat")
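process_ml_1m.py maps every raw MovieLens field to a hashed slot id with Python's built-in hash(). Below is a small standalone sketch of that scheme (the self-padded string, the modulo by dict_size, and the slot:id output); it is an editor illustration, not part of the commit, and note that the committed script is Python 2 (reload(sys), print statements), so concrete hash values differ between interpreters and runs.

    dict_size = 60000000  # same bucket count as in the commit

    def to_hash(slot, value):
        # Pad the value with reversed / strided copies of itself, then bucket the string.
        padded = "%s:%s" % (slot, value + value[::-1] + value[::-2] + value[::-3])
        return "%s:%s" % (slot, hash(padded) % dict_size)

    print(to_hash("userid", "2181"))  # -> "userid:<bucket id>"; the id depends on the interpreter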

models/demo/movie_recommand/data/split.py    0 → 100644

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random

train = dict()
test = dict()
data_path = "ml-1m"

for line in open(data_path + "/ratings.dat"):
    fea = line.rstrip().split("::")
    if fea[0] not in train:
        train[fea[0]] = [line]
    elif fea[0] not in test:
        test[fea[0]] = dict()
        test[fea[0]]['time'] = int(fea[3])
        test[fea[0]]['content'] = line
    else:
        time = int(fea[3])
        if time <= test[fea[0]]['time']:
            train[fea[0]].append(line)
        else:
            train[fea[0]].append(test[fea[0]]['content'])
            test[fea[0]]['time'] = time
            test[fea[0]]['content'] = line

train_data = []
for key in train:
    for line in train[key]:
        train_data.append(line)

random.shuffle(train_data)
with open(data_path + "/train.dat", 'w') as f:
    for line in train_data:
        f.write(line)

with open(data_path + "/test.dat", 'w') as f:
    for key in test:
        f.write(test[key]['content'])
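split.py holds out one rating per user for testing: the first rating line seen for a user always goes to train, and among that user's remaining lines the one with the largest timestamp becomes the single test record, with every displaced candidate falling back to train. A tiny illustration of that rule on made-up rating lines (userid::movieid::rating::timestamp); this is an editor illustration, not part of the commit.

    ratings = [
        "1::10::5::100\n",   # first line for user 1 -> always train
        "1::20::4::300\n",   # largest timestamp among the rest -> test record
        "1::30::3::200\n",   # loses the comparison -> train
    ]
    candidates = ratings[1:]
    test_record = max(candidates, key=lambda line: int(line.rstrip().split("::")[3]))
    train = [line for line in ratings if line is not test_record]
    print(test_record.strip())  # 1::20::4::300
    print(len(train))           # 2 lines left for train.dat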

models/demo/movie_recommand/data/test/data.txt    0 → 100644


models/demo/movie_recommand/data/train/data.txt    0 → 100644


models/demo/movie_recommand/data_prepare.sh    0 → 100644

cd data
wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
unzip ml-1m.zip
python split.py
mkdir train/
mkdir test/
python process_ml_1m.py process_raw ./ml-1m/train.dat | sort -t $'\t' -k 9 -n > log.data.train
python process_ml_1m.py process_raw ./ml-1m/test.dat | sort -t $'\t' -k 9 -n > log.data.test
python process_ml_1m.py hash log.data.train > ./train/data.txt
python process_ml_1m.py hash log.data.test > ./test/data.txt
rm log.data.train
rm log.data.test
cd ../
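After data_prepare.sh finishes, train/data.txt and test/data.txt hold one sample per line as space-separated slot:value tokens emitted by process_ml_1m.py hash (the slot names match the sparse_slots string in the rank and recall config.yaml below). A minimal sketch of grouping such a line back into its slots, using a shortened made-up line; this is an editor illustration, not part of the commit.

    from collections import defaultdict

    # Made-up, shortened line in the produced format; real values are hashed ids.
    line = "logid:34831 time:974673057 userid:17 gender:4 age:9 occupation:3 movieid:88 title:12 title:56 genres:7 label:2"

    slots = defaultdict(list)
    for token in line.split(" "):
        name, value = token.split(":", 1)
        slots[name].append(value)

    print(slots["title"])  # ['12', '56'] -- title and genres can contribute several tokens
    print(slots["label"])  # ['2']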

models/demo/movie_recommand/offline_test.sh    0 → 100644

## modify config.yaml to infer mode at first
cd recall
python -m paddlerec.run -m ./config.yaml
cd ../rank
python -m paddlerec.run -m ./config.yaml
cd ..
echo "recall offline test result:"
python parse.py recall_offline recall/infer_result
echo "rank offline test result:"
python parse.py rank_offline rank/infer_result

models/demo/movie_recommand/online_rank.sh    0 → 100644

cd data
python process_ml_1m.py data_rank > online_user/test/data.txt
## modify recall/config.yaml to online_infer mode
cd ../rank
python -m paddlerec.run -m ./config.yaml
cd ../
python parse.py rank_online rank/infer_result

models/demo/movie_recommand/online_recall.sh    0 → 100644

cd data
mkdir online_user/test
python process_ml_1m.py data_recall > online_user/test/data.txt
## modify recall/config.yaml to online_infer mode
cd ../recall
python -m paddlerec.run -m ./config.yaml
cd ../
python parse.py recall_online recall/infer_result

models/demo/movie_recommand/parse.py    0 → 100644

#coding=utf8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import random
import json
import numpy as np
import operator

user_fea = ["userid", "gender", "age", "occupation"]
movie_fea = ["movieid", "title", "genres"]
rating_fea = ["userid", "movieid", "rating", "time"]
dict_size = 60000000
hash_dict = dict()

data_path = "data/ml-1m"
test_user_path = "data/online_user"

topk = 100


def read_raw_data():
    user_dict = parse_data(data_path + "/users.dat", user_fea)
    movie_dict = parse_data(data_path + "/movies.dat", movie_fea)
    ratings_dict = dict()
    for line in open(data_path + "/ratings.dat"):
        arr = line.strip().split("::")
        if arr[0] not in ratings_dict:
            ratings_dict[arr[0]] = []
        tmp = dict()
        tmp["movieid"] = arr[1]
        tmp["score"] = arr[2]
        tmp["time"] = arr[3]
        ratings_dict[arr[0]].append(tmp)
    return user_dict, movie_dict, ratings_dict


def parse_data(file_name, feas):
    res = {}
    for line in open(file_name):
        line = line.strip()
        arr = line.split("::")
        res[arr[0]] = dict()
        _ = to_hash(feas[0], arr[0])
        for i in range(0, len(feas)):
            res[arr[0]][feas[i]] = arr[i]
    return res


def to_hash(feas, arr):
    out_str = "%s:%s" % (feas, (arr + arr[::-1] + arr[::-2] + arr[::-3]))
    hash_id = hash(out_str) % dict_size
    if hash_id in hash_dict and hash_dict[hash_id] != out_str:
        print(hash_id, out_str, hash(out_str), hash_dict[hash_id])
        print("conflict")
        exit(-1)
    hash_dict[hash_id] = out_str
    return hash_id


def load_ground_truth(user_dict, movie_dict, ratings_dict):
    for line in open(test_user_path + "/users.dat"):
        uid = line.strip().split("::")[0]
        display_user(user_dict[uid])
        ratings_dict[uid] = sorted(
            ratings_dict[uid],
            key=lambda i: (i["score"], i["time"]),
            reverse=True)
        ratings_dict[uid] = ratings_dict[uid][:topk]
        for i in range(len(ratings_dict[uid])):
            item = ratings_dict[uid][i]
            mid = item["movieid"]
            for key in movie_fea:
                item[key] = movie_dict[mid][key]
        display_movies(ratings_dict[uid])


def load_infer_results(path, feas, movie_dict):
    with open(path) as f:
        content = json.load(f)
    total = 0
    correct = 0
    mae = 0.0
    res = dict()
    for item in content:
        userid = reduce(operator.add, item[feas["userid"]])
        movieid = reduce(operator.add, item[feas["movieid"]])
        ratings = reduce(operator.add, item[feas["ratings"]])
        predict = map(int, ratings)
        label = reduce(operator.add, item[feas["label"]])
        mae += sum(np.square(np.array(ratings) - np.array(label)))
        total += len(label)
        correct += sum(np.array(predict) == np.array(label))
        for i in range(len(userid)):
            hash_uid = userid[i]
            hash_mid = movieid[i]
            if hash_uid not in hash_dict or hash_mid not in hash_dict:
                continue
            tmp = hash_dict[hash_uid].split(':')[1]
            uid = tmp[:len(tmp) / 3]
            tmp = hash_dict[hash_mid].split(':')[1]
            mid = tmp[:len(tmp) / 3]
            if uid not in res:
                res[uid] = []
            item = {"score": ratings[i]}
            for info in movie_dict[mid]:
                item[info] = movie_dict[mid][info]
            res[uid].append(item)
    for key in res:
        tmp = sorted(res[key], key=lambda i: i["score"], reverse=True)
        existed_movie = []
        res[key] = []
        for i in range(len(tmp)):
            if len(res[key]) >= topk:
                break
            if tmp[i]["movieid"] not in existed_movie:
                existed_movie.append(tmp[i]["movieid"])
                res[key].append(tmp[i])
    print("total: " + str(total) + "; correct: " + str(correct))
    print("accuracy: " + str(float(correct) / total))
    print("mae: " + str(mae / total))
    return res


def display_user(item):
    out_str = ""
    for key in user_fea:
        out_str += "%s:%s " % (key, item[key])
    print(out_str)


def display_movies(input):
    for item in input:
        print_str = ""
        for key in movie_fea:
            print_str += "%s:%s " % (key, item[key])
        print_str += "%s:%s" % ("score", item["score"])
        print(print_str)


def parse_infer(mode, path, user_dict, movie_dict):
    stage, online = mode.split('_')
    feas = {
        "userid": "userid",
        "movieid": "movieid",
        "ratings": "scale_0.tmp_0",
        "label": "label"
    }
    infer_results = load_infer_results(path, feas, movie_dict)
    if online.startswith("offline"):
        return
    for uid in infer_results:
        display_user(user_dict[uid])
        display_movies(infer_results[uid])
    with open(test_user_path + "/movies.dat", 'w') as fout:
        for uid in infer_results:
            for item in infer_results[uid]:
                str_ = uid + "::" + str(item["movieid"]) + "::" + str(
                    int(item["score"])) + "\n"
                fout.write(str_)


if __name__ == "__main__":
    user_dict, movie_dict, ratings_dict = read_raw_data()
    if sys.argv[1] == "ground_truth":
        load_ground_truth(user_dict, movie_dict, ratings_dict)
    else:
        parse_infer(sys.argv[1], sys.argv[2], user_dict, movie_dict)

models/demo/movie_recommand/rank/__init__.py    0 → 100755

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

models/demo/movie_recommand/rank/config.yaml    0 → 100755

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.demo.movie_recommand"

# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
  batch_size: 128
  type: QueueDataset
  data_path: "{workspace}/data/train"
  sparse_slots: "logid time userid gender age occupation movieid title genres label"
  dense_slots: ""
- name: dataset_infer # name
  batch_size: 128
  type: DataLoader
  data_path: "{workspace}/data/test"
  sparse_slots: "logid time userid gender age occupation movieid title genres label"
  dense_slots: ""
- name: dataset_online_infer # name
  batch_size: 10
  type: DataLoader
  data_path: "{workspace}/data/online_user/test"
  sparse_slots: "logid time userid gender age occupation movieid title genres label"
  dense_slots: ""

# hyper parameters of user-defined network
hyper_parameters:
  # optimizer config
  optimizer:
    class: Adam
    learning_rate: 0.001
    strategy: async
  # user-defined <key, value> pairs
  sparse_feature_number: 60000000
  sparse_feature_dim: 9
  dense_input_dim: 13
  fc_sizes: [512, 256, 128, 32]

# train
mode: runner_train
## online or offline infer
#mode: runner_infer
runner:
- name: runner_train
  class: single_train
  save_checkpoint_interval: 1 # save model interval of epochs
  save_inference_interval: 1 # save inference
  save_checkpoint_path: "increment" # save checkpoint path
  save_inference_path: "inference" # save inference path
  epochs: 10
  device: cpu
- name: runner_infer
  epochs: 1
  class: single_infer
  print_interval: 10000
  init_model_path: "increment/9" # load model path

#train
phase:
- name: phase1
  model: "{workspace}/model.py" # user-defined model
  dataset_name: dataset_train # select dataset by name
  thread_num: 12

##offline infer
#phase:
#- name: phase1
#  model: "{workspace}/model.py" # user-defined model
#  dataset_name: dataset_infer # select dataset by name
#  save_path: "./infer_result"
#  thread_num: 1

##offline infer
#phase:
#- name: phase1
#  model: "{workspace}/model.py" # user-defined model
#  dataset_name: dataset_online_infer # select dataset by name
#  save_path: "./infer_result"
#  thread_num: 1

models/demo/movie_recommand/rank/model.py    0 → 100755

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid as fluid

from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase


class Model(ModelBase):
    def __init__(self, config):
        ModelBase.__init__(self, config)

    def _init_hyper_parameters(self):
        self.is_distributed = True if envs.get_trainer(
        ) == "CtrTrainer" else False
        self.sparse_feature_number = envs.get_global_env(
            "hyper_parameters.sparse_feature_number")
        self.sparse_feature_dim = envs.get_global_env(
            "hyper_parameters.sparse_feature_dim")
        self.learning_rate = envs.get_global_env(
            "hyper_parameters.optimizer.learning_rate")
        self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")

    def net(self, input, is_infer=False):
        self.user_sparse_inputs = self._sparse_data_var[2:6]
        self.mov_sparse_inputs = self._sparse_data_var[6:9]
        self.label_input = self._sparse_data_var[-1]

        def fc(input):
            fcs = [input]
            for size in self.hidden_layers:
                output = fluid.layers.fc(
                    input=fcs[-1],
                    size=size,
                    act='relu',
                    param_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.Normal(
                            scale=1.0 / math.sqrt(fcs[-1].shape[1]))))
                fcs.append(output)
            return fcs[-1]

        def embedding_layer(input):
            emb = fluid.layers.embedding(
                input=input,
                is_sparse=True,
                is_distributed=self.is_distributed,
                size=[self.sparse_feature_number, self.sparse_feature_dim],
                param_attr=fluid.ParamAttr(
                    name="emb",
                    initializer=fluid.initializer.Uniform()), )
            emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
            return emb_sum

        user_sparse_embed_seq = list(
            map(embedding_layer, self.user_sparse_inputs))
        mov_sparse_embed_seq = list(
            map(embedding_layer, self.mov_sparse_inputs))
        concated_user = fluid.layers.concat(user_sparse_embed_seq, axis=1)
        concated_mov = fluid.layers.concat(mov_sparse_embed_seq, axis=1)

        usr_combined_features = fc(concated_user)
        mov_combined_features = fc(concated_mov)

        fc_input = fluid.layers.concat(
            [usr_combined_features, mov_combined_features], axis=1)
        sim = fluid.layers.fc(
            input=fc_input,
            size=1,
            act='sigmoid',
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Normal(
                    scale=1.0 / math.sqrt(fc_input.shape[1]))))
        predict = fluid.layers.scale(sim, scale=5)
        self.predict = predict

        #auc, batch_auc, _ = fluid.layers.auc(input=self.predict,
        #                                     label=self.label_input,
        #                                     num_thresholds=10000,
        #                                     slide_steps=20)

        if is_infer:
            self._infer_results["user_feature"] = usr_combined_features
            self._infer_results["movie_feature"] = mov_combined_features
            self._infer_results["uid"] = self._sparse_data_var[2]
            self._infer_results["movieid"] = self._sparse_data_var[6]
            self._infer_results["label"] = self._sparse_data_var[-1]
            self._infer_results["predict"] = self.predict
            return

        #self._metrics["AUC"] = auc
        #self._metrics["BATCH_AUC"] = batch_auc
        #cost = fluid.layers.cross_entropy(
        #    input=self.predict, label=self.label_input)
        cost = fluid.layers.square_error_cost(
            self.predict,
            fluid.layers.cast(
                x=self.label_input, dtype='float32'))
        avg_cost = fluid.layers.reduce_mean(cost)
        self._cost = avg_cost
        self._metrics["LOSS"] = avg_cost

    def optimizer(self):
        optimizer = fluid.optimizer.Adam(self.learning_rate, lazy_mode=True)
        return optimizer

    def infer_net(self):
        pass

models/demo/movie_recommand/recall/__init__.py    0 → 100755

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

models/demo/movie_recommand/recall/config.yaml    0 → 100755

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.demo.movie_recommand"

# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
  batch_size: 128
  type: QueueDataset
  data_path: "{workspace}/data/train"
  sparse_slots: "logid time userid gender age occupation movieid title genres label"
  dense_slots: ""
- name: dataset_infer # name
  batch_size: 128
  type: DataLoader
  data_path: "{workspace}/data/test"
  sparse_slots: "logid time userid gender age occupation movieid title genres label"
  dense_slots: ""
- name: dataset_online_infer # name
  batch_size: 128
  type: DataLoader
  data_path: "{workspace}/data/online_user/test"
  sparse_slots: "logid time userid gender age occupation movieid title genres label"
  dense_slots: ""

# hyper parameters of user-defined network
hyper_parameters:
  # optimizer config
  optimizer:
    class: Adam
    learning_rate: 0.001
    strategy: async
  # user-defined <key, value> pairs
  sparse_feature_number: 60000000
  sparse_feature_dim: 9
  dense_input_dim: 13
  fc_sizes: [512, 256, 128, 32]

# train
mode: runner_train
## online or offline infer
#mode: runner_infer
runner:
- name: runner_train
  class: single_train
  save_checkpoint_interval: 1 # save model interval of epochs
  save_inference_interval: 1 # save inference
  save_checkpoint_path: "increment" # save checkpoint path
  save_inference_path: "inference" # save inference path
  epochs: 10
  device: cpu
- name: runner_infer
  epochs: 1
  class: single_infer
  print_interval: 10000
  init_model_path: "increment/9" # load model path

#train
phase:
- name: phase1
  model: "{workspace}/model.py" # user-defined model
  dataset_name: dataset_train # select dataset by name
  thread_num: 12

##offline infer
#phase:
#- name: phase1
#  model: "{workspace}/model.py" # user-defined model
#  dataset_name: dataset_infer # select dataset by name
#  save_path: "./infer_result"
#  thread_num: 1

##offline infer
#phase:
#- name: phase1
#  model: "{workspace}/model.py" # user-defined model
#  dataset_name: dataset_online_infer # select dataset by name
#  save_path: "./infer_result"
#  thread_num: 1

models/demo/movie_recommand/recall/model.py    0 → 100755

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid as fluid

from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase


class Model(ModelBase):
    def __init__(self, config):
        ModelBase.__init__(self, config)

    def _init_hyper_parameters(self):
        self.is_distributed = True if envs.get_trainer(
        ) == "CtrTrainer" else False
        self.sparse_feature_number = envs.get_global_env(
            "hyper_parameters.sparse_feature_number")
        self.sparse_feature_dim = envs.get_global_env(
            "hyper_parameters.sparse_feature_dim")
        self.learning_rate = envs.get_global_env(
            "hyper_parameters.optimizer.learning_rate")
        self.hidden_layers = envs.get_global_env("hyper_parameters.fc_sizes")

    def net(self, input, is_infer=False):
        self.user_sparse_inputs = self._sparse_data_var[2:6]
        self.mov_sparse_inputs = self._sparse_data_var[6:9]
        self.label_input = self._sparse_data_var[-1]

        def fc(input):
            fcs = [input]
            for size in self.hidden_layers:
                output = fluid.layers.fc(
                    input=fcs[-1],
                    size=size,
                    act='relu',
                    param_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.Normal(
                            scale=1.0 / math.sqrt(fcs[-1].shape[1]))))
                fcs.append(output)
            return fcs[-1]

        def embedding_layer(input):
            emb = fluid.layers.embedding(
                input=input,
                is_sparse=True,
                is_distributed=self.is_distributed,
                size=[self.sparse_feature_number, self.sparse_feature_dim],
                param_attr=fluid.ParamAttr(
                    name="emb",
                    initializer=fluid.initializer.Uniform()), )
            emb_sum = fluid.layers.sequence_pool(input=emb, pool_type='sum')
            return emb_sum

        user_sparse_embed_seq = list(
            map(embedding_layer, self.user_sparse_inputs))
        mov_sparse_embed_seq = list(
            map(embedding_layer, self.mov_sparse_inputs))
        concated_user = fluid.layers.concat(user_sparse_embed_seq, axis=1)
        concated_mov = fluid.layers.concat(mov_sparse_embed_seq, axis=1)

        usr_combined_features = fc(concated_user)
        mov_combined_features = fc(concated_mov)

        sim = fluid.layers.cos_sim(
            X=usr_combined_features, Y=mov_combined_features)
        predict = fluid.layers.scale(sim, scale=5)
        self.predict = predict

        if is_infer:
            self._infer_results["uid"] = self._sparse_data_var[2]
            self._infer_results["movieid"] = self._sparse_data_var[6]
            self._infer_results["label"] = self._sparse_data_var[-1]
            self._infer_results["predict"] = self.predict
            return

        cost = fluid.layers.square_error_cost(
            self.predict,
            fluid.layers.cast(
                x=self.label_input, dtype='float32'))
        avg_cost = fluid.layers.reduce_mean(cost)
        self._cost = avg_cost
        self._metrics["LOSS"] = avg_cost

    def optimizer(self):
        optimizer = fluid.optimizer.Adam(self.learning_rate, lazy_mode=True)
        return optimizer

models/demo/movie_recommand/train.sh    0 → 100644

cd recall
python -m paddlerec.run -m ./config.yaml &> log &
cd ../rank
python -m paddlerec.run -m ./config.yaml &> log &
cd ..