Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
PaddleRec
提交
fe9cfe4e
P
PaddleRec
项目概览
BaiXuePrincess
/
PaddleRec
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleRec
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
fe9cfe4e
编写于
7月 28, 2020
作者:
M
malin10
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
bug fix
上级
c34d6b53
变更
10
隐藏空白更改
内联
并排
Showing
10 changed file
with
11 addition
and
322 deletion
+11
-322
core/metric.py
core/metric.py
+3
-17
core/metrics/auc_metrics.py
core/metrics/auc_metrics.py
+0
-216
core/metrics/binary_class/auc.py
core/metrics/binary_class/auc.py
+0
-3
core/metrics/binary_class/precision_recall.py
core/metrics/binary_class/precision_recall.py
+0
-1
core/metrics/pairwise_pn.py
core/metrics/pairwise_pn.py
+0
-1
core/metrics/recall_k.py
core/metrics/recall_k.py
+1
-1
models/recall/gnn/readme.md
models/recall/gnn/readme.md
+0
-76
tests/test_pairwise_pn.py
tests/test_pairwise_pn.py
+3
-3
tests/test_precision_recall_metrics.py
tests/test_precision_recall_metrics.py
+1
-1
tests/test_recall_k.py
tests/test_recall_k.py
+3
-3
未找到文件。
core/metric.py
浏览文件 @
fe9cfe4e
...
...
@@ -27,12 +27,7 @@ class Metric(object):
pass
def
clear
(
self
,
scope
=
None
):
"""
clear current value
Args:
scope: value container
params: extend varilable for clear
"""
""" """
if
scope
is
None
:
scope
=
fluid
.
global_scope
()
...
...
@@ -46,11 +41,7 @@ class Metric(object):
var
.
set
(
data_array
,
place
)
def
get_global_metric
(
self
,
fleet
,
scope
,
metric_name
,
mode
=
"sum"
):
"""
reduce metric named metric_name from all worker
Return:
metric reduce result
"""
""" """
input
=
np
.
array
(
scope
.
find_var
(
metric_name
).
get_tensor
())
if
fleet
is
None
:
return
input
...
...
@@ -63,12 +54,7 @@ class Metric(object):
return
output
def
cal_global_metrics
(
self
,
fleet
,
scope
=
None
):
"""
calculate result
Args:
scope: value container
params: extend varilable for clear
"""
""" """
if
scope
is
None
:
scope
=
fluid
.
global_scope
()
...
...
core/metrics/auc_metrics.py
已删除
100755 → 0
浏览文件 @
c34d6b53
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
math
import
numpy
as
np
import
paddle.fluid
as
fluid
from
paddlerec.core.metric
import
Metric
class AUCMetric(Metric):
    """
    Metric For Fluid Model

    Reads statistic variables out of a scope, reduces them across workers
    through ``fleet._role_maker`` and derives AUC, bucket error, MAE, RMSE,
    actual/predicted CTR, COPC and mean Q from the reduced values.
    """

    def __init__(self, config, fleet):
        """
        Keep the objects needed by the other methods.
        Args:
            config : stored as ``self.config``; not read by this class
            fleet : object whose ``_role_maker`` performs barrier / all-reduce
        """
        self.config = config
        self.fleet = fleet

    def clear(self, scope, params):
        """
        Clear current metric value, usually set to zero
        Args:
            scope : paddle runtime var container
            params(dict) :
                label : a group name for metric
                metric_dict : current metric_items in group
        Return:
            None
        """
        self._label = params['label']
        self._metric_dict = params['metric_dict']
        self._result = {}
        place = fluid.CPUPlace()
        for metric_name in self._metric_dict:
            metric_config = self._metric_dict[metric_name]
            var_name = metric_config['var'].name
            # Variables that do not exist in the scope are left untouched.
            if scope.find_var(var_name) is None:
                continue
            metric_var = scope.var(var_name).get_tensor()
            # 'float32' unless the metric item names another dtype.
            data_type = metric_config.get('data_type', 'float32') \
                if isinstance(metric_config, dict) else (
                    metric_config['data_type']
                    if 'data_type' in metric_config else 'float32')
            data_array = np.zeros(metric_var._get_dims()).astype(data_type)
            metric_var.set(data_array, place)

    def get_metric(self, scope, metric_name):
        """
        reduce metric named metric_name from all worker
        Args:
            scope : container that holds the variable
            metric_name(str) : name of the variable to reduce
        Return:
            first element (along axis 0) of the reduced array, restored to
            the variable's original shape
        """
        metric = np.array(scope.find_var(metric_name).get_tensor())
        old_metric_shape = np.array(metric.shape)
        # The reduction is done on a flat buffer; the shape is restored after.
        metric = metric.reshape(-1)
        global_metric = np.copy(metric) * 0
        # NOTE(review): presumably an element-wise sum over workers - confirm
        # against the role maker implementation.
        self.fleet._role_maker.all_reduce_worker(metric, global_metric)
        global_metric = global_metric.reshape(old_metric_shape)
        return global_metric[0]

    def get_global_metrics(self, scope, metric_dict):
        """
        reduce all metric in metric_dict from all worker
        Args:
            scope : container that holds the variables
            metric_dict(dict) : {metric_name : item}, item['var'].name is the
                variable name looked up in scope
        Return:
            dict : {metric_name : metric_result}, the result is None for a
            variable that is not found in scope
        """
        self.fleet._role_maker._barrier_worker()
        result = {}
        for metric_name in metric_dict:
            metric_item = metric_dict[metric_name]
            var_name = metric_item['var'].name
            if scope.find_var(var_name) is None:
                result[metric_name] = None
                continue
            result[metric_name] = self.get_metric(scope, var_name)
        return result

    def calculate_auc(self, global_pos, global_neg):
        """
        Compute AUC from two equally long bucket count sequences.

        Buckets are visited from the last index down to the first and the
        area is accumulated with the trapezoid rule.
        Args:
            global_pos : per-bucket positive counts
            global_neg : per-bucket negative counts
        Return:
            area / (pos * neg), or 0.5 when either total is zero
        """
        num_bucket = len(global_pos)
        area = 0.0
        pos = 0.0
        neg = 0.0
        total_ins_num = 0
        for i in range(num_bucket):
            index = num_bucket - 1 - i
            new_pos = pos + global_pos[index]
            total_ins_num += global_pos[index]
            new_neg = neg + global_neg[index]
            total_ins_num += global_neg[index]
            area += (new_neg - neg) * (pos + new_pos) / 2
            pos = new_pos
            neg = new_neg
        # Without both positives and negatives the ratio is undefined.
        if pos * neg == 0 or total_ins_num == 0:
            return 0.5
        return area / (pos * neg)

    def calculate_bucket_error(self, global_pos, global_neg):
        """
        Compute the impression-weighted relative CTR error over bucket spans.

        Bucket ``i`` is given the predicted ctr ``i / num_bucket``.
        Consecutive buckets are merged into a span until the ctr moves more
        than ``k_max_span`` away from the span start. Once a span's estimated
        relative error drops below ``k_relative_error_bound`` its
        ``abs(actual_ctr / adjust_ctr - 1)`` is added, weighted by the span's
        impressions, and a new span is started.
        Args:
            global_pos : per-bucket positive (click) counts
            global_neg : per-bucket negative counts
        Return:
            error_sum / error_count, or 0.0 when no span qualified
        """
        num_bucket = len(global_pos)
        last_ctr = -1.0
        impression_sum = 0.0
        ctr_sum = 0.0
        click_sum = 0.0
        error_sum = 0.0
        error_count = 0.0
        k_max_span = 0.01
        k_relative_error_bound = 0.05
        for i in range(num_bucket):
            click = global_pos[i]
            show = global_pos[i] + global_neg[i]
            ctr = float(i) / num_bucket
            # Start a new span when the ctr drifted too far from its start.
            if abs(ctr - last_ctr) > k_max_span:
                last_ctr = ctr
                impression_sum = 0.0
                ctr_sum = 0.0
                click_sum = 0.0
            impression_sum += show
            ctr_sum += ctr * show
            click_sum += click
            if impression_sum == 0:
                continue
            adjust_ctr = ctr_sum / impression_sum
            if adjust_ctr == 0:
                continue
            relative_error = \
                math.sqrt((1 - adjust_ctr) / (adjust_ctr * impression_sum))
            if relative_error < k_relative_error_bound:
                actual_ctr = click_sum / impression_sum
                relative_ctr_error = abs(actual_ctr / adjust_ctr - 1)
                error_sum += relative_ctr_error * impression_sum
                error_count += impression_sum
                # Forces the next bucket to open a fresh span.
                last_ctr = -1
        bucket_error = error_sum / error_count if error_count > 0 else 0.0
        return bucket_error

    def calculate(self, scope, params):
        """
        Reduce every metric in params['metric_dict'] and derive the results.
        Args:
            scope : container that holds the metric variables
            params(dict) :
                label : a group name for metric
                metric_dict : current metric_items in group; must contain
                    'total_ins_num'
        Return:
            dict with the reduced values plus the derived entries ('auc',
            'bucket_error', 'actual_ctr', 'mae', 'rmse', 'predict_ctr',
            'copc', 'mean_q') whose inputs are present; also kept as
            ``self._result``
        """
        self._label = params['label']
        self._metric_dict = params['metric_dict']
        self.fleet._role_maker._barrier_worker()
        result = self.get_global_metrics(scope, self._metric_dict)
        if result['total_ins_num'] == 0:
            # No instances: report zeros instead of dividing by zero below.
            self._result = result
            self._result['auc'] = 0
            self._result['bucket_error'] = 0
            self._result['actual_ctr'] = 0
            self._result['predict_ctr'] = 0
            self._result['mae'] = 0
            self._result['rmse'] = 0
            self._result['copc'] = 0
            self._result['mean_q'] = 0
            return self._result
        if 'stat_pos' in result and 'stat_neg' in result:
            result['auc'] = self.calculate_auc(result['stat_pos'],
                                               result['stat_neg'])
            # Fixed: this used to call calculate_auc, which made the reported
            # bucket error a copy of the AUC.
            result['bucket_error'] = self.calculate_bucket_error(
                result['stat_pos'], result['stat_neg'])
        if 'pos_ins_num' in result:
            result['actual_ctr'] = result['pos_ins_num'] / result[
                'total_ins_num']
        if 'abserr' in result:
            result['mae'] = result['abserr'] / result['total_ins_num']
        if 'sqrerr' in result:
            result['rmse'] = math.sqrt(result['sqrerr'] /
                                       result['total_ins_num'])
        if 'prob' in result:
            result['predict_ctr'] = result['prob'] / result['total_ins_num']
            # COPC is only defined for a non-negligible predicted ctr.
            if abs(result['predict_ctr']) > 1e-6:
                result['copc'] = result['actual_ctr'] / result['predict_ctr']
        if 'q' in result:
            result['mean_q'] = result['q'] / result['total_ins_num']
        self._result = result
        return result

    def get_result(self):
        """Return the dict stored by the last clear() or calculate() call."""
        return self._result

    def __str__(self):
        """Format the label and the stored result values on one line."""
        result = self.get_result()
        result_str = "%s AUC=%.6f BUCKET_ERROR=%.6f MAE=%.6f RMSE=%.6f " \
                     "Actural_CTR=%.6f Predicted_CTR=%.6f COPC=%.6f MEAN Q_VALUE=%.6f Ins number=%s" % \
                     (self._label, result['auc'], result['bucket_error'], result['mae'], result['rmse'],
                      result['actual_ctr'],
                      result['predict_ctr'], result['copc'], result['mean_q'], result['total_ins_num'])
        return result_str
core/metrics/binary_class/auc.py
浏览文件 @
fe9cfe4e
...
...
@@ -18,9 +18,6 @@ import numpy as np
import
paddle.fluid
as
fluid
from
paddlerec.core.metric
import
Metric
from
paddle.fluid.layers
import
nn
,
accuracy
from
paddle.fluid.initializer
import
Constant
from
paddle.fluid.layer_helper
import
LayerHelper
from
paddle.fluid.layers.tensor
import
Variable
...
...
core/metrics/binary_class/precision_recall.py
浏览文件 @
fe9cfe4e
...
...
@@ -18,7 +18,6 @@ import numpy as np
import
paddle.fluid
as
fluid
from
paddlerec.core.metric
import
Metric
from
paddle.fluid.layers
import
nn
,
accuracy
from
paddle.fluid.initializer
import
Constant
from
paddle.fluid.layer_helper
import
LayerHelper
from
paddle.fluid.layers.tensor
import
Variable
...
...
core/metrics/pairwise_pn.py
浏览文件 @
fe9cfe4e
...
...
@@ -18,7 +18,6 @@ import numpy as np
import
paddle.fluid
as
fluid
from
paddlerec.core.metric
import
Metric
from
paddle.fluid.layers
import
nn
,
accuracy
from
paddle.fluid.initializer
import
Constant
from
paddle.fluid.layer_helper
import
LayerHelper
from
paddle.fluid.layers.tensor
import
Variable
...
...
core/metrics/recall_k.py
浏览文件 @
fe9cfe4e
...
...
@@ -18,7 +18,7 @@ import numpy as np
import
paddle.fluid
as
fluid
from
paddlerec.core.metric
import
Metric
from
paddle.fluid.layers
import
nn
,
accuracy
from
paddle.fluid.layers
import
accuracy
from
paddle.fluid.initializer
import
Constant
from
paddle.fluid.layer_helper
import
LayerHelper
from
paddle.fluid.layers.tensor
import
Variable
...
...
models/recall/gnn/readme.md
已删除
100644 → 0
浏览文件 @
c34d6b53
# GNN
## 快速开始
PaddleRec中每个内置模型都配备了对应的样例数据,用户可基于该数据集快速对模型、环境进行验证,从而降低后续的调试成本。在内置数据集上进行训练的命令为:
```
python -m paddlerec.run -m paddlerec.models.recall.gnn
```
## 数据处理
-
Step1: 原始数据集下载,本示例提供了两个开源数据集:DIGINETICA和Yoochoose,可选其中任意一个训练本模型。
```
cd data && python download.py diginetica # or yoochoose
```
> [Yoochoose](https://2015.recsyschallenge.com/challenge.html)数据集来源于RecSys Challenge 2015,原始数据包含如下字段:
1.
Session ID – the id of the session. In one session there are one or many clicks.
2.
Timestamp – the time when the click occurred.
3.
Item ID – the unique identifier of the item.
4.
Category – the category of the item.
> [DIGINETICA](https://competitions.codalab.org/competitions/11161#learn_the_details-data2)数据集来源于CIKM Cup 2016 _Personalized E-Commerce Search Challenge_项目。原始数据包含如下字段:
1. sessionId - the id of the session. In one session there are one or many clicks.
2. userId - the id of the user, with anonymized user ids.
3. itemId - the unique identifier of the item.
4. timeframe - time since the first query in a session, in milliseconds.
5. eventdate - calendar date.
-
Step2: 数据预处理
```
cd data && python preprocess.py --dataset diginetica # or yoochoose
```
1.
以session_id为key合并原始数据集,得到每个session的日期,及顺序点击列表。
2.
过滤掉长度为1的session;过滤掉点击次数小于5的items。
3.
训练集、测试集划分。原始数据集里最新日期七天内的作为测试集,更早之前的数据作为训练集。
-
Step3: 数据整理。 将训练文件统一放在data/train目录下,测试文件统一放在data/test目录下。
```
cat data/diginetica/train.txt | wc -l >> data/config.txt # or yoochoose1_4 or yoochoose1_64
rm -rf data/train/*
rm -rf data/test/*
mv data/diginetica/train.txt data/train
mv data/diginetica/test.txt data/test
```
数据处理完成后,data/train目录存放训练数据,data/test目录下存放测试数据,data/config.txt中存放数据统计信息,用以配置模型超参。
方便起见, 我们提供了一键式数据处理脚本:
```
sh data_prepare.sh diginetica # or yoochoose1_4 or yoochoose1_64
```
## 实验配置
为在真实数据中复现论文中的效果,你还需要完成如下几步,PaddleRec所有配置均通过修改模型目录下的config.yaml文件完成:
1.
真实数据配置。config.yaml中数据集相关配置见
`dataset`
字段,数据路径通过
`data_path`
进行配置。用户可以直接将workspace修改为当前项目目录的绝对路径完成设置。
2.
超参配置。
-
batch_size: 修改config.yaml中dataset_train数据集的batch_size为100。
-
epochs: 修改config.yaml中runner的epochs为5。
-
sparse_feature_number: 不同训练数据集(diginetica or yoochoose)配置不一致,diginetica数据集配置为43098,yoochoose数据集配置为37484。具体见数据处理后得到的data/config.txt文件中第一行。
-
corpus_size: 不同训练数据集配置不一致,diginetica数据集配置为719470,yoochoose数据集配置为5917745。具体见数据处理后得到的data/config.txt文件中第二行。
## 训练
在完成
[
实验配置
](
#实验配置
)
后,执行如下命令完成训练:
```
python -m paddlerec.run -m ./config.yaml
```
## 测试
开始测试前,你需要完成如下几步配置:
1.
修改config.yaml中的mode,为infer_runner。
2.
修改config.yaml中的phase,为phase_infer,需按提示注释掉phase_trainer。
3.
修改config.yaml中dataset_infer数据集的batch_size为100。
完成上面三步配置后,执行如下命令完成测试:
```
python -m paddlerec.run -m ./config.yaml
```
tests/test_pairwise_pn.py
浏览文件 @
fe9cfe4e
...
...
@@ -75,10 +75,10 @@ class TestPosNegRatio(unittest.TestCase):
return_numpy
=
True
)
outs
=
dict
(
zip
(
metric_keys
,
outs
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
right_c
nt'
],
self
.
right_cnt
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
wrong_c
nt'
],
self
.
wrong_cnt
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
RightC
nt'
],
self
.
right_cnt
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
WrongC
nt'
],
self
.
wrong_cnt
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
pos_neg_ratio
'
],
np
.
allclose
(
outs
[
'
PN
'
],
np
.
array
((
self
.
right_cnt
+
1.0
)
/
(
self
.
wrong_cnt
+
1.0
))))
...
...
tests/test_precision_recall_metrics.py
浏览文件 @
fe9cfe4e
...
...
@@ -145,7 +145,7 @@ class TestPrecisionRecall(unittest.TestCase):
return_numpy
=
True
)
outs
=
dict
(
zip
(
metric_keys
,
outs
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
accum_states
'
],
self
.
states
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
[TP FP TN FN]
'
],
self
.
states
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'precision_recall_f1'
],
self
.
metrics
))
def
test_exception
(
self
):
...
...
tests/test_recall_k.py
浏览文件 @
fe9cfe4e
...
...
@@ -78,10 +78,10 @@ class TestRecallK(unittest.TestCase):
outs
=
dict
(
zip
(
metric_keys
,
outs
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
ins_c
nt'
],
self
.
ins_num
*
self
.
batch_nums
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
pos_c
nt'
],
self
.
match_num
))
np
.
allclose
(
outs
[
'
InsC
nt'
],
self
.
ins_num
*
self
.
batch_nums
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
RecallC
nt'
],
self
.
match_num
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
Recall@%d_ACC
'
%
(
self
.
topk
)],
np
.
allclose
(
outs
[
'
Acc(Recall@%d)
'
%
(
self
.
topk
)],
np
.
array
(
self
.
match_num
/
(
self
.
ins_num
*
self
.
batch_nums
))))
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录