Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleRec
提交
fe9cfe4e
P
PaddleRec
项目概览
PaddlePaddle
/
PaddleRec
通知
68
Star
12
Fork
5
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
27
列表
看板
标记
里程碑
合并请求
10
Wiki
1
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleRec
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
27
Issue
27
列表
看板
标记
里程碑
合并请求
10
合并请求
10
Pages
分析
分析
仓库分析
DevOps
Wiki
1
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
fe9cfe4e
编写于
7月 28, 2020
作者:
M
malin10
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
bug fix
上级
c34d6b53
变更
10
隐藏空白更改
内联
并排
Showing
10 changed file
with
11 addition
and
322 deletion
+11
-322
core/metric.py
core/metric.py
+3
-17
core/metrics/auc_metrics.py
core/metrics/auc_metrics.py
+0
-216
core/metrics/binary_class/auc.py
core/metrics/binary_class/auc.py
+0
-3
core/metrics/binary_class/precision_recall.py
core/metrics/binary_class/precision_recall.py
+0
-1
core/metrics/pairwise_pn.py
core/metrics/pairwise_pn.py
+0
-1
core/metrics/recall_k.py
core/metrics/recall_k.py
+1
-1
models/recall/gnn/readme.md
models/recall/gnn/readme.md
+0
-76
tests/test_pairwise_pn.py
tests/test_pairwise_pn.py
+3
-3
tests/test_precision_recall_metrics.py
tests/test_precision_recall_metrics.py
+1
-1
tests/test_recall_k.py
tests/test_recall_k.py
+3
-3
未找到文件。
core/metric.py
浏览文件 @
fe9cfe4e
...
@@ -27,12 +27,7 @@ class Metric(object):
...
@@ -27,12 +27,7 @@ class Metric(object):
pass
pass
def
clear
(
self
,
scope
=
None
):
def
clear
(
self
,
scope
=
None
):
"""
""" """
clear current value
Args:
scope: value container
params: extend varilable for clear
"""
if
scope
is
None
:
if
scope
is
None
:
scope
=
fluid
.
global_scope
()
scope
=
fluid
.
global_scope
()
...
@@ -46,11 +41,7 @@ class Metric(object):
...
@@ -46,11 +41,7 @@ class Metric(object):
var
.
set
(
data_array
,
place
)
var
.
set
(
data_array
,
place
)
def
get_global_metric
(
self
,
fleet
,
scope
,
metric_name
,
mode
=
"sum"
):
def
get_global_metric
(
self
,
fleet
,
scope
,
metric_name
,
mode
=
"sum"
):
"""
""" """
reduce metric named metric_name from all worker
Return:
metric reduce result
"""
input
=
np
.
array
(
scope
.
find_var
(
metric_name
).
get_tensor
())
input
=
np
.
array
(
scope
.
find_var
(
metric_name
).
get_tensor
())
if
fleet
is
None
:
if
fleet
is
None
:
return
input
return
input
...
@@ -63,12 +54,7 @@ class Metric(object):
...
@@ -63,12 +54,7 @@ class Metric(object):
return
output
return
output
def
cal_global_metrics
(
self
,
fleet
,
scope
=
None
):
def
cal_global_metrics
(
self
,
fleet
,
scope
=
None
):
"""
""" """
calculate result
Args:
scope: value container
params: extend varilable for clear
"""
if
scope
is
None
:
if
scope
is
None
:
scope
=
fluid
.
global_scope
()
scope
=
fluid
.
global_scope
()
...
...
core/metrics/auc_metrics.py
已删除
100755 → 0
浏览文件 @
c34d6b53
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
math
import
numpy
as
np
import
paddle.fluid
as
fluid
from
paddlerec.core.metric
import
Metric
class
AUCMetric
(
Metric
):
"""
Metric For Fluid Model
"""
def __init__(self, config, fleet):
    """Keep the metric configuration and the fleet handle on the instance.

    Args:
        config: metric configuration; stored unchanged as ``self.config``.
        fleet: fleet handle; stored unchanged as ``self.fleet``. Other
            methods of this class reach ``fleet._role_maker`` through it.
    """
    self.config, self.fleet = config, fleet
def clear(self, scope, params):
    """Zero every metric variable of the group that is present in ``scope``.

    Args:
        scope: runtime variable container; ``find_var(name)`` and
            ``var(name)`` are called on it.
        params (dict):
            'label': group name, stored as ``self._label``.
            'metric_dict': mapping of metric name to its config, stored as
                ``self._metric_dict``. Each config carries a 'var' entry
                whose ``.name`` is looked up in ``scope`` and may carry a
                'data_type' entry (``'float32'`` when absent).

    Returns:
        None
    """
    self._label = params['label']
    self._metric_dict = params['metric_dict']
    self._result = {}
    cpu_place = fluid.CPUPlace()
    for name in self._metric_dict:
        cfg = self._metric_dict[name]
        var_name = cfg['var'].name
        # Variables missing from the scope are left alone.
        if scope.find_var(var_name) is not None:
            tensor = scope.var(var_name).get_tensor()
            dtype = cfg['data_type'] if 'data_type' in cfg else 'float32'
            zeros = np.zeros(tensor._get_dims()).astype(dtype)
            tensor.set(zeros, cpu_place)
def get_metric(self, scope, metric_name):
    """Read one metric tensor from ``scope`` and reduce it over all workers.

    The tensor is flattened and handed, together with a zero-filled buffer
    of the same size, to ``self.fleet._role_maker.all_reduce_worker``; the
    buffer is then restored to the tensor's original shape.

    Args:
        scope: variable container providing ``find_var(name)``.
        metric_name: name of the variable holding the metric.

    Returns:
        The first element along axis 0 of the reshaped buffer.
    """
    local_values = np.array(scope.find_var(metric_name).get_tensor())
    original_shape = np.array(local_values.shape)
    flat_local = local_values.reshape(-1)
    # Output buffer for the reduce call, same size/dtype as the input.
    flat_global = np.copy(flat_local) * 0
    self.fleet._role_maker.all_reduce_worker(flat_local, flat_global)
    return flat_global.reshape(original_shape)[0]
def get_global_metrics(self, scope, metric_dict):
    """Reduce every metric listed in ``metric_dict`` over all workers.

    A worker barrier is issued first. Metrics whose variable cannot be
    found in ``scope`` are reported as ``None``.

    Args:
        scope: variable container providing ``find_var(name)``.
        metric_dict: mapping of metric name to a config whose 'var' entry
            has a ``.name`` attribute.

    Returns:
        dict: {metric_name: reduced value, or None when the var is absent}
    """
    self.fleet._role_maker._barrier_worker()
    reduced = {}
    for name in metric_dict:
        var_name = metric_dict[name]['var'].name
        if scope.find_var(var_name) is None:
            reduced[name] = None
        else:
            reduced[name] = self.get_metric(scope, var_name)
    return reduced
def calculate_auc(self, global_pos, global_neg):
    """Compute AUC from per-bucket positive and negative instance counts.

    Buckets are visited from the highest index down to 0 while running
    totals of positives and negatives are kept. Each bucket contributes the
    trapezoid ``(new_neg - neg) * (pos + new_pos) / 2`` to the area, which
    is finally normalised by ``pos * neg``.

    Args:
        global_pos: per-bucket positive counts (indexable, supports len()).
        global_neg: per-bucket negative counts, indexed with the same
            bucket indices as ``global_pos``.

    Returns:
        ``area / (pos * neg)``, or 0.5 when either class has no instances
        (which includes empty input). Counts are assumed non-negative.
    """
    area = 0.0
    pos = 0.0
    neg = 0.0
    for index in range(len(global_pos) - 1, -1, -1):
        new_pos = pos + global_pos[index]
        new_neg = neg + global_neg[index]
        area += (new_neg - neg) * (pos + new_pos) / 2
        pos, neg = new_pos, new_neg
    # With non-negative counts an empty data set implies pos * neg == 0, so
    # no separate total-instance counter is required.
    if pos * neg == 0:
        return 0.5
    return area / (pos * neg)
def calculate_bucket_error(self, global_pos, global_neg):
    """Compute the impression-weighted relative CTR error over buckets.

    Bucket ``i`` is given the predicted ctr ``i / num_bucket``. Consecutive
    buckets are merged into a span until the span's statistical relative
    error ``sqrt((1 - adjust_ctr) / (adjust_ctr * impressions))`` drops
    below ``k_relative_error_bound``; the span then contributes
    ``abs(actual_ctr / adjust_ctr - 1)`` weighted by its impressions. A span
    is restarted once the bucket ctr has moved more than ``k_max_span``
    away from the ctr at which the span started.

    Args:
        global_pos: per-bucket click (positive) counts.
        global_neg: per-bucket non-click (negative) counts, indexed with the
            same bucket indices as ``global_pos``.

    Returns:
        float: ``error_sum / error_count``, or 0.0 when no span qualified
        (which includes empty input).
    """
    k_max_span = 0.01
    k_relative_error_bound = 0.05

    num_bucket = len(global_pos)
    last_ctr = -1.0
    impression_sum = 0.0
    ctr_sum = 0.0
    click_sum = 0.0
    error_sum = 0.0
    error_count = 0.0

    for i in range(num_bucket):
        click = global_pos[i]
        show = global_pos[i] + global_neg[i]
        ctr = float(i) / num_bucket
        if abs(ctr - last_ctr) > k_max_span:
            # Start a new span at this bucket.
            last_ctr = ctr
            impression_sum = 0.0
            ctr_sum = 0.0
            click_sum = 0.0
        impression_sum += show
        ctr_sum += ctr * show
        click_sum += click
        if impression_sum == 0:
            continue
        adjust_ctr = ctr_sum / impression_sum
        if adjust_ctr == 0:
            continue
        relative_error = math.sqrt(
            (1 - adjust_ctr) / (adjust_ctr * impression_sum))
        if relative_error < k_relative_error_bound:
            actual_ctr = click_sum / impression_sum
            relative_ctr_error = abs(actual_ctr / adjust_ctr - 1)
            error_sum += relative_ctr_error * impression_sum
            error_count += impression_sum
            # Force the next bucket to open a fresh span.
            last_ctr = -1

    return error_sum / error_count if error_count > 0 else 0.0
def calculate(self, scope, params):
    """Reduce the group's metrics over all workers and derive final values.

    Args:
        scope: variable container forwarded to ``get_global_metrics``.
        params (dict):
            'label': group name, stored as ``self._label``.
            'metric_dict': metric configs, stored as ``self._metric_dict``.

    Returns:
        dict: the reduced raw metrics plus whichever derived entries could
        be computed from them ('auc', 'bucket_error', 'actual_ctr', 'mae',
        'rmse', 'predict_ctr', 'copc', 'mean_q'). When 'total_ins_num' is 0
        all derived entries are set to 0. The same dict is stored as
        ``self._result``.

    Raises:
        KeyError: if the reduced metrics contain no 'total_ins_num' entry.
    """
    self._label = params['label']
    self._metric_dict = params['metric_dict']
    # NOTE: get_global_metrics issues a second barrier of its own; both are
    # kept so the sequence of collective calls is unchanged.
    self.fleet._role_maker._barrier_worker()
    result = self.get_global_metrics(scope, self._metric_dict)
    self._result = result

    if result['total_ins_num'] == 0:
        for key in ('auc', 'bucket_error', 'actual_ctr', 'predict_ctr',
                    'mae', 'rmse', 'copc', 'mean_q'):
            result[key] = 0
        return result

    total_ins_num = result['total_ins_num']
    if 'stat_pos' in result and 'stat_neg' in result:
        result['auc'] = self.calculate_auc(result['stat_pos'],
                                           result['stat_neg'])
        # Fixed: this entry used to be filled by calculate_auc as well.
        result['bucket_error'] = self.calculate_bucket_error(
            result['stat_pos'], result['stat_neg'])
    if 'pos_ins_num' in result:
        result['actual_ctr'] = result['pos_ins_num'] / total_ins_num
    if 'abserr' in result:
        result['mae'] = result['abserr'] / total_ins_num
    if 'sqrerr' in result:
        result['rmse'] = math.sqrt(result['sqrerr'] / total_ins_num)
    if 'prob' in result:
        result['predict_ctr'] = result['prob'] / total_ins_num
        # copc needs actual_ctr, which only exists when 'pos_ins_num' was
        # reduced; skip it instead of raising KeyError.
        if 'actual_ctr' in result and abs(result['predict_ctr']) > 1e-6:
            result['copc'] = result['actual_ctr'] / result['predict_ctr']
    if 'q' in result:
        result['mean_q'] = result['q'] / total_ins_num
    return result
def get_result(self):
    """Return the result dict stored by the last ``clear``/``calculate``."""
    stored = self._result
    return stored
def __str__(self):
    """Format the group label and the stored result as a one-line summary.

    Every key listed in ``metric_keys`` must be present in the result dict
    returned by ``get_result``.
    """
    res = self.get_result()
    template = (
        "%s AUC=%.6f BUCKET_ERROR=%.6f MAE=%.6f RMSE=%.6f "
        "Actural_CTR=%.6f Predicted_CTR=%.6f COPC=%.6f MEAN Q_VALUE=%.6f Ins number=%s"
    )
    metric_keys = ('auc', 'bucket_error', 'mae', 'rmse', 'actual_ctr',
                   'predict_ctr', 'copc', 'mean_q', 'total_ins_num')
    values = (self._label, ) + tuple(res[key] for key in metric_keys)
    return template % values
core/metrics/binary_class/auc.py
浏览文件 @
fe9cfe4e
...
@@ -18,9 +18,6 @@ import numpy as np
...
@@ -18,9 +18,6 @@ import numpy as np
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
from
paddlerec.core.metric
import
Metric
from
paddlerec.core.metric
import
Metric
from
paddle.fluid.layers
import
nn
,
accuracy
from
paddle.fluid.initializer
import
Constant
from
paddle.fluid.layer_helper
import
LayerHelper
from
paddle.fluid.layers.tensor
import
Variable
from
paddle.fluid.layers.tensor
import
Variable
...
...
core/metrics/binary_class/precision_recall.py
浏览文件 @
fe9cfe4e
...
@@ -18,7 +18,6 @@ import numpy as np
...
@@ -18,7 +18,6 @@ import numpy as np
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
from
paddlerec.core.metric
import
Metric
from
paddlerec.core.metric
import
Metric
from
paddle.fluid.layers
import
nn
,
accuracy
from
paddle.fluid.initializer
import
Constant
from
paddle.fluid.initializer
import
Constant
from
paddle.fluid.layer_helper
import
LayerHelper
from
paddle.fluid.layer_helper
import
LayerHelper
from
paddle.fluid.layers.tensor
import
Variable
from
paddle.fluid.layers.tensor
import
Variable
...
...
core/metrics/pairwise_pn.py
浏览文件 @
fe9cfe4e
...
@@ -18,7 +18,6 @@ import numpy as np
...
@@ -18,7 +18,6 @@ import numpy as np
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
from
paddlerec.core.metric
import
Metric
from
paddlerec.core.metric
import
Metric
from
paddle.fluid.layers
import
nn
,
accuracy
from
paddle.fluid.initializer
import
Constant
from
paddle.fluid.initializer
import
Constant
from
paddle.fluid.layer_helper
import
LayerHelper
from
paddle.fluid.layer_helper
import
LayerHelper
from
paddle.fluid.layers.tensor
import
Variable
from
paddle.fluid.layers.tensor
import
Variable
...
...
core/metrics/recall_k.py
浏览文件 @
fe9cfe4e
...
@@ -18,7 +18,7 @@ import numpy as np
...
@@ -18,7 +18,7 @@ import numpy as np
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
from
paddlerec.core.metric
import
Metric
from
paddlerec.core.metric
import
Metric
from
paddle.fluid.layers
import
nn
,
accuracy
from
paddle.fluid.layers
import
accuracy
from
paddle.fluid.initializer
import
Constant
from
paddle.fluid.initializer
import
Constant
from
paddle.fluid.layer_helper
import
LayerHelper
from
paddle.fluid.layer_helper
import
LayerHelper
from
paddle.fluid.layers.tensor
import
Variable
from
paddle.fluid.layers.tensor
import
Variable
...
...
models/recall/gnn/readme.md
已删除
100644 → 0
浏览文件 @
c34d6b53
# GNN
## 快速开始
PaddleRec中每个内置模型都配备了对应的样例数据,用户可基于该数据集快速对模型、环境进行验证,从而降低后续的调试成本。在内置数据集上进行训练的命令为:
```
python -m paddlerec.run -m paddlerec.models.recall.gnn
```
## 数据处理
-
Step1: 原始数据集下载,本示例提供了两个开源数据集:DIGINETICA和Yoochoose,可选其中任意一个训练本模型。
```
cd data && python download.py diginetica # or yoochoose
```
> [Yoochooses](https://2015.recsyschallenge.com/challenge.html)数据集来源于RecSys Challenge 2015,原始数据包含如下字段:
1.
Session ID – the id of the session. In one session there are one or many clicks.
2.
Timestamp – the time when the click occurred.
3.
Item ID – the unique identifier of the item.
4.
Category – the category of the item.
> [DIGINETICA](https://competitions.codalab.org/competitions/11161#learn_the_details-data2)数据集来源于CIKM Cup 2016 _Personalized E-Commerce Search Challenge_项目。原始数据包含如下字段:
1. sessionId - the id of the session. In one session there are one or many clicks.
2. userId - the id of the user, with anonymized user ids.
3. itemId - the unique identifier of the item.
4. timeframe - time since the first query in a session, in milliseconds.
5. eventdate - calendar date.
-
Step2: 数据预处理
```
cd data && python preprocess.py --dataset diginetica # or yoochoose
```
1.
以session_id为key合并原始数据集,得到每个session的日期,及顺序点击列表。
2.
过滤掉长度为1的session;过滤掉点击次数小于5的items。
3.
训练集、测试集划分。原始数据集里最新日期七天内的作为测试集,更早之前的数据作为训练集。
-
Step3: 数据整理。 将训练文件统一放在data/train目录下,测试文件统一放在data/test目录下。
```
cat data/diginetica/train.txt | wc -l >> data/config.txt # or yoochoose1_4 or yoochoose1_64
rm -rf data/train/*
rm -rf data/test/*
mv data/diginetica/train.txt data/train
mv data/diginetica/test.txt data/test
```
数据处理完成后,data/train目录存放训练数据,data/test目录下存放测试数据,data/config.txt中存放数据统计信息,用以配置模型超参。
方便起见, 我们提供了一键式数据处理脚本:
```
sh data_prepare.sh diginetica # or yoochoose1_4 or yoochoose1_64
```
## 实验配置
为在真实数据中复现论文中的效果,你还需要完成如下几步,PaddleRec所有配置均通过修改模型目录下的config.yaml文件完成:
1.
真实数据配置。config.yaml中数据集相关配置见
`dataset`
字段,数据路径通过
`data_path`
进行配置。用户可以直接将workspace修改为当前项目目录的绝对路径完成设置。
2.
超参配置。
-
batch_size: 修改config.yaml中dataset_train数据集的batch_size为100。
-
epochs: 修改config.yaml中runner的epochs为5。
-
sparse_feature_number: 不同训练数据集(diginetica or yoochoose)配置不一致,diginetica数据集配置为43098,yoochoose数据集配置为37484。具体见数据处理后得到的data/config.txt文件中第一行。
-
corpus_size: 不同训练数据集配置不一致,diginetica数据集配置为719470,yoochoose数据集配置为5917745。具体见数据处理后得到的data/config.txt文件中第二行。
## 训练
在完成
[
实验配置
](
#实验配置
)
后,执行如下命令完成训练:
```
python -m paddlerec.run -m ./config.yaml
```
## 测试
开始测试前,你需要完成如下几步配置:
1.
修改config.yaml中的mode,为infer_runner。
2.
修改config.yaml中的phase,为phase_infer,需按提示注释掉phase_trainer。
3.
修改config.yaml中dataset_infer数据集的batch_size为100。
完成上面三步配置后,执行如下命令完成测试:
```
python -m paddlerec.run -m ./config.yaml
```
tests/test_pairwise_pn.py
浏览文件 @
fe9cfe4e
...
@@ -75,10 +75,10 @@ class TestPosNegRatio(unittest.TestCase):
...
@@ -75,10 +75,10 @@ class TestPosNegRatio(unittest.TestCase):
return_numpy
=
True
)
return_numpy
=
True
)
outs
=
dict
(
zip
(
metric_keys
,
outs
))
outs
=
dict
(
zip
(
metric_keys
,
outs
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
right_c
nt'
],
self
.
right_cnt
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
RightC
nt'
],
self
.
right_cnt
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
wrong_c
nt'
],
self
.
wrong_cnt
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
WrongC
nt'
],
self
.
wrong_cnt
))
self
.
assertTrue
(
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
pos_neg_ratio
'
],
np
.
allclose
(
outs
[
'
PN
'
],
np
.
array
((
self
.
right_cnt
+
1.0
)
/
(
self
.
wrong_cnt
+
1.0
np
.
array
((
self
.
right_cnt
+
1.0
)
/
(
self
.
wrong_cnt
+
1.0
))))
))))
...
...
tests/test_precision_recall_metrics.py
浏览文件 @
fe9cfe4e
...
@@ -145,7 +145,7 @@ class TestPrecisionRecall(unittest.TestCase):
...
@@ -145,7 +145,7 @@ class TestPrecisionRecall(unittest.TestCase):
return_numpy
=
True
)
return_numpy
=
True
)
outs
=
dict
(
zip
(
metric_keys
,
outs
))
outs
=
dict
(
zip
(
metric_keys
,
outs
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
accum_states
'
],
self
.
states
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
[TP FP TN FN]
'
],
self
.
states
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'precision_recall_f1'
],
self
.
metrics
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'precision_recall_f1'
],
self
.
metrics
))
def
test_exception
(
self
):
def
test_exception
(
self
):
...
...
tests/test_recall_k.py
浏览文件 @
fe9cfe4e
...
@@ -78,10 +78,10 @@ class TestRecallK(unittest.TestCase):
...
@@ -78,10 +78,10 @@ class TestRecallK(unittest.TestCase):
outs
=
dict
(
zip
(
metric_keys
,
outs
))
outs
=
dict
(
zip
(
metric_keys
,
outs
))
self
.
assertTrue
(
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
ins_c
nt'
],
self
.
ins_num
*
self
.
batch_nums
))
np
.
allclose
(
outs
[
'
InsC
nt'
],
self
.
ins_num
*
self
.
batch_nums
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
pos_c
nt'
],
self
.
match_num
))
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
RecallC
nt'
],
self
.
match_num
))
self
.
assertTrue
(
self
.
assertTrue
(
np
.
allclose
(
outs
[
'
Recall@%d_ACC
'
%
(
self
.
topk
)],
np
.
allclose
(
outs
[
'
Acc(Recall@%d)
'
%
(
self
.
topk
)],
np
.
array
(
self
.
match_num
/
(
self
.
ins_num
*
np
.
array
(
self
.
match_num
/
(
self
.
ins_num
*
self
.
batch_nums
))))
self
.
batch_nums
))))
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录