Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PGL
提交
752b6169
P
PGL
项目概览
PaddlePaddle
/
PGL
通知
76
Star
4
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
11
列表
看板
标记
里程碑
合并请求
1
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PGL
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
11
Issue
11
列表
看板
标记
里程碑
合并请求
1
合并请求
1
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
752b6169
编写于
2月 14, 2020
作者:
L
liweibin
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add ogb PropPredDataset for pgl
上级
b46b2b1a
变更
11
隐藏空白更改
内联
并排
Showing
11 changed file
with
959 addition
and
0 deletion
+959
-0
ogb_examples/linkproppred/main_pgl.py
ogb_examples/linkproppred/main_pgl.py
+208
-0
ogb_examples/nodeproppred/main_pgl.py
ogb_examples/nodeproppred/main_pgl.py
+176
-0
pgl/contrib/ogb/__init__.py
pgl/contrib/ogb/__init__.py
+13
-0
pgl/contrib/ogb/graphproppred/__init__.py
pgl/contrib/ogb/graphproppred/__init__.py
+14
-0
pgl/contrib/ogb/graphproppred/dataset_pgl.py
pgl/contrib/ogb/graphproppred/dataset_pgl.py
+152
-0
pgl/contrib/ogb/io/__init__.py
pgl/contrib/ogb/io/__init__.py
+15
-0
pgl/contrib/ogb/io/read_graph_pgl.py
pgl/contrib/ogb/io/read_graph_pgl.py
+49
-0
pgl/contrib/ogb/linkproppred/__init__.py
pgl/contrib/ogb/linkproppred/__init__.py
+15
-0
pgl/contrib/ogb/linkproppred/dataset_pgl.py
pgl/contrib/ogb/linkproppred/dataset_pgl.py
+149
-0
pgl/contrib/ogb/nodeproppred/__init__.py
pgl/contrib/ogb/nodeproppred/__init__.py
+15
-0
pgl/contrib/ogb/nodeproppred/dataset_pgl.py
pgl/contrib/ogb/nodeproppred/dataset_pgl.py
+153
-0
未找到文件。
ogb_examples/linkproppred/main_pgl.py
0 → 100644
浏览文件 @
752b6169
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""test ogb
"""
import
argparse
import
pgl
import
numpy
as
np
import
paddle.fluid
as
fluid
from
pgl.contrib.ogb.linkproppred.dataset_pgl
import
PglLinkPropPredDataset
from
pgl.utils
import
paddle_helper
from
ogb.linkproppred
import
Evaluator
def send_func(src_feat, dst_feat, edge_feat):
    """Message function for graph.send: forward the source node state "h" unchanged."""
    message = src_feat["h"]
    return message
def recv_func(feat):
    """Reduce function for graph.recv: sum-pool all incoming messages per node."""
    pooled = fluid.layers.sequence_pool(feat, pool_type="sum")
    return pooled
class GNNModel(object):
    """GCN-style link-prediction model built on Paddle fluid static graphs.

    Learns a free embedding per node, propagates it `num_layers` rounds over
    the graph, then scores a candidate edge (src, dst) with a linear layer
    over the element-wise product of the two endpoint states.
    """

    def __init__(self, name, num_nodes, emb_dim, num_layers):
        # name: prefix for every parameter, so multiple models can coexist.
        # num_nodes: total node count (rows of the embedding table).
        # emb_dim: embedding / hidden width.
        # num_layers: number of message-passing rounds.
        self.num_nodes = num_nodes
        self.emb_dim = emb_dim
        self.num_layers = num_layers
        self.name = name
        # Feed slots for the edge batch to score and its binary labels.
        self.src_nodes = fluid.layers.data(
            name='src_nodes',
            shape=[None, 1],
            dtype='int64', )
        self.dst_nodes = fluid.layers.data(
            name='dst_nodes',
            shape=[None, 1],
            dtype='int64', )
        self.edge_label = fluid.layers.data(
            name='edge_label',
            shape=[None, 1],
            dtype='float32', )

    def forward(self, graph):
        """Build the forward pass; returns (logits, sigmoid probs, mean loss)."""
        # Node embeddings learned from scratch (no input features used).
        h = fluid.layers.create_parameter(
            shape=[self.num_nodes, self.emb_dim],
            dtype="float32",
            name=self.name + "_embedding")
        # edge_attr = fluid.layers.fc(graph.edge_feat["feat"], size=self.emb_dim)
        for layer in range(self.num_layers):
            # Each node sums its neighbours' current states (send_func/recv_func).
            msg = graph.send(
                send_func, nfeat_list=[("h", h)], )
            h = graph.recv(msg, recv_func)
            # Per-layer linear projection; bias added separately below.
            h = fluid.layers.fc(
                h,
                size=self.emb_dim,
                bias_attr=False,
                param_attr=fluid.ParamAttr(name=self.name + '_%s' % layer))
            # Degree normalisation; "norm" (= indegree^-0.5) is set up in main().
            h = h * graph.node_feat["norm"]
            bias = fluid.layers.create_parameter(
                shape=[self.emb_dim],
                dtype='float32',
                is_bias=True,
                name=self.name + '_bias_%s' % layer)
            h = fluid.layers.elementwise_add(h, bias, act="relu")
        # Score edges from the element-wise product of their endpoint states.
        src = fluid.layers.gather(h, self.src_nodes)
        dst = fluid.layers.gather(h, self.dst_nodes)
        edge_embed = src * dst
        pred = fluid.layers.fc(input=edge_embed,
                               size=1,
                               name=self.name + "_pred_output")
        prob = fluid.layers.sigmoid(pred)
        loss = fluid.layers.sigmoid_cross_entropy_with_logits(pred,
                                                              self.edge_label)
        loss = fluid.layers.reduce_mean(loss)
        return pred, prob, loss
def main():
    """Train and evaluate the link-prediction GNN on an OGB dataset.

    Loads the dataset, builds the static fluid programs, runs full-batch
    training, and reports the OGB evaluator metrics on the valid/test edge
    splits after every epoch.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='Graph Dataset')
    parser.add_argument(
        '--epochs',
        type=int,
        default=100,
        help='number of epochs to train (default: 100)')
    parser.add_argument(
        '--dataset',
        type=str,
        default="ogbl-ppa",
        help='dataset name (default: protein protein associations)')
    args = parser.parse_args()

    #place = fluid.CUDAPlace(0)
    place = fluid.CPUPlace()  # Dataset too big to use GPU

    ### automatic dataloading and splitting
    print("loadding dataset")
    dataset = PglLinkPropPredDataset(name=args.dataset)
    splitted_edge = dataset.get_edge_split()
    print(splitted_edge['train_edge'].shape)
    print(splitted_edge['train_edge_label'].shape)

    print("building evaluator")
    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    graph_data = dataset[0]
    print("num_nodes: %d" % graph_data.num_nodes)

    train_program = fluid.Program()
    startup_program = fluid.Program()
    test_program = fluid.Program()

    # degree normalize: norm = indegree^-0.5 for nodes with edges, else 0;
    # consumed by GNNModel.forward as node_feat["norm"].
    indegree = graph_data.indegree()
    norm = np.zeros_like(indegree, dtype="float32")
    norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
    graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32")

    with fluid.program_guard(train_program, startup_program):
        model = GNNModel(
            name="gnn",
            num_nodes=graph_data.num_nodes,
            emb_dim=64,
            num_layers=2)
        gw = pgl.graph_wrapper.GraphWrapper(
            "graph",
            place,
            node_feat=graph_data.node_feat_info(),
            edge_feat=graph_data.edge_feat_info())
        pred, prob, loss = model.forward(gw)

    # Clone BEFORE adding the optimizer so validation shares the forward pass
    # but not the backward ops.
    val_program = train_program.clone(for_test=True)

    with fluid.program_guard(train_program, startup_program):
        adam = fluid.optimizer.Adam(
            learning_rate=1e-2,
            regularization=fluid.regularizer.L2DecayRegularizer(
                regularization_coeff=0.0005))
        adam.minimize(loss)

    exe = fluid.Executor(place)
    exe.run(startup_program)
    feed = gw.to_feed(graph_data)

    for epoch in range(1, args.epochs + 1):
        # Full-batch training on all train edges.
        feed['src_nodes'] = splitted_edge["train_edge"][:, 0].reshape(-1, 1)
        feed['dst_nodes'] = splitted_edge["train_edge"][:, 1].reshape(-1, 1)
        feed['edge_label'] = splitted_edge["train_edge_label"].astype(
            "float32").reshape(-1, 1)
        res_loss, y_pred = exe.run(train_program,
                                   feed=feed,
                                   fetch_list=[loss, prob])
        print("Loss %s" % res_loss[0])

        result = {}
        print("Evaluating...")
        # Validation split.
        feed['src_nodes'] = splitted_edge["valid_edge"][:, 0].reshape(-1, 1)
        feed['dst_nodes'] = splitted_edge["valid_edge"][:, 1].reshape(-1, 1)
        feed['edge_label'] = splitted_edge["valid_edge_label"].astype(
            "float32").reshape(-1, 1)
        y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
        input_dict = {
            "y_true": splitted_edge["valid_edge_label"],
            "y_pred": y_pred.reshape(-1, ),
        }
        result["valid"] = evaluator.eval(input_dict)

        # Test split.
        feed['src_nodes'] = splitted_edge["test_edge"][:, 0].reshape(-1, 1)
        feed['dst_nodes'] = splitted_edge["test_edge"][:, 1].reshape(-1, 1)
        feed['edge_label'] = splitted_edge["test_edge_label"].astype(
            "float32").reshape(-1, 1)
        y_pred = exe.run(val_program, feed=feed, fetch_list=[prob])[0]
        input_dict = {
            "y_true": splitted_edge["test_edge_label"],
            "y_pred": y_pred.reshape(-1, ),
        }
        result["test"] = evaluator.eval(input_dict)
        print(result)
# Script entry point.
if __name__ == "__main__":
    main()
ogb_examples/nodeproppred/main_pgl.py
0 → 100644
浏览文件 @
752b6169
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""test ogb
"""
import
argparse
import
pgl
import
numpy
as
np
import
paddle.fluid
as
fluid
from
pgl.contrib.ogb.nodeproppred.dataset_pgl
import
PglNodePropPredDataset
from
pgl.utils
import
paddle_helper
from
ogb.nodeproppred
import
Evaluator
def train():
    """Unused placeholder; training is currently performed inline in main()."""
    pass
def send_func(src_feat, dst_feat, edge_feat):
    """Message function: node state plus edge feature, scaled by the source norm."""
    combined = src_feat["h"] + edge_feat["h"]
    return combined * src_feat["norm"]
class GNNModel(object):
    """GCN-style node-property prediction model over a static PGL graph."""

    def __init__(self, name, emb_dim, num_task, num_layers):
        # name: parameter-name prefix.
        # num_task: output width of the final FC (one logit per task).
        self.num_task = num_task
        self.emb_dim = emb_dim
        self.num_layers = num_layers
        self.name = name

    def forward(self, graph):
        # node_feat["x"] is all zeros (filled in main()), so this embedding
        # acts as one shared learnable initial node state; vocab size 2 suffices.
        h = fluid.layers.embedding(
            graph.node_feat["x"], size=(2, self.emb_dim))
        # name=self.name + "_embedding")
        # Project raw edge features to the hidden width once, reused each layer.
        edge_attr = fluid.layers.fc(graph.edge_feat["feat"], size=self.emb_dim)
        for layer in range(self.num_layers):
            msg = graph.send(
                send_func,
                nfeat_list=[("h", h), ("norm", graph.node_feat["norm"])],
                efeat_list=[("h", edge_attr)])
            h = graph.recv(msg, "sum")
            h = fluid.layers.fc(
                h,
                size=self.emb_dim,
                bias_attr=False,
                param_attr=fluid.ParamAttr(name=self.name + '_%s' % layer))
            # Other half of the symmetric degree normalisation
            # (send_func already multiplied by the source norm).
            h = h * graph.node_feat["norm"]
            bias = fluid.layers.create_parameter(
                shape=[self.emb_dim],
                dtype='float32',
                is_bias=True,
                name=self.name + '_bias_%s' % layer)
            h = fluid.layers.elementwise_add(h, bias, act="relu")
        # Raw logits; sigmoid / loss are applied by the caller.
        pred = fluid.layers.fc(h,
                               self.num_task,
                               act=None,
                               name=self.name + "_pred_output")
        return pred
def main():
    """Train and evaluate the node-property GNN on an OGB dataset.

    Builds the graph and programs once, embeds the split indices and labels
    as constants (full-batch training, empty feed dict), then reports the OGB
    evaluator metrics on train/valid/test after every epoch.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='Graph Dataset')
    parser.add_argument(
        '--epochs',
        type=int,
        default=100,
        help='number of epochs to train (default: 100)')
    parser.add_argument(
        '--dataset',
        type=str,
        default="ogbn-proteins",
        help='dataset name (default: proteinfunc)')
    args = parser.parse_args()

    #device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    #place = fluid.CUDAPlace(0)
    place = fluid.CPUPlace()  # Dataset too big to use GPU

    ### automatic dataloading and splitting
    dataset = PglNodePropPredDataset(name=args.dataset)
    splitted_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    graph_data, label = dataset[0]

    train_program = fluid.Program()
    startup_program = fluid.Program()
    test_program = fluid.Program()

    # degree normalize: norm = indegree^-0.5 for connected nodes, else 0.
    indegree = graph_data.indegree()
    norm = np.zeros_like(indegree, dtype="float32")
    norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
    graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32")
    # Dummy all-zero node input consumed by the model's embedding lookup.
    graph_data.node_feat["x"] = np.zeros((len(indegree), 1), dtype="int64")
    graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype(
        "float32")

    model = GNNModel(
        name="gnn", num_task=dataset.num_tasks, emb_dim=64, num_layers=2)

    with fluid.program_guard(train_program, startup_program):
        # StaticGraphWrapper bakes the graph into the program: no per-step feed.
        gw = pgl.graph_wrapper.StaticGraphWrapper("graph", graph_data, place)
        pred = model.forward(gw)
        sigmoid_pred = fluid.layers.sigmoid(pred)

    # Clone BEFORE adding constants/optimizer so validation is forward-only.
    val_program = train_program.clone(for_test=True)

    initializer = []
    with fluid.program_guard(train_program, startup_program):
        # Bake the train split indices and labels in as program constants.
        train_node_index, init = paddle_helper.constant(
            "train_node_index", dtype="int64", value=splitted_idx["train"])
        initializer.append(init)

        train_node_label, init = paddle_helper.constant(
            "train_node_label",
            dtype="float32",
            value=label[splitted_idx["train"]].astype("float32"))
        initializer.append(init)

        train_pred_t = fluid.layers.gather(pred, train_node_index)
        train_loss_t = fluid.layers.sigmoid_cross_entropy_with_logits(
            x=train_pred_t, label=train_node_label)
        train_loss_t = fluid.layers.reduce_sum(train_loss_t)
        train_pred_t = fluid.layers.sigmoid(train_pred_t)

        adam = fluid.optimizer.Adam(
            learning_rate=1e-2,
            regularization=fluid.regularizer.L2DecayRegularizer(
                regularization_coeff=0.0005))
        adam.minimize(train_loss_t)

    exe = fluid.Executor(place)
    exe.run(startup_program)
    gw.initialize(place)
    # Run the deferred constant initializers created above.
    for init in initializer:
        init(place)

    for epoch in range(1, args.epochs + 1):
        loss = exe.run(train_program, feed={}, fetch_list=[train_loss_t])
        print("Loss %s" % loss[0])
        print("Evaluating...")
        y_pred = exe.run(val_program, feed={}, fetch_list=[sigmoid_pred])[0]
        result = {}
        input_dict = {
            "y_true": label[splitted_idx["train"]],
            "y_pred": y_pred[splitted_idx["train"]]
        }
        result["train"] = evaluator.eval(input_dict)
        input_dict = {
            "y_true": label[splitted_idx["valid"]],
            "y_pred": y_pred[splitted_idx["valid"]]
        }
        result["valid"] = evaluator.eval(input_dict)
        input_dict = {
            "y_true": label[splitted_idx["test"]],
            "y_pred": y_pred[splitted_idx["test"]]
        }
        result["test"] = evaluator.eval(input_dict)
        print(result)
# Script entry point.
if __name__ == "__main__":
    main()
pgl/contrib/ogb/__init__.py
0 → 100644
浏览文件 @
752b6169
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pgl/contrib/ogb/graphproppred/__init__.py
0 → 100644
浏览文件 @
752b6169
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""__init__.py"""
pgl/contrib/ogb/graphproppred/dataset_pgl.py
0 → 100644
浏览文件 @
752b6169
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PglGraphPropPredDataset
"""
import
pandas
as
pd
import
shutil
,
os
import
os.path
as
osp
import
numpy
as
np
from
ogb.utils.url
import
decide_download
,
download_url
,
extract_zip
from
ogb.graphproppred
import
make_master_file
from
pgl.contrib.ogb.io.read_graph_pgl
import
read_csv_graph_pgl
def to_bool(value):
    """Coerce a scalar to a numpy boolean scalar."""
    as_bool_array = np.array([value], dtype="bool")
    return as_bool_array[0]
class PglGraphPropPredDataset(object):
    """OGB graph-property-prediction dataset exposed as PGL graphs.

    Downloads and extracts the raw OGB archive on first use, then parses the
    CSVs into a list of `pgl.graph.Graph` plus a label matrix.
    """

    def __init__(self, name, root="dataset"):
        """
        Args:
            name: original OGB dataset name, e.g., ogbg-mol-tox21
            root: directory that holds (or will receive) the data
        Raises:
            ValueError: if `name` is not a known OGB dataset.
        """
        self.name = name  ## original name, e.g., ogbg-mol-tox21
        # Replace hyphen with underscore, e.g., ogbg_mol_tox21_pgl.
        self.dir_name = "_".join(name.split("-")) + "_pgl"
        self.original_root = root
        self.root = osp.join(root, self.dir_name)
        # Master metadata table shipped with ogb (formerly read from master.csv).
        self.meta_info = make_master_file.df
        #pd.read_csv(
        #os.path.join(os.path.dirname(__file__), "master.csv"), index_col=0)
        if self.name not in self.meta_info:
            print(self.name)
            error_mssg = "Invalid dataset name {}.\n".format(self.name)
            error_mssg += "Available datasets are as follows:\n"
            error_mssg += "\n".join(self.meta_info.keys())
            raise ValueError(error_mssg)
        # Name of the downloaded archive folder, e.g., tox21.
        self.download_name = self.meta_info[self.name]["download_name"]
        self.num_tasks = int(self.meta_info[self.name]["num tasks"])
        self.task_type = self.meta_info[self.name]["task type"]
        super(PglGraphPropPredDataset, self).__init__()
        self.pre_process()

    def pre_process(self):
        """Download the archive if needed and parse the CSVs into PGL graphs."""
        processed_dir = osp.join(self.root, 'processed')
        raw_dir = osp.join(self.root, 'raw')
        pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')

        if os.path.exists(pre_processed_file_path):
            # TODO: Load Preprocessed (nothing is saved yet, so this branch
            # is currently never taken).
            pass
        else:
            ### download
            url = self.meta_info[self.name]["url"]
            if decide_download(url):
                path = download_url(url, self.original_root)
                extract_zip(path, self.original_root)
                os.unlink(path)
                # Delete a stale extraction target if one exists; a missing
                # directory is fine (narrowed from a bare `except:` that
                # could also hide KeyboardInterrupt etc.).
                try:
                    shutil.rmtree(self.root)
                except OSError:
                    pass
                shutil.move(
                    osp.join(self.original_root, self.download_name),
                    self.root)
            else:
                print("Stop download.")
                exit(-1)

            ### preprocess
            add_inverse_edge = to_bool(
                self.meta_info[self.name]["add_inverse_edge"])
            self.graphs = read_csv_graph_pgl(
                raw_dir, add_inverse_edge=add_inverse_edge)
            self.graphs = np.array(self.graphs)
            self.labels = np.array(
                pd.read_csv(
                    osp.join(raw_dir, "graph-label.csv.gz"),
                    compression="gzip",
                    header=None).values)
            # TODO: Load Graph
            ### load preprocessed files

    def get_idx_split(self):
        """Return {"train", "valid", "test"} index arrays from the split CSVs."""
        split_type = self.meta_info[self.name]["split"]
        path = osp.join(self.root, "split", split_type)
        train_idx = pd.read_csv(
            osp.join(path, "train.csv.gz"), compression="gzip",
            header=None).values.T[0]
        valid_idx = pd.read_csv(
            osp.join(path, "valid.csv.gz"), compression="gzip",
            header=None).values.T[0]
        test_idx = pd.read_csv(
            osp.join(path, "test.csv.gz"), compression="gzip",
            header=None).values.T[0]
        return {
            "train": np.array(
                train_idx, dtype="int64"),
            "valid": np.array(
                valid_idx, dtype="int64"),
            "test": np.array(
                test_idx, dtype="int64")
        }

    def __getitem__(self, idx):
        """Return (graph(s), label(s)) for an integer index, slice or index array."""
        return self.graphs[idx], self.labels[idx]

    def __len__(self):
        """Length of the dataset
        Returns
        -------
        int
            Length of Dataset
        """
        return len(self.graphs)

    def __repr__(self):  # pragma: no cover
        return '{}({})'.format(self.__class__.__name__, len(self))
# Manual smoke test: build the bace dataset and print a slice of it.
if __name__ == "__main__":
    pgl_dataset = PglGraphPropPredDataset(name="ogbg-mol-bace")
    splitted_index = pgl_dataset.get_idx_split()
    print(pgl_dataset)
    print(pgl_dataset[3:20])
    #print(pgl_dataset[splitted_index["train"]])
    #print(pgl_dataset[splitted_index["valid"]])
    #print(pgl_dataset[splitted_index["test"]])
pgl/contrib/ogb/io/__init__.py
0 → 100644
浏览文件 @
752b6169
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""__init__.py
"""
pgl/contrib/ogb/io/read_graph_pgl.py
0 → 100644
浏览文件 @
752b6169
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""pgl read_csv_graph for ogb
"""
import
pandas
as
pd
import
os.path
as
osp
import
numpy
as
np
import
pgl
from
ogb.io.read_graph_raw
import
read_csv_graph_raw
def read_csv_graph_pgl(raw_dir, add_inverse_edge=False):
    """Read raw OGB CSV data and convert every graph into a pgl.graph.Graph.

    Args:
        raw_dir: directory holding the raw OGB CSV files.
        add_inverse_edge: whether the raw reader should add reversed edges.

    Returns:
        list of pgl.graph.Graph, one per graph in the raw data.
    """
    raw_graphs = read_csv_graph_raw(raw_dir, add_inverse_edge)
    converted = []
    for raw in raw_graphs:
        src, dst = raw["edge_index"][0], raw["edge_index"][1]
        pgl_g = pgl.graph.Graph(
            num_nodes=raw["num_nodes"], edges=list(zip(src, dst)))
        # Attach optional feature matrices when the raw data provides them.
        if raw["edge_feat"] is not None:
            pgl_g.edge_feat["feat"] = raw["edge_feat"]
        if raw["node_feat"] is not None:
            pgl_g.node_feat["feat"] = raw["node_feat"]
        converted.append(pgl_g)
    return converted
# Manual smoke test: parse a locally downloaded dataset and print the graphs.
if __name__ == "__main__":
    # graph_list = read_csv_graph_dgl('dataset/proteinfunc_v2/raw', add_inverse_edge = True)
    graph_list = read_csv_graph_pgl(
        'dataset/ogbn_proteins_pgl/raw', add_inverse_edge=True)
    print(graph_list)
pgl/contrib/ogb/linkproppred/__init__.py
0 → 100644
浏览文件 @
752b6169
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""__init__.py
"""
pgl/contrib/ogb/linkproppred/dataset_pgl.py
0 → 100644
浏览文件 @
752b6169
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""LinkPropPredDataset for pgl
"""
import
pandas
as
pd
import
shutil
,
os
import
os.path
as
osp
import
numpy
as
np
from
ogb.utils.url
import
decide_download
,
download_url
,
extract_zip
from
ogb.linkproppred
import
make_master_file
from
pgl.contrib.ogb.io.read_graph_pgl
import
read_csv_graph_pgl
def to_bool(value):
    """Coerce a scalar to a numpy boolean scalar."""
    wrapped = np.array([value], dtype="bool")
    return wrapped[0]
class PglLinkPropPredDataset(object):
    """OGB link-property-prediction dataset exposed as a single PGL graph.

    Downloads and extracts the raw OGB archive on first use, then parses the
    CSVs into `pgl.graph.Graph` objects and edge splits.
    """

    def __init__(self, name, root="dataset"):
        """
        Args:
            name: original OGB dataset name, e.g., ogbl-ppa
            root: directory that holds (or will receive) the data
        Raises:
            ValueError: if `name` is not a known OGB dataset.
        """
        self.name = name  ## original name, e.g., ogbl-ppa
        # Replace hyphen with underscore, e.g., ogbl_ppa_pgl.
        self.dir_name = "_".join(name.split("-")) + "_pgl"
        self.original_root = root
        self.root = osp.join(root, self.dir_name)
        # Master metadata table shipped with ogb (formerly read from master.csv).
        self.meta_info = make_master_file.df
        #pd.read_csv(os.path.join(os.path.dirname(__file__), "master.csv"), index_col=0)
        if self.name not in self.meta_info:
            print(self.name)
            error_mssg = "Invalid dataset name {}.\n".format(self.name)
            error_mssg += "Available datasets are as follows:\n"
            error_mssg += "\n".join(self.meta_info.keys())
            raise ValueError(error_mssg)
        # Name of the downloaded archive folder, e.g., ppassoc.
        self.download_name = self.meta_info[self.name]["download_name"]
        self.task_type = self.meta_info[self.name]["task type"]
        super(PglLinkPropPredDataset, self).__init__()
        self.pre_process()

    def pre_process(self):
        """Download the archive if needed and build the PGL graph."""
        processed_dir = osp.join(self.root, 'processed')
        # FIX: the marker was 'dgl_data_processed', a copy-paste from the DGL
        # loader; use the PGL name, consistent with the other PGL datasets.
        # (Harmless behavior-wise: nothing is ever saved at this path yet.)
        pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')

        if osp.exists(pre_processed_file_path):
            #TODO: Reload Preprocess files (never taken yet — nothing is saved)
            pass
        else:
            ### check download
            if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
                url = self.meta_info[self.name]["url"]
                if decide_download(url):
                    path = download_url(url, self.original_root)
                    extract_zip(path, self.original_root)
                    os.unlink(path)
                    # Delete a stale extraction target if one exists; a
                    # missing directory is fine (narrowed from bare except).
                    try:
                        shutil.rmtree(self.root)
                    except OSError:
                        pass
                    shutil.move(
                        osp.join(self.original_root, self.download_name),
                        self.root)
                else:
                    print("Stop download.")
                    exit(-1)

            raw_dir = osp.join(self.root, "raw")

            ### pre-process and save
            add_inverse_edge = to_bool(
                self.meta_info[self.name]["add_inverse_edge"])
            self.graph = read_csv_graph_pgl(
                raw_dir, add_inverse_edge=add_inverse_edge)
            #TODO: SAVE preprocess graph

    def get_edge_split(self):
        """Return the train/valid/test edge arrays and their labels.

        Labels are int64 for "link prediction" tasks and float32 otherwise.
        """
        split_type = self.meta_info[self.name]["split"]
        path = osp.join(self.root, "split", split_type)

        train_idx = pd.read_csv(
            osp.join(path, "train.csv.gz"), compression="gzip",
            header=None).values
        valid_idx = pd.read_csv(
            osp.join(path, "valid.csv.gz"), compression="gzip",
            header=None).values
        test_idx = pd.read_csv(
            osp.join(path, "test.csv.gz"), compression="gzip",
            header=None).values

        if self.task_type == "link prediction":
            target_type = np.int64
        else:
            target_type = np.float32

        # Columns 0-1 are (src, dst); column 2 is the edge label.
        return {
            "train_edge": np.array(
                train_idx[:, :2], dtype="int64"),
            "train_edge_label": np.array(
                train_idx[:, 2], dtype=target_type),
            "valid_edge": np.array(
                valid_idx[:, :2], dtype="int64"),
            "valid_edge_label": np.array(
                valid_idx[:, 2], dtype=target_type),
            "test_edge": np.array(
                test_idx[:, :2], dtype="int64"),
            "test_edge_label": np.array(
                test_idx[:, 2], dtype=target_type)
        }

    def __getitem__(self, idx):
        assert idx == 0, "This dataset has only one graph"
        return self.graph[0]

    def __len__(self):
        return 1

    def __repr__(self):  # pragma: no cover
        return '{}({})'.format(self.__class__.__name__, len(self))
# Manual smoke test: load ogbl-ppa and print the graph and edge splits.
if __name__ == "__main__":
    pgl_dataset = PglLinkPropPredDataset(name="ogbl-ppa")
    splitted_edge = pgl_dataset.get_edge_split()
    print(pgl_dataset[0])
    print(splitted_edge)
pgl/contrib/ogb/nodeproppred/__init__.py
0 → 100644
浏览文件 @
752b6169
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""__init__.py
"""
pgl/contrib/ogb/nodeproppred/dataset_pgl.py
0 → 100644
浏览文件 @
752b6169
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NodePropPredDataset for pgl
"""
import
pandas
as
pd
import
shutil
,
os
import
os.path
as
osp
import
numpy
as
np
from
ogb.utils.url
import
decide_download
,
download_url
,
extract_zip
from
ogb.nodeproppred
import
make_master_file
# create master.csv
from
pgl.contrib.ogb.io.read_graph_pgl
import
read_csv_graph_pgl
def to_bool(value):
    """Coerce a scalar to a numpy boolean scalar."""
    boxed = np.array([value], dtype="bool")
    return boxed[0]
class PglNodePropPredDataset(object):
    """OGB node-property-prediction dataset exposed as a single PGL graph.

    Downloads and extracts the raw OGB archive on first use, then parses the
    CSVs into `pgl.graph.Graph` objects plus a per-node label matrix.
    """

    def __init__(self, name, root="dataset"):
        """
        Args:
            name: original OGB dataset name, e.g., ogbn-proteins
            root: directory that holds (or will receive) the data
        Raises:
            ValueError: if `name` is not a known OGB dataset.
        """
        self.name = name  ## original name, e.g., ogbn-proteins
        # Replace hyphen with underscore, e.g., ogbn_proteins_pgl.
        self.dir_name = "_".join(name.split("-")) + "_pgl"
        self.original_root = root
        self.root = osp.join(root, self.dir_name)
        # Master metadata table shipped with ogb (formerly read from master.csv).
        self.meta_info = make_master_file.df
        #pd.read_csv(
        #os.path.join(os.path.dirname(__file__), "master.csv"), index_col=0)
        if self.name not in self.meta_info:
            error_mssg = "Invalid dataset name {}.\n".format(self.name)
            error_mssg += "Available datasets are as follows:\n"
            error_mssg += "\n".join(self.meta_info.keys())
            raise ValueError(error_mssg)
        # Name of the downloaded archive folder.
        self.download_name = self.meta_info[self.name]["download_name"]
        self.num_tasks = int(self.meta_info[self.name]["num tasks"])
        self.task_type = self.meta_info[self.name]["task type"]
        super(PglNodePropPredDataset, self).__init__()
        self.pre_process()

    def pre_process(self):
        """Download the archive if needed, build the PGL graph, load labels."""
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'pgl_data_processed')

        if osp.exists(pre_processed_file_path):
            # TODO: Reload Preprocess files (never taken yet — nothing is saved)
            pass
        else:
            ### check download
            if not osp.exists(osp.join(self.root, "raw", "edge.csv.gz")):
                url = self.meta_info[self.name]["url"]
                if decide_download(url):
                    path = download_url(url, self.original_root)
                    extract_zip(path, self.original_root)
                    os.unlink(path)
                    # Delete a stale extraction target if one exists; a
                    # missing directory is fine (narrowed from bare except).
                    try:
                        shutil.rmtree(self.root)
                    except OSError:
                        pass
                    shutil.move(
                        osp.join(self.original_root, self.download_name),
                        self.root)
                else:
                    print("Stop download.")
                    exit(-1)

            raw_dir = osp.join(self.root, "raw")

            ### pre-process and save
            add_inverse_edge = to_bool(
                self.meta_info[self.name]["add_inverse_edge"])
            self.graph = read_csv_graph_pgl(
                raw_dir, add_inverse_edge=add_inverse_edge)

            ### adding prediction target
            node_label = pd.read_csv(
                osp.join(raw_dir, 'node-label.csv.gz'),
                compression="gzip",
                header=None).values
            # Integer labels for classification tasks, float otherwise.
            if "classification" in self.task_type:
                node_label = np.array(node_label, dtype=np.int64)
            else:
                node_label = np.array(node_label, dtype=np.float32)

            # TODO: SAVE preprocessed graph
            # (Simplified: the old code wrapped node_label in a one-key dict
            # and immediately unwrapped it.)
            self.labels = node_label

    def get_idx_split(self):
        """Return {"train", "valid", "test"} node-index arrays from the split CSVs."""
        split_type = self.meta_info[self.name]["split"]
        path = osp.join(self.root, "split", split_type)

        train_idx = pd.read_csv(
            osp.join(path, "train.csv.gz"), compression="gzip",
            header=None).values.T[0]
        valid_idx = pd.read_csv(
            osp.join(path, "valid.csv.gz"), compression="gzip",
            header=None).values.T[0]
        test_idx = pd.read_csv(
            osp.join(path, "test.csv.gz"), compression="gzip",
            header=None).values.T[0]

        return {
            "train": np.array(
                train_idx, dtype="int64"),
            "valid": np.array(
                valid_idx, dtype="int64"),
            "test": np.array(
                test_idx, dtype="int64")
        }

    def __getitem__(self, idx):
        assert idx == 0, "This dataset has only one graph"
        return self.graph[idx], self.labels

    def __len__(self):
        return 1

    def __repr__(self):  # pragma: no cover
        return '{}({})'.format(self.__class__.__name__, len(self))
# Manual smoke test: load ogbn-proteins and print the graph and index splits.
if __name__ == "__main__":
    pgl_dataset = PglNodePropPredDataset(name="ogbn-proteins")
    splitted_index = pgl_dataset.get_idx_split()
    print(pgl_dataset[0])
    print(splitted_index)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录