From 0a07f6c909c91008db48f228c05e60c2fe826b24 Mon Sep 17 00:00:00 2001
From: heleiwang
Date: Tue, 30 Jun 2020 17:44:29 +0800
Subject: [PATCH] support pubmed dataset

---
 .../graph_to_mindrecord/pubmed/__init__.py    |   0
 .../graph_to_mindrecord/pubmed/mr_api.py      | 120 ++++++++++++++++++
 .../utils/graph_to_mindrecord/write_pubmed.sh |  12 ++
 3 files changed, 132 insertions(+)
 create mode 100644 model_zoo/utils/graph_to_mindrecord/pubmed/__init__.py
 create mode 100644 model_zoo/utils/graph_to_mindrecord/pubmed/mr_api.py
 create mode 100644 model_zoo/utils/graph_to_mindrecord/write_pubmed.sh

diff --git a/model_zoo/utils/graph_to_mindrecord/pubmed/__init__.py b/model_zoo/utils/graph_to_mindrecord/pubmed/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/model_zoo/utils/graph_to_mindrecord/pubmed/mr_api.py b/model_zoo/utils/graph_to_mindrecord/pubmed/mr_api.py
new file mode 100644
index 000000000..4e04e90cd
--- /dev/null
+++ b/model_zoo/utils/graph_to_mindrecord/pubmed/mr_api.py
@@ -0,0 +1,120 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+User-defined API for MindRecord GNN writer.
+"""
+import os
+
+import pickle as pkl
+import numpy as np
+import scipy.sparse as sp
+
+# parse args from command line parameter 'graph_api_args'
+# args delimiter is ':'
+args = os.environ['graph_api_args'].split(':')
+PUBMED_PATH = args[0]
+dataset_str = 'pubmed'
+
+# profile: (num_features, feature_data_types, feature_shapes)
+node_profile = (2, ["float32", "int32"], [[-1], [-1]])
+edge_profile = (0, [], [])
+
+
+def _normalize_cora_features(features):
+    """Scale each row by 1 / row_sum; rows that sum to 0 become all-zero."""
+    row_sum = np.array(features.sum(1))
+    # zero row sums yield inf here and are reset to 0 right below,
+    # so the divide-by-zero warning is silenced on purpose
+    with np.errstate(divide='ignore'):
+        r_inv = np.power(row_sum * 1.0, -1).flatten()
+    r_inv[np.isinf(r_inv)] = 0.
+    r_mat_inv = sp.diags(r_inv)
+    features = r_mat_inv.dot(features)
+    return features
+
+
+def _parse_index_file(filename):
+    """Parse index file, one integer per line; blank lines are skipped."""
+    # the context manager closes the file handle deterministically
+    with open(filename) as f:
+        return [int(line) for line in f if line.strip()]
+
+
+def yield_nodes(task_id=0):
+    """
+    Generate node data
+
+    Args:
+        task_id (int): id of the writer task, only used for logging.
+
+    Yields:
+        data (dict): data row which is dict.
+    """
+    print("Node task is {}".format(task_id))
+
+    names = ['tx', 'ty', 'allx', 'ally']
+    objects = []
+    for name in names:
+        # NOTE: unpickling can execute arbitrary code,
+        # only load dataset files that come from a trusted source
+        with open("{}/ind.{}.{}".format(PUBMED_PATH, dataset_str, name), 'rb') as f:
+            objects.append(pkl.load(f, encoding='latin1'))
+    tx, ty, allx, ally = objects
+    test_idx_reorder = _parse_index_file(
+        "{}/ind.{}.test.index".format(PUBMED_PATH, dataset_str))
+    test_idx_range = np.sort(test_idx_reorder)
+
+    # row test_idx_reorder[k] receives the k-th row of the sorted test range
+    features = sp.vstack((allx, tx)).tolil()
+    features[test_idx_reorder, :] = features[test_idx_range, :]
+    features = _normalize_cora_features(features)
+    # toarray() is the documented way to densify, '.A' is only a shorthand for it
+    features = features.toarray()
+
+    labels = np.vstack((ally, ty))
+    labels[test_idx_reorder, :] = labels[test_idx_range, :]
+
+    line_count = 0
+    for i, label in enumerate(labels):
+        node = {'id': i, 'type': 0, 'feature_1': features[i].tolist(),
+                'feature_2': label.tolist().index(1)}
+        line_count += 1
+        yield node
+    print('Processed {} lines for nodes.'.format(line_count))
+
+
+def yield_edges(task_id=0):
+    """
+    Generate edge data
+
+    Args:
+        task_id (int): id of the writer task, only used for logging.
+
+    Yields:
+        data (dict): data row which is dict.
+    """
+    print("Edge task is {}".format(task_id))
+    # NOTE: unpickling can execute arbitrary code,
+    # only load dataset files that come from a trusted source
+    with open("{}/ind.{}.graph".format(PUBMED_PATH, dataset_str), 'rb') as f:
+        graph = pkl.load(f, encoding='latin1')
+    line_count = 0
+    for i in graph:
+        for dst_id in graph[i]:
+            edge = {'id': line_count,
+                    'src_id': i, 'dst_id': dst_id, 'type': 0}
+            line_count += 1
+            yield edge
+    print('Processed {} lines for edges.'.format(line_count))
diff --git a/model_zoo/utils/graph_to_mindrecord/write_pubmed.sh b/model_zoo/utils/graph_to_mindrecord/write_pubmed.sh
new file mode 100644
index 000000000..309505e27
--- /dev/null
+++ b/model_zoo/utils/graph_to_mindrecord/write_pubmed.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+SRC_PATH=/tmp/pubmed/dataset
+MINDRECORD_PATH=/tmp/pubmed/mindrecord
+
+rm -f "${MINDRECORD_PATH:?}"/*
+
+python writer.py --mindrecord_script pubmed \
+--mindrecord_file "$MINDRECORD_PATH/pubmed_mr" \
+--mindrecord_partitions 1 \
+--mindrecord_header_size_by_bit 18 \
+--mindrecord_page_size_by_bit 20 \
+--graph_api_args "$SRC_PATH"
-- 
GitLab