提交 1f4b5a88 编写于 作者: zhang_wenshuo's avatar zhang_wenshuo

data.py with comment

上级 0151f259
......@@ -11,6 +11,7 @@ def parse_index_file(filename):
Parse index file.
"""
index = []
# 解析index文件的,传进来的是ind.{}.test.index文件
for line in open(filename):
index.append(int(line.strip()))
return index
......@@ -20,6 +21,7 @@ def sample_mask(idx, l):
"""
Create mask.
"""
# 生成一个行向量,其中,idx对应的部分为True 其他部分False
mask = np.zeros(l)
mask[idx] = 1
return np.array(mask, dtype=np.bool)
......@@ -29,17 +31,27 @@ def load_data(dataset_str):
"""
Loads input data from gcn/data directory
https://blog.csdn.net/yyl424525/article/details/100831452(前六个讲的很好)
https://blog.csdn.net/chl183/article/details/107446836(ind.dataset_str.graph用的对象,他这个反应的是节点之间的关系:边信息)
那个defaultdict:dict_items([('yellow', [1, 3]), ('blue', [2, 4])]
ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
scipy.sparse.csr.csr_matrix object:稀疏矩阵:(location) value,x是训练集特征
ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
测试集特征
ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
(a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
allx指的是训练集中全部(有标签和无标签)实例的特征,因为是半监督的分类,因而其中也包含无标签的实例
ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
one-hot编码,训练集的分类
ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
one-hot编码,测试集的分类
ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
allx中全部实例对应的分类(标签)
ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
object;
这个东西就是图的邻接表表示在Python里的形式
ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.
测试集的编号
All objects above must be saved using python pickle module.
:param dataset_str: Dataset name
......@@ -55,8 +67,12 @@ def load_data(dataset_str):
objects.append(pkl.load(f))
x, y, tx, ty, allx, ally, graph = tuple(objects)
test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
test_idx_range = np.sort(test_idx_reorder)
# 把前几个数据读进来
test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str)) # reorder 乱序的
test_idx_range = np.sort(test_idx_reorder) # range 有序的
# 把test的数据读进来
# 上边几个完成了数据从文件中读进来
if dataset_str == 'citeseer':
# Fix citeseer dataset (there are some isolated nodes in the graph)
......@@ -69,20 +85,22 @@ def load_data(dataset_str):
ty_extended[test_idx_range-min(test_idx_range), :] = ty
ty = ty_extended
features = sp.vstack((allx, tx)).tolil()
features[test_idx_reorder, :] = features[test_idx_range, :]
adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
features = sp.vstack((allx, tx)).tolil() # 先合并起来,然后再将他转换为稀疏矩阵的LIL(链表)表示形式
features[test_idx_reorder, :] = features[test_idx_range, :] # 将特征和编号对应起来
# https://zhuanlan.zhihu.com/p/356227823
adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) # 构建邻接矩阵
labels = np.vstack((ally, ty))
labels[test_idx_reorder, :] = labels[test_idx_range, :]
labels = np.vstack((ally, ty)) # ty和ally共同构成的label的组合体
labels[test_idx_reorder, :] = labels[test_idx_range, :] # 对应label和序列的index
idx_test = test_idx_range.tolist()
idx_train = range(len(y))
idx_test = test_idx_range.tolist() # 记录test的index(排好序的测试集编号)
idx_train = range(len(y)) # 记录train的index
idx_val = range(len(y), len(y)+500)
train_mask = sample_mask(idx_train, labels.shape[0])
val_mask = sample_mask(idx_val, labels.shape[0])
test_mask = sample_mask(idx_test, labels.shape[0])
# 相当于是给了一个索引,用mask去筛选数据
y_train = np.zeros(labels.shape)
y_val = np.zeros(labels.shape)
......@@ -90,7 +108,9 @@ def load_data(dataset_str):
y_train[train_mask, :] = labels[train_mask, :]
y_val[val_mask, :] = labels[val_mask, :]
y_test[test_mask, :] = labels[test_mask, :]
# 筛选出来的数据去进行对应,放进对应名字里边去
# https://blog.csdn.net/qq_37995260/article/details/100146401 几个返回值分别的作用
return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
......@@ -128,7 +148,9 @@ def preprocess_features(features):
def normalize_adj(adj):
# 这个东西的返回值就是那个系数 D^-0.5 A D^-0.5
"""Symmetrically normalize adjacency matrix."""
# scipy.sparse as sp
adj = sp.coo_matrix(adj)
rowsum = np.array(adj.sum(1)) # D
d_inv_sqrt = np.power(rowsum, -0.5).flatten() # D^-0.5
......@@ -138,6 +160,7 @@ def normalize_adj(adj):
def preprocess_adj(adj):
    """Prepare an adjacency matrix for the simple GCN model.

    Adds the identity matrix to ``adj`` (the "A + I" step), passes the sum
    through ``normalize_adj`` and converts the normalized matrix to its
    tuple representation with ``sparse_to_tuple``.

    :param adj: Adjacency matrix; it must expose ``shape`` and support
        addition with a scipy sparse identity matrix.
    :return: The value produced by ``sparse_to_tuple`` for the normalized
        matrix.
    """
    # A + I: the identity has the same number of rows as the adjacency matrix.
    n_rows = adj.shape[0]
    adj_plus_identity = adj + sp.eye(n_rows)
    return sparse_to_tuple(normalize_adj(adj_plus_identity))
......@@ -147,12 +170,16 @@ def preprocess_adj(adj):
def chebyshev_polynomials(adj, k):
# 切比雪夫是用作卷积核的,这个我之前一直以为是推前向传播的,有点干蒙了,离谱,本质上就是一个转换成前向传播函数的系数求解
# k为切比雪夫的阶数
# 这玩意就是没用到啊。。。。。。,在文件里都没出现,无语。他只用了两个preprocess函数
# 看明白了,教学用的
"""
Calculate Chebyshev polynomials up to order k. Return a list of sparse matrices (tuple representation).
"""
print("Calculating Chebyshev polynomials up to order {}...".format(k))
adj_normalized = normalize_adj(adj)
adj_normalized = normalize_adj(adj) # 返回那个正向传播的那个的系数矩阵 D^-0.5 A D^-0.5
laplacian = sp.eye(adj.shape[0]) - adj_normalized
largest_eigval, _ = eigsh(laplacian, 1, which='LM')
scaled_laplacian = (2. / largest_eigval[0]) * laplacian - sp.eye(adj.shape[0])
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册