data.py with comment

1f4b5a88 · zhang_wenshuo · 0151f259 · 1f4b5a88
隐藏空白更改
内联并排

Showing with 38 additions and 11 deletions

GCN-PyTorch-master/data.py GCN-PyTorch-master/data.py +38 -11

未找到文件。
--- a/GCN-PyTorch-master/data.py
+++ b/GCN-PyTorch-master/data.py
@@ -11,6 +11,7 @@ def parse_index_file(filename):
    Parse index file.
    """
    index = []
+    # 解析index文件的，传进来的是ind.{}.test.index文件
    for line in open(filename):
        index.append(int(line.strip()))
    return index
@@ -20,6 +21,7 @@ def sample_mask(idx, l):
    """
    Create mask.
    """
+    # 生成一个行向量，其中，idx对应的部分为True 其他部分False
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=np.bool)
@@ -29,17 +31,27 @@ def load_data(dataset_str):
    """
    Loads input data from gcn/data directory

+    https://blog.csdn.net/yyl424525/article/details/100831452（前六个讲的很好）
+    https://blog.csdn.net/chl183/article/details/107446836（ind.dataset_str.graph用的对象，他这个反应的是节点之间的关系：边信息）
+    defaultdict 的示例：dict_items([('yellow', [1, 3]), ('blue', [2, 4])])
    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
+    scipy.sparse.csr.csr_matrix object：稀疏矩阵：（location） value，x是训练集特征
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
+    测试集特征
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
+    allx 指的是全部训练样本（包括有标签和无标签）的特征，是 x 的超集；因为是半监督分类，所以训练集中包含无标签样本
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
+    one-hot编码，训练集的分类
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
+    one-hot编码，测试集的分类
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
+    allx 中全部样本的分类（标签）
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
+    这就是图的邻接表表示在 Python 中的形式
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.
-
+    测试集的编号
    All objects above must be saved using python pickle module.

    :param dataset_str: Dataset name
@@ -55,8 +67,12 @@ def load_data(dataset_str):
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
-    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
-    test_idx_range = np.sort(test_idx_reorder)
+    # 把前几个数据读进来
+    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))       # reorder 乱序的
+    test_idx_range = np.sort(test_idx_reorder)                                              # range 有序的
+    # 把test的数据读进来
+    # 上边几个完成了数据从文件中读进来
+

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
@@ -69,20 +85,22 @@ def load_data(dataset_str):
        ty_extended[test_idx_range-min(test_idx_range), :] = ty
        ty = ty_extended

-    features = sp.vstack((allx, tx)).tolil()
-    features[test_idx_reorder, :] = features[test_idx_range, :]
-    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
+    features = sp.vstack((allx, tx)).tolil()                        # 先合并起来，然后再转换为稀疏矩阵的 LIL 表示形式
+    features[test_idx_reorder, :] = features[test_idx_range, :]     # 将特征和编号对应起来
+    # https://zhuanlan.zhihu.com/p/356227823
+    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))         # 构建邻接矩阵

-    labels = np.vstack((ally, ty))
-    labels[test_idx_reorder, :] = labels[test_idx_range, :]
+    labels = np.vstack((ally, ty))                                  # ty和ally共同构成的label的组合体
+    labels[test_idx_reorder, :] = labels[test_idx_range, :]         # 对应label和序列的index

-    idx_test = test_idx_range.tolist()
-    idx_train = range(len(y))
+    idx_test = test_idx_range.tolist()                              # 记录test在合并后（allx和tx）矩阵中的index
+    idx_train = range(len(y))                                       # 记录train的index
    idx_val = range(len(y), len(y)+500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])
+    # 相当于是给了一个索引，用mask去筛选数据

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
@@ -90,7 +108,9 @@ def load_data(dataset_str):
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]
+    # 筛选出来的数据去进行对应，放进对应名字里边去

+    # https://blog.csdn.net/qq_37995260/article/details/100146401 几个返回值分别的作用
    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask


@@ -128,7 +148,9 @@ def preprocess_features(features):


 def normalize_adj(adj):
+    # 这个函数的返回值就是对称归一化后的系数矩阵 D^-0.5 A D^-0.5
    """Symmetrically normalize adjacency matrix."""
+    # scipy.sparse as sp
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1)) # D
    d_inv_sqrt = np.power(rowsum, -0.5).flatten() # D^-0.5
@@ -138,6 +160,7 @@ def normalize_adj(adj):


 def preprocess_adj(adj):
+    # 这个是实现了 A+I（给邻接矩阵加上自环）的那一步
    """Preprocessing of adjacency matrix for simple GCN model and conversion to tuple representation."""
    adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0]))
    return sparse_to_tuple(adj_normalized)
@@ -147,12 +170,16 @@ def preprocess_adj(adj):


 def chebyshev_polynomials(adj, k):
+    # 切比雪夫是用作卷积核的，这个我之前一直以为是推前向传播的，有点干蒙了，离谱，本质上就是一个转换成前向传播函数的系数求解
+    # k为切比雪夫的阶数
+    # 这玩意就是没用到啊。。。。。。，在文件里都没出现，无语。他只用了两个preprocess函数
+    # 看明白了，教学用的
    """
    Calculate Chebyshev polynomials up to order k. Return a list of sparse matrices (tuple representation).
    """
    print("Calculating Chebyshev polynomials up to order {}...".format(k))

-    adj_normalized = normalize_adj(adj)
+    adj_normalized = normalize_adj(adj)                     # 返回正向传播用的归一化系数矩阵 D^-0.5 A D^-0.5
    laplacian = sp.eye(adj.shape[0]) - adj_normalized
    largest_eigval, _ = eigsh(laplacian, 1, which='LM')
    scaled_laplacian = (2. / largest_eigval[0]) * laplacian - sp.eye(adj.shape[0])