diff --git a/docs/graphs/gatv2/experiment.html b/docs/graphs/gatv2/experiment.html new file mode 100644 index 0000000000000000000000000000000000000000..1e2e318035991f90f524035ed9589d7e9afb820e --- /dev/null +++ b/docs/graphs/gatv2/experiment.html @@ -0,0 +1,1194 @@
+13from typing import Dict
+14
+15import numpy as np
+16import torch
+17from torch import nn
+18
+19from labml import lab, monit, tracker, experiment
+20from labml.configs import BaseConfigs
+21from labml.utils import download
+22from labml_helpers.device import DeviceConfigs
+23from labml_helpers.module import Module
+24from labml_nn.graphs.gatv2 import GraphAttentionV2Layer
+25from labml_nn.optimizers.configs import OptimizerConfigs
The Cora dataset is a dataset of research papers. For each paper we are given a binary feature vector that indicates the presence of words. Each paper is classified into one of 7 classes. The dataset also has the citation network.
The papers are the nodes of the graph and the edges are the citations.
The task is to classify the nodes into the 7 classes, using the feature vectors and the citation network as input.
+28class CoraDataset:
Labels for each node
+43 labels: torch.Tensor
Set of class names and a unique integer index for each class
+45 classes: Dict[str, int]
Feature vectors for all nodes
+47 features: torch.Tensor
Adjacency matrix with the edge information. adj_mat[i][j] is True if there is an edge from i to j.
50 adj_mat: torch.Tensor
Download the dataset
+52 @staticmethod
+53 def _download():
57 if not (lab.get_data_path() / 'cora').exists():
+58 download.download_file('https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz',
+59 lab.get_data_path() / 'cora.tgz')
+60 download.extract_tar(lab.get_data_path() / 'cora.tgz', lab.get_data_path())
Load the dataset
+62 def __init__(self, include_edges: bool = True):
Whether to include edges. This is to test how much accuracy is lost if we ignore the citation network.
+69 self.include_edges = include_edges
Download dataset
+72 self._download()
Read the paper ids, feature vectors, and labels
+75 with monit.section('Read content file'):
+76 content = np.genfromtxt(str(lab.get_data_path() / 'cora/cora.content'), dtype=np.dtype(str))
Load the citations; it is a list of pairs of integers.
+78 with monit.section('Read citations file'):
+79 citations = np.genfromtxt(str(lab.get_data_path() / 'cora/cora.cites'), dtype=np.int32)
Get the feature vectors
+82 features = torch.tensor(np.array(content[:, 1:-1], dtype=np.float32))
Normalize the feature vectors
+84 self.features = features / features.sum(dim=1, keepdim=True)
Get the class names and assign a unique integer to each of them
+87 self.classes = {s: i for i, s in enumerate(set(content[:, -1]))}
Get the labels as those integers
+89 self.labels = torch.tensor([self.classes[i] for i in content[:, -1]], dtype=torch.long)
Get the paper ids
+92 paper_ids = np.array(content[:, 0], dtype=np.int32)
Map of paper id to index
+94 ids_to_idx = {id_: i for i, id_ in enumerate(paper_ids)}
Empty adjacency matrix - an identity matrix, i.e. each node starts with an edge to itself (a self-loop)
+97 self.adj_mat = torch.eye(len(self.labels), dtype=torch.bool)
Mark the citations in the adjacency matrix
+100 if self.include_edges:
+101 for e in citations:
The pair of paper indexes
+103 e1, e2 = ids_to_idx[e[0]], ids_to_idx[e[1]]
We build a symmetric graph: if paper $i$ referenced paper $j$, we place an edge from $i$ to $j$ as well as an edge from $j$ to $i$.
+107 self.adj_mat[e1][e2] = True
+108 self.adj_mat[e2][e1] = True
This graph attention network has two graph attention layers.
+111class GATv2(Module):
in_features is the number of features per node
n_hidden is the number of features in the first graph attention layer
n_classes is the number of classes
n_heads is the number of heads in the graph attention layers
dropout is the dropout probability
share_weights if set to True, the same matrix will be applied to the source and the target node of every edge
118 def __init__(self, in_features: int, n_hidden: int, n_classes: int, n_heads: int, dropout: float, share_weights: bool = True):
127 super().__init__()
First graph attention layer where we concatenate the heads
+130 self.layer1 = GraphAttentionV2Layer(in_features, n_hidden, n_heads, is_concat=True, dropout=dropout, share_weights=share_weights)
Activation function after first graph attention layer
+132 self.activation = nn.ELU()
Final graph attention layer where we average the heads
+134 self.output = GraphAttentionV2Layer(n_hidden, n_classes, 1, is_concat=False, dropout=dropout, share_weights=share_weights)
Dropout
+136 self.dropout = nn.Dropout(dropout)
x is the feature vectors of shape [n_nodes, in_features]
adj_mat is the adjacency matrix of shape [n_nodes, n_nodes, n_heads] or [n_nodes, n_nodes, 1]
138 def __call__(self, x: torch.Tensor, adj_mat: torch.Tensor):
Apply dropout to the input
+145 x = self.dropout(x)
First graph attention layer
+147 x = self.layer1(x, adj_mat)
Activation function
+149 x = self.activation(x)
Dropout
+151 x = self.dropout(x)
Output layer (without activation) for logits
+153 return self.output(x, adj_mat)
A simple function to calculate the accuracy
+156def accuracy(output: torch.Tensor, labels: torch.Tensor):
160 return output.argmax(dim=-1).eq(labels).sum().item() / len(labels)
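A quick illustrative check of this helper on a toy batch (not part of the original code):
logits = torch.tensor([[2.0, 0.1], [0.2, 1.5], [3.0, 0.5]])  # predictions for 3 nodes, 2 classes
targets = torch.tensor([0, 1, 1])
accuracy(logits, targets)  # argmax gives [0, 1, 0]; 2 of 3 match the targets, so this returns 0.666...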
163class Configs(BaseConfigs):
Model
+169 model: GATv2
Number of nodes to train on
+171 training_samples: int = 500
Number of features per node in the input
+173 in_features: int
Number of features in the first graph attention layer
+175 n_hidden: int = 64
Number of heads
+177 n_heads: int = 8
Number of classes for classification
+179 n_classes: int
Dropout probability
+181 dropout: float = 0.6
Whether to include the citation network
+183 include_edges: bool = True
Dataset
+185 dataset: CoraDataset
Number of training iterations
+187 epochs: int = 1_000
Loss function
+189 loss_func = nn.CrossEntropyLoss()
Device to train on
+This creates configs for device, so that +we can change the device by passing a config value
+194 device: torch.device = DeviceConfigs()
Optimizer
+196 optimizer: torch.optim.Adam
Initialize
+198 def initialize(self):
Create the dataset
+203 self.dataset = CoraDataset(self.include_edges)
Get the number of classes
+205 self.n_classes = len(self.dataset.classes)
Number of features in the input
+207 self.in_features = self.dataset.features.shape[1]
Create the model
+209 self.model = GATv2(self.in_features, self.n_hidden, self.n_classes, self.n_heads, self.dropout)
Move the model to the device
+211 self.model.to(self.device)
Configurable optimizer, so that we can set the configurations +such as learning rate by passing the dictionary later.
+214 optimizer_conf = OptimizerConfigs()
+215 optimizer_conf.parameters = self.model.parameters()
+216 self.optimizer = optimizer_conf
We do full batch training since the dataset is small. If we were to sample and train, we would have to sample a set of nodes for each training step along with the edges that span those selected nodes (a minimal sketch of that idea follows the train/validation split below).
+218 def run(self):
Move the feature vectors to the device
+228 features = self.dataset.features.to(self.device)
Move the labels to the device
+230 labels = self.dataset.labels.to(self.device)
Move the adjacency matrix to the device
+232 edges_adj = self.dataset.adj_mat.to(self.device)
Add an empty third dimension for the heads
+234 edges_adj = edges_adj.unsqueeze(-1)
Random indexes
+237 idx_rand = torch.randperm(len(labels))
Nodes for training
+239 idx_train = idx_rand[:self.training_samples]
Nodes for validation
+241 idx_valid = idx_rand[self.training_samples:]
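Below is a minimal, hypothetical sketch of the sampling alternative mentioned above; sample_subgraph and sample_size are illustrative names and are not part of this experiment.
def sample_subgraph(features: torch.Tensor, labels: torch.Tensor,
                    adj_mat: torch.Tensor, sample_size: int):
    # Hypothetical helper: pick a random subset of nodes and keep only the
    # edges whose endpoints both fall inside that subset
    idx = torch.randperm(features.shape[0])[:sample_size]
    sub_features = features[idx]
    sub_labels = labels[idx]
    # Slice both the row and the column dimensions of the adjacency matrix
    sub_adj = adj_mat[idx][:, idx]
    return sub_features, sub_labels, sub_adj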
Training loop
+244 for epoch in monit.loop(self.epochs):
Set the model to training mode
+246 self.model.train()
Make all the gradients zero
+248 self.optimizer.zero_grad()
Evaluate the model
+250 output = self.model(features, edges_adj)
Get the loss for training nodes
+252 loss = self.loss_func(output[idx_train], labels[idx_train])
Calculate gradients
+254 loss.backward()
Take optimization step
+256 self.optimizer.step()
Log the loss
+258 tracker.add('loss.train', loss)
Log the accuracy
+260 tracker.add('accuracy.train', accuracy(output[idx_train], labels[idx_train]))
Set mode to evaluation mode for validation
+263 self.model.eval()
No need to compute gradients
+266 with torch.no_grad():
Evaluate the model again
+268 output = self.model(features, edges_adj)
Calculate the loss for validation nodes
+270 loss = self.loss_func(output[idx_valid], labels[idx_valid])
Log the loss
+272 tracker.add('loss.valid', loss)
Log the accuracy
+274 tracker.add('accuracy.valid', accuracy(output[idx_valid], labels[idx_valid]))
Save logs
+277 tracker.save()
280def main():
Create configurations
+282 conf = Configs()
Create an experiment
+284 experiment.create(name='gatv2')
Calculate configurations.
+286 experiment.configs(conf, {
Adam optimizer
+288 'optimizer.optimizer': 'Adam',
+289 'optimizer.learning_rate': 5e-3,
+290 'optimizer.weight_decay': 5e-4,
+291 })
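Other values defined in Configs can presumably be overridden through the same dictionary. A hedged sketch of an alternative call (the two extra keys below are assumptions for illustration, not part of this experiment):
experiment.configs(conf, {
    'optimizer.optimizer': 'Adam',
    'optimizer.learning_rate': 5e-3,
    'optimizer.weight_decay': 5e-4,
    'include_edges': False,  # assumed override: train without the citation network
    'n_hidden': 32,          # assumed override: a smaller first graph attention layer
})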
Initialize
+293 conf.initialize()
Start and watch the experiment
+296 with experiment.start():
Run the training
+298 conf.run()
302if __name__ == '__main__':
+303 main()
This is a PyTorch implementation of the GATv2 operator from the paper +How Attentive are Graph Attention Networks?.
GATv2s work on graph data. A graph consists of nodes and edges connecting nodes. For example, in the Cora dataset the nodes are research papers and the edges are citations that connect the papers.
The GATv2 operator fixes the static attention problem of the standard GAT: since the linear layers in the standard GAT are applied right after each other, the ranking of attended nodes is unconditioned on the query node. In contrast, in GATv2 every node can attend to any other node.
Here is the training code for training a two-layer GATv2 on the Cora dataset.
+ +29import torch
+30from torch import nn
+31
+32from labml_helpers.module import Module
This is a single graph attention v2 layer. +A GATv2 is made up of multiple such layers.
It takes $\mathbf{h} = \{ \overrightarrow{h_1}, \overrightarrow{h_2}, \dots, \overrightarrow{h_N} \}$, where $\overrightarrow{h_i} \in \mathbb{R}^F$, as input and outputs $\mathbf{h'} = \{ \overrightarrow{h'_1}, \overrightarrow{h'_2}, \dots, \overrightarrow{h'_N} \}$, where $\overrightarrow{h'_i} \in \mathbb{R}^{F'}$.
+35class GraphAttentionV2Layer(Module):
in_features, $F$, is the number of input features per node
out_features, $F'$, is the number of output features per node
n_heads, $K$, is the number of attention heads
is_concat whether the multi-head results should be concatenated or averaged
dropout is the dropout probability
leaky_relu_negative_slope is the negative slope for leaky relu activation
share_weights if set to True, the same matrix will be applied to the source and the target node of every edge
49 def __init__(self, in_features: int, out_features: int, n_heads: int,
+50 is_concat: bool = True,
+51 dropout: float = 0.6,
+52 leaky_relu_negative_slope: float = 0.2,
+53 share_weights: bool = False):
63 super().__init__()
+64
+65 self.is_concat = is_concat
+66 self.n_heads = n_heads
+67 self.share_weights = share_weights
Calculate the number of dimensions per head
+70 if is_concat:
+71 assert out_features % n_heads == 0
If we are concatenating the multiple heads
+73 self.n_hidden = out_features // n_heads
+74 else:
If we are averaging the multiple heads
+76 self.n_hidden = out_features
Linear layer for initial source transformation; +i.e. to transform the source node embeddings before self-attention
+80 self.linear_l = nn.Linear(in_features, self.n_hidden * n_heads, bias=False)
If share_weights is True, the same linear layer is used for the target nodes
82 if share_weights:
+83 self.linear_r = self.linear_l
+84 else:
+85 self.linear_r = nn.Linear(in_features, self.n_hidden * n_heads, bias=False)
Linear layer to compute attention score $e_{ij}$
+87 self.attn = nn.Linear(self.n_hidden, 1, bias=False)
The activation for attention score $e_{ij}$
+89 self.activation = nn.LeakyReLU(negative_slope=leaky_relu_negative_slope)
Softmax to compute attention $\alpha_{ij}$
+91 self.softmax = nn.Softmax(dim=1)
Dropout layer to be applied for attention
+93 self.dropout = nn.Dropout(dropout)
h, $\mathbf{h}$, is the input node embeddings of shape [n_nodes, in_features].
adj_mat is the adjacency matrix of shape [n_nodes, n_nodes, n_heads]. We use shape [n_nodes, n_nodes, 1] since the adjacency is the same for each head.
The adjacency matrix represents the edges (or connections) among nodes: adj_mat[i][j] is True if there is an edge from node i to node j.
95 def __call__(self, h: torch.Tensor, adj_mat: torch.Tensor):
Number of nodes
+106 n_nodes = h.shape[0]
The initial transformations,
$$\overrightarrow{{g_l}^k_i} = \mathbf{W_l}^k \overrightarrow{h_i}$$
$$\overrightarrow{{g_r}^k_i} = \mathbf{W_r}^k \overrightarrow{h_i}$$
for each head. We do two linear transformations and then split it up for each head.
+112 g_l = self.linear_l(h).view(n_nodes, self.n_heads, self.n_hidden)
+113 g_r = self.linear_r(h).view(n_nodes, self.n_heads, self.n_hidden)
We calculate these for each head $k$. We have omitted $\cdot^k$ for simplicity.
$$e_{ij} = a \Big( \mathbf{W_l} \overrightarrow{h_i}, \mathbf{W_r} \overrightarrow{h_j} \Big) = a \Big( \overrightarrow{{g_l}_i}, \overrightarrow{{g_r}_j} \Big)$$
+$e_{ij}$ is the attention score (importance) from node $j$ to node $i$. +We calculate this for each head.
$a$ is the attention mechanism that calculates the attention score. The paper sums $\overrightarrow{{g_l}_i}$ and $\overrightarrow{{g_r}_j}$, applies a $\text{LeakyReLU}$, and then does a linear transformation with a weight vector $\mathbf{a} \in \mathbb{R}^{F'}$:
$$e_{ij} = \mathbf{a}^\top \text{LeakyReLU} \Big( \Big[ \overrightarrow{{g_l}_i} + \overrightarrow{{g_r}_j} \Big] \Big)$$
+First we calculate +$\Big[\overrightarrow{{g_l}_i} + \overrightarrow{{g_r}_j} \Big]$ +for all pairs of $i, j$.
g_l_repeat gets
$$\{\overrightarrow{{g_l}_1}, \overrightarrow{{g_l}_2}, \dots, \overrightarrow{{g_l}_N}, \overrightarrow{{g_l}_1}, \overrightarrow{{g_l}_2}, \dots, \overrightarrow{{g_l}_N}, \dots\}$$
where each node embedding is repeated n_nodes times.
145 g_l_repeat = g_l.repeat(n_nodes, 1, 1)
g_r_repeat_interleave gets
$$\{\overrightarrow{{g_r}_1}, \overrightarrow{{g_r}_1}, \dots, \overrightarrow{{g_r}_2}, \overrightarrow{{g_r}_2}, \dots, \overrightarrow{{g_r}_N}, \overrightarrow{{g_r}_N}\}$$
where each node embedding is repeated n_nodes times consecutively.
150 g_r_repeat_interleave = g_r.repeat_interleave(n_nodes, dim=0)
Now we sum to get
$$\{\overrightarrow{{g_l}_1} + \overrightarrow{{g_r}_1}, \overrightarrow{{g_l}_2} + \overrightarrow{{g_r}_1}, \dots, \overrightarrow{{g_l}_N} + \overrightarrow{{g_r}_1}, \overrightarrow{{g_l}_1} + \overrightarrow{{g_r}_2}, \dots, \overrightarrow{{g_l}_N} + \overrightarrow{{g_r}_N}\}$$
+158 g_sum = g_l_repeat + g_r_repeat_interleave
Reshape so that g_sum[i, j] is $\overrightarrow{{g_l}_i} + \overrightarrow{{g_r}_j}$
160 g_sum = g_sum.view(n_nodes, n_nodes, self.n_heads, self.n_hidden)
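A tiny 2D toy example (illustrative only; the real tensors above have an extra head dimension) shows the difference between repeat and repeat_interleave:
g = torch.arange(1., 4.).unsqueeze(-1)  # three toy "node embeddings": [[1.], [2.], [3.]]
g.repeat(3, 1)                 # [[1.], [2.], [3.], [1.], [2.], [3.], [1.], [2.], [3.]] - the whole list tiled
g.repeat_interleave(3, dim=0)  # [[1.], [1.], [1.], [2.], [2.], [2.], [3.], [3.], [3.]] - each row repeated consecutively
# Adding the two therefore produces every pairwise sum once, ready to be
# reshaped into an [n_nodes, n_nodes, ...] grid as done above.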
Calculate
$$e_{ij} = \mathbf{a}^\top \text{LeakyReLU} \Big( \Big[ \overrightarrow{{g_l}_i} + \overrightarrow{{g_r}_j} \Big] \Big)$$
e is of shape [n_nodes, n_nodes, n_heads, 1]
168 e = self.attn(self.activation(g_sum))
Remove the last dimension of size 1
170 e = e.squeeze(-1)
The adjacency matrix should have shape [n_nodes, n_nodes, n_heads] or [n_nodes, n_nodes, 1]
174 assert adj_mat.shape[0] == 1 or adj_mat.shape[0] == n_nodes
+175 assert adj_mat.shape[1] == 1 or adj_mat.shape[1] == n_nodes
+176 assert adj_mat.shape[2] == 1 or adj_mat.shape[2] == self.n_heads
Mask $e_{ij}$ based on adjacency matrix. +$e_{ij}$ is set to $- \infty$ if there is no edge from $i$ to $j$.
+179 e = e.masked_fill(adj_mat == 0, float('-inf'))
We then normalize attention scores (or coefficients)
$$\alpha_{ij} = \text{softmax}_j(e_{ij}) = \frac{\exp(e_{ij})}{\sum_{j' \in \mathcal{N}_i} \exp(e_{ij'})}$$
+where $\mathcal{N}_i$ is the set of nodes connected to $i$.
+We do this by setting unconnected $e_{ij}$ to $- \infty$ which +makes $\exp(e_{ij}) \sim 0$ for unconnected pairs.
+189 a = self.softmax(e)
Apply dropout regularization
+192 a = self.dropout(a)
Calculate final output for each head
$$\overrightarrow{h'^k_i} = \sum_{j \in \mathcal{N}_i} \alpha^k_{ij} \overrightarrow{{g_r}_j}$$
+196 attn_res = torch.einsum('ijh,jhf->ihf', a, g_r)
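To unpack the einsum (illustration only, not part of the implementation): for each head it is an attention-weighted matrix product over the neighbour dimension, equivalent to this loop.
# a is [n_nodes, n_nodes, n_heads]; g_r is [n_nodes, n_heads, n_hidden]
attn_res_loop = torch.zeros(n_nodes, self.n_heads, self.n_hidden, device=a.device)
for k in range(self.n_heads):
    # a[:, :, k] is [n_nodes, n_nodes] and g_r[:, k, :] is [n_nodes, n_hidden]
    attn_res_loop[:, k, :] = a[:, :, k] @ g_r[:, k, :]
# attn_res_loop equals torch.einsum('ijh,jhf->ihf', a, g_r)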
Concatenate the heads
+199 if self.is_concat:
$$\overrightarrow{h'_i} = \Big\Vert_{k=1}^{K} \overrightarrow{h'^k_i}$$
+201 return attn_res.reshape(n_nodes, self.n_heads * self.n_hidden)
Take the mean of the heads
+203 else:
$$\overrightarrow{h'_i} = \frac{1}{K} \sum_{k=1}^{K} \overrightarrow{h'^k_i}$$
+205 return attn_res.mean(dim=1)
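To make the shapes concrete, here is a small, self-contained usage sketch of the layer; the sizes are arbitrary and the only assumption is that the class is importable as in the experiment code above.
import torch
from labml_nn.graphs.gatv2 import GraphAttentionV2Layer

n_nodes = 4
layer = GraphAttentionV2Layer(in_features=16, out_features=32, n_heads=4,
                              is_concat=True, dropout=0.0, share_weights=True)
h = torch.randn(n_nodes, 16)                    # node features of shape [n_nodes, in_features]
adj_mat = torch.eye(n_nodes, dtype=torch.bool)  # self-loops only, for illustration
adj_mat = adj_mat.unsqueeze(-1)                 # [n_nodes, n_nodes, 1], shared across heads
out = layer(h, adj_mat)
print(out.shape)                                # torch.Size([4, 32]): 4 heads of 8 features, concatenated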
diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 94992d14ed724e493d70106bc168df3eff59673e..cc975e7b799ba8797deb9a9c9a4d9ab3a863ee7c 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -281,7 +281,7 @@