diff --git a/docs/graphs/gatv2/experiment.html b/docs/graphs/gatv2/experiment.html new file mode 100644 index 0000000000000000000000000000000000000000..1e2e318035991f90f524035ed9589d7e9afb820e --- /dev/null +++ b/docs/graphs/gatv2/experiment.html @@ -0,0 +1,1194 @@ + + + + + + + + + + + + + + + + + + + + + + + Train a Graph Attention Network v2 (GATv2) on Cora dataset + + + + + + + + +
+
+
+
+

+ home + graphs + gatv2 +

+

+ + + Github + + Twitter +

+
+
+
+
+ +

Train a Graph Attention Network v2 (GATv2) on Cora dataset

+

View Run

+
+
+
13from typing import Dict
+14
+15import numpy as np
+16import torch
+17from torch import nn
+18
+19from labml import lab, monit, tracker, experiment
+20from labml.configs import BaseConfigs
+21from labml.utils import download
+22from labml_helpers.device import DeviceConfigs
+23from labml_helpers.module import Module
+24from labml_nn.graphs.gatv2 import GraphAttentionV2Layer
+25from labml_nn.optimizers.configs import OptimizerConfigs
+
+
+
+
+ +

Cora Dataset

+

Cora dataset is a dataset of research papers. +For each paper we are given a binary feature vector that indicates the presence of words. +Each paper is classified into one of 7 classes. +The dataset also has the citation network.

+

The papers are the nodes of the graph and the edges are the citations.

+

The task is to classify the nodes (papers) into the 7 classes, with the feature vectors and +citation network as input.

+
+
+
28class CoraDataset:
+
+
+
+
+ +

Labels for each node

+
+
+
43    labels: torch.Tensor
+
+
+
+
+ +

Set of class names, each mapped to a unique integer index

+
+
+
45    classes: Dict[str, int]
+
+
+
+
+ +

Feature vectors for all nodes

+
+
+
47    features: torch.Tensor
+
+
+
+
+ +

Adjacency matrix with the edge information. +adj_mat[i][j] is True if there is an edge from i to j.

+
+
+
50    adj_mat: torch.Tensor
+
+
+
+
+ +

Download the dataset

+
+
+
52    @staticmethod
+53    def _download():
+
+
+
+
+ + +
+
+
57        if not (lab.get_data_path() / 'cora').exists():
+58            download.download_file('https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz',
+59                                   lab.get_data_path() / 'cora.tgz')
+60            download.extract_tar(lab.get_data_path() / 'cora.tgz', lab.get_data_path())
+
+
+
+
+ +

Load the dataset

+
+
+
62    def __init__(self, include_edges: bool = True):
+
+
+
+
+ +

Whether to include edges. +This is to test how much accuracy is lost if we ignore the citation network.

+
+
+
69        self.include_edges = include_edges
+
+
+
+
+ +

Download dataset

+
+
+
72        self._download()
+
+
+
+
+ +

Read the paper ids, feature vectors, and labels

+
+
+
75        with monit.section('Read content file'):
+76            content = np.genfromtxt(str(lab.get_data_path() / 'cora/cora.content'), dtype=np.dtype(str))
+
+
+
+
+ +

Load the citations, it’s a list of pairs of integers.

+
+
+
78        with monit.section('Read citations file'):
+79            citations = np.genfromtxt(str(lab.get_data_path() / 'cora/cora.cites'), dtype=np.int32)
+
+
+
+
+ +

Get the feature vectors

+
+
+
82        features = torch.tensor(np.array(content[:, 1:-1], dtype=np.float32))
+
+
+
+
+ +

Normalize the feature vectors

+
+
+
84        self.features = features / features.sum(dim=1, keepdim=True)
+
+
+
+
+ +

Get the class names and assign a unique integer to each of them

+
+
+
87        self.classes = {s: i for i, s in enumerate(set(content[:, -1]))}
+
+
+
+
+ +

Get the labels as those integers

+
+
+
89        self.labels = torch.tensor([self.classes[i] for i in content[:, -1]], dtype=torch.long)
+
+
+
+
+ +

Get the paper ids

+
+
+
92        paper_ids = np.array(content[:, 0], dtype=np.int32)
+
+
+
+
+ +

Map of paper id to index

+
+
+
94        ids_to_idx = {id_: i for i, id_ in enumerate(paper_ids)}
+
+
+
+
+ +

Empty adjacency matrix - an identity matrix

+
+
+
97        self.adj_mat = torch.eye(len(self.labels), dtype=torch.bool)
+
+
+
+
+ +

Mark the citations in the adjacency matrix

+
+
+
100        if self.include_edges:
+101            for e in citations:
+
+
+
+
+ +

The pair of paper indexes

+
+
+
103                e1, e2 = ids_to_idx[e[0]], ids_to_idx[e[1]]
+
+
+
+
+ +

We build a symmetrical graph, where if paper $i$ referenced +paper $j$ we place an edge from $i$ to $j$ as well as an edge +from $j$ to $i$.

+
+
+
107                self.adj_mat[e1][e2] = True
+108                self.adj_mat[e2][e1] = True
+
+
+
+
+ +

Graph Attention Network v2 (GATv2)

+

This graph attention network has two graph attention layers.

+
+
+
111class GATv2(Module):
+
+
+
+
+ +
    +
  • in_features is the number of features per node
  • +
  • n_hidden is the number of features in the first graph attention layer
  • +
  • n_classes is the number of classes
  • +
  • n_heads is the number of heads in the graph attention layers
  • +
  • dropout is the dropout probability
  • +
  • share_weights if set to True, the same matrix will be applied to the source and the target node of every edge
  • +
+
+
+
118    def __init__(self, in_features: int, n_hidden: int, n_classes: int, n_heads: int, dropout: float, share_weights: bool = True):
+
+
+
+
+ + +
+
+
127        super().__init__()
+
+
+
+
+ +

First graph attention layer where we concatenate the heads

+
+
+
130        self.layer1 = GraphAttentionV2Layer(in_features, n_hidden, n_heads, is_concat=True, dropout=dropout, share_weights=share_weights)
+
+
+
+
+ +

Activation function after first graph attention layer

+
+
+
132        self.activation = nn.ELU()
+
+
+
+
+ +

Final graph attention layer where we average the heads

+
+
+
134        self.output = GraphAttentionV2Layer(n_hidden, n_classes, 1, is_concat=False, dropout=dropout, share_weights=share_weights)
+
+
+
+
+ +

Dropout

+
+
+
136        self.dropout = nn.Dropout(dropout)
+
+
+
+
+ +
    +
  • x is the feature vectors of shape [n_nodes, in_features]
  • +
  • adj_mat is the adjacency matrix of the form + [n_nodes, n_nodes, n_heads] or [n_nodes, n_nodes, 1]
  • +
+
+
+
138    def __call__(self, x: torch.Tensor, adj_mat: torch.Tensor):
+
+
+
+
+ +

Apply dropout to the input

+
+
+
145        x = self.dropout(x)
+
+
+
+
+ +

First graph attention layer

+
+
+
147        x = self.layer1(x, adj_mat)
+
+
+
+
+ +

Activation function

+
+
+
149        x = self.activation(x)
+
+
+
+
+ +

Dropout

+
+
+
151        x = self.dropout(x)
+
+
+
+
+ +

Output layer (without activation) for logits

+
+
+
153        return self.output(x, adj_mat)
+
+
+
+
+ +

A simple function to calculate the accuracy

+
+
+
156def accuracy(output: torch.Tensor, labels: torch.Tensor):
+
+
+
+
+ + +
+
+
160    return output.argmax(dim=-1).eq(labels).sum().item() / len(labels)
+
+
+
+
+ +

Configurations

+
+
+
163class Configs(BaseConfigs):
+
+
+
+
+ +

Model

+
+
+
169    model: GATv2
+
+
+
+
+ +

Number of nodes to train on

+
+
+
171    training_samples: int = 500
+
+
+
+
+ +

Number of features per node in the input

+
+
+
173    in_features: int
+
+
+
+
+ +

Number of features in the first graph attention layer

+
+
+
175    n_hidden: int = 64
+
+
+
+
+ +

Number of heads

+
+
+
177    n_heads: int = 8
+
+
+
+
+ +

Number of classes for classification

+
+
+
179    n_classes: int
+
+
+
+
+ +

Dropout probability

+
+
+
181    dropout: float = 0.6
+
+
+
+
+ +

Whether to include the citation network

+
+
+
183    include_edges: bool = True
+
+
+
+
+ +

Dataset

+
+
+
185    dataset: CoraDataset
+
+
+
+
+ +

Number of training iterations

+
+
+
187    epochs: int = 1_000
+
+
+
+
+ +

Loss function

+
+
+
189    loss_func = nn.CrossEntropyLoss()
+
+
+
+
+ +

Device to train on

+

This creates configs for device, so that +we can change the device by passing a config value

+
+
+
194    device: torch.device = DeviceConfigs()
+
+
+
+
+ +

Optimizer

+
+
+
196    optimizer: torch.optim.Adam
+
+
+
+
+ +

Initialize

+
+
+
198    def initialize(self):
+
+
+
+
+ +

Create the dataset

+
+
+
203        self.dataset = CoraDataset(self.include_edges)
+
+
+
+
+ +

Get the number of classes

+
+
+
205        self.n_classes = len(self.dataset.classes)
+
+
+
+
+ +

Number of features in the input

+
+
+
207        self.in_features = self.dataset.features.shape[1]
+
+
+
+
+ +

Create the model

+
+
+
209        self.model = GATv2(self.in_features, self.n_hidden, self.n_classes, self.n_heads, self.dropout)
+
+
+
+
+ +

Move the model to the device

+
+
+
211        self.model.to(self.device)
+
+
+
+
+ +

Configurable optimizer, so that we can set the configurations +such as learning rate by passing the dictionary later.

+
+
+
214        optimizer_conf = OptimizerConfigs()
+215        optimizer_conf.parameters = self.model.parameters()
+216        self.optimizer = optimizer_conf
+
+
+
+
+ +

Training loop

+

We do full batch training since the dataset is small. +If we were to sample and train we will have to sample a set of +nodes for each training step along with the edges that span +across those selected nodes.

+
+
+
218    def run(self):
+
+
+
+
+ +

Move the feature vectors to the device

+
+
+
228        features = self.dataset.features.to(self.device)
+
+
+
+
+ +

Move the labels to the device

+
+
+
230        labels = self.dataset.labels.to(self.device)
+
+
+
+
+ +

Move the adjacency matrix to the device

+
+
+
232        edges_adj = self.dataset.adj_mat.to(self.device)
+
+
+
+
+ +

Add an empty third dimension for the heads

+
+
+
234        edges_adj = edges_adj.unsqueeze(-1)
+
+
+
+
+ +

Random indexes

+
+
+
237        idx_rand = torch.randperm(len(labels))
+
+
+
+
+ +

Nodes for training

+
+
+
239        idx_train = idx_rand[:self.training_samples]
+
+
+
+
+ +

Nodes for validation

+
+
+
241        idx_valid = idx_rand[self.training_samples:]
+
+
+
+
+ +

Training loop

+
+
+
244        for epoch in monit.loop(self.epochs):
+
+
+
+
+ +

Set the model to training mode

+
+
+
246            self.model.train()
+
+
+
+
+ +

Make all the gradients zero

+
+
+
248            self.optimizer.zero_grad()
+
+
+
+
+ +

Evaluate the model

+
+
+
250            output = self.model(features, edges_adj)
+
+
+
+
+ +

Get the loss for training nodes

+
+
+
252            loss = self.loss_func(output[idx_train], labels[idx_train])
+
+
+
+
+ +

Calculate gradients

+
+
+
254            loss.backward()
+
+
+
+
+ +

Take optimization step

+
+
+
256            self.optimizer.step()
+
+
+
+
+ +

Log the loss

+
+
+
258            tracker.add('loss.train', loss)
+
+
+
+
+ +

Log the accuracy

+
+
+
260            tracker.add('accuracy.train', accuracy(output[idx_train], labels[idx_train]))
+
+
+
+
+ +

Set mode to evaluation mode for validation

+
+
+
263            self.model.eval()
+
+
+
+
+ +

No need to compute gradients

+
+
+
266            with torch.no_grad():
+
+
+
+
+ +

Evaluate the model again

+
+
+
268                output = self.model(features, edges_adj)
+
+
+
+
+ +

Calculate the loss for validation nodes

+
+
+
270                loss = self.loss_func(output[idx_valid], labels[idx_valid])
+
+
+
+
+ +

Log the loss

+
+
+
272                tracker.add('loss.valid', loss)
+
+
+
+
+ +

Log the accuracy

+
+
+
274                tracker.add('accuracy.valid', accuracy(output[idx_valid], labels[idx_valid]))
+
+
+
+
+ +

Save logs

+
+
+
277            tracker.save()
+
+
+
+
+ + +
+
+
280def main():
+
+
+
+
+ +

Create configurations

+
+
+
282    conf = Configs()
+
+
+
+
+ +

Create an experiment

+
+
+
284    experiment.create(name='gatv2')
+
+
+
+
+ +

Calculate configurations.

+
+
+
286    experiment.configs(conf, {
+
+
+
+
+ +

Adam optimizer

+
+
+
288        'optimizer.optimizer': 'Adam',
+289        'optimizer.learning_rate': 5e-3,
+290        'optimizer.weight_decay': 5e-4,
+291    })
+
+
+
+
+ +

Initialize

+
+
+
293    conf.initialize()
+
+
+
+
+ +

Start and watch the experiment

+
+
+
296    with experiment.start():
+
+
+
+
+ +

Run the training

+
+
+
298        conf.run()
+
+
+
+
+ + +
+
+
302if __name__ == '__main__':
+303    main()
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/docs/graphs/gatv2/index.html b/docs/graphs/gatv2/index.html new file mode 100644 index 0000000000000000000000000000000000000000..632027c5a61bf12b34f84cb6633cb9b66e421009 --- /dev/null +++ b/docs/graphs/gatv2/index.html @@ -0,0 +1,589 @@ + + + + + + + + + + + + + + + + + + + + + + + Graph Attention Networks v2 (GATv2) + + + + + + + + +
+
+
+
+

+ home + graphs + gatv2 +

+

+ + + Github + + Twitter +

+
+
+
+
+ +

Graph Attention Networks v2 (GATv2)

+

This is a PyTorch implementation of the GATv2 operator from the paper +How Attentive are Graph Attention Networks?.

+

GATv2s work on graph data. +A graph consists of nodes and edges connecting nodes. +For example, in Cora dataset the nodes are research papers and the edges are citations that +connect the papers.

+

The GATv2 operator fixes the static attention problem of the standard GAT: +since the linear layers in the standard GAT are applied right after each other, the ranking +of attended nodes is unconditioned on the query node. +In contrast, in GATv2, every node can attend to any other node.

+

Here is the training code for training +a two-layer GATv2 on Cora dataset.

+

View Run

+
+
+
29import torch
+30from torch import nn
+31
+32from labml_helpers.module import Module
+
+
+
+
+ +

Graph attention v2 layer

+

This is a single graph attention v2 layer. +A GATv2 is made up of multiple such layers.

+

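It takes +$\mathbf{h} = \{ \overrightarrow{h_1}, \overrightarrow{h_2}, \dots, \overrightarrow{h_N} \}$, +where $\overrightarrow{h_i} \in \mathbb{R}^F$ as input +and outputs +$\mathbf{h'} = \{ \overrightarrow{h'_1}, \overrightarrow{h'_2}, \dots, \overrightarrow{h'_N} \}$, +where $\overrightarrow{h'_i} \in \mathbb{R}^{F'}$.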

+
+
+
35class GraphAttentionV2Layer(Module):
+
+
+
+
+ +
    +
  • in_features, $F$, is the number of input features per node
  • +
  • out_features, $F’$, is the number of output features per node
  • +
  • n_heads, $K$, is the number of attention heads
  • +
  • is_concat whether the multi-head results should be concatenated or averaged
  • +
  • dropout is the dropout probability
  • +
  • leaky_relu_negative_slope is the negative slope for leaky relu activation
  • +
  • share_weights if set to True, the same matrix will be applied to the source and the target node of every edge
  • +
+
+
+
49    def __init__(self, in_features: int, out_features: int, n_heads: int,
+50                 is_concat: bool = True,
+51                 dropout: float = 0.6,
+52                 leaky_relu_negative_slope: float = 0.2, 
+53                 share_weights=False):
+
+
+
+
+ + +
+
+
63        super().__init__()
+64
+65        self.is_concat = is_concat
+66        self.n_heads = n_heads
+67        self.share_weights = share_weights
+
+
+
+
+ +

Calculate the number of dimensions per head

+
+
+
70        if is_concat:
+71            assert out_features % n_heads == 0
+
+
+
+
+ +

If we are concatenating the multiple heads

+
+
+
73            self.n_hidden = out_features // n_heads
+74        else:
+
+
+
+
+ +

If we are averaging the multiple heads

+
+
+
76            self.n_hidden = out_features
+
+
+
+
+ +

Linear layer for initial source transformation; +i.e. to transform the source node embeddings before self-attention

+
+
+
80        self.linear_l = nn.Linear(in_features, self.n_hidden * n_heads, bias=False)
+
+
+
+
+ +

If share_weights is True the same linear layer is used for the target nodes

+
+
+
82        if share_weights:
+83            self.linear_r = self.linear_l
+84        else:
+85            self.linear_r = nn.Linear(in_features, self.n_hidden * n_heads, bias=False)
+
+
+
+
+ +

Linear layer to compute attention score $e_{ij}$

+
+
+
87        self.attn = nn.Linear(self.n_hidden, 1, bias=False)
+
+
+
+
+ +

The activation for attention score $e_{ij}$

+
+
+
89        self.activation = nn.LeakyReLU(negative_slope=leaky_relu_negative_slope)
+
+
+
+
+ +

Softmax to compute attention $\alpha_{ij}$

+
+
+
91        self.softmax = nn.Softmax(dim=1)
+
+
+
+
+ +

Dropout layer to be applied for attention

+
+
+
93        self.dropout = nn.Dropout(dropout)
+
+
+
+
+ +
    +
  • h, $\mathbf{h}$ is the input node embeddings of shape [n_nodes, in_features].
  • +
  • adj_mat is the adjacency matrix of shape [n_nodes, n_nodes, n_heads]. +We use shape [n_nodes, n_nodes, 1] since the adjacency is the same for each head.
  • +
+

The adjacency matrix represents the edges (or connections) among nodes. +adj_mat[i][j] is True if there is an edge from node i to node j.

+
+
+
95    def __call__(self, h: torch.Tensor, adj_mat: torch.Tensor):
+
+
+
+
+ +

Number of nodes

+
+
+
106        n_nodes = h.shape[0]
+
+
+
+
+ +

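The initial transformations, +$$\overrightarrow{{g_l}^k_i} = \mathbf{W_l}^k \overrightarrow{h_i}$$ +$$\overrightarrow{{g_r}^k_i} = \mathbf{W_r}^k \overrightarrow{h_i}$$ +for each head. +We do two linear transformations and then split it up for each head.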

+
+
+
112        g_l = self.linear_l(h).view(n_nodes, self.n_heads, self.n_hidden)
+113        g_r = self.linear_r(h).view(n_nodes, self.n_heads, self.n_hidden)
+
+
+
+
+ +

Calculate attention score

+

We calculate these for each head $k$. We have omitted $\cdot^k$ for simplicity.

+

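+ $$e_{ij} = a(\mathbf{W_l} \overrightarrow{h_i}, \mathbf{W_r} \overrightarrow{h_j}) = a(\overrightarrow{{g_l}_i}, \overrightarrow{{g_r}_j})$$ +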

+

$e_{ij}$ is the attention score (importance) from node $j$ to node $i$. +We calculate this for each head.

+

$a$ is the attention mechanism, that calculates the attention score. +The paper sums +$\overrightarrow{{g_l}_i}$, $\overrightarrow{{g_r}_j}$ +followed by a $\text{LeakyReLU}$ +and does a linear transformation with a weight vector $\mathbf{a} \in \mathbb{R}^{F’}$

+

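+ $$e_{ij} = \mathbf{a}^\top \text{LeakyReLU} \Big( \Big[ \overrightarrow{{g_l}_i} + \overrightarrow{{g_r}_j} \Big] \Big)$$ +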

+
+
+
+
+
+
+
+ +

First we calculate +$\Big[\overrightarrow{{g_l}_i} + \overrightarrow{{g_r}_j} \Big]$ +for all pairs of $i, j$.

+

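g_l_repeat gets +$$\{\overrightarrow{{g_l}_1}, \overrightarrow{{g_l}_2}, \dots, \overrightarrow{{g_l}_N}, \overrightarrow{{g_l}_1}, \overrightarrow{{g_l}_2}, \dots, \overrightarrow{{g_l}_N}, \dots\}$$ +where each node embedding is repeated n_nodes times.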

+
+
+
145        g_l_repeat = g_l.repeat(n_nodes, 1, 1)
+
+
+
+
+ +

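g_r_repeat_interleave gets +$$\{\overrightarrow{{g_r}_1}, \overrightarrow{{g_r}_1}, \dots, \overrightarrow{{g_r}_1}, \overrightarrow{{g_r}_2}, \overrightarrow{{g_r}_2}, \dots, \overrightarrow{{g_r}_2}, \dots\}$$ +where each node embedding is repeated n_nodes times.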

+
+
+
150        g_r_repeat_interleave = g_r.repeat_interleave(n_nodes, dim=0)
+
+
+
+
+ +

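Now we sum the two sequences element-wise to get +$$\{\overrightarrow{{g_l}_1} + \overrightarrow{{g_r}_1}, \overrightarrow{{g_l}_2} + \overrightarrow{{g_r}_1}, \dots, \overrightarrow{{g_l}_N} + \overrightarrow{{g_r}_1}, \overrightarrow{{g_l}_1} + \overrightarrow{{g_r}_2}, \dots\}$$ +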

+
+
+
158        g_sum = g_l_repeat + g_r_repeat_interleave
+
+
+
+
+ +

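Reshape so that g_sum[i, j] is $\overrightarrow{{g_l}_j} + \overrightarrow{{g_r}_i}$ (the flattened position is i * n_nodes + j, where the tiled g_l contributes node j and the interleaved g_r contributes node i)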

+
+
+
160        g_sum = g_sum.view(n_nodes, n_nodes, self.n_heads, self.n_hidden)
+
+
+
+
+ +

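Calculate +$$e_{ij} = \mathbf{a}^\top \text{LeakyReLU} \Big( \Big[ \overrightarrow{{g_l}_i} + \overrightarrow{{g_r}_j} \Big] \Big)$$ +e is of shape [n_nodes, n_nodes, n_heads, 1]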

+
+
+
168        e = self.attn(self.activation(g_sum))
+
+
+
+
+ +

Remove the last dimension of size 1

+
+
+
170        e = e.squeeze(-1)
+
+
+
+
+ +

The adjacency matrix should have shape +[n_nodes, n_nodes, n_heads] or [n_nodes, n_nodes, 1]

+
+
+
174        assert adj_mat.shape[0] == 1 or adj_mat.shape[0] == n_nodes
+175        assert adj_mat.shape[1] == 1 or adj_mat.shape[1] == n_nodes
+176        assert adj_mat.shape[2] == 1 or adj_mat.shape[2] == self.n_heads
+
+
+
+
+ +

Mask $e_{ij}$ based on adjacency matrix. +$e_{ij}$ is set to $- \infty$ if there is no edge from $i$ to $j$.

+
+
+
179        e = e.masked_fill(adj_mat == 0, float('-inf'))
+
+
+
+
+ +

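We then normalize attention scores (or coefficients) +$$\alpha_{ij} = \text{softmax}_j(e_{ij}) = \frac{\exp(e_{ij})}{\sum_{j' \in \mathcal{N}_i} \exp(e_{ij'})}$$ +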

+

where $\mathcal{N}_i$ is the set of nodes connected to $i$.

+

We do this by setting unconnected $e_{ij}$ to $- \infty$ which +makes $\exp(e_{ij}) \sim 0$ for unconnected pairs.

+
+
+
189        a = self.softmax(e)
+
+
+
+
+ +

Apply dropout regularization

+
+
+
192        a = self.dropout(a)
+
+
+
+
+ +

Calculate final output for each head +$$\overrightarrow{h'^k_i} = \sum_{j \in \mathcal{N}_i} \alpha^k_{ij} \overrightarrow{{g_r}_{j,k}}$$ +

+
+
+
196        attn_res = torch.einsum('ijh,jhf->ihf', a, g_r)
+
+
+
+
+ +

Concatenate the heads

+
+
+
199        if self.is_concat:
+
+
+
+
+ +

+ $$\overrightarrow{h'_i} = \Bigg\Vert_{k=1}^{K} \overrightarrow{h'^k_i}$$ +

+
+
+
201            return attn_res.reshape(n_nodes, self.n_heads * self.n_hidden)
+
+
+
+
+ +

Take the mean of the heads

+
+
+
203        else:
+
+
+
+
+ +

+ $$\overrightarrow{h'_i} = \frac{1}{K} \sum_{k=1}^{K} \overrightarrow{h'^k_i}$$ +

+
+
+
205            return attn_res.mean(dim=1)
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/docs/graphs/gatv2/readme.html b/docs/graphs/gatv2/readme.html new file mode 100644 index 0000000000000000000000000000000000000000..1b713f3f827b06b879ae67e29d03954dbc93ed9e --- /dev/null +++ b/docs/graphs/gatv2/readme.html @@ -0,0 +1,149 @@ + + + + + + + + + + + + + + + + + + + + + + + Graph Attention Networks v2 (GATv2) + + + + + + + + +
+
+
+
+

+ home + graphs + gatv2 +

+

+ + + Github + + Twitter +

+
+
+
+
+ +

Graph Attention Networks v2 (GATv2)

+

This is a PyTorch implementation of the GATv2 operator from the paper +How Attentive are Graph Attention Networks?.

+

GATv2s work on graph data. +A graph consists of nodes and edges connecting nodes. +For example, in Cora dataset the nodes are research papers and the edges are citations that +connect the papers.

+

The GATv2 operator fixes the static attention problem of the standard GAT: +since the linear layers in the standard GAT are applied right after each other, the ranking +of attended nodes is unconditioned on the query node. +In contrast, in GATv2, every node can attend to any other node.

+

Here is the training code for training +a two-layer GATv2 on Cora dataset.

+

View Run

+
+
+ +
+
+
+ + + + + + + \ No newline at end of file diff --git a/docs/graphs/index.html b/docs/graphs/index.html index 20bad842c070f2b268d28ffb3bdb51e55ae6ff43..fbf7dea99c1734518e9474eb99c90f57b6b67e48 100644 --- a/docs/graphs/index.html +++ b/docs/graphs/index.html @@ -69,6 +69,7 @@

Graph Neural Networks

diff --git a/docs/index.html b/docs/index.html index 6d8df374c239d9140829d69472367de56c29bbbd..56060389ad6ec14e9a85d54484b74d37e953fa08 100644 --- a/docs/index.html +++ b/docs/index.html @@ -115,6 +115,7 @@ implementations.

✨ Graph Neural Networks

Counterfactual Regret Minimization (CFR)

Solving games with incomplete information such as poker with CFR.

diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 94992d14ed724e493d70106bc168df3eff59673e..cc975e7b799ba8797deb9a9c9a4d9ab3a863ee7c 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -281,7 +281,7 @@ https://nn.labml.ai/index.html - 2021-07-17T16:30:00+00:00 + 2021-07-25T16:30:00+00:00 1.00 @@ -741,9 +741,23 @@ + + https://nn.labml.ai/graphs/gatv2/index.html + 2021-07-25T16:30:00+00:00 + 1.00 + + + + + https://nn.labml.ai/graphs/gatv2/experiment.html + 2021-07-25T16:30:00+00:00 + 1.00 + + + https://nn.labml.ai/graphs/index.html - 2021-07-08T16:30:00+00:00 + 2021-07-25T16:30:00+00:00 1.00 diff --git a/labml_nn/graphs/gatv2/__init__.py b/labml_nn/graphs/gatv2/__init__.py index 678e789df73f9dd154e467390aa0e2452180f515..9935205c1e58f504380f821c052483aa3fb5c380 100644 --- a/labml_nn/graphs/gatv2/__init__.py +++ b/labml_nn/graphs/gatv2/__init__.py @@ -117,7 +117,7 @@ class GraphAttentionV2Layer(Module): # We calculate these for each head $k$. *We have omitted $\cdot^k$ for simplicity*. # # $$e_{ij} = a(\mathbf{W_l} \overrightarrow{h_i}, \mathbf{W_r} \overrightarrow{h_j}) = - # a(\overrightarrow{{g_l}_i}}, \overrightarrow{{g_r}_j}})$$ + # a(\overrightarrow{{g_l}_i}, \overrightarrow{{g_r}_j})$$ # # $e_{ij}$ is the attention score (importance) from node $j$ to node $i$. # We calculate this for each head. @@ -131,7 +131,7 @@ class GraphAttentionV2Layer(Module): # # $$e_{ij} = \mathbf{a}^\top \text{LeakyReLU} \Big( # \Big[ - # \overrightarrow{{g_l}_i}} + \overrightarrow{{g_r}_j}} + # \overrightarrow{{g_l}_i} + \overrightarrow{{g_r}_j} # \Big] \Big)$$ # First we calculate diff --git a/readme.md b/readme.md index 64b505dd55e7f6716a15a1f237f8fe6eb93a45d5..2b9165d438beea61e9911c22075210bbf57b1648 100644 --- a/readme.md +++ b/readme.md @@ -62,6 +62,7 @@ implementations almost weekly. 
#### ✨ Graph Neural Networks * [Graph Attention Networks (GAT)](https://nn.labml.ai/graphs/gat/index.html) +* [Graph Attention Networks v2 (GATv2)](https://nn.labml.ai/graphs/gatv2/index.html) #### ✨ [Counterfactual Regret Minimization (CFR)](https://nn.labml.ai/cfr/index.html)