global attention layer

rusty1s · rusty1s · commit c4ebbbfb4896 · 2019-01-15T19:51:09.000+01:00
diff --git a/README.md b/README.md
@@ -132,6 +132,7 @@ In detail, the following methods are currently implemented:
 * **[XConv](https://rusty1s.github.io/pytorch_geometric/build/html/modules/nn.html#torch_geometric.nn.conv.XConv)** from Li *et al.*: [PointCNN: Convolution On X-Transformed Points](https://arxiv.org/abs/1801.07791) (NeurIPS 2018)
 * **[GMMConv](https://rusty1s.github.io/pytorch_geometric/build/html/modules/nn.html#torch_geometric.nn.conv.GMMConv)** from Monti *et al.*: [Geometric Deep Learning on Graphs and Manifolds using Mixture Model CNNs](https://arxiv.org/abs/1612.00593) (CVPR 2017)
 * A **[MetaLayer](https://rusty1s.github.io/pytorch_geometric/build/html/modules/nn.html#torch_geometric.nn.meta.MetaLayer)** for building any kind of graph network similar to the [TensorFlow Graph Nets library](https://github.com/deepmind/graph_nets) from Battaglia *et al.*: [Relational Inductive Biases, Deep Learning, and Graph Networks](https://arxiv.org/abs/1806.01261) (CoRR 2018)
+* **[GlobalAttention](https://rusty1s.github.io/pytorch_geometric/build/html/modules/nn.html#torch_geometric.nn.glob.GlobalAttention)** from Li *et al.*: [Gated Graph Sequence Neural Networks](https://arxiv.org/abs/1511.05493) (ICLR 2016)
 * **[Set2Set](https://rusty1s.github.io/pytorch_geometric/build/html/modules/nn.html#torch_geometric.nn.glob.Set2Set)** from Vinyals *et al.*: [Order Matters: Sequence to Sequence for Sets](https://arxiv.org/abs/1511.06391) (ICLR 2016)
 * **[Sort Pool](https://rusty1s.github.io/pytorch_geometric/build/html/modules/nn.html#torch_geometric.nn.glob.global_sort_pool)** from Zhang *et al.*: [An End-to-End Deep Learning Architecture for Graph Classification](https://www.cse.wustl.edu/~muhan/papers/AAAI_2018_DGCNN.pdf) (AAAI 2018)
 * **[Dense Differentiable Pooling](https://rusty1s.github.io/pytorch_geometric/build/html/modules/nn.html#torch_geometric.nn.dense.diff_pool.dense_diff_pool)** from Ying *et al.*: [Hierarchical Graph Representation Learning with Differentiable Pooling](https://arxiv.org/abs/1806.08804) (NeurIPS 2018)
diff --git a/test/nn/glob/test_attention.py b/test/nn/glob/test_attention.py
@@ -0,0 +1,32 @@
+import torch
+from torch.nn import Sequential as Seq, Linear as Lin, ReLU
+from torch_geometric.nn import GlobalAttention
+
+
+def test_global_attention():
+    """Check the repr and the pooled output shapes of GlobalAttention."""
+    dim, num_graphs = 32, 10
+    gate_net = Seq(Lin(dim, dim), ReLU(), Lin(dim, 1))
+    feat_net = Seq(Lin(dim, dim), ReLU(), Lin(dim, dim))
+    pool = GlobalAttention(gate_net, feat_net)
+
+    expected_repr = (
+        'GlobalAttention(gate_nn=Sequential(\n'
+        '  (0): Linear(in_features=32, out_features=32, bias=True)\n'
+        '  (1): ReLU()\n'
+        '  (2): Linear(in_features=32, out_features=1, bias=True)\n'
+        '), nn=Sequential(\n'
+        '  (0): Linear(in_features=32, out_features=32, bias=True)\n'
+        '  (1): ReLU()\n'
+        '  (2): Linear(in_features=32, out_features=32, bias=True)\n'
+        '))')
+    assert repr(pool) == expected_repr
+
+    # Every graph owns `num_graphs` consecutive nodes: [0, ..., 0, 1, ...].
+    batch = torch.arange(num_graphs, dtype=torch.long)
+    batch = batch.view(-1, 1).repeat(1, num_graphs).view(-1)
+    x = torch.randn((num_graphs * num_graphs, dim))
+
+    # Implicit size first, then an explicit size with one extra, empty graph.
+    assert pool(x, batch).size() == (num_graphs, dim)
+    assert pool(x, batch, num_graphs + 1).size() == (num_graphs + 1, dim)
diff --git a/torch_geometric/nn/conv/point_conv.py b/torch_geometric/nn/conv/point_conv.py
@@ -14,7 +14,7 @@ class PointConv(torch.nn.Module):
     .. math::
         \mathbf{x}^{\prime}_i = \gamma_{\mathbf{\Theta}} \left( \max_{j \in
         \mathcal{N}(i) \cup \{ i \}} h_{\mathbf{\Theta}} ( \mathbf{x}_j \,
-        \Vert \, \mathbf{p}_j - \mathbf{p}_i) \right)
+        \Vert \, \mathbf{p}_j - \mathbf{p}_i) \right),
 
     where :math:`\gamma_{\mathbf{\Theta}}` and
     :math:`h_{\mathbf{\Theta}}` denote neural
diff --git a/torch_geometric/nn/glob/__init__.py b/torch_geometric/nn/glob/__init__.py
@@ -1,11 +1,13 @@
 from .glob import global_add_pool, global_mean_pool, global_max_pool
 from .sort import global_sort_pool
+from .attention import GlobalAttention
 from .set2set import Set2Set
 
 __all__ = [
     'global_add_pool',
     'global_mean_pool',
     'global_max_pool',
     'global_sort_pool',
+    'GlobalAttention',
     'Set2Set',
 ]
diff --git a/torch_geometric/nn/glob/attention.py b/torch_geometric/nn/glob/attention.py
@@ -0,0 +1,72 @@
+import torch
+from torch_geometric.utils import softmax, scatter_
+
+from ..inits import reset
+
+
+class GlobalAttention(torch.nn.Module):
+    r"""Global soft attention layer from the `"Gated Graph Sequence Neural
+    Networks" <https://arxiv.org/abs/1511.05493>`_ paper.
+
+    .. math::
+        \mathbf{r}_i = \sum_{n=1}^{N_i} \mathrm{softmax} \left(
+        h_{\mathrm{gate}} ( \mathbf{x}_n ) \right) \odot
+        h_{\mathbf{\Theta}} ( \mathbf{x}_n ),
+
+    where :math:`h_{\mathrm{gate}} \colon \mathbb{R}^F \to
+    \mathbb{R}` and :math:`h_{\mathbf{\Theta}}` denote neural networks, *i.e.*
+    MLPs. The softmax is taken separately over the nodes of each graph
+    :math:`i`, as given by the :obj:`batch` vector passed to :meth:`forward`.
+
+    Args:
+        gate_nn (nn.Sequential): Neural network
+            :math:`h_{\mathrm{gate}} \colon \mathbb{R}^F \to \mathbb{R}`.
+        nn (nn.Sequential, optional): Neural network
+            :math:`h_{\mathbf{\Theta}}`. If :obj:`None`, the node features
+            are used unchanged. (default: :obj:`None`)
+    """
+
+    def __init__(self, gate_nn, nn=None):
+        super(GlobalAttention, self).__init__()
+        self.gate_nn = gate_nn
+        self.nn = nn
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        r"""Resets the parameters of :obj:`gate_nn` and, if set, :obj:`nn`."""
+        reset(self.gate_nn)
+        # `nn` is optional; never hand `None` to `reset`.
+        if self.nn is not None:
+            reset(self.nn)
+
+    def forward(self, x, batch, size=None):
+        r"""Pools node features into one vector per graph.
+
+        Args:
+            x (Tensor): Node features; a 1-D input is treated as one feature
+                per node.
+            batch (LongTensor): Graph index of every node.
+            size (int, optional): Number of graphs. If :obj:`None`, it is
+                inferred as :obj:`batch.max() + 1`. (default: :obj:`None`)
+
+        :rtype: :class:`Tensor`
+        """
+        x = x.unsqueeze(-1) if x.dim() == 1 else x
+        # Use the maximum rather than the last entry so that the inferred
+        # size does not rely on `batch` being sorted.
+        size = batch.max().item() + 1 if size is None else size
+
+        gate = self.gate_nn(x).view(-1, 1)
+        x = self.nn(x) if self.nn is not None else x
+        assert gate.dim() == x.dim() and gate.size(0) == x.size(0)
+
+        # Normalize the gate scores per graph, then sum the weighted features.
+        gate = softmax(gate, batch, size)
+        out = scatter_('add', gate * x, batch, size)
+
+        return out
+
+    def __repr__(self):
+        return '{}(gate_nn={}, nn={})'.format(self.__class__.__name__,
+                                              self.gate_nn, self.nn)
diff --git a/torch_geometric/utils/softmax.py b/torch_geometric/utils/softmax.py
@@ -10,9 +10,8 @@ def softmax(src, index, num_nodes=None):
     Args:
         src (Tensor): The source tensor.
         index (LongTensor): The indices of elements for applying the softmax.
-        num_nodes (int, optional): Automatically create output tensor with size
-            :attr:`num_nodes` in the first dimension. If set to :attr:`None`, a
-            minimal sized output tensor is returned. (default: :obj:`None`)
+        num_nodes (int, optional): The number of nodes, *i.e.*
+            :obj:`max_val + 1` of :attr:`index`. (default: :obj:`None`)
 
     :rtype: :class:`Tensor`
     """