SpMM message passing CUDA support for coalesced COO graphs (#617)

dferre97 · web-flow · commit 694a33e5908d · 2025-09-21T07:59:11.000+02:00
* Enhance CUDA support by updating adjacency_matrix and propagate functions for COO graphs

* Swap edge encoding order in coalesce to work around a CUDA.jl issue

* Update comments to clarify coalesce behavior

* Add custom _adjacency_matrix for propagate CUDA COO graphs

- Keep the public adjacency_matrix interface uniform: it always returns a sparse adjacency matrix
- Implement a custom _adjacency_matrix, used by the copy_xj propagate path for CUDA COO graphs, that returns a dense matrix when the graph is not coalesced (more efficient)

* Fix imports

* Update GPU compatibility checks for COO CUDA

* Add @non_differentiable annotation to _adjacency_matrix function

* Add tests for coalesced COO graphs

* Remove debug statements
diff --git a/GNNGraphs/ext/GNNGraphsCUDAExt.jl b/GNNGraphs/ext/GNNGraphsCUDAExt.jl
@@ -5,8 +5,10 @@ using Random, Statistics, LinearAlgebra
 using GNNGraphs
 using GNNGraphs: COO_T, ADJMAT_T, SPARSE_T 
 using SparseArrays
+using Graphs
 
 const CUMAT_T = Union{CUDA.AnyCuMatrix, CUDA.CUSPARSE.CuSparseMatrix}
+const CUDA_COO_T = Tuple{T, T, V} where {T <: AnyCuArray{<:Integer}, V <: Union{Nothing, AnyCuArray}}
 
 # Query 
 
@@ -35,5 +37,31 @@ function sort_edge_index(u::AnyCuArray, v::AnyCuArray)
     sort_edge_index(u, v) |> dev
 end
 
+# Convert
+
+function GNNGraphs.to_sparse(coo::CUDA_COO_T, T = nothing; dir = :out, num_nodes = nothing,
+                   weighted = true, is_coalesced = false)  # -> (A, num_nodes, num_edges)
+    s, t, eweight = coo  # source indices, target indices, edge weights (or nothing)
+    T = T === nothing ? (eweight === nothing ? eltype(s) : eltype(eweight)) : T
+    # Goal: sparse matrix A built from the COO tuple, with element type T.
+    if eweight === nothing || !weighted
+        eweight = fill!(similar(s, T), 1)  # unit weights of type T
+    end
+    # Without an explicit num_nodes, use the largest index appearing in s or t.
+    num_nodes::Int = isnothing(num_nodes) ? max(maximum(s), maximum(t)) : num_nodes
+    # NOTE(review): the `dir` keyword is accepted but never read in this method.
+    # If the caller says the graph is coalesced, build the CUSPARSE COO matrix directly.
+    if is_coalesced
+        A = CUDA.CUSPARSE.CuSparseMatrixCOO{T,eltype(s)}(s, t, eweight, (num_nodes, num_nodes)) 
+    else
+        A = sparse(s, t, eweight, num_nodes, num_nodes)
+    end
+    # num_edges counts stored entries; values are cast only if eltype differs from T.
+    num_edges::Int = nnz(A)
+    if eltype(A) != T
+        A = T.(A)
+    end
+    return A, num_nodes, num_edges
+end
 
 end #module
diff --git a/GNNGraphs/src/gnngraph.jl b/GNNGraphs/src/gnngraph.jl
@@ -113,7 +113,7 @@ struct GNNGraph{T <: Union{COO_T, ADJMAT_T}} <: AbstractGNNGraph{T}
     ndata::DataStore
     edata::DataStore
     gdata::DataStore
-    is_coalesced::Bool # only for :coo, true if the graph is coalesced, i.e., indices ordered by row and no multi edges
+    is_coalesced::Bool # only for :coo, true if the graph is coalesced, i.e., no multi edges and indices ordered by target, then source
 end
 
 # GNNGraph constructor setting the is_coalesced field to false
diff --git a/GNNGraphs/src/query.jl b/GNNGraphs/src/query.jl
@@ -231,13 +231,7 @@ If `weighted=true`, the `A` will contain the edge weights if any, otherwise the
 """
 function Graphs.adjacency_matrix(g::GNNGraph{<:COO_T}, T::DataType = eltype(g); dir = :out,
                                  weighted = true)
-    if iscuarray(g.graph[1])
-        # Revisit after 
-        # https://github.com/JuliaGPU/CUDA.jl/issues/1113
-        A, n, m = to_dense(g.graph, T; num_nodes = g.num_nodes, weighted)
-    else
-        A, n, m = to_sparse(g.graph, T; num_nodes = g.num_nodes, weighted)
-    end
+    A, n, m = to_sparse(g.graph, T; num_nodes = g.num_nodes, weighted)
     @assert size(A) == (n, n)
     return dir == :out ? A : A'
 end
diff --git a/GNNGraphs/src/transform.jl b/GNNGraphs/src/transform.jl
@@ -148,7 +148,7 @@ end
 """
     coalesce(g::GNNGraph; aggr=+)
 
-Return a new GNNGraph where all multiple edges between the same pair of nodes are merged (using aggr for edge weights and features), and the edge indices are sorted lexicographically (by source, then target).
+Return a new GNNGraph where all multiple edges between the same pair of nodes are merged (using aggr for edge weights and features), and the edge indices are sorted lexicographically (by target, then by source).
 This method is only applicable to graphs of type `:coo`.
 
 `aggr` can take value `+`,`min`, `max` or `mean`.
@@ -158,7 +158,8 @@ function Base.coalesce(g::GNNGraph{<:COO_T}; aggr = +)
     w = get_edge_weight(g)
     edata = g.edata
     num_edges = g.num_edges
-    idxs, idxmax = edge_encoding(s, t, g.num_nodes)
+    # order by target first and then by source to work around CUDA.jl issue: https://github.com/JuliaGPU/CUDA.jl/issues/2820
+    idxs, idxmax = edge_encoding(t, s, g.num_nodes)
 
     perm = sortperm(idxs)
     idxs = idxs[perm]
diff --git a/GNNGraphs/test/gnngraph.jl b/GNNGraphs/test/gnngraph.jl
@@ -99,13 +99,14 @@ end
                 mat_gpu = adjacency_matrix(g_gpu)
                 @test mat_gpu isa AbstractMatrix{Int}
                 @test get_device(mat_gpu) isa AbstractGPUDevice
-                @test Array(mat_gpu) == adj_mat
+                # Convert to Float32 first: CUSPARSE has poor Int support and the comparison throws an error otherwise
+                @test Array(Float32.(mat_gpu)) == Float32.(adj_mat)
             end
         end
 
         @testset "normalized_laplacian" begin
             mat = normalized_laplacian(g)
-            if TEST_GPU && !(dev isa MetalDevice) && GRAPH_T != :sparse
+            if TEST_GPU && !(dev isa MetalDevice) && GRAPH_T != :sparse && GRAPH_T != :coo
                 mat_gpu = normalized_laplacian(g_gpu)
                 @test mat_gpu isa AbstractMatrix{Float32}
                 @test get_device(mat_gpu)isa AbstractGPUDevice
@@ -114,7 +115,7 @@ end
         end
 
         @testset "scaled_laplacian" begin 
-            if TEST_GPU && !(dev isa MetalDevice) && GRAPH_T != :sparse
+            if TEST_GPU && !(dev isa MetalDevice) && GRAPH_T != :sparse && GRAPH_T != :coo
                 mat = scaled_laplacian(g)
                 mat_gpu = scaled_laplacian(g_gpu)
                 @test mat_gpu isa AbstractMatrix{Float32}
diff --git a/GNNGraphs/test/transform.jl b/GNNGraphs/test/transform.jl
@@ -456,8 +456,10 @@ end
 
             s2, t2 = edge_index(g2)
             w2 = get_edge_weight(g2)
-            @test s2 == [1, 2, 2, 3, 3, 4, 4]
-            @test t2 == [2, 1, 3, 2, 4, 3, 4]
+            # @test s2 == [1, 2, 2, 3, 3, 4, 4]
+            # @test t2 == [2, 1, 3, 2, 4, 3, 4]
+            @test s2 == [2, 1, 3, 2, 4, 3, 4]
+            @test t2 == [1, 2, 2, 3, 3, 4, 4]
             @test w2 == [1, 1, 2, 2, 3.5, 3.5, 5]
             @test g2.edata.e == [10.0, 10.0, 20.0, 20.0, 35.0, 35.0, 50.0]
         end 
diff --git a/GNNlib/ext/GNNlibCUDAExt.jl b/GNNlib/ext/GNNlibCUDAExt.jl
@@ -3,7 +3,10 @@ module GNNlibCUDAExt
 using CUDA
 using Random, Statistics, LinearAlgebra
 using GNNlib: GNNlib, propagate, copy_xj, e_mul_xj, w_mul_xj
-using GNNGraphs: GNNGraph, COO_T, SPARSE_T
+using GNNGraphs: GNNGraph, COO_T, SPARSE_T, to_dense, to_sparse
+using ChainRulesCore: @non_differentiable
+
+const CUDA_COO_T = Tuple{T, T, V} where {T <: AnyCuArray{<:Integer}, V <: Union{Nothing, AnyCuArray}}
 
 ###### PROPAGATE SPECIALIZATIONS ####################
 
@@ -12,7 +15,9 @@ using GNNGraphs: GNNGraph, COO_T, SPARSE_T
 ## avoid the fast path on gpu until we have better cuda support
 function GNNlib.propagate(::typeof(copy_xj), g::GNNGraph{<:COO_T}, ::typeof(+),
         xi, xj::AnyCuMatrix, e)
-    propagate((xi, xj, e) -> copy_xj(xi, xj, e), g, +, xi, xj, e)
+    A = _adjacency_matrix(g, eltype(xj); weighted = false)
+
+    return xj * A
 end
 
 ## E_MUL_XJ 
@@ -42,4 +47,21 @@ end
 
 # Flux.Zygote.@nograd compute_degree
 
+## CUSTOM ADJACENCY_MATRIX IMPLEMENTATION FOR CUDA COO GRAPHS: returns a dense matrix when the graph is not coalesced, which is more efficient
+
+function _adjacency_matrix(g::GNNGraph{<:CUDA_COO_T}, T::DataType = eltype(g); dir = :out,
+                                 weighted = true)  # internal helper called by propagate
+    if !g.is_coalesced
+        # Dense fallback: building a sparse matrix from uncoalesced indices is slow.
+        # Revisit after https://github.com/JuliaGPU/CUDA.jl/issues/1113
+        A, n, m = to_dense(g.graph, T; num_nodes = g.num_nodes, weighted)
+    else
+        A, n, m = to_sparse(g.graph, T; num_nodes = g.num_nodes, weighted, is_coalesced = true)
+    end
+    @assert size(A) == (n, n)  # sanity check; m (third returned value) is not used here
+    return dir == :out ? A : A'  # :out returns A itself, any other dir returns A'
+end
+
+@non_differentiable _adjacency_matrix(x...)
+
 end #module
diff --git a/GNNlib/test/test_module.jl b/GNNlib/test/test_module.jl
@@ -150,7 +150,7 @@ function test_gradients(
     return true
 end
 
-function generate_test_graphs(graph_type)
+function generate_test_graphs(graph_type; do_coalesce=false)
     adj1 = [0 1 0 1
             1 0 1 0
             0 1 0 1
@@ -168,12 +168,18 @@ function generate_test_graphs(graph_type)
     g_single_vertex = GNNGraph(adj_single_vertex,
                                 ndata = rand(Float32, D_IN, 4);
                                 graph_type)
+    
+    if graph_type == :coo && do_coalesce
+        g1 = coalesce(g1)
+        g_single_vertex = coalesce(g_single_vertex)
+    end
 
     return (g1, g_single_vertex)
 end
 
 GRAPH_TYPES = [:coo, :dense, :sparse]
 TEST_GRAPHS = [generate_test_graphs(:coo)...,
+               generate_test_graphs(:coo, do_coalesce=true)...,
                generate_test_graphs(:dense)...,
                generate_test_graphs(:sparse)...]
 
diff --git a/GraphNeuralNetworks/test/layers/conv.jl b/GraphNeuralNetworks/test/layers/conv.jl
@@ -108,7 +108,7 @@ end
         
         if gpu_backend() == "AMDGPU"
             broken = true
-        elseif gpu_backend() == "CUDA" && get_graph_type(g) == :sparse
+        elseif gpu_backend() == "CUDA" && get_graph_type(g) in [:coo, :sparse]
             broken = true
         else
             broken = false
diff --git a/GraphNeuralNetworks/test/test_module.jl b/GraphNeuralNetworks/test/test_module.jl
@@ -157,7 +157,7 @@ function test_gradients(
 end
 
 
-function generate_test_graphs(graph_type)
+function generate_test_graphs(graph_type; do_coalesce=false)
     adj1 = [0 1 0 1
             1 0 1 0
             0 1 0 1
@@ -175,12 +175,18 @@ function generate_test_graphs(graph_type)
     g_single_vertex = GNNGraph(adj_single_vertex,
                                 ndata = rand(Float32, D_IN, 4);
                                 graph_type)
+    
+    if graph_type == :coo && do_coalesce
+        g1 = coalesce(g1)
+        g_single_vertex = coalesce(g_single_vertex)
+    end
 
     return (g1, g_single_vertex)
 end
 
 GRAPH_TYPES = [:coo, :dense, :sparse]
 TEST_GRAPHS = [generate_test_graphs(:coo)...,
+               generate_test_graphs(:coo, do_coalesce=true)...,
                generate_test_graphs(:dense)...,
                generate_test_graphs(:sparse)...]