`SGConv` layer exhibits progressively increasing runtime in training loop

As the title says, the `SGConv` layer seems to exhibit progressively increasing runtimes when differentiated with Zygote in a training loop. This does happen on both CPU and GPU, but does neither happen for other layers, nor when using it purely for inference without Zygote. The problem does also not show up for Flux builtin layers. The issue happens in the same way on multiple machines, but I haven't been able to check it with Enzyme or ForwardDiff. 

I created a number of visualizations to show more clearly what's going on: 

![Image](https://github.com/user-attachments/assets/34ffa3b3-f40c-413e-b163-1604448dd571)
![Image](https://github.com/user-attachments/assets/750fd604-f3ca-429f-a528-6eb57ed09a9e)
![Image](https://github.com/user-attachments/assets/829a6757-503a-4a1e-bae7-e6d8543f6e53)
![Image](https://github.com/user-attachments/assets/aefd98ab-9e7d-46e0-b6cb-6e22dc7b2349)
![Image](https://github.com/user-attachments/assets/7182f6b6-8bd8-4479-8018-84e5b312e20e)

Code that produced the plots: 

```Julia 
begin
    import Flux
    import GraphNeuralNetworks as GNN
    import CUDA
    import MLUtils
    import CairoMakie
    import Statistics
    CUDA.allowscalar(false)

"""
Test script for the SGConv layer issue in the GraphNeuralNetworks.jl package.
"""

"""
Generate a set of community graphs for testing purposes.
"""
function generate_dummy_graphs(n_samples = 100)
    all_graphs = GNN.GNNGraph[]

    for i in 1:n_samples
        n_nodes = 120

        # random adjacency matrix
        A = Float32.(rand(Float32, n_nodes, n_nodes) .> 0.5)

        # Create simple dummy node features
        features = ones(Float32, 1, n_nodes)

        # Create GNNGraph
        g = GNN.GNNGraph(A, ndata = (x = features,))

        push!(all_graphs, g) # make it symmetric
    end
    return all_graphs
end

################################################################################################################
# build data loader
graph_loader = let
    data = generate_dummy_graphs(50000)
    data_loader = Flux.DataLoader(data, batchsize = 50, shuffle = true, collate = true)
end

################################################################################################################
# test functions
function run_tests(layer, plotname, plottitle, data_loader)
    times = Float64[]

    for bg in data_loader
        t = @elapsed Flux.withgradient(layer) do model
            A = model(bg, bg.ndata.x)
            return sum(A)
        end
        push!(times, t)
    end

    fig = CairoMakie.Figure(size = (600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title = plottitle,
        xlabel = "Batch Index", ylabel = "Time (s)")
    CairoMakie.scatterlines!(ax, times, label = "Training Time", markersize = 6)
    CairoMakie.ylims!(ax, minimum(times)*0.9, 1.1*Statistics.quantile(times, 0.99))
    @show fig
    CairoMakie.save("$(plotname).png", fig)
end

function run_tests_cuda(layer, plotname, plottitle, data_loader)
    CUDA.device_reset!() # reset the GPU device
    CUDA.reclaim() # reclaim GPU memory
    times = Float64[]
    cuda_layer = layer |> Flux.gpu # move to gpu
    for bg in data_loader
        bg_gpu = bg |> Flux.gpu # move to gpu

        t = CUDA.@elapsed Flux.withgradient(cuda_layer) do model
            A = model(bg_gpu, bg_gpu.ndata.x)
            return sum(A)
        end

        push!(times, t)
    end

    fig = CairoMakie.Figure(size = (600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title = plottitle,
        xlabel = "Batch Index", ylabel = "Time (s)")
    CairoMakie.scatterlines!(ax, times, label = "Training Time", markersize = 6)
    CairoMakie.ylims!(ax, minimum(times)*0.9, 1.1*Statistics.quantile(times, 0.99))
    @show fig
    CairoMakie.save("$(plotname).png", fig)
end

run_tests(GNN.SGConv(1 => 64, 1; bias = false, add_self_loops = true), "sgconv_cpu", "Differentiation Time per batch CPU", graph_loader)

run_tests_cuda(GNN.SGConv(1 => 64, 1; bias = false, add_self_loops = true), "sgconv_gpu", "Differentiation Time per batch GPU", graph_loader)

run_tests_cuda(GNN.GraphConv(
    1 => 64,), 
    "graphconv_gpu", "Differentiation Time per batch GraphConv GPU", graph_loader)

####################################################################################################
# with inference mode only 
let
    if !CUDA.functional()
        throw(SystemError("CUDA is not functional. Please check your CUDA installation."))
    end

    CUDA.device_reset!()
    CUDA.reclaim()

    times = Float64[]

    for bg in graph_loader
        bg_gpu = bg |> Flux.gpu # move to gpu

        thing = GNN.SGConv(1 => 64, 1; bias = false, add_self_loops = true) |> Flux.gpu

        t = CUDA.@elapsed sum(thing(bg_gpu, bg_gpu.ndata.x))

        push!(times, t)
    end

    fig = CairoMakie.Figure(size = (600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title = "Inference time only GPU",
        xlabel = "Batch Index", ylabel = "Time (s)")
    CairoMakie.scatterlines!(ax, times, label = "Training Time", markersize = 6)
    CairoMakie.ylims!(ax, minimum(times)*0.9, 1.1*Statistics.quantile(times, 0.9))
    @show fig
    CairoMakie.save("sgconv_inference.png", fig)
end


#############################################################################################
# run with Flux Dense layer 
let
    plotname = "flux_dense_gpu"
    plottitle = "Flux Dense Layer Differentiation Time per batch GPU"
    CUDA.device_reset!() # reset the GPU device
    CUDA.reclaim() # reclaim GPU memory
    times = Float64[]
    cuda_layer = Flux.Dense(6000 => 64) |> Flux.gpu # move to gpu
    for bg in graph_loader
        x = bg.ndata.x |> vec |> Flux.gpu # move to gpu

        t = CUDA.@elapsed Flux.withgradient(cuda_layer) do model
            A = model(x)
            return sum(A)
        end

        push!(times, t)
    end

    fig = CairoMakie.Figure(size = (600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title = plottitle,
        xlabel = "Batch Index", ylabel = "Time (s)")
    CairoMakie.scatterlines!(ax, times, label = "Training Time", markersize = 6)
    CairoMakie.ylims!(ax, minimum(times)*0.9, 1.1*Statistics.quantile(times, 0.99))
    @show fig
    CairoMakie.save("$(plotname).png", fig)
end
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

`SGConv` layer exhibits progressively increasing runtime in training loop #600

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

SGConv layer exhibits progressively increasing runtime in training loop #600

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions

`SGConv` layer exhibits progressively increasing runtime in training loop #600