Skip to content

SGConv layer exhibits progressively increasing runtime in training loop #600

@MaHaWo

Description

@MaHaWo

As the title says, the SGConv layer seems to exhibit progressively increasing runtimes when differentiated with Zygote in a training loop. This happens on both CPU and GPU, but it happens neither for other layers nor when the layer is used purely for inference without Zygote. The problem also does not show up for Flux built-in layers. The issue occurs in the same way on multiple machines, but I haven't been able to check it with Enzyme or ForwardDiff.

I created a number of visualizations to show more clearly what's going on:

Image
Image
Image
Image
Image

Code that produced the plots:

begin
    import Flux
    import GraphNeuralNetworks as GNN
    import CUDA
    import MLUtils
    import CairoMakie
    import Statistics
    CUDA.allowscalar(false)

"""
Test script for the SGConv layer issue in the GraphNeuralNetworks.jl package.
"""

"""
    generate_dummy_graphs(n_samples = 100; n_nodes = 120)

Generate `n_samples` random community-style `GNNGraph`s for testing.

Each graph has `n_nodes` nodes, a random *symmetric* (undirected) adjacency
matrix with roughly 50% edge density, and constant dummy node features of
shape `(1, n_nodes)` stored under `ndata.x`.

Returns a `Vector{GNN.GNNGraph}`.
"""
function generate_dummy_graphs(n_samples = 100; n_nodes = 120)
    all_graphs = GNN.GNNGraph[]

    for _ in 1:n_samples
        # Random adjacency, symmetrized so the graph is genuinely undirected.
        # (The original code claimed symmetry in a comment but never enforced it.)
        raw = rand(Float32, n_nodes, n_nodes) .> 0.5f0
        A = Float32.(raw .| raw')  # make it symmetric

        # Simple constant dummy node features: one feature per node.
        features = ones(Float32, 1, n_nodes)

        g = GNN.GNNGraph(A, ndata = (x = features,))
        push!(all_graphs, g)
    end
    return all_graphs
end

################################################################################################################
# build data loader
# Build the data loader: 50000 dummy graphs served in collated batches of 50.
graph_loader = let
    graphs = generate_dummy_graphs(50000)
    Flux.DataLoader(graphs, batchsize = 50, shuffle = true, collate = true)
end

################################################################################################################
# test functions
"""
    run_tests(layer, plotname, plottitle, data_loader)

Time `Flux.withgradient` (forward + backward pass) of `layer` on every batch
from `data_loader` on the CPU, plot the per-batch times, save the plot to
`"\$(plotname).png"`, and return the vector of per-batch times in seconds.

Note: the first iteration's time includes compilation.
"""
function run_tests(layer, plotname, plottitle, data_loader)
    times = Float64[]

    for bg in data_loader
        t = @elapsed Flux.withgradient(layer) do model
            A = model(bg, bg.ndata.x)
            return sum(A)
        end
        push!(times, t)
    end

    fig = CairoMakie.Figure(size = (600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title = plottitle,
        xlabel = "Batch Index", ylabel = "Time (s)")
    CairoMakie.scatterlines!(ax, times, label = "Training Time", markersize = 6)
    # Clip the y-axis at the 99th percentile so the compilation outlier
    # doesn't dominate the plot.
    CairoMakie.ylims!(ax, minimum(times) * 0.9, 1.1 * Statistics.quantile(times, 0.99))
    # `@show fig` only prints the figure's textual representation;
    # `display` actually renders it.
    display(fig)
    CairoMakie.save("$(plotname).png", fig)
    return times
end

"""
    run_tests_cuda(layer, plotname, plottitle, data_loader)

GPU counterpart of `run_tests`: move `layer` to the GPU, time
`Flux.withgradient` (forward + backward) on each batch with `CUDA.@elapsed`,
plot the per-batch times, save the plot to `"\$(plotname).png"`, and return
the vector of per-batch times in seconds.

Note: the first iteration's time includes compilation.
"""
function run_tests_cuda(layer, plotname, plottitle, data_loader)
    CUDA.device_reset!() # reset the GPU device
    CUDA.reclaim()       # reclaim GPU memory
    times = Float64[]
    cuda_layer = layer |> Flux.gpu # move to gpu (before the reset-sensitive loop)
    for bg in data_loader
        bg_gpu = bg |> Flux.gpu # move each batch to gpu

        # CUDA.@elapsed uses CUDA events, so it synchronizes around the call.
        t = CUDA.@elapsed Flux.withgradient(cuda_layer) do model
            A = model(bg_gpu, bg_gpu.ndata.x)
            return sum(A)
        end

        push!(times, t)
    end

    fig = CairoMakie.Figure(size = (600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title = plottitle,
        xlabel = "Batch Index", ylabel = "Time (s)")
    CairoMakie.scatterlines!(ax, times, label = "Training Time", markersize = 6)
    # Clip the y-axis at the 99th percentile to keep the compilation
    # outlier from dominating the plot.
    CairoMakie.ylims!(ax, minimum(times) * 0.9, 1.1 * Statistics.quantile(times, 0.99))
    # `@show fig` only prints the figure's textual representation;
    # `display` actually renders it.
    display(fig)
    CairoMakie.save("$(plotname).png", fig)
    return times
end

# SGConv on CPU: this is where the progressively increasing runtime shows up.
run_tests(GNN.SGConv(1 => 64, 1; bias = false, add_self_loops = true), "sgconv_cpu", "Differentiation Time per batch CPU", graph_loader)

# SGConv on GPU: same progressive slowdown as on CPU.
run_tests_cuda(GNN.SGConv(1 => 64, 1; bias = false, add_self_loops = true), "sgconv_gpu", "Differentiation Time per batch GPU", graph_loader)

# Control: GraphConv on GPU does NOT show the slowdown.
run_tests_cuda(GNN.GraphConv(
    1 => 64,), 
    "graphconv_gpu", "Differentiation Time per batch GraphConv GPU", graph_loader)

####################################################################################################
# with inference mode only 
# Control experiment: SGConv forward pass only (no Zygote) on the GPU.
# If the slowdown were in the forward pass itself, it would show up here too.
let
    if !CUDA.functional()
        # `error` (ErrorException) is appropriate here; SystemError is
        # reserved for failed system calls.
        error("CUDA is not functional. Please check your CUDA installation.")
    end

    CUDA.device_reset!()
    CUDA.reclaim()

    # Build the layer ONCE, outside the loop. Constructing and transferring a
    # fresh layer per batch would (a) add unrelated per-batch overhead and
    # (b) reset any layer state each iteration, making this useless as a
    # control for the training-loop slowdown.
    layer = GNN.SGConv(1 => 64, 1; bias = false, add_self_loops = true) |> Flux.gpu

    times = Float64[]

    for bg in graph_loader
        bg_gpu = bg |> Flux.gpu # move the batch to gpu

        # Forward pass + reduction only; CUDA.@elapsed synchronizes.
        t = CUDA.@elapsed sum(layer(bg_gpu, bg_gpu.ndata.x))

        push!(times, t)
    end

    fig = CairoMakie.Figure(size = (600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title = "Inference time only GPU",
        xlabel = "Batch Index", ylabel = "Time (s)")
    # Label corrected: this plot measures inference, not training.
    CairoMakie.scatterlines!(ax, times, label = "Inference Time", markersize = 6)
    CairoMakie.ylims!(ax, minimum(times) * 0.9, 1.1 * Statistics.quantile(times, 0.9))
    # `display` renders the figure; `@show` would only print its repr.
    display(fig)
    CairoMakie.save("sgconv_inference.png", fig)
end


#############################################################################################
# run with Flux Dense layer 
# Control experiment: a plain Flux Dense layer differentiated per batch on the
# GPU. Flux built-in layers do NOT show the progressive slowdown.
let
    plotname = "flux_dense_gpu"
    plottitle = "Flux Dense Layer Differentiation Time per batch GPU"
    CUDA.device_reset!() # reset the GPU device
    CUDA.reclaim()       # reclaim GPU memory
    times = Float64[]
    # 6000 = batchsize (50 graphs) × 120 nodes: the flattened feature vector
    # of one collated batch. Built once, outside the loop.
    cuda_layer = Flux.Dense(6000 => 64) |> Flux.gpu # move to gpu
    for bg in graph_loader
        # Flatten the (1, 6000) batched node features into a length-6000 vector.
        x = bg.ndata.x |> vec |> Flux.gpu # move to gpu

        t = CUDA.@elapsed Flux.withgradient(cuda_layer) do model
            A = model(x)
            return sum(A)
        end

        push!(times, t)
    end

    fig = CairoMakie.Figure(size = (600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title = plottitle,
        xlabel = "Batch Index", ylabel = "Time (s)")
    CairoMakie.scatterlines!(ax, times, label = "Training Time", markersize = 6)
    CairoMakie.ylims!(ax, minimum(times) * 0.9, 1.1 * Statistics.quantile(times, 0.99))
    # `display` renders the figure; `@show` would only print its repr.
    display(fig)
    CairoMakie.save("$(plotname).png", fig)
end

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions