As the title says, the SGConv layer seems to exhibit progressively increasing runtimes when differentiated with Zygote in a training loop. This happens on both CPU and GPU, but not for other GNN layers (e.g. GraphConv), and not when the layer is used purely for inference without Zygote. The problem also does not show up for Flux builtin layers (e.g. Dense). The issue occurs in the same way on multiple machines, but I haven't been able to check it with Enzyme or ForwardDiff.
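The core pattern that degrades is just taking a Zygote gradient of the layer repeatedly in a loop. A minimal sketch of that pattern (rand_graph is a stand-in here; the full script below builds the graphs from dense random adjacency matrices):

import Flux
import GraphNeuralNetworks as GNN

g = GNN.rand_graph(120, 600, ndata = (x = ones(Float32, 1, 120),))
layer = GNN.SGConv(1 => 64, 1; bias = false, add_self_loops = true)
for i in 1:1000
    # each call becomes slower than the previous ones
    Flux.withgradient(m -> sum(m(g, g.ndata.x)), layer)
end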
I created a number of visualizations (attached) to show more clearly what's going on. Code that produced the plots:
import Flux
import GraphNeuralNetworks as GNN
import CUDA
import MLUtils
import CairoMakie
import Statistics

CUDA.allowscalar(false)

# Test script for the SGConv layer issue in the GraphNeuralNetworks.jl package.

"""
Generate a set of random graphs with dummy node features for testing purposes.
"""
function generate_dummy_graphs(n_samples = 100)
    all_graphs = GNN.GNNGraph[]
    for i in 1:n_samples
        n_nodes = 120
        # random dense adjacency matrix (not symmetrized, so the graph is directed)
        A = Float32.(rand(Float32, n_nodes, n_nodes) .> 0.5)
        # simple dummy node features
        features = ones(Float32, 1, n_nodes)
        g = GNN.GNNGraph(A, ndata = (x = features,))
        push!(all_graphs, g)
    end
    return all_graphs
end
################################################################################################################
# build the data loader
graph_loader = let
    data = generate_dummy_graphs(50000)
    Flux.DataLoader(data, batchsize = 50, shuffle = true, collate = true)
end
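# Sanity check: with collate = true the loader yields one batched GNNGraph per
# iteration, i.e. 50 graphs of 120 nodes collated into a single graph with
# 50 * 120 = 6000 nodes.
@assert first(graph_loader).num_nodes == 50 * 120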
################################################################################################################
# test functions
function run_tests(layer, plotname, plottitle, data_loader)
    times = Float64[]
    for bg in data_loader
        t = @elapsed Flux.withgradient(layer) do model
            A = model(bg, bg.ndata.x)
            return sum(A)
        end
        push!(times, t)
    end
    fig = CairoMakie.Figure(size = (600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title = plottitle,
        xlabel = "Batch Index", ylabel = "Time (s)")
    CairoMakie.scatterlines!(ax, times, label = "Training Time", markersize = 6)
    CairoMakie.ylims!(ax, minimum(times) * 0.9, 1.1 * Statistics.quantile(times, 0.99))
    display(fig)
    CairoMakie.save("$(plotname).png", fig)
end
function run_tests_cuda(layer, plotname, plottitle, data_loader)
    CUDA.device_reset!() # reset the GPU device
    CUDA.reclaim()       # reclaim GPU memory
    times = Float64[]
    cuda_layer = layer |> Flux.gpu # move the layer to the GPU
    for bg in data_loader
        bg_gpu = bg |> Flux.gpu # move the batch to the GPU
        t = CUDA.@elapsed Flux.withgradient(cuda_layer) do model
            A = model(bg_gpu, bg_gpu.ndata.x)
            return sum(A)
        end
        push!(times, t)
    end
    fig = CairoMakie.Figure(size = (600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title = plottitle,
        xlabel = "Batch Index", ylabel = "Time (s)")
    CairoMakie.scatterlines!(ax, times, label = "Training Time", markersize = 6)
    CairoMakie.ylims!(ax, minimum(times) * 0.9, 1.1 * Statistics.quantile(times, 0.99))
    display(fig)
    CairoMakie.save("$(plotname).png", fig)
end
run_tests(GNN.SGConv(1 => 64, 1; bias = false, add_self_loops = true),
    "sgconv_cpu", "Differentiation Time per batch CPU", graph_loader)
run_tests_cuda(GNN.SGConv(1 => 64, 1; bias = false, add_self_loops = true),
    "sgconv_gpu", "Differentiation Time per batch GPU", graph_loader)
run_tests_cuda(GNN.GraphConv(1 => 64),
    "graphconv_gpu", "Differentiation Time per batch GraphConv GPU", graph_loader)
####################################################################################################
# inference mode only (no Zygote)
let
    if !CUDA.functional()
        error("CUDA is not functional. Please check your CUDA installation.")
    end
    CUDA.device_reset!()
    CUDA.reclaim()
    times = Float64[]
    for bg in graph_loader
        bg_gpu = bg |> Flux.gpu # move the batch to the GPU
        # a fresh layer is constructed for every batch
        layer = GNN.SGConv(1 => 64, 1; bias = false, add_self_loops = true) |> Flux.gpu
        t = CUDA.@elapsed sum(layer(bg_gpu, bg_gpu.ndata.x))
        push!(times, t)
    end
    fig = CairoMakie.Figure(size = (600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title = "Inference time only GPU",
        xlabel = "Batch Index", ylabel = "Time (s)")
    CairoMakie.scatterlines!(ax, times, label = "Inference Time", markersize = 6)
    CairoMakie.ylims!(ax, minimum(times) * 0.9, 1.1 * Statistics.quantile(times, 0.9))
    display(fig)
    CairoMakie.save("sgconv_inference.png", fig)
end
#############################################################################################
# run with a Flux builtin Dense layer for comparison
let
    plotname = "flux_dense_gpu"
    plottitle = "Flux Dense Layer Differentiation Time per batch GPU"
    CUDA.device_reset!() # reset the GPU device
    CUDA.reclaim()       # reclaim GPU memory
    times = Float64[]
    # 6000 inputs = 50 graphs per batch * 120 nodes, features flattened to a vector
    cuda_layer = Flux.Dense(6000 => 64) |> Flux.gpu
    for bg in graph_loader
        x = bg.ndata.x |> vec |> Flux.gpu # move the features to the GPU
        t = CUDA.@elapsed Flux.withgradient(cuda_layer) do model
            A = model(x)
            return sum(A)
        end
        push!(times, t)
    end
    fig = CairoMakie.Figure(size = (600, 400))
    ax = CairoMakie.Axis(fig[1, 1], title = plottitle,
        xlabel = "Batch Index", ylabel = "Time (s)")
    CairoMakie.scatterlines!(ax, times, label = "Training Time", markersize = 6)
    CairoMakie.ylims!(ax, minimum(times) * 0.9, 1.1 * Statistics.quantile(times, 0.99))
    display(fig)
    CairoMakie.save("$(plotname).png", fig)
end
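One variant that might help narrow this down (a sketch along the lines of run_tests above; I don't have results for it yet): forcing a full garbage collection between batches, to see whether the slowdown tracks accumulating allocations rather than the gradient call itself.

function run_tests_gc(layer, data_loader)
    times = Float64[]
    for bg in data_loader
        t = @elapsed Flux.withgradient(model -> sum(model(bg, bg.ndata.x)), layer)
        push!(times, t)
        GC.gc(true) # force a full collection between batches
    end
    return times
end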