From 2c496bc0622dad58a5ff43e66f0782d4b58ec6bd Mon Sep 17 00:00:00 2001 From: Sacheendra Talluri Date: Fri, 4 Apr 2025 11:39:49 +0200 Subject: [PATCH 1/9] [WIP] Initial GPU support --- .../opendc/compute/simulator/host/SimHost.kt | 8 +- .../provisioner/HostsProvisioningStep.kt | 1 + .../compute/topology/TopologyFactories.kt | 9 + .../opendc/compute/topology/specs/HostSpec.kt | 1 + .../compute/topology/specs/TopologySpecs.kt | 9 + .../compute/workload/ComputeWorkloadLoader.kt | 12 +- .../base/runner/ScenarioReplayer.kt | 3 +- .../opendc/experiments/base/BatteryTest.kt | 16 +- .../org/opendc/experiments/base/CarbonTest.kt | 20 +- .../opendc/experiments/base/ExperimentTest.kt | 14 +- .../base/FailuresAndCheckpointingTest.kt | 24 +- .../experiments/base/FlowDistributorTest.kt | 62 ++-- .../experiments/base/FragmentScalingTest.kt | 40 +-- .../opendc/experiments/base/SchedulerTest.kt | 4 +- .../opendc/simulator/compute/gpu/SimGpu.java | 269 ++++++++++++++++++ .../simulator/compute/machine/SimMachine.java | 19 +- .../compute/models/MachineModel.java | 17 +- .../compute/workload/ChainWorkload.java | 9 +- .../compute/workload/VirtualMachine.java | 80 ++++-- .../simulator/compute/workload/Workload.java | 5 +- .../workload/trace/SimTraceWorkload.java | 15 +- .../compute/workload/trace/TraceFragment.java | 6 +- .../compute/workload/trace/TraceWorkload.java | 13 +- .../simulator/engine/graph/FlowEdge.java | 16 ++ .../opendc/trace/conv/ResourceStateColumns.kt | 12 + .../opendc/OdcVmResourceStateTableReader.kt | 13 +- .../opendc/OdcVmResourceStateTableWriter.kt | 19 +- .../trace/formats/opendc/OdcVmTraceFormat.kt | 4 + .../formats/opendc/parquet/ResourceState.kt | 2 + .../parquet/ResourceStateReadSupport.kt | 8 + .../ResourceStateRecordMaterializer.kt | 27 +- .../parquet/ResourceStateWriteSupport.kt | 14 + .../org/opendc/web/runner/OpenDCRunner.kt | 2 + 33 files changed, 633 insertions(+), 140 deletions(-) create mode 100644 opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/gpu/SimGpu.java diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt index d23794ab9..37beaeff0 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt @@ -58,6 +58,7 @@ public class SimHost( private val engine: FlowEngine, private val machineModel: MachineModel, private val cpuPowerModel: CpuPowerModel, + private val accelPowerModel: CpuPowerModel, private val embodiedCarbon: Double, private val expectedLifetime: Double, private val powerDistributor: FlowDistributor, @@ -136,6 +137,7 @@ public class SimHost( this.machineModel, this.powerDistributor, this.cpuPowerModel, + this.accelPowerModel, ) { cause -> hostState = if (cause != null) HostState.ERROR else HostState.DOWN } @@ -352,7 +354,11 @@ public class SimHost( * Convert flavor to machine model. */ private fun Flavor.toMachineModel(): MachineModel { - return MachineModel(simMachine!!.machineModel.cpuModel, MemoryUnit("Generic", "Generic", 3200.0, memorySize)) + return MachineModel( + simMachine!!.machineModel.cpuModel, + simMachine!!.machineModel.accelModel, + MemoryUnit("Generic", "Generic", 3200.0, memorySize), + ) } /** diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/provisioner/HostsProvisioningStep.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/provisioner/HostsProvisioningStep.kt index 675ce3a92..8a64a5f3a 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/provisioner/HostsProvisioningStep.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/provisioner/HostsProvisioningStep.kt @@ -127,6 +127,7 @@ public class HostsProvisioningStep internal constructor( engine, hostSpec.model, hostSpec.cpuPowerModel, + hostSpec.accelPowerModel, hostSpec.embodiedCarbon, hostSpec.expectedLifetime, hostDistributor, diff --git a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt index cc2c4b4e1..6521d09a3 100644 --- a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt +++ b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt @@ -175,12 +175,21 @@ private fun HostJSONSpec.toHostSpec(clusterName: String): HostSpec { val powerModel = getPowerModel(powerModel.modelType, powerModel.power.toWatts(), powerModel.maxPower.toWatts(), powerModel.idlePower.toWatts()) + val accelPowerModel = + getPowerModel( + accelPowerModel.modelType, + accelPowerModel.power.toWatts(), + accelPowerModel.maxPower.toWatts(), + accelPowerModel.idlePower.toWatts(), + ) + val hostSpec = HostSpec( createUniqueName(this.name, hostNames), clusterName, machineModel, powerModel, + accelPowerModel, ) return hostSpec } diff --git a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/HostSpec.kt b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/HostSpec.kt index e4ec89e1d..8fdbbe5b8 100644 --- a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/HostSpec.kt +++ b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/HostSpec.kt @@ -37,6 +37,7 @@ public data class HostSpec( val clusterName: String, val model: MachineModel, val cpuPowerModel: CpuPowerModel, + val accelPowerModel: CpuPowerModel, val embodiedCarbon: Double = 1000.0, val expectedLifetime: Double = 5.0, ) diff --git a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt index 920d83731..7aff1eb48 100644 --- a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt +++ b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt @@ -79,6 +79,7 @@ public data class HostJSONSpec( val cpu: CPUJSONSpec, val memory: MemoryJSONSpec, val powerModel: PowerModelSpec = PowerModelSpec.DFLT, + val accelPowerModel: PowerModelSpec = PowerModelSpec.NONE, val count: Int = 1, ) @@ -139,6 +140,14 @@ public data class PowerModelSpec( maxPower = Power.ofWatts(400.0), idlePower = Power.ofWatts(200.0), ) + + public val NONE: PowerModelSpec = + PowerModelSpec( + modelType = "constant", + power = Power.ofWatts(0), + maxPower = Power.ofWatts(0), + idlePower = Power.ofWatts(0), + ) } } diff --git a/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt b/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt index 80996c0ef..640e3dc03 100644 --- a/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt +++ b/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt @@ -36,8 +36,10 @@ import org.opendc.trace.conv.resourceDuration import org.opendc.trace.conv.resourceID import org.opendc.trace.conv.resourceMemCapacity import org.opendc.trace.conv.resourceNature +import org.opendc.trace.conv.resourceStateAccelUsage import org.opendc.trace.conv.resourceStateCpuUsage import org.opendc.trace.conv.resourceStateDuration +import org.opendc.trace.conv.resourceStateIsGpu import org.opendc.trace.conv.resourceSubmissionTime import java.io.File import java.lang.ref.SoftReference @@ -79,6 +81,8 @@ public class ComputeWorkloadLoader( val durationCol = reader.resolve(resourceStateDuration) val coresCol = reader.resolve(resourceCpuCount) val usageCol = reader.resolve(resourceStateCpuUsage) + val accelUsageCol = reader.resolve(resourceStateAccelUsage) + val isGpuCol = reader.resolve(resourceStateIsGpu) val fragments = mutableMapOf() @@ -88,12 +92,14 @@ public class ComputeWorkloadLoader( val durationMs = reader.getDuration(durationCol)!! val cores = reader.getInt(coresCol) val cpuUsage = reader.getDouble(usageCol) + val accelUsage = reader.getDouble(accelUsageCol) + val isGpu = reader.getBoolean(isGpuCol) val builder = fragments.computeIfAbsent( id, ) { Builder(checkpointInterval, checkpointDuration, checkpointIntervalScaling, scalingPolicy, id) } - builder.add(durationMs, cpuUsage, cores) + builder.add(durationMs, cpuUsage, cores, accelUsage, isGpu) } fragments @@ -231,10 +237,12 @@ public class ComputeWorkloadLoader( duration: Duration, usage: Double, cores: Int, + accelUsage: Double, + isGpu: Boolean, ) { totalLoad += (usage * duration.toMillis()) / 1000 // avg MHz * duration = MFLOPs - builder.add(duration.toMillis(), usage, cores) + builder.add(duration.toMillis(), usage, cores, accelUsage, isGpu) } /** diff --git a/opendc-experiments/opendc-experiments-base/src/main/kotlin/org/opendc/experiments/base/runner/ScenarioReplayer.kt b/opendc-experiments/opendc-experiments-base/src/main/kotlin/org/opendc/experiments/base/runner/ScenarioReplayer.kt index d56e4e4bc..5de52eae8 100644 --- a/opendc-experiments/opendc-experiments-base/src/main/kotlin/org/opendc/experiments/base/runner/ScenarioReplayer.kt +++ b/opendc-experiments/opendc-experiments-base/src/main/kotlin/org/opendc/experiments/base/runner/ScenarioReplayer.kt @@ -120,7 +120,8 @@ public suspend fun ComputeService.replay( } val workload = entry.trace - val meta = mutableMapOf("workload" to workload) +// val meta = mutableMapOf("workload" to workload) + val meta = mutableMapOf() val nature = if (entry.nature == "deferrable") { diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/BatteryTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/BatteryTest.kt index a85c84f3e..17efd49e5 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/BatteryTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/BatteryTest.kt @@ -45,7 +45,7 @@ class BatteryTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), submissionTime = "2022-01-01T00:00", ), @@ -71,7 +71,7 @@ class BatteryTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), submissionTime = "2022-01-01T00:00", ), @@ -98,7 +98,7 @@ class BatteryTest { name = "0", fragments = arrayListOf( - TraceFragment(20 * 60 * 1000, 1000.0, 1), + TraceFragment(20 * 60 * 1000, 1000.0, 1, 0.0, false), ), submissionTime = "2022-01-01T00:00", ), @@ -126,7 +126,7 @@ class BatteryTest { name = "0", fragments = arrayListOf( - TraceFragment(30 * 60 * 1000, 1000.0, 1), + TraceFragment(30 * 60 * 1000, 1000.0, 1, 0.0, false), ), submissionTime = "2022-01-01T00:00", ), @@ -153,7 +153,7 @@ class BatteryTest { name = "0", fragments = arrayListOf( - TraceFragment(30 * 60 * 1000, 1000.0, 1), + TraceFragment(30 * 60 * 1000, 1000.0, 1, 0.0, false), ), submissionTime = "2022-01-01T00:00", ), @@ -195,7 +195,7 @@ class BatteryTest { createTestTask( name = "0", fragments = - arrayListOf(TraceFragment(10 * 60 * 1000, 1000.0, 1)), + arrayListOf(TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false)), submissionTime = "2022-01-01T00:00", ), ) @@ -221,7 +221,7 @@ class BatteryTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), submissionTime = "2022-01-01T00:00", ), @@ -254,7 +254,7 @@ class BatteryTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), checkpointInterval = 60 * 1000L, checkpointDuration = 1000L, diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/CarbonTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/CarbonTest.kt index a0f5978f9..21efc7d08 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/CarbonTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/CarbonTest.kt @@ -49,7 +49,7 @@ class CarbonTest { name = "0", fragments = arrayListOf( - TraceFragment(120 * 60 * 1000, 1000.0, 1), + TraceFragment(120 * 60 * 1000, 1000.0, 1, 0.0, false), ), submissionTime = "2022-01-01T00:00", ), @@ -96,14 +96,14 @@ class CarbonTest { name = "0", fragments = arrayListOf( - TraceFragment(40 * 60 * 1000, 1000.0, 1), - TraceFragment(40 * 60 * 1000, 2000.0, 1), - TraceFragment(40 * 60 * 1000, 1000.0, 1), - TraceFragment(40 * 60 * 1000, 2000.0, 1), - TraceFragment(40 * 60 * 1000, 1000.0, 1), - TraceFragment(40 * 60 * 1000, 2000.0, 1), - TraceFragment(40 * 60 * 1000, 1000.0, 1), - TraceFragment(40 * 60 * 1000, 2000.0, 1), + TraceFragment(40 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(40 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(40 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(40 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(40 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(40 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(40 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(40 * 60 * 1000, 2000.0, 1, 0.0, false), ), submissionTime = "2022-01-01T00:00", ), @@ -165,7 +165,7 @@ class CarbonTest { name = "0", fragments = arrayListOf( - TraceFragment(60 * 60 * 1000, 1000.0, 1), + TraceFragment(60 * 60 * 1000, 1000.0, 1, 0.0, false), ), submissionTime = "2022-01-01T00:00", ), diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/ExperimentTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/ExperimentTest.kt index 2fb5ece82..fe33f101d 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/ExperimentTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/ExperimentTest.kt @@ -51,7 +51,7 @@ class ExperimentTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), ), ) @@ -87,14 +87,14 @@ class ExperimentTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), ), createTestTask( name = "1", fragments = arrayListOf( - TraceFragment(5 * 60 * 1000, 2000.0, 1), + TraceFragment(5 * 60 * 1000, 2000.0, 1, 0.0, false), ), ), ) @@ -130,14 +130,14 @@ class ExperimentTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), ), createTestTask( name = "1", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), ), ) @@ -173,14 +173,14 @@ class ExperimentTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), ), createTestTask( name = "1", fragments = arrayListOf( - TraceFragment(5 * 60 * 1000, 2000.0, 1), + TraceFragment(5 * 60 * 1000, 2000.0, 1, 0.0, false), ), submissionTime = "1970-01-01T00:20", ), diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FailuresAndCheckpointingTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FailuresAndCheckpointingTest.kt index 3231f533f..b2776e709 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FailuresAndCheckpointingTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FailuresAndCheckpointingTest.kt @@ -53,7 +53,7 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), ), ) @@ -93,7 +93,7 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), ), ) @@ -136,7 +136,7 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), ), ) @@ -181,7 +181,7 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), ), ) @@ -240,7 +240,7 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), checkpointInterval = 60 * 1000L, checkpointDuration = 1000L, @@ -276,8 +276,8 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 2000.0, 1), - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), checkpointInterval = 60 * 1000L, checkpointDuration = 1000L, @@ -318,8 +318,8 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), - TraceFragment(10 * 60 * 1000, 2000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), ), checkpointInterval = 60 * 1000L, checkpointDuration = 1000L, @@ -356,7 +356,7 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), checkpointInterval = 60 * 1000L, checkpointDuration = 1000L, @@ -388,7 +388,7 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), checkpointInterval = 60 * 1000L, checkpointDuration = 1000L, @@ -430,7 +430,7 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), checkpointInterval = 60 * 1000L, checkpointDuration = 1000L, diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FlowDistributorTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FlowDistributorTest.kt index 3d7333607..3e9f54e78 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FlowDistributorTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FlowDistributorTest.kt @@ -46,8 +46,8 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), - TraceFragment(10 * 60 * 1000, 2000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), ), ), ) @@ -81,8 +81,8 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 3000.0, 1), - TraceFragment(10 * 60 * 1000, 4000.0, 1), + TraceFragment(10 * 60 * 1000, 3000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), ), ), ) @@ -116,8 +116,8 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), - TraceFragment(10 * 60 * 1000, 4000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), ), ), ) @@ -151,8 +151,8 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 4000.0, 1), - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), ), ) @@ -186,8 +186,8 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 4000.0, 1), - TraceFragment(10 * 60 * 1000, 2000.0, 1), + TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), ), ), ) @@ -221,16 +221,16 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), - TraceFragment(10 * 60 * 1000, 3000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 3000.0, 1, 0.0, false), ), ), createTestTask( name = "1", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 3000.0, 1), - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 3000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), ), ) @@ -268,16 +268,16 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 6000.0, 1), - TraceFragment(10 * 60 * 1000, 5000.0, 1), + TraceFragment(10 * 60 * 1000, 6000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 5000.0, 1, 0.0, false), ), ), createTestTask( name = "1", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 5000.0, 1), - TraceFragment(10 * 60 * 1000, 6000.0, 1), + TraceFragment(10 * 60 * 1000, 5000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 6000.0, 1, 0.0, false), ), ), ) @@ -315,8 +315,8 @@ class FlowDistributorTest { submissionTime = "2024-02-01T10:00", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), - TraceFragment(10 * 60 * 1000, 2000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), ), ), createTestTask( @@ -324,7 +324,7 @@ class FlowDistributorTest { submissionTime = "2024-02-01T10:05", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 2000.0, 1), + TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), ), ), ) @@ -373,7 +373,7 @@ class FlowDistributorTest { submissionTime = "2024-02-01T10:00", fragments = arrayListOf( - TraceFragment(20 * 60 * 1000, 3000.0, 1), + TraceFragment(20 * 60 * 1000, 3000.0, 1, 0.0, false), ), ), createTestTask( @@ -381,7 +381,7 @@ class FlowDistributorTest { submissionTime = "2024-02-01T10:05", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1500.0, 1), + TraceFragment(10 * 60 * 1000, 1500.0, 1, 0.0, false), ), ), ) @@ -426,17 +426,17 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf( - TraceFragment(5 * 60 * 1000, 1000.0, 1), - TraceFragment(5 * 60 * 1000, 1500.0, 1), - TraceFragment(5 * 60 * 1000, 2500.0, 1), - TraceFragment(5 * 60 * 1000, 1000.0, 1), + TraceFragment(5 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(5 * 60 * 1000, 1500.0, 1, 0.0, false), + TraceFragment(5 * 60 * 1000, 2500.0, 1, 0.0, false), + TraceFragment(5 * 60 * 1000, 1000.0, 1, 0.0, false), ), ), createTestTask( name = "1", fragments = arrayListOf( - TraceFragment(20 * 60 * 1000, 3000.0, 1), + TraceFragment(20 * 60 * 1000, 3000.0, 1, 0.0, false), ), ), ) @@ -487,7 +487,7 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf().apply { - repeat(1) { this.add(TraceFragment(10 * 60 * 1000, 3000.0, 1)) } + repeat(1) { this.add(TraceFragment(10 * 60 * 1000, 3000.0, 1, 0.0, false)) } }, ), ) @@ -515,7 +515,7 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf().apply { - repeat(1000) { this.add(TraceFragment(10 * 60 * 1000, 2000.0, 1)) } + repeat(1000) { this.add(TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false)) } }, ), ) @@ -544,7 +544,7 @@ class FlowDistributorTest { createTestTask( name = "0", fragments = - arrayListOf(TraceFragment(10 * 60 * 1000, 2000.0, 1)), + arrayListOf(TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false)), ), ) } diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FragmentScalingTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FragmentScalingTest.kt index b0aa3555d..c58ef3662 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FragmentScalingTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FragmentScalingTest.kt @@ -48,8 +48,8 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 2000.0, 1), - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), scalingPolicy = NoDelayScaling(), ), @@ -61,8 +61,8 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 2000.0, 1), - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), scalingPolicy = PerfectScaling(), ), @@ -102,7 +102,7 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 4000.0, 1), + TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), ), scalingPolicy = NoDelayScaling(), ), @@ -114,7 +114,7 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 4000.0, 1), + TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), ), scalingPolicy = PerfectScaling(), ), @@ -151,9 +151,9 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), - TraceFragment(10 * 60 * 1000, 4000.0, 1), - TraceFragment(10 * 60 * 1000, 1500.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1500.0, 1, 0.0, false), ), scalingPolicy = NoDelayScaling(), ), @@ -165,9 +165,9 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), - TraceFragment(10 * 60 * 1000, 4000.0, 1), - TraceFragment(10 * 60 * 1000, 1500.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1500.0, 1, 0.0, false), ), scalingPolicy = PerfectScaling(), ), @@ -211,7 +211,7 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), scalingPolicy = NoDelayScaling(), ), @@ -219,7 +219,7 @@ class FragmentScalingTest { name = "1", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 3000.0, 1), + TraceFragment(10 * 60 * 1000, 3000.0, 1, 0.0, false), ), scalingPolicy = NoDelayScaling(), ), @@ -231,7 +231,7 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), scalingPolicy = PerfectScaling(), ), @@ -239,7 +239,7 @@ class FragmentScalingTest { name = "1", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 3000.0, 1), + TraceFragment(10 * 60 * 1000, 3000.0, 1, 0.0, false), ), scalingPolicy = PerfectScaling(), ), @@ -281,7 +281,7 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 2000.0, 1), + TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), ), scalingPolicy = NoDelayScaling(), ), @@ -289,7 +289,7 @@ class FragmentScalingTest { name = "1", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 4000.0, 1), + TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), ), scalingPolicy = NoDelayScaling(), ), @@ -301,7 +301,7 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 2000.0, 1), + TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), ), scalingPolicy = PerfectScaling(), ), @@ -309,7 +309,7 @@ class FragmentScalingTest { name = "1", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 4000.0, 1), + TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), ), scalingPolicy = PerfectScaling(), ), diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/SchedulerTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/SchedulerTest.kt index f9a20c68b..14b7a5d05 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/SchedulerTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/SchedulerTest.kt @@ -42,14 +42,14 @@ class SchedulerTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), ), ), createTestTask( name = "1", fragments = arrayListOf( - TraceFragment(5 * 60 * 1000, 2000.0, 1), + TraceFragment(5 * 60 * 1000, 2000.0, 1, 0.0, false), ), submissionTime = "1970-01-01T00:20", ), diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/gpu/SimGpu.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/gpu/SimGpu.java new file mode 100644 index 000000000..f9e4dca4c --- /dev/null +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/gpu/SimGpu.java @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2024 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.simulator.compute.gpu; + +import java.util.List; +import java.util.Map; +import org.opendc.simulator.compute.cpu.CpuPowerModel; +import org.opendc.simulator.compute.machine.PerformanceCounters; +import org.opendc.simulator.compute.models.CpuModel; +import org.opendc.simulator.engine.engine.FlowEngine; +import org.opendc.simulator.engine.graph.FlowConsumer; +import org.opendc.simulator.engine.graph.FlowEdge; +import org.opendc.simulator.engine.graph.FlowNode; +import org.opendc.simulator.engine.graph.FlowSupplier; + +/** + * A {@link SimGpu} of a machine. + */ +public final class SimGpu extends FlowNode implements FlowSupplier, FlowConsumer { + private final CpuModel gpuModel; + + private final CpuPowerModel gpuPowerModel; + + private double currentCpuDemand = 0.0f; // cpu capacity demanded by the mux + private double currentCpuUtilization = 0.0f; + private double currentCpuSupplied = 0.0f; // cpu capacity supplied to the mux + + private double currentPowerDemand; // power demanded of the psu + private double currentPowerSupplied = 0.0f; // cpu capacity supplied by the psu + + private double maxCapacity; + + private final PerformanceCounters performanceCounters = new PerformanceCounters(); + private long lastCounterUpdate; + private final double gpuFrequencyInv; + + private FlowEdge distributorEdge; + private FlowEdge psuEdge; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Basic Getters and Setters + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + public double getFrequency() { + return gpuModel.getTotalCapacity(); + } + + public void setFrequency(double frequency) { + // Clamp the capacity of the CPU between [0.0, maxFreq] + frequency = Math.max(0, Math.min(this.maxCapacity, frequency)); + // psu.setCpuFrequency(muxInPort, frequency); + } + + @Override + public double getCapacity() { + return maxCapacity; + } + + public PerformanceCounters getPerformanceCounters() { + return performanceCounters; + } + + public double getPowerDraw() { + return this.currentPowerSupplied; + } + + public double getDemand() { + return this.currentCpuDemand; + } + + public double getSpeed() { + return this.currentCpuSupplied; + } + + public CpuModel getCpuModel() { + return gpuModel; + } + + @Override + public String toString() { + return "SimBareMetalMachine.Cpu[model=" + gpuModel + "]"; + } + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Constructors + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + public SimGpu(FlowEngine engine, CpuModel gpuModel, CpuPowerModel powerModel, int id) { + super(engine); + this.gpuModel = gpuModel; + this.maxCapacity = this.gpuModel.getTotalCapacity(); + + this.gpuPowerModel = powerModel; + + this.lastCounterUpdate = clock.millis(); + + this.gpuFrequencyInv = 1 / this.maxCapacity; + + this.currentPowerDemand = this.gpuPowerModel.computePower(this.currentCpuUtilization); + } + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // FlowNode related functionality + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + @Override + public long onUpdate(long now) { + updateCounters(now); + + // Check if supply == demand + if (this.currentPowerDemand != this.currentPowerSupplied) { + this.pushOutgoingDemand(this.psuEdge, this.currentPowerDemand); + + return Long.MAX_VALUE; + } + + this.currentCpuSupplied = Math.min(this.currentCpuDemand, this.maxCapacity); + + this.pushOutgoingSupply(this.distributorEdge, this.currentCpuSupplied); + + return Long.MAX_VALUE; + } + + public void updateCounters() { + this.updateCounters(this.clock.millis()); + } + + /** + * Update the performance counters of the CPU. + * + * @param now The timestamp at which to update the counter. + */ + public void updateCounters(long now) { + long lastUpdate = this.lastCounterUpdate; + this.lastCounterUpdate = now; + long delta = now - lastUpdate; + + if (delta > 0) { + double demand = this.currentCpuDemand; + double rate = this.currentCpuSupplied; + double capacity = this.maxCapacity; + + final double factor = this.gpuFrequencyInv * delta; + + this.performanceCounters.addCpuActiveTime(Math.round(rate * factor)); + this.performanceCounters.addCpuIdleTime(Math.round((capacity - rate) * factor)); + this.performanceCounters.addCpuStealTime(Math.round((demand - rate) * factor)); + } + + this.performanceCounters.setCpuDemand(this.currentCpuDemand); + this.performanceCounters.setCpuSupply(this.currentCpuSupplied); + this.performanceCounters.setCpuCapacity(this.maxCapacity); + } + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // FlowGraph Related functionality + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Push new demand to the psu + */ + @Override + public void pushOutgoingDemand(FlowEdge supplierEdge, double newPowerDemand) { + updateCounters(); + this.currentPowerDemand = newPowerDemand; + this.psuEdge.pushDemand(newPowerDemand); + } + + /** + * Push updated supply to the mux + */ + @Override + public void pushOutgoingSupply(FlowEdge consumerEdge, double newCpuSupply) { + updateCounters(); + this.currentCpuSupplied = newCpuSupply; + + this.distributorEdge.pushSupply(newCpuSupply, true); + } + + /** + * Handle new demand coming in from the mux + */ + @Override + public void handleIncomingDemand(FlowEdge consumerEdge, double newCpuDemand) { + updateCounters(); + this.currentCpuDemand = newCpuDemand; + + this.currentCpuUtilization = Math.min(this.currentCpuDemand / this.maxCapacity, 1.0); + + // Calculate Power Demand and send to PSU + this.currentPowerDemand = this.gpuPowerModel.computePower(this.currentCpuUtilization); + + this.invalidate(); + } + + /** + * Handle updated supply from the psu + */ + @Override + public void handleIncomingSupply(FlowEdge supplierEdge, double newPowerSupply) { + updateCounters(); + this.currentPowerSupplied = newPowerSupply; + + this.invalidate(); + } + + /** + * Add a connection to the mux + */ + @Override + public void addConsumerEdge(FlowEdge consumerEdge) { + this.distributorEdge = consumerEdge; + } + + /** + * Add a connection to the psu + */ + @Override + public void addSupplierEdge(FlowEdge supplierEdge) { + this.psuEdge = supplierEdge; + + this.invalidate(); + } + + /** + * Remove the connection to the mux + */ + @Override + public void removeConsumerEdge(FlowEdge consumerEdge) { + this.distributorEdge = null; + this.invalidate(); + } + + /** + * Remove the connection to the psu + */ + @Override + public void removeSupplierEdge(FlowEdge supplierEdge) { + this.psuEdge = null; + this.invalidate(); + } + + @Override + public Map> getConnectedEdges() { + return Map.of( + FlowEdge.NodeType.CONSUMING, List.of(this.psuEdge), + FlowEdge.NodeType.SUPPLYING, List.of(this.distributorEdge)); + } +} diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/machine/SimMachine.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/machine/SimMachine.java index 8baa7f344..4d10f3a4e 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/machine/SimMachine.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/machine/SimMachine.java @@ -26,6 +26,7 @@ import java.util.function.Consumer; import org.opendc.simulator.compute.cpu.CpuPowerModel; import org.opendc.simulator.compute.cpu.SimCpu; +import org.opendc.simulator.compute.gpu.SimGpu; import org.opendc.simulator.compute.memory.Memory; import org.opendc.simulator.compute.models.MachineModel; import org.opendc.simulator.compute.power.SimPsu; @@ -46,7 +47,9 @@ public class SimMachine { private final InstantSource clock; private SimCpu cpu; + private SimGpu accel; private FlowDistributor cpuDistributor; + private FlowDistributor accelDistributor; private SimPsu psu; private Memory memory; @@ -76,6 +79,10 @@ public SimCpu getCpu() { return cpu; } + public SimGpu getAccel() { + return accel; + } + public Memory getMemory() { return memory; } @@ -114,6 +121,7 @@ public SimMachine( MachineModel machineModel, FlowDistributor powerDistributor, CpuPowerModel cpuPowerModel, + CpuPowerModel accelPowerModel, Consumer completion) { this.engine = engine; this.machineModel = machineModel; @@ -125,6 +133,7 @@ public SimMachine( new FlowEdge(this.psu, powerDistributor); this.cpu = new SimCpu(engine, this.machineModel.getCpuModel(), cpuPowerModel, 0); + this.accel = new SimGpu(engine, this.machineModel.getAccelModel(), accelPowerModel, 0); new FlowEdge(this.cpu, this.psu); @@ -132,8 +141,10 @@ public SimMachine( // Create a FlowDistributor and add the cpu as supplier this.cpuDistributor = new FlowDistributor(engine); + this.accelDistributor = new FlowDistributor(engine); new FlowEdge(this.cpuDistributor, this.cpu); + new FlowEdge(this.accelDistributor, this.accel); this.completion = completion; } @@ -155,6 +166,12 @@ public void shutdown(Exception cause) { this.cpuDistributor.closeNode(); this.cpuDistributor = null; + this.accel.closeNode(); + this.accel = null; + + this.accelDistributor.closeNode(); + this.accelDistributor = null; + this.memory = null; this.completion.accept(cause); @@ -180,6 +197,6 @@ public boolean canFit(MachineModel model) { * @param completion The completion callback that needs to be called when the workload is done */ public VirtualMachine startWorkload(ChainWorkload workload, Consumer completion) { - return (VirtualMachine) workload.startWorkload(this.cpuDistributor, this, completion); + return (VirtualMachine) workload.startWorkload(this.cpuDistributor, this.accelDistributor, this, completion); } } diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/models/MachineModel.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/models/MachineModel.java index 6c47fbe63..6441ef6d3 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/models/MachineModel.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/models/MachineModel.java @@ -30,6 +30,7 @@ */ public final class MachineModel { private final CpuModel cpuModel; + private final CpuModel accelModel; private final MemoryUnit memory; /** @@ -38,8 +39,9 @@ public final class MachineModel { * @param cpuModel The cpu available to the image. * @param memory The list of memory units available to the image. */ - public MachineModel(CpuModel cpuModel, MemoryUnit memory) { + public MachineModel(CpuModel cpuModel, CpuModel accelModel, MemoryUnit memory) { this.cpuModel = cpuModel; + this.accelModel = accelModel; this.memory = memory; } @@ -61,6 +63,13 @@ public MachineModel(List cpus, MemoryUnit memory) { cpus.get(0).getVendor(), cpus.get(0).getModelName(), cpus.get(0).getArchitecture()), + new CpuModel( + cpus.get(1).getId(), + cpus.get(1).getCoreCount() * cpus.size(), + cpus.get(1).getCoreSpeed(), + cpus.get(1).getVendor(), + cpus.get(1).getModelName(), + cpus.get(1).getArchitecture()), memory); } @@ -71,6 +80,10 @@ public CpuModel getCpuModel() { return this.cpuModel; } + public CpuModel getAccelModel() { + return this.accelModel; + } + /** * Return the memory units of this machine. */ @@ -83,7 +96,7 @@ public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; MachineModel that = (MachineModel) o; - return cpuModel.equals(that.cpuModel) && memory.equals(that.memory); + return cpuModel.equals(that.cpuModel) && accelModel.equals(that.accelModel) && memory.equals(that.memory); } @Override diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/ChainWorkload.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/ChainWorkload.java index 3cdde40a5..0edffa0c2 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/ChainWorkload.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/ChainWorkload.java @@ -42,12 +42,13 @@ public void removeWorkloads(int numberOfWorkloads) { } @Override - public SimWorkload startWorkload(FlowSupplier supplier) { - return new VirtualMachine(supplier, this); + public SimWorkload startWorkload(FlowSupplier supplier, FlowSupplier accelSupplier) { + return new VirtualMachine(supplier, accelSupplier, this); } @Override - public SimWorkload startWorkload(FlowSupplier supplier, SimMachine machine, Consumer completion) { - return new VirtualMachine(supplier, this, machine, completion); + public SimWorkload startWorkload( + FlowSupplier supplier, FlowSupplier accelSupplier, SimMachine machine, Consumer completion) { + return new VirtualMachine(supplier, accelSupplier, this, machine, completion); } } diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/VirtualMachine.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/VirtualMachine.java index 7632b5032..680204547 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/VirtualMachine.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/VirtualMachine.java @@ -44,10 +44,18 @@ public final class VirtualMachine extends SimWorkload implements FlowSupplier { private double cpuSupply = 0.0f; private double d = 0.0f; + private double accelDemand = 0.0f; + private double accelSupply = 0.0f; + private double accelD = 0.0f; + private FlowEdge workloadEdge; private FlowEdge machineEdge; + private FlowEdge accelWorkloadEdge; + private FlowEdge accelMachineEdge; + private double capacity = 0; + private double accelCapacity = 0; private final long checkpointInterval; private final long checkpointDuration; @@ -97,12 +105,13 @@ public PerformanceCounters getPerformanceCounters() { // Constructors //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - VirtualMachine(FlowSupplier supplier, ChainWorkload workload) { + VirtualMachine(FlowSupplier supplier, FlowSupplier accelSupplier, ChainWorkload workload) { super(((FlowNode) supplier).getEngine()); this.snapshot = workload; - new FlowEdge(this, supplier); + new FlowEdge(this, supplier, FlowEdge.ResourceType.CPU); + new FlowEdge(this, accelSupplier, FlowEdge.ResourceType.ACCEL); this.workloads = new LinkedList<>(workload.workloads()); this.checkpointInterval = workload.checkpointInterval(); @@ -120,11 +129,18 @@ public PerformanceCounters getPerformanceCounters() { this.onStart(); } - VirtualMachine(FlowSupplier supplier, ChainWorkload workload, SimMachine machine, Consumer completion) { - this(supplier, workload); + VirtualMachine( + FlowSupplier supplier, + FlowSupplier accelSupplier, + ChainWorkload workload, + SimMachine machine, + Consumer completion) { + this(supplier, accelSupplier, workload); this.capacity = machine.getCpu().getFrequency(); - this.d = 1 / machine.getCpu().getFrequency(); + this.accelCapacity = machine.getAccel().getFrequency(); + this.d = 1 / capacity; + this.accelD = 1 / accelCapacity; this.completion = completion; } @@ -144,7 +160,7 @@ public void onStart() { this.checkpointModel.start(); } - this.activeWorkload = this.getNextWorkload().startWorkload(this); + this.activeWorkload = this.getNextWorkload().startWorkload(this, this); } public void updateCounters(long now) { @@ -229,7 +245,11 @@ public void makeSnapshot(long now) { */ @Override public void addConsumerEdge(FlowEdge consumerEdge) { - this.workloadEdge = consumerEdge; + if (consumerEdge.getResourceType() == FlowEdge.ResourceType.CPU) { + this.workloadEdge = consumerEdge; + } else if (consumerEdge.getResourceType() == FlowEdge.ResourceType.ACCEL) { + this.accelWorkloadEdge = consumerEdge; + } } /** @@ -239,8 +259,13 @@ public void addConsumerEdge(FlowEdge consumerEdge) { */ @Override public void addSupplierEdge(FlowEdge supplierEdge) { - this.machineEdge = supplierEdge; - this.capacity = supplierEdge.getCapacity(); + if (supplierEdge.getResourceType() == FlowEdge.ResourceType.CPU) { + this.machineEdge = supplierEdge; + this.capacity = supplierEdge.getCapacity(); + } else if (supplierEdge.getResourceType() == FlowEdge.ResourceType.ACCEL) { + this.accelMachineEdge = supplierEdge; + this.accelCapacity = supplierEdge.getCapacity(); + } } /** @@ -251,9 +276,13 @@ public void addSupplierEdge(FlowEdge supplierEdge) { */ @Override public void pushOutgoingDemand(FlowEdge supplierEdge, double newDemand) { - - this.cpuDemand = newDemand; - this.machineEdge.pushDemand(newDemand); + if (supplierEdge.getResourceType() == FlowEdge.ResourceType.CPU) { + this.cpuDemand = newDemand; + this.machineEdge.pushDemand(newDemand); + } else if (supplierEdge.getResourceType() == FlowEdge.ResourceType.ACCEL) { + this.accelDemand = newDemand; + this.accelMachineEdge.pushDemand(newDemand); + } } /** @@ -264,9 +293,13 @@ public void pushOutgoingDemand(FlowEdge supplierEdge, double newDemand) { */ @Override public void pushOutgoingSupply(FlowEdge consumerEdge, double newSupply) { - - this.cpuSupply = newSupply; - this.workloadEdge.pushSupply(newSupply); + if (consumerEdge.getResourceType() == FlowEdge.ResourceType.CPU) { + this.cpuSupply = newSupply; + this.workloadEdge.pushSupply(newSupply); + } else if (consumerEdge.getResourceType() == FlowEdge.ResourceType.ACCEL) { + this.accelSupply = newSupply; + this.accelWorkloadEdge.pushSupply(newSupply); + } } /** @@ -279,7 +312,11 @@ public void pushOutgoingSupply(FlowEdge consumerEdge, double newSupply) { public void handleIncomingDemand(FlowEdge consumerEdge, double newDemand) { updateCounters(this.clock.millis()); - this.pushOutgoingDemand(this.machineEdge, newDemand); + if (consumerEdge.getResourceType() == FlowEdge.ResourceType.CPU) { + this.pushOutgoingDemand(this.machineEdge, newDemand); + } else if (consumerEdge.getResourceType() == FlowEdge.ResourceType.ACCEL) { + this.pushOutgoingDemand(this.accelMachineEdge, newDemand); + } } /** @@ -292,7 +329,11 @@ public void handleIncomingDemand(FlowEdge consumerEdge, double newDemand) { public void handleIncomingSupply(FlowEdge supplierEdge, double newSupply) { updateCounters(this.clock.millis()); - this.pushOutgoingSupply(this.machineEdge, newSupply); + if (supplierEdge.getResourceType() == FlowEdge.ResourceType.CPU) { + this.pushOutgoingDemand(this.machineEdge, newSupply); + } else if (supplierEdge.getResourceType() == FlowEdge.ResourceType.ACCEL) { + this.pushOutgoingDemand(this.accelMachineEdge, newSupply); + } } /** @@ -304,17 +345,18 @@ public void handleIncomingSupply(FlowEdge supplierEdge, double newSupply) { */ @Override public void removeConsumerEdge(FlowEdge consumerEdge) { - if (this.workloadEdge == null) { + if (this.workloadEdge == null && this.accelWorkloadEdge == null) { return; } // Remove the connection to the active workload this.activeWorkload = null; this.workloadEdge = null; + this.accelWorkloadEdge = null; // Start next workload if (!this.workloads.isEmpty()) { - this.activeWorkload = getNextWorkload().startWorkload(this); + this.activeWorkload = getNextWorkload().startWorkload(this, this); return; } diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/Workload.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/Workload.java index 3ad7597df..534644686 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/Workload.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/Workload.java @@ -34,7 +34,8 @@ public interface Workload { double checkpointIntervalScaling(); - SimWorkload startWorkload(FlowSupplier supplier); + SimWorkload startWorkload(FlowSupplier supplier, FlowSupplier accelSupplier); - SimWorkload startWorkload(FlowSupplier supplier, SimMachine machine, Consumer completion); + SimWorkload startWorkload( + FlowSupplier supplier, FlowSupplier accelSupplier, SimMachine machine, Consumer completion); } diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java index 9e7f170c5..decd03583 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java @@ -85,7 +85,7 @@ public double getCheckpointIntervalScaling() { // Constructors //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - public SimTraceWorkload(FlowSupplier supplier, TraceWorkload workload) { + public SimTraceWorkload(FlowSupplier supplier, FlowSupplier accelSupplier, TraceWorkload workload) { super(((FlowNode) supplier).getEngine()); this.snapshot = workload; @@ -97,7 +97,8 @@ public SimTraceWorkload(FlowSupplier supplier, TraceWorkload workload) { this.startOfFragment = this.clock.millis(); - new FlowEdge(this, supplier); + new FlowEdge(this, supplier, FlowEdge.ResourceType.CPU); + new FlowEdge(this, accelSupplier, FlowEdge.ResourceType.ACCEL); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -209,8 +210,12 @@ public void makeSnapshot(long now) { } // Create a new fragment based on the current fragment and remaining duration - TraceFragment newFragment = - new TraceFragment(remainingTime, currentFragment.cpuUsage(), currentFragment.coreCount()); + TraceFragment newFragment = new TraceFragment( + remainingTime, + currentFragment.cpuUsage(), + currentFragment.coreCount(), + currentFragment.accelUsage(), + currentFragment.isGpu()); // Alter the snapshot by removing finished fragments this.snapshot.removeFragments(this.fragmentIndex); @@ -220,7 +225,7 @@ public void makeSnapshot(long now) { // Create and add a fragment for processing the snapshot process TraceFragment snapshotFragment = new TraceFragment( - this.checkpointDuration, this.snapshot.getMaxCpuDemand(), this.snapshot.getMaxCoreCount()); + this.checkpointDuration, this.snapshot.getMaxCpuDemand(), this.snapshot.getMaxCoreCount(), 0.0, false); this.remainingFragments.addFirst(snapshotFragment); this.fragmentIndex = -1; diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceFragment.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceFragment.java index a09206a15..216258c48 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceFragment.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceFragment.java @@ -22,9 +22,9 @@ package org.opendc.simulator.compute.workload.trace; -public record TraceFragment(long duration, double cpuUsage, int coreCount) { +public record TraceFragment(long duration, double cpuUsage, int coreCount, double accelUsage, boolean isGpu) { - public TraceFragment(long start, long duration, double cpuUsage, int coreCount) { - this(duration, cpuUsage, coreCount); + public TraceFragment(long start, long duration, double cpuUsage, int coreCount, double accelUsage, boolean isGpu) { + this(duration, cpuUsage, coreCount, accelUsage, isGpu); } } diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java index 9c31a8339..8e5c13330 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java @@ -115,13 +115,14 @@ public void addFirst(TraceFragment fragment) { } @Override - public SimWorkload startWorkload(FlowSupplier supplier) { - return new SimTraceWorkload(supplier, this); + public SimWorkload startWorkload(FlowSupplier supplier, FlowSupplier accelSupplier) { + return new SimTraceWorkload(supplier, accelSupplier, this); } @Override - public SimWorkload startWorkload(FlowSupplier supplier, SimMachine machine, Consumer completion) { - return this.startWorkload(supplier); + public SimWorkload startWorkload( + FlowSupplier supplier, FlowSupplier accelSupplier, SimMachine machine, Consumer completion) { + return this.startWorkload(supplier, accelSupplier); } public static Builder builder( @@ -165,8 +166,8 @@ private Builder( * @param usage The CPU usage at this fragment. * @param cores The number of cores used during this fragment. */ - public void add(long duration, double usage, int cores) { - fragments.add(fragments.size(), new TraceFragment(duration, usage, cores)); + public void add(long duration, double usage, int cores, double accelUsage, boolean isGpu) { + fragments.add(fragments.size(), new TraceFragment(duration, usage, cores, accelUsage, isGpu)); } /** diff --git a/opendc-simulator/opendc-simulator-flow/src/main/java/org/opendc/simulator/engine/graph/FlowEdge.java b/opendc-simulator/opendc-simulator-flow/src/main/java/org/opendc/simulator/engine/graph/FlowEdge.java index 95eac20ba..115afd9b4 100644 --- a/opendc-simulator/opendc-simulator-flow/src/main/java/org/opendc/simulator/engine/graph/FlowEdge.java +++ b/opendc-simulator/opendc-simulator-flow/src/main/java/org/opendc/simulator/engine/graph/FlowEdge.java @@ -39,13 +39,23 @@ public class FlowEdge { private double supply = 0.0; private double capacity; + private ResourceType resourceType; public enum NodeType { CONSUMING, SUPPLYING } + public enum ResourceType { + CPU, + ACCEL + } + public FlowEdge(FlowConsumer consumer, FlowSupplier supplier) { + this(consumer, supplier, null); + } + + public FlowEdge(FlowConsumer consumer, FlowSupplier supplier, ResourceType resourceType) { if (!(consumer instanceof FlowNode)) { throw new IllegalArgumentException("Flow consumer is not a FlowNode"); } @@ -53,6 +63,8 @@ public FlowEdge(FlowConsumer consumer, FlowSupplier supplier) { throw new IllegalArgumentException("Flow consumer is not a FlowNode"); } + this.resourceType = resourceType; + this.consumer = consumer; this.supplier = supplier; @@ -128,6 +140,10 @@ public void setSupplierIndex(int supplierIndex) { this.supplierIndex = supplierIndex; } + public ResourceType getResourceType() { + return resourceType; + } + /** * Push new demand from the Consumer to the Supplier */ diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceStateColumns.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceStateColumns.kt index eede6bd6c..c6a335022 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceStateColumns.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceStateColumns.kt @@ -48,6 +48,18 @@ public val resourceStatePoweredOn: String = "powered_on" @JvmField public val resourceStateCpuUsage: String = "cpu_usage" +/** + * Total GPU usage of the resource in MHz. + */ +@JvmField +public val resourceStateAccelUsage: String = "accel_usage" + +/** + * Is the fragment a GPU fragment + */ +@JvmField +public val resourceStateIsGpu: String = "is_gpu" + /** * Total CPU usage of the resource in percentage. */ diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableReader.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableReader.kt index 39475f9f6..57427665c 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableReader.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableReader.kt @@ -25,8 +25,10 @@ package org.opendc.trace.formats.opendc import org.opendc.trace.TableReader import org.opendc.trace.conv.resourceCpuCount import org.opendc.trace.conv.resourceID +import org.opendc.trace.conv.resourceStateAccelUsage import org.opendc.trace.conv.resourceStateCpuUsage import org.opendc.trace.conv.resourceStateDuration +import org.opendc.trace.conv.resourceStateIsGpu import org.opendc.trace.conv.resourceStateTimestamp import org.opendc.trace.formats.opendc.parquet.ResourceState import org.opendc.trace.util.parquet.LocalParquetReader @@ -60,6 +62,8 @@ internal class OdcVmResourceStateTableReader(private val reader: LocalParquetRea private val colDuration = 2 private val colCpuCount = 3 private val colCpuUsage = 4 + private val colAccelUsage = 5 + private val colIsGpu = 6 override fun resolve(name: String): Int { return when (name) { @@ -68,6 +72,8 @@ internal class OdcVmResourceStateTableReader(private val reader: LocalParquetRea resourceStateDuration -> colDuration resourceCpuCount -> colCpuCount resourceStateCpuUsage -> colCpuUsage + resourceStateAccelUsage -> colAccelUsage + resourceStateIsGpu -> colIsGpu else -> -1 } } @@ -78,7 +84,11 @@ internal class OdcVmResourceStateTableReader(private val reader: LocalParquetRea } override fun getBoolean(index: Int): Boolean { - throw IllegalArgumentException("Invalid column or type [index $index]") + val record = checkNotNull(record) { "Reader in invalid state" } + return when (index) { + colIsGpu -> record.isGpu + else -> throw IllegalArgumentException("Invalid column or type [index $index]") + } } override fun getInt(index: Int): Int { @@ -101,6 +111,7 @@ internal class OdcVmResourceStateTableReader(private val reader: LocalParquetRea val record = checkNotNull(record) { "Reader in invalid state" } return when (index) { colCpuUsage -> record.cpuUsage + colAccelUsage -> record.accelUsage else -> throw IllegalArgumentException("Invalid column or type [index $index]") } } diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableWriter.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableWriter.kt index 1421d77c8..dfdc49884 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableWriter.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableWriter.kt @@ -26,8 +26,10 @@ import org.apache.parquet.hadoop.ParquetWriter import org.opendc.trace.TableWriter import org.opendc.trace.conv.resourceCpuCount import org.opendc.trace.conv.resourceID +import org.opendc.trace.conv.resourceStateAccelUsage import org.opendc.trace.conv.resourceStateCpuUsage import org.opendc.trace.conv.resourceStateDuration +import org.opendc.trace.conv.resourceStateIsGpu import org.opendc.trace.conv.resourceStateTimestamp import org.opendc.trace.formats.opendc.parquet.ResourceState import java.time.Duration @@ -47,6 +49,8 @@ internal class OdcVmResourceStateTableWriter(private val writer: ParquetWriter= lastTimestamp) { "Records need to be ordered by (id, timestamp)" } - writer.write(ResourceState(localID, localTimestamp, localDuration, localCpuCount, localCpuUsage)) + writer.write(ResourceState(localID, localTimestamp, localDuration, localCpuCount, localCpuUsage, localAccelUsage, localIsGpu)) lastId = localID lastTimestamp = localTimestamp @@ -76,6 +82,8 @@ internal class OdcVmResourceStateTableWriter(private val writer: ParquetWriter colDuration resourceCpuCount -> colCpuCount resourceStateCpuUsage -> colCpuUsage + resourceStateAccelUsage -> colAccelUsage + resourceStateIsGpu -> colIsGpu else -> -1 } } @@ -84,7 +92,11 @@ internal class OdcVmResourceStateTableWriter(private val writer: ParquetWriter localIsGpu = value + else -> throw IllegalArgumentException("Invalid column or type [index $index]") + } } override fun setInt( @@ -119,6 +131,7 @@ internal class OdcVmResourceStateTableWriter(private val writer: ParquetWriter localCpuUsage = value + colAccelUsage -> localAccelUsage = value else -> throw IllegalArgumentException("Invalid column or type [index $index]") } } @@ -206,4 +219,6 @@ internal class OdcVmResourceStateTableWriter(private val writer: ParquetWriter diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceState.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceState.kt index 64ab9dcac..f1f42298e 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceState.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceState.kt @@ -31,4 +31,6 @@ internal class ResourceState( val duration: Duration, val cpuCount: Int, val cpuUsage: Double, + val accelUsage: Double, + val isGpu: Boolean, ) diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateReadSupport.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateReadSupport.kt index e7d356302..5b9798573 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateReadSupport.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateReadSupport.kt @@ -33,8 +33,10 @@ import org.apache.parquet.schema.Types import org.opendc.trace.TableColumn import org.opendc.trace.conv.resourceCpuCount import org.opendc.trace.conv.resourceID +import org.opendc.trace.conv.resourceStateAccelUsage import org.opendc.trace.conv.resourceStateCpuUsage import org.opendc.trace.conv.resourceStateDuration +import org.opendc.trace.conv.resourceStateIsGpu import org.opendc.trace.conv.resourceStateTimestamp /** @@ -54,6 +56,8 @@ internal class ResourceStateReadSupport(private val projection: List?) : "cpu_count" to resourceCpuCount, "cpuUsage" to resourceStateCpuUsage, "cpu_usage" to resourceStateCpuUsage, + "accel_usage" to resourceStateAccelUsage, + "is_gpu" to resourceStateIsGpu, ) override fun init(context: InitContext): ReadContext { @@ -137,6 +141,10 @@ internal class ResourceStateReadSupport(private val projection: List?) : Types .required(PrimitiveType.PrimitiveTypeName.DOUBLE) .named("cpu_usage"), + Types.optional(PrimitiveType.PrimitiveTypeName.DOUBLE) + .named("accel_usage"), + Types.optional(PrimitiveType.PrimitiveTypeName.BOOLEAN) + .named("is_gpu"), ) .named("resource_state") diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateRecordMaterializer.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateRecordMaterializer.kt index 8ff0e4762..8f50c4489 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateRecordMaterializer.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateRecordMaterializer.kt @@ -43,6 +43,8 @@ internal class ResourceStateRecordMaterializer(schema: MessageType) : RecordMate private var localDuration = Duration.ZERO private var localCpuCount = 0 private var localCpuUsage = 0.0 + private var localAccelUsage = 0.0 + private var localIsGpu = false /** * Root converter for the record. @@ -85,6 +87,18 @@ internal class ResourceStateRecordMaterializer(schema: MessageType) : RecordMate localCpuUsage = value } } + "accel_usage", "accelUsage" -> + object : PrimitiveConverter() { + override fun addDouble(value: Double) { + localAccelUsage = value + } + } + "is_gpu", "isGpu" -> + object : PrimitiveConverter() { + override fun addBoolean(value: Boolean) { + localIsGpu = value + } + } "flops" -> object : PrimitiveConverter() { override fun addLong(value: Long) { @@ -101,6 +115,8 @@ internal class ResourceStateRecordMaterializer(schema: MessageType) : RecordMate localDuration = Duration.ZERO localCpuCount = 0 localCpuUsage = 0.0 + localAccelUsage = 0.0 + localIsGpu = false } override fun end() {} @@ -108,7 +124,16 @@ internal class ResourceStateRecordMaterializer(schema: MessageType) : RecordMate override fun getConverter(fieldIndex: Int): Converter = converters[fieldIndex] } - override fun getCurrentRecord(): ResourceState = ResourceState(localId, localTimestamp, localDuration, localCpuCount, localCpuUsage) + override fun getCurrentRecord(): ResourceState = + ResourceState( + localId, + localTimestamp, + localDuration, + localCpuCount, + localCpuUsage, + localAccelUsage, + localIsGpu, + ) override fun getRootConverter(): GroupConverter = root } diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateWriteSupport.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateWriteSupport.kt index 58c43916a..cea771906 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateWriteSupport.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateWriteSupport.kt @@ -78,6 +78,14 @@ internal class ResourceStateWriteSupport : WriteSupport() { consumer.addDouble(record.cpuUsage) consumer.endField("cpu_usage", 4) + consumer.startField("accel_usage", 5) + consumer.addDouble(record.accelUsage) + consumer.endField("accel_usage", 5) + + consumer.startField("is_gpu", 6) + consumer.addBoolean(record.isGpu) + consumer.endField("is_gpu", 6) + consumer.endMessage() } @@ -106,6 +114,12 @@ internal class ResourceStateWriteSupport : WriteSupport() { Types .required(PrimitiveType.PrimitiveTypeName.DOUBLE) .named("cpu_usage"), + Types + .required(PrimitiveType.PrimitiveTypeName.DOUBLE) + .named("accel_usage"), + Types + .required(PrimitiveType.PrimitiveTypeName.BOOLEAN) + .named("is_gpu"), ) .named("resource_state") } diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt index 406c97722..ab5f3ceb1 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt @@ -354,6 +354,7 @@ public class OpenDCRunner( val energyConsumptionW = machine.cpus.sumOf { it.energyConsumptionW } val powerModel = CpuPowerModels.linear(2 * energyConsumptionW, energyConsumptionW * 0.5) + val accelPowerModel = CpuPowerModels.constant(0.0) val spec = HostSpec( @@ -361,6 +362,7 @@ public class OpenDCRunner( clusterId, MachineModel(processors, memoryUnits[0]), powerModel, + accelPowerModel, ) res += spec From 19adce70e1ef127f5a4c2873994f20f9d273c770 Mon Sep 17 00:00:00 2001 From: Sacheendra Talluri Date: Sat, 5 Apr 2025 00:38:52 +0200 Subject: [PATCH 2/9] [WIP] basic tests for gpu --- .../opendc/compute/simulator/host/SimHost.kt | 5 + .../compute/simulator/internal/Guest.kt | 19 +++ .../telemetry/table/task/TaskTableReader.kt | 10 ++ .../table/task/TaskTableReaderImpl.kt | 11 ++ .../compute/topology/TopologyFactories.kt | 19 +++ .../compute/topology/specs/TopologySpecs.kt | 1 + .../opendc/experiments/base/BatteryTest.kt | 16 +-- .../org/opendc/experiments/base/CarbonTest.kt | 20 +-- .../opendc/experiments/base/ExperimentTest.kt | 14 +- .../base/FailuresAndCheckpointingTest.kt | 24 ++-- .../experiments/base/FlowDistributorTest.kt | 62 ++++---- .../experiments/base/FragmentScalingTest.kt | 40 +++--- .../org/opendc/experiments/base/GpuTest.kt | 134 +++++++++++++++++ .../opendc/experiments/base/SchedulerTest.kt | 4 +- .../opendc/experiments/base/TestingUtils.kt | 19 ++- .../topologies/gpu/single_1_2000.json | 39 +++++ .../compute/machine/PerformanceCounters.java | 77 ++++++++++ .../simulator/compute/machine/SimMachine.java | 6 +- .../compute/models/MachineModel.java | 14 +- .../compute/workload/VirtualMachine.java | 22 ++- .../workload/trace/SimTraceWorkload.java | 136 ++++++++++++++---- .../compute/workload/trace/TraceFragment.java | 6 +- .../compute/workload/trace/TraceWorkload.java | 24 +++- .../simulator/compute/SimMachineTest.kt | 8 ++ .../opendc/OdcVmResourceStateTableReader.kt | 2 +- .../org/opendc/web/runner/OpenDCRunner.kt | 14 +- 26 files changed, 604 insertions(+), 142 deletions(-) create mode 100644 opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt create mode 100644 opendc-experiments/opendc-experiments-base/src/test/resources/topologies/gpu/single_1_2000.json diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt index 37beaeff0..81434e03b 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt @@ -342,6 +342,11 @@ public class SimHost( return guest.getCpuStats() } + public fun getAccelStats(task: ServiceTask): GuestCpuStats { + val guest = requireNotNull(taskToGuestMap[task]) { "Unknown task ${task.name} at host $name" } + return guest.getAccelStats() + } + override fun hashCode(): Int = name.hashCode() override fun equals(other: Any?): Boolean { diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt index fe8cbf2f4..935c8dd29 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt @@ -256,6 +256,25 @@ public class Guest( ) } + /** + * Obtain the GPU statistics of this guest. + */ + public fun getAccelStats(): GuestCpuStats { + virtualMachine!!.updateCounters(this.clock.millis()) + val counters = virtualMachine!!.performanceCounters + + return GuestCpuStats( + counters.cpuActiveTime / 1000L, + counters.cpuIdleTime / 1000L, + counters.cpuStealTime / 1000L, + counters.cpuLostTime / 1000L, + counters.cpuCapacity, + counters.cpuSupply, + counters.cpuDemand, + counters.cpuSupply / cpuLimit, + ) + } + /** * Helper function to track the uptime and downtime of the guest. */ diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/task/TaskTableReader.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/task/TaskTableReader.kt index 771ced376..aeea28cff 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/task/TaskTableReader.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/task/TaskTableReader.kt @@ -107,6 +107,16 @@ public interface TaskTableReader : Exportable { */ public val cpuDemand: Double + /** + * The GPU given to this task (in MHz). + */ + public val accelUsage: Double + + /** + * The GPU demanded by this task (in MHz). + */ + public val accelDemand: Double + /** * The duration (in seconds) that a CPU was active in the task. */ diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/task/TaskTableReaderImpl.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/task/TaskTableReaderImpl.kt index 881b9916d..814dcc725 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/task/TaskTableReaderImpl.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/task/TaskTableReaderImpl.kt @@ -143,6 +143,14 @@ public class TaskTableReaderImpl( get() = _cpuDemand private var _cpuDemand = 0.0 + override val accelUsage: Double + get() = _accelUsage + private var _accelUsage = 0.0 + + override val accelDemand: Double + get() = _accelDemand + private var _accelDemand = 0.0 + override val cpuActiveTime: Long get() = _cpuActiveTime - previousCpuActiveTime private var _cpuActiveTime = 0L @@ -186,6 +194,7 @@ public class TaskTableReaderImpl( } val cpuStats = _host?.getCpuStats(task) + val accelStats = _host?.getAccelStats(task) val sysStats = _host?.getSystemStats(task) _timestamp = now @@ -194,6 +203,8 @@ public class TaskTableReaderImpl( _cpuLimit = cpuStats?.capacity ?: 0.0 _cpuDemand = cpuStats?.demand ?: 0.0 _cpuUsage = cpuStats?.usage ?: 0.0 + _accelDemand = accelStats?.demand ?: 0.0 + _accelUsage = accelStats?.usage ?: 0.0 _cpuActiveTime = cpuStats?.activeTime ?: _cpuActiveTime _cpuIdleTime = cpuStats?.idleTime ?: _cpuIdleTime _cpuStealTime = cpuStats?.stealTime ?: _cpuStealTime diff --git a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt index 6521d09a3..fed5a0d7f 100644 --- a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt +++ b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt @@ -165,10 +165,29 @@ private fun HostJSONSpec.toHostSpec(clusterName: String): HostSpec { ) } + val accelUnits = if (accel == null) { + List(1) { + CpuModel( + globalCoreId++, + 0, + 0.0, + ) + } + } else { + List(accel.count) { + CpuModel( + globalCoreId++, + accel.coreCount, + accel.coreSpeed.toMHz(), + ) + } + } + val unknownMemoryUnit = MemoryUnit(memory.vendor, memory.modelName, memory.memorySpeed.toMHz(), memory.memorySize.toMiB().toLong()) val machineModel = MachineModel( units, + accelUnits, unknownMemoryUnit, ) diff --git a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt index 7aff1eb48..40a24d28e 100644 --- a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt +++ b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt @@ -77,6 +77,7 @@ public data class ClusterJSONSpec( public data class HostJSONSpec( val name: String = "Host", val cpu: CPUJSONSpec, + val accel: CPUJSONSpec? = null, val memory: MemoryJSONSpec, val powerModel: PowerModelSpec = PowerModelSpec.DFLT, val accelPowerModel: PowerModelSpec = PowerModelSpec.NONE, diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/BatteryTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/BatteryTest.kt index 17efd49e5..a85c84f3e 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/BatteryTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/BatteryTest.kt @@ -45,7 +45,7 @@ class BatteryTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), submissionTime = "2022-01-01T00:00", ), @@ -71,7 +71,7 @@ class BatteryTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), submissionTime = "2022-01-01T00:00", ), @@ -98,7 +98,7 @@ class BatteryTest { name = "0", fragments = arrayListOf( - TraceFragment(20 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(20 * 60 * 1000, 1000.0, 1), ), submissionTime = "2022-01-01T00:00", ), @@ -126,7 +126,7 @@ class BatteryTest { name = "0", fragments = arrayListOf( - TraceFragment(30 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(30 * 60 * 1000, 1000.0, 1), ), submissionTime = "2022-01-01T00:00", ), @@ -153,7 +153,7 @@ class BatteryTest { name = "0", fragments = arrayListOf( - TraceFragment(30 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(30 * 60 * 1000, 1000.0, 1), ), submissionTime = "2022-01-01T00:00", ), @@ -195,7 +195,7 @@ class BatteryTest { createTestTask( name = "0", fragments = - arrayListOf(TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false)), + arrayListOf(TraceFragment(10 * 60 * 1000, 1000.0, 1)), submissionTime = "2022-01-01T00:00", ), ) @@ -221,7 +221,7 @@ class BatteryTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), submissionTime = "2022-01-01T00:00", ), @@ -254,7 +254,7 @@ class BatteryTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), checkpointInterval = 60 * 1000L, checkpointDuration = 1000L, diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/CarbonTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/CarbonTest.kt index 21efc7d08..a0f5978f9 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/CarbonTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/CarbonTest.kt @@ -49,7 +49,7 @@ class CarbonTest { name = "0", fragments = arrayListOf( - TraceFragment(120 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(120 * 60 * 1000, 1000.0, 1), ), submissionTime = "2022-01-01T00:00", ), @@ -96,14 +96,14 @@ class CarbonTest { name = "0", fragments = arrayListOf( - TraceFragment(40 * 60 * 1000, 1000.0, 1, 0.0, false), - TraceFragment(40 * 60 * 1000, 2000.0, 1, 0.0, false), - TraceFragment(40 * 60 * 1000, 1000.0, 1, 0.0, false), - TraceFragment(40 * 60 * 1000, 2000.0, 1, 0.0, false), - TraceFragment(40 * 60 * 1000, 1000.0, 1, 0.0, false), - TraceFragment(40 * 60 * 1000, 2000.0, 1, 0.0, false), - TraceFragment(40 * 60 * 1000, 1000.0, 1, 0.0, false), - TraceFragment(40 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(40 * 60 * 1000, 1000.0, 1), + TraceFragment(40 * 60 * 1000, 2000.0, 1), + TraceFragment(40 * 60 * 1000, 1000.0, 1), + TraceFragment(40 * 60 * 1000, 2000.0, 1), + TraceFragment(40 * 60 * 1000, 1000.0, 1), + TraceFragment(40 * 60 * 1000, 2000.0, 1), + TraceFragment(40 * 60 * 1000, 1000.0, 1), + TraceFragment(40 * 60 * 1000, 2000.0, 1), ), submissionTime = "2022-01-01T00:00", ), @@ -165,7 +165,7 @@ class CarbonTest { name = "0", fragments = arrayListOf( - TraceFragment(60 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(60 * 60 * 1000, 1000.0, 1), ), submissionTime = "2022-01-01T00:00", ), diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/ExperimentTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/ExperimentTest.kt index fe33f101d..2fb5ece82 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/ExperimentTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/ExperimentTest.kt @@ -51,7 +51,7 @@ class ExperimentTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), ), ) @@ -87,14 +87,14 @@ class ExperimentTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), ), createTestTask( name = "1", fragments = arrayListOf( - TraceFragment(5 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(5 * 60 * 1000, 2000.0, 1), ), ), ) @@ -130,14 +130,14 @@ class ExperimentTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), ), createTestTask( name = "1", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), ), ) @@ -173,14 +173,14 @@ class ExperimentTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), ), createTestTask( name = "1", fragments = arrayListOf( - TraceFragment(5 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(5 * 60 * 1000, 2000.0, 1), ), submissionTime = "1970-01-01T00:20", ), diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FailuresAndCheckpointingTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FailuresAndCheckpointingTest.kt index b2776e709..3231f533f 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FailuresAndCheckpointingTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FailuresAndCheckpointingTest.kt @@ -53,7 +53,7 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), ), ) @@ -93,7 +93,7 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), ), ) @@ -136,7 +136,7 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), ), ) @@ -181,7 +181,7 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), ), ) @@ -240,7 +240,7 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), checkpointInterval = 60 * 1000L, checkpointDuration = 1000L, @@ -276,8 +276,8 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 2000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), checkpointInterval = 60 * 1000L, checkpointDuration = 1000L, @@ -318,8 +318,8 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 2000.0, 1), ), checkpointInterval = 60 * 1000L, checkpointDuration = 1000L, @@ -356,7 +356,7 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), checkpointInterval = 60 * 1000L, checkpointDuration = 1000L, @@ -388,7 +388,7 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), checkpointInterval = 60 * 1000L, checkpointDuration = 1000L, @@ -430,7 +430,7 @@ class FailuresAndCheckpointingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), checkpointInterval = 60 * 1000L, checkpointDuration = 1000L, diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FlowDistributorTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FlowDistributorTest.kt index 3e9f54e78..3d7333607 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FlowDistributorTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FlowDistributorTest.kt @@ -46,8 +46,8 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 2000.0, 1), ), ), ) @@ -81,8 +81,8 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 3000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 3000.0, 1), + TraceFragment(10 * 60 * 1000, 4000.0, 1), ), ), ) @@ -116,8 +116,8 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 4000.0, 1), ), ), ) @@ -151,8 +151,8 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 4000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), ), ) @@ -186,8 +186,8 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 4000.0, 1), + TraceFragment(10 * 60 * 1000, 2000.0, 1), ), ), ) @@ -221,16 +221,16 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 3000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 3000.0, 1), ), ), createTestTask( name = "1", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 3000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 3000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), ), ) @@ -268,16 +268,16 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 6000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 5000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 6000.0, 1), + TraceFragment(10 * 60 * 1000, 5000.0, 1), ), ), createTestTask( name = "1", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 5000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 6000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 5000.0, 1), + TraceFragment(10 * 60 * 1000, 6000.0, 1), ), ), ) @@ -315,8 +315,8 @@ class FlowDistributorTest { submissionTime = "2024-02-01T10:00", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 2000.0, 1), ), ), createTestTask( @@ -324,7 +324,7 @@ class FlowDistributorTest { submissionTime = "2024-02-01T10:05", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 2000.0, 1), ), ), ) @@ -373,7 +373,7 @@ class FlowDistributorTest { submissionTime = "2024-02-01T10:00", fragments = arrayListOf( - TraceFragment(20 * 60 * 1000, 3000.0, 1, 0.0, false), + TraceFragment(20 * 60 * 1000, 3000.0, 1), ), ), createTestTask( @@ -381,7 +381,7 @@ class FlowDistributorTest { submissionTime = "2024-02-01T10:05", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1500.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1500.0, 1), ), ), ) @@ -426,17 +426,17 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf( - TraceFragment(5 * 60 * 1000, 1000.0, 1, 0.0, false), - TraceFragment(5 * 60 * 1000, 1500.0, 1, 0.0, false), - TraceFragment(5 * 60 * 1000, 2500.0, 1, 0.0, false), - TraceFragment(5 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(5 * 60 * 1000, 1000.0, 1), + TraceFragment(5 * 60 * 1000, 1500.0, 1), + TraceFragment(5 * 60 * 1000, 2500.0, 1), + TraceFragment(5 * 60 * 1000, 1000.0, 1), ), ), createTestTask( name = "1", fragments = arrayListOf( - TraceFragment(20 * 60 * 1000, 3000.0, 1, 0.0, false), + TraceFragment(20 * 60 * 1000, 3000.0, 1), ), ), ) @@ -487,7 +487,7 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf().apply { - repeat(1) { this.add(TraceFragment(10 * 60 * 1000, 3000.0, 1, 0.0, false)) } + repeat(1) { this.add(TraceFragment(10 * 60 * 1000, 3000.0, 1)) } }, ), ) @@ -515,7 +515,7 @@ class FlowDistributorTest { name = "0", fragments = arrayListOf().apply { - repeat(1000) { this.add(TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false)) } + repeat(1000) { this.add(TraceFragment(10 * 60 * 1000, 2000.0, 1)) } }, ), ) @@ -544,7 +544,7 @@ class FlowDistributorTest { createTestTask( name = "0", fragments = - arrayListOf(TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false)), + arrayListOf(TraceFragment(10 * 60 * 1000, 2000.0, 1)), ), ) } diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FragmentScalingTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FragmentScalingTest.kt index c58ef3662..b0aa3555d 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FragmentScalingTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FragmentScalingTest.kt @@ -48,8 +48,8 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 2000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), scalingPolicy = NoDelayScaling(), ), @@ -61,8 +61,8 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 2000.0, 1), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), scalingPolicy = PerfectScaling(), ), @@ -102,7 +102,7 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 4000.0, 1), ), scalingPolicy = NoDelayScaling(), ), @@ -114,7 +114,7 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 4000.0, 1), ), scalingPolicy = PerfectScaling(), ), @@ -151,9 +151,9 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 1500.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 4000.0, 1), + TraceFragment(10 * 60 * 1000, 1500.0, 1), ), scalingPolicy = NoDelayScaling(), ), @@ -165,9 +165,9 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), - TraceFragment(10 * 60 * 1000, 1500.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), + TraceFragment(10 * 60 * 1000, 4000.0, 1), + TraceFragment(10 * 60 * 1000, 1500.0, 1), ), scalingPolicy = PerfectScaling(), ), @@ -211,7 +211,7 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), scalingPolicy = NoDelayScaling(), ), @@ -219,7 +219,7 @@ class FragmentScalingTest { name = "1", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 3000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 3000.0, 1), ), scalingPolicy = NoDelayScaling(), ), @@ -231,7 +231,7 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), scalingPolicy = PerfectScaling(), ), @@ -239,7 +239,7 @@ class FragmentScalingTest { name = "1", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 3000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 3000.0, 1), ), scalingPolicy = PerfectScaling(), ), @@ -281,7 +281,7 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 2000.0, 1), ), scalingPolicy = NoDelayScaling(), ), @@ -289,7 +289,7 @@ class FragmentScalingTest { name = "1", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 4000.0, 1), ), scalingPolicy = NoDelayScaling(), ), @@ -301,7 +301,7 @@ class FragmentScalingTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 2000.0, 1), ), scalingPolicy = PerfectScaling(), ), @@ -309,7 +309,7 @@ class FragmentScalingTest { name = "1", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 4000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 4000.0, 1), ), scalingPolicy = PerfectScaling(), ), diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt new file mode 100644 index 000000000..57539fc47 --- /dev/null +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt @@ -0,0 +1,134 @@ +package org.opendc.experiments.base + +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.assertAll +import org.opendc.compute.workload.Task +import org.opendc.experiments.base.experiment.specs.TraceBasedFailureModelSpec +import org.opendc.simulator.compute.workload.trace.TraceFragment +import java.util.ArrayList + +class GpuTest { + + @Test + fun testGpuOnly() { + val workload: ArrayList = + arrayListOf( + createTestTask( + name = "0", + accelFragments = + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), + ), + ) + + val topology = createTopology("gpu/single_1_2000.json") + + val monitor = runTest(topology, workload) + + assertAll( + { assertEquals(10 * 60 * 1000, monitor.maxTimestamp) { "Total runtime incorrect" } }, + { assertEquals(150.0, monitor.hostPowerDraws["H01"]?.get(0)) { "Incorrect energy usage" } }, + { assertEquals(150.0, monitor.hostPowerDraws["H01"]?.get(9)) { "Incorrect energy usage" } }, + ) + } + + @Test + fun testCpuAndGpu() { + val workload: ArrayList = + arrayListOf( + createTestTask( + name = "0", + fragments = + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), + accelFragments = + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), + ), + ) + + val topology = createTopology("gpu/single_1_2000.json") + + val monitor = runTest(topology, workload) + + assertAll( + { assertEquals(10 * 60 * 1000, monitor.maxTimestamp) { "Total runtime incorrect" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(0)) { "Incorrect energy usage" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(9)) { "Incorrect energy usage" } }, + ) + } + + @Test + fun testCpuAndGpuDiffDuration() { + val workload: ArrayList = + arrayListOf( + createTestTask( + name = "0", + fragments = + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), + accelFragments = + arrayListOf( + TraceFragment(20 * 60 * 1000, 1000.0, 1), + ), + ), + ) + + val topology = createTopology("gpu/single_1_2000.json") + + val monitor = runTest(topology, workload) + + assertAll( + { assertEquals(20 * 60 * 1000, monitor.maxTimestamp) { "Total runtime incorrect" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(0)) { "Incorrect energy usage" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(8)) { "Incorrect energy usage" } }, + { assertEquals(250.0, monitor.hostPowerDraws["H01"]?.get(13)) { "Incorrect energy usage" } }, + ) + } + + @Test + fun testCpuAndGpuMultipleTasks() { + val workload: ArrayList = + arrayListOf( + createTestTask( + name = "0", + fragments = + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), + accelFragments = + arrayListOf( + TraceFragment(20 * 60 * 1000, 1000.0, 1), + ), + ), + createTestTask( + name = "1", + fragments = + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), + accelFragments = + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), + ), + ) + + val topology = createTopology("gpu/single_1_2000.json") + + val monitor = runTest(topology, workload) + + assertAll( + { assertEquals(30 * 60 * 1000, monitor.maxTimestamp) { "Total runtime incorrect" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(0)) { "Incorrect energy usage" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(8)) { "Incorrect energy usage" } }, + { assertEquals(250.0, monitor.hostPowerDraws["H01"]?.get(13)) { "Incorrect energy usage" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(18)) { "Incorrect energy usage" } }, + ) + } +} diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/SchedulerTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/SchedulerTest.kt index 14b7a5d05..f9a20c68b 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/SchedulerTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/SchedulerTest.kt @@ -42,14 +42,14 @@ class SchedulerTest { name = "0", fragments = arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1, 0.0, false), + TraceFragment(10 * 60 * 1000, 1000.0, 1), ), ), createTestTask( name = "1", fragments = arrayListOf( - TraceFragment(5 * 60 * 1000, 2000.0, 1, 0.0, false), + TraceFragment(5 * 60 * 1000, 2000.0, 1), ), submissionTime = "1970-01-01T00:20", ), diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/TestingUtils.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/TestingUtils.kt index eadb74e77..c7a57f282 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/TestingUtils.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/TestingUtils.kt @@ -67,7 +67,8 @@ fun createTestTask( memCapacity: Long = 0L, submissionTime: String = "1970-01-01T00:00", duration: Long = 0L, - fragments: ArrayList, + fragments: ArrayList = arrayListOf(), + accelFragments: ArrayList = arrayListOf(), checkpointInterval: Long = 0L, checkpointDuration: Long = 0L, checkpointIntervalScaling: Double = 1.0, @@ -76,8 +77,8 @@ fun createTestTask( return Task( UUID.nameUUIDFromBytes(name.toByteArray()), name, - fragments.maxOf { it.coreCount }, - fragments.maxOf { it.cpuUsage }, + if (fragments.isNotEmpty()) fragments.maxOf { it.coreCount } else 0, + if (fragments.isNotEmpty()) fragments.maxOf { it.cpuUsage } else 0.0, memCapacity, 1800000.0, LocalDateTime.parse(submissionTime).toInstant(ZoneOffset.UTC).toEpochMilli(), @@ -86,6 +87,7 @@ fun createTestTask( -1, TraceWorkload( fragments, + accelFragments, checkpointInterval, checkpointDuration, checkpointIntervalScaling, @@ -134,6 +136,8 @@ fun runTest( class TestComputeMonitor : ComputeMonitor { var taskCpuDemands = mutableMapOf>() var taskCpuSupplied = mutableMapOf>() + var taskAccelDemands = mutableMapOf>() + var taskAccelSupplied = mutableMapOf>() override fun record(reader: TaskTableReader) { val taskName: String = reader.taskInfo.name @@ -145,6 +149,15 @@ class TestComputeMonitor : ComputeMonitor { taskCpuDemands[taskName] = arrayListOf(reader.cpuDemand) taskCpuSupplied[taskName] = arrayListOf(reader.cpuUsage) } + + if (taskName in taskAccelDemands) { + taskAccelDemands[taskName]?.add(reader.cpuDemand) + taskAccelSupplied[taskName]?.add(reader.cpuUsage) + } else { + taskAccelDemands[taskName] = arrayListOf(reader.cpuDemand) + taskAccelSupplied[taskName] = arrayListOf(reader.cpuUsage) + } + } var attemptsSuccess = 0 diff --git a/opendc-experiments/opendc-experiments-base/src/test/resources/topologies/gpu/single_1_2000.json b/opendc-experiments/opendc-experiments-base/src/test/resources/topologies/gpu/single_1_2000.json new file mode 100644 index 000000000..77d21196b --- /dev/null +++ b/opendc-experiments/opendc-experiments-base/src/test/resources/topologies/gpu/single_1_2000.json @@ -0,0 +1,39 @@ +{ + "clusters": + [ + { + "name": "C01", + "hosts" : + [ + { + "name": "H01", + "cpu": + { + "coreCount": 1, + "coreSpeed": 2000 + }, + "accel": + { + "coreCount": 1, + "coreSpeed": 2000 + }, + "memory": { + "memorySize": 140457600000 + }, + "powerModel": { + "modelType": "linear", + "power": 400.0, + "idlePower": 100.0, + "maxPower": 200.0 + }, + "accelPowerModel": { + "modelType": "linear", + "power": 400.0, + "idlePower": 100.0, + "maxPower": 200.0 + } + } + ] + } + ] +} diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/machine/PerformanceCounters.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/machine/PerformanceCounters.java index f5b8d27d9..2b9a8ad3d 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/machine/PerformanceCounters.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/machine/PerformanceCounters.java @@ -32,6 +32,15 @@ public class PerformanceCounters { private double cpuDemand = 0.0f; private double cpuSupply = 0.0f; + private long accelActiveTime = 0; + private long accelIdleTime = 0; + private long accelStealTime = 0; + private long accelLostTime = 0; + + private double accelCapacity = 0.0f; + private double accelDemand = 0.0f; + private double accelSupply = 0.0f; + public long getCpuActiveTime() { return cpuActiveTime; } @@ -99,4 +108,72 @@ public double getCpuSupply() { public void setCpuSupply(double cpuSupply) { this.cpuSupply = cpuSupply; } + + public long getAccelActiveTime() { + return accelActiveTime; + } + + public void setAccelActiveTime(long accelActiveTime) { + this.accelActiveTime = accelActiveTime; + } + + public void addAccelActiveTime(long accelActiveTime) { + this.accelActiveTime += accelActiveTime; + } + + public long getAccelIdleTime() { + return accelIdleTime; + } + + public void setAccelIdleTime(long accelIdleTime) { + this.accelIdleTime = accelIdleTime; + } + + public void addAccelIdleTime(long accelIdleTime) { + this.accelIdleTime += accelIdleTime; + } + + public long getAccelStealTime() { + return accelStealTime; + } + + public void setAccelStealTime(long accelStealTime) { + this.accelStealTime = accelStealTime; + } + + public void addAccelStealTime(long accelStealTime) { + this.accelStealTime += accelStealTime; + } + + public long getAccelLostTime() { + return accelLostTime; + } + + public void setAccelLostTime(long accelLostTime) { + this.accelLostTime = accelLostTime; + } + + public double getAccelCapacity() { + return accelCapacity; + } + + public void setAccelCapacity(double accelCapacity) { + this.accelCapacity = accelCapacity; + } + + public double getAccelDemand() { + return accelDemand; + } + + public void setAccelDemand(double accelDemand) { + this.accelDemand = accelDemand; + } + + public double getAccelSupply() { + return accelSupply; + } + + public void setAccelSupply(double accelSupply) { + this.accelSupply = accelSupply; + } } diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/machine/SimMachine.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/machine/SimMachine.java index 4d10f3a4e..5db77ae62 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/machine/SimMachine.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/machine/SimMachine.java @@ -51,6 +51,7 @@ public class SimMachine { private FlowDistributor cpuDistributor; private FlowDistributor accelDistributor; private SimPsu psu; + private FlowDistributor psuDistributor; private Memory memory; private final Consumer completion; @@ -129,13 +130,16 @@ public SimMachine( // Create the psu and cpu and connect them this.psu = new SimPsu(engine); + this.psuDistributor = new FlowDistributor(engine); + new FlowEdge(this.psuDistributor, this.psu); new FlowEdge(this.psu, powerDistributor); this.cpu = new SimCpu(engine, this.machineModel.getCpuModel(), cpuPowerModel, 0); this.accel = new SimGpu(engine, this.machineModel.getAccelModel(), accelPowerModel, 0); - new FlowEdge(this.cpu, this.psu); + new FlowEdge(this.cpu, this.psuDistributor); + new FlowEdge(this.accel, this.psuDistributor); this.memory = new Memory(engine, this.machineModel.getMemory()); diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/models/MachineModel.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/models/MachineModel.java index 6441ef6d3..515904822 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/models/MachineModel.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/models/MachineModel.java @@ -53,7 +53,7 @@ public MachineModel(CpuModel cpuModel, CpuModel accelModel, MemoryUnit memory) { * @param cpus The list of processing units available to the image. * @param memory The list of memory units available to the image. */ - public MachineModel(List cpus, MemoryUnit memory) { + public MachineModel(List cpus, List accelerators, MemoryUnit memory) { this( new CpuModel( @@ -64,12 +64,12 @@ public MachineModel(List cpus, MemoryUnit memory) { cpus.get(0).getModelName(), cpus.get(0).getArchitecture()), new CpuModel( - cpus.get(1).getId(), - cpus.get(1).getCoreCount() * cpus.size(), - cpus.get(1).getCoreSpeed(), - cpus.get(1).getVendor(), - cpus.get(1).getModelName(), - cpus.get(1).getArchitecture()), + accelerators.get(0).getId(), + accelerators.get(0).getCoreCount() * cpus.size(), + accelerators.get(0).getCoreSpeed(), + accelerators.get(0).getVendor(), + accelerators.get(0).getModelName(), + accelerators.get(0).getArchitecture()), memory); } diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/VirtualMachine.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/VirtualMachine.java index 680204547..1edfa7ce0 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/VirtualMachine.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/VirtualMachine.java @@ -170,16 +170,26 @@ public void updateCounters(long now) { double cpuCapacity = 0.0f; if (delta > 0) { - final double factor = this.d * delta; + final double cpuFactor = this.d * delta; - this.performanceCounters.addCpuActiveTime(Math.round(this.cpuSupply * factor)); - this.performanceCounters.setCpuIdleTime(Math.round((cpuCapacity - this.cpuSupply) * factor)); - this.performanceCounters.addCpuStealTime(Math.round((this.cpuDemand - this.cpuSupply) * factor)); + this.performanceCounters.addCpuActiveTime(Math.round(this.cpuSupply * cpuFactor)); + this.performanceCounters.setCpuIdleTime(Math.round((cpuCapacity - this.cpuSupply) * cpuFactor)); + this.performanceCounters.addCpuStealTime(Math.round((this.cpuDemand - this.cpuSupply) * cpuFactor)); + + final double accelFactor = this.accelD * delta; + + this.performanceCounters.addAccelActiveTime(Math.round(this.cpuSupply * accelFactor)); + this.performanceCounters.setAccelIdleTime(Math.round((cpuCapacity - this.cpuSupply) * accelFactor)); + this.performanceCounters.addAccelStealTime(Math.round((this.cpuDemand - this.cpuSupply) * accelFactor)); } this.performanceCounters.setCpuDemand(this.cpuDemand); this.performanceCounters.setCpuSupply(this.cpuSupply); this.performanceCounters.setCpuCapacity(cpuCapacity); + + this.performanceCounters.setAccelDemand(this.accelDemand); + this.performanceCounters.setAccelSupply(this.accelSupply); + this.performanceCounters.setAccelCapacity(accelCapacity); } @Override @@ -330,9 +340,9 @@ public void handleIncomingSupply(FlowEdge supplierEdge, double newSupply) { updateCounters(this.clock.millis()); if (supplierEdge.getResourceType() == FlowEdge.ResourceType.CPU) { - this.pushOutgoingDemand(this.machineEdge, newSupply); + this.pushOutgoingSupply(this.machineEdge, newSupply); } else if (supplierEdge.getResourceType() == FlowEdge.ResourceType.ACCEL) { - this.pushOutgoingDemand(this.accelMachineEdge, newSupply); + this.pushOutgoingSupply(this.accelMachineEdge, newSupply); } } diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java index decd03583..bf4daaf77 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java @@ -32,20 +32,35 @@ import org.opendc.simulator.engine.graph.FlowNode; import org.opendc.simulator.engine.graph.FlowSupplier; +import static java.lang.Math.max; +import static java.lang.Math.min; + public class SimTraceWorkload extends SimWorkload implements FlowConsumer { private LinkedList remainingFragments; + private LinkedList remainingAccelFragments; private int fragmentIndex; + private int accelFragmentIndex; + + private boolean cpuFragmentsComplete = false; + private boolean accelFragmentsComplete = false; private TraceFragment currentFragment; + private TraceFragment currentAccelFragment; private long startOfFragment; private FlowEdge machineEdge; + private FlowEdge accelMachineEdge; private double cpuFreqDemand = 0.0; // The Cpu demanded by fragment private double cpuFreqSupplied = 0.0; // The Cpu speed supplied private double newCpuFreqSupplied = 0.0; // The Cpu speed supplied private double remainingWork = 0.0; // The duration of the fragment at the demanded speed + private double accelFreqDemand = 0.0; // The Cpu demanded by fragment + private double accelFreqSupplied = 0.0; // The Cpu speed supplied + private double newAccelFreqSupplied = 0.0; // The Cpu speed supplied + private double remainingAccelWork = 0.0; + private final long checkpointDuration; private final TraceWorkload snapshot; @@ -92,7 +107,9 @@ public SimTraceWorkload(FlowSupplier supplier, FlowSupplier accelSupplier, Trace this.checkpointDuration = workload.checkpointDuration(); this.scalingPolicy = workload.getScalingPolicy(); this.remainingFragments = new LinkedList<>(workload.getFragments()); + this.remainingAccelFragments = new LinkedList<>(workload.getAccelFragments()); this.fragmentIndex = 0; + this.accelFragmentIndex = 0; this.taskName = workload.getTaskName(); this.startOfFragment = this.clock.millis(); @@ -112,32 +129,59 @@ public long onUpdate(long now) { // The amount of work done since last update double finishedWork = this.scalingPolicy.getFinishedWork(this.cpuFreqDemand, this.cpuFreqSupplied, passedTime); + double finishedAccelWork = this.scalingPolicy.getFinishedWork(this.accelFreqDemand, this.accelFreqSupplied, passedTime); this.remainingWork -= finishedWork; + this.remainingAccelWork -= finishedAccelWork; // If this.remainingWork <= 0, the fragment has been completed - if (this.remainingWork <= 0) { + if (!this.cpuFragmentsComplete && this.remainingWork <= 0) { this.startNextFragment(); - + this.invalidate(); + } + if (!this.accelFragmentsComplete && this.remainingAccelWork <= 0) { + this.startNextAccelFragment(); + this.invalidate(); + } + if (this.cpuFragmentsComplete && this.accelFragmentsComplete) { + this.stopWorkload(); this.invalidate(); return Long.MAX_VALUE; } + // Set demand for the cpu and accel to 0 after fragments of that resource are complete + // For example, if cpu runs longer than gpu, then gpu demand should be 0 + if (this.cpuFragmentsComplete && this.cpuFreqDemand > 0) { + this.pushOutgoingDemand(this.machineEdge, 0); + } + if (this.accelFragmentsComplete && this.accelFreqDemand > 0) { + this.pushOutgoingDemand(this.accelMachineEdge, 0); + } + this.cpuFreqSupplied = this.newCpuFreqSupplied; + this.accelFreqSupplied = this.newAccelFreqSupplied; // The amount of time required to finish the fragment at this speed long remainingDuration = this.scalingPolicy.getRemainingDuration( this.cpuFreqDemand, this.newCpuFreqSupplied, this.remainingWork); - - if (remainingDuration == 0.0) { - this.remainingWork = 0.0; + long remainingAccelDuration = this.scalingPolicy.getRemainingDuration( + this.accelFreqDemand, this.newAccelFreqSupplied, this.remainingAccelWork); + + long nextUpdate; + if (remainingDuration > 0.0 && remainingAccelDuration > 0.0) { + nextUpdate = min(remainingDuration, remainingAccelDuration); + } else if (remainingDuration > 0.0) { + nextUpdate = remainingDuration; + } else { + nextUpdate = remainingAccelDuration; } - return now + remainingDuration; + return now + nextUpdate; } public TraceFragment getNextFragment() { if (this.remainingFragments.isEmpty()) { + this.cpuFragmentsComplete = true; return null; } this.currentFragment = this.remainingFragments.pop(); @@ -146,11 +190,21 @@ public TraceFragment getNextFragment() { return this.currentFragment; } - private void startNextFragment() { + public TraceFragment getNextAccelFragment() { + if (this.remainingAccelFragments.isEmpty()) { + this.accelFragmentsComplete = true; + return null; + } + this.currentAccelFragment = this.remainingAccelFragments.pop(); + this.accelFragmentIndex++; + return this.currentAccelFragment; + } + + private void startNextFragment() { TraceFragment nextFragment = this.getNextFragment(); if (nextFragment == null) { - this.stopWorkload(); + this.remainingWork = Double.NEGATIVE_INFINITY; return; } double demand = nextFragment.cpuUsage(); @@ -158,9 +212,20 @@ private void startNextFragment() { this.pushOutgoingDemand(this.machineEdge, demand); } + private void startNextAccelFragment() { + TraceFragment nextFragment = this.getNextAccelFragment(); + if (nextFragment == null) { + this.remainingAccelWork = Double.NEGATIVE_INFINITY; + return; + } + double demand = nextFragment.cpuUsage(); + this.remainingAccelWork = this.scalingPolicy.getRemainingWork(demand, nextFragment.duration()); + this.pushOutgoingDemand(this.accelMachineEdge, demand); + } + @Override public void stopWorkload() { - if (this.machineEdge == null) { + if (this.machineEdge == null && this.accelMachineEdge == null) { return; } @@ -169,6 +234,7 @@ public void stopWorkload() { this.closeNode(); this.machineEdge = null; + this.accelMachineEdge = null; this.remainingFragments = null; this.currentFragment = null; } @@ -213,9 +279,7 @@ public void makeSnapshot(long now) { TraceFragment newFragment = new TraceFragment( remainingTime, currentFragment.cpuUsage(), - currentFragment.coreCount(), - currentFragment.accelUsage(), - currentFragment.isGpu()); + currentFragment.coreCount()); // Alter the snapshot by removing finished fragments this.snapshot.removeFragments(this.fragmentIndex); @@ -225,7 +289,7 @@ public void makeSnapshot(long now) { // Create and add a fragment for processing the snapshot process TraceFragment snapshotFragment = new TraceFragment( - this.checkpointDuration, this.snapshot.getMaxCpuDemand(), this.snapshot.getMaxCoreCount(), 0.0, false); + this.checkpointDuration, this.snapshot.getMaxCpuDemand(), this.snapshot.getMaxCoreCount()); this.remainingFragments.addFirst(snapshotFragment); this.fragmentIndex = -1; @@ -248,13 +312,22 @@ public void makeSnapshot(long now) { */ @Override public void handleIncomingSupply(FlowEdge supplierEdge, double newSupply) { - if (newSupply == this.cpuFreqSupplied) { - return; + if (supplierEdge.getResourceType() == FlowEdge.ResourceType.CPU) { + if (newSupply == this.cpuFreqSupplied) { + return; + } + + this.cpuFreqSupplied = this.newCpuFreqSupplied; + this.newCpuFreqSupplied = newSupply; + } else if (supplierEdge.getResourceType() == FlowEdge.ResourceType.ACCEL) { + if (newSupply == this.accelFreqSupplied) { + return; + } + + this.accelFreqSupplied = this.newAccelFreqSupplied; + this.newAccelFreqSupplied = newSupply; } - this.cpuFreqSupplied = this.newCpuFreqSupplied; - this.newCpuFreqSupplied = newSupply; - this.invalidate(); } @@ -266,12 +339,21 @@ public void handleIncomingSupply(FlowEdge supplierEdge, double newSupply) { */ @Override public void pushOutgoingDemand(FlowEdge supplierEdge, double newDemand) { - if (newDemand == this.cpuFreqDemand) { - return; + if (supplierEdge.getResourceType() == FlowEdge.ResourceType.CPU) { + if (newDemand == this.cpuFreqDemand) { + return; + } + + this.cpuFreqDemand = newDemand; + this.machineEdge.pushDemand(newDemand); + } else if (supplierEdge.getResourceType() == FlowEdge.ResourceType.ACCEL) { + if (newDemand == this.accelFreqDemand) { + return; + } + + this.accelFreqDemand = newDemand; + this.accelMachineEdge.pushDemand(newDemand); } - - this.cpuFreqDemand = newDemand; - this.machineEdge.pushDemand(newDemand); } /** @@ -281,7 +363,11 @@ public void pushOutgoingDemand(FlowEdge supplierEdge, double newDemand) { */ @Override public void addSupplierEdge(FlowEdge supplierEdge) { - this.machineEdge = supplierEdge; + if (supplierEdge.getResourceType() == FlowEdge.ResourceType.CPU) { + this.machineEdge = supplierEdge; + } else if (supplierEdge.getResourceType() == FlowEdge.ResourceType.ACCEL) { + this.accelMachineEdge = supplierEdge; + } } /** @@ -292,7 +378,7 @@ public void addSupplierEdge(FlowEdge supplierEdge) { */ @Override public void removeSupplierEdge(FlowEdge supplierEdge) { - if (this.machineEdge == null) { + if (this.machineEdge == null && this.accelMachineEdge == null) { return; } diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceFragment.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceFragment.java index 216258c48..a09206a15 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceFragment.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceFragment.java @@ -22,9 +22,9 @@ package org.opendc.simulator.compute.workload.trace; -public record TraceFragment(long duration, double cpuUsage, int coreCount, double accelUsage, boolean isGpu) { +public record TraceFragment(long duration, double cpuUsage, int coreCount) { - public TraceFragment(long start, long duration, double cpuUsage, int coreCount, double accelUsage, boolean isGpu) { - this(duration, cpuUsage, coreCount, accelUsage, isGpu); + public TraceFragment(long start, long duration, double cpuUsage, int coreCount) { + this(duration, cpuUsage, coreCount); } } diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java index 8e5c13330..c827ee8b9 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java @@ -33,6 +33,7 @@ public class TraceWorkload implements Workload { private final ArrayList fragments; + private final ArrayList accelFragments; private final long checkpointInterval; private final long checkpointDuration; private final double checkpointIntervalScaling; @@ -53,12 +54,14 @@ public ScalingPolicy getScalingPolicy() { public TraceWorkload( ArrayList fragments, + ArrayList accelFragments, long checkpointInterval, long checkpointDuration, double checkpointIntervalScaling, ScalingPolicy scalingPolicy, String taskName) { this.fragments = fragments; + this.accelFragments = accelFragments; this.checkpointInterval = checkpointInterval; this.checkpointDuration = checkpointDuration; this.checkpointIntervalScaling = checkpointIntervalScaling; @@ -68,18 +71,22 @@ public TraceWorkload( // TODO: remove if we decide not to use it. this.maxCpuDemand = fragments.stream() .max(Comparator.comparing(TraceFragment::cpuUsage)) - .get() - .cpuUsage(); + .map(TraceFragment::cpuUsage) + .orElse(0.0); this.maxCoreCount = fragments.stream() .max(Comparator.comparing(TraceFragment::coreCount)) - .get() - .coreCount(); + .map(TraceFragment::coreCount) + .orElse(0); } public ArrayList getFragments() { return fragments; } + public ArrayList getAccelFragments() { + return accelFragments; + } + @Override public long checkpointInterval() { return checkpointInterval; @@ -136,6 +143,7 @@ public static Builder builder( public static final class Builder { private final ArrayList fragments; + private final ArrayList accelFragments; private final long checkpointInterval; private final long checkpointDuration; private final double checkpointIntervalScaling; @@ -152,6 +160,7 @@ private Builder( ScalingPolicy scalingPolicy, String taskName) { this.fragments = new ArrayList<>(); + this.accelFragments = new ArrayList<>(); this.checkpointInterval = checkpointInterval; this.checkpointDuration = checkpointDuration; this.checkpointIntervalScaling = checkpointIntervalScaling; @@ -167,7 +176,11 @@ private Builder( * @param cores The number of cores used during this fragment. */ public void add(long duration, double usage, int cores, double accelUsage, boolean isGpu) { - fragments.add(fragments.size(), new TraceFragment(duration, usage, cores, accelUsage, isGpu)); + if (isGpu) { + accelFragments.add(new TraceFragment(duration, usage, cores)); + } else { + fragments.add(new TraceFragment(duration, usage, cores)); + } } /** @@ -176,6 +189,7 @@ public void add(long duration, double usage, int cores, double accelUsage, boole public TraceWorkload build() { return new TraceWorkload( this.fragments, + this.accelFragments, this.checkpointInterval, this.checkpointDuration, this.checkpointIntervalScaling, diff --git a/opendc-simulator/opendc-simulator-compute/src/test/kotlin/org/opendc/simulator/compute/SimMachineTest.kt b/opendc-simulator/opendc-simulator-compute/src/test/kotlin/org/opendc/simulator/compute/SimMachineTest.kt index 173c60e7e..d1dfc85a6 100644 --- a/opendc-simulator/opendc-simulator-compute/src/test/kotlin/org/opendc/simulator/compute/SimMachineTest.kt +++ b/opendc-simulator/opendc-simulator-compute/src/test/kotlin/org/opendc/simulator/compute/SimMachineTest.kt @@ -37,6 +37,14 @@ class SimMachineTest { fun setUp() { machineModel = MachineModel( + CpuModel( + 0, + 2, + 1000.0, + "Intel", + "Xeon", + "amd64", + ), CpuModel( 0, 2, diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableReader.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableReader.kt index 57427665c..643e659e0 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableReader.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableReader.kt @@ -79,7 +79,7 @@ internal class OdcVmResourceStateTableReader(private val reader: LocalParquetRea } override fun isNull(index: Int): Boolean { - require(index in 0..colCpuUsage) { "Invalid column index" } + require(index in 0..colIsGpu) { "Invalid column index" } return false } diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt index ab5f3ceb1..ba15aca31 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt @@ -342,6 +342,18 @@ public class OpenDCRunner( ) } + val accelerators = + machine.cpus.map { cpu -> + CpuModel( + 0, + cpu.numberOfCores, + cpu.clockRateMhz, + "Intel", + "amd64", + cpu.name, + ) + } + val memoryUnits = machine.memory.map { memory -> MemoryUnit( @@ -360,7 +372,7 @@ public class OpenDCRunner( HostSpec( "node-$clusterId-$position", clusterId, - MachineModel(processors, memoryUnits[0]), + MachineModel(processors, accelerators, memoryUnits[0]), powerModel, accelPowerModel, ) From 9436f00a1f9b0639abc23850102db4c655c4dc52 Mon Sep 17 00:00:00 2001 From: Sacheendra Talluri Date: Sat, 5 Apr 2025 01:10:38 +0200 Subject: [PATCH 3/9] [WIP] Only add isgpu field --- .../compute/workload/ComputeWorkloadLoader.kt | 8 ++------ .../compute/workload/trace/TraceWorkload.java | 2 +- .../org/opendc/trace/conv/ResourceStateColumns.kt | 6 ------ .../formats/opendc/OdcVmResourceStateTableReader.kt | 6 +----- .../formats/opendc/OdcVmResourceStateTableWriter.kt | 10 ++-------- .../opendc/trace/formats/opendc/OdcVmTraceFormat.kt | 2 -- .../trace/formats/opendc/parquet/ResourceState.kt | 1 - .../opendc/parquet/ResourceStateReadSupport.kt | 4 ---- .../parquet/ResourceStateRecordMaterializer.kt | 9 --------- .../opendc/parquet/ResourceStateWriteSupport.kt | 13 +++---------- 10 files changed, 9 insertions(+), 52 deletions(-) diff --git a/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt b/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt index 640e3dc03..0f0312bfd 100644 --- a/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt +++ b/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt @@ -36,7 +36,6 @@ import org.opendc.trace.conv.resourceDuration import org.opendc.trace.conv.resourceID import org.opendc.trace.conv.resourceMemCapacity import org.opendc.trace.conv.resourceNature -import org.opendc.trace.conv.resourceStateAccelUsage import org.opendc.trace.conv.resourceStateCpuUsage import org.opendc.trace.conv.resourceStateDuration import org.opendc.trace.conv.resourceStateIsGpu @@ -81,7 +80,6 @@ public class ComputeWorkloadLoader( val durationCol = reader.resolve(resourceStateDuration) val coresCol = reader.resolve(resourceCpuCount) val usageCol = reader.resolve(resourceStateCpuUsage) - val accelUsageCol = reader.resolve(resourceStateAccelUsage) val isGpuCol = reader.resolve(resourceStateIsGpu) val fragments = mutableMapOf() @@ -92,14 +90,13 @@ public class ComputeWorkloadLoader( val durationMs = reader.getDuration(durationCol)!! val cores = reader.getInt(coresCol) val cpuUsage = reader.getDouble(usageCol) - val accelUsage = reader.getDouble(accelUsageCol) val isGpu = reader.getBoolean(isGpuCol) val builder = fragments.computeIfAbsent( id, ) { Builder(checkpointInterval, checkpointDuration, checkpointIntervalScaling, scalingPolicy, id) } - builder.add(durationMs, cpuUsage, cores, accelUsage, isGpu) + builder.add(durationMs, cpuUsage, cores, isGpu) } fragments @@ -237,12 +234,11 @@ public class ComputeWorkloadLoader( duration: Duration, usage: Double, cores: Int, - accelUsage: Double, isGpu: Boolean, ) { totalLoad += (usage * duration.toMillis()) / 1000 // avg MHz * duration = MFLOPs - builder.add(duration.toMillis(), usage, cores, accelUsage, isGpu) + builder.add(duration.toMillis(), usage, cores, isGpu) } /** diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java index c827ee8b9..9122db038 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java @@ -175,7 +175,7 @@ private Builder( * @param usage The CPU usage at this fragment. * @param cores The number of cores used during this fragment. */ - public void add(long duration, double usage, int cores, double accelUsage, boolean isGpu) { + public void add(long duration, double usage, int cores, boolean isGpu) { if (isGpu) { accelFragments.add(new TraceFragment(duration, usage, cores)); } else { diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceStateColumns.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceStateColumns.kt index c6a335022..eb428bf5f 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceStateColumns.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceStateColumns.kt @@ -48,12 +48,6 @@ public val resourceStatePoweredOn: String = "powered_on" @JvmField public val resourceStateCpuUsage: String = "cpu_usage" -/** - * Total GPU usage of the resource in MHz. - */ -@JvmField -public val resourceStateAccelUsage: String = "accel_usage" - /** * Is the fragment a GPU fragment */ diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableReader.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableReader.kt index 643e659e0..8d65d8ce3 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableReader.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableReader.kt @@ -25,7 +25,6 @@ package org.opendc.trace.formats.opendc import org.opendc.trace.TableReader import org.opendc.trace.conv.resourceCpuCount import org.opendc.trace.conv.resourceID -import org.opendc.trace.conv.resourceStateAccelUsage import org.opendc.trace.conv.resourceStateCpuUsage import org.opendc.trace.conv.resourceStateDuration import org.opendc.trace.conv.resourceStateIsGpu @@ -62,8 +61,7 @@ internal class OdcVmResourceStateTableReader(private val reader: LocalParquetRea private val colDuration = 2 private val colCpuCount = 3 private val colCpuUsage = 4 - private val colAccelUsage = 5 - private val colIsGpu = 6 + private val colIsGpu = 5 override fun resolve(name: String): Int { return when (name) { @@ -72,7 +70,6 @@ internal class OdcVmResourceStateTableReader(private val reader: LocalParquetRea resourceStateDuration -> colDuration resourceCpuCount -> colCpuCount resourceStateCpuUsage -> colCpuUsage - resourceStateAccelUsage -> colAccelUsage resourceStateIsGpu -> colIsGpu else -> -1 } @@ -111,7 +108,6 @@ internal class OdcVmResourceStateTableReader(private val reader: LocalParquetRea val record = checkNotNull(record) { "Reader in invalid state" } return when (index) { colCpuUsage -> record.cpuUsage - colAccelUsage -> record.accelUsage else -> throw IllegalArgumentException("Invalid column or type [index $index]") } } diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableWriter.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableWriter.kt index dfdc49884..f69806b89 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableWriter.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableWriter.kt @@ -26,7 +26,6 @@ import org.apache.parquet.hadoop.ParquetWriter import org.opendc.trace.TableWriter import org.opendc.trace.conv.resourceCpuCount import org.opendc.trace.conv.resourceID -import org.opendc.trace.conv.resourceStateAccelUsage import org.opendc.trace.conv.resourceStateCpuUsage import org.opendc.trace.conv.resourceStateDuration import org.opendc.trace.conv.resourceStateIsGpu @@ -49,7 +48,6 @@ internal class OdcVmResourceStateTableWriter(private val writer: ParquetWriter= lastTimestamp) { "Records need to be ordered by (id, timestamp)" } - writer.write(ResourceState(localID, localTimestamp, localDuration, localCpuCount, localCpuUsage, localAccelUsage, localIsGpu)) + writer.write(ResourceState(localID, localTimestamp, localDuration, localCpuCount, localCpuUsage, localIsGpu)) lastId = localID lastTimestamp = localTimestamp @@ -82,7 +79,6 @@ internal class OdcVmResourceStateTableWriter(private val writer: ParquetWriter colDuration resourceCpuCount -> colCpuCount resourceStateCpuUsage -> colCpuUsage - resourceStateAccelUsage -> colAccelUsage resourceStateIsGpu -> colIsGpu else -> -1 } @@ -131,7 +127,6 @@ internal class OdcVmResourceStateTableWriter(private val writer: ParquetWriter localCpuUsage = value - colAccelUsage -> localAccelUsage = value else -> throw IllegalArgumentException("Invalid column or type [index $index]") } } @@ -219,6 +214,5 @@ internal class OdcVmResourceStateTableWriter(private val writer: ParquetWriter?) : "cpu_count" to resourceCpuCount, "cpuUsage" to resourceStateCpuUsage, "cpu_usage" to resourceStateCpuUsage, - "accel_usage" to resourceStateAccelUsage, "is_gpu" to resourceStateIsGpu, ) @@ -141,8 +139,6 @@ internal class ResourceStateReadSupport(private val projection: List?) : Types .required(PrimitiveType.PrimitiveTypeName.DOUBLE) .named("cpu_usage"), - Types.optional(PrimitiveType.PrimitiveTypeName.DOUBLE) - .named("accel_usage"), Types.optional(PrimitiveType.PrimitiveTypeName.BOOLEAN) .named("is_gpu"), ) diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateRecordMaterializer.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateRecordMaterializer.kt index 8f50c4489..c6feaa5d3 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateRecordMaterializer.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateRecordMaterializer.kt @@ -43,7 +43,6 @@ internal class ResourceStateRecordMaterializer(schema: MessageType) : RecordMate private var localDuration = Duration.ZERO private var localCpuCount = 0 private var localCpuUsage = 0.0 - private var localAccelUsage = 0.0 private var localIsGpu = false /** @@ -87,12 +86,6 @@ internal class ResourceStateRecordMaterializer(schema: MessageType) : RecordMate localCpuUsage = value } } - "accel_usage", "accelUsage" -> - object : PrimitiveConverter() { - override fun addDouble(value: Double) { - localAccelUsage = value - } - } "is_gpu", "isGpu" -> object : PrimitiveConverter() { override fun addBoolean(value: Boolean) { @@ -115,7 +108,6 @@ internal class ResourceStateRecordMaterializer(schema: MessageType) : RecordMate localDuration = Duration.ZERO localCpuCount = 0 localCpuUsage = 0.0 - localAccelUsage = 0.0 localIsGpu = false } @@ -131,7 +123,6 @@ internal class ResourceStateRecordMaterializer(schema: MessageType) : RecordMate localDuration, localCpuCount, localCpuUsage, - localAccelUsage, localIsGpu, ) diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateWriteSupport.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateWriteSupport.kt index cea771906..673dcacce 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateWriteSupport.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateWriteSupport.kt @@ -78,13 +78,9 @@ internal class ResourceStateWriteSupport : WriteSupport() { consumer.addDouble(record.cpuUsage) consumer.endField("cpu_usage", 4) - consumer.startField("accel_usage", 5) - consumer.addDouble(record.accelUsage) - consumer.endField("accel_usage", 5) - - consumer.startField("is_gpu", 6) + consumer.startField("is_gpu", 5) consumer.addBoolean(record.isGpu) - consumer.endField("is_gpu", 6) + consumer.endField("is_gpu", 5) consumer.endMessage() } @@ -115,10 +111,7 @@ internal class ResourceStateWriteSupport : WriteSupport() { .required(PrimitiveType.PrimitiveTypeName.DOUBLE) .named("cpu_usage"), Types - .required(PrimitiveType.PrimitiveTypeName.DOUBLE) - .named("accel_usage"), - Types - .required(PrimitiveType.PrimitiveTypeName.BOOLEAN) + .optional(PrimitiveType.PrimitiveTypeName.BOOLEAN) .named("is_gpu"), ) .named("resource_state") From b7567900399559908cbc053302184be936c8cf81 Mon Sep 17 00:00:00 2001 From: Sacheendra Talluri Date: Sat, 5 Apr 2025 02:10:13 +0200 Subject: [PATCH 4/9] Return both cpu and accel edges from getConnectedEdges --- .../org/opendc/experiments/base/GpuTest.kt | 20 +++++++++---------- .../simulator/compute/machine/SimMachine.java | 3 +++ .../compute/workload/VirtualMachine.java | 12 ++++++++--- .../workload/trace/SimTraceWorkload.java | 6 +++++- 4 files changed, 27 insertions(+), 14 deletions(-) diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt index 57539fc47..0c25787bf 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt @@ -4,7 +4,6 @@ import org.junit.jupiter.api.Assertions.assertEquals import org.junit.jupiter.api.Test import org.junit.jupiter.api.assertAll import org.opendc.compute.workload.Task -import org.opendc.experiments.base.experiment.specs.TraceBasedFailureModelSpec import org.opendc.simulator.compute.workload.trace.TraceFragment import java.util.ArrayList @@ -27,10 +26,11 @@ class GpuTest { val monitor = runTest(topology, workload) + // Power usage is 150 from gpu + 100 from idle cpu in the topology assertAll( { assertEquals(10 * 60 * 1000, monitor.maxTimestamp) { "Total runtime incorrect" } }, - { assertEquals(150.0, monitor.hostPowerDraws["H01"]?.get(0)) { "Incorrect energy usage" } }, - { assertEquals(150.0, monitor.hostPowerDraws["H01"]?.get(9)) { "Incorrect energy usage" } }, + { assertEquals(250.0, monitor.hostPowerDraws["H01"]?.get(0)) { "Incorrect energy usage" } }, + { assertEquals(250.0, monitor.hostPowerDraws["H01"]?.get(9)) { "Incorrect energy usage" } }, ) } @@ -85,9 +85,9 @@ class GpuTest { assertAll( { assertEquals(20 * 60 * 1000, monitor.maxTimestamp) { "Total runtime incorrect" } }, - { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(0)) { "Incorrect energy usage" } }, - { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(8)) { "Incorrect energy usage" } }, - { assertEquals(250.0, monitor.hostPowerDraws["H01"]?.get(13)) { "Incorrect energy usage" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(0)) { "Incorrect energy usage at time 0" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(8)) { "Incorrect energy usage at time 8" } }, + { assertEquals(250.0, monitor.hostPowerDraws["H01"]?.get(13)) { "Incorrect energy usage at time 13" } }, ) } @@ -125,10 +125,10 @@ class GpuTest { assertAll( { assertEquals(30 * 60 * 1000, monitor.maxTimestamp) { "Total runtime incorrect" } }, - { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(0)) { "Incorrect energy usage" } }, - { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(8)) { "Incorrect energy usage" } }, - { assertEquals(250.0, monitor.hostPowerDraws["H01"]?.get(13)) { "Incorrect energy usage" } }, - { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(18)) { "Incorrect energy usage" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(0)) { "Incorrect energy usage at time 0" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(8)) { "Incorrect energy usage at time 8" } }, + { assertEquals(250.0, monitor.hostPowerDraws["H01"]?.get(13)) { "Incorrect energy usage at time 13" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(25)) { "Incorrect energy usage at time 18" } }, ) } } diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/machine/SimMachine.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/machine/SimMachine.java index 5db77ae62..c7bc45127 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/machine/SimMachine.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/machine/SimMachine.java @@ -164,6 +164,9 @@ public void shutdown(Exception cause) { this.psu.closeNode(); this.psu = null; + this.psuDistributor.closeNode(); + this.psuDistributor = null; + this.cpu.closeNode(); this.cpu = null; diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/VirtualMachine.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/VirtualMachine.java index 1edfa7ce0..4f5f48ebe 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/VirtualMachine.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/VirtualMachine.java @@ -22,6 +22,7 @@ package org.opendc.simulator.compute.workload; +import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -381,7 +382,7 @@ public void removeConsumerEdge(FlowEdge consumerEdge) { */ @Override public void removeSupplierEdge(FlowEdge supplierEdge) { - if (this.machineEdge == null) { + if (this.machineEdge == null && this.accelWorkloadEdge == null) { return; } @@ -390,8 +391,13 @@ public void removeSupplierEdge(FlowEdge supplierEdge) { @Override public Map> getConnectedEdges() { - List consumerEdges = (this.machineEdge != null) ? List.of(this.machineEdge) : List.of(); - List supplierEdges = (this.workloadEdge != null) ? List.of(this.workloadEdge) : List.of(); + ArrayList consumerEdges = new ArrayList<>(); + if (this.machineEdge != null) consumerEdges.add(this.machineEdge); + if (this.accelMachineEdge != null) consumerEdges.add(this.accelMachineEdge); + + ArrayList supplierEdges = new ArrayList<>(); + if (this.workloadEdge != null) supplierEdges.add(this.workloadEdge); + if (this.accelWorkloadEdge != null) supplierEdges.add(this.accelWorkloadEdge); return Map.of( FlowEdge.NodeType.CONSUMING, consumerEdges, diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java index bf4daaf77..5da501640 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java @@ -22,6 +22,7 @@ package org.opendc.simulator.compute.workload.trace; +import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -387,6 +388,9 @@ public void removeSupplierEdge(FlowEdge supplierEdge) { @Override public Map> getConnectedEdges() { - return Map.of(FlowEdge.NodeType.CONSUMING, (this.machineEdge != null) ? List.of(this.machineEdge) : List.of()); + ArrayList consumerEdges = new ArrayList<>(); + if (this.machineEdge != null) consumerEdges.add(this.machineEdge); + if (this.accelMachineEdge != null) consumerEdges.add(this.accelMachineEdge); + return Map.of(FlowEdge.NodeType.CONSUMING, consumerEdges); } } From d1298438087ede3b0fd74e1dfdefaa5e3e311234 Mon Sep 17 00:00:00 2001 From: Sacheendra Talluri Date: Sat, 5 Apr 2025 02:22:33 +0200 Subject: [PATCH 5/9] Run spotless apply --- .../compute/topology/TopologyFactories.kt | 33 ++++---- .../org/opendc/experiments/base/GpuTest.kt | 77 ++++++++++++------- .../opendc/experiments/base/TestingUtils.kt | 1 - .../compute/models/MachineModel.java | 10 +-- .../workload/trace/SimTraceWorkload.java | 16 ++-- 5 files changed, 78 insertions(+), 59 deletions(-) diff --git a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt index fed5a0d7f..be25c3837 100644 --- a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt +++ b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt @@ -165,23 +165,24 @@ private fun HostJSONSpec.toHostSpec(clusterName: String): HostSpec { ) } - val accelUnits = if (accel == null) { - List(1) { - CpuModel( - globalCoreId++, - 0, - 0.0, - ) - } - } else { - List(accel.count) { - CpuModel( - globalCoreId++, - accel.coreCount, - accel.coreSpeed.toMHz(), - ) + val accelUnits = + if (accel == null) { + List(1) { + CpuModel( + globalCoreId++, + 0, + 0.0, + ) + } + } else { + List(accel.count) { + CpuModel( + globalCoreId++, + accel.coreCount, + accel.coreSpeed.toMHz(), + ) + } } - } val unknownMemoryUnit = MemoryUnit(memory.vendor, memory.modelName, memory.memorySpeed.toMHz(), memory.memorySize.toMiB().toLong()) val machineModel = diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt index 0c25787bf..8acf8729a 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt @@ -1,3 +1,25 @@ +/* + * Copyright (c) 2025 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + package org.opendc.experiments.base import org.junit.jupiter.api.Assertions.assertEquals @@ -8,7 +30,6 @@ import org.opendc.simulator.compute.workload.trace.TraceFragment import java.util.ArrayList class GpuTest { - @Test fun testGpuOnly() { val workload: ArrayList = @@ -16,9 +37,9 @@ class GpuTest { createTestTask( name = "0", accelFragments = - arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), - ), + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), ), ) @@ -41,13 +62,13 @@ class GpuTest { createTestTask( name = "0", fragments = - arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), - ), + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), accelFragments = - arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), - ), + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), ), ) @@ -69,13 +90,13 @@ class GpuTest { createTestTask( name = "0", fragments = - arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), - ), + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), accelFragments = - arrayListOf( - TraceFragment(20 * 60 * 1000, 1000.0, 1), - ), + arrayListOf( + TraceFragment(20 * 60 * 1000, 1000.0, 1), + ), ), ) @@ -98,24 +119,24 @@ class GpuTest { createTestTask( name = "0", fragments = - arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), - ), + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), accelFragments = - arrayListOf( - TraceFragment(20 * 60 * 1000, 1000.0, 1), - ), + arrayListOf( + TraceFragment(20 * 60 * 1000, 1000.0, 1), + ), ), createTestTask( name = "1", fragments = - arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), - ), + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), accelFragments = - arrayListOf( - TraceFragment(10 * 60 * 1000, 1000.0, 1), - ), + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), ), ) diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/TestingUtils.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/TestingUtils.kt index c7a57f282..44f7e3426 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/TestingUtils.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/TestingUtils.kt @@ -157,7 +157,6 @@ class TestComputeMonitor : ComputeMonitor { taskAccelDemands[taskName] = arrayListOf(reader.cpuDemand) taskAccelSupplied[taskName] = arrayListOf(reader.cpuUsage) } - } var attemptsSuccess = 0 diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/models/MachineModel.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/models/MachineModel.java index 515904822..1c70ed0ee 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/models/MachineModel.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/models/MachineModel.java @@ -65,11 +65,11 @@ public MachineModel(List cpus, List accelerators, MemoryUnit cpus.get(0).getArchitecture()), new CpuModel( accelerators.get(0).getId(), - accelerators.get(0).getCoreCount() * cpus.size(), - accelerators.get(0).getCoreSpeed(), - accelerators.get(0).getVendor(), - accelerators.get(0).getModelName(), - accelerators.get(0).getArchitecture()), + accelerators.get(0).getCoreCount() * cpus.size(), + accelerators.get(0).getCoreSpeed(), + accelerators.get(0).getVendor(), + accelerators.get(0).getModelName(), + accelerators.get(0).getArchitecture()), memory); } diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java index 5da501640..5e252ee51 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java @@ -22,6 +22,8 @@ package org.opendc.simulator.compute.workload.trace; +import static java.lang.Math.min; + import java.util.ArrayList; import java.util.LinkedList; import java.util.List; @@ -33,9 +35,6 @@ import org.opendc.simulator.engine.graph.FlowNode; import org.opendc.simulator.engine.graph.FlowSupplier; -import static java.lang.Math.max; -import static java.lang.Math.min; - public class SimTraceWorkload extends SimWorkload implements FlowConsumer { private LinkedList remainingFragments; private LinkedList remainingAccelFragments; @@ -130,7 +129,8 @@ public long onUpdate(long now) { // The amount of work done since last update double finishedWork = this.scalingPolicy.getFinishedWork(this.cpuFreqDemand, this.cpuFreqSupplied, passedTime); - double finishedAccelWork = this.scalingPolicy.getFinishedWork(this.accelFreqDemand, this.accelFreqSupplied, passedTime); + double finishedAccelWork = + this.scalingPolicy.getFinishedWork(this.accelFreqDemand, this.accelFreqSupplied, passedTime); this.remainingWork -= finishedWork; this.remainingAccelWork -= finishedAccelWork; @@ -166,7 +166,7 @@ public long onUpdate(long now) { long remainingDuration = this.scalingPolicy.getRemainingDuration( this.cpuFreqDemand, this.newCpuFreqSupplied, this.remainingWork); long remainingAccelDuration = this.scalingPolicy.getRemainingDuration( - this.accelFreqDemand, this.newAccelFreqSupplied, this.remainingAccelWork); + this.accelFreqDemand, this.newAccelFreqSupplied, this.remainingAccelWork); long nextUpdate; if (remainingDuration > 0.0 && remainingAccelDuration > 0.0) { @@ -277,10 +277,8 @@ public void makeSnapshot(long now) { } // Create a new fragment based on the current fragment and remaining duration - TraceFragment newFragment = new TraceFragment( - remainingTime, - currentFragment.cpuUsage(), - currentFragment.coreCount()); + TraceFragment newFragment = + new TraceFragment(remainingTime, currentFragment.cpuUsage(), currentFragment.coreCount()); // Alter the snapshot by removing finished fragments this.snapshot.removeFragments(this.fragmentIndex); From a2055221c95e70ce964e830686cbb9e48b5176a8 Mon Sep 17 00:00:00 2001 From: Sacheendra Talluri Date: Sat, 5 Apr 2025 02:46:09 +0200 Subject: [PATCH 6/9] Increase heap size during test for the 1000 task test --- .../kotlin/testing-conventions.gradle.kts | 50 ++++++++++--------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/buildSrc/src/main/kotlin/testing-conventions.gradle.kts b/buildSrc/src/main/kotlin/testing-conventions.gradle.kts index b374d0ffa..2a521ac94 100644 --- a/buildSrc/src/main/kotlin/testing-conventions.gradle.kts +++ b/buildSrc/src/main/kotlin/testing-conventions.gradle.kts @@ -27,6 +27,10 @@ plugins { tasks.test { useJUnitPlatform() + minHeapSize = "512m" + maxHeapSize = "3072m" + jvmArgs = listOf("-XX:MaxMetaspaceSize=512m") + reports { html.required.set(true) junitXml.required.set(true) @@ -42,26 +46,26 @@ dependencies { testRuntimeOnly(versionCatalog["junit.jupiter.engine"]) } -tasks.register("testsOn18") { - javaLauncher.set(javaToolchains.launcherFor { - languageVersion.set(JavaLanguageVersion.of(18)) - }) - - useJUnitPlatform() - - minHeapSize = "512m" - maxHeapSize = "1024m" - jvmArgs = listOf("-XX:MaxMetaspaceSize=512m") -} - -tasks.register("testsOn19") { - javaLauncher.set(javaToolchains.launcherFor { - languageVersion.set(JavaLanguageVersion.of(19)) - }) - - useJUnitPlatform() - - minHeapSize = "512m" - maxHeapSize = "1024m" - jvmArgs = listOf("-XX:MaxMetaspaceSize=512m") -} +//tasks.register("testsOn18") { +// javaLauncher.set(javaToolchains.launcherFor { +// languageVersion.set(JavaLanguageVersion.of(18)) +// }) +// +// useJUnitPlatform() +// +// minHeapSize = "512m" +// maxHeapSize = "1024m" +// jvmArgs = listOf("-XX:MaxMetaspaceSize=512m") +//} +// +//tasks.register("testsOn19") { +// javaLauncher.set(javaToolchains.launcherFor { +// languageVersion.set(JavaLanguageVersion.of(19)) +// }) +// +// useJUnitPlatform() +// +// minHeapSize = "512m" +// maxHeapSize = "1024m" +// jvmArgs = listOf("-XX:MaxMetaspaceSize=512m") +//} From d5740709334b76b402a3983113dae3f8276e61a4 Mon Sep 17 00:00:00 2001 From: Sacheendra Talluri Date: Sat, 5 Apr 2025 02:58:30 +0200 Subject: [PATCH 7/9] Fix error in test error description --- .../src/test/kotlin/org/opendc/experiments/base/GpuTest.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt index 8acf8729a..6521cc8f9 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt @@ -149,7 +149,7 @@ class GpuTest { { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(0)) { "Incorrect energy usage at time 0" } }, { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(8)) { "Incorrect energy usage at time 8" } }, { assertEquals(250.0, monitor.hostPowerDraws["H01"]?.get(13)) { "Incorrect energy usage at time 13" } }, - { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(25)) { "Incorrect energy usage at time 18" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(25)) { "Incorrect energy usage at time 25" } }, ) } } From c6b126870e609e71e201df2b0d6151c4a3c45df6 Mon Sep 17 00:00:00 2001 From: Sacheendra Talluri Date: Tue, 8 Apr 2025 22:21:25 +0200 Subject: [PATCH 8/9] [WIP] Add gpu to snapshot --- .../org/opendc/experiments/base/GpuTest.kt | 35 ++++++++++++ .../workload/trace/SimTraceWorkload.java | 57 ++++++++++++++----- .../compute/workload/trace/TraceWorkload.java | 11 ++++ 3 files changed, 90 insertions(+), 13 deletions(-) diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt index 6521cc8f9..e3ee08726 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt @@ -26,6 +26,7 @@ import org.junit.jupiter.api.Assertions.assertEquals import org.junit.jupiter.api.Test import org.junit.jupiter.api.assertAll import org.opendc.compute.workload.Task +import org.opendc.experiments.base.experiment.specs.TraceBasedFailureModelSpec import org.opendc.simulator.compute.workload.trace.TraceFragment import java.util.ArrayList @@ -152,4 +153,38 @@ class GpuTest { { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(25)) { "Incorrect energy usage at time 25" } }, ) } + + @Test + fun testGpuSnapshot() { + val workload: ArrayList = + arrayListOf( + createTestTask( + name = "0", + fragments = + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), + accelFragments = + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), + ), + ) + + val failureModelSpec = + TraceBasedFailureModelSpec( + "src/test/resources/failureTraces/single_failure.parquet", + repeat = false, + ) + + val topology = createTopology("gpu/single_1_2000.json") + + val monitor = runTest(topology, workload, failureModelSpec) + + assertAll( + { assertEquals(10 * 60 * 1000, monitor.maxTimestamp) { "Total runtime incorrect" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(0)) { "Incorrect energy usage" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(9)) { "Incorrect energy usage" } }, + ) + } } diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java index 5e252ee51..a5d7c94e4 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java @@ -264,35 +264,66 @@ public void makeSnapshot(long now) { // The amount of work done since last update double finishedWork = this.scalingPolicy.getFinishedWork(this.cpuFreqDemand, this.cpuFreqSupplied, passedTime); + double finishedAccelWork = this.scalingPolicy.getFinishedWork(this.accelFreqDemand, this.accelFreqSupplied, passedTime); this.remainingWork -= finishedWork; + this.remainingAccelWork -= finishedAccelWork; // The amount of time required to finish the fragment at this speed long remainingTime = this.scalingPolicy.getRemainingDuration(this.cpuFreqDemand, this.cpuFreqDemand, this.remainingWork); + long remainingAccelTime = + this.scalingPolicy.getRemainingDuration(this.accelFreqDemand, this.accelFreqDemand, this.remainingAccelWork); // If this is the end of the Task, don't make a snapshot if (remainingTime <= 0 && remainingFragments.isEmpty()) { return; - } - - // Create a new fragment based on the current fragment and remaining duration - TraceFragment newFragment = + } else { + // Create a new fragment based on the current fragment and remaining duration + TraceFragment newFragment = new TraceFragment(remainingTime, currentFragment.cpuUsage(), currentFragment.coreCount()); - // Alter the snapshot by removing finished fragments - this.snapshot.removeFragments(this.fragmentIndex); - this.snapshot.addFirst(newFragment); + // Alter the snapshot by removing finished fragments + this.snapshot.removeFragments(this.fragmentIndex); + this.snapshot.addFirst(newFragment); - this.remainingFragments.addFirst(newFragment); + this.remainingFragments.addFirst(newFragment); - // Create and add a fragment for processing the snapshot process - TraceFragment snapshotFragment = new TraceFragment( + // Create and add a fragment for processing the snapshot process + TraceFragment snapshotFragment = new TraceFragment( this.checkpointDuration, this.snapshot.getMaxCpuDemand(), this.snapshot.getMaxCoreCount()); - this.remainingFragments.addFirst(snapshotFragment); + this.remainingFragments.addFirst(snapshotFragment); + + this.fragmentIndex = -1; + startNextFragment(); + } + + if (remainingAccelTime <= 0 && remainingAccelFragments.isEmpty()) { + return; + } else { + // Create a new fragment based on the current fragment and remaining duration + TraceFragment newAccelFragment = + new TraceFragment(remainingAccelTime, currentAccelFragment.cpuUsage(), currentAccelFragment.coreCount()); + + // Alter the snapshot by removing finished fragments + this.snapshot.removeAccelFragments(this.accelFragmentIndex); + this.snapshot.addFirstAccel(newAccelFragment); + + this.remainingAccelFragments.addFirst(newAccelFragment); + + // Create and add a fragment for processing the snapshot process + // Not adding a snapshot cost to accel fragments for now +// TraceFragment snapshotAccelFragment = new TraceFragment( +// this.checkpointDuration, this.snapshot.getMaxCpuDemand(), this.snapshot.getMaxCoreCount()); +// this.remainingAccelFragments.addFirst(snapshotAccelFragment); - this.fragmentIndex = -1; - startNextFragment(); + this.accelFragmentIndex = -1; + startNextAccelFragment(); + } + + if (remainingFragments.isEmpty() && remainingAccelFragments.isEmpty()) { + return; + } this.startOfFragment = now; diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java index 9122db038..9dca77ea6 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java @@ -121,6 +121,17 @@ public void addFirst(TraceFragment fragment) { this.fragments.addFirst(fragment); } + public void removeAccelFragments(int numberOfFragments) { + if (numberOfFragments <= 0) { + return; + } + this.accelFragments.subList(0, numberOfFragments).clear(); + } + + public void addFirstAccel(TraceFragment fragment) { + this.accelFragments.addFirst(fragment); + } + @Override public SimWorkload startWorkload(FlowSupplier supplier, FlowSupplier accelSupplier) { return new SimTraceWorkload(supplier, accelSupplier, this); From 21b089b8c495d6dd07480888893eb9b4fd528347 Mon Sep 17 00:00:00 2001 From: Sacheendra Talluri Date: Tue, 8 Apr 2025 23:58:23 +0200 Subject: [PATCH 9/9] Fix checkpoint tests and working gpu checkpoints --- .../base/FailuresAndCheckpointingTest.kt | 74 ++++++++++++++++--- .../org/opendc/experiments/base/GpuTest.kt | 53 ++++++++++++- .../workload/trace/SimTraceWorkload.java | 36 ++++----- .../compute/workload/trace/TraceWorkload.java | 19 +++++ 4 files changed, 148 insertions(+), 34 deletions(-) diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FailuresAndCheckpointingTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FailuresAndCheckpointingTest.kt index 3231f533f..df3a3c883 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FailuresAndCheckpointingTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/FailuresAndCheckpointingTest.kt @@ -247,13 +247,31 @@ class FailuresAndCheckpointingTest { ), ) + val failureModelSpec = + TraceBasedFailureModelSpec( + "src/test/resources/failureTraces/single_failure.parquet", + repeat = false, + ) + val topology = createTopology("single_1_2000.json") - val monitor = runTest(topology, workload) + val monitor = runTest(topology, workload, failureModelSpec) assertAll( - { assertEquals((10 * 60000) + (9 * 1000), monitor.maxTimestamp) { "Total runtime incorrect" } }, - { assertEquals((10 * 60 * 150.0) + (9 * 150.0), monitor.hostEnergyUsages["H01"]?.sum()) { "Incorrect energy usage" } }, + // Task run time + Time node is in failed state + checkpoint time + time waiting to be scheduled + { + assertEquals( + (10 * 60 * 1000) + (5 * 60 * 1000) + (9 * 1000) + (56 * 1000), + monitor.maxTimestamp, + ) { "Total runtime incorrect" } + }, + // TODO: The energy draw of last item (56 * 150.0) is wrong. Figure out why? + { + assertEquals( + (10 * 60 * 150.0) + (5 * 60 * 100.0) + (9 * 150.0) + (56 * 150.0), + monitor.hostEnergyUsages["H01"]?.sum(), + ) { "Incorrect energy usage" } + }, ) } @@ -284,15 +302,26 @@ class FailuresAndCheckpointingTest { ), ) + val failureModelSpec = + TraceBasedFailureModelSpec( + "src/test/resources/failureTraces/single_failure.parquet", + repeat = false, + ) + val topology = createTopology("single_1_2000.json") - val monitor = runTest(topology, workload) + val monitor = runTest(topology, workload, failureModelSpec) assertAll( - { assertEquals((20 * 60000) + (19 * 1000), monitor.maxTimestamp) { "Total runtime incorrect" } }, { assertEquals( - (10 * 60 * 200.0) + (10 * 60 * 150.0) + (19 * 200.0), + (20 * 60000) + (5 * 60 * 1000) + (19 * 1000) + (56 * 1000), + monitor.maxTimestamp, + ) { "Total runtime incorrect" } + }, + { + assertEquals( + (10 * 60 * 200.0) + (10 * 60 * 150.0) + (5 * 60 * 100.0) + (19 * 200.0) + (56 * 200.0), monitor.hostEnergyUsages["H01"]?.sum(), ) { "Incorrect energy usage" } }, @@ -326,15 +355,26 @@ class FailuresAndCheckpointingTest { ), ) + val failureModelSpec = + TraceBasedFailureModelSpec( + "src/test/resources/failureTraces/single_failure.parquet", + repeat = false, + ) + val topology = createTopology("single_1_2000.json") - val monitor = runTest(topology, workload) + val monitor = runTest(topology, workload, failureModelSpec) assertAll( - { assertEquals((20 * 60000) + (19 * 1000), monitor.maxTimestamp) { "Total runtime incorrect" } }, { assertEquals( - (10 * 60 * 200.0) + (10 * 60 * 150.0) + (19 * 200.0), + (20 * 60000) + (5 * 60 * 1000) + (19 * 1000) + (56 * 1000), + monitor.maxTimestamp, + ) { "Total runtime incorrect" } + }, + { + assertEquals( + (10 * 60 * 200.0) + (10 * 60 * 150.0) + (5 * 60 * 100.0) + (19 * 200.0) + (56 * 150.0), monitor.hostEnergyUsages["H01"]?.sum(), ) { "Incorrect energy usage" } }, @@ -364,13 +404,23 @@ class FailuresAndCheckpointingTest { ), ) + val failureModelSpec = + TraceBasedFailureModelSpec( + "src/test/resources/failureTraces/single_failure.parquet", + repeat = false, + ) + val topology = createTopology("single_1_2000.json") - val monitor = runTest(topology, workload) + val monitor = runTest(topology, workload, failureModelSpec) assertAll( - { assertEquals((10 * 60000) + (4 * 1000), monitor.maxTimestamp) { "Total runtime incorrect" } }, - { assertEquals((10 * 60 * 150.0) + (4 * 150.0), monitor.hostEnergyUsages["H01"]?.sum()) { "Incorrect energy usage" } }, + { assertEquals((10 * 60000) + (5 * 60 * 1000) + (4 * 1000) + (14 * 1000), monitor.maxTimestamp) { "Total runtime incorrect" } }, + { + assertEquals((10 * 60 * 150.0) + (5 * 60 * 100.0) + (4 * 150.0) + (14 * 150.0), monitor.hostEnergyUsages["H01"]?.sum()) { + "Incorrect energy usage" + } + }, ) } diff --git a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt index e3ee08726..ed657a227 100644 --- a/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt +++ b/opendc-experiments/opendc-experiments-base/src/test/kotlin/org/opendc/experiments/base/GpuTest.kt @@ -154,8 +154,11 @@ class GpuTest { ) } + /** + * Injecting a failure after 5 minutes. Failure lasts 5 minutes. No checkpointing + */ @Test - fun testGpuSnapshot() { + fun testGpuFailure() { val workload: ArrayList = arrayListOf( createTestTask( @@ -182,9 +185,51 @@ class GpuTest { val monitor = runTest(topology, workload, failureModelSpec) assertAll( - { assertEquals(10 * 60 * 1000, monitor.maxTimestamp) { "Total runtime incorrect" } }, - { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(0)) { "Incorrect energy usage" } }, - { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(9)) { "Incorrect energy usage" } }, + { assertEquals(20 * 60 * 1000, monitor.maxTimestamp) { "Total runtime incorrect" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(0)) { "Incorrect energy usage at 0" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(12)) { "Incorrect energy usage at 9" } }, + ) + } + + @Test + fun testGpuCheckpoint() { + val workload: ArrayList = + arrayListOf( + createTestTask( + name = "0", + fragments = + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), + accelFragments = + arrayListOf( + TraceFragment(10 * 60 * 1000, 1000.0, 1), + ), + checkpointInterval = 60 * 1000L, + checkpointDuration = 1000L, + ), + ) + + val failureModelSpec = + TraceBasedFailureModelSpec( + "src/test/resources/failureTraces/single_failure.parquet", + repeat = false, + ) + + val topology = createTopology("gpu/single_1_2000.json") + + val monitor = runTest(topology, workload, failureModelSpec) + + assertAll( + // Task run time + Time node is in failed state + checkpoint time + time waiting to be scheduled + { + assertEquals( + (10 * 60 * 1000) + (5 * 60 * 1000) + (9 * 1000) + (56 * 1000), + monitor.maxTimestamp, + ) { "Total runtime incorrect" } + }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(0)) { "Incorrect power draw at 0" } }, + { assertEquals(300.0, monitor.hostPowerDraws["H01"]?.get(12)) { "Incorrect power draw at 9" } }, ) } } diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java index a5d7c94e4..b6a70f833 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/SimTraceWorkload.java @@ -264,24 +264,23 @@ public void makeSnapshot(long now) { // The amount of work done since last update double finishedWork = this.scalingPolicy.getFinishedWork(this.cpuFreqDemand, this.cpuFreqSupplied, passedTime); - double finishedAccelWork = this.scalingPolicy.getFinishedWork(this.accelFreqDemand, this.accelFreqSupplied, passedTime); + double finishedAccelWork = + this.scalingPolicy.getFinishedWork(this.accelFreqDemand, this.accelFreqSupplied, passedTime); this.remainingWork -= finishedWork; this.remainingAccelWork -= finishedAccelWork; // The amount of time required to finish the fragment at this speed long remainingTime = - this.scalingPolicy.getRemainingDuration(this.cpuFreqDemand, this.cpuFreqDemand, this.remainingWork); - long remainingAccelTime = - this.scalingPolicy.getRemainingDuration(this.accelFreqDemand, this.accelFreqDemand, this.remainingAccelWork); + this.scalingPolicy.getRemainingDuration(this.cpuFreqDemand, this.cpuFreqSupplied, this.remainingWork); + long remainingAccelTime = this.scalingPolicy.getRemainingDuration( + this.accelFreqDemand, this.accelFreqSupplied, this.remainingAccelWork); // If this is the end of the Task, don't make a snapshot - if (remainingTime <= 0 && remainingFragments.isEmpty()) { - return; - } else { + if (!(remainingTime <= 0 && remainingFragments.isEmpty())) { // Create a new fragment based on the current fragment and remaining duration TraceFragment newFragment = - new TraceFragment(remainingTime, currentFragment.cpuUsage(), currentFragment.coreCount()); + new TraceFragment(remainingTime, currentFragment.cpuUsage(), currentFragment.coreCount()); // Alter the snapshot by removing finished fragments this.snapshot.removeFragments(this.fragmentIndex); @@ -291,19 +290,17 @@ public void makeSnapshot(long now) { // Create and add a fragment for processing the snapshot process TraceFragment snapshotFragment = new TraceFragment( - this.checkpointDuration, this.snapshot.getMaxCpuDemand(), this.snapshot.getMaxCoreCount()); + this.checkpointDuration, this.snapshot.getMaxCpuDemand(), this.snapshot.getMaxCoreCount()); this.remainingFragments.addFirst(snapshotFragment); this.fragmentIndex = -1; startNextFragment(); } - if (remainingAccelTime <= 0 && remainingAccelFragments.isEmpty()) { - return; - } else { + if (!(remainingAccelTime <= 0 && remainingAccelFragments.isEmpty())) { // Create a new fragment based on the current fragment and remaining duration - TraceFragment newAccelFragment = - new TraceFragment(remainingAccelTime, currentAccelFragment.cpuUsage(), currentAccelFragment.coreCount()); + TraceFragment newAccelFragment = new TraceFragment( + remainingAccelTime, currentAccelFragment.cpuUsage(), currentAccelFragment.coreCount()); // Alter the snapshot by removing finished fragments this.snapshot.removeAccelFragments(this.accelFragmentIndex); @@ -313,15 +310,18 @@ public void makeSnapshot(long now) { // Create and add a fragment for processing the snapshot process // Not adding a snapshot cost to accel fragments for now -// TraceFragment snapshotAccelFragment = new TraceFragment( -// this.checkpointDuration, this.snapshot.getMaxCpuDemand(), this.snapshot.getMaxCoreCount()); -// this.remainingAccelFragments.addFirst(snapshotAccelFragment); + TraceFragment snapshotAccelFragment = new TraceFragment( + this.checkpointDuration, this.snapshot.getMaxAccelDemand(), this.snapshot.getMaxAccelCoreCount()); + this.remainingAccelFragments.addFirst(snapshotAccelFragment); this.accelFragmentIndex = -1; startNextAccelFragment(); } - if (remainingFragments.isEmpty() && remainingAccelFragments.isEmpty()) { + if (remainingTime <= 0 + && remainingAccelTime <= 0 + && remainingFragments.isEmpty() + && remainingAccelFragments.isEmpty()) { return; } diff --git a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java index 9dca77ea6..e2b08eeb3 100644 --- a/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java +++ b/opendc-simulator/opendc-simulator-compute/src/main/java/org/opendc/simulator/compute/workload/trace/TraceWorkload.java @@ -39,6 +39,8 @@ public class TraceWorkload implements Workload { private final double checkpointIntervalScaling; private final double maxCpuDemand; private final int maxCoreCount; + private final double maxAccelDemand; + private final int maxAccelCoreCount; public String getTaskName() { return taskName; @@ -77,6 +79,15 @@ public TraceWorkload( .max(Comparator.comparing(TraceFragment::coreCount)) .map(TraceFragment::coreCount) .orElse(0); + + this.maxAccelDemand = accelFragments.stream() + .max(Comparator.comparing(TraceFragment::cpuUsage)) + .map(TraceFragment::cpuUsage) + .orElse(0.0); + this.maxAccelCoreCount = accelFragments.stream() + .max(Comparator.comparing(TraceFragment::coreCount)) + .map(TraceFragment::coreCount) + .orElse(0); } public ArrayList getFragments() { @@ -110,6 +121,14 @@ public double getMaxCpuDemand() { return maxCpuDemand; } + public int getMaxAccelCoreCount() { + return maxAccelCoreCount; + } + + public double getMaxAccelDemand() { + return maxAccelDemand; + } + public void removeFragments(int numberOfFragments) { if (numberOfFragments <= 0) { return;