Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Deeploy/Targets/PULPOpen/Bindings.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@
PULPSynchCoresPass(),
ForkClosure(writeback = False, generateStruct = True),
TilingVariableReplacementUpdate("L1"),
PULPClusterTiling("L2", "L1", MchanDma()),
PULPClusterTiling("L2", "L1", MchanDma(), usePerfCounters=True), # Enable perf counters
ArgumentStructGeneration(),
MemoryManagementGeneration("L1"),
TilingVariableReplacement("L2"),
Expand All @@ -120,7 +120,7 @@
TilingVariableReplacement("L1"),
TilingCallClosure(writeback = False, generateStruct = True),
TilingVariableReplacementUpdate("L1"),
PULPClusterTiling("L2", "L1", MchanDma()),
PULPClusterTiling("L2", "L1", MchanDma(), usePerfCounters=True), # Enable perf counters
ArgumentStructGeneration(),
MemoryManagementGeneration("L1"),
TilingVariableReplacement("L2"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity
from Deeploy.TilingExtension.AsyncDma import AsyncDma
from Deeploy.TilingExtension.CodeTransformationPasses.DoubleBufferingTilingCodeGeneration import \
DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn
DoubleBufferingTilingCodeGeneration, PerfCounterDoubleBufferingTilingMixIn, ProfilingDoubleBufferingTilingMixIn
from Deeploy.TilingExtension.CodeTransformationPasses.SingleBufferingTilingCodeGeneration import \
ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration
PerfCounterSingleBufferingTilingMixIn, ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration


class PULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration):
Expand All @@ -28,24 +28,55 @@ class ProfilingPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration
pass


class PerfCounterPULPClusterTilingGenerationSB(
        SingleBufferingTilingCodeGeneration,
        PerfCounterSingleBufferingTilingMixIn,
):
    """Single-buffering tiling code generation with performance-counter profiling.

    Pure composition class: all behavior comes from the two bases, in the
    listed MRO order.
    """


class PerfCounterPULPClusterTilingGenerationDB(
        DoubleBufferingTilingCodeGeneration,
        PerfCounterDoubleBufferingTilingMixIn,
):
    """Double-buffering tiling code generation with performance-counter profiling.

    Pure composition class: all behavior comes from the two bases, in the
    listed MRO order.
    """


class CombinedProfilingPULPClusterTilingGenerationSB(
        SingleBufferingTilingCodeGeneration,
        ProfilingSingleBufferingTilingMixIn,
        PerfCounterSingleBufferingTilingMixIn,
):
    """Single-buffering tiling code generation with cycle AND performance-counter profiling.

    Pure composition class: the cycle-profiling MixIn precedes the
    performance-counter MixIn in the MRO, so cooperative ``super()`` calls
    reach the cycle-profiling hooks first.
    """


class CombinedProfilingPULPClusterTilingGenerationDB(
        DoubleBufferingTilingCodeGeneration,
        ProfilingDoubleBufferingTilingMixIn,
        PerfCounterDoubleBufferingTilingMixIn,
):
    """Double-buffering tiling code generation with cycle AND performance-counter profiling.

    Pure composition class: the cycle-profiling MixIn precedes the
    performance-counter MixIn in the MRO, so cooperative ``super()`` calls
    reach the cycle-profiling hooks first.
    """


class PULPClusterTiling(CodeTransformationPass):

def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma):
def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma, usePerfCounters: bool = False):
self.usePerfCounters = usePerfCounters
self.SB = PULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
self.profilingSB = ProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
self.perfCounterSB = PerfCounterPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
self.combinedProfilingSB = CombinedProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
self.DB = PULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
self.profilingDB = ProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
self.perfCounterDB = PerfCounterPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
self.combinedProfilingDB = CombinedProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)

def apply(self,
ctxt: NetworkContext,
executionBlock: ExecutionBlock,
name: str,
verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:

if verbose.tilingProfiling:
if self.usePerfCounters and verbose.tilingProfiling:
# Use combined profiling: cycle measurements + performance counter stats
ctxt, executionBlock = self.combinedProfilingSB.apply(ctxt, executionBlock, name)
ctxt, executionBlock = self.combinedProfilingDB.apply(ctxt, executionBlock, name)
elif verbose.tilingProfiling:
# Use cycle profiling only (basic cycle measurements)
ctxt, executionBlock = self.profilingSB.apply(ctxt, executionBlock, name)
ctxt, executionBlock = self.profilingDB.apply(ctxt, executionBlock, name)
else:
# No profiling
ctxt, executionBlock = self.SB.apply(ctxt, executionBlock, name)
ctxt, executionBlock = self.DB.apply(ctxt, executionBlock, name)

Expand Down
2 changes: 1 addition & 1 deletion Deeploy/Targets/PULPOpen/Platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ class PULPStructBuffer(StructBuffer):

# SCHEREMO: stdint is included before pulp_nn_kernels.h because it is supposed to be included in there, but isn't...
_includeList = [
"pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h"
"pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h", "perf_utils.h"
]


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
from Deeploy.TilingExtension.AsyncDma import AnydimAsyncDmaTransferAdapter, AsyncDma, Future
from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration
from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays
from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \
PrototypeTilingMixIn, TilingMetaInfo
from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PerfCounterProfilingMixIn, \
ProfilingPrototypeMixIn, PrototypeTilingMixIn, TilingMetaInfo
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, stridesFromShape

Expand Down Expand Up @@ -364,3 +364,38 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn
executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements,
_egressDMAStatements, closeLoopStatements)
return executionBlock

class PerfCounterDoubleBufferingTilingMixIn(PrototypeTilingMixIn, PerfCounterProfilingMixIn):
    """Double-buffering tiling MixIn that adds performance-counter profiling.

    The counter init snippet and the stop-and-print snippet are attached to
    the block produced by the setup/teardown generation, so the reported
    statistics cover the tiling loop as a whole rather than individual tiles.
    """

    @classmethod
    def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
                                     setupStatements: List[CodeSnippet],
                                     teardownStatements: List[CodeSnippet]) -> ExecutionBlock:
        """Generate the regular setup/teardown code, then wrap it with counter start and stop.

        Args:
            executionBlock: Block to which setup and teardown code is added.
            metaInfo: Tiling meta information; forwarded to the injection helpers.
            setupStatements: Snippets forwarded unchanged to the parent implementation.
            teardownStatements: Snippets forwarded unchanged to the parent implementation.

        Returns:
            The block after the parent's setup/teardown generation, with the
            counter init snippet and the counter stop snippet injected.
        """
        wrappedBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements,
                                                            teardownStatements)

        # Init is injected first, stop second; both are injected once here, not per tile.
        return cls.injectPerfCounterStop(cls.injectPerfCounterInit(wrappedBlock, metaInfo), metaInfo)

    @classmethod
    def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
                         openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet],
                         egressDMAStatements: List[CodeSnippet],
                         closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock:
        """Generate the tiling loop without any additional instrumentation.

        The kernel is deliberately NOT wrapped with ``injectPerfCounterKernelWrap``:
        the counters measure the whole tiling loop, not individual tiles. All
        arguments are forwarded unchanged to the next implementation in the MRO.
        """
        return super().generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements,
                                        egressDMAStatements, closeLoopStatements)
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future
from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration
from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays
from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \
PrototypeTilingMixIn, TilingMetaInfo
from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PerfCounterProfilingMixIn, \
ProfilingPrototypeMixIn, PrototypeTilingMixIn, TilingMetaInfo
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint, TensorMemoryConstraint
from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme

Expand Down Expand Up @@ -191,3 +191,39 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn
executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements,
_egressDMAStatements, closeLoopStatements)
return executionBlock


class PerfCounterSingleBufferingTilingMixIn(PrototypeTilingMixIn, PerfCounterProfilingMixIn):
    """Single-buffering tiling MixIn that adds performance-counter profiling.

    The counter init snippet and the stop-and-print snippet are attached to
    the block produced by the setup/teardown generation, so the reported
    statistics cover the tiling loop as a whole rather than individual tiles.
    """

    @classmethod
    def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
                                     setupStatements: List[CodeSnippet],
                                     teardownStatements: List[CodeSnippet]) -> ExecutionBlock:
        """Generate the regular setup/teardown code, then wrap it with counter start and stop.

        Args:
            executionBlock: Block to which setup and teardown code is added.
            metaInfo: Tiling meta information; forwarded to the injection helpers.
            setupStatements: Snippets forwarded unchanged to the parent implementation.
            teardownStatements: Snippets forwarded unchanged to the parent implementation.

        Returns:
            The block after the parent's setup/teardown generation, with the
            counter init snippet and the counter stop snippet injected.
        """
        wrappedBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements,
                                                            teardownStatements)

        # Init is injected first, stop second; both are injected once here, not per tile.
        return cls.injectPerfCounterStop(cls.injectPerfCounterInit(wrappedBlock, metaInfo), metaInfo)

    @classmethod
    def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
                         openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet],
                         egressDMAStatements: List[CodeSnippet],
                         closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock:
        """Generate the tiling loop without any additional instrumentation.

        The kernel is deliberately NOT wrapped with ``injectPerfCounterKernelWrap``:
        the counters measure the whole tiling loop, not individual tiles. All
        arguments are forwarded unchanged to the next implementation in the MRO.
        """
        return super().generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements,
                                        egressDMAStatements, closeLoopStatements)
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,105 @@ def generateAllTilingCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingM
return executionBlock


class PerfCounterProfilingMixIn(ABC):
    """MixIn for injecting performance-counter profiling code into an execution block.

    The templates below emit C code that calls the ``perf_bench_*`` helpers and
    uses ``perf_stats_t`` snapshots named after the node. Every measurement is
    guarded by ``pi_core_id() == 0`` so only core 0 drives the counters and
    prints the result.
    """

    # Declares the node-level start/end/total snapshots and, on core 0,
    # initialises and starts the counters and records the start snapshot.
    _perfCounterInit = NodeTemplate("""
perf_stats_t ${nodeName}_perf_start, ${nodeName}_perf_end, ${nodeName}_perf_total;
if (pi_core_id() == 0) {
perf_bench_init();
perf_bench_start();
perf_bench_read(&${nodeName}_perf_start);
}
""")

    # On core 0: stops the counters, records the end snapshot, computes the
    # difference into the total snapshot and prints it under the node name.
    _perfCounterStop = NodeTemplate("""
if (pi_core_id() == 0) {
perf_bench_stop();
perf_bench_read(&${nodeName}_perf_end);
perf_bench_diff(&${nodeName}_perf_total, &${nodeName}_perf_end, &${nodeName}_perf_start);
perf_bench_print("${nodeName}", &${nodeName}_perf_total);
}
""")

    # Kernel-level start; relies on the variables declared by _perfCounterKernelDecl.
    _perfCounterKernelStart = NodeTemplate("""
if (pi_core_id() == 0) {
perf_bench_start();
perf_bench_read(&${nodeName}_perf_kernel_start);
}
""")

    # Kernel-level stop, diff and print; relies on _perfCounterKernelDecl as well.
    _perfCounterKernelEnd = NodeTemplate("""
if (pi_core_id() == 0) {
perf_bench_stop();
perf_bench_read(&${nodeName}_perf_kernel_end);
perf_bench_diff(&${nodeName}_perf_kernel_total, &${nodeName}_perf_kernel_end, &${nodeName}_perf_kernel_start);
perf_bench_print("${nodeName} Kernel", &${nodeName}_perf_kernel_total);
}
""")

    # Declaration of the kernel-level snapshots used by the two templates above.
    _perfCounterKernelDecl = NodeTemplate("""
perf_stats_t ${nodeName}_perf_kernel_start, ${nodeName}_perf_kernel_end, ${nodeName}_perf_kernel_total;
""")

    @classmethod
    def injectPerfCounterInit(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
        """Inject the counter declaration/init/start snippet on the left of the block.

        Intended to be called in the setup phase.

        Args:
            executionBlock: Block that receives the snippet (modified in place).
            metaInfo: Tiling meta information; only ``nodeName`` is read.

        Returns:
            The same ``executionBlock`` object.
        """
        executionBlock.addLeft(cls._perfCounterInit, {
            "nodeName": metaInfo.nodeName,
        })

        return executionBlock

    @classmethod
    def injectPerfCounterStop(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
        """Inject the counter stop/diff/print snippet on the right of the block.

        Intended to be called in the teardown phase.

        Args:
            executionBlock: Block that receives the snippet (modified in place).
            metaInfo: Tiling meta information; only ``nodeName`` is read.

        Returns:
            The same ``executionBlock`` object.
        """
        executionBlock.addRight(cls._perfCounterStop, {
            "nodeName": metaInfo.nodeName,
        })

        return executionBlock

    @classmethod
    def injectPerfCounterKernelWrap(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
        """Wrap the block with kernel-level performance-counter measurements.

        Nothing is injected unless ``metaInfo.kernelLevelTiling`` is truthy.

        Args:
            executionBlock: Block that receives the snippets (modified in place).
            metaInfo: Tiling meta information; ``nodeName`` and
                ``kernelLevelTiling`` are read.

        Returns:
            The same ``executionBlock`` object.
        """
        nodeName = metaInfo.nodeName

        if metaInfo.kernelLevelTiling:
            # NOTE(review): assumes ExecutionBlock.addLeft PREPENDS, i.e. the
            # most recent addLeft ends up first in the generated code - confirm
            # against DeeployTypes. Under that assumption the start snippet
            # must be pushed before the declaration so the emitted order is
            # declaration -> start -> <kernel> -> end, and the perf_stats_t
            # variables are declared before the start snippet takes their address.
            executionBlock.addLeft(cls._perfCounterKernelStart, {
                "nodeName": nodeName,
            })

            # Pushed last on the left so the declaration precedes every use.
            executionBlock.addLeft(cls._perfCounterKernelDecl, {
                "nodeName": nodeName,
            })

            # Stop, diff and print after the kernel.
            executionBlock.addRight(cls._perfCounterKernelEnd, {
                "nodeName": nodeName,
            })

        return executionBlock


class ProfilingPrototypeMixIn(ABC):
_measureCycles = NodeTemplate("""
${measurements}[${tileIdxVar}] = getCycles();
Expand Down
Loading