diff --git a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py index aba6740d49..22cbd76a55 100644 --- a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py +++ b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py @@ -245,8 +245,29 @@ def _NCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_f if node.op in ["Conv", "RequantizedConv"]: # In the case of Conv: [weights, opt. bias], RequantizedConv: [weights, mul, add, opt. shift] for tensor in node.inputs[1:]: - _transformLayoutConst(tensor, spatialDims, default_channels_first) + # Standard case: The weight is a direct constant input. + if isinstance(tensor, gs.Constant): + const_to_transform = tensor + # MeZO case: The weight is produced by a Perturb node. + elif isinstance(tensor, gs.Variable): + producer_node = None + for n in graph.nodes: + if tensor in n.outputs: + producer_node = n + break + + if producer_node and producer_node.op in ["PerturbNormal", "PerturbUniform"]: + # Find the original constant that feeds the Perturb node. + const_to_transform = producer_node.inputs[0] + + # If we found a constant, transpose it. The Perturb node will inherit the new layout. 
+ if const_to_transform and isinstance(const_to_transform, gs.Constant): + # Only apply layout transformation to multi-dimensional tensors (i.e., weights) + if len(const_to_transform.shape) > 1: + _transformLayoutConst(const_to_transform, spatialDims, default_channels_first) + if producer_node: + tensor.shape = tuple(const_to_transform.shape) node.attrs["channels_first"] = default_channels_first return graph diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index d9d768fabc..2c6e0edc39 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -2884,10 +2884,11 @@ def generateIOBufferInitializationCode(self) -> str: callStack += "static const uint32_t " + self.ctxt._mangle("num_inputs") + f" = {len(inputs)};" callStack += "static const uint32_t " + self.ctxt._mangle("num_outputs") + f" = {len(outputs)};" - + callStack += "static const uint32_t seed = 12345;" # fixed seed for reproducibility + callStack += "static const uint32_t perturbation_sign = 1;" # fixed sign for reproducibility callStack += "extern void* " + self.ctxt._mangle("inputs") + f"[{len(inputs)}];" callStack += "extern void* " + self.ctxt._mangle("outputs") + f"[{len(outputs)}];" - + callStack += "static const uint32_t " + self.ctxt._mangle("inputs_bytes") + f"[{len(inputs)}] = " + "{" numBytes = [] diff --git a/Deeploy/Targets/GAP9/Bindings.py b/Deeploy/Targets/GAP9/Bindings.py index 0e7b052f46..22c87cdeb0 100644 --- a/Deeploy/Targets/GAP9/Bindings.py +++ b/Deeploy/Targets/GAP9/Bindings.py @@ -328,6 +328,9 @@ GAP9ConcatBindings = [ NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), ConcatTemplate.referenceTemplate, GAP9ClusterTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + ConcatTemplate.referenceTemplate, GAP9ClusterTransformer) ] GAP9iRMSNormBindings = [ diff --git a/Deeploy/Targets/Generic/Layers.py 
b/Deeploy/Targets/Generic/Layers.py index cc733937cc..54b5cbaa67 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -709,3 +709,40 @@ def computeOps(self): numPx = opRep['dim_im_out_x'] return numPx * opsPerPx + + +class PerturbNormalLayer(ONNXLayer): + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + return self.mapper.parser.operatorRepresentation['size'] + +class PerturbUniformLayer(ONNXLayer): + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + return self.mapper.parser.operatorRepresentation['size'] + +class PerturbEggrollLayer(ONNXLayer): + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + return self.mapper.parser.operatorRepresentation['size'] + +class PerturbRademacherLayer(ONNXLayer): + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + return self.mapper.parser.operatorRepresentation['size'] + +class PerturbTriangleLayer(ONNXLayer): + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + return self.mapper.parser.operatorRepresentation['size'] + diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index cf1ba776bd..dda7122ace 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -1291,7 +1291,10 @@ def parseNode(self, node: gs.Node) -> (bool): if ret: if 'kernel_shape' not in node.attrs: - node.attrs['kernel_shape'] = node.inputs[1].shape[-2:] + if self.operatorRepresentation['channels_first']: + node.attrs['kernel_shape'] = node.inputs[1].shape[-2:] + else: + node.attrs['kernel_shape'] = node.inputs[1].shape[1:3] self.operatorRepresentation['kernel_shape'] = node.attrs['kernel_shape'] self.operatorRepresentation['dim_kernel_x'] = int(self.operatorRepresentation['kernel_shape'][0]) 
self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1]) @@ -2882,3 +2885,168 @@ def parseNodeCtxt(self, self.operatorRepresentation['size'] = int(np.prod(data_in.shape)) return ctxt, True + +class PerturbNormalParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + ret = all([len(node.inputs) == 1, + len(node.outputs) == 1, + 'seed' in node.attrs, + 'eps' in node.attrs]) + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + input_shape = data_in.shape + if isinstance(data_in.shape, int): + input_shape = tuple(input_shape, ) + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['seed'] = node.attrs['seed'] + self.operatorRepresentation['size'] = np.prod(input_shape) + self.operatorRepresentation['nodeIdx'] = node.attrs['idx'] + self.operatorRepresentation['eps'] = node.attrs['eps'] + return ctxt, True + +class PerturbUniformParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + ret = all([len(node.inputs) == 1, + len(node.outputs) == 1, + 'low' in node.attrs, + 'high' in node.attrs, + 'seed' in node.attrs, + 'eps' in node.attrs]) + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + input_shape = data_in.shape + if isinstance(data_in.shape, int): + input_shape = tuple(input_shape, ) + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['seed'] = 
node.attrs['seed'] + self.operatorRepresentation['size'] = np.prod(input_shape) + self.operatorRepresentation['nodeIdx'] = node.attrs['idx'] + self.operatorRepresentation['eps'] = node.attrs['eps'] + self.operatorRepresentation['low'] = float(node.attrs['low']) + self.operatorRepresentation['high'] = float(node.attrs['high']) + + return ctxt, True + +class PerturbEggrollParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + ret = all([len(node.inputs) == 1, + len(node.outputs) == 1, + 'seed' in node.attrs, + 'eps' in node.attrs]) + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + input_shape = data_in.shape + if isinstance(data_in.shape, int): + input_shape = tuple(input_shape, ) + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['seed'] = node.attrs['seed'] + self.operatorRepresentation['sizeA'] = input_shape[0] + self.operatorRepresentation['sizeB'] = np.prod(input_shape[1:]) + self.operatorRepresentation['nodeIdx'] = node.attrs['idx'] + self.operatorRepresentation['eps'] = node.attrs['eps'] + return ctxt, True + +class PerturbRademacherParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + ret = all([len(node.inputs) == 1, + len(node.outputs) == 1, + 'seed' in node.attrs, + 'eps' in node.attrs]) + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + input_shape = data_in.shape + if isinstance(data_in.shape, int): + input_shape = tuple(input_shape, ) + 
self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['seed'] = node.attrs['seed'] + self.operatorRepresentation['size'] = np.prod(input_shape) + self.operatorRepresentation['nodeIdx'] = node.attrs['idx'] + self.operatorRepresentation['eps'] = node.attrs['eps'] + return ctxt, True + + +class PerturbTriangleParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + ret = all([len(node.inputs) == 1, + len(node.outputs) == 1, + 'seed' in node.attrs, + 'eps' in node.attrs]) + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + input_shape = data_in.shape + if isinstance(data_in.shape, int): + input_shape = tuple(input_shape, ) + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['seed'] = node.attrs['seed'] + self.operatorRepresentation['size'] = np.prod(input_shape) + self.operatorRepresentation['nodeIdx'] = node.attrs['idx'] + self.operatorRepresentation['eps'] = node.attrs['eps'] + self.operatorRepresentation['low'] = float(node.attrs['low']) + self.operatorRepresentation['high'] = float(node.attrs['high']) + + return ctxt, True \ No newline at end of file diff --git a/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py index 091cb55a41..525d8093bf 100644 --- a/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py @@ -35,8 +35,11 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw tilerModel.addTensorDimToModel(ctxt, tensorName) - for idx, shapeDim in 
enumerate(_buffer.shape): - tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = idx) == shapeDim) + if isinstance(_buffer.shape, int): + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = 0) == _buffer.shape) + else: + for idx, shapeDim in enumerate(_buffer.shape): + tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = idx) == shapeDim) return tilerModel diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py index c2c8d436f8..6d14a37c2b 100644 --- a/Deeploy/Targets/Generic/TypeCheckers.py +++ b/Deeploy/Targets/Generic/TypeCheckers.py @@ -610,3 +610,17 @@ def _inferNumLevels(self, inputs: List[VariableBuffer], def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: return [True] + +class PerturbZOChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [inputs[0].nLevels] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + return [True] + \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index e1a9ed5932..185117fc86 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -19,7 +19,7 @@ from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \ GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \ QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \ - SliceChecker, SoftmaxChecker, 
SoftmaxCrossEntropyLossChecker, TransposeChecker + SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker, PerturbZOChecker from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling @@ -32,7 +32,8 @@ FloatMulTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, \ MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, \ RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, \ - TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate + TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate, FloatPerturbNormalTemplate, \ + FloatPerturbUniformTemplate, FloatPerturbEggrollTemplate, FloatPerturbRademacherTemplate, FloatPerturbTriangleTemplate from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \ PULPRequantShiftChecker from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \ @@ -368,6 +369,9 @@ PULPConcatBindings = [ NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), ConcatTemplate.referenceTemplate, ClusterTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ConcatChecker([PointerClass(float_type), PointerClass(float_type)], [PointerClass(float_type)]), + ConcatTemplate.referenceTemplate, ClusterTransformer) for float_type in FloatDataTypes ] PULPiRMSNormBindings = [ @@ -448,3 +452,33 @@ NodeBinding(DequantChecker([PointerClass(int32_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate, ForkTransformer), ] + +PULPPerturbNormalBindings = [ + NodeBinding( + 
PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatPerturbNormalTemplate.referenceTemplate, + ForkTransformer)] + +PULPPerturbUniformBindings = [ + NodeBinding( + PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatPerturbUniformTemplate.referenceTemplate, + ForkTransformer)] + +PULPPerturbEggrollBindings = [ + NodeBinding( + PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatPerturbEggrollTemplate.referenceTemplate, + ForkTransformer)] + +PULPPerturbRademacherBindings = [ + NodeBinding( + PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatPerturbRademacherTemplate.referenceTemplate, + ForkTransformer)] + +PULPPerturbTriangleBindings = [ + NodeBinding( + PerturbZOChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatPerturbTriangleTemplate.referenceTemplate, + ForkTransformer)] diff --git a/Deeploy/Targets/PULPOpen/Parsers.py b/Deeploy/Targets/PULPOpen/Parsers.py index 5c5951eaba..99e45cefb3 100644 --- a/Deeploy/Targets/PULPOpen/Parsers.py +++ b/Deeploy/Targets/PULPOpen/Parsers.py @@ -75,10 +75,10 @@ def parseNode(self, node: gs.Node) -> (bool): # Current PULP kernel only supports grouping of 1 self.operatorRepresentation['group'] == 1, - # Make sure padding is square - self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], - self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], - self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], + # Make sure padding is symmetric (left==right, top==bottom) + # but top/bottom can differ from left/right + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], # top == bottom + self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], # left == right # Check number of inputs # 2 inputs if no bias, 3 if layer has bias @@ -133,10 +133,10 @@ def parseNode(self, node: gs.Node) 
-> (bool): if wellFormed: # Check if the node is a depthwise convolution ret = all([ - # Make sure padding is square - self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], - self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], - self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], + # Make sure padding is symmetric (left==right, top==bottom) + # but top/bottom can differ from left/right + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], # top == bottom + self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], # left == right # Check number of inputs # 2 inputs if no bias, 3 if layer has bias diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py index d45dc00f9c..4b6a8d6a3d 100644 --- a/Deeploy/Targets/PULPOpen/Platform.py +++ b/Deeploy/Targets/PULPOpen/Platform.py @@ -17,13 +17,15 @@ GEMMLayer, LayerNormGradLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, \ ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, \ RQSiHardswishLayer, SGDLayer, SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, \ - SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer + SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer, PerturbNormalLayer, \ + PerturbUniformLayer, PerturbEggrollLayer, PerturbRademacherLayer, PerturbTriangleLayer from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DequantParser, FlattenParser, GatherParser, \ GELUGradParser, GELUParser, GEMMParser, LayerNormGradParser, LayerNormParser, MatMulParser, MaxPool2DParser, \ MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, \ RQAddParser, RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, 
SGDParser, SliceParser, \ SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, SoftmaxParser, \ - TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser + TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser, \ + PerturbNormalParser, PerturbUniformParser, PerturbEggrollParser, PerturbRademacherParser, PerturbTriangleParser from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, IntegerDivRequantMergePass, \ MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, QuantPatternPass, RQSSplitPass, \ @@ -47,7 +49,9 @@ PULPRQSTallGEMMTilingReadyBindings, PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, \ PULPSliceTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \ PULPSoftmaxCrossEntropyTilingReadyBindings, PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, \ - PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings + PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings, \ + PULPPerturbNormalTilingReadyBindings, PULPPerturbUniformTilingReadyBindings, \ + PULPPerturbEggrollTilingReadyBindings, PULPPerturbRademacherTilingReadyBindings, PULPPerturbTriangleTilingReadyBindings from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \ PULPConvRequantMergePass, PULPGEMMRequantMergePass, PULPMatMulRequantMergePass @@ -90,6 +94,13 @@ SoftmaxMapper = NodeMapper(SoftmaxParser(), PULPSoftmaxTilingReadyBindings) SoftmaxGradMapper = NodeMapper(SoftmaxGradParser(), PULPSoftmaxGradTilingReadyBindings) Softmax_int8_Mapper = NodeMapper(iSoftmaxParser(), PULPSoftmaxTilingReadyBindings) +PerturbNormalMapper = NodeMapper(PerturbNormalParser(), PULPPerturbNormalTilingReadyBindings) +PerturbUniformMapper = 
NodeMapper(PerturbUniformParser(), PULPPerturbUniformTilingReadyBindings) +PerturbEggrollMapper = NodeMapper(PerturbEggrollParser(), PULPPerturbEggrollTilingReadyBindings) +PerturbRademacherMapper = NodeMapper(PerturbRademacherParser(), PULPPerturbRademacherTilingReadyBindings) +PerturbTriangleMapper = NodeMapper(PerturbTriangleParser(), PULPPerturbTriangleTilingReadyBindings) + + ConcatMapper = NodeMapper(ConcatParser(), PULPConcatTilingReadyBindings) @@ -148,7 +159,12 @@ 'SoftmaxGrad': SoftmaxGradLayer([SoftmaxGradMapper]), 'SoftmaxCrossEntropyLoss': SoftmaxCrossEntropyLossLayer([SoftmaxCrossEntropyLossMapper]), 'SoftmaxCrossEntropyLossGrad': SoftmaxCrossEntropyLossGradLayer([SoftmaxCrossEntropyLossGradMapper]), - 'SGD': SGDLayer([SGDMapper]) + 'SGD': SGDLayer([SGDMapper]), + 'PerturbNormal': PerturbNormalLayer([PerturbNormalMapper]), + 'PerturbUniform': PerturbUniformLayer([PerturbUniformMapper]), + 'PerturbEggroll': PerturbEggrollLayer([PerturbEggrollMapper]), + 'PerturbRademacher': PerturbRademacherLayer([PerturbRademacherMapper]), + 'PerturbTriangle': PerturbTriangleLayer([PerturbTriangleMapper]), } diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatPerturbEggrollTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbEggrollTemplate.py new file mode 100644 index 0000000000..5d96f83725 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbEggrollTemplate.py @@ -0,0 +1,27 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _FloatPerturbEggrollTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Add the node's unique ID to help create a unique 
seed_${nodeName}. + operatorRepresentation['node_id'] = operatorRepresentation['nodeIdx'] + return ctxt, operatorRepresentation, [] + + +# TODO: No loop unrolling optimization yet +referenceTemplate = _FloatPerturbEggrollTemplate(""" +// Perturb Eggroll (Name: ${nodeName}, Op: ${nodeOp}) +); +""" +) \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatPerturbNormalTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbNormalTemplate.py new file mode 100644 index 0000000000..5337545c5b --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbNormalTemplate.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _FloatPerturbNormalTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Add the node's unique ID to help create a unique seed_${nodeName}. 
+ operatorRepresentation['node_id'] = operatorRepresentation['nodeIdx'] + return ctxt, operatorRepresentation, [] + + +# TODO: No loop unrolling optimization yet +referenceTemplate = _FloatPerturbNormalTemplate(""" +// PerturbNormal (Name: ${nodeName}, Op: ${nodeOp}) +uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id(); +uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES); +uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size}); +uint32_t ${nodeName}_local_size = ${nodeName}_chunk_stop - ${nodeName}_chunk_start; + +uint32_t chunk_seed = seed + (${nodeName}_chunk_start * ${node_id}) + (${node_id} * 104729); + +// pick large enough stride to minimize correlation between nodes. +ApplyGaussianPerturbation( + (const float32_t *) &${data_in}[${nodeName}_chunk_start], + (float32_t *) &${data_out}[${nodeName}_chunk_start], + chunk_seed, + ${eps}f, + perturbation_sign, // globally defined in DeedeployTest main + ${nodeName}_local_size +); +""" +) \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatPerturbRademacherTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbRademacherTemplate.py new file mode 100644 index 0000000000..5c8a41a312 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbRademacherTemplate.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _FloatPerturbRademacherTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + 
operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Add the node's unique ID to help create a unique seed. + operatorRepresentation['node_id'] = operatorRepresentation['nodeIdx'] + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _FloatPerturbRademacherTemplate(""" +// PerturbRademacher (Name: ${nodeName}, Op: ${nodeOp}) +uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id(); +uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES); +uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size}); +uint32_t ${nodeName}_local_size = ${nodeName}_chunk_stop - ${nodeName}_chunk_start; + +uint32_t i = ${nodeName}_chunk_start; +for (; i < ${nodeName}_chunk_stop; i++) { + // pick large enough stride to minimize correlation between nodes. 
+ uint32_t chunk_seed = seed + i*${nodeName}_chunk_start + (${node_id} * 104729); + ApplyRademacherPerturbation((const float32_t *) &${data_in}[i], + (float32_t *) &${data_out}[i], + chunk_seed, + ${eps}f, + perturbation_sign, // globally defined in DeedeployTest main + ${nodeName}_local_size); +} +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatPerturbTriangleTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbTriangleTemplate.py new file mode 100644 index 0000000000..b810601186 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbTriangleTemplate.py @@ -0,0 +1,55 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _FloatPerturbTriangleTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Add the node's unique ID to help create a unique seed_${nodeName}. 
+ operatorRepresentation['node_id'] = operatorRepresentation['nodeIdx'] + return ctxt, operatorRepresentation, [] + + +# TODO: No loop unrolling optimization yet +referenceTemplate = _FloatPerturbTriangleTemplate(""" +// PerturbTriangle (Name: ${nodeName}, Op: ${nodeOp}) +uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id(); +uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES); +uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size}); +uint32_t ${nodeName}_local_size = ${nodeName}_chunk_stop - ${nodeName}_chunk_start; + +uint32_t i = ${nodeName}_chunk_start; +for (; i < ${nodeName}_chunk_stop; i++) { + // pick large enough stride to minimize correlation between nodes. + uint32_t chunk_seed = seed + i*${nodeName}_chunk_start + (${node_id} * 104729); + ApplyTrianglePerturbation((const float32_t *) &${data_in}[i], + (float32_t *) &${data_out}[i], + chunk_seed, + ${eps}f, + perturbation_sign, // globally defined in DeedeployTest main + ${nodeName}_local_size); +} +""") + +updateTemplate = _FloatPerturbTriangleTemplate(""" +// UpdateTriangle (Name: ${nodeName}, Op: ${nodeOp}) +BEGIN_SINGLE_CORE + UpdateWeightsTriangle((float32_t *)${data_in}, + loss, + seed + ${node_id}, + ${eps}f, + lr, // globally defined + ${size}); +END_SINGLE_CORE +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatPerturbUniformTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbUniformTemplate.py new file mode 100644 index 0000000000..28df975065 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatPerturbUniformTemplate.py @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from 
Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _FloatPerturbUniformTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Add the node's unique ID to help create a unique seed. + operatorRepresentation['node_id'] = operatorRepresentation['nodeIdx'] + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _FloatPerturbUniformTemplate(""" +// PerturbUniform (Name: ${nodeName}, Op: ${nodeOp}) +uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id(); +uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES); +uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size}); +uint32_t ${nodeName}_local_size = ${nodeName}_chunk_stop - ${nodeName}_chunk_start; + +// pick large enough stride to minimize correlation between nodes. 
+uint32_t chunk_seed = seed + (${nodeName}_chunk_start * ${node_id}) + (${node_id} * 104729); +ApplyUniformPerturbation((const float32_t *) &${data_in}[${nodeName}_chunk_start], + (float32_t *) &${data_out}[${nodeName}_chunk_start], + chunk_seed, + ${eps}f, + perturbation_sign, // globally defined in DeedeployTest main + ${nodeName}_local_size); +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Tiler.py b/Deeploy/Targets/PULPOpen/Tiler.py index 3d7d11f343..8a3cda2ee8 100644 --- a/Deeploy/Targets/PULPOpen/Tiler.py +++ b/Deeploy/Targets/PULPOpen/Tiler.py @@ -22,7 +22,8 @@ PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, PULPRQSiHardswishBindings, \ PULPRQSMatrixVecBindings, PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSliceBindings, PULPSoftmaxBindings, \ PULPSoftmaxCrossEntropyLossBindings, PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, \ - PULPTransposeBindings, PULPUniformRQSBindings + PULPTransposeBindings, PULPUniformRQSBindings, PULPPerturbNormalBindings, PULPPerturbUniformBindings, \ + PULPPerturbEggrollBindings, PULPPerturbRademacherBindings, PULPPerturbTriangleBindings from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv2DTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \ RQDWConv2DTileConstraint @@ -153,3 +154,18 @@ PULPReduceMeanTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPReduceMeanBindings, tileConstraint = ReduceMeanTileConstraint()) + +PULPPerturbNormalTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPerturbNormalBindings, + tileConstraint = UnaryTileConstraint()) + +PULPPerturbUniformTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPerturbUniformBindings, + tileConstraint = UnaryTileConstraint()) + +PULPPerturbEggrollTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPerturbEggrollBindings, + tileConstraint = 
UnaryTileConstraint()) + +PULPPerturbRademacherTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPerturbRademacherBindings, + tileConstraint = UnaryTileConstraint()) + +PULPPerturbTriangleTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPerturbTriangleBindings, + tileConstraint = UnaryTileConstraint()) diff --git a/Deeploy/TilingExtension/TileConstraint.py b/Deeploy/TilingExtension/TileConstraint.py index 5b067b2ce9..1b92df1752 100644 --- a/Deeploy/TilingExtension/TileConstraint.py +++ b/Deeploy/TilingExtension/TileConstraint.py @@ -146,6 +146,8 @@ def getCubeTransfers(tensorConstraint: TensorMemoryConstraint, sourceCubes: List targetIdx = 1 fullShape = ctxt.lookup(outVar).shape + if isinstance(fullShape, int): + fullShape = (fullShape,) initialOffset = (0,) * len(fullShape) outputCubes = [ AbsoluteHyperRectangle(rectangle = HyperRectangle(offset = initialOffset, dims = tuple(fullShape)), diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/inputs.npz new file mode 100644 index 0000000000..b58ac20c7b Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/network.onnx b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/network.onnx new file mode 100644 index 0000000000..38798357d4 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/outputs.npz new file mode 100644 index 0000000000..5284177d8e Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbEggrol/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/inputs.npz new file mode 
100644 index 0000000000..847536024f Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/network.onnx b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/network.onnx new file mode 100644 index 0000000000..5dcf8bae70 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/outputs.npz new file mode 100644 index 0000000000..a780bf64e6 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbNormal/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/inputs.npz new file mode 100644 index 0000000000..4a1e9c269c Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/network.onnx b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/network.onnx new file mode 100644 index 0000000000..52f0ccfd9c Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/outputs.npz new file mode 100644 index 0000000000..8dc3b1d0cf Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbRademacher/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/inputs.npz new file mode 100644 index 0000000000..d77ca34b35 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/inputs.npz differ diff 
--git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/network.onnx b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/network.onnx new file mode 100644 index 0000000000..23990e812d Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/outputs.npz new file mode 100644 index 0000000000..9113de38f2 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbTriangle/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/inputs.npz new file mode 100644 index 0000000000..a7e5c1cfa0 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/network.onnx b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/network.onnx new file mode 100644 index 0000000000..42c66ac0c8 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/outputs.npz new file mode 100644 index 0000000000..56be194e61 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ZOPerturb/PerturbUniform/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Lite-CNN-ZO/inputs.npz b/DeeployTest/Tests/Models/Lite-CNN-ZO/inputs.npz new file mode 100644 index 0000000000..8dcd54a7dd Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN-ZO/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Lite-CNN-ZO/network.onnx b/DeeployTest/Tests/Models/Lite-CNN-ZO/network.onnx new file mode 100644 index 0000000000..26bedabcb4 Binary files /dev/null and 
b/DeeployTest/Tests/Models/Lite-CNN-ZO/network.onnx differ diff --git a/DeeployTest/Tests/Models/Lite-CNN-ZO/outputs.npz b/DeeployTest/Tests/Models/Lite-CNN-ZO/outputs.npz new file mode 100644 index 0000000000..e768b0ce4d Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN-ZO/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Lite-CNN/inputs.npz b/DeeployTest/Tests/Models/Lite-CNN/inputs.npz new file mode 100644 index 0000000000..8dcd54a7dd Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Lite-CNN/network.onnx b/DeeployTest/Tests/Models/Lite-CNN/network.onnx new file mode 100644 index 0000000000..2a39932575 Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN/network.onnx differ diff --git a/DeeployTest/Tests/Models/Lite-CNN/outputs.npz b/DeeployTest/Tests/Models/Lite-CNN/outputs.npz new file mode 100644 index 0000000000..e768b0ce4d Binary files /dev/null and b/DeeployTest/Tests/Models/Lite-CNN/outputs.npz differ diff --git a/DeeployTest/Tests/Models/SleepConVit-ZO/inputs.npz b/DeeployTest/Tests/Models/SleepConVit-ZO/inputs.npz new file mode 100644 index 0000000000..d55dda479f Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit-ZO/inputs.npz differ diff --git a/DeeployTest/Tests/Models/SleepConVit-ZO/network.onnx b/DeeployTest/Tests/Models/SleepConVit-ZO/network.onnx new file mode 100644 index 0000000000..c5aefc7f47 Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit-ZO/network.onnx differ diff --git a/DeeployTest/Tests/Models/SleepConVit-ZO/outputs.npz b/DeeployTest/Tests/Models/SleepConVit-ZO/outputs.npz new file mode 100644 index 0000000000..7b64cc07d8 Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit-ZO/outputs.npz differ diff --git a/DeeployTest/Tests/Models/SleepConVit/inputs.npz b/DeeployTest/Tests/Models/SleepConVit/inputs.npz new file mode 100644 index 0000000000..ee174fcab4 Binary files /dev/null and 
b/DeeployTest/Tests/Models/SleepConVit/inputs.npz differ diff --git a/DeeployTest/Tests/Models/SleepConVit/network.onnx b/DeeployTest/Tests/Models/SleepConVit/network.onnx new file mode 100644 index 0000000000..c51390febe Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit/network.onnx differ diff --git a/DeeployTest/Tests/Models/SleepConVit/outputs.npz b/DeeployTest/Tests/Models/SleepConVit/outputs.npz new file mode 100644 index 0000000000..8babb4ed7a Binary files /dev/null and b/DeeployTest/Tests/Models/SleepConVit/outputs.npz differ diff --git a/DeeployTest/Tests/Models/TempConv/inputs.npz b/DeeployTest/Tests/Models/TempConv/inputs.npz new file mode 100644 index 0000000000..9c0899ec46 Binary files /dev/null and b/DeeployTest/Tests/Models/TempConv/inputs.npz differ diff --git a/DeeployTest/Tests/Models/TempConv/network.onnx b/DeeployTest/Tests/Models/TempConv/network.onnx new file mode 100644 index 0000000000..0b214d6ee5 Binary files /dev/null and b/DeeployTest/Tests/Models/TempConv/network.onnx differ diff --git a/DeeployTest/Tests/Models/TempConv/outputs.npz b/DeeployTest/Tests/Models/TempConv/outputs.npz new file mode 100644 index 0000000000..d1e163374c Binary files /dev/null and b/DeeployTest/Tests/Models/TempConv/outputs.npz differ diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py index 01216984af..cbaeda7cae 100644 --- a/DeeployTest/testMVP.py +++ b/DeeployTest/testMVP.py @@ -212,6 +212,8 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg help = "Number of cores on which the network is run. Currently, required for im2col buffer sizing on Siracusa. Default: 1." ) + parser.add_argument('--run_mode', type = str, default = 'inference', + help = 'Run mode of the network. 
Options are: inference, mezo_training.') parser.set_defaults(shouldFail = False) args = parser.parse_args() diff --git a/DeeployTest/testRunner_tiled_siracusa_mezo.py b/DeeployTest/testRunner_tiled_siracusa_mezo.py new file mode 100644 index 0000000000..9b85b7f491 --- /dev/null +++ b/DeeployTest/testRunner_tiled_siracusa_mezo.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from testUtils.testRunner import TestRunner, TestRunnerArgumentParser + +if __name__ == "__main__": + + parser = TestRunnerArgumentParser( + tiling_arguments = True, description = "Deeploy Code Generation Utility for the Siracusa Platform (Tiling).") + + parser.add_argument('--cores', + metavar = '', + dest = 'cores', + type = int, + default = 8, + help = 'Set number of cluster cores') + args = parser.parse_args() + + testRunner = TestRunner(platform = "Siracusa", simulator = "gvsoc", tiling = True, argument_parser = parser, gen_args = "--run_mode mezo_training") + + testRunner.cmake_args += f" -D NUM_CORES={args.cores}" + + testRunner.run() diff --git a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h index f6e8308c97..95de0c7a5b 100644 --- a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h +++ b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h @@ -34,6 +34,7 @@ #include "kernel/UniformRequantShift.h" #include "kernel/gemv.h" #include "kernel/iRMSnorm.h" +#include "kernel/RandomNoise.h" #define LOG2(x) (__builtin_pulp_fl1(x)) diff --git a/TargetLibraries/PULPOpen/inc/kernel/RandomNoise.h b/TargetLibraries/PULPOpen/inc/kernel/RandomNoise.h new file mode 100644 index 0000000000..4d88a2d897 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/kernel/RandomNoise.h @@ -0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_RANDOMNOISE_KERNEL_HEADER_ +#define 
__DEEPLOY_MATH_RANDOMNOISE_KERNEL_HEADER_ + +#include "DeeployPULPMath.h" + + +#define PI_F 3.14159265358979323846f + + +typedef struct { + uint32_t state; + uint32_t bits; + int bitpos; +} RademacherRNG; + +// Sample from uniform distribution U[-0.5,0.5] +float32_t UniformSample(uint32_t *state); +// Sample from triangular distribution Tr[-1, 1] +float32_t TriangularSample(uint32_t *state); +float32_t GaussianSample(uint32_t *state); +float32_t RademacherSample(RademacherRNG *rng); + +void RademacherRNG_init(RademacherRNG *rng, uint32_t seed); + +// Applies triangular perturbation to the weights and applies rescaling to match Gaussian(0, 1) l2 norm. +void ApplyTriangularPerturbation(const float32_t *__restrict__ pweights, + float32_t *__restrict__ pweights_dest, + uint32_t seed, + float32_t epsilon, + uint32_t dir, + uint32_t size); + +// Applies uniform perturbation to the weights and applies rescaling to match Gaussian(0, 1) l2 norm. +void ApplyUniformPerturbation(const float32_t *__restrict__ pweights, + float32_t *__restrict__ pweights_dest, + uint32_t seed, + float32_t epsilon, + uint32_t dir, + uint32_t size); + +// Applies Gaussian perturbation to the weights; no rescaling is applied (the samples already have unit variance). +void ApplyGaussianPerturbation(const float32_t *__restrict__ pweights, + float32_t *__restrict__ pweights_dest, + uint32_t seed, + float32_t epsilon, + uint32_t dir, + uint32_t size); + +// Applies Rademacher (+1/-1) perturbation to the weights; no rescaling is applied (the samples already have unit variance). +void ApplyRademacherPerturbation(const float32_t *__restrict__ pweights, + float32_t *__restrict__ pweights_dest, + uint32_t seed, + float32_t epsilon, + uint32_t dir, + uint32_t size); + + +// Updates the weights in place according to the MeZO update rule with triangular noise. +// Only supports qMeZO with q = 1 for now. 
+void UpdateWeightsTriangle(float32_t *__restrict__ pweights, + float32_t loss, + uint32_t seed, + float32_t epsilon, + float32_t lr, + uint32_t size); + +// Updates the weights in place according to the MeZO update rule with uniform noise. +// Only supports qMeZO with q = 1 for now. +void UpdateWeightsUniform(float32_t *__restrict__ pweights, + float32_t loss, + uint32_t seed, + float32_t epsilon, + float32_t lr, + uint32_t size); + +void UpdateWeightsGaussian(float32_t *__restrict__ pweights, + float32_t loss, + uint32_t seed, + float32_t epsilon, + float32_t lr, + uint32_t size); + +void UpdateWeightsRademacher(float32_t *__restrict__ pweights, + float32_t loss, + uint32_t seed, + float32_t epsilon, + float32_t lr, + uint32_t size); + +/* Xorshift32 implementation. Most basic software PRNG*/ +uint32_t Xorshift32(uint32_t state); + +#endif //__DEEPLOY_MATH_RANDOMNOISE_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/src/Gemm.c b/TargetLibraries/PULPOpen/src/Gemm.c index a46f8ac6ae..02fd991674 100644 --- a/TargetLibraries/PULPOpen/src/Gemm.c +++ b/TargetLibraries/PULPOpen/src/Gemm.c @@ -6,6 +6,7 @@ #include "DeeployPULPMath.h" #include "pmsis.h" +// #include "perf_utils.h" void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, @@ -17,6 +18,16 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, int8_t core_id = pi_core_id(); int8_t log2Core = LOG2(NUM_CORES); + //RW: Performance monitoring is currently disabled + // perf_stats_t perf_start, perf_end, perf_total; + + // // Initialize and start performance counters (only core 0) + // if (core_id == 0) { + // perf_bench_init(); + // perf_bench_start(); + // perf_bench_read(&perf_start); + // } + uint32_t M_chunk = (M >> log2Core) + ((M & (NUM_CORES - 1)) != 0); uint32_t M_start = MIN(core_id * M_chunk, M); uint32_t M_end = MIN(M_start + M_chunk, M); @@ -351,4 +362,16 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const 
float32_t *__restrict__ pSrcA, } } } + + // RW: Stop performance counters and print results (only core 0) + // if (core_id == 0) { + // perf_bench_stop(); + // perf_bench_read(&perf_end); + // perf_bench_diff(&perf_total, &perf_end, &perf_start); + + // char label[100]; + // snprintf(label, sizeof(label), "GEMM M=%u N=%u O=%u transA=%u transB=%u", + // M, N, O, transA, transB); + // perf_bench_print(label, &perf_total); + // } } \ No newline at end of file
diff --git a/TargetLibraries/PULPOpen/src/RandomNoise.c b/TargetLibraries/PULPOpen/src/RandomNoise.c
new file mode 100644
index 0000000000..04e17d88fe
--- /dev/null
+++ b/TargetLibraries/PULPOpen/src/RandomNoise.c
@@ -0,0 +1,315 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployPULPMath.h"
+#include <math.h>
+
+// TODO: 1) loop unrolling for ILP perf
+// TODO: 2) Perturbation directly integrated in GEMM or Conv kernels.
+/* --------------------------- RNG ---------------------------------- */
+
+/* One step of a 32-bit xorshift generator (shift triple 13, 17, 5).
+ * Zero is a fixed point (Xorshift32(0) == 0), so callers must never pass a
+ * zero state. */
+uint32_t Xorshift32(uint32_t state) {
+  state ^= state << 13;
+  state ^= state >> 17;
+  state ^= state << 5;
+  return state;
+}
+
+/* Turns a user seed into the initial xorshift state with one LCG step.
+ * Exactly one seed maps to 0, which would pin Xorshift32 at 0 forever
+ * (constant "noise", and logf(0) inside GaussianSample); that case is
+ * remapped to an arbitrary non-zero constant. Other seeds are unaffected. */
+static inline uint32_t SeedToState(uint32_t seed) {
+  const uint32_t state = (seed * 1664525u) + 1013904223u;
+  return (state != 0u) ? state : 0x9E3779B9u;
+}
+
+/* Maps a raw 32-bit draw to a float in [0,1]. */
+static inline float32_t ToUnitFloat(uint32_t bits) {
+  return (float32_t)bits / (float32_t)0xFFFFFFFF;
+}
+
+/* --------------------------- Samplers ---------------------------------- */
+
+/* Triangular sample in [-1, 1]: difference of two consecutive uniform draws.
+ * Advances *state twice. */
+float32_t TriangularSample(uint32_t *state) {
+  *state = Xorshift32(*state);
+  const float32_t u1 = ToUnitFloat(*state);
+  // Step again so u2 does not reuse the bits that produced u1.
+  *state = Xorshift32(*state);
+  const float32_t u2 = ToUnitFloat(*state);
+  return u1 - u2;
+}
+
+/* Uniform sample in [-0.5, 0.5]. Advances *state once. */
+float32_t UniformSample(uint32_t *state) {
+  *state = Xorshift32(*state);
+  return ToUnitFloat(*state) - 0.5f; // centered around 0
+}
+
+/* Gaussian sample via the Box-Muller transform (cosine branch only).
+ * Advances *state twice. *state must be non-zero: a zero state yields
+ * u1 == 0 and logf(0). */
+float32_t GaussianSample(uint32_t *state) {
+  *state = Xorshift32(*state);
+  const float32_t u1 = ToUnitFloat(*state); // in (0,1] for a non-zero state
+  // Step again so u2 does not reuse the bits that produced u1.
+  *state = Xorshift32(*state);
+  const float32_t u2 = ToUnitFloat(*state);
+  return sqrtf(-2.0f * logf(u1)) * cosf(2.0f * PI_F * u2);
+}
+
+/* ---------------- Ziggurat method for Gaussian sampling ---------------- */
+// This implementation is adapted from the public domain Ziggurat algorithm
+// by Marsaglia and Tsang.
+
+#define ZIGGURAT_TABLE_SIZE 128
+// Float literals keep the table maths in single precision.
+#define ZIGGURAT_R 3.442619855899f
+#define ZIGGURAT_V 9.91256303526217e-3f
+// Table scale. kn[] is compared against |hz| of a signed 32-bit draw, whose
+// magnitude spans 2^31, so the tables are scaled by 2^31 rather than 2^32.
+#define ZIGGURAT_M1 2147483648.0f
+
+static uint32_t kn[ZIGGURAT_TABLE_SIZE];
+static float32_t wn[ZIGGURAT_TABLE_SIZE];
+static float32_t fn[ZIGGURAT_TABLE_SIZE];
+static int ziggurat_tables_initialized = 0;
+
+/* Fills kn/wn/fn on the first call; later calls return immediately.
+ * NOTE(review): the init flag is a plain static with no synchronization --
+ * confirm the tables are built before several cores sample concurrently. */
+void build_ziggurat_tables(void) {
+  if (ziggurat_tables_initialized) return;
+
+  float32_t dn = ZIGGURAT_R;
+  float32_t tn = dn;
+  const float32_t vn = ZIGGURAT_V;
+
+  // Set up the tables
+  const float32_t q = vn / expf(-0.5f * dn * dn);
+  kn[0] = (uint32_t)((dn / q) * ZIGGURAT_M1);
+  kn[1] = 0;
+
+  wn[0] = q / ZIGGURAT_M1;
+  wn[ZIGGURAT_TABLE_SIZE - 1] = dn / ZIGGURAT_M1;
+
+  fn[0] = 1.0f;
+  fn[ZIGGURAT_TABLE_SIZE - 1] = expf(-0.5f * dn * dn);
+
+  for (int i = ZIGGURAT_TABLE_SIZE - 2; i >= 1; i--) {
+    dn = sqrtf(-2.0f * logf(vn / dn + expf(-0.5f * dn * dn)));
+    kn[i + 1] = (uint32_t)((dn / tn) * ZIGGURAT_M1);
+    tn = dn;
+    fn[i] = expf(-0.5f * dn * dn);
+    wn[i] = dn / ZIGGURAT_M1;
+  }
+  ziggurat_tables_initialized = 1;
+}
+
+/* Gaussian sample via the Ziggurat method. Advances *state at least once;
+ * *state must be non-zero. */
+float32_t GaussianZigguratSample(uint32_t *state) {
+  if (!ziggurat_tables_initialized) {
+    build_ziggurat_tables();
+  }
+
+  for (;;) {
+    *state = Xorshift32(*state);
+    const int32_t hz = (int32_t)(*state);
+    const uint32_t iz = (uint32_t)hz & (ZIGGURAT_TABLE_SIZE - 1);
+    // |hz| in unsigned arithmetic: abs(INT32_MIN) is undefined behaviour.
+    const uint32_t mag = (hz < 0) ? (0u - (uint32_t)hz) : (uint32_t)hz;
+
+    // Quick acceptance path
+    if (mag < kn[iz]) {
+      return (float32_t)hz * wn[iz];
+    }
+
+    // Handle the tail
+    if (iz == 0) {
+      float32_t x, y;
+      do {
+        *state = Xorshift32(*state);
+        x = -logf(ToUnitFloat(*state)) / ZIGGURAT_R;
+        *state = Xorshift32(*state);
+        y = -logf(ToUnitFloat(*state));
+      } while (y + y < x * x);
+      return (hz > 0) ? ZIGGURAT_R + x : -ZIGGURAT_R - x;
+    }
+
+    // Slower rejection path. Draw a fresh uniform first: reusing *state here
+    // would test x against the very bits it was built from.
+    const float32_t x = (float32_t)hz * wn[iz];
+    *state = Xorshift32(*state);
+    if (fn[iz] + ToUnitFloat(*state) * (fn[iz - 1] - fn[iz]) < expf(-0.5f * x * x)) {
+      return x;
+    }
+  }
+}
+
+/* Seeds a Rademacher bit generator. A zero seed would pin Xorshift32 at 0
+ * (every sample -1.0f), so it is replaced by a non-zero constant. */
+void RademacherRNG_init(RademacherRNG *rng, uint32_t seed) {
+  rng->state = (seed != 0u) ? seed : 0x9E3779B9u;
+  rng->bits = 0;
+  rng->bitpos = 32; // force refill on first use
+}
+
+/* Returns +1.0f or -1.0f from the lowest unused bit of the current 32-bit
+ * word; a new word is drawn after 32 samples. */
+float32_t RademacherSample(RademacherRNG *rng) {
+  if (rng->bitpos >= 32) {
+    rng->state = Xorshift32(rng->state);
+    rng->bits = rng->state;
+    rng->bitpos = 0;
+  }
+  const float32_t val = (rng->bits & 1) ? 1.0f : -1.0f;
+  rng->bits >>= 1;
+  rng->bitpos++;
+  return val;
+}
+
+/* ------------------------- Perturbation Functions -------------------------------- */
+/* Each Apply*Perturbation writes pweights_dest[i] = pweights[i] + s * z_i for
+ * i in [0, size). z_i comes from the named sampler; s is +epsilon * c when
+ * dir != 0 and -epsilon * c when dir == 0, with c the per-distribution
+ * constant noted in each function. */
+
+void ApplyTriangularPerturbation(const float32_t *__restrict__ pweights,
+                                 float32_t *__restrict__ pweights_dest,
+                                 uint32_t seed,
+                                 float32_t epsilon,
+                                 uint32_t dir,
+                                 uint32_t size) {
+  uint32_t rng_state = SeedToState(seed);
+  const float32_t sqrt6 = 2.44948974278f;
+  float32_t scale = epsilon * sqrt6; // sqrt(6): => variance 1
+  if (dir == 0) {
+    scale = -scale;
+  }
+  for (uint32_t i = 0; i < size; i++) {
+    pweights_dest[i] = pweights[i] + TriangularSample(&rng_state) * scale;
+  }
+}
+
+void ApplyUniformPerturbation(const float32_t *__restrict__ pweights,
+                              float32_t *__restrict__ pweights_dest,
+                              uint32_t seed,
+                              float32_t epsilon,
+                              uint32_t dir,
+                              uint32_t size) {
+  uint32_t rng_state = SeedToState(seed);
+  const float32_t sqrt3 = 1.73205080757f;
+  // factor 2: [-0.5,0.5] => [-1,1], sqrt(3): => variance 1
+  float32_t scale = epsilon * sqrt3 * 2.0f;
+  if (dir == 0) {
+    scale = -scale;
+  }
+  for (uint32_t i = 0; i < size; i++) {
+    pweights_dest[i] = pweights[i] + UniformSample(&rng_state) * scale;
+  }
+}
+
+void ApplyGaussianPerturbation(const float32_t *__restrict__ pweights,
+                               float32_t *__restrict__ pweights_dest,
+                               uint32_t seed,
+                               float32_t epsilon,
+                               uint32_t dir,
+                               uint32_t size) {
+  uint32_t rng_state = SeedToState(seed);
+  float32_t scale = epsilon; // gaussian naturally has variance 1
+  if (dir == 0) {
+    scale = -scale;
+  }
+  for (uint32_t i = 0; i < size; i++) {
+    pweights_dest[i] = pweights[i] + GaussianSample(&rng_state) * scale;
+  }
+}
+
+void ApplyRademacherPerturbation(const float32_t *__restrict__ pweights,
+                                 float32_t *__restrict__ pweights_dest,
+                                 uint32_t seed,
+                                 float32_t epsilon,
+                                 uint32_t dir,
+                                 uint32_t size) {
+  RademacherRNG rng_state = {SeedToState(seed), 0, 32};
+  float32_t scale = epsilon; // rademacher naturally has variance 1
+  if (dir == 0) {
+    scale = -scale;
+  }
+  for (uint32_t i = 0; i < size; i++) {
+    pweights_dest[i] = pweights[i] + RademacherSample(&rng_state) * scale;
+  }
+}
+
+/* --------------------------- Update functions ---------------------------------- */
+/* Each UpdateWeights* subtracts lr * loss / (2 * epsilon) * c * z_i from
+ * pweights[i] in place. It uses the same SeedToState mapping, sampler and
+ * constant c as the matching Apply*Perturbation, so the same seed replays the
+ * same z_i sequence. epsilon must be non-zero.
+ * NOTE(review): `loss` is presumably the difference between the losses of the
+ * two perturbed forward passes -- confirm against the caller. */
+
+void UpdateWeightsTriangle(float32_t *__restrict__ pweights,
+                           float32_t loss,
+                           uint32_t seed,
+                           float32_t epsilon,
+                           float32_t lr,
+                           uint32_t size) {
+  uint32_t rng_state = SeedToState(seed);
+  const float32_t scale = 2.44948974278f; // sqrt(6): => variance 1
+  // Loop-invariant factor, hoisted without changing the evaluation order.
+  const float32_t step = lr * loss / (2.0f * epsilon);
+  for (uint32_t i = 0; i < size; i++) {
+    pweights[i] = pweights[i] - step * TriangularSample(&rng_state) * scale;
+  }
+}
+
+void UpdateWeightsUniform(float32_t *__restrict__ pweights,
+                          float32_t loss,
+                          uint32_t seed,
+                          float32_t epsilon,
+                          float32_t lr,
+                          uint32_t size) {
+  uint32_t rng_state = SeedToState(seed);
+  const float32_t sqrt3 = 1.73205080757f;
+  const float32_t scale = sqrt3 * 2.0f; // factor 2: [-0.5,0.5] => [-1,1], sqrt(3): => variance 1
+  const float32_t step = lr * loss / (2.0f * epsilon);
+  for (uint32_t i = 0; i < size; i++) {
+    pweights[i] = pweights[i] - step * UniformSample(&rng_state) * scale;
+  }
+}
+
+void UpdateWeightsGaussian(float32_t *__restrict__ pweights,
+                           float32_t loss,
+                           uint32_t seed,
+                           float32_t epsilon,
+                           float32_t lr,
+                           uint32_t size) {
+  uint32_t rng_state = SeedToState(seed);
+  const float32_t step = lr * loss / (2.0f * epsilon);
+  for (uint32_t i = 0; i < size; i++) {
+    pweights[i] = pweights[i] - step * GaussianSample(&rng_state);
+  }
+}
+
+void UpdateWeightsRademacher(float32_t *__restrict__ pweights,
+                             float32_t loss,
+                             uint32_t seed,
+                             float32_t epsilon,
+                             float32_t lr,
+                             uint32_t size) {
+  RademacherRNG rng_state = {SeedToState(seed), 0, 32};
+  const float32_t step = lr * loss / (2.0f * epsilon);
+  for (uint32_t i = 0; i < size; i++) {
+    pweights[i] = pweights[i] - step * RademacherSample(&rng_state);
+  }
+}
\ No newline at end of file