Commit 3cba0d2

fix diffusers test/models on 910B
1 parent 684ed17 commit 3cba0d2

9 files changed: +81 -53 lines changed

mindnlp/patch/safetensors/common.py

Lines changed: 1 addition & 1 deletion
@@ -254,7 +254,7 @@ def safe_save_file(tensor_dict, filename, metadata=None):
     return safetensors.numpy.save_file(tensor_dict, filename, metadata)
 
 
-def safe_load_file(filename, device):
+def safe_load_file(filename, device = 'cpu'):
     """
     Loads a safetensors file into torch format.

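The change gives safe_load_file a 'cpu' default so call sites that pass only a filename keep working. Below is a minimal, self-contained sketch of the same idea against the public safetensors.numpy API; demo_safe_load_file and the file name are illustrative, not the patched mindnlp function.

    import numpy as np
    from safetensors.numpy import save_file, load_file

    def demo_safe_load_file(filename, device='cpu'):
        # 'device' is kept only for call-site compatibility; numpy tensors are host-only.
        return load_file(filename)

    save_file({"w": np.ones((2, 2), dtype=np.float32)}, "demo.safetensors")
    tensors = demo_safe_load_file("demo.safetensors")  # device argument can now be omitted
    print(tensors["w"].shape)                          # (2, 2)
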
mindtorch/_apis/npu_910a.py

Lines changed: 2 additions & 0 deletions
@@ -1192,6 +1192,8 @@ def avg_pool2d(input, kernel_size, stride, padding=0, ceil_mode=False, count_inc
     return pyboost.avg_pool2d_op(input, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)
 
 def avg_pool3d(input, kernel_size, stride, padding=0, ceil_mode=False, count_include_pad=True, divisor_override=None):
+    if divisor_override is None:
+        divisor_override = 0
     return legacy.avg_pool3_d(
         input,
         kernel_size,

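The 910A backend now maps divisor_override=None to 0 before calling the legacy kernel; the assumption here is that the legacy avg_pool3d op expects an integer and treats 0 as "divide by the pooling window size". A tiny sketch of that normalization in plain Python, separate from the backend code:

    def normalize_divisor_override(divisor_override):
        # Assumption: the legacy kernel takes an int and interprets 0 as "use the default divisor".
        return 0 if divisor_override is None else divisor_override

    print(normalize_divisor_override(None))  # 0 -> default divisor (window volume)
    print(normalize_divisor_override(6))     # 6 -> caller-supplied divisor
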
mindtorch/_apis/npu_910b.py

Lines changed: 12 additions & 3 deletions
@@ -1,5 +1,6 @@
 import math
 import numbers
+import warnings
 import mindspore
 import mindtorch
 import numpy as np
@@ -1380,6 +1381,8 @@ def conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_paddi
     return out
 
 def conv_transpose3d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1):
+    warnings.warn('conv_transpose3d only supports float16 on MindSpore; mindtorch will autocast and apply nan_to_num to avoid inf/nan. Please check the precision if the result is not good.')
+
     in_channel, out_channel = weight.shape[0], weight.shape[1]
     kernel_size = weight.shape[2:]
     # conv_transpose3d_op = ops.Conv3DTranspose(
@@ -1438,6 +1441,7 @@ def conv_transpose3d(input, weight, bias=None, stride=1, padding=0, output_paddi
     out = cast(out, input_dtype)
     if bias is not None:
         out = add(out, bias)
+    out = nan_to_num(out, 0., 0., 0.)
     return out
 
 def relu(input):
@@ -1880,6 +1884,8 @@ def dynamic_rnn(x, w, b, seq_length, init_h, init_c):
         'LSTM', 'UNIDIRECTIONAL', 1, False, 1.0, -1.0, 0, True, 'tanh', 0.0, True)
 
 def nan_to_num(input, nan=0.0, posinf=None, neginf=None):
+    if ENABLE_PYBOOST:
+        return pyboost.nan_to_num_impl(input, nan, posinf, neginf)
     return legacy.nan_to_num(input, nan, posinf, neginf)
 
 def round(input, decimals):
@@ -2131,14 +2137,14 @@ def sdpa_manual(query, key, value, attn_mask=None, dropout_p=0.0,
 
 def sdpa(query, key, value, attn_mask=None, dropout_p=0.0,
          is_causal=False, scale=None, enable_gqa=False):
-    if ENABLE_FLASH_ATTENTION:
+    if not ENABLE_FLASH_ATTENTION:
         return sdpa_manual(query, key, value, attn_mask, dropout_p, is_causal, scale, enable_gqa)
 
     scale_factor = 1 / math.sqrt(query.shape[-1]) if scale is None else scale
 
     if attn_mask is not None and not is_causal:
         if FLASH_ATTN_MASK_VALID == 1:
-            attn_mask = bitwise_not(attn_mask)
+            attn_mask = attn_mask == 0.0
         else:
             attn_mask = cast(attn_mask, mindspore.bool_)
 
@@ -2500,4 +2506,7 @@ def raw_adam(param, exp_avg, exp_avg_sq, beta1_power, beta2_power, lr, beta1, be
     return legacy.adam(param, exp_avg, exp_avg_sq, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad, False, False)
 
 def inplace_sub(input, other):
-    return pyboost.inplace_sub_ext_op(input, other)
+    return pyboost.inplace_sub_ext_op(input, other)
+
+def isfinite(input):
+    return pyboost.isfinite_op(input)

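Two of the 910B changes are behavioral: sdpa now falls back to the manual implementation only when flash attention is disabled (the previous condition was inverted), and the attention-mask conversion uses an equality test instead of bitwise_not. A small numpy illustration of why the equality form is more forgiving, assuming a mask in which True/1 marks positions that may be attended; numpy stands in for the backend tensors here.

    import numpy as np

    attn_mask = np.array([[True, True, False]])   # True = this key position may be attended

    print(~attn_mask)                             # [[False False  True]]
    print(attn_mask == 0.0)                       # [[False False  True]] -> same positions flagged

    float_mask = np.array([[1.0, 1.0, 0.0]])      # the same mask stored as floats
    print(float_mask == 0.0)                      # still works; np.bitwise_not(float_mask) would raise TypeError
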
mindtorch/_tensor.py

Lines changed: 4 additions & 2 deletions
@@ -715,7 +715,9 @@ def byte(self):
     return self.to(mindspore.uint8)
 
 # Tensor.broadcast_to
-def broadcast_to(self, shape):
+def broadcast_to(self, *shape):
+    if isinstance(shape[0], (tuple, list)):
+        shape = shape[0]
     return ops.broadcast_to(self, shape)
 
 # Tensor.cauchy_
@@ -1309,7 +1311,7 @@ def is_complex(self):
 
 # Tensor.is_floating_point
 def is_floating_point(self):
-    return isinstance(self.dtype, typing.Float)
+    return isinstance(self.dtype, (typing.Float, typing.BFloat))
 
 # Tensor.is_inference

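broadcast_to now accepts both the unpacked form t.broadcast_to(2, 3) and the sequence form t.broadcast_to((2, 3)), and is_floating_point also recognizes bfloat16 dtypes. A standalone sketch of the shape-resolution logic the diff adds, in plain Python and independent of mindtorch:

    def resolve_shape(*shape):
        # Mirrors the new Tensor.broadcast_to logic: accept either a single
        # tuple/list or an unpacked sequence of ints.
        if isinstance(shape[0], (tuple, list)):
            shape = shape[0]
        return tuple(shape)

    print(resolve_shape(2, 3))     # (2, 3)
    print(resolve_shape((2, 3)))   # (2, 3)
    print(resolve_shape([2, 3]))   # (2, 3)
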
mindtorch/configs.py

Lines changed: 1 addition & 0 deletions
@@ -47,3 +47,4 @@ def parse_flag_from_env(key, default=False):
 ENABLE_PYBOOST = parse_flag_from_env('ENABLE_PYBOOST', True)
 CPU_USE_NUMPY_OP = parse_flag_from_env('CPU_USE_NUMPY', False)
 ENABLE_FLASH_ATTENTION = parse_flag_from_env('ENABLE_FLASH_ATTENTION', False)
+CAPTURE_INF_NAN = parse_flag_from_env('CAPTURE_INF_NAN', False)

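CAPTURE_INF_NAN joins the other boolean feature flags read from the environment. A hedged sketch of how such a flag is typically parsed; mindtorch's own parse_flag_from_env may differ in detail, and the accepted truthy strings below are assumptions:

    import os

    def parse_flag_from_env_sketch(key, default=False):
        # Assumed behavior: missing -> default, otherwise common truthy strings enable the flag.
        value = os.environ.get(key)
        if value is None:
            return default
        return value.strip().lower() in ('1', 'true', 'yes', 'on')

    CAPTURE_INF_NAN = parse_flag_from_env_sketch('CAPTURE_INF_NAN', False)
    print(CAPTURE_INF_NAN)  # False unless e.g. `export CAPTURE_INF_NAN=1` was set
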
mindtorch/executor.py

Lines changed: 16 additions & 1 deletion
@@ -1,5 +1,6 @@
+
 from ._apis import cpu, gpu, meta, numpy, npu_910a, npu_910b, npu_310b, npu_310p
-from .configs import CPU_USE_NUMPY_OP, SOC, ENABLE_DISPATCH, DEVICE_TARGET
+from .configs import CPU_USE_NUMPY_OP, SOC, ENABLE_DISPATCH, DEVICE_TARGET, CAPTURE_INF_NAN
 
 if SOC == 'ascend910':
     npu = npu_910a
@@ -24,6 +25,7 @@
 }
 
 DISPATCH_WHITE_LIST = ['inplace_zero', 'inplace_fill_scalar']
+SKIP_NAN_CHECK = ['empty', 'empty_like']
 
 if ENABLE_DISPATCH:
     def execute(func_name, *args, **kwargs):
@@ -65,4 +67,17 @@ def execute(func_name, *args, **kwargs):
         raise RuntimeError(
             f"No implementation for function: {func_name} on {device_type}."
         )
+        if CAPTURE_INF_NAN:
+            outs = func(*args, **kwargs)
+            if func_name in SKIP_NAN_CHECK:
+                return outs
+
+            isfinite_op = getattr(api_map[device_type], 'isfinite')
+            if isinstance(outs, tuple):
+                for out in outs:
+                    assert isfinite_op(out).asnumpy().all()
+            else:
+                assert isfinite_op(outs).asnumpy().all()
+            return outs
+
         return func(*args, **kwargs)

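With CAPTURE_INF_NAN set, execute() runs the dispatched op and then asserts its outputs are finite, so the first kernel that emits inf/nan fails loudly instead of corrupting results downstream; 'empty'/'empty_like' are skipped, presumably because uninitialized buffers may legitimately hold non-finite bit patterns. A self-contained sketch of the same check with numpy standing in for the backend isfinite kernel (checked_execute is illustrative, not the mindtorch function):

    import numpy as np

    SKIP_NAN_CHECK = ['empty', 'empty_like']  # uninitialized buffers are allowed to be non-finite

    def checked_execute(func_name, func, *args, **kwargs):
        outs = func(*args, **kwargs)
        if func_name in SKIP_NAN_CHECK:
            return outs
        # Normalize single outputs and tuples, then assert every element is finite.
        for out in outs if isinstance(outs, tuple) else (outs,):
            assert np.isfinite(out).all(), f"{func_name} produced inf/nan"
        return outs

    checked_execute('add', np.add, np.ones(3), np.ones(3))            # passes silently
    # checked_execute('divide', np.divide, np.ones(3), np.zeros(3))   # would trip the assertion
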
mindtorch/nn/functional.py

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ def avg_pool1d(input, kernel_size, stride, padding=0, ceil_mode=False, count_inc
 def avg_pool2d(input, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True, divisor_override=None):
     return execute('avg_pool2d', input, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)
 
-def avg_pool3d(input, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True, divisor_override=0):
+def avg_pool3d(input, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True, divisor_override=None):
     return execute('avg_pool3d', input, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override)
 
 def adaptive_avg_pool1d(input, output_size):

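The Python-level default for divisor_override is now None, matching the avg_pool2d signature, while the npu_910a change above maps None to 0 at the kernel boundary. For reference, a numeric example of what the parameter means for average pooling, using one 2x2x2 window in plain numpy:

    import numpy as np

    window = np.arange(8, dtype=np.float32).reshape(2, 2, 2)  # one pooling window, sum = 28

    print(window.sum() / window.size)  # 3.5 -> divisor_override=None: divide by the window volume
    print(window.sum() / 4)            # 7.0 -> divisor_override=4: divide by the caller's value
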
mindtorch/nn/utils/weight_norm.py

Lines changed: 39 additions & 43 deletions
@@ -14,7 +14,8 @@
 # ============================================================================
 r"""Weight Normalization from https://arxiv.org/abs/1602.07868."""
 from typing import Any, TypeVar
-from ..parameter import Parameter
+from typing_extensions import deprecated
+from ..parameter import Parameter, UninitializedParameter
 from ..modules import Module
 from ... import ops
 
@@ -43,12 +44,6 @@ def _weight_norm(weight_v, weight_g, dim):
 
 
 class WeightNorm:
-
-    r"""
-    The 'WeightNorm' class implements weight normalization for neural network modules. It provides methods to compute normalized weights, apply weight normalization to a cell, wrap a function, and remove
-    weight bias from a cell. The class also contains an initializer for the name and dimension of the weight parameters, as well as a method to compute the weight using the normalized parameters. Additionally, it
-    includes a method to remove the weight bias and a wrapper function for transposing cell_id to cell.
-    """
     name: str
     dim: int
 
@@ -60,63 +55,64 @@ def __init__(self, name: str, dim: int) -> None:
 
     # TODO Make return type more specific
     def compute_weight(self, module: Module) -> Any:
-        g = getattr(module, self.name + '_g')
-        v = getattr(module, self.name + '_v')
-        return Parameter(_weight_norm(v, g, self.dim))
+        g = getattr(module, self.name + "_g")
+        v = getattr(module, self.name + "_v")
+        return _weight_norm(v, g, self.dim)
 
     @staticmethod
-    def apply(module, name: str, dim: int) -> 'WeightNorm':
-        for k, hook in module._forward_pre_hooks.items():
+    @deprecated(
+        "`torch.nn.utils.weight_norm` is deprecated "
+        "in favor of `torch.nn.utils.parametrizations.weight_norm`.",
+        category=FutureWarning,
+    )
+    def apply(module, name: str, dim: int) -> "WeightNorm":
+        for hook in module._forward_pre_hooks.values():
             if isinstance(hook, WeightNorm) and hook.name == name:
-                raise RuntimeError("Cannot register two weight_norm hooks on "
-                                   "the same parameter {}".format(name))
+                raise RuntimeError(
+                    f"Cannot register two weight_norm hooks on the same parameter {name}"
+                )
 
         if dim is None:
             dim = -1
 
         fn = WeightNorm(name, dim)
 
         weight = getattr(module, name)
-        # if isinstance(weight, UninitializedParameter):
-        #     raise ValueError(
-        #         'The module passed to `WeightNorm` can\'t have uninitialized parameters. '
-        #         'Make sure to run the dummy forward before applying weight normalization')
+        if isinstance(weight, UninitializedParameter):
+            raise ValueError(
+                "The module passed to `WeightNorm` can't have uninitialized parameters. "
+                "Make sure to run the dummy forward before applying weight normalization"
+            )
         # remove w from parameter list
        del module._parameters[name]
 
         # add g and v as new parameters and express w as g/||v|| * v
-        module.register_parameter(name + '_g', Parameter(norm_except_dim(weight, 2, dim)))
-        module.register_parameter(name + '_v', Parameter(weight))
+        module.register_parameter(
+            name + "_g", Parameter(norm_except_dim(weight, 2, dim).data)
+        )
+        module.register_parameter(name + "_v", Parameter(weight.data))
         setattr(module, name, fn.compute_weight(module))
 
         # recompute weight before every forward()
         module.register_forward_pre_hook(fn)
 
         return fn
 
-    def wrapper_func(self, cell, func):
-        r"""
-        wrapper_func where used to transpose cell_id to cell
-        """
-        def new_func(_, inputs):
-            nonlocal cell
-            return func(cell, inputs)
-        return new_func
-
     def remove(self, module: Module) -> None:
         weight = self.compute_weight(module)
         delattr(module, self.name)
-        del module._parameters[self.name + '_g']
-        del module._parameters[self.name + '_v']
-        setattr(module, self.name, weight)
+        del module._parameters[self.name + "_g"]
+        del module._parameters[self.name + "_v"]
+        setattr(module, self.name, Parameter(weight.data))
 
     def __call__(self, module: Module, inputs: Any) -> None:
         setattr(module, self.name, self.compute_weight(module))
 
 
-T_module = TypeVar('T_module', bound=Module)
+T_module = TypeVar("T_module", bound=Module)
+
 
-def weight_norm(module: T_module, name: str = 'weight', dim: int = 0) -> T_module:
+def weight_norm(module: T_module, name: str = "weight", dim: int = 0) -> T_module:
     r"""Apply weight normalization to a parameter in the given module.
 
     .. math::
@@ -138,7 +134,7 @@ def weight_norm(module: T_module, name: str = 'weight', dim: int = 0) -> T_modul
 
     .. warning::
 
-        This function is deprecated. Use :func:`mindtorch.nn.utils.parametrizations.weight_norm`
+        This function is deprecated. Use :func:`torch.nn.utils.parametrizations.weight_norm`
         which uses the modern parametrization API. The new ``weight_norm`` is compatible
         with ``state_dict`` generated from old ``weight_norm``.
 
@@ -150,11 +146,11 @@ def weight_norm(module: T_module, name: str = 'weight', dim: int = 0) -> T_modul
       https://github.com/pytorch/pytorch/issues/102999
 
     * To remove the weight normalization reparametrization, use
-      :func:`mindtorch.nn.utils.parametrize.remove_parametrizations`.
+      :func:`torch.nn.utils.parametrize.remove_parametrizations`.
 
     * The weight is no longer recomputed once at module forward; instead, it will
       be recomputed on every access. To restore the old behavior, use
-      :func:`mindtorch.nn.utils.parametrize.cached` before invoking the module
+      :func:`torch.nn.utils.parametrize.cached` before invoking the module
       in question.
 
     Args:
@@ -171,16 +167,17 @@ def weight_norm(module: T_module, name: str = 'weight', dim: int = 0) -> T_modul
        >>> m
        Linear(in_features=20, out_features=40, bias=True)
        >>> m.weight_g.size()
-        mindtorch.Size([40, 1])
+        torch.Size([40, 1])
        >>> m.weight_v.size()
-        mindtorch.Size([40, 20])
+        torch.Size([40, 20])
 
     """
     WeightNorm.apply(module, name, dim)
     return module
 
-def remove_weight_norm(module: T_module, name: str = 'weight') -> T_module:
-    r"""Removes the weight normalization reparameterization from a module.
+
+def remove_weight_norm(module: T_module, name: str = "weight") -> T_module:
+    r"""Remove the weight normalization reparameterization from a module.
 
     Args:
         module (Module): containing module
@@ -196,5 +193,4 @@ def remove_weight_norm(module: T_module, name: str = 'weight') -> T_module:
             del module._forward_pre_hooks[k]
             return module
 
-    raise ValueError("weight_norm of '{}' not found in {}"
-                     .format(name, module))
+    raise ValueError(f"weight_norm of '{name}' not found in {module}")

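The rewritten WeightNorm follows the upstream PyTorch implementation: the weight is stored as a magnitude weight_g and a direction weight_v and recomputed as w = g * v / ||v|| in a forward pre-hook. A numeric sketch of that decomposition for the Linear(20, 40), dim=0 case from the docstring, in numpy only:

    import numpy as np

    v = np.random.randn(40, 20).astype(np.float32)        # weight_v: direction, same shape as the weight
    g = np.linalg.norm(v, axis=1, keepdims=True)          # weight_g: per-row magnitude, shape (40, 1) for dim=0

    w = g * v / np.linalg.norm(v, axis=1, keepdims=True)  # weight recomputed before each forward
    print(g.shape, w.shape)                               # (40, 1) (40, 20)
    print(np.allclose(w, v))                              # True: g is initialized to ||v||, so w reproduces the original weight
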
mindtorch/ops/creation.py

Lines changed: 5 additions & 2 deletions
@@ -96,6 +96,9 @@ def arange(start=0, end=None, step=1, *, out=None, dtype=None, layout=None, devi
         start, end = 0, int(start)
     if dtype is None:
         dtype = mindtorch.py2dtype[type(start)]
+
+        if dtype == mindtorch.float64:
+            dtype = mindtorch.float32
 
     device = check_device(device)
 
@@ -181,8 +184,8 @@ def empty_like(input, *, dtype=None, layout=None, device=None, requires_grad=Fal
 
 # full
 def full(size, fill_value, *, out=None, dtype=None, layout=None, device=None, requires_grad=False):
-    # if dtype is None:
-    #     dtype = get_default_dtype()
+    if dtype is None:
+        dtype = get_default_dtype()
     device = check_device(device)
     if not isinstance(device, str):
         device = device.type

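arange() now downcasts an inferred float64 dtype to float32, and full() falls back to the global default dtype when none is given; a likely motivation is limited float64 support on the NPU backends, which is an assumption rather than something the diff states. A standalone sketch of the dtype-defaulting logic, with numpy dtypes used purely for illustration:

    import numpy as np

    def pick_arange_dtype(start, dtype=None):
        # Only the inferred default is adjusted; explicitly requested dtypes pass through.
        if dtype is None:
            dtype = np.float64 if isinstance(start, float) else np.int64
            if dtype == np.float64:      # mirror of the new downcast
                dtype = np.float32
        return np.dtype(dtype)

    print(pick_arange_dtype(0.0))               # float32 (inferred float64 is downcast)
    print(pick_arange_dtype(0))                 # int64
    print(pick_arange_dtype(0.0, np.float64))   # float64 (explicit dtype is honored)
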