Add NaN handling in softmax pattern in SDPA fusion (#2593)

gramalingam · web-flow · commit a1db753311ff · 2025-09-30T17:15:41.000-07:00
Add NaN handling in softmax pattern in SDPA fusion

Signed-off-by: Ganesan Ramalingam <grama@microsoft.com>
diff --git a/onnxscript/rewriter/ort_fusions/sdpa.py b/onnxscript/rewriter/ort_fusions/sdpa.py
@@ -88,6 +88,9 @@ def pattern(
         )
 
         attn_weight = op.Softmax(attn_score, axis=-1)
+        is_nan = op.IsNaN(attn_weight)
+        adj_attn_weight = op.Where(is_nan, 0.0, attn_weight)
+        attn_weight = pattern.OrValue([adj_attn_weight, attn_weight])
         attn_output = op.MatMul(attn_weight, value)
         return attn_output
 
diff --git a/onnxscript/rewriter/ort_fusions/sdpa_test.py b/onnxscript/rewriter/ort_fusions/sdpa_test.py
@@ -44,7 +44,10 @@ def _unmasked_pre_div_sdpa_script(query, key, value):
     scaled_key = op.Div(key_transposed, divisor)
     attn_score = op.MatMul(scaled_query, scaled_key)
     attn_weight = op.Softmax(attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -56,7 +59,10 @@ def _unmasked_pre_mul_sdpa_script(query, key, value):
     scaled_key = op.Mul(key_transposed, multiplier)
     attn_score = op.MatMul(scaled_query, scaled_key)
     attn_weight = op.Softmax(attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -67,7 +73,10 @@ def _unmasked_post_div_sdpa_script(query, key, value):
     attn_score = op.MatMul(query, key_transposed)
     scaled_attn_score = op.Div(attn_score, divisor)
     attn_weight = op.Softmax(scaled_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -78,7 +87,10 @@ def _unmasked_post_mul_sdpa_script(query, key, value):
     attn_score = op.MatMul(query, key_transposed)
     scaled_attn_score = op.Mul(attn_score, multiplier)
     attn_weight = op.Softmax(scaled_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -90,7 +102,10 @@ def _custom_scale_pre_div_sdpa_script(query, key, value):
     scaled_key = op.Div(key_transposed, divisor)
     attn_score = op.MatMul(scaled_query, scaled_key)
     attn_weight = op.Softmax(attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -102,7 +117,10 @@ def _custom_scale_pre_mul_sdpa_script(query, key, value):
     scaled_key = op.Mul(key_transposed, multiplier)
     attn_score = op.MatMul(scaled_query, scaled_key)
     attn_weight = op.Softmax(attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -115,7 +133,10 @@ def _custom_multi_scale_pre_mul_sdpa_script(query, key, value):
     scaled_key = op.Mul(key_transposed, multiplier_k)
     attn_score = op.MatMul(scaled_query, scaled_key)
     attn_weight = op.Softmax(attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -126,7 +147,10 @@ def _custom_scale_post_div_sdpa_script(query, key, value):
     attn_score = op.MatMul(query, key_transposed)
     scaled_attn_score = op.Div(attn_score, divisor)
     attn_weight = op.Softmax(scaled_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -137,7 +161,10 @@ def _custom_scale_post_mul_sdpa_script(query, key, value):
     attn_score = op.MatMul(query, key_transposed)
     scaled_attn_score = op.Mul(attn_score, multiplier)
     attn_weight = op.Softmax(scaled_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -150,7 +177,10 @@ def _masked_pre_div_sdpa_script(query, key, value, mask):
     attn_score = op.MatMul(scaled_query, scaled_key)
     masked_attn_score = op.Add(attn_score, mask)
     attn_weight = op.Softmax(masked_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -163,7 +193,10 @@ def _masked_pre_mul_sdpa_script(query, key, value, mask):
     attn_score = op.MatMul(scaled_query, scaled_key)
     masked_attn_score = op.Add(attn_score, mask)
     attn_weight = op.Softmax(masked_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -175,7 +208,10 @@ def _masked_post_div_sdpa_script(query, key, value, mask):
     scaled_attn_score = op.Div(attn_score, divisor)
     masked_attn_score = op.Add(scaled_attn_score, mask)
     attn_weight = op.Softmax(masked_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -187,7 +223,10 @@ def _masked_post_mul_sdpa_script(query, key, value, mask):
     scaled_attn_score = op.Mul(attn_score, multiplier)
     masked_attn_score = op.Add(scaled_attn_score, mask)
     attn_weight = op.Softmax(masked_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -200,7 +239,10 @@ def _masked_custom_scale_pre_div_sdpa_script(query, key, value, mask):
     attn_score = op.MatMul(scaled_query, scaled_key)
     masked_attn_score = op.Add(attn_score, mask)
     attn_weight = op.Softmax(masked_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -213,7 +255,10 @@ def _masked_custom_scale_pre_mul_sdpa_script(query, key, value, mask):
     attn_score = op.MatMul(scaled_query, scaled_key)
     masked_attn_score = op.Add(attn_score, mask)
     attn_weight = op.Softmax(masked_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -225,7 +270,10 @@ def _masked_custom_scale_post_div_sdpa_script(query, key, value, mask):
     scaled_attn_score = op.Div(attn_score, divisor)
     masked_attn_score = op.Add(scaled_attn_score, mask)
     attn_weight = op.Softmax(masked_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 
@@ -237,7 +285,10 @@ def _masked_custom_scale_post_mul_sdpa_script(query, key, value, mask):
     scaled_attn_score = op.Mul(attn_score, multiplier)
     masked_attn_score = op.Add(scaled_attn_score, mask)
     attn_weight = op.Softmax(masked_attn_score, axis=-1)
-    attn_output = op.MatMul(attn_weight, value)
+    is_nan = op.IsNaN(attn_weight)
+    zero = op.Constant(value_float=0.0)
+    adj_attn_weight = op.Where(is_nan, zero, attn_weight)
+    attn_output = op.MatMul(adj_attn_weight, value)
     return attn_output
 
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -88,6 +88,9 @@ def pattern(` |
| `88` | `88` | `)` |
| `89` | `89` | |
| `90` | `90` | `attn_weight = op.Softmax(attn_score, axis=-1)` |
| | `91` | `+ is_nan = op.IsNaN(attn_weight)` |
| | `92` | `+ adj_attn_weight = op.Where(is_nan, 0.0, attn_weight)` |
| | `93` | `+ attn_weight = pattern.OrValue([adj_attn_weight, attn_weight])` |
| `91` | `94` | `attn_output = op.MatMul(attn_weight, value)` |
| `92` | `95` | `return attn_output` |
| `93` | `96` | |
`93`	`96`