Improve SIMD usage of Oklab conversions (image-rs#75)

RunDevelopment · web-flow · commit aedee69a28fc · 2025-09-21T13:08:03.000+02:00
diff --git a/src/color/oklab.rs b/src/color/oklab.rs
@@ -1,63 +1,83 @@
 use glam::Vec3A;
 
 trait Operations {
-    fn srgb_to_linear(c: f32) -> f32;
-    fn linear_to_srgb(c: f32) -> f32;
-    fn cbrt(x: f32) -> f32;
+    fn srgb_to_linear(c: Vec3A) -> Vec3A; // sRGB-encoded -> linear, for all three components of `c` at once
+    fn linear_to_srgb(c: Vec3A) -> Vec3A; // linear -> sRGB-encoded, for all three components of `c` at once
+    fn cbrt(x: Vec3A) -> Vec3A; // component-wise cube root
 }
 
 struct Reference;
 impl Operations for Reference {
-    fn srgb_to_linear(c: f32) -> f32 {
-        if c >= 0.04045 {
-            ((c + 0.055) / 1.055).powf(2.4)
-        } else {
-            c / 12.92
+    fn srgb_to_linear(c: Vec3A) -> Vec3A { // reference sRGB -> linear, evaluated one component at a time
+        fn srgb_to_linear(c: f32) -> f32 { // scalar piecewise curve applied to each component below
+            if c >= 0.04045 { // at or above the threshold: power segment
+                ((c + 0.055) / 1.055).powf(2.4)
+            } else { // below the threshold: linear segment
+                c / 12.92
+            }
         }
+
+        Vec3A::new( // x, y and z are converted independently and repacked
+            srgb_to_linear(c.x),
+            srgb_to_linear(c.y),
+            srgb_to_linear(c.z),
+        )
     }
-    fn linear_to_srgb(c: f32) -> f32 {
-        if c > 0.0031308 {
-            1.055 * c.powf(1.0 / 2.4) - 0.055
-        } else {
-            12.92 * c
+    fn linear_to_srgb(c: Vec3A) -> Vec3A { // reference linear -> sRGB, evaluated one component at a time
+        fn linear_to_srgb(c: f32) -> f32 { // scalar piecewise curve applied to each component below
+            if c > 0.0031308 { // strictly above the threshold: power segment
+                1.055 * c.powf(1.0 / 2.4) - 0.055
+            } else { // at or below the threshold: linear segment
+                12.92 * c
+            }
         }
+
+        Vec3A::new( // x, y and z are converted independently and repacked
+            linear_to_srgb(c.x),
+            linear_to_srgb(c.y),
+            linear_to_srgb(c.z),
+        )
     }
-    fn cbrt(x: f32) -> f32 {
-        f32::cbrt(x)
+    fn cbrt(x: Vec3A) -> Vec3A { // reference cube root of each component
+        Vec3A::new(x.x.cbrt(), x.y.cbrt(), x.z.cbrt()) // `f32::cbrt` on x, y and z separately
     }
 }
 
 struct Fast;
 impl Operations for Fast {
-    fn srgb_to_linear(c: f32) -> f32 {
-        if c >= 0.04045 {
-            // This uses a Padé approximant for ((c + 0.055) / 1.055) ^ 2.4:
-            // (0.000857709 +0.0359438 x+0.524293 x^2+1.31193 x^3)/(1+0.992498 x-0.119725 x^2)
-            let c2 = c * c;
-            let c3 = c2 * c;
-            f32::min(
-                1.0,
-                (0.000857709 + 0.0359438 * c + 0.524293 * c2 + 1.31193 * c3)
-                    / (1.0 + 0.992498 * c - 0.119725 * c2),
-            )
-        } else {
-            c * (1.0 / 12.92)
-        }
+    fn srgb_to_linear(c: Vec3A) -> Vec3A { // approximate sRGB -> linear for all components in one pass
+        Vec3A::select( // both candidates are computed for every lane; the mask picks one per lane
+            c.cmpge(Vec3A::splat(0.04045)), // lanes with c >= 0.04045 take the block below
+            {
+                // This uses a Padé approximant for ((c + 0.055) / 1.055) ^ 2.4:
+                // (0.000857709 +0.0359438 x+0.524293 x^2+1.31193 x^3)/(1+0.992498 x-0.119725 x^2)
+                let c2 = c * c;
+                let c3 = c2 * c;
+                Vec3A::min( // cap the approximation at 1.0 per lane
+                    Vec3A::ONE,
+                    (0.000857709 + 0.0359438 * c + 0.524293 * c2 + 1.31193 * c3)
+                        / (Vec3A::ONE + 0.992498 * c - 0.119725 * c2),
+                )
+            },
+            c * (1.0 / 12.92), // remaining lanes: linear segment, written as a multiply by the reciprocal
+        )
     }
-    fn linear_to_srgb(c: f32) -> f32 {
-        if c > 0.0031308 {
-            // This uses a Padé approximant for 1.055 c^(1/2.4) - 0.055:
-            // (-0.0117264+21.0897 x+949.46 x^2+2225.62 x^3)/(1+176.398 x+1983.15 x^2+1035.65 x^3)
-            let c2 = c * c;
-            let c3 = c2 * c;
-            (-0.0117264 + 21.0897 * c + 949.46 * c2 + 2225.62 * c3)
-                / (1.0 + 176.398 * c + 1983.15 * c2 + 1035.65 * c3)
-        } else {
-            12.92 * c
-        }
+    fn linear_to_srgb(c: Vec3A) -> Vec3A { // approximate linear -> sRGB for all components in one pass
+        Vec3A::select( // both candidates are computed for every lane; the mask picks one per lane
+            c.cmpgt(Vec3A::splat(0.0031308)), // lanes with c > 0.0031308 take the block below
+            {
+                // This uses a Padé approximant for 1.055 c^(1/2.4) - 0.055:
+                // (-0.0117264+21.0897 x+949.46 x^2+2225.62 x^3)/(1+176.398 x+1983.15 x^2+1035.65 x^3)
+                let c2 = c * c;
+                let c3 = c2 * c;
+                (-0.0117264 + 21.0897 * c + 949.46 * c2 + 2225.62 * c3)
+                    / (1.0 + 176.398 * c + 1983.15 * c2 + 1035.65 * c3)
+            },
+            c * 12.92, // remaining lanes: linear segment
+        )
     }
     #[allow(clippy::excessive_precision)]
-    fn cbrt(x: f32) -> f32 {
+    fn cbrt(x: Vec3A) -> Vec3A {
         // This is the fast cbrt approximation from the oklab crate.
         // Source: https://gitlab.com/kornelski/oklab/-/blob/d3c074f154187dd5c0642119a6402a6c0753d70c/oklab/src/lib.rs#L61
         // Author: Kornel (https://gitlab.com/kornelski/)
@@ -68,55 +88,52 @@ impl Operations for Fast {
         const F: f32 = 1.6071428061e+0;
         const G: f32 = 3.5714286566e-1;
 
-        let mut t = f32::from_bits((x.to_bits() / 3).wrapping_add(B));
+        let mut t = Vec3A::from_array(
+            x.to_array()
+                .map(|x| f32::from_bits((x.to_bits() / 3).wrapping_add(B))),
+        );
         let s = C + (t * t) * (t / x);
         t *= G + F / (s + E + D / s);
         t
     }
 }
 
 #[allow(clippy::excessive_precision)]
-fn srgb_to_oklab_impl<O: Operations>(rgb: Vec3A) -> Vec3A {
-    let [r, g, b] = rgb.to_array().map(O::srgb_to_linear);
-
-    let mut l = 0.4122214708 * r + 0.5363325363 * g + 0.0514459929 * b;
-    let mut m = 0.2119034982 * r + 0.6806995451 * g + 0.1073969566 * b;
-    let mut s = 0.0883024619 * r + 0.2817188376 * g + 0.6299787005 * b;
-
-    l = O::cbrt(l);
-    m = O::cbrt(m);
-    s = O::cbrt(s);
-
-    let l_final = l * 0.2104542553 + m * 0.7936177850 + s * -0.0040720468;
-    let a = l * 1.9779984951 + m * -2.4285922050 + s * 0.4505937099;
-    let b = l * 0.0259040371 + m * 0.7827717662 + s * -0.8086757660;
+fn srgb_to_oklab_impl<O: Operations>(srgb: Vec3A) -> Vec3A { // sRGB -> Oklab with a and b offset by +0.5
+    let rgb = O::srgb_to_linear(srgb); // decode all three channels to linear RGB in one call
+
+    let lms = Vec3A::new( // linear RGB -> `lms`: each dot product is one row of a 3x3 matrix
+        rgb.dot(Vec3A::new(0.4122214708, 0.5363325363, 0.0514459929)),
+        rgb.dot(Vec3A::new(0.2119034982, 0.6806995451, 0.1073969566)),
+        rgb.dot(Vec3A::new(0.0883024619, 0.2817188376, 0.6299787005)),
+    );
+    let lms = O::cbrt(lms); // cube root of every component via the chosen `Operations`
+
+    let lab = Vec3A::new( // `lms` -> `lab`: again one matrix row per dot product
+        lms.dot(Vec3A::new(0.2104542553, 0.7936177850, -0.0040720468)),
+        lms.dot(Vec3A::new(1.9779984951, -2.4285922050, 0.4505937099)),
+        lms.dot(Vec3A::new(0.0259040371, 0.7827717662, -0.8086757660)),
+    );
 
     // normalize everything to the 0..1 range
-    Vec3A::new(l_final, a + 0.5, b + 0.5)
+    lab + Vec3A::new(0.0, 0.5, 0.5) // only the second and third components are shifted; the first is unchanged
 }
 #[allow(clippy::excessive_precision)]
 fn oklab_to_srgb_impl<O: Operations>(lab: Vec3A) -> Vec3A {
-    let l_org = lab.x;
-    let a = lab.y - 0.5;
-    let b = lab.z - 0.5;
-
-    let mut l = l_org + a * 0.3963377774 + b * 0.2158037573;
-    let mut m = l_org + a * -0.1055613458 + b * -0.0638541728;
-    let mut s = l_org + a * -0.0894841775 + b * -1.2914855480;
-
-    l = l * l * l;
-    m = m * m * m;
-    s = s * s * s;
-
-    let r = l * 4.0767416621 + m * -3.3077115913 + s * 0.2309699292;
-    let g = l * -1.2684380046 + m * 2.6097574011 + s * -0.3413193965;
-    let b = l * -0.0041960863 + m * -0.7034186147 + s * 1.7076147010;
-
-    Vec3A::new(
-        O::linear_to_srgb(r),
-        O::linear_to_srgb(g),
-        O::linear_to_srgb(b),
-    )
+    let lab_norm = lab - Vec3A::new(0.0, 0.5, 0.5); // remove the +0.5 offset from the second and third components
+    let lms = Vec3A::new( // one matrix row per dot product; the leading 1.0 passes `lab_norm.x` through unscaled
+        lab_norm.dot(Vec3A::new(1.0, 0.3963377774, 0.2158037573)),
+        lab_norm.dot(Vec3A::new(1.0, -0.1055613458, -0.0638541728)),
+        lab_norm.dot(Vec3A::new(1.0, -0.0894841775, -1.2914855480)),
+    );
+    let lms = lms * lms * lms; // lms^3
+    let rgb = Vec3A::new( // cubed `lms` -> linear RGB, one matrix row per dot product
+        lms.dot(Vec3A::new(4.0767416621, -3.3077115913, 0.2309699292)),
+        lms.dot(Vec3A::new(-1.2684380046, 2.6097574011, -0.3413193965)),
+        lms.dot(Vec3A::new(-0.0041960863, -0.7034186147, 1.7076147010)),
+    );
+
+    O::linear_to_srgb(rgb) // encode all three channels with a single vector call
 }
 
 #[allow(unused)]
@@ -200,43 +217,58 @@ mod tests {
         }
     }
 
+    pub struct Scalar<O>(O); // adapter exposing the Vec3A-based `Operations` as f32 -> f32 functions for the tests
+    impl<O: Operations> Scalar<O> {
+        fn srgb_to_linear(c: f32) -> f32 { // copy `c` into every lane, convert, read the x lane back
+            O::srgb_to_linear(Vec3A::splat(c)).x
+        }
+        fn linear_to_srgb(c: f32) -> f32 { // copy `c` into every lane, convert, read the x lane back
+            O::linear_to_srgb(Vec3A::splat(c)).x
+        }
+        fn cbrt(x: f32) -> f32 { // copy `x` into every lane, take the cube root, read the x lane back
+            O::cbrt(Vec3A::splat(x)).x
+        }
+    }
+    type RefScalar = Scalar<Reference>; // scalar view of the `Reference` implementation
+    type FastScalar = Scalar<Fast>; // scalar view of the `Fast` implementation
+
+
     #[test]
     fn test_linear_srgb() {
         for c in 0..=255 {
             let c = c as f32 / 255.0;
-            let l = Reference::srgb_to_linear(c);
-            let c2 = Reference::linear_to_srgb(l);
+            let l = RefScalar::srgb_to_linear(c); // reference decode of one 8-bit level scaled to 0..=1
+            let c2 = RefScalar::linear_to_srgb(l); // re-encode; the assert below requires a round trip within 1e-6
 
             assert!((c - c2).abs() < 1e-6, "{c} -> {c2}");
         }
 
         for c in 0..=255 {
             let c = c as f32 / 255.0;
-            let l = Fast::srgb_to_linear(c);
-            let c2 = Fast::linear_to_srgb(l);
+            let l = FastScalar::srgb_to_linear(c); // same round trip through the fast approximations
+            let c2 = FastScalar::linear_to_srgb(l); // NOTE(review): the `c2` range assert below prints {l}, not {c2} - confirm
 
             assert!((c - c2).abs() < 2.5e-3, "{c} -> {c2}");
             assert!((0.0..=1.0).contains(&l), "{c} -> {l}");
             assert!((0.0..=1.0).contains(&c2), "{c} -> {l}");
         }
 
-        assert_eq!(Reference::srgb_to_linear(0.0), 0.0);
-        assert!((Reference::srgb_to_linear(1.0) - 1.0).abs() < 1e-6);
-        assert_eq!(Fast::linear_to_srgb(0.0), 0.0);
-        assert!((Fast::srgb_to_linear(1.0) - 1.0).abs() < 1e-6);
+        assert_eq!(RefScalar::srgb_to_linear(0.0), 0.0); // endpoint: reference decode of 0.0 is exactly 0.0
+        assert!((RefScalar::srgb_to_linear(1.0) - 1.0).abs() < 1e-6); // endpoint: reference decode of 1.0 is 1.0 within 1e-6
+        assert_eq!(FastScalar::linear_to_srgb(0.0), 0.0); // NOTE(review): checks linear_to_srgb while its neighbours check srgb_to_linear - confirm intended
+        assert!((FastScalar::srgb_to_linear(1.0) - 1.0).abs() < 1e-6); // endpoint: fast decode of 1.0 is 1.0 within 1e-6
     }
 
     #[test]
     fn test_error_fast_srgb_to_linear() {
         assert_eq!(
-            get_error_stats(Reference::srgb_to_linear, Fast::srgb_to_linear),
+            get_error_stats(RefScalar::srgb_to_linear, FastScalar::srgb_to_linear), // f32 adapters stand in for the Vec3A-based trait functions
             "Error: avg=0.00002514 max=0.00013047 for 0.999"
         );
     }
     #[test]
     fn test_error_fast_linear_to_srgb() {
         assert_eq!(
-            get_error_stats(Reference::linear_to_srgb, Fast::linear_to_srgb),
+            get_error_stats(RefScalar::linear_to_srgb, FastScalar::linear_to_srgb), // f32 adapters stand in for the Vec3A-based trait functions
             "Error: avg=0.00105457 max=0.00236702 for 0.732"
         );
     }