# Draft pull request for all changes for ECOOP 26 submission #31

## Purpose

This directory contains a Python re-implementation of the Haskell Criterion methodology to run executables (instead of Haskell functions, as Criterion normally does).
One could call it "benchrunner-runner", because its purpose is to run `benchrunner` many times and compute the appropriate run-time statistics.

We take as input a path to some program `prog` (meant to be the `benchrunner`) with the following interface:

- `prog` takes `iters` as a command-line argument,
- `prog` measures the run time of a function of interest in a tight loop that repeats `iters` times, and finally
- `prog` prints to stdout the batchtime (total loop time) and selftimed (total loop time divided by `iters`).
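
As a concrete illustration of this interface, here is a minimal stand-in for `prog` (a sketch only; the exact stdout format of the real `benchrunner` is assumed from the description above, and `sum(range(1000))` is a placeholder workload):

```python
#!/usr/bin/env python3
# Minimal stand-in for `prog`/benchrunner, illustrating the expected
# interface: take `iters`, time a tight loop, print BATCHTIME/SELFTIMED.
import sys
import time

def run_batch(iters):
    start = time.perf_counter()
    for _ in range(iters):
        sum(range(1000))  # placeholder for the function of interest
    batchtime = time.perf_counter() - start
    return batchtime, batchtime / iters

if __name__ == "__main__":
    iters = int(sys.argv[1])
    batchtime, selftimed = run_batch(iters)
    print(f"BATCHTIME: {batchtime}")
    print(f"SELFTIMED: {selftimed}")
```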

The ultimate goal is then to sweep `iters` and perform a linear regression of `batchtime` against `iters`.
The slope is the mean per-iteration time, and the y-intercept represents some notion of shared overhead, insensitive to `iters`.
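
The regression step can be sketched on synthetic data (the 2 ms per-iteration cost and 5 ms overhead here are made-up numbers, not measurements):

```python
import numpy as np

# Synthetic (iters, batchtime) samples: 2 ms per iteration plus 5 ms
# of iters-insensitive overhead, mimicking the model described above.
iters = np.array([1, 2, 4, 8, 16, 32], dtype=float)
batchtime = 2.0e-3 * iters + 5.0e-3

# Least-squares fit: slope = mean per-iteration time, intercept = overhead.
slope, intercept = np.polyfit(iters, batchtime, 1)
```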

## Run

This package contains two scripts:

- `sweep_seq.py` (top level)
- `criterionmethodology.py` (called by `sweep_seq.py`)

Both can be run directly, e.g.:

```shellsession
criterionmethodology benchrunner Quicksort Seq 2000
```

will call `benchrunner iters Quicksort Seq 2000` for various values of `iters`.

`sweep_seq` performs a logarithmic sweep over different array sizes, invoking `criterionmethodology.py` at each point.

## Arithmetic vs geometric mean

Since performance data is non-negative and judged multiplicatively (twice as good means the numbers are halved, twice as bad means the numbers are doubled; these are all *factors*), the geomean and geometric standard deviation may make more sense theoretically.
However, from some testing, the geomean seems to vary wildly for programs with fleeting execution times, even between repeated runs with the same parameters.

In particular, to compute the geomean, we:

- take the logarithm of all the `x` and `y` values,
- compute a linear regression over that, then
- exponentiate the y-intercept.
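
These steps can be sketched directly (synthetic power-law data, not real measurements; the factor 3e-3 is an arbitrary choice):

```python
import math
import numpy as np

# Synthetic data following y = e^b * x^m with e^b = 3e-3 and m = 1.
xs = np.array([1.0, 2.0, 4.0, 8.0, 16.0])
ys = 3.0e-3 * xs

# Regress in log-log space, then exponentiate the intercept.
power, ln_c = np.polyfit(np.log(xs), np.log(ys), 1)
geomean_factor = math.exp(ln_c)
```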

The other dependent quantity, the slope, becomes a power (the model is `y = e^b * x^m`), which represents *geometric overhead*, i.e. how much overhead is added per iteration.
This may model slowdowns well, e.g. ones arising from pre-allocating arrays.

**`criterionmethodology.py`** (new file):

```python
#!/usr/bin/env python
#
# The script determines the cost of one iteration of a function (in seconds) using an executable that
#
# - runs `iters` iterations of that function in a tight loop and
# - prints out the time it took to run them.
#
# Example call:
#
#   ./criterionmethodology.py $(cabal list-bin benchrunner) Quicksort Seq 2000
#
# In particular, we
#
# - run the given executable (the first and only relevant argument) with the 'iters' argument varied from 1 to N;
#   N and the step size are dynamically determined based on the time it takes to run the binary;
# - fetch timing results from the binary's stdout and do linear regression over them;
# - plot the regression (see the `plot` function) in `plot.png`.
#
# Growing the `iters` parameter is the main ingenuity of the script. It follows the Criterion methodology:
# running the given binary for a small number of iterations, doubling them every time, and upon reaching
# a certain threshold (FIRST_ITER_THRESHOLD), increasing them linearly until the overall execution time
# reaches another threshold (TOTAL_TIME_THRESHOLD) seconds.
#
# - The `converge` function runs the whole process, starting with a small number of iterations.
# - The `grow_iters` function encodes the methodology for increasing 'iters'.
# - The `do_bench` function runs the binary and scrapes the output, so the expected binary interface is encoded in it.
#

import math
import os
import subprocess
from sys import argv
from time import time

import numpy as np
from matplotlib import pyplot as plt

LOG = True
MAKE_PLOT = False
FIRST_ITER_THRESHOLD = 3e-6  # 0.65
TOTAL_TIME_THRESHOLD = 1     # 3.5
# ^^ Joseph's original values, but they are too high for my machine.

# Poor man's logging
def log(msg, **kwargs):
    if LOG:
        print(msg, **kwargs)

def linear_regression_with_std(x, y):
    x = np.array(x)
    y = np.array(y)
    x_mean = np.mean(x)
    y_mean = np.mean(y)
    numerator = np.sum((x - x_mean) * (y - y_mean))
    denominator = np.sum((x - x_mean) ** 2)
    slope = numerator / denominator
    intercept = y_mean - slope * x_mean
    y_pred = slope * x + intercept
    residuals = y - y_pred
    std_dev = np.std(residuals)
    return slope, intercept, std_dev

# Do one trial: run the binary with the given arguments, including the given `iters`,
# and return the batch time. Expects lines of the form "BATCHTIME: <float>" and
# "SELFTIMED: <float>" on the binary's stdout.
def do_bench(cliargs, iters):
    out = subprocess.check_output([cliargs[0], str(iters)] + cliargs[1:]).decode()
    batchtime = selftimed = None
    for line in out.splitlines():
        if line.startswith("BATCHTIME:"):
            batchtime = float(line.split(":", 1)[1])
        elif line.startswith("SELFTIMED:"):
            selftimed = float(line.split(":", 1)[1])
    # log(f"ITERS: {iters}, BATCHTIME: {batchtime}, SELFTIMED: {selftimed}")
    return batchtime

# Increase 'iters' and do one trial with that. Store results in xs and ys. Return new iters.
def grow_iters(iters, cliargs, start_time, xs, ys):
    if time() - start_time < TOTAL_TIME_THRESHOLD:
        iters = int(math.trunc(float(iters) * 1.2) + 1)
    else:
        iters += 1 + iters // 20
    log(str(iters) + " ", end="", flush=True)
    st = do_bench(cliargs, iters)
    xs.append(iters)
    ys.append(st)
    return iters

def plot(xs, ys, b, c, m, p):
    plotfile = "plot.png"
    if os.path.exists(plotfile):
        os.remove(plotfile)
    plt.plot(xs, ys, 'rx')
    plt.plot([xs[0], xs[-1]], [m * xs[0] + b, m * xs[-1] + b], color="blue")
    plt.plot(xs, [c * x ** p for x in xs], color="green")
    plt.savefig(plotfile)

# Main function to run the iteration experiment.
# - cliargs is a list of command-line arguments WITHOUT the current script's name (argv[0]); in particular:
#   - the first argument is the path to the binary, and
#   - the rest is simply the arguments to pass to the binary.
def converge(cliargs):
    bin_name = cliargs[0].rsplit('/', 1)[-1]  # Get the binary name from the path
    log("Converge on: " + str([bin_name] + cliargs[1:]))
    log("iters: ", end="")
    xs = []
    ys = []
    iters = 1
    t = time()

    # First find a starting point for `iters` where the time is at least FIRST_ITER_THRESHOLD seconds
    while len(xs) == 0:
        log(str(iters) + " ", end="", flush=True)
        st = do_bench(cliargs, iters)
        if st < FIRST_ITER_THRESHOLD:  # Artem: Joseph had `st * iters < ...` here but I think it's a typo
            iters *= 2
            continue
        xs.append(iters)
        ys.append(st)

    log(" | ", end="", flush=True)
    # Do two more trials, increasing iters regardless of time
    for _ in range(2):
        iters = grow_iters(iters, cliargs, t, xs, ys)

    log(" | ", end="", flush=True)
    # Keep increasing iters until we reach TOTAL_TIME_THRESHOLD seconds of execution in total
    while time() - t < TOTAL_TIME_THRESHOLD:
        iters = grow_iters(iters, cliargs, t, xs, ys)
    log("done!")

    m, b, sig = linear_regression_with_std(xs, ys)
    p, lnc, lngsd = linear_regression_with_std([math.log(x) for x in xs], [math.log(y) for y in ys])
    c, gsd = math.exp(lnc), math.exp(lngsd)

    log(f"Slope (Mean): {m:.2e}, Stdev: {sig:.2e}, Intercept (Overhead): {b:.2e}")
    log(f"Factor (Geomean): {c:.2e}, GeoStdev: {gsd:.2e}, Power (Distortion): {p:.2e}")

    if MAKE_PLOT:
        plot(xs, ys, b, c, m, p)

    return m, sig, c, gsd

if __name__ == "__main__":
    converge(argv[1:])
```
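
For intuition, the `iters` growth schedule used by the script can be traced in isolation (`growth` below is a hypothetical helper that mirrors the script's update rule, not part of the script itself):

```python
import math

# Mirror of the update rule: geometric (×1.2, rounded down, plus one)
# while under the total time budget, then roughly +5% per step.
def growth(iters, under_time_budget):
    if under_time_budget:
        return int(math.trunc(float(iters) * 1.2) + 1)
    return iters + 1 + iters // 20

# Trace ten geometric-phase steps starting from iters = 1.
seq = [1]
for _ in range(10):
    seq.append(growth(seq[-1], True))
# seq → [1, 2, 3, 4, 5, 7, 9, 11, 14, 17, 21]
```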

**`sweep_seq.py`** (new file):

```python
#!/usr/bin/env python3
import sys

import numpy as np

from criterionmethodology import converge

# names = ["Optsort", "Insertionsort", "Mergesort", "Quicksort"]
# names = ["CopyArray", "Quicksort", "Insertionsort", "Mergesort"]
names = ["Insertionsort"]

# DENSITY = 4
DENSITY = 12

# Per-benchmark bounds for the size sweep: sizes range over 2**lo .. 2**hi.
def bounds(name):
    match name:
        case "Insertionsort":
            lo = 3   # 2**n ...
            hi = 12  # for local testing; initially: 16
        case "Quicksort":
            lo = 3
            hi = 22
        case "Mergesort":
            # lo = 12
            lo = 3
            hi = 24
        case "Cilksort":
            # lo = 12
            lo = 3
            hi = 16  # 24
        case "Optsort":
            lo = 3
            hi = 16  # 24
        case _:
            lo = 3
            hi = 20
    return lo, hi, (hi - lo) * DENSITY + 1

def dotrial(exe, name, size):
    return converge([exe, name, "Seq", str(int(size))])

if __name__ == "__main__":
    exe = sys.argv[1]
    print("Running with executable:", exe)
    for name in names:
        lo, hi, pts = bounds(name)
        with open("%s_out3.csv" % name, "w") as f:
            f.write("# size\tmean\tstddev\tgeomean\tgeostdev\n")
        for i in np.unique(np.logspace(lo, hi, pts, base=2).astype(int)):  # Artem: I don't understand this and I must
            with open("%s_out3.csv" % name, "a") as f:
                f.write("%d" % int(i) + "\t%f\t%f\t%f\t%f\n" % dotrial(exe, name, i))
```
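
Regarding the `np.logspace` call in the sweep loop: it produces points evenly spaced in the *exponent*, so with `base=2` the sweep covers sizes from `2**lo` to `2**hi`; casting to `int` and applying `np.unique` then removes duplicate small sizes. A small sketch:

```python
import numpy as np

# Three points evenly spaced in the exponent between 2**3 and 2**5.
sizes = np.logspace(3, 5, 3, base=2).astype(int)
# → [8, 16, 32]

# Denser grids collide after int-casting at the low end;
# np.unique removes the repeats.
dense = np.unique(np.logspace(1, 2, 13, base=2).astype(int))
# → [2, 3, 4]
```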

Changes to `module Array` (diff markers reconstructed from the hunk headers):

```diff
@@ -14,7 +14,8 @@ module Array
     Array

     -- * Construction and querying
-  , alloc, make, generate, generate_par, generate_par_m, makeArray
+  , alloc, make, allocScratch
+  , generate, generate_par, generate_par_m, makeArray
   , copy, copy_par, copy_par_m
   , size, get, set, slice, append
   , splitAt
@@ -95,9 +96,25 @@ makeArray = make
 #endif

 {-# INLINE free #-}
-free :: HasPrim a => Array a -. ()
+free :: Array a -. ()
 free = Unsafe.toLinear (\_ -> ())

+{-# INLINE allocScratch #-} -- todo: are we linear in the use of the algorithm?
+{-@ allocScratch :: forall <p :: Array dsts -> Bool>. n:Nat -> x:_
+      -> f:({xs:_ | size xs == n } -> { ys:_ | size ys == n }
+            -> { tup:(Array<p> dsts, Array tmpdsts) |
+                   token (fst tup) == token xs & token (snd tup) == token ys &
+                   size (fst tup) == size xs & size (snd tup) == size ys &
+                   left (fst tup) == left xs & left (snd tup) == left ys &
+                   right (fst tup) == right xs & right (snd tup) == right ys })
+      -> { src:_ | size src == n } -> { dst:Array<p> dsts | token src == token dsts } @-}
+allocScratch :: HasPrim tmps => Int -> tmps -> (Array srcs -. Array tmps -. (Array dsts, Array tmpdsts))
+             -. Array srcs -. Array dsts
+allocScratch i a f arr =
+  let
+    !(dst, tmp) = f arr (makeArray i a)
+  in case free tmp of !() -> dst

 --------------------------------------------------------------------------------
 -- Parallel operations
 --------------------------------------------------------------------------------
```

This file was deleted.

Collaborator comment: I'm a little surprised you don't need the `HasPrim` context for all configurations to typecheck.