runmat-org · finrunsfar · Nov 24, 2025 · Nov 24, 2025 · Nov 24, 2025 · Nov 24, 2025
diff --git a/benchmarks/4k-image-processing/README.md b/benchmarks/4k-image-processing/README.md
@@ -1,4 +1,4 @@
-# 4K Image Preprocessing, GPU‑accelerated
+# Image Preprocessing, GPU‑accelerated
 
 If you ship geospatial or vision workloads, you’ve likely written this stage countless times: standardize each 4K tile, apply a small radiometric correction, gamma‑correct, and run a quick QC metric. On CPUs this is fine—until the batch grows and your wall‑clock explodes. 
 
@@ -10,9 +10,16 @@ The math is deliberately simple and realistic: compute a per‑image mean and st
 
 ## Results
 
-![RunMat is 8.1x faster than NumPy](https://web.runmatstatic.com/4k-image-processing_speedup.svg)
-
+![RunMat is up to 10x faster than NumPy](https://web.runmatstatic.com/4k-image-processing_speedup-b.svg)
 
+### 4K Image Pipeline Perf Sweep (B = batch size)
+| B | RunMat (ms) | PyTorch (ms) | NumPy (ms) | NumPy ÷ RunMat | PyTorch ÷ RunMat |
+|---|---:|---:|---:|---:|---:|
+| 4  | 142.97 | 801.29 | 500.34 | 3.50× | 5.60× |
+| 8  | 212.77 | 808.92 | 939.27 | 4.41× | 3.80× |
+| 16 | 241.56 | 907.73 | 1783.47 | 7.38× | 3.76× |
+| 32 | 389.25 | 1141.92 | 3605.95 | 9.26× | 2.93× |
+| 64 | 683.54 | 1203.20 | 6958.28 | 10.18× | 1.76× |
 ---
 
 ## Core implementation in RunMat (MATLAB-syntax)
@@ -45,7 +52,7 @@ Note: MATLAB’s license agreement restricts usage of their runtime for benchmar
 
 ## Why RunMat is fast (accelerate + fusion)
 
-<insert blurb about accelerate and fusion here, direct to the individual doc for more detail>
+RunMat fuses the elementwise stages (standardization, radiometric correction, gamma correction) and keeps tensors resident on device between steps, while the per‑image mean and standard‑deviation reductions execute in large, coalesced kernels—a strong fit for GPUs. For the big picture on fusion and residency, see the [Introduction to RunMat on the GPU](https://github.com/runmat-org/runmat/blob/main/docs/INTRODUCTION_TO_RUNMAT_GPU.md) document.
 
 ---
 

diff --git a/benchmarks/elementwise-math/README.md b/benchmarks/elementwise-math/README.md
@@ -18,7 +18,23 @@ The scripts scale the number of samples via `ELM_POINTS` (default
 
 ## Results
 
-![RunMat is up to 100x faster](https://web.runmatstatic.com/elementwise-math_speedup.svg)
+![RunMat is up to 144x faster than PyTorch](https://web.runmatstatic.com/elementwise-math_speedup-b.svg)
+
+
+### Elementwise Math Perf Sweep (points)
+| points | RunMat (ms) | PyTorch (ms) | NumPy (ms) | NumPy ÷ RunMat | PyTorch ÷ RunMat |
+|---|---:|---:|---:|---:|---:|
+| 1M   | 145.15 | 856.41  |   72.39 | 0.50× | 5.90× |
+| 2M   | 149.75 | 901.05  |   79.49 | 0.53× | 6.02× |
+| 5M   | 145.14 | 1,111.16 |  119.45 | 0.82× | 7.66× |
+| 10M  | 143.39 | 1,377.43 |  154.38 | 1.08× | 9.61× |
+| 100M | 144.81 | 16,404.22 | 1,073.09 | 7.41× | 113.28× |
+| 200M | 156.94 | 16,558.98 | 2,114.66 | 13.47× | 105.51× |
+| 500M | 137.58 | 17,882.11 | 5,026.94 | 36.54× | 129.97× |
+| 1B | 144.40 | 20,841.42 | 11,931.93 | 82.63× | 144.34× |
+
+*M = 10⁶ elements, B = 10⁹ elements.*
+
 
 ---
 

diff --git a/benchmarks/monte-carlo-analysis/README.md b/benchmarks/monte-carlo-analysis/README.md
@@ -14,7 +14,18 @@ price  = mean(payoff) · exp(−μ T Δt)
 
 ## Results
 
-![RunMat is up to 138x faster than NumPy](https://web.runmatstatic.com/monte-carlo-analysis_speedup.svg)
+![RunMat is up to 131x faster than NumPy](https://web.runmatstatic.com/monte-carlo-analysis_speedup-b.svg)
+
+### Monte Carlo Perf Sweep 
+| Paths (simulations) | RunMat (ms) | PyTorch (ms) | NumPy (ms) | NumPy ÷ RunMat | PyTorch ÷ RunMat |
+|--------------------:|-----------:|-------------:|-----------:|---------------:|-----------------:|
+| 250k   | 108.58 |   824.42 |  4,065.87 | 37.44× | 7.59× |
+| 500k   | 136.10 |   900.11 |  8,206.56 | 60.30× | 6.61× |
+| 1M     | 188.00 |   894.32 | 16,092.49 | 85.60× | 4.76× |
+| 2M     | 297.65 | 1,108.80 | 32,304.64 |108.53× | 3.73× |
+| 5M     | 607.36 | 1,697.59 | 79,894.98 |131.55× | 2.80× |
+
+*250k = 250,000 paths, 1M = 1,000,000 paths, etc.*
 
 ---
 

diff --git a/website/app/global-error.tsx b/website/app/global-error.tsx
@@ -40,3 +40,4 @@ export default function GlobalError({ error, reset }: GlobalErrorProps) {
   );
 }
 
+
diff --git a/website/app/layout.tsx b/website/app/layout.tsx
@@ -182,7 +182,7 @@ export default function RootLayout({
         />
         <GoogleAnalytics />
       </head>
-      <body className={`${inter.className} antialiased`}>
+      <body className={`${inter.className} antialiased`} suppressHydrationWarning>
         <GoogleTagManager />
         <ThemeProvider
           attribute="class"

diff --git a/website/app/page.tsx b/website/app/page.tsx
@@ -175,8 +175,11 @@ export default function HomePage() {
               <p>
                 <span className="font-semibold">Monte Carlo:</span> geometric Brownian motion with terminal PnL and risk stats.
               </p>
+              <p>
+                <span className="font-semibold">Elementwise math:</span> long chain of sin, exp, cos, and tanh operations on big 1D arrays.
+              </p>
               <p className="text-sm">
-                Each number is the mean of <span className="font-semibold">3 runs</span>. Full scripts live in the{" "}
+                Each number is the median of <span className="font-semibold">3 runs</span>. Full scripts live in the{" "}
                 <a
                   href="https://github.com/runmat-org/runmat/tree/main/benchmarks"
                   className="underline"

diff --git a/website/components/benchmarks/BenchmarkSweepCarousel.tsx b/website/components/benchmarks/BenchmarkSweepCarousel.tsx
@@ -7,9 +7,9 @@ import MonteCarloSweep from "./MonteCarloSweep";
 import ElementwiseMathSweep from "./ElementwiseMathSweep";
 
 const slides = [
-  { id: "4k", component: FourKImagePipelineSweep },
-  { id: "monte-carlo", component: MonteCarloSweep },
-  { id: "elementwise", component: ElementwiseMathSweep },
+  { id: "4k", component: FourKImagePipelineSweep, href: "/benchmarks/4k-image-processing" },
+  { id: "monte-carlo", component: MonteCarloSweep, href: "/benchmarks/monte-carlo-analysis" },
+  { id: "elementwise", component: ElementwiseMathSweep, href: "/benchmarks/elementwise-math" },
 ];
 
 export default function BenchmarkSweepCarousel() {
@@ -31,9 +31,9 @@ export default function BenchmarkSweepCarousel() {
         >
           ←
         </button>
-        <div className="w-full max-w-[42rem]">
+        <a className="w-full max-w-[42rem]" href={activeSlide.href}>
           <Component />
-        </div>
+        </a>
         <button
           type="button"
           onClick={goNext}

diff --git a/website/components/benchmarks/ElementwiseMathSweep.tsx b/website/components/benchmarks/ElementwiseMathSweep.tsx
@@ -6,11 +6,11 @@ type ElementwiseRow = {
 };
 
 const ELEMENTWISE_ROWS: ElementwiseRow[] = [
-  { points: "10M", runMatMs: 174, vsPyTorch: "8× faster", vsNumPy: "≈ same speed" },
-  { points: "100M", runMatMs: 171, vsPyTorch: "99× faster", vsNumPy: "6× faster" },
-  { points: "200M", runMatMs: 203, vsPyTorch: "86× faster", vsNumPy: "11× faster" },
-  { points: "500M", runMatMs: 172, vsPyTorch: "110× faster", vsNumPy: "35× faster" },
-  { points: "1B", runMatMs: 199, vsPyTorch: "114× faster", vsNumPy: "63× faster" },
+  { points: "10M", runMatMs: 143, vsPyTorch: "9.6× faster", vsNumPy: "1.1× faster" },
+  { points: "100M", runMatMs: 145, vsPyTorch: "113.3× faster", vsNumPy: "7.4× faster" },
+  { points: "200M", runMatMs: 157, vsPyTorch: "105.5× faster", vsNumPy: "13.5× faster" },
+  { points: "500M", runMatMs: 138, vsPyTorch: "130× faster", vsNumPy: "36.5× faster" },
+  { points: "1B", runMatMs: 144, vsPyTorch: "144.3× faster", vsNumPy: "82.6× faster" },
 ];
 
 export default function ElementwiseMathSweep() {
@@ -71,10 +71,10 @@ export default function ElementwiseMathSweep() {
                   RunMat speedup range
                 </td>
                 <td className="px-3 sm:px-6 py-2 sm:py-3 text-center">
-                  8× – 114× vs PyTorch
+                  9.6× – 144.3× vs PyTorch
                 </td>
                 <td className="px-3 sm:px-6 py-2 sm:py-3 text-center">
-                  6× – 63× vs NumPy
+                  1.1× – 82.6× vs NumPy
                 </td>
               </tr>
             </tfoot>

diff --git a/website/components/benchmarks/FourKImagePipelineSweep.tsx b/website/components/benchmarks/FourKImagePipelineSweep.tsx
@@ -6,11 +6,11 @@ type SweepRow = {
 };
 
 const SWEEP_ROWS: SweepRow[] = [
-  { batchSize: 4, runMatMs: 204, vsPyTorch: "4.5× faster", vsNumPy: "2.6× faster" },
-  { batchSize: 8, runMatMs: 265, vsPyTorch: "3.3× faster", vsNumPy: "3.5× faster" },
-  { batchSize: 16, runMatMs: 299, vsPyTorch: "3.3× faster", vsNumPy: "6.1× faster" },
-  { batchSize: 32, runMatMs: 493, vsPyTorch: "2.2× faster", vsNumPy: "7.6× faster" },
-  { batchSize: 64, runMatMs: 871, vsPyTorch: "1.5× faster", vsNumPy: "8.3× faster" },
+  { batchSize: 4, runMatMs: 143, vsPyTorch: "5.6× faster", vsNumPy: "3.5× faster" },
+  { batchSize: 8, runMatMs: 213, vsPyTorch: "3.8× faster", vsNumPy: "4.4× faster" },
+  { batchSize: 16, runMatMs: 242, vsPyTorch: "3.8× faster", vsNumPy: "7.4× faster" },
+  { batchSize: 32, runMatMs: 389, vsPyTorch: "2.9× faster", vsNumPy: "9.3× faster" },
+  { batchSize: 64, runMatMs: 684, vsPyTorch: "1.8× faster", vsNumPy: "10.2× faster" },
 ];
 
 export default function FourKImagePipelineSweep() {
@@ -79,10 +79,10 @@ export default function FourKImagePipelineSweep() {
                   RunMat speedup range
                 </td>
                 <td className="px-3 sm:px-6 py-2 sm:py-3 text-center">
-                  1.5× – 4.5× vs PyTorch
+                  1.8× – 5.6× vs PyTorch
                 </td>
                 <td className="px-3 sm:px-6 py-2 sm:py-3 text-center">
-                  2.6× – 8.3× vs NumPy
+                  3.5× – 10.2× vs NumPy
                 </td>
               </tr>
             </tfoot>

diff --git a/website/components/benchmarks/MonteCarloSweep.tsx b/website/components/benchmarks/MonteCarloSweep.tsx
@@ -6,11 +6,11 @@ type MonteCarloRow = {
 };
 
 const MONTE_CARLO_ROWS: MonteCarloRow[] = [
-  { paths: "250k", runMatMs: 168, vsPyTorch: "5.1× faster", vsNumPy: "24.7× faster" },
-  { paths: "500k", runMatMs: 166, vsPyTorch: "5.3× faster", vsNumPy: "50.1× faster" },
-  { paths: "1M", runMatMs: 240, vsPyTorch: "4.0× faster", vsNumPy: "69.1× faster" },
-  { paths: "2M", runMatMs: 343, vsPyTorch: "3.3× faster", vsNumPy: "96.3× faster" },
-  { paths: "5M", runMatMs: 652, vsPyTorch: "2.8× faster", vsNumPy: "125.8× faster" },
+  { paths: "250k", runMatMs: 109, vsPyTorch: "7.6× faster", vsNumPy: "37.4× faster" },
+  { paths: "500k", runMatMs: 136, vsPyTorch: "6.6× faster", vsNumPy: "60.3× faster" },
+  { paths: "1M", runMatMs: 188, vsPyTorch: "4.8× faster", vsNumPy: "85.6× faster" },
+  { paths: "2M", runMatMs: 298, vsPyTorch: "3.7× faster", vsNumPy: "108.5× faster" },
+  { paths: "5M", runMatMs: 607, vsPyTorch: "2.8× faster", vsNumPy: "131.5× faster" },
 ];
 
 export default function MonteCarloSweep() {
@@ -71,10 +71,10 @@ export default function MonteCarloSweep() {
                   RunMat speedup range
                 </td>
                 <td className="px-3 sm:px-6 py-2 sm:py-3 text-center">
-                  2.8× – 5.3× vs PyTorch
+                  2.8× – 7.6× vs PyTorch
                 </td>
                 <td className="px-3 sm:px-6 py-2 sm:py-3 text-center">
-                  24.7× – 125.8× vs NumPy
+                  37.4× – 131.5× vs NumPy
                 </td>
               </tr>
             </tfoot>
Original file line number	Diff line number	Diff line change
Expand Up		@@ -40,3 +40,4 @@ export default function GlobalError({ error, reset }: GlobalErrorProps) {
		);
		}