diff --git a/extra/PROFILING.md b/extra/PROFILING.md new file mode 100644 index 00000000000..219eb04385b --- /dev/null +++ b/extra/PROFILING.md @@ -0,0 +1,312 @@ +# Haxe Compiler Profiling Guide & Analysis + +## How to Build a Profiling-Enabled Binary + +The `src/dune` file includes a `profile` build environment that adds DWARF +debug info (`-g`) and keeps optimisations (`-O2`), so `perf` can resolve +OCaml symbols without a significant performance penalty: + +```bash +eval $(opam env) +dune build --profile profile src/haxe.exe +# binary is _build/default/src/haxe.exe +``` + +## How to Profile + +### Built-in Timer Breakdown (recommended first step) + +```bash +# Overall phase timing +haxe --cwd tests/unit compile-macro.hxml --times + +# Detailed per-method eval timing +haxe --cwd tests/unit compile-macro.hxml --times -D times.eval + +# Detailed filter + analyzer timing +haxe --cwd tests/unit compile-macro.hxml --times \ + -D times.filter=2 -D times.analyzer=2 + +# HXB-specific timing +haxe --cwd tests/unit compile-hxb-interp-roundtrip.hxml --times -D times.hxb +``` + +### `perf` (Linux, requires root or `perf_event_paranoid <= 1`) + +```bash +sudo sysctl -w kernel.perf_event_paranoid=-1 + +# Record with DWARF call-graph unwinding +perf record -F 999 -g --call-graph dwarf -o eval.perf \ + haxe --cwd tests/unit compile-macro.hxml + +# Flat profile (top self-time functions) +perf report -i eval.perf --no-children --stdio --call-graph=none + +# Callers of a specific symbol +perf report -i eval.perf --children --stdio \ + --call-graph=caller --symbol-filter=compare_val +``` + +### `olly` (OCaml 5 runtime-events GC profiler) + +```bash +opam install runtime_events_tools +olly gc-stats -- haxe --cwd tests/unit compile-macro.hxml +``` + +### `memtrace` (allocation profiling) + +**Note:** `memtrace` does **not** work with OCaml 5 multicore (`Gc.Memprof` +is disabled). Use `olly gc-stats` for GC overhead metrics, or build a +single-domain binary to use `memtrace`. 
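
### Adding a custom `--times` entry

The labels in the `--times` breakdown map to `Timer` calls in the compiler
source (for example, `["generate";"hxb"]` in `src/compiler/generate.ml`
shows up as `generate/hxb`). A new phase can be surfaced by wrapping it with
`Timer.time`. The sketch below is only illustrative: the signature is
inferred from the `Timer.time` call in `src/context/parallel.ml` further
down in this diff, and `run_my_pass` plus the `["filters";"my_pass"]` label
are made-up placeholders:

```ocaml
(* Sketch: Timer.time ctx labels f arg runs (f arg) and records the elapsed
   time under the given label path; signature inferred from its use in
   src/context/parallel.ml. run_my_pass and the label are placeholders. *)
let run_my_pass com =
	(* ... the work to be timed ... *)
	ignore com

let run_my_pass_timed timer_ctx com =
	Timer.time timer_ctx ["filters"; "my_pass"] run_my_pass com
```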
+ +--- + +## Profiling Results — Eval Unit Tests (~2.7 s) + +### `--times` Phase Breakdown + +| Phase | Time (s) | % | +|-------|----------|---| +| Macro execution | 1.06 | 36 | +| — `ancestorHasInitializeUtest` (utest build macro) | 0.23 | 8 | +| Typing | 0.76 | 26 | +| Parsing | 0.33 | 11 | +| Filters | 0.31 | 11 | +| — `handle_abstract_casts` | 0.05 | 2 | +| — `fix_return_dynamic_from_void_function` | 0.06 | 2 | +| Analyzer | 0.24 | 8 | +| — fusion / fuse | 0.07 | 2 | +| Interp (eval JIT) | 0.16 | 6 | + +### `perf` Flat Profile (Top Self-Time Functions) + +| % | Symbol | Category | +|---|--------|----------| +| 13.5 | `do_some_marking` | **GC major marking** | +| 5.3 | `caml_shared_try_alloc` | GC allocation | +| 3.7 | `oldify_one` | GC minor→major promotion | +| 3.1 | `compare_val` | **Polymorphic comparison** | +| 2.6 | `pool_sweep` | GC sweep | +| 2.4 | `Texpr.map_expr` | Type expression traversal | +| 2.4 | `caml_hash` | Hash-table operations | +| 2.4 | `oldify_mopup` | GC | +| 1.0 | `TFunctions.follow` | Type follow | +| 0.96 | `caml_alloc_string` | String allocation | +| 0.84 | `Stdlib.List.map` | List processing | +| 0.70 | `Dce.expr` | Dead-code elimination | +| 0.62 | `Stdlib.Map.find` | Map lookup | +| 0.45 | `EvalJit.loop` | JIT compilation | + +### `olly` GC Statistics + +| Metric | Value | +|--------|-------| +| Wall time | 2.98 s | +| CPU time | 5.27 s | +| GC time | 1.38 s | +| **GC overhead (% of CPU)** | **26.3 %** | +| Domain 0 GC overhead | 30.7 % | +| Worker domains (1-3) GC | 19-22 % | +| P99 GC latency | 3.0 ms | +| Max GC latency | 6.0 ms | + +--- + +## Profiling Results — HXB Roundtrip + +### Write Phase (+0.38 s over normal eval) + +The HXB writing adds `generate/hxb` at 12-13 % of total time. `perf` shows +`HxbWriter.loop` (0.61 %) and `HxbWriter.write_type_instance` (0.38 %) as +the top writer functions. `Zlib.update_crc` (1.19 %) appears due to +zip compression. + +**After switching to `Stored` (level 0) compression**, `Zlib.update_crc` +disappears from the profile entirely. + +### Read Phase (~0.85 s — 3× faster than full compile) + +| Phase | Time (s) | % | +|-------|----------|---| +| Typing (HXB deserialization) | 0.33 | 39 | +| Interp (eval JIT) | 0.30 | 35 | +| Filters | 0.11 | 13 | +| hxblib I/O | 0.03 | 4 | +| — `get bytes` (zip read) | 0.02 | 72 % of hxblib | + +With `Stored` compression, `hxblib get bytes` dropped from **0.090 s → 0.021 s** +(4.3× faster). Archive size went from 3.6 MB → 6.5 MB (1.8× larger). + +### Roundtrip `olly` GC Statistics + +| Metric | Value | +|--------|-------| +| Wall time | 3.49 s | +| GC time | 1.61 s | +| **GC overhead** | **43.5 %** | +| Domain 0 GC overhead | 32.1 % | + +The higher GC overhead in roundtrip comes from the HXB write phase creating +many temporary serialization buffers. + +--- + +## Identified Hotspots & Recommendations + +### 1. GC Pressure (~26 % of CPU) + +The OCaml 5 GC (with multicore support) shows significant overhead. +`do_some_marking` alone accounts for 13.5 % of self-time. While OCaml's +generational GC handles short-lived allocations well, the sheer volume of +allocations in type traversal (`Texpr.map_expr`, `TFunctions.follow`, +`Stdlib.List.map`) creates GC pressure. + +**Note:** `memtrace` is incompatible with OCaml 5 multicore. To profile +allocations at the source level, either: +- Build a single-domain (non-multicore) OCaml switch and use `memtrace`, or +- Use OCaml 5's `runtime_events` with `olly gc-stats` for aggregate GC + metrics (already done above). + +### 2. 
Polymorphic Comparison (`compare_val`, 3.1 %) + +`perf` shows `compare_val` consumes 3.1 % of total time, called via both +`caml_compare` (2 %) and `caml_equal` (1 %). These are triggered by OCaml's +polymorphic `(=)` and `compare` operators. + +**Important:** OCaml's native compiler specialises `=` when the types +are known at compile-time, but **only for types whose constructors are all +constant** (take no arguments). If any constructor of the type carries +data (e.g. `Var of var_kind`), OCaml emits a call to `caml_equal` even +when the concrete values at runtime might be constant constructors. + +One exception: when one side of `=` is a **literal constant constructor** +(e.g. `x = Const`), OCaml recognises that the constant constructor is +an immediate and emits a direct `cmpq` regardless of whether the type +has structured variants. + +Confirmed by inspecting the generated assembly (`ocamlfind ocamlopt -S`): + +| Expression | Type | Assembly | Polymorphic? | +|---|---|---|---| +| `(a : method_kind) = (b : method_kind)` | all-constant ctors | `cmpq` | No | +| `(m : method_kind) = MethDynamic` | literal constant | `cmpq` | No | +| `!(a.a_status) = Const` | literal constant | `cmpq` | No | +| `mkind m1 = mkind m2` | `int = int` | `cmpq` | No | +| `(a : field_kind) = (b : field_kind)` | has `Var of var_kind` | `caml_equal` | **Yes** | +| `(a : tconstant) = (b : tconstant)` | has `TInt of int32` etc. | `caml_equal` | **Yes** | +| `e.eexpr = TConst TSuper` | literal structured ctor | `caml_equal` | **Yes** (but shallow) | +| `(a : var_access) = (b : var_access)` | has `AccRequire of ...` | `caml_equal` | **Yes** | +| `(a : path) = (b : path)` | `string list * string` | `caml_equal` | **Yes** | + +**Confirmed polymorphic call sites:** + +1. **`src/typing/typeloadCheck.ml:187`** — `| a, b when a = b -> ()`: + compares two `field_kind` variables. `field_kind` has `Var of var_kind`, + so OCaml cannot specialise this. Fixing: decompose into a pattern match + or a custom `field_kind_eq` helper. +2. **`src/typing/fields.ml:141,194`** — `e.eexpr = TConst TSuper`: + `texpr_expr` is massively structured. However, this comparison is + **shallow** — `caml_equal` checks the constructor tag first, and both + `TConst` and `TSuper` are quickly resolved. Low priority. +3. **`src/optimization/optimizerTexpr.ml:189`** — `a = b` comparing two + `tconstant` values (has `TInt of int32`, `TString of string`, etc.). +4. **Path comparisons** (~10 sites) — `c.cl_path = path` compares + `string list * string` tuples. +5. **`src/typing/nullSafety.ml`** — polymorphic `Hashtbl` with + `safety_subject` keys (a variant with `SFieldOfClass of path * string list` + etc.). Every `Hashtbl.find`/`Hashtbl.mem`/`Hashtbl.replace` call + triggers both `caml_hash` and `caml_equal`. + +**Not polymorphic (previously incorrectly listed):** + +- `typeloadCheck.ml:66` — `not (m1 = MethDynamic)`: `method_kind` has only + constant constructors → direct `cmpq`. +- `typeloadCheck.ml:432` — `mkind m1 = mkind m2`: projects to `int` first + → direct `cmpq`. +- `fields.ml:41` — `!(a.a_status) = Const`: comparing against a literal + constant constructor → direct `cmpq`. + +**Recommendation:** The total cost is modest (3.1 %). The most impactful +fix would be switching `nullSafety.ml` to functorized hash tables with a +custom hash/equal for `safety_subject`, which would also eliminate the +`caml_hash` overhead (2.4 % of perf time, much of which likely comes from +these tables). 
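
A minimal sketch of that functorization, assuming a simplified
`safety_subject` (the constructors below are illustrative placeholders,
not the real definition in `nullSafety.ml`):

```ocaml
(* Sketch only: placeholder constructors standing in for the real
   safety_subject defined in nullSafety.ml. *)
type safety_subject =
	| SLocalVar of int
	| SFieldOfClass of (string list * string) * string list

module SubjectKey = struct
	type t = safety_subject

	(* Monomorphic equality: field-by-field comparison with type-specific
	   functions, so no call to caml_equal is emitted. *)
	let equal a b = match a, b with
		| SLocalVar i, SLocalVar j ->
			i = j
		| SFieldOfClass ((pack1, name1), fields1), SFieldOfClass ((pack2, name2), fields2) ->
			String.equal name1 name2
			&& List.equal String.equal pack1 pack2
			&& List.equal String.equal fields1 fields2
		| _ ->
			false

	(* Hash a small projection of the value instead of the full structure. *)
	let hash = function
		| SLocalVar i -> i land max_int
		| SFieldOfClass ((_, name), fields) -> Hashtbl.hash (name, List.length fields)
end

module SubjectTbl = Hashtbl.Make (SubjectKey)

(* Call sites then move from polymorphic Hashtbl.find/mem/replace to the
   monomorphic SubjectTbl equivalents, e.g.: *)
let subjects : bool SubjectTbl.t = SubjectTbl.create 64
let mark subject = SubjectTbl.replace subjects subject true
```

This is untested against the real `safety_subject`, but it shows the shape
of the change: `Hashtbl.Make` takes the custom `equal`/`hash`, so lookups no
longer run `caml_hash` and `caml_equal` over the full structure.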
The `field_kind = field_kind` comparison at
`typeloadCheck.ml:187` can be replaced with a pattern match.

### 3. HXB Zip I/O

Changing from `Deflated` (level 6) to `Stored` (level 0) eliminates all
compression/decompression overhead. This was implemented in this PR.

- Read-phase `get bytes`: 0.090 s → 0.021 s (4.3× improvement)
- Archive size: 3.6 MB → 6.5 MB (1.8× increase)

### 4. HxbWriter Type Instance Handling

The writer already deduplicates type instances within expression contexts
via `write_texpr_type_instance` (serialises to bytes, then interns via
`StringPool`). Top-level field-signature type writes are not deduplicated
but account for far fewer calls. The `perf` data shows
`HxbWriter.write_type_instance` at only 0.38 % of total time, so further
deduplication here would yield diminishing returns.

### 5. Domain Management — Domainslib replaced with WorkerPool

**Problem:** Domainslib's `Task.setup_pool` spawns N OS-level domains that
spin-wait on a lock-free multi-channel. Even when there is no work to do,
each domain busy-loops, consuming CPU. For single-file compilations or
eval-only runs, the pool is acquired but domains sit idle for 80 %+ of the
compilation.

**Solution (implemented in this PR):** Replaced Domainslib entirely with a
custom `WorkerPool` in `parallel.ml`. The pool uses `Domain.spawn` for
workers that block on `Condition.wait` between calls — **zero CPU when
idle**, unlike Domainslib's spin-wait.

The `domainslib` dependency has been removed from `src/dune` and
`haxe.opam`.

**Architecture:**

```
                     ┌─── Worker 0: Condition.wait → process chunk → signal done ───┐
  submit(length, f) ─┼─── Worker 1: Condition.wait → process chunk → signal done ───┼→ all done
                     ├─── Worker 2: Condition.wait → process chunk → signal done ───┤
                     └─── Main domain: process chunk 0 → wait for workers ──────────┘
```

- `WorkerPool.create nw`: spawns `nw` worker domains that immediately
  block on `Condition.wait`. Zero CPU.
- `WorkerPool.parallel_for pool length f`: partitions `[0..length-1]`
  into contiguous chunks across `nw+1` domains (workers + main). Workers
  are woken via `Condition.broadcast`, process their chunk, then signal
  completion via a counter + `Condition.signal`.
- `WorkerPool.teardown pool`: sets a `stop` flag, broadcasts, joins all
  worker domains.
- **Nested call detection:** A `bool Atomic.t` `busy` flag prevents
  nested `parallel_for` calls (e.g. analyzer iterating types → iterating
  fields) from corrupting shared state. Nested calls fall back to
  sequential execution, matching Domainslib's effective behaviour for the
  same code paths.
- **Exception propagation:** The first exception from any domain (worker or
  main) is captured with its backtrace and re-raised after all domains finish.

**`ManagedPool`** is retained as a thin wrapper that lazily creates a
`WorkerPool` on first use and tears it down on `release`. Workers sleep
between `run_with_pool` scopes — zero CPU overhead.

**Benchmark results** (4-core CI runner, median of 5 runs for eval, 3 for JVM):

| Benchmark | Domainslib | WorkerPool | Delta |
|---|---|---|---|
| Eval unit tests | 2762 ms | 2809 ms | +1.7 % (noise) |
| JVM compilation | 1062 ms | 1070 ms | +0.8 % (noise) |

Throughput is within measurement noise — the WorkerPool is not faster for
active parallel work.
The primary benefit is eliminating idle CPU +consumption: Domainslib workers spin-wait on a lock-free channel even when +no work is available, whereas WorkerPool workers block on `Condition.wait` +and consume zero CPU between parallel sections. This matters for the +compilation server where the pool persists across requests. diff --git a/haxe.opam b/haxe.opam index 4ad52d6a811..433e0d32e1d 100644 --- a/haxe.opam +++ b/haxe.opam @@ -19,7 +19,7 @@ build: [ install: [make "install" "INSTALL_DIR=%{prefix}%"] remove: [make "uninstall" "INSTALL_DIR=%{prefix}%"] depends: [ - "ocaml" {>= "5.0"} # required by domainslib for multicore support + "ocaml" {>= "5.0"} # required for Domain.spawn multicore support "ocamlfind" {build} "dune" {>= "3.17"} "sedlex" {>= "2.0"} @@ -34,7 +34,6 @@ depends: [ "luv" {>= "0.5.13"} "ipaddr" "terminal_size" - "domainslib" {>= "0.5.2"} "saturn" "thread-local-storage" "dynamic_gc" diff --git a/src/compiler/generate.ml b/src/compiler/generate.ml index 99069915330..7c28890c452 100644 --- a/src/compiler/generate.ml +++ b/src/compiler/generate.ml @@ -72,7 +72,7 @@ let check_hxb_output com config = let path = Str.global_replace (Str.regexp "\\$target") (platform_name com.platform) path in let t = Timer.start_timer com.timer_ctx ["generate";"hxb"] in Path.mkdir_from_path path; - let zip = new Zip_output.zip_output path 6 in + let zip = new Zip_output.zip_output path 0 in let export com config = let cc = CommonCache.get_cache com in let target = Common.platform_name_macro com in diff --git a/src/compiler/server/serverCompilationContext.ml b/src/compiler/server/serverCompilationContext.ml index 99c04334354..7f5118753d3 100644 --- a/src/compiler/server/serverCompilationContext.ml +++ b/src/compiler/server/serverCompilationContext.ml @@ -40,7 +40,7 @@ let create_version () = } let create verbose is_server = - let pool = Parallel.ManagedPool.create (fun () -> Domainslib.Task.setup_pool ~num_domains:(Domain.recommended_domain_count() - 1) ()) in + let pool = Parallel.ManagedPool.create () in { is_server; version = create_version (); diff --git a/src/context/parallel.ml b/src/context/parallel.ml index 4f1a825fe42..b0e7cc6d60a 100644 --- a/src/context/parallel.ml +++ b/src/context/parallel.ml @@ -1,35 +1,195 @@ let enable = ref true -let run_parallel_for num_domains ?(chunk_size=0) length f = - if not !enable then begin - for i = 0 to length - 1 do - f i - done - end else - let pool = Domainslib.Task.setup_pool ~num_domains:(num_domains - 1) () in - Domainslib.Task.run pool (fun _ -> Domainslib.Task.parallel_for pool ~chunk_size ~start:0 ~finish:(length-1) ~body:f); - Domainslib.Task.teardown_pool pool +(* Minimum number of items to justify spawning domains. + Below this threshold we run sequentially on the calling domain. *) +let min_parallel_items = 8 + +let num_domains () = + Domain.recommended_domain_count () + +(* ────────────────────────────────────────────────────────────────────── + WorkerPool — a reusable pool of OS domains that sleep (via + Condition.wait, zero CPU) between parallel_for calls. + + Workers are spawned once when the pool is created and joined when it + is torn down. Between calls they block on a condition variable — + no spin-wait, no OS thread creation/destruction per call. + + Nested parallel_for calls (e.g. iterating over types, where each + type iterates over fields) are detected via an atomic [busy] flag + and fall back to sequential execution. 
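+
+   Typical usage (sketch; [items] and [process] are placeholders):
+
+     let wp = WorkerPool.create (Domain.recommended_domain_count () - 1) in
+     WorkerPool.parallel_for wp (Array.length items) (fun i -> process items.(i));
+     WorkerPool.teardown wp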
+ ────────────────────────────────────────────────────────────────────── *) +module WorkerPool = struct + type t = { + (* Total domains = num_workers + 1 (main). *) + num_workers : int; + (* Dispatch a parallel-for: [submit length f] calls [f 0] … [f (length-1)]. *) + submit : int -> (int -> unit) -> unit; + (* Tear down workers (join). *) + shutdown : unit -> unit; + } + + (* Create a pool with [nw] worker domains (0 = sequential). *) + let create nw = + if nw <= 0 then { + num_workers = 0; + submit = (fun length f -> for i = 0 to length - 1 do f i done); + shutdown = (fun () -> ()); + } + else begin + (* -- shared mutable state, all protected by [mu] -- *) + let mu = Mutex.create () in + let work_avail = Condition.create () in + let all_done = Condition.create () in + let body = ref (fun (_:int) -> ()) in + let chunk_starts = Array.make nw 0 in + let chunk_ends = Array.make nw (-1) in (* -1 = empty range *) + let gen = ref 0 in + let remaining = ref 0 in + let stop = ref false in + (* First exception (with backtrace) from any domain. *) + let exc : (exn * Printexc.raw_backtrace) option Atomic.t = Atomic.make None in + (* True while a parallel_for is in progress. Nested calls + (from within the body of a parallel_for) fall back to + sequential to avoid corrupting the pool's shared state. *) + let busy = Atomic.make false in + + (* Worker loop — runs on each spawned domain. *) + let worker idx = + let my_gen = ref 0 in + let running = ref true in + while !running do + Mutex.lock mu; + while !gen = !my_gen && not !stop do + Condition.wait work_avail mu + done; + if !stop then begin + Mutex.unlock mu; + running := false + end else begin + my_gen := !gen; + let f = !body in + let s = chunk_starts.(idx) in + let e = chunk_ends.(idx) in + Mutex.unlock mu; + (* Execute the assigned chunk. *) + (try for j = s to e do f j done + with exn -> + let bt = Printexc.get_raw_backtrace () in + ignore (Atomic.compare_and_set exc None (Some (exn, bt)))); + (* Signal completion. *) + Mutex.lock mu; + decr remaining; + if !remaining = 0 then Condition.signal all_done; + Mutex.unlock mu + end + done + in + + let workers = Array.init nw (fun idx -> + Domain.spawn (fun () -> worker idx) + ) in + + let nd = nw + 1 in (* total domains *) + + let submit length f = + if length <= 0 then () + else if length < min_parallel_items || not (Atomic.compare_and_set busy false true) then + (* Too few items or nested call — run sequentially. *) + for i = 0 to length - 1 do f i done + else begin + let n = min nd length in + let chunk = length / n in + let rem = length mod n in + (* Prepare worker chunks (workers process chunks 1 … n-1). *) + Mutex.lock mu; + body := f; + Atomic.set exc None; + for i = 0 to nw - 1 do + let ci = i + 1 in + if ci < n then begin + chunk_starts.(i) <- ci * chunk + min ci rem; + chunk_ends.(i) <- (ci + 1) * chunk + min (ci + 1) rem - 1 + end else begin + chunk_starts.(i) <- 0; + chunk_ends.(i) <- -1 (* empty range — finishes instantly *) + end + done; + remaining := nw; + incr gen; + Condition.broadcast work_avail; + Mutex.unlock mu; + (* Main domain processes chunk 0. *) + let e0 = chunk + (if rem > 0 then 1 else 0) - 1 in + (try for j = 0 to e0 do f j done + with exn -> + let bt = Printexc.get_raw_backtrace () in + ignore (Atomic.compare_and_set exc None (Some (exn, bt)))); + (* Wait for all workers to finish. *) + Mutex.lock mu; + while !remaining > 0 do + Condition.wait all_done mu + done; + Mutex.unlock mu; + Atomic.set busy false; + (* Re-raise the first captured exception, if any. 
*) + match Atomic.get exc with + | Some (exn, bt) -> Printexc.raise_with_backtrace exn bt + | None -> () + end + in + + let shutdown () = + Mutex.lock mu; + stop := true; + Condition.broadcast work_avail; + Mutex.unlock mu; + Array.iter Domain.join workers + in + + { num_workers = nw; submit; shutdown } + end + + let parallel_for pool length f = pool.submit length f + let teardown pool = pool.shutdown () +end + +(* ────────────────────────────────────────────────────────────────────── + Public API — kept identical to the old Domainslib-based API so that + callers (filters.ml, genjvm.ml, etc.) need minimal changes. + The pool token is now [WorkerPool.t option] instead of + [Domainslib.Task.pool option]. + ────────────────────────────────────────────────────────────────────── *) + +let run_parallel_for nd ?(chunk_size=0) length f = + ignore chunk_size; (* kept for API compat *) + if not !enable then + for i = 0 to length - 1 do f i done + else begin + let wp = WorkerPool.create (nd - 1) in + Std.finally (fun () -> WorkerPool.teardown wp) (WorkerPool.parallel_for wp length) f + end module ParallelArray = struct + (** [iter pool f a]: iterate [f] over [a] in parallel when [pool = Some wp]. *) let iter pool f a = match pool with | None -> Array.iter f a - | Some pool -> - let f' idx = f a.(idx) in - Domainslib.Task.parallel_for pool ~start:0 ~finish:(Array.length a - 1) ~body:f' + | Some wp -> + WorkerPool.parallel_for wp (Array.length a) (fun i -> f a.(i)) + (** [map pool f a default]: map [f] over [a] in parallel when [pool = Some wp]. *) let map pool f a x = match pool with | None -> Array.map f a - | Some pool -> + | Some wp -> let length = Array.length a in let a_out = Array.make length x in - let f' idx = - Array.unsafe_set a_out idx (f (Array.unsafe_get a idx)) - in - Domainslib.Task.parallel_for pool ~start:0 ~finish:(length - 1) ~body:f'; + WorkerPool.parallel_for wp length (fun i -> + Array.unsafe_set a_out i (f (Array.unsafe_get a i)) + ); a_out end @@ -38,19 +198,18 @@ module ParallelSeq = struct ParallelArray.iter pool f (Array.of_seq seq) end -(* A pool that can be acquired on demand and released when idle. - Unlike AtomicLazy, supports teardown and re-creation so that - domain workers don't stay alive during phases that don't need them - (e.g. eval interpretation). *) +(* A managed pool that lazily creates a WorkerPool on first use and + keeps it alive (workers sleeping via Condition.wait — zero CPU) + until [release] is called. This matches the old Domainslib lifecycle. 
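+
+   Lifecycle sketch from a caller's perspective ([f] and [arr] are
+   placeholders):
+
+     let mp = Parallel.ManagedPool.create () in
+     Parallel.run_with_pool mp (fun pool -> Parallel.ParallelArray.iter pool f arr);
+     Parallel.ManagedPool.release mp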
*) module ManagedPool = struct type t = { - setup : unit -> Domainslib.Task.pool; - mutable pool : Domainslib.Task.pool option; + num_domains : int; + mutable pool : WorkerPool.t option; mutex : Mutex.t; } - let create setup = { - setup; + let create () = { + num_domains = Domain.recommended_domain_count (); pool = None; mutex = Mutex.create (); } @@ -58,18 +217,18 @@ module ManagedPool = struct let acquire mp = Mutex.protect mp.mutex (fun () -> match mp.pool with - | Some p -> p + | Some wp -> wp | None -> - let p = mp.setup () in - mp.pool <- Some p; - p + let wp = WorkerPool.create (mp.num_domains - 1) in + mp.pool <- Some wp; + wp ) let release mp = Mutex.protect mp.mutex (fun () -> match mp.pool with - | Some p -> - Domainslib.Task.teardown_pool p; + | Some wp -> + WorkerPool.teardown wp; mp.pool <- None | None -> () ) @@ -83,13 +242,15 @@ end let run_in_new_pool timer_ctx f = if not !enable then f None - else - let pool = Timer.time timer_ctx ["domainslib";"setup"] (Domainslib.Task.setup_pool ~num_domains:(Domain.recommended_domain_count() - 1)) () in - Std.finally (fun () -> Timer.time timer_ctx ["domainslib";"teardown"] Domainslib.Task.teardown_pool pool) (Domainslib.Task.run pool) (fun () -> f (Some pool)) + else begin + let nd = Timer.time timer_ctx ["parallel";"setup"] num_domains () in + let wp = WorkerPool.create (nd - 1) in + Std.finally (fun () -> WorkerPool.teardown wp) f (Some wp) + end let run_with_pool mp f = if not !enable then f None else - let pool = ManagedPool.acquire mp in - Domainslib.Task.run pool (fun () -> f (Some pool)) + let wp = ManagedPool.acquire mp in + f (Some wp) diff --git a/src/dune b/src/dune index 49e2115649e..9d156b7db62 100644 --- a/src/dune +++ b/src/dune @@ -23,7 +23,7 @@ unix ipaddr str bigarray threads dynlink xml-light extlib sha terminal_size luv - domainslib saturn thread-local-storage dynamic_gc + saturn thread-local-storage dynamic_gc ) (modules (:standard \ haxe prebuild)) (preprocess (per_module