From 6164c1955a7795312929ca194b8bc26d21098cb4 Mon Sep 17 00:00:00 2001
From: Ralf Anton Beier <ralf_beier@me.com>
Date: Fri, 20 Mar 2026 20:52:40 +0100
Subject: [PATCH] feat: sub-word memory ops, memory.size/grow, compilation
 chain artifacts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sub-word loads/stores (15 new WASM opcodes):
- i32: load8_s/u, load16_s/u, store8, store16
- i64: load8_s/u, load16_s/u, load32_s/u, store8, store16, store32
- ARM encoding: LDRB/LDRSB/LDRH/LDRSH/STRB/STRH with Thumb-2 narrow/wide
- Bounds checking works uniformly across all sub-word operations
- 42 new tests (decoder, instruction selector, encoder)

memory.size / memory.grow:
- memory.size: LSR R10, #16 (R10 = memory bytes, /65536 for pages)
- memory.grow: returns -1 on embedded (fixed memory cannot grow)

Compilation chain rivet artifacts (4 new files, 40+ artifacts):
- compilation-chain.yaml: CC-001..008 — full pipeline stages
- kiln-builtins-api.yaml: KB-001..005, KB-TR-001..005 — C ABI bridge
- static-linking.yaml: SL-001..007, SL-TR-001..003 — linker requirements
- e2e-verification.yaml: E2E-VER-001..010 — end-to-end test plan

802 tests (up from 766), clippy clean, fmt clean.

Implements: FR-002
Implements: FR-003
Trace: skip

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 artifacts/compilation-chain.yaml              | 317 ++++++
 artifacts/e2e-verification.yaml               | 384 +++++++
 artifacts/kiln-builtins-api.yaml              | 385 +++++++
 artifacts/static-linking.yaml                 | 373 +++++++
 crates/synth-backend/src/arm_encoder.rs       | 685 +++++++++++++
 crates/synth-core/src/wasm_decoder.rs         | 265 +++++
 crates/synth-core/src/wasm_op.rs              |  27 +
 .../src/instruction_selector.rs               | 941 ++++++++++++++++++
 crates/synth-synthesis/src/rules.rs           |  37 +
 9 files changed, 3414 insertions(+)
 create mode 100644 artifacts/compilation-chain.yaml
 create mode 100644 artifacts/e2e-verification.yaml
 create mode 100644 artifacts/kiln-builtins-api.yaml
 create mode 100644 artifacts/static-linking.yaml

diff --git a/artifacts/compilation-chain.yaml b/artifacts/compilation-chain.yaml
new file mode 100644
index 0000000..36ba8d3
--- /dev/null
+++ b/artifacts/compilation-chain.yaml
@@ -0,0 +1,317 @@
+# Compilation Chain Architecture (ASPICE SYS.2 / SYS.3)
+#
+# System: Synth -- WebAssembly-to-ARM Cortex-M AOT compiler
+#
+# Defines the end-to-end compilation chain from WASM Component through
+# Meld fusing to core WASM, Synth AOT compilation to relocatable object,
+# static linking with kiln-builtins.a, and final firmware.elf for Cortex-M.
+#
+# Pipeline:
+#   WASM Component --> Meld fuse --> Core WASM --> Synth AOT --> module.o
+#   --> link with kiln-builtins.a --> firmware.elf --> Cortex-M
+#
+# Format: rivet generic-yaml
+
+artifacts:
+  # ---------------------------------------------------------------------------
+  # System-level requirements for the compilation chain
+  # ---------------------------------------------------------------------------
+
+  - id: CC-001
+    type: system-req
+    title: End-to-end compilation chain from WASM Component to firmware.elf
+    description: >
+      Synth shall support a complete compilation chain that transforms a
+      WebAssembly Component into a firmware.elf suitable for flashing to
+      ARM Cortex-M targets. The chain consists of five stages: (1) Meld
+      fuses the WASM Component into a core WASM module, resolving component
+      imports/exports via BA RFC #46 lowering; (2) Synth AOT-compiles the
+      core WASM to ARM Thumb-2 machine code, emitting a relocatable ELF
+      object (module.o) with undefined symbols for host intrinsics;
+      (3) kiln-builtins.a provides the static library resolving those
+      symbols (import dispatch, memory base, cabi_realloc); (4) the ARM
+      cross-linker combines module.o + kiln-builtins.a + linker script
+      into firmware.elf; (5) firmware.elf is flashed or loaded into the
+      target. Each stage has defined inputs, outputs, and system
+      requirements.
+    status: draft
+    tags: [compilation-chain, pipeline, end-to-end]
+    links:
+      - type: derives-from
+        target: BR-002
+      - type: derives-from
+        target: BR-003
+      - type: refines
+        target: FR-002
+      - type: traces-to
+        target: CM-002
+      - type: traces-to
+        target: ARCH-003
+    fields:
+      req-type: functional
+      priority: must
+      verification-criteria: >
+        A WASM component completes all five pipeline stages and produces
+        a firmware.elf that boots on a Cortex-M4 Renode emulation target.
+
+  - id: CC-002
+    type: system-req
+    title: Stage 1 -- Meld component fusing to core WASM
+    description: >
+      The first pipeline stage uses Meld to fuse a WASM Component into a
+      core WASM module. Meld resolves component-level imports and exports,
+      lowers the Component Model binary format per BA RFC #46, and produces
+      a core module with canonical ABI wrappers for host intrinsic calls.
+      The output core module contains import declarations for host functions
+      that will be resolved by kiln-builtins at link time. Meld is an
+      external tool (meld: prefix) not built by synth.
+    status: draft
+    tags: [compilation-chain, meld, fusing, stage-1]
+    links:
+      - type: derives-from
+        target: BR-003
+      - type: refines
+        target: FR-001
+      - type: traces-to
+        target: CM-002
+    fields:
+      req-type: functional
+      priority: must
+      stage: 1
+      inputs: ["WASM Component (.wasm)", "WIT interface definitions (.wit)"]
+      outputs: ["Core WASM module (.wasm) with host import declarations"]
+      tool: meld
+      verification-criteria: >
+        Meld-fused core module validates with wasm-tools validate;
+        import declarations match kiln-builtins ABI signatures.
+
+  - id: CC-003
+    type: system-req
+    title: Stage 2 -- Synth AOT compilation to relocatable object
+    description: >
+      The second pipeline stage uses Synth to AOT-compile the core WASM
+      module to ARM Thumb-2 machine code, producing a relocatable ELF
+      object file (module.o). The object contains: .text section with
+      compiled functions, .wasm_linear_memory reservation, undefined
+      symbols for each host import (BL __meld_dispatch_import or per-import
+      BL __meld_import_N), and a .meld_import_table section encoding the
+      import metadata. The output is a standard ELF relocatable (ET_REL)
+      that any ARM toolchain linker can process.
+    status: draft
+    tags: [compilation-chain, synth, aot, stage-2, relocatable]
+    links:
+      - type: derives-from
+        target: BR-002
+      - type: derives-from
+        target: BR-003
+      - type: refines
+        target: FR-002
+      - type: refines
+        target: FR-005
+      - type: traces-to
+        target: ARCH-002
+      - type: traces-to
+        target: ARCH-003
+    fields:
+      req-type: functional
+      priority: must
+      stage: 2
+      inputs: ["Core WASM module (.wasm)", "Target profile (--cortex-m4)"]
+      outputs: ["Relocatable ELF object (module.o) with undefined host symbols"]
+      tool: synth
+      crate: synth-cli
+      verification-criteria: >
+        readelf -h module.o shows ET_REL type; readelf -s shows UND symbols
+        for __meld_dispatch_import or __meld_import_N; .meld_import_table
+        section present in readelf -S output.
+
+  - id: CC-004
+    type: system-req
+    title: Stage 3 -- kiln-builtins static library provides host runtime
+    description: >
+      The third pipeline stage requires kiln-builtins.a, a no_std static
+      library cross-compiled for the ARM Cortex-M target. kiln-builtins.a
+      provides the extern "C" symbols that synth-compiled code references:
+      __meld_dispatch_import (or per-import __meld_import_N),
+      __meld_get_memory_base, and cabi_realloc. The library is built from
+      the kiln repository using the ARM cross-compilation toolchain
+      (arm-none-eabi-gcc or rust target thumbv7em-none-eabihf). This is
+      an external dependency (kiln: prefix).
+    status: draft
+    tags: [compilation-chain, kiln-builtins, static-library, stage-3]
+    links:
+      - type: derives-from
+        target: BR-002
+      - type: derives-from
+        target: BR-003
+      - type: refines
+        target: FR-004
+      - type: traces-to
+        target: CM-002
+      - type: traces-to
+        target: CM-003
+      - type: traces-to
+        target: kiln:REQ_FUNC_014
+      - type: traces-to
+        target: kiln:REQ_HELPER_ABI_001
+    fields:
+      req-type: interface
+      priority: must
+      stage: 3
+      inputs: ["kiln-builtins source (Rust, no_std)", "ARM target triple"]
+      outputs: ["kiln-builtins.a (static library for ARM Cortex-M)"]
+      tool: "cargo build --target thumbv7em-none-eabihf (in kiln repo)"
+      verification-criteria: >
+        arm-none-eabi-nm kiln-builtins.a shows defined symbols for
+        __meld_dispatch_import, __meld_get_memory_base, cabi_realloc;
+        library compiled with no_std, no unwinding, thumb-2 ISA.
+
+  - id: CC-005
+    type: system-req
+    title: Stage 4 -- ARM cross-linker produces firmware.elf
+    description: >
+      The fourth pipeline stage uses the ARM cross-linker
+      (arm-none-eabi-ld or lld) to combine module.o + kiln-builtins.a
+      using a linker script that defines the memory layout for the target
+      board. The linker resolves all undefined symbols from module.o
+      against kiln-builtins.a, places .text in FLASH, .wasm_linear_memory
+      in RAM, .meld_import_table in FLASH (read-only), and produces the
+      final firmware.elf (ET_EXEC). The linker script is generated by
+      synth's LinkerScriptGenerator or provided externally. This stage
+      ties to issue #27 (ARM cross-compilation toolchain).
+    status: draft
+    tags: [compilation-chain, linker, firmware, stage-4, issue-27]
+    links:
+      - type: derives-from
+        target: BR-002
+      - type: derives-from
+        target: BR-003
+      - type: refines
+        target: FR-002
+      - type: refines
+        target: TR-002
+      - type: traces-to
+        target: ARCH-003
+      - type: traces-to
+        target: ZI-008
+    fields:
+      req-type: functional
+      priority: must
+      stage: 4
+      github-issue: 27
+      inputs:
+        - "module.o (from stage 2)"
+        - "kiln-builtins.a (from stage 3)"
+        - "linker script (.ld)"
+      outputs: ["firmware.elf (ET_EXEC, loadable)"]
+      tool: arm-none-eabi-ld or lld
+      verification-criteria: >
+        readelf -h firmware.elf shows ET_EXEC; no undefined symbols
+        (readelf -s | grep UND shows only expected libc/startup symbols);
+        LOAD segments map to target memory regions (FLASH at board flash
+        base, RAM at 0x20000000).
+
+  - id: CC-006
+    type: system-req
+    title: Stage 5 -- Firmware deployment to Cortex-M target
+    description: >
+      The fifth pipeline stage deploys firmware.elf to the target
+      Cortex-M device via flashing (J-Link, OpenOCD, west flash) or
+      loading into an emulator (Renode). For Renode testing, the ELF
+      is loaded via sysbus LoadELF. For Zephyr targets, west flash
+      handles the deployment via the board's configured runner. The
+      deployed firmware shall boot from the vector table, execute the
+      Reset_Handler, initialize WASM linear memory, and begin executing
+      compiled WASM functions.
+    status: draft
+    tags: [compilation-chain, deployment, flash, stage-5]
+    links:
+      - type: derives-from
+        target: BR-003
+      - type: refines
+        target: FR-002
+      - type: refines
+        target: FR-005
+      - type: traces-to
+        target: VER-005
+      - type: traces-to
+        target: TP-006
+      - type: traces-to
+        target: ZI-006
+    fields:
+      req-type: functional
+      priority: must
+      stage: 5
+      inputs: ["firmware.elf"]
+      outputs: ["Running firmware on Cortex-M target"]
+      tool: "Renode (emulation) or J-Link/OpenOCD (hardware)"
+      verification-criteria: >
+        Renode boots firmware.elf and reaches user code;
+        hardware test on nRF52840-DK produces expected output via UART.
+
+  # ---------------------------------------------------------------------------
+  # Pipeline orchestration requirements
+  # ---------------------------------------------------------------------------
+
+  - id: CC-007
+    type: sw-req
+    title: Synth CLI pipeline orchestration for multi-stage build
+    description: >
+      synth-cli shall support a pipeline mode (synth build or synth pipeline)
+      that orchestrates stages 2-4 in a single invocation when provided with
+      a core WASM module, a kiln-builtins.a path, and a target profile.
+      The CLI shall invoke the synthesis engine, emit module.o to a temporary
+      location, invoke the ARM linker with the generated linker script, and
+      produce firmware.elf as output. This reduces the manual steps from
+      three commands to one. Stage 1 (meld fuse) and stage 5 (deployment)
+      remain external.
+    status: planned
+    tags: [compilation-chain, cli, orchestration, pipeline]
+    links:
+      - type: derives-from
+        target: CC-001
+      - type: derives-from
+        target: CC-003
+      - type: derives-from
+        target: CC-005
+      - type: refines
+        target: TR-003
+    fields:
+      req-type: functional
+      priority: should
+      crate: synth-cli
+      verification-criteria: >
+        synth build --input module.wasm --builtins kiln-builtins.a
+        --target cortex-m4 -o firmware.elf produces a valid ELF;
+        equivalent to manual synth compile + arm-none-eabi-ld invocation.
+
+  - id: CC-008
+    type: sw-req
+    title: Relocatable ELF emission with Meld integration sections
+    description: >
+      synth-backend shall emit relocatable ELF objects (ET_REL) containing
+      the following sections when compiling modules with host imports:
+      .text (compiled ARM Thumb-2 code), .symtab/.strtab (symbol table
+      with UND entries for host imports), .meld_import_table (binary
+      table mapping import indices to symbol names), and .rel.text
+      (relocation entries for BL instructions targeting import symbols).
+      The ELF shall be linkable by any ARM ELF-compatible linker without
+      requiring synth-specific tooling.
+    status: planned
+    tags: [compilation-chain, elf, relocatable, meld-sections]
+    links:
+      - type: derives-from
+        target: CC-003
+      - type: derives-from
+        target: CM-002
+      - type: refines
+        target: TR-002
+    fields:
+      req-type: functional
+      priority: must
+      crate: synth-backend
+      verification-criteria: >
+        readelf -S module.o shows .meld_import_table section;
+        readelf -r module.o shows R_ARM_THM_CALL relocations for import
+        BL instructions; arm-none-eabi-ld resolves all symbols when
+        linked with kiln-builtins.a.
diff --git a/artifacts/e2e-verification.yaml b/artifacts/e2e-verification.yaml
new file mode 100644
index 0000000..9589025
--- /dev/null
+++ b/artifacts/e2e-verification.yaml
@@ -0,0 +1,384 @@
+# End-to-End Verification Plan (ASPICE SYS.5 / SWE.6)
+#
+# System: Synth -- WebAssembly-to-ARM Cortex-M AOT compiler
+#
+# Verification artifacts for the compilation chain, covering the first PoC
+# (simple WASM with import), WASI subset test, and component round-trip.
+# These verification activities exercise the full pipeline from WASM input
+# through Meld fusing, Synth AOT compilation, static linking with
+# kiln-builtins.a, to execution on Renode Cortex-M4 emulation.
+#
+# Format: rivet generic-yaml
+
+artifacts:
+  # ---------------------------------------------------------------------------
+  # PoC 1: Simple WASM with single import
+  # ---------------------------------------------------------------------------
+
+  - id: E2E-VER-001
+    type: sys-verification
+    title: "PoC: Simple WASM with host import -- compile, link, execute"
+    description: >
+      First proof-of-concept test validating the minimal compilation chain.
+      A simple WASM module with one exported function and one host import
+      (e.g., env.print_i32) is compiled through synth, linked with a
+      minimal kiln-builtins stub providing __meld_dispatch_import, and
+      executed on Renode. The test verifies: (1) synth emits BL
+      __meld_dispatch_import for the import call, (2) the linker resolves
+      the symbol against kiln-builtins.a, (3) the compiled code calls the
+      host function with the correct argument, (4) the host function
+      executes and produces observable output (UART print). This is the
+      simplest possible end-to-end test of the import dispatch mechanism.
+    status: planned
+    tags: [e2e, poc, import-dispatch, renode]
+    links:
+      - type: verifies
+        target: CC-001
+      - type: verifies
+        target: CC-003
+      - type: verifies
+        target: CC-005
+      - type: verifies
+        target: KB-001
+      - type: verifies
+        target: SL-003
+    fields:
+      method: simulation
+      preconditions:
+        - synth CLI compiled and available
+        - kiln-builtins.a stub compiled for thumbv7em-none-eabihf
+        - arm-none-eabi-ld available on PATH
+        - Renode emulator available
+      test-input: >
+        (module
+          (import "env" "print_i32" (func $print (param i32)))
+          (func (export "main")
+            i32.const 42
+            call $print))
+      steps:
+        - "synth compile test_import.wasm --target cortex-m4 -o module.o"
+        - "arm-none-eabi-ld -T linker.ld module.o kiln-builtins.a -o firmware.elf"
+        - "renode --execute 'sysbus LoadELF @firmware.elf; start; sleep 1'"
+        - "Verify UART output contains '42'"
+      pass-criteria: >
+        UART output shows the value 42 printed by the host import handler;
+        no HardFault or other exception; execution completes normally.
+
+  - id: E2E-VER-002
+    type: sys-verification
+    title: "PoC: Verify relocatable ELF structure from synth"
+    description: >
+      Structural verification of the relocatable ELF object emitted by
+      synth for a module with host imports. Validates ELF header (ET_REL,
+      EM_ARM, Thumb-2), section headers (.text, .meld_import_table,
+      .rel.text, .symtab), symbol table (UND entries for import symbols,
+      GLOBAL entries for export symbols), and relocation entries
+      (R_ARM_THM_CALL for import BL instructions). This test does not
+      require linking or execution -- it validates synth's ELF emission
+      in isolation.
+    status: planned
+    tags: [e2e, poc, elf-validation, structural]
+    links:
+      - type: verifies
+        target: CC-003
+      - type: verifies
+        target: KB-005
+      - type: verifies
+        target: SL-004
+    fields:
+      method: automated-test
+      steps:
+        - "synth compile test_import.wasm --target cortex-m4 -o module.o"
+        - "readelf -h module.o  # verify ET_REL, EM_ARM"
+        - "readelf -S module.o  # verify .text, .meld_import_table sections"
+        - "readelf -s module.o  # verify UND import symbols, GLOBAL export symbols"
+        - "readelf -r module.o  # verify R_ARM_THM_CALL relocations"
+      pass-criteria: >
+        All readelf checks pass; import symbols are UND; export symbols
+        are GLOBAL FUNC; .meld_import_table section is present with
+        correct flags; relocation count matches import call count.
+
+  # ---------------------------------------------------------------------------
+  # WASI subset test
+  # ---------------------------------------------------------------------------
+
+  - id: E2E-VER-003
+    type: sys-verification
+    title: "WASI: WASM calling wasi:cli/stdout via kiln dispatcher"
+    description: >
+      End-to-end test verifying that a WASM module calling wasi:cli/stdout
+      (fd_write) compiles through synth, links with kiln-builtins, and
+      produces output on the target's UART. The WASI call path is:
+      compiled code calls __meld_dispatch_import with the wasi:cli/stdout
+      import index, kiln-builtins routes to its WasiDispatcher
+      implementation, which writes the string to the UART peripheral.
+      This validates the WASI Preview 2 dispatch compatibility (CM-004)
+      and the generic import dispatch mechanism (KB-001) for a real-world
+      use case.
+    status: planned
+    tags: [e2e, wasi, stdout, renode]
+    links:
+      - type: verifies
+        target: CM-004
+      - type: verifies
+        target: KB-001
+      - type: verifies
+        target: CC-001
+    fields:
+      method: simulation
+      preconditions:
+        - kiln-builtins.a with WASI stdout handler compiled
+        - Renode with UART monitoring configured
+      test-input: >
+        A WASM module that calls wasi:cli/stdout with the string
+        "Hello from WASM" using canonical ABI string lowering.
+      steps:
+        - "meld fuse hello_component.wasm -o hello_core.wasm"
+        - "synth compile hello_core.wasm --target cortex-m4 -o module.o"
+        - "arm-none-eabi-ld -T linker.ld module.o kiln-builtins.a -o firmware.elf"
+        - "Renode execute firmware.elf with UART monitoring"
+        - "Verify UART output contains 'Hello from WASM'"
+      pass-criteria: >
+        UART output shows "Hello from WASM"; string was lowered via
+        canonical ABI (UTF-8, ptr+len), passed through dispatch, and
+        rendered by kiln's WASI stdout handler; no memory faults.
+
+  - id: E2E-VER-004
+    type: sys-verification
+    title: "WASI: cabi_realloc allocation during string lowering"
+    description: >
+      Focused verification that cabi_realloc is called correctly during
+      canonical ABI string lowering for WASI calls. The test module passes
+      a string argument to a WASI import; synth's canonical ABI lowering
+      code emits BL cabi_realloc to allocate space in linear memory for
+      the string bytes, then writes the UTF-8 bytes and passes (ptr, len)
+      to the import. The test verifies: (1) cabi_realloc is called with
+      correct alignment (1 for UTF-8 strings), (2) the returned pointer
+      is within the linear memory region, (3) the string bytes are written
+      correctly at the allocated address, (4) the import receives the
+      correct (ptr, len) pair.
+    status: planned
+    tags: [e2e, wasi, cabi-realloc, canonical-abi]
+    links:
+      - type: verifies
+        target: KB-004
+      - type: verifies
+        target: CM-001
+    fields:
+      method: simulation
+      preconditions:
+        - kiln-builtins.a with cabi_realloc bump allocator
+        - Renode with memory inspection capability
+      steps:
+        - "Compile test module with string-passing WASI import"
+        - "Execute on Renode with memory breakpoint on cabi_realloc"
+        - "Verify cabi_realloc arguments: old_ptr=0, old_size=0, align=1, new_size=string_len"
+        - "Verify returned pointer is within .wasm_linear_memory section"
+        - "Verify string bytes at allocated address match expected UTF-8"
+      pass-criteria: >
+        cabi_realloc called with correct arguments; allocated memory is
+        within linear memory bounds; string bytes match expected content;
+        import receives (ptr, len) with correct values.
+
+  # ---------------------------------------------------------------------------
+  # Component round-trip
+  # ---------------------------------------------------------------------------
+
+  - id: E2E-VER-005
+    type: sys-verification
+    title: "Component round-trip: component fuse, compile, link, execute"
+    description: >
+      Full round-trip test of the BA RFC #46 compilation chain. Starting
+      from a WASM Component with WIT-defined imports and exports:
+      (1) Meld fuses the component to a core module with canonical ABI
+      wrappers, (2) synth AOT-compiles to ARM with import stubs,
+      (3) kiln-builtins.a provides the host functions, (4) the linker
+      produces firmware.elf, (5) Renode executes and validates results.
+      The component uses a non-trivial WIT interface with record types
+      to exercise the canonical ABI lift/lower path end-to-end. This is
+      the integration test that validates the full PulseEngine compilation
+      story: component authoring -> meld composition -> synth AOT ->
+      kiln runtime -> embedded execution.
+    status: planned
+    tags: [e2e, component-model, round-trip, rfc46, integration]
+    links:
+      - type: verifies
+        target: CC-001
+      - type: verifies
+        target: CC-002
+      - type: verifies
+        target: CM-002
+      - type: verifies
+        target: CM-001
+      - type: verifies
+        target: KB-001
+    fields:
+      method: simulation
+      preconditions:
+        - Meld fuser available and functional
+        - kiln-builtins.a with host function implementations
+        - Renode emulator with UART and memory inspection
+      test-input: >
+        A WASM Component defining: interface sensor { record reading {
+        temp: f32, humidity: f32 }; get-reading: func() -> reading; }
+        and a component that imports sensor.get-reading, processes the
+        reading (e.g., converts C to F), and exports a process func.
+      steps:
+        - "meld fuse sensor_component.wasm -o sensor_core.wasm"
+        - "synth compile sensor_core.wasm --target cortex-m4 -o module.o"
+        - "arm-none-eabi-ld -T linker.ld module.o kiln-builtins.a -o firmware.elf"
+        - "Renode execute firmware.elf with mock sensor host function"
+        - "Verify processed reading value matches expected conversion"
+      pass-criteria: >
+        Component imports resolved via Meld lowering; canonical ABI
+        record lift/lower produces correct field values; host function
+        receives correct arguments; computed result matches expected
+        value (e.g., 25.0C -> 77.0F); no memory faults or traps.
+
+  - id: E2E-VER-006
+    type: sys-verification
+    title: "Component round-trip: ABI compatibility validation"
+    description: >
+      Verification that the canonical ABI encoding produced by synth-abi
+      (at compile time) matches the canonical ABI decoding in kiln-builtins
+      (at runtime). For each WIT type family (primitives, strings, records,
+      variants, options, results, lists, enums, flags), a test component
+      passes values through the import boundary and verifies that the
+      host receives exactly the expected bytes. This is the runtime
+      counterpart of CM-VER-002 (cross-toolchain ABI compatibility).
+      Exercises both the lower path (guest -> host) and lift path
+      (host -> guest return values).
+    status: planned
+    tags: [e2e, component-model, abi-compatibility, canonical-abi]
+    links:
+      - type: verifies
+        target: CM-001
+      - type: verifies
+        target: CM-003
+      - type: verifies
+        target: KB-004
+    fields:
+      method: simulation
+      preconditions:
+        - kiln-builtins.a with ABI validation host functions
+        - Test components for each WIT type family
+      steps:
+        - "For each WIT type family: compile test component through pipeline"
+        - "Execute on Renode with ABI validation host function"
+        - "Host function compares received bytes against expected layout"
+        - "Report pass/fail for each type family"
+      pass-criteria: >
+        All WIT type families pass byte-level compatibility check;
+        synth-abi lowering matches kiln-builtins lifting for every
+        type; no alignment errors, truncation, or sign extension bugs.
+
+  # ---------------------------------------------------------------------------
+  # Regression and CI integration
+  # ---------------------------------------------------------------------------
+
+  - id: E2E-VER-007
+    type: sys-verification
+    title: "CI: Compilation chain regression test suite"
+    description: >
+      Bazel test targets exercising the compilation chain in CI. Tests
+      are organized under //tests/compilation-chain/... and include:
+      (1) poc_import_test -- PoC with single import (E2E-VER-001),
+      (2) elf_structure_test -- relocatable ELF validation (E2E-VER-002),
+      (3) wasi_stdout_test -- WASI stdout (E2E-VER-003), and
+      (4) component_roundtrip_test -- component round-trip (E2E-VER-005).
+      All tests use Renode via rules_renode for hermetic ARM emulation.
+      The test suite shall be gated on kiln-builtins.a being available
+      (via Bazel external dependency or pre-built artifact).
+    status: planned
+    tags: [e2e, ci, bazel, regression, renode]
+    links:
+      - type: verifies
+        target: CC-001
+      - type: verifies
+        target: FR-002
+    fields:
+      method: automated-test
+      steps:
+        run: "bazel test //tests/compilation-chain/..."
+        coverage: >
+          PoC import dispatch, ELF structure validation, WASI stdout,
+          component round-trip. All on Renode Cortex-M4 emulation.
+      preconditions:
+        - Bazel with rules_renode configured
+        - kiln-builtins.a available as Bazel external
+        - arm-none-eabi-ld available in Bazel toolchain
+
+  - id: E2E-VER-008
+    type: sw-verification
+    title: Unit tests for import stub codegen
+    description: >
+      Unit tests in synth-synthesis verifying that import call stub
+      generation produces correct ARM instruction sequences. Tests
+      cover: (1) __meld_dispatch_import generic dispatch path with
+      argument marshalling, (2) __meld_import_N per-import direct
+      dispatch with AAPCS argument placement, (3) trap check after
+      dispatch return, (4) return value lifting. Tests use synth's
+      existing test infrastructure (compile WASM instruction, verify
+      emitted ARM instructions) without requiring linking or execution.
+    status: planned
+    tags: [e2e, unit-test, import-stub, synth-synthesis]
+    links:
+      - type: verifies
+        target: KB-TR-001
+      - type: verifies
+        target: KB-TR-002
+    fields:
+      method: automated-test
+      steps:
+        run: "cargo test -p synth-synthesis -- test_import_dispatch"
+        coverage: >
+          Generic dispatch stub, per-import direct stub, trap check,
+          return value lift. Both single-arg and multi-arg imports.
+
+  - id: E2E-VER-009
+    type: sys-verification
+    title: Unit tests for .meld_import_table emission
+    description: >
+      Unit tests in synth-backend verifying that the ElfBuilder correctly
+      emits .meld_import_table and .meld_import_strings sections. Tests
+      cover: (1) import_count field matches module import count,
+      (2) ImportEntry packing (module_name_offset, field_name_offset,
+      signature_hash, flags), (3) string table construction with correct
+      offsets, (4) FNV-1a hash computation for import type signatures,
+      (5) empty import table for modules with no imports (section omitted).
+    status: planned
+    tags: [e2e, unit-test, import-table, synth-backend]
+    links:
+      - type: verifies
+        target: KB-005
+    fields:
+      method: automated-test
+      steps:
+        run: "cargo test -p synth-backend -- test_meld_import_table"
+        coverage: >
+          Import table emission, string table construction, hash
+          computation, empty table omission. Byte-level verification
+          of section content.
+
+  - id: E2E-VER-010
+    type: sw-verification
+    title: Unit tests for startup memory base initialization
+    description: >
+      Unit tests in synth-backend verifying that the Reset_Handler startup
+      code correctly calls __meld_get_memory_base and stores the result in
+      R11. Tests cover: (1) BL __meld_get_memory_base instruction emitted
+      in startup sequence, (2) MOV R11, R0 follows the BL, (3) weak symbol
+      fallback emits correct default (0x20000000), (4) FPU initialization
+      precedes memory base call when VFP is enabled.
+    status: planned
+    tags: [e2e, unit-test, startup, memory-base, synth-backend]
+    links:
+      - type: verifies
+        target: KB-TR-005
+    fields:
+      method: automated-test
+      steps:
+        run: "cargo test -p synth-backend -- test_startup_memory_base"
+        coverage: >
+          Memory base initialization in Reset_Handler; weak symbol
+          fallback; ordering relative to FPU init.
diff --git a/artifacts/kiln-builtins-api.yaml b/artifacts/kiln-builtins-api.yaml
new file mode 100644
index 0000000..c4f6329
--- /dev/null
+++ b/artifacts/kiln-builtins-api.yaml
@@ -0,0 +1,385 @@
+# kiln-builtins API Requirements (ASPICE SYS.2 / SWE.1)
+#
+# System: Synth -- WebAssembly-to-ARM Cortex-M AOT compiler
+#
+# Defines the API contract between synth-compiled code and kiln-builtins.a,
+# the no_std static library that provides the C ABI runtime bridge for
+# import dispatch, linear memory access, and canonical ABI memory allocation.
+#
+# This artifact captures what synth-compiled code NEEDS from kiln-builtins;
+# the implementation lives in kiln. Cross-repo links use kiln: prefix.
+#
+# References:
+#   - BA RFC #46: https://github.com/bytecodealliance/rfcs/pull/46
+#   - Host-Guest C API: value_t union, host_function_t signatures
+#
+# Format: rivet generic-yaml
+
+artifacts:
+  # ---------------------------------------------------------------------------
+  # Import dispatch bridge
+  # ---------------------------------------------------------------------------
+
+  - id: KB-001
+    type: system-req
+    title: __meld_dispatch_import -- generic import dispatch bridge
+    description: >
+      Synth-compiled code shall call host imports via an extern "C" function
+      __meld_dispatch_import(import_index: u32, args: *const value_t,
+      args_len: u32, ret: *mut value_t) -> i32. This is the generic dispatch
+      entry point: synth emits a BL __meld_dispatch_import instruction for
+      each WASM import call, passing the import index and a pointer to the
+      canonical-ABI-lowered argument buffer in linear memory. The dispatcher
+      in kiln-builtins routes the call to the appropriate host function
+      based on the import index. The return value is 0 for success, non-zero
+      for trap. This function must use C ABI (AAPCS on ARM) so that synth
+      can generate the call using standard ARM calling convention (R0-R3
+      for first four arguments).
+    status: draft
+    tags: [kiln-builtins, import-dispatch, c-abi, aapcs]
+    links:
+      - type: derives-from
+        target: BR-002
+      - type: derives-from
+        target: BR-003
+      - type: refines
+        target: CM-002
+      - type: refines
+        target: CM-003
+      - type: traces-to
+        target: kiln:REQ_FUNC_014
+      - type: traces-to
+        target: kiln:REQ_HELPER_ABI_001
+    fields:
+      req-type: interface
+      priority: must
+      symbol: __meld_dispatch_import
+      signature: "extern \"C\" fn(import_index: u32, args: *const value_t, args_len: u32, ret: *mut value_t) -> i32"
+      calling-convention: AAPCS
+      verification-criteria: >
+        Synth emits BL __meld_dispatch_import with R0=import_index,
+        R1=args_ptr, R2=args_len, R3=ret_ptr; kiln-builtins resolves
+        the symbol and dispatches to correct host function.
+
+  - id: KB-002
+    type: system-req
+    title: Per-import direct symbols (__meld_import_N) for optimized dispatch
+    description: >
+      As an optimization over generic dispatch, synth shall support emitting
+      per-import direct call symbols (__meld_import_0, __meld_import_1, etc.)
+      where each symbol has a function signature matching the AAPCS lowering
+      of the WASM import's type signature. This avoids the overhead of the
+      generic dispatch table lookup and value_t boxing/unboxing. Each
+      __meld_import_N function takes WASM arguments directly in R0-R3
+      (spilling to stack for >4 args) and returns the result in R0 (or
+      R0+R1 for i64). kiln-builtins.a provides a stub for each import
+      index that either implements the function directly or trampolines
+      to the host. Per-import symbols are the preferred dispatch mechanism
+      for embedded targets where call overhead matters.
+    status: draft
+    tags: [kiln-builtins, per-import, direct-call, optimization]
+    links:
+      - type: derives-from
+        target: BR-002
+      - type: refines
+        target: FR-004
+      - type: refines
+        target: CM-002
+      - type: traces-to
+        target: kiln:REQ_FUNC_014
+    fields:
+      req-type: interface
+      priority: should
+      symbol-pattern: "__meld_import_N (N = 0, 1, 2, ...)"
+      calling-convention: AAPCS
+      verification-criteria: >
+        Synth emits BL __meld_import_0 for import index 0 with arguments
+        directly in R0-R3; kiln-builtins.a defines __meld_import_0 with
+        the matching AAPCS signature; link succeeds without symbol errors.
+
+  # ---------------------------------------------------------------------------
+  # Memory access
+  # ---------------------------------------------------------------------------
+
+  - id: KB-003
+    type: system-req
+    title: __meld_get_memory_base -- linear memory base address accessor
+    description: >
+      Synth-compiled code shall obtain the WASM linear memory base address
+      by calling extern "C" fn __meld_get_memory_base(memory_index: u32)
+      -> *mut u8. For single-memory modules (the common case), memory_index
+      is 0 and the return value is the base pointer of the WASM linear
+      memory region in RAM. The synth Reset_Handler calls this function
+      once at startup and caches the result in R11 (the dedicated linear
+      memory base register). For multi-memory modules, each memory has
+      a distinct index. kiln-builtins provides this function, backed by
+      the target's memory allocation (static buffer for bare-metal,
+      Zephyr k_mem_partition for RTOS targets).
+    status: draft
+    tags: [kiln-builtins, memory, linear-memory, base-address]
+    links:
+      - type: derives-from
+        target: BR-002
+      - type: refines
+        target: FR-003
+      - type: refines
+        target: CM-002
+      - type: traces-to
+        target: ARCH-005
+      - type: traces-to
+        target: kiln:REQ_FUNC_014
+    fields:
+      req-type: interface
+      priority: must
+      symbol: __meld_get_memory_base
+      signature: "extern \"C\" fn(memory_index: u32) -> *mut u8"
+      calling-convention: AAPCS
+      verification-criteria: >
+        Synth startup code calls __meld_get_memory_base(0) and stores
+        result in R11; all subsequent memory operations use R11 as base;
+        kiln-builtins returns a valid RAM pointer aligned to 4 bytes.
+
+  - id: KB-004
+    type: system-req
+    title: cabi_realloc -- canonical ABI memory allocator for embedded
+    description: >
+      Synth-compiled code performing canonical ABI lift/lower operations
+      shall call extern "C" fn cabi_realloc(old_ptr: *mut u8, old_size:
+      usize, align: usize, new_size: usize) -> *mut u8 for dynamic
+      memory allocation within the WASM linear memory. This function is
+      defined by the Component Model Canonical ABI specification and is
+      called during string/list lowering when the host needs to allocate
+      guest memory for passing compound values. For embedded targets,
+      the implementation shall be a simple bump allocator or arena
+      allocator within the linear memory region (no malloc/free). The
+      allocator shall respect the alignment parameter and trap on
+      allocation failure (OOM) rather than returning null.
+    status: draft
+    tags: [kiln-builtins, cabi-realloc, canonical-abi, allocator]
+    links:
+      - type: derives-from
+        target: BR-002
+      - type: refines
+        target: CM-001
+      - type: refines
+        target: CM-002
+      - type: traces-to
+        target: CM-TR-004
+      - type: traces-to
+        target: kiln:REQ_HELPER_ABI_001
+    fields:
+      req-type: interface
+      priority: must
+      symbol: cabi_realloc
+      signature: "extern \"C\" fn(old_ptr: *mut u8, old_size: usize, align: usize, new_size: usize) -> *mut u8"
+      calling-convention: AAPCS
+      spec-reference: "Component Model Canonical ABI -- realloc"
+      verification-criteria: >
+        Synth canonical ABI lowering for strings/lists emits BL cabi_realloc;
+        kiln-builtins implementation allocates within linear memory with
+        correct alignment; OOM triggers trap (not null return).
+
+  # ---------------------------------------------------------------------------
+  # Import table binary format
+  # ---------------------------------------------------------------------------
+
+  - id: KB-005
+    type: system-req
+    title: .meld_import_table section binary format
+    description: >
+      Synth shall emit a .meld_import_table ELF section in relocatable
+      objects containing metadata for each host import. The section format
+      is a packed array of import descriptors: [import_count: u32,
+      entries: ImportEntry[import_count]] where each ImportEntry is
+      {module_name_offset: u32, field_name_offset: u32, signature_hash:
+      u32, flags: u32}. String data (module and field names) is stored
+      in a companion .meld_import_strings section. The signature_hash
+      is a FNV-1a hash of the canonical type signature, used for
+      link-time ABI version mismatch detection (per CM-003). This section
+      is placed in FLASH (read-only) by the linker script. kiln-builtins
+      reads this table at startup to configure the dispatch table.
+    status: draft
+    tags: [kiln-builtins, import-table, elf-section, binary-format]
+    links:
+      - type: derives-from
+        target: BR-002
+      - type: refines
+        target: CM-002
+      - type: refines
+        target: CM-003
+      - type: traces-to
+        target: ARCH-003
+      - type: traces-to
+        target: ZI-009
+    fields:
+      req-type: interface
+      priority: must
+      elf-section: .meld_import_table
+      placement: FLASH (read-only)
+      verification-criteria: >
+        readelf -x .meld_import_table module.o shows correct import_count
+        and packed ImportEntry records; string offsets resolve correctly
+        in .meld_import_strings; signature hashes match expected FNV-1a
+        values for test import signatures.
+
+  # ---------------------------------------------------------------------------
+  # SW-level requirements for synth codegen of kiln-builtins calls
+  # ---------------------------------------------------------------------------
+
+  - id: KB-TR-001
+    type: sw-req
+    title: Import call stub generation for __meld_dispatch_import
+    description: >
+      synth-synthesis shall generate ARM Thumb-2 call stubs for each WASM
+      import that: (1) lower canonical ABI arguments into a stack-allocated
+      value_t array, (2) set R0 = import_index, R1 = args pointer,
+      R2 = args_len, R3 = ret pointer, (3) emit BL __meld_dispatch_import,
+      (4) check R0 for trap (non-zero = trap), (5) lift the return value
+      from ret buffer. The stub shall be emitted inline at each call site
+      for single-use imports and as a shared trampoline for imports called
+      more than once in the same function.
+    status: planned
+    tags: [kiln-builtins, codegen, import-stub, synth-synthesis]
+    links:
+      - type: derives-from
+        target: KB-001
+      - type: refines
+        target: CM-TR-005
+      - type: refines
+        target: FR-002
+    fields:
+      req-type: functional
+      priority: must
+      crate: synth-synthesis
+      verification-criteria: >
+        Generated ARM code for a WASM import call contains BL to
+        __meld_dispatch_import with correct R0-R3 setup; Z3 validates
+        argument lowering correctness; trap check branch present after BL.
+
+  - id: KB-TR-002
+    type: sw-req
+    title: Per-import direct symbol generation (__meld_import_N)
+    description: >
+      synth-synthesis shall support an optimization mode where each WASM
+      import generates a BL __meld_import_N instruction with arguments
+      passed directly in R0-R3 per AAPCS, bypassing the generic dispatch
+      value_t boxing. The import index N is assigned sequentially from
+      the module's import section. For imports with more than 4 i32
+      arguments, excess arguments are pushed to the stack per AAPCS.
+      For i64 arguments, the value occupies two registers (R0+R1 or
+      R2+R3). This mode is selected via a synth CLI flag
+      (--import-dispatch=direct) and is the recommended default for
+      embedded targets.
+    status: planned
+    tags: [kiln-builtins, codegen, per-import, direct-dispatch]
+    links:
+      - type: derives-from
+        target: KB-002
+      - type: refines
+        target: CM-TR-005
+      - type: refines
+        target: FR-004
+    fields:
+      req-type: functional
+      priority: should
+      crate: synth-synthesis
+      verification-criteria: >
+        With --import-dispatch=direct, synth emits BL __meld_import_0
+        (not BL __meld_dispatch_import) for import index 0; arguments
+        are in R0-R3 directly without value_t wrapping; generated code
+        is 4-8 instructions shorter per call site.
+
+  - id: KB-TR-003
+    type: sw-req
+    title: .meld_import_table section emission in synth-backend
+    description: >
+      synth-backend shall emit the .meld_import_table and
+      .meld_import_strings ELF sections when the compiled module has
+      host imports. The ElfBuilder shall add these sections with correct
+      SHT_PROGBITS type, SHF_ALLOC flags, 4-byte alignment, and packed
+      binary content matching the KB-005 format specification. The
+      import_count field shall match the number of import declarations
+      in the WASM module. The signature_hash shall be computed from the
+      WASM function type signature (param types, result types) using
+      FNV-1a hashing.
+    status: planned
+    tags: [kiln-builtins, elf-section, synth-backend, emission]
+    links:
+      - type: derives-from
+        target: KB-005
+      - type: refines
+        target: CC-008
+      - type: refines
+        target: TR-002
+    fields:
+      req-type: functional
+      priority: must
+      crate: synth-backend
+      verification-criteria: >
+        ElfBuilder produces .meld_import_table with correct header;
+        section content matches expected binary layout for test modules;
+        arm-none-eabi-readelf parses the section without errors.
+
+  - id: KB-TR-004
+    type: sw-req
+    title: value_t union type definition for generic dispatch
+    description: >
+      synth-core shall define a value_t union type matching the kiln-builtins
+      C ABI definition from BA RFC #46. The union shall have variants:
+      i32 (4 bytes), i64 (8 bytes), f32 (4 bytes), f64 (8 bytes), and
+      externref (4 bytes, opaque handle). The union size shall be 8 bytes
+      (aligned to the largest variant). synth-synthesis uses this type to
+      compute argument buffer sizes and offsets for the generic dispatch
+      path (__meld_dispatch_import). The type definition shall be shared
+      between synth (for codegen) and kiln-builtins (for runtime) via a
+      common header or WIT definition.
+    status: planned
+    tags: [kiln-builtins, value-type, c-abi, rfc46]
+    links:
+      - type: derives-from
+        target: KB-001
+      - type: derives-from
+        target: CM-003
+    fields:
+      req-type: functional
+      priority: must
+      crate: synth-core
+      verification-criteria: >
+        value_t size is 8 bytes with 8-byte alignment; field offsets
+        match kiln-builtins C definition; synth codegen produces correct
+        buffer sizes (args_len * 8 bytes).
+
+  - id: KB-TR-005
+    type: sw-req
+    title: Startup code memory base initialization via __meld_get_memory_base
+    description: >
+      synth-backend Reset_Handler startup code shall call
+      __meld_get_memory_base(0) to obtain the linear memory base address
+      and store the result in R11. This replaces the current hardcoded
+      MOV R11, #0x20000000 with a dynamic memory base that kiln-builtins
+      configures based on the target platform. The startup sequence shall
+      be: (1) set SP from vector table, (2) optionally enable FPU,
+      (3) BL __meld_get_memory_base with R0=0, (4) MOV R11, R0,
+      (5) branch to user code. For bare-metal targets without
+      kiln-builtins, a weak symbol default returning 0x20000000 shall
+      be provided in the synth-generated startup code.
+    status: planned
+    tags: [kiln-builtins, startup, memory-base, reset-handler]
+    links:
+      - type: derives-from
+        target: KB-003
+      - type: derives-from
+        target: CC-003
+      - type: refines
+        target: FR-003
+      - type: traces-to
+        target: TP-006
+    fields:
+      req-type: functional
+      priority: must
+      crate: synth-backend
+      verification-criteria: >
+        Reset_Handler contains BL __meld_get_memory_base followed by
+        MOV R11, R0; weak symbol default returns 0x20000000 when
+        kiln-builtins is not linked; Renode test verifies R11 is set
+        correctly at user code entry.
diff --git a/artifacts/static-linking.yaml b/artifacts/static-linking.yaml
new file mode 100644
index 0000000..4a60e14
--- /dev/null
+++ b/artifacts/static-linking.yaml
@@ -0,0 +1,373 @@
+# Static Linking Requirements (ASPICE SYS.2 / SWE.1)
+#
+# System: Synth -- WebAssembly-to-ARM Cortex-M AOT compiler
+#
+# Defines the static linking requirements for combining synth-compiled
+# relocatable objects with kiln-builtins.a into firmware.elf. Covers
+# ARM cross-compilation toolchain, linker script with Meld integration
+# sections, symbol resolution, and memory layout.
+#
+# Ties to issue #27 (ARM cross-compilation toolchain).
+#
+# Format: rivet generic-yaml
+
+artifacts:
+  # ---------------------------------------------------------------------------
+  # Cross-compilation toolchain requirements
+  # ---------------------------------------------------------------------------
+
+  - id: SL-001
+    type: system-req
+    title: ARM cross-compilation toolchain for static linking
+    description: >
+      The static linking stage shall use an ARM cross-compilation toolchain
+      capable of linking Thumb-2 ELF relocatable objects into an executable
+      ELF for Cortex-M targets. Supported toolchains are:
+      (1) arm-none-eabi-gcc / arm-none-eabi-ld from the GNU Arm Embedded
+      Toolchain, (2) LLVM lld with --target=thumbv7em-none-eabi, and
+      (3) the Zephyr SDK's arm-zephyr-eabi-gcc. The toolchain shall be
+      detectable at synth build time or configurable via --linker flag.
+      This addresses issue #27 (ARM cross-compilation toolchain).
+    status: draft
+    tags: [static-linking, toolchain, arm, cross-compilation, issue-27]
+    links:
+      - type: derives-from
+        target: BR-003
+      - type: derives-from
+        target: BR-004
+      - type: refines
+        target: TR-003
+      - type: refines
+        target: CC-005
+    fields:
+      req-type: interface
+      priority: must
+      github-issue: 27
+      verification-criteria: >
+        synth detects arm-none-eabi-ld on PATH or accepts --linker flag;
+        linking succeeds with GNU, LLVM, and Zephyr SDK toolchains;
+        produced ELF is identical across toolchains for the same inputs.
+
+  - id: SL-002
+    type: system-req
+    title: Linker script with Meld integration sections
+    description: >
+      Synth's LinkerScriptGenerator shall produce GNU ld linker scripts
+      that include Meld-specific sections alongside standard Cortex-M
+      sections. The linker script shall define: MEMORY regions (FLASH
+      and RAM with board-specific origins and sizes), ENTRY(Reset_Handler),
+      and SECTIONS including .isr_vector (KEEP, ALIGN(128), FLASH),
+      .text (FLASH), .meld_import_table (FLASH, read-only),
+      .meld_import_strings (FLASH, read-only), .data (RAM, AT> FLASH
+      for LMA copy), .bss (RAM, NOLOAD), .wasm_linear_memory (RAM,
+      aligned to MPU region size), .stack (RAM), and .heap (RAM).
+      EXTERN declarations for __meld_dispatch_import and
+      __meld_get_memory_base shall be emitted when Meld integration
+      is enabled.
+    status: draft
+    tags: [static-linking, linker-script, meld-sections, memory-layout]
+    links:
+      - type: derives-from
+        target: BR-002
+      - type: derives-from
+        target: BR-003
+      - type: refines
+        target: CC-005
+      - type: refines
+        target: FR-002
+      - type: traces-to
+        target: ZI-008
+      - type: traces-to
+        target: ZI-009
+      - type: traces-to
+        target: KB-005
+    fields:
+      req-type: functional
+      priority: must
+      verification-criteria: >
+        Generated linker script contains .meld_import_table and
+        .meld_import_strings sections in FLASH; EXTERN declarations
+        present for meld symbols; arm-none-eabi-ld accepts the script
+        without warnings.
+
+  # ---------------------------------------------------------------------------
+  # Symbol resolution requirements
+  # ---------------------------------------------------------------------------
+
+  - id: SL-003
+    type: system-req
+    title: Symbol resolution -- synth undefined symbols provided by kiln-builtins
+    description: >
+      The linker shall resolve all undefined symbols emitted by synth
+      against definitions in kiln-builtins.a. The expected undefined
+      symbols from synth-compiled objects are: __meld_dispatch_import
+      (or __meld_import_N per-import symbols), __meld_get_memory_base,
+      and cabi_realloc. All three must be defined in kiln-builtins.a
+      with matching function types (STT_FUNC, STB_GLOBAL). Any unresolved
+      symbol at link time shall produce a clear error identifying the
+      missing symbol and suggesting that kiln-builtins.a is required.
+      Weak symbols (__meld_get_memory_base_default) provide fallback
+      for bare-metal operation without kiln-builtins.
+    status: draft
+    tags: [static-linking, symbols, resolution, kiln-builtins]
+    links:
+      - type: derives-from
+        target: BR-002
+      - type: refines
+        target: CC-005
+      - type: traces-to
+        target: KB-001
+      - type: traces-to
+        target: KB-003
+      - type: traces-to
+        target: KB-004
+    fields:
+      req-type: interface
+      priority: must
+      undefined-symbols:
+        - __meld_dispatch_import
+        - __meld_get_memory_base
+        - cabi_realloc
+      weak-symbols:
+        - __meld_get_memory_base_default
+      verification-criteria: >
+        arm-none-eabi-nm module.o shows UND for expected symbols;
+        arm-none-eabi-nm kiln-builtins.a shows T (text) for same symbols;
+        linking succeeds with zero unresolved symbol errors.
+
+  - id: SL-004
+    type: system-req
+    title: Export symbol visibility for synth-compiled functions
+    description: >
+      Synth-compiled WASM exported functions shall be emitted as global
+      symbols (STB_GLOBAL, STT_FUNC) in the relocatable ELF, using either
+      the WASM export name directly (e.g., "add", "multiply") or a
+      prefixed name (e.g., "synth_add", "synth_multiply") configurable
+      via --export-prefix. These symbols are visible to the linker so
+      that Zephyr C code or other firmware components can call them via
+      extern declarations. Non-exported WASM functions shall be local
+      symbols (STB_LOCAL) to avoid namespace pollution.
+    status: draft
+    tags: [static-linking, symbols, exports, visibility]
+    links:
+      - type: derives-from
+        target: BR-003
+      - type: refines
+        target: FR-002
+      - type: refines
+        target: FR-004
+      - type: traces-to
+        target: ZI-002
+      - type: traces-to
+        target: ARCH-003
+    fields:
+      req-type: functional
+      priority: must
+      verification-criteria: >
+        arm-none-eabi-nm module.o shows T (global text) for exported
+        WASM functions; local WASM functions show t (local text);
+        --export-prefix=synth_ prepends prefix to all export names.
+
+  # ---------------------------------------------------------------------------
+  # Memory layout requirements
+  # ---------------------------------------------------------------------------
+
+  - id: SL-005
+    type: system-req
+    title: Memory layout -- .text in FLASH for compiled code
+    description: >
+      Synth-compiled ARM Thumb-2 code shall be placed in the .text section,
+      which the linker script maps to FLASH memory. For XIP (execute-in-place)
+      targets, code executes directly from flash without copying to RAM.
+      The .text section shall follow the .isr_vector section and precede
+      the .meld_import_table section in the FLASH layout. Code alignment
+      shall be 4 bytes (Thumb-2 instruction alignment). The linker shall
+      merge .text from module.o and kiln-builtins.a into a single .text
+      output section.
+    status: draft
+    tags: [static-linking, memory-layout, flash, text-section]
+    links:
+      - type: derives-from
+        target: BR-002
+      - type: refines
+        target: FR-005
+      - type: refines
+        target: CC-005
+      - type: traces-to
+        target: TP-002
+      - type: traces-to
+        target: TP-003
+    fields:
+      req-type: functional
+      priority: must
+      section: .text
+      placement: FLASH
+      alignment: 4
+      verification-criteria: >
+        readelf -S firmware.elf shows .text with ALLOC+EXEC flags;
+        .text address falls within FLASH memory region of target board;
+        objdump -d confirms Thumb-2 instruction decoding.
+
+  - id: SL-006
+    type: system-req
+    title: Memory layout -- .wasm_linear_memory in RAM
+    description: >
+      WASM linear memory shall be placed in a .wasm_linear_memory section
+      mapped to RAM. The section shall be aligned to the MPU region size
+      (power of 2, minimum 256 bytes) to enable hardware memory protection.
+      The initial size shall match the WASM module's memory declaration
+      (minimum pages * 64KB). The section is NOLOAD (zero-initialized
+      at startup via .bss-style clearing or lazy initialization by
+      kiln-builtins). For multi-memory modules, each memory gets a
+      separate section (.wasm_linear_memory_0, .wasm_linear_memory_1, etc.)
+      with individual MPU region allocation.
+    status: draft
+    tags: [static-linking, memory-layout, ram, linear-memory, mpu]
+    links:
+      - type: derives-from
+        target: BR-001
+      - type: derives-from
+        target: BR-002
+      - type: refines
+        target: FR-003
+      - type: refines
+        target: CC-005
+      - type: traces-to
+        target: ARCH-005
+      - type: traces-to
+        target: TP-007
+    fields:
+      req-type: functional
+      priority: must
+      section: .wasm_linear_memory
+      placement: RAM
+      alignment: "Power-of-2, matching MPU region size"
+      verification-criteria: >
+        readelf -S firmware.elf shows .wasm_linear_memory with ALLOC+WRITE
+        flags; section address is in RAM region (0x20000000+); alignment
+        is power-of-2; MPU region covers the section exactly.
+
+  - id: SL-007
+    type: system-req
+    title: Memory layout -- .meld_import_table in FLASH (read-only)
+    description: >
+      The .meld_import_table section shall be placed in FLASH as read-only
+      data. This section contains the import descriptor table that
+      kiln-builtins reads at startup to configure the dispatch table.
+      The companion .meld_import_strings section shall immediately follow.
+      Both sections have ALLOC flag but not WRITE or EXEC. Placing import
+      metadata in FLASH saves RAM on constrained targets and ensures the
+      import table cannot be corrupted by software bugs.
+    status: draft
+    tags: [static-linking, memory-layout, flash, import-table, read-only]
+    links:
+      - type: derives-from
+        target: BR-001
+      - type: refines
+        target: CC-005
+      - type: traces-to
+        target: KB-005
+      - type: traces-to
+        target: SL-002
+    fields:
+      req-type: functional
+      priority: must
+      section: .meld_import_table
+      placement: FLASH (read-only)
+      verification-criteria: >
+        readelf -S firmware.elf shows .meld_import_table with ALLOC
+        flag only (no WRITE, no EXEC); section address is in FLASH
+        region; content matches expected import descriptor format.
+
+  # ---------------------------------------------------------------------------
+  # SW-level requirements for linker integration in synth
+  # ---------------------------------------------------------------------------
+
+  - id: SL-TR-001
+    type: sw-req
+    title: LinkerScriptGenerator Meld section support
+    description: >
+      synth-backend LinkerScriptGenerator shall support a with_meld_sections()
+      builder method that adds .meld_import_table and .meld_import_strings
+      sections to the generated linker script. When enabled, the generator
+      emits EXTERN(__meld_dispatch_import), EXTERN(__meld_get_memory_base),
+      and section definitions with KEEP() to prevent dead-stripping of
+      import metadata. The method is composable with existing
+      with_meld_integration() and board-specific constructors
+      (new_stm32, new_nrf52840, new_generic).
+    status: planned
+    tags: [static-linking, linker-script, synth-backend]
+    links:
+      - type: derives-from
+        target: SL-002
+      - type: derives-from
+        target: ZI-009
+    fields:
+      req-type: functional
+      priority: must
+      crate: synth-backend
+      verification-criteria: >
+        LinkerScriptGenerator::new_stm32().with_meld_sections().generate()
+        produces script containing .meld_import_table and EXTERN directives;
+        script is parseable by arm-none-eabi-ld --verbose.
+
+  - id: SL-TR-002
+    type: sw-req
+    title: ARM cross-linker invocation from synth-cli
+    description: >
+      synth-cli shall support invoking the ARM cross-linker as a subprocess
+      when the --link flag is provided. The linker invocation shall pass:
+      (1) the generated module.o, (2) the kiln-builtins.a path (from
+      --builtins flag), (3) the generated or user-provided linker script
+      (from --linker-script flag or auto-generated), and (4) the output
+      path (-o firmware.elf). The linker binary is resolved via --linker
+      flag, SYNTH_LINKER environment variable, or PATH detection of
+      arm-none-eabi-ld. Linker errors are captured and reported with
+      context (which symbol is unresolved, which object needs it).
+    status: planned
+    tags: [static-linking, cli, linker-invocation, synth-cli]
+    links:
+      - type: derives-from
+        target: SL-001
+      - type: refines
+        target: CC-007
+      - type: refines
+        target: TR-003
+    fields:
+      req-type: functional
+      priority: should
+      crate: synth-cli
+      verification-criteria: >
+        synth compile --link --builtins kiln-builtins.a -o firmware.elf
+        invokes arm-none-eabi-ld with correct arguments; linker errors
+        are reported with synth-specific context; exit code propagated.
+
+  - id: SL-TR-003
+    type: sw-req
+    title: Relocation emission for import BL instructions
+    description: >
+      synth-backend ElfBuilder shall emit R_ARM_THM_CALL relocation entries
+      in the .rel.text section for each BL instruction targeting an import
+      symbol. The relocation entry shall reference the correct symbol table
+      index for the import's undefined symbol (__meld_dispatch_import or
+      __meld_import_N). The ARM linker uses these relocations to patch the
+      BL instruction's offset field to point to the resolved symbol address.
+      The BL encoding shall leave the offset field as zero (or a placeholder)
+      in the relocatable object, to be filled by the linker.
+    status: planned
+    tags: [static-linking, relocations, elf, arm-thumb]
+    links:
+      - type: refines
+        target: CC-008
+      - type: derives-from
+        target: SL-003
+      - type: refines
+        target: TR-002
+    fields:
+      req-type: functional
+      priority: must
+      crate: synth-backend
+      verification-criteria: >
+        readelf -r module.o shows R_ARM_THM_CALL entries for each import
+        call; symbol indices reference correct UND symbols; after linking,
+        objdump -d shows BL instructions with correct target addresses.
diff --git a/crates/synth-backend/src/arm_encoder.rs b/crates/synth-backend/src/arm_encoder.rs
index 9fd451b..18fdb3b 100644
--- a/crates/synth-backend/src/arm_encoder.rs
+++ b/crates/synth-backend/src/arm_encoder.rs
@@ -360,6 +360,78 @@ impl ArmEncoder {
                 0xE5800000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
             }
 
+            // Sub-word loads (ARM32 encoding)
+            ArmOp::Ldrb { rd, addr } => {
+                let rd_bits = reg_to_bits(rd);
+                let (base_bits, offset_bits) = encode_mem_addr(addr);
+                // LDRB: LDR with B=1 (byte): cond|01|I|P|U|1|W|L|Rn|Rd|offset
+                0xE5D00000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
+            }
+
+            ArmOp::Ldrsb { rd, addr } => {
+                let rd_bits = reg_to_bits(rd);
+                let (base_bits, offset_bits) = encode_mem_addr(addr);
+                // LDRSB (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1101|imm4L
+                // Simplified with immediate offset
+                let offset_val = offset_bits & 0xFF;
+                let imm4h = (offset_val >> 4) & 0xF;
+                let imm4l = offset_val & 0xF;
+                0xE1D000D0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
+            }
+
+            ArmOp::Ldrh { rd, addr } => {
+                let rd_bits = reg_to_bits(rd);
+                let (base_bits, offset_bits) = encode_mem_addr(addr);
+                // LDRH (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1011|imm4L
+                let offset_val = offset_bits & 0xFF;
+                let imm4h = (offset_val >> 4) & 0xF;
+                let imm4l = offset_val & 0xF;
+                0xE1D000B0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
+            }
+
+            ArmOp::Ldrsh { rd, addr } => {
+                let rd_bits = reg_to_bits(rd);
+                let (base_bits, offset_bits) = encode_mem_addr(addr);
+                // LDRSH (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1111|imm4L
+                let offset_val = offset_bits & 0xFF;
+                let imm4h = (offset_val >> 4) & 0xF;
+                let imm4l = offset_val & 0xF;
+                0xE1D000F0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
+            }
+
+            // Sub-word stores (ARM32 encoding)
+            ArmOp::Strb { rd, addr } => {
+                let rd_bits = reg_to_bits(rd);
+                let (base_bits, offset_bits) = encode_mem_addr(addr);
+                // STRB: STR with B=1 (byte): cond|01|I|P|U|1|W|0|Rn|Rd|offset
+                0xE5C00000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
+            }
+
+            ArmOp::Strh { rd, addr } => {
+                let rd_bits = reg_to_bits(rd);
+                let (base_bits, offset_bits) = encode_mem_addr(addr);
+                // STRH (misc store): cond|000|P|U|1|W|0|Rn|Rd|imm4H|1011|imm4L
+                let offset_val = offset_bits & 0xFF;
+                let imm4h = (offset_val >> 4) & 0xF;
+                let imm4l = offset_val & 0xF;
+                0xE1C000B0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
+            }
+
+            // Memory management (ARM32 encoding)
+            ArmOp::MemorySize { rd } => {
+                let rd_bits = reg_to_bits(rd);
+                // MOV rd, R10, LSR #16  (memory size in bytes / 65536 = pages)
+                // cond|000|1101|S|0000|Rd|shift5|type|0|Rm
+                // LSR #16: shift5=10000, type=01
+                0xE1A00820 | (rd_bits << 12) | 0x0A // Rm=R10, shift=16, LSR
+            }
+
+            ArmOp::MemoryGrow { rd, .. } => {
+                let rd_bits = reg_to_bits(rd);
+                // On embedded, always fail: MOV rd, #-1
+                0xE3E00000 | (rd_bits << 12) // MVN rd, #0 = MOV rd, #-1
+            }
+
             // Label pseudo-instruction: emits no machine code
             ArmOp::Label { .. } => {
                 return Ok(Vec::new());
@@ -1881,6 +1953,201 @@ impl ArmEncoder {
                 }
             }
 
+            // LDRB (Thumb-2)
+            ArmOp::Ldrb { rd, addr } => {
+                let rd_bits = reg_to_bits(rd);
+                let base_bits = reg_to_bits(&addr.base);
+
+                if let Some(offset_reg) = &addr.offset_reg {
+                    if addr.offset != 0 {
+                        let scratch = Reg::R12;
+                        let mut bytes =
+                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
+                        bytes.extend(self.encode_thumb32_ldrb_reg(rd, &addr.base, &scratch)?);
+                        return Ok(bytes);
+                    }
+                    return self.encode_thumb32_ldrb_reg(rd, &addr.base, offset_reg);
+                }
+
+                let offset = addr.offset as u32;
+                if rd_bits < 8 && base_bits < 8 && offset <= 31 {
+                    // LDRB Rd, [Rn, #imm5] (16-bit): 0111 1 imm5 Rn Rd
+                    let instr: u16 = 0x7800
+                        | ((offset as u16) << 6)
+                        | ((base_bits as u16) << 3)
+                        | (rd_bits as u16);
+                    Ok(instr.to_le_bytes().to_vec())
+                } else {
+                    self.encode_thumb32_ldrb_imm(rd, &addr.base, offset)
+                }
+            }
+
+            // LDRSB (Thumb-2)
+            ArmOp::Ldrsb { rd, addr } => {
+                let rd_bits = reg_to_bits(rd);
+                let base_bits = reg_to_bits(&addr.base);
+
+                if let Some(offset_reg) = &addr.offset_reg {
+                    if addr.offset != 0 {
+                        let scratch = Reg::R12;
+                        let mut bytes =
+                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
+                        bytes.extend(self.encode_thumb32_ldrsb_reg(rd, &addr.base, &scratch)?);
+                        return Ok(bytes);
+                    }
+                    return self.encode_thumb32_ldrsb_reg(rd, &addr.base, offset_reg);
+                }
+
+                let offset = addr.offset as u32;
+                // LDRSB has no 16-bit immediate form (only register)
+                // For 16-bit reg form: only if Rd, Rn, Rm < R8
+                if rd_bits < 8 && base_bits < 8 && offset == 0 {
+                    // No immediate 16-bit encoding for LDRSB; use 32-bit
+                    self.encode_thumb32_ldrsb_imm(rd, &addr.base, offset)
+                } else {
+                    self.encode_thumb32_ldrsb_imm(rd, &addr.base, offset)
+                }
+            }
+
+            // LDRH (Thumb-2)
+            ArmOp::Ldrh { rd, addr } => {
+                let rd_bits = reg_to_bits(rd);
+                let base_bits = reg_to_bits(&addr.base);
+
+                if let Some(offset_reg) = &addr.offset_reg {
+                    if addr.offset != 0 {
+                        let scratch = Reg::R12;
+                        let mut bytes =
+                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
+                        bytes.extend(self.encode_thumb32_ldrh_reg(rd, &addr.base, &scratch)?);
+                        return Ok(bytes);
+                    }
+                    return self.encode_thumb32_ldrh_reg(rd, &addr.base, offset_reg);
+                }
+
+                let offset = addr.offset as u32;
+                if rd_bits < 8 && base_bits < 8 && (offset & 0x1) == 0 && offset <= 62 {
+                    // LDRH Rd, [Rn, #imm5*2] (16-bit): 1000 1 imm5 Rn Rd
+                    let imm5 = (offset >> 1) as u16;
+                    let instr: u16 =
+                        0x8800 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
+                    Ok(instr.to_le_bytes().to_vec())
+                } else {
+                    self.encode_thumb32_ldrh_imm(rd, &addr.base, offset)
+                }
+            }
+
+            // LDRSH (Thumb-2)
+            ArmOp::Ldrsh { rd, addr } => {
+                if let Some(offset_reg) = &addr.offset_reg {
+                    if addr.offset != 0 {
+                        let scratch = Reg::R12;
+                        let mut bytes =
+                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
+                        bytes.extend(self.encode_thumb32_ldrsh_reg(rd, &addr.base, &scratch)?);
+                        return Ok(bytes);
+                    }
+                    return self.encode_thumb32_ldrsh_reg(rd, &addr.base, offset_reg);
+                }
+
+                let offset = addr.offset as u32;
+                self.encode_thumb32_ldrsh_imm(rd, &addr.base, offset)
+            }
+
+            // STRB (Thumb-2)
+            ArmOp::Strb { rd, addr } => {
+                let rd_bits = reg_to_bits(rd);
+                let base_bits = reg_to_bits(&addr.base);
+
+                if let Some(offset_reg) = &addr.offset_reg {
+                    if addr.offset != 0 {
+                        let scratch = Reg::R12;
+                        let mut bytes =
+                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
+                        bytes.extend(self.encode_thumb32_strb_reg(rd, &addr.base, &scratch)?);
+                        return Ok(bytes);
+                    }
+                    return self.encode_thumb32_strb_reg(rd, &addr.base, offset_reg);
+                }
+
+                let offset = addr.offset as u32;
+                if rd_bits < 8 && base_bits < 8 && offset <= 31 {
+                    // STRB Rd, [Rn, #imm5] (16-bit): 0111 0 imm5 Rn Rd
+                    let instr: u16 = 0x7000
+                        | ((offset as u16) << 6)
+                        | ((base_bits as u16) << 3)
+                        | (rd_bits as u16);
+                    Ok(instr.to_le_bytes().to_vec())
+                } else {
+                    self.encode_thumb32_strb_imm(rd, &addr.base, offset)
+                }
+            }
+
+            // STRH (Thumb-2)
+            ArmOp::Strh { rd, addr } => {
+                let rd_bits = reg_to_bits(rd);
+                let base_bits = reg_to_bits(&addr.base);
+
+                if let Some(offset_reg) = &addr.offset_reg {
+                    if addr.offset != 0 {
+                        let scratch = Reg::R12;
+                        let mut bytes =
+                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
+                        bytes.extend(self.encode_thumb32_strh_reg(rd, &addr.base, &scratch)?);
+                        return Ok(bytes);
+                    }
+                    return self.encode_thumb32_strh_reg(rd, &addr.base, offset_reg);
+                }
+
+                let offset = addr.offset as u32;
+                if rd_bits < 8 && base_bits < 8 && (offset & 0x1) == 0 && offset <= 62 {
+                    // STRH Rd, [Rn, #imm5*2] (16-bit): 1000 0 imm5 Rn Rd
+                    let imm5 = (offset >> 1) as u16;
+                    let instr: u16 =
+                        0x8000 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
+                    Ok(instr.to_le_bytes().to_vec())
+                } else {
+                    self.encode_thumb32_strh_imm(rd, &addr.base, offset)
+                }
+            }
+
+            // MemorySize (Thumb-2)
+            ArmOp::MemorySize { rd } => {
+                // LSR rd, R10, #16 — memory size in bytes / 65536 = pages
+                // Thumb-2 16-bit: LSRS Rd, Rm, #imm5 — 0000 1 imm5 Rm Rd
+                let rd_bits = reg_to_bits(rd);
+                let r10_bits = reg_to_bits(&Reg::R10);
+                if rd_bits < 8 && r10_bits < 8 {
+                    let instr: u16 =
+                        0x0800 | (16u16 << 6) | ((r10_bits as u16) << 3) | (rd_bits as u16);
+                    Ok(instr.to_le_bytes().to_vec())
+                } else {
+                    // Thumb-2 32-bit LSR: 1110 1010 010 0 1111 | 0 imm3 Rd imm2 01 Rm
+                    let imm5: u32 = 16;
+                    let imm3 = (imm5 >> 2) & 0x7;
+                    let imm2 = imm5 & 0x3;
+                    let hw1: u16 = 0xEA4F;
+                    let hw2: u16 =
+                        ((imm3 << 12) | (rd_bits << 8) | (imm2 << 6) | 0x10 | r10_bits) as u16;
+                    let mut bytes = hw1.to_le_bytes().to_vec();
+                    bytes.extend_from_slice(&hw2.to_le_bytes());
+                    Ok(bytes)
+                }
+            }
+
+            // MemoryGrow (Thumb-2)
+            ArmOp::MemoryGrow { rd, .. } => {
+                // On embedded with fixed memory, always return -1 (failure)
+                // MVN rd, #0 → MOV rd, #-1
+                // Thumb-2 32-bit: MVN: 1111 0 i 0 0 0 1 1 0 1111 | 0 imm3 Rd imm8
+                let rd_bits = reg_to_bits(rd);
+                let hw1: u16 = 0xF06F; // MVN with i=0
+                let hw2: u16 = (rd_bits << 8) as u16; // imm8=0 → ~0 = 0xFFFFFFFF = -1
+                let mut bytes = hw1.to_le_bytes().to_vec();
+                bytes.extend_from_slice(&hw2.to_le_bytes());
+                Ok(bytes)
+            }
+
             // BX (16-bit)
             ArmOp::Bx { rm } => {
                 let rm_bits = reg_to_bits(rm) as u16;
@@ -5407,6 +5674,158 @@ impl ArmEncoder {
         Ok(bytes)
     }
 
+    // === Sub-word load/store Thumb-2 encoding helpers ===
+
+    /// Encode Thumb-2 32-bit LDRB with immediate: LDRB.W Rd, [Rn, #imm12]
+    fn encode_thumb32_ldrb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
+        let rd_bits = reg_to_bits(rd);
+        let base_bits = reg_to_bits(base);
+        // LDRB.W Rd, [Rn, #imm12]: 1111 1000 1001 Rn | Rt imm12
+        let hw1: u16 = (0xF890 | base_bits) as u16;
+        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
+        let mut bytes = hw1.to_le_bytes().to_vec();
+        bytes.extend_from_slice(&hw2.to_le_bytes());
+        Ok(bytes)
+    }
+
+    /// Encode Thumb-2 32-bit LDRB with register: LDRB.W Rd, [Rn, Rm]
+    fn encode_thumb32_ldrb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
+        let rd_bits = reg_to_bits(rd);
+        let base_bits = reg_to_bits(base);
+        let rm_bits = reg_to_bits(offset_reg);
+        // LDRB.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0001 Rn | Rt 0000 00 imm2 Rm
+        let hw1: u16 = (0xF810 | base_bits) as u16;
+        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
+        let mut bytes = hw1.to_le_bytes().to_vec();
+        bytes.extend_from_slice(&hw2.to_le_bytes());
+        Ok(bytes)
+    }
+
+    /// Encode Thumb-2 32-bit LDRSB with immediate: LDRSB.W Rd, [Rn, #imm12]
+    fn encode_thumb32_ldrsb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
+        let rd_bits = reg_to_bits(rd);
+        let base_bits = reg_to_bits(base);
+        // LDRSB.W Rd, [Rn, #imm12]: 1111 1001 1001 Rn | Rt imm12
+        let hw1: u16 = (0xF990 | base_bits) as u16;
+        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
+        let mut bytes = hw1.to_le_bytes().to_vec();
+        bytes.extend_from_slice(&hw2.to_le_bytes());
+        Ok(bytes)
+    }
+
+    /// Encode Thumb-2 32-bit LDRSB with register: LDRSB.W Rd, [Rn, Rm]
+    fn encode_thumb32_ldrsb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
+        let rd_bits = reg_to_bits(rd);
+        let base_bits = reg_to_bits(base);
+        let rm_bits = reg_to_bits(offset_reg);
+        // LDRSB.W Rd, [Rn, Rm, LSL #0]: 1111 1001 0001 Rn | Rt 0000 00 imm2 Rm
+        let hw1: u16 = (0xF910 | base_bits) as u16;
+        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
+        let mut bytes = hw1.to_le_bytes().to_vec();
+        bytes.extend_from_slice(&hw2.to_le_bytes());
+        Ok(bytes)
+    }
+
+    /// Encode Thumb-2 32-bit LDRH with immediate: LDRH.W Rd, [Rn, #imm12]
+    fn encode_thumb32_ldrh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
+        let rd_bits = reg_to_bits(rd);
+        let base_bits = reg_to_bits(base);
+        // LDRH.W Rd, [Rn, #imm12]: 1111 1000 1011 Rn | Rt imm12
+        let hw1: u16 = (0xF8B0 | base_bits) as u16;
+        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
+        let mut bytes = hw1.to_le_bytes().to_vec();
+        bytes.extend_from_slice(&hw2.to_le_bytes());
+        Ok(bytes)
+    }
+
+    /// Encode Thumb-2 32-bit LDRH with register: LDRH.W Rd, [Rn, Rm]
+    fn encode_thumb32_ldrh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
+        let rd_bits = reg_to_bits(rd);
+        let base_bits = reg_to_bits(base);
+        let rm_bits = reg_to_bits(offset_reg);
+        // LDRH.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0011 Rn | Rt 0000 00 imm2 Rm
+        let hw1: u16 = (0xF830 | base_bits) as u16;
+        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
+        let mut bytes = hw1.to_le_bytes().to_vec();
+        bytes.extend_from_slice(&hw2.to_le_bytes());
+        Ok(bytes)
+    }
+
+    /// Encode Thumb-2 32-bit LDRSH with immediate: LDRSH.W Rd, [Rn, #imm12]
+    fn encode_thumb32_ldrsh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
+        let rd_bits = reg_to_bits(rd);
+        let base_bits = reg_to_bits(base);
+        // LDRSH.W Rd, [Rn, #imm12]: 1111 1001 1011 Rn | Rt imm12
+        let hw1: u16 = (0xF9B0 | base_bits) as u16;
+        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
+        let mut bytes = hw1.to_le_bytes().to_vec();
+        bytes.extend_from_slice(&hw2.to_le_bytes());
+        Ok(bytes)
+    }
+
+    /// Encode Thumb-2 32-bit LDRSH with register: LDRSH.W Rd, [Rn, Rm]
+    fn encode_thumb32_ldrsh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
+        let rd_bits = reg_to_bits(rd);
+        let base_bits = reg_to_bits(base);
+        let rm_bits = reg_to_bits(offset_reg);
+        // LDRSH.W Rd, [Rn, Rm, LSL #0]: 1111 1001 0011 Rn | Rt 0000 00 imm2 Rm
+        let hw1: u16 = (0xF930 | base_bits) as u16;
+        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
+        let mut bytes = hw1.to_le_bytes().to_vec();
+        bytes.extend_from_slice(&hw2.to_le_bytes());
+        Ok(bytes)
+    }
+
+    /// Encode Thumb-2 32-bit STRB with immediate: STRB.W Rd, [Rn, #imm12]
+    fn encode_thumb32_strb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
+        let rd_bits = reg_to_bits(rd);
+        let base_bits = reg_to_bits(base);
+        // STRB.W Rd, [Rn, #imm12]: 1111 1000 1000 Rn | Rt imm12
+        let hw1: u16 = (0xF880 | base_bits) as u16;
+        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
+        let mut bytes = hw1.to_le_bytes().to_vec();
+        bytes.extend_from_slice(&hw2.to_le_bytes());
+        Ok(bytes)
+    }
+
+    /// Encode Thumb-2 32-bit STRB with register: STRB.W Rd, [Rn, Rm]
+    fn encode_thumb32_strb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
+        let rd_bits = reg_to_bits(rd);
+        let base_bits = reg_to_bits(base);
+        let rm_bits = reg_to_bits(offset_reg);
+        // STRB.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0000 Rn | Rt 0000 00 imm2 Rm
+        let hw1: u16 = (0xF800 | base_bits) as u16;
+        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
+        let mut bytes = hw1.to_le_bytes().to_vec();
+        bytes.extend_from_slice(&hw2.to_le_bytes());
+        Ok(bytes)
+    }
+
+    /// Encode Thumb-2 32-bit STRH with immediate: STRH.W Rd, [Rn, #imm12]
+    fn encode_thumb32_strh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
+        let rd_bits = reg_to_bits(rd);
+        let base_bits = reg_to_bits(base);
+        // STRH.W Rd, [Rn, #imm12]: 1111 1000 1010 Rn | Rt imm12
+        let hw1: u16 = (0xF8A0 | base_bits) as u16;
+        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
+        let mut bytes = hw1.to_le_bytes().to_vec();
+        bytes.extend_from_slice(&hw2.to_le_bytes());
+        Ok(bytes)
+    }
+
+    /// Encode Thumb-2 32-bit STRH with register: STRH.W Rd, [Rn, Rm]
+    fn encode_thumb32_strh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
+        let rd_bits = reg_to_bits(rd);
+        let base_bits = reg_to_bits(base);
+        let rm_bits = reg_to_bits(offset_reg);
+        // STRH.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0010 Rn | Rt 0000 00 imm2 Rm
+        let hw1: u16 = (0xF820 | base_bits) as u16;
+        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
+        let mut bytes = hw1.to_le_bytes().to_vec();
+        bytes.extend_from_slice(&hw2.to_le_bytes());
+        Ok(bytes)
+    }
+
     /// Encode Thumb-2 32-bit ADD with immediate: ADD.W Rd, Rn, #imm
     fn encode_thumb32_add_imm(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
         let rd_bits = reg_to_bits(rd);
@@ -6929,4 +7348,270 @@ mod tests {
         // MOVW + MOVT for lo (8 bytes) + MOVW + MOVT for hi (8 bytes) = 16 bytes
         assert_eq!(code.len(), 16, "I64Const(-1) should be 16 bytes");
     }
+
+    // =========================================================================
+    // Sub-word load/store encoding tests
+    // =========================================================================
+
+    #[test]
+    fn test_encode_ldrb_arm32() {
+        let encoder = ArmEncoder::new_arm32();
+        let op = ArmOp::Ldrb {
+            rd: Reg::R0,
+            addr: MemAddr::imm(Reg::R1, 4),
+        };
+        let code = encoder.encode(&op).unwrap();
+        assert_eq!(code.len(), 4, "ARM32 LDRB should be 4 bytes");
+        // LDRB R0, [R1, #4] = 0xE5D10004
+        let encoded = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
+        assert_eq!(encoded, 0xE5D10004, "Should encode LDRB R0, [R1, #4]");
+    }
+
+    #[test]
+    fn test_encode_strb_arm32() {
+        let encoder = ArmEncoder::new_arm32();
+        let op = ArmOp::Strb {
+            rd: Reg::R0,
+            addr: MemAddr::imm(Reg::R1, 0),
+        };
+        let code = encoder.encode(&op).unwrap();
+        assert_eq!(code.len(), 4, "ARM32 STRB should be 4 bytes");
+        // STRB R0, [R1, #0] = 0xE5C10000
+        let encoded = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
+        assert_eq!(encoded, 0xE5C10000, "Should encode STRB R0, [R1, #0]");
+    }
+
+    #[test]
+    fn test_encode_ldrh_arm32() {
+        let encoder = ArmEncoder::new_arm32();
+        let op = ArmOp::Ldrh {
+            rd: Reg::R0,
+            addr: MemAddr::imm(Reg::R1, 2),
+        };
+        let code = encoder.encode(&op).unwrap();
+        assert_eq!(code.len(), 4, "ARM32 LDRH should be 4 bytes");
+    }
+
+    #[test]
+    fn test_encode_strh_arm32() {
+        let encoder = ArmEncoder::new_arm32();
+        let op = ArmOp::Strh {
+            rd: Reg::R0,
+            addr: MemAddr::imm(Reg::R1, 0),
+        };
+        let code = encoder.encode(&op).unwrap();
+        assert_eq!(code.len(), 4, "ARM32 STRH should be 4 bytes");
+    }
+
+    #[test]
+    fn test_encode_ldrsb_arm32() {
+        let encoder = ArmEncoder::new_arm32();
+        let op = ArmOp::Ldrsb {
+            rd: Reg::R0,
+            addr: MemAddr::imm(Reg::R1, 0),
+        };
+        let code = encoder.encode(&op).unwrap();
+        assert_eq!(code.len(), 4, "ARM32 LDRSB should be 4 bytes");
+    }
+
+    #[test]
+    fn test_encode_ldrsh_arm32() {
+        let encoder = ArmEncoder::new_arm32();
+        let op = ArmOp::Ldrsh {
+            rd: Reg::R0,
+            addr: MemAddr::imm(Reg::R1, 0),
+        };
+        let code = encoder.encode(&op).unwrap();
+        assert_eq!(code.len(), 4, "ARM32 LDRSH should be 4 bytes");
+    }
+
+    #[test]
+    fn test_encode_ldrb_thumb2_16bit() {
+        let encoder = ArmEncoder::new_thumb2();
+        let op = ArmOp::Ldrb {
+            rd: Reg::R0,
+            addr: MemAddr::imm(Reg::R1, 4),
+        };
+        let code = encoder.encode(&op).unwrap();
+        // Low registers + small offset -> 16-bit encoding
+        assert_eq!(
+            code.len(),
+            2,
+            "Thumb-2 LDRB with small offset should be 16-bit"
+        );
+    }
+
+    #[test]
+    fn test_encode_ldrb_thumb2_32bit() {
+        let encoder = ArmEncoder::new_thumb2();
+        let op = ArmOp::Ldrb {
+            rd: Reg::R0,
+            addr: MemAddr::imm(Reg::R1, 100), // offset > 31 needs 32-bit
+        };
+        let code = encoder.encode(&op).unwrap();
+        assert_eq!(
+            code.len(),
+            4,
+            "Thumb-2 LDRB with large offset should be 32-bit"
+        );
+    }
+
+    #[test]
+    fn test_encode_strb_thumb2_16bit() {
+        let encoder = ArmEncoder::new_thumb2();
+        let op = ArmOp::Strb {
+            rd: Reg::R0,
+            addr: MemAddr::imm(Reg::R1, 10),
+        };
+        let code = encoder.encode(&op).unwrap();
+        assert_eq!(
+            code.len(),
+            2,
+            "Thumb-2 STRB with small offset should be 16-bit"
+        );
+    }
+
+    #[test]
+    fn test_encode_ldrh_thumb2_16bit() {
+        let encoder = ArmEncoder::new_thumb2();
+        let op = ArmOp::Ldrh {
+            rd: Reg::R0,
+            addr: MemAddr::imm(Reg::R1, 4), // offset aligned to 2, <= 62
+        };
+        let code = encoder.encode(&op).unwrap();
+        assert_eq!(
+            code.len(),
+            2,
+            "Thumb-2 LDRH with small aligned offset should be 16-bit"
+        );
+    }
+
+    #[test]
+    fn test_encode_strh_thumb2_16bit() {
+        let encoder = ArmEncoder::new_thumb2();
+        let op = ArmOp::Strh {
+            rd: Reg::R0,
+            addr: MemAddr::imm(Reg::R1, 4),
+        };
+        let code = encoder.encode(&op).unwrap();
+        assert_eq!(
+            code.len(),
+            2,
+            "Thumb-2 STRH with small aligned offset should be 16-bit"
+        );
+    }
+
+    #[test]
+    fn test_encode_ldrsb_thumb2() {
+        let encoder = ArmEncoder::new_thumb2();
+        let op = ArmOp::Ldrsb {
+            rd: Reg::R0,
+            addr: MemAddr::imm(Reg::R1, 0),
+        };
+        let code = encoder.encode(&op).unwrap();
+        // LDRSB has no 16-bit immediate form, always 32-bit
+        assert_eq!(code.len(), 4, "Thumb-2 LDRSB should be 32-bit");
+    }
+
+    #[test]
+    fn test_encode_ldrsh_thumb2() {
+        let encoder = ArmEncoder::new_thumb2();
+        let op = ArmOp::Ldrsh {
+            rd: Reg::R0,
+            addr: MemAddr::imm(Reg::R1, 0),
+        };
+        let code = encoder.encode(&op).unwrap();
+        assert_eq!(code.len(), 4, "Thumb-2 LDRSH should be 32-bit");
+    }
+
+    #[test]
+    fn test_encode_memory_size_thumb2() {
+        let encoder = ArmEncoder::new_thumb2();
+        let op = ArmOp::MemorySize { rd: Reg::R0 };
+        let code = encoder.encode(&op).unwrap();
+        // R0 and R10 are not both low registers, so this needs careful handling
+        assert!(!code.is_empty(), "MemorySize should produce code");
+    }
+
+    #[test]
+    fn test_encode_memory_grow_thumb2() {
+        let encoder = ArmEncoder::new_thumb2();
+        let op = ArmOp::MemoryGrow {
+            rd: Reg::R0,
+            rn: Reg::R0,
+        };
+        let code = encoder.encode(&op).unwrap();
+        assert_eq!(code.len(), 4, "MemoryGrow (MVN) should be 32-bit Thumb-2");
+    }
+
+    #[test]
+    fn test_encode_subword_reg_offset_thumb2() {
+        let encoder = ArmEncoder::new_thumb2();
+
+        // LDRB with register offset
+        let op = ArmOp::Ldrb {
+            rd: Reg::R0,
+            addr: MemAddr::reg(Reg::R1, Reg::R2),
+        };
+        let code = encoder.encode(&op).unwrap();
+        assert_eq!(
+            code.len(),
+            4,
+            "Thumb-2 LDRB with reg offset should be 32-bit"
+        );
+
+        // STRB with register offset
+        let op = ArmOp::Strb {
+            rd: Reg::R0,
+            addr: MemAddr::reg(Reg::R1, Reg::R2),
+        };
+        let code = encoder.encode(&op).unwrap();
+        assert_eq!(
+            code.len(),
+            4,
+            "Thumb-2 STRB with reg offset should be 32-bit"
+        );
+
+        // LDRH with register offset
+        let op = ArmOp::Ldrh {
+            rd: Reg::R0,
+            addr: MemAddr::reg(Reg::R1, Reg::R2),
+        };
+        let code = encoder.encode(&op).unwrap();
+        assert_eq!(
+            code.len(),
+            4,
+            "Thumb-2 LDRH with reg offset should be 32-bit"
+        );
+
+        // STRH with register offset
+        let op = ArmOp::Strh {
+            rd: Reg::R0,
+            addr: MemAddr::reg(Reg::R1, Reg::R2),
+        };
+        let code = encoder.encode(&op).unwrap();
+        assert_eq!(
+            code.len(),
+            4,
+            "Thumb-2 STRH with reg offset should be 32-bit"
+        );
+    }
+
+    #[test]
+    fn test_encode_subword_reg_imm_offset_thumb2() {
+        let encoder = ArmEncoder::new_thumb2();
+
+        // LDRB with both register and immediate offset
+        let op = ArmOp::Ldrb {
+            rd: Reg::R0,
+            addr: MemAddr::reg_imm(Reg::R1, Reg::R2, 4),
+        };
+        let code = encoder.encode(&op).unwrap();
+        // ADD R12, R2, #4 (4 bytes) + LDRB R0, [R1, R12] (4 bytes) = 8 bytes
+        assert_eq!(
+            code.len(),
+            8,
+            "Thumb-2 LDRB with reg+imm offset should be 8 bytes"
+        );
+    }
 }
diff --git a/crates/synth-core/src/wasm_decoder.rs b/crates/synth-core/src/wasm_decoder.rs
index 72603fd..65df784 100644
--- a/crates/synth-core/src/wasm_decoder.rs
+++ b/crates/synth-core/src/wasm_decoder.rs
@@ -338,6 +338,34 @@ fn convert_operator(op: &wasmparser::Operator) -> Option<WasmOp> {
             align: memarg.align as u32,
         }),
 
+        // Sub-word loads (i32)
+        I32Load8S { memarg } => Some(WasmOp::I32Load8S {
+            offset: memarg.offset as u32,
+            align: memarg.align as u32,
+        }),
+        I32Load8U { memarg } => Some(WasmOp::I32Load8U {
+            offset: memarg.offset as u32,
+            align: memarg.align as u32,
+        }),
+        I32Load16S { memarg } => Some(WasmOp::I32Load16S {
+            offset: memarg.offset as u32,
+            align: memarg.align as u32,
+        }),
+        I32Load16U { memarg } => Some(WasmOp::I32Load16U {
+            offset: memarg.offset as u32,
+            align: memarg.align as u32,
+        }),
+
+        // Sub-word stores (i32)
+        I32Store8 { memarg } => Some(WasmOp::I32Store8 {
+            offset: memarg.offset as u32,
+            align: memarg.align as u32,
+        }),
+        I32Store16 { memarg } => Some(WasmOp::I32Store16 {
+            offset: memarg.offset as u32,
+            align: memarg.align as u32,
+        }),
+
         // Local/Global
         LocalGet { local_index } => Some(WasmOp::LocalGet(*local_index)),
         LocalSet { local_index } => Some(WasmOp::LocalSet(*local_index)),
@@ -377,6 +405,50 @@ fn convert_operator(op: &wasmparser::Operator) -> Option<WasmOp> {
         If { .. } => Some(WasmOp::If),
         Else => Some(WasmOp::Else),
 
+        // i64 sub-word loads
+        I64Load8S { memarg } => Some(WasmOp::I64Load8S {
+            offset: memarg.offset as u32,
+            align: memarg.align as u32,
+        }),
+        I64Load8U { memarg } => Some(WasmOp::I64Load8U {
+            offset: memarg.offset as u32,
+            align: memarg.align as u32,
+        }),
+        I64Load16S { memarg } => Some(WasmOp::I64Load16S {
+            offset: memarg.offset as u32,
+            align: memarg.align as u32,
+        }),
+        I64Load16U { memarg } => Some(WasmOp::I64Load16U {
+            offset: memarg.offset as u32,
+            align: memarg.align as u32,
+        }),
+        I64Load32S { memarg } => Some(WasmOp::I64Load32S {
+            offset: memarg.offset as u32,
+            align: memarg.align as u32,
+        }),
+        I64Load32U { memarg } => Some(WasmOp::I64Load32U {
+            offset: memarg.offset as u32,
+            align: memarg.align as u32,
+        }),
+
+        // i64 sub-word stores
+        I64Store8 { memarg } => Some(WasmOp::I64Store8 {
+            offset: memarg.offset as u32,
+            align: memarg.align as u32,
+        }),
+        I64Store16 { memarg } => Some(WasmOp::I64Store16 {
+            offset: memarg.offset as u32,
+            align: memarg.align as u32,
+        }),
+        I64Store32 { memarg } => Some(WasmOp::I64Store32 {
+            offset: memarg.offset as u32,
+            align: memarg.align as u32,
+        }),
+
+        // Memory management
+        MemorySize { mem, .. } => Some(WasmOp::MemorySize(*mem)),
+        MemoryGrow { mem, .. } => Some(WasmOp::MemoryGrow(*mem)),
+
         // Other operators not yet supported
         _ => None,
     }
@@ -537,4 +609,197 @@ mod tests {
         assert_eq!(add_func.index, 1);
         assert!(add_func.ops.contains(&WasmOp::I32Add));
     }
+
+    #[test]
+    fn test_decode_subword_loads() {
+        let wat = r#"
+            (module
+                (memory 1)
+                (func (export "test") (param i32) (result i32)
+                    local.get 0
+                    i32.load8_u
+                )
+            )
+        "#;
+
+        let wasm = wat::parse_str(wat).expect("Failed to parse WAT");
+        let functions = decode_wasm_functions(&wasm).expect("Failed to decode");
+
+        assert_eq!(functions.len(), 1);
+        assert!(functions[0].ops.contains(&WasmOp::I32Load8U {
+            offset: 0,
+            align: 0,
+        }));
+    }
+
+    #[test]
+    fn test_decode_subword_stores() {
+        let wat = r#"
+            (module
+                (memory 1)
+                (func (export "test") (param i32 i32)
+                    local.get 0
+                    local.get 1
+                    i32.store8
+                )
+            )
+        "#;
+
+        let wasm = wat::parse_str(wat).expect("Failed to parse WAT");
+        let functions = decode_wasm_functions(&wasm).expect("Failed to decode");
+
+        assert_eq!(functions.len(), 1);
+        assert!(functions[0].ops.contains(&WasmOp::I32Store8 {
+            offset: 0,
+            align: 0,
+        }));
+    }
+
+    #[test]
+    fn test_decode_memory_size_grow() {
+        let wat = r#"
+            (module
+                (memory 1)
+                (func (export "test") (result i32)
+                    memory.size
+                )
+            )
+        "#;
+
+        let wasm = wat::parse_str(wat).expect("Failed to parse WAT");
+        let functions = decode_wasm_functions(&wasm).expect("Failed to decode");
+
+        assert_eq!(functions.len(), 1);
+        assert!(functions[0].ops.contains(&WasmOp::MemorySize(0)));
+    }
+
+    #[test]
+    fn test_decode_memory_grow() {
+        let wat = r#"
+            (module
+                (memory 1)
+                (func (export "test") (param i32) (result i32)
+                    local.get 0
+                    memory.grow
+                )
+            )
+        "#;
+
+        let wasm = wat::parse_str(wat).expect("Failed to parse WAT");
+        let functions = decode_wasm_functions(&wasm).expect("Failed to decode");
+
+        assert_eq!(functions.len(), 1);
+        assert!(functions[0].ops.contains(&WasmOp::MemoryGrow(0)));
+    }
+
+    #[test]
+    fn test_decode_i64_subword_loads() {
+        let wat = r#"
+            (module
+                (memory 1)
+                (func (export "test") (param i32) (result i64)
+                    local.get 0
+                    i64.load8_s
+                )
+            )
+        "#;
+
+        let wasm = wat::parse_str(wat).expect("Failed to parse WAT");
+        let functions = decode_wasm_functions(&wasm).expect("Failed to decode");
+
+        assert_eq!(functions.len(), 1);
+        assert!(functions[0].ops.contains(&WasmOp::I64Load8S {
+            offset: 0,
+            align: 0,
+        }));
+    }
+
+    #[test]
+    fn test_decode_all_subword_memory_ops() {
+        // Test that all sub-word operations are decoded from WAT
+        let wat = r#"
+            (module
+                (memory 1)
+                (func (export "test") (param i32)
+                    ;; i32 sub-word loads
+                    local.get 0
+                    i32.load8_s
+                    drop
+                    local.get 0
+                    i32.load8_u
+                    drop
+                    local.get 0
+                    i32.load16_s
+                    drop
+                    local.get 0
+                    i32.load16_u
+                    drop
+
+                    ;; i32 sub-word stores
+                    local.get 0
+                    i32.const 42
+                    i32.store8
+                    local.get 0
+                    i32.const 42
+                    i32.store16
+
+                    ;; i64 sub-word loads
+                    local.get 0
+                    i64.load8_s
+                    drop
+                    local.get 0
+                    i64.load8_u
+                    drop
+                    local.get 0
+                    i64.load16_s
+                    drop
+                    local.get 0
+                    i64.load16_u
+                    drop
+                    local.get 0
+                    i64.load32_s
+                    drop
+                    local.get 0
+                    i64.load32_u
+                    drop
+
+                    ;; i64 sub-word stores
+                    local.get 0
+                    i64.const 42
+                    i64.store8
+                    local.get 0
+                    i64.const 42
+                    i64.store16
+                    local.get 0
+                    i64.const 42
+                    i64.store32
+                )
+            )
+        "#;
+
+        let wasm = wat::parse_str(wat).expect("Failed to parse WAT");
+        let functions = decode_wasm_functions(&wasm).expect("Failed to decode");
+
+        assert_eq!(functions.len(), 1);
+        let ops = &functions[0].ops;
+
+        // Verify i32 sub-word ops are present
+        assert!(ops.iter().any(|o| matches!(o, WasmOp::I32Load8S { .. })));
+        assert!(ops.iter().any(|o| matches!(o, WasmOp::I32Load8U { .. })));
+        assert!(ops.iter().any(|o| matches!(o, WasmOp::I32Load16S { .. })));
+        assert!(ops.iter().any(|o| matches!(o, WasmOp::I32Load16U { .. })));
+        assert!(ops.iter().any(|o| matches!(o, WasmOp::I32Store8 { .. })));
+        assert!(ops.iter().any(|o| matches!(o, WasmOp::I32Store16 { .. })));
+
+        // Verify i64 sub-word ops are present
+        assert!(ops.iter().any(|o| matches!(o, WasmOp::I64Load8S { .. })));
+        assert!(ops.iter().any(|o| matches!(o, WasmOp::I64Load8U { .. })));
+        assert!(ops.iter().any(|o| matches!(o, WasmOp::I64Load16S { .. })));
+        assert!(ops.iter().any(|o| matches!(o, WasmOp::I64Load16U { .. })));
+        assert!(ops.iter().any(|o| matches!(o, WasmOp::I64Load32S { .. })));
+        assert!(ops.iter().any(|o| matches!(o, WasmOp::I64Load32U { .. })));
+        assert!(ops.iter().any(|o| matches!(o, WasmOp::I64Store8 { .. })));
+        assert!(ops.iter().any(|o| matches!(o, WasmOp::I64Store16 { .. })));
+        assert!(ops.iter().any(|o| matches!(o, WasmOp::I64Store32 { .. })));
+    }
 }
diff --git a/crates/synth-core/src/wasm_op.rs b/crates/synth-core/src/wasm_op.rs
index eaae3e0..3d4036b 100644
--- a/crates/synth-core/src/wasm_op.rs
+++ b/crates/synth-core/src/wasm_op.rs
@@ -56,6 +56,16 @@ pub enum WasmOp {
     I32Load { offset: u32, align: u32 },
     I32Store { offset: u32, align: u32 },
 
+    // Sub-word loads (i32)
+    I32Load8S { offset: u32, align: u32 }, // byte load, sign-extend to i32
+    I32Load8U { offset: u32, align: u32 }, // byte load, zero-extend to i32
+    I32Load16S { offset: u32, align: u32 }, // halfword load, sign-extend to i32
+    I32Load16U { offset: u32, align: u32 }, // halfword load, zero-extend to i32
+
+    // Sub-word stores (i32)
+    I32Store8 { offset: u32, align: u32 },  // store low byte
+    I32Store16 { offset: u32, align: u32 }, // store low halfword
+
     // Control flow
     Block,
     Loop,
@@ -71,6 +81,10 @@ pub enum WasmOp {
     GlobalGet(u32),
     GlobalSet(u32),
 
+    // Memory management
+    MemorySize(u32), // returns current memory size in pages (memory index)
+    MemoryGrow(u32), // grow memory by N pages, returns previous size or -1 (memory index)
+
     // More ops
     Drop,
     Select,
@@ -124,6 +138,19 @@ pub enum WasmOp {
     I64Load { offset: u32, align: u32 },
     I64Store { offset: u32, align: u32 },
 
+    // Sub-word loads (i64) — load sub-word, extend to i64
+    I64Load8S { offset: u32, align: u32 },
+    I64Load8U { offset: u32, align: u32 },
+    I64Load16S { offset: u32, align: u32 },
+    I64Load16U { offset: u32, align: u32 },
+    I64Load32S { offset: u32, align: u32 },
+    I64Load32U { offset: u32, align: u32 },
+
+    // Sub-word stores (i64) — store low N bits
+    I64Store8 { offset: u32, align: u32 },
+    I64Store16 { offset: u32, align: u32 },
+    I64Store32 { offset: u32, align: u32 },
+
     // Conversion operations
     I64ExtendI32S, // Sign-extend i32 to i64
     I64ExtendI32U, // Zero-extend i32 to i64
diff --git a/crates/synth-synthesis/src/instruction_selector.rs b/crates/synth-synthesis/src/instruction_selector.rs
index 7384bc3..a0cd44a 100644
--- a/crates/synth-synthesis/src/instruction_selector.rs
+++ b/crates/synth-synthesis/src/instruction_selector.rs
@@ -419,6 +419,134 @@ impl InstructionSelector {
                 self.generate_store_with_bounds_check(rd, rn, *offset as i32, 4)
             }
 
+            // Sub-word loads (i32)
+            I32Load8S { offset, .. } => {
+                self.generate_subword_load_with_bounds_check(rd, rn, *offset as i32, 1, true)
+            }
+            I32Load8U { offset, .. } => {
+                self.generate_subword_load_with_bounds_check(rd, rn, *offset as i32, 1, false)
+            }
+            I32Load16S { offset, .. } => {
+                self.generate_subword_load_with_bounds_check(rd, rn, *offset as i32, 2, true)
+            }
+            I32Load16U { offset, .. } => {
+                self.generate_subword_load_with_bounds_check(rd, rn, *offset as i32, 2, false)
+            }
+
+            // Sub-word stores (i32)
+            I32Store8 { offset, .. } => {
+                self.generate_subword_store_with_bounds_check(rd, rn, *offset as i32, 1)
+            }
+            I32Store16 { offset, .. } => {
+                self.generate_subword_store_with_bounds_check(rd, rn, *offset as i32, 2)
+            }
+
+            // i64 sub-word loads — load sub-word, extend to i64 register pair
+            I64Load8S { offset, .. } => {
+                // LDRSB R0, [R11, rn, #offset]; ASR R1, R0, #31 (sign-extend to hi)
+                let mut ops = self.generate_subword_load_with_bounds_check(
+                    Reg::R0,
+                    rn,
+                    *offset as i32,
+                    1,
+                    true,
+                );
+                ops.push(ArmOp::Asr {
+                    rd: Reg::R1,
+                    rn: Reg::R0,
+                    shift: 31,
+                });
+                ops
+            }
+            I64Load8U { offset, .. } => {
+                // LDRB R0, [R11, rn, #offset]; MOV R1, #0
+                let mut ops = self.generate_subword_load_with_bounds_check(
+                    Reg::R0,
+                    rn,
+                    *offset as i32,
+                    1,
+                    false,
+                );
+                ops.push(ArmOp::Mov {
+                    rd: Reg::R1,
+                    op2: Operand2::Imm(0),
+                });
+                ops
+            }
+            I64Load16S { offset, .. } => {
+                let mut ops = self.generate_subword_load_with_bounds_check(
+                    Reg::R0,
+                    rn,
+                    *offset as i32,
+                    2,
+                    true,
+                );
+                ops.push(ArmOp::Asr {
+                    rd: Reg::R1,
+                    rn: Reg::R0,
+                    shift: 31,
+                });
+                ops
+            }
+            I64Load16U { offset, .. } => {
+                let mut ops = self.generate_subword_load_with_bounds_check(
+                    Reg::R0,
+                    rn,
+                    *offset as i32,
+                    2,
+                    false,
+                );
+                ops.push(ArmOp::Mov {
+                    rd: Reg::R1,
+                    op2: Operand2::Imm(0),
+                });
+                ops
+            }
+            I64Load32S { offset, .. } => {
+                // LDR R0, [R11, rn, #offset]; ASR R1, R0, #31
+                let mut ops = self.generate_load_with_bounds_check(Reg::R0, rn, *offset as i32, 4);
+                ops.push(ArmOp::Asr {
+                    rd: Reg::R1,
+                    rn: Reg::R0,
+                    shift: 31,
+                });
+                ops
+            }
+            I64Load32U { offset, .. } => {
+                // LDR R0, [R11, rn, #offset]; MOV R1, #0
+                let mut ops = self.generate_load_with_bounds_check(Reg::R0, rn, *offset as i32, 4);
+                ops.push(ArmOp::Mov {
+                    rd: Reg::R1,
+                    op2: Operand2::Imm(0),
+                });
+                ops
+            }
+
+            // i64 sub-word stores — store low N bits from i64 register pair
+            I64Store8 { offset, .. } => {
+                // STRB R0, [R11, rn, #offset] (low byte of low word)
+                self.generate_subword_store_with_bounds_check(Reg::R0, rn, *offset as i32, 1)
+            }
+            I64Store16 { offset, .. } => {
+                // STRH R0, [R11, rn, #offset] (low halfword of low word)
+                self.generate_subword_store_with_bounds_check(Reg::R0, rn, *offset as i32, 2)
+            }
+            I64Store32 { offset, .. } => {
+                // STR R0, [R11, rn, #offset] (low word)
+                self.generate_store_with_bounds_check(Reg::R0, rn, *offset as i32, 4)
+            }
+
+            // Memory management
+            MemorySize(_mem_idx) => {
+                // On embedded with fixed memory, return memory size in pages.
+                // R10 holds memory size in bytes; divide by 65536 (page size) via LSR #16.
+                vec![ArmOp::MemorySize { rd }]
+            }
+            MemoryGrow(_mem_idx) => {
+                // On embedded with fixed memory, always return -1 (cannot grow).
+                vec![ArmOp::MemoryGrow { rd, rn }]
+            }
+
             LocalGet(_index) => vec![ArmOp::Ldr {
                 rd,
                 addr: MemAddr::imm(Reg::SP, 0), // Simplified - would use proper frame offset
@@ -1465,6 +1593,117 @@ impl InstructionSelector {
         }
     }
 
+    /// Generate a sub-word load with optional bounds checking.
+    /// `access_size`: 1 for byte, 2 for halfword.
+    /// `sign_extend`: true for sign-extending loads (LDRSB/LDRSH), false for zero-extending (LDRB/LDRH).
+    fn generate_subword_load_with_bounds_check(
+        &self,
+        rd: Reg,
+        addr_reg: Reg,
+        offset: i32,
+        access_size: u32,
+        sign_extend: bool,
+    ) -> Vec<ArmOp> {
+        let addr = MemAddr::reg_imm(Reg::R11, addr_reg, offset);
+        let load_op = match (access_size, sign_extend) {
+            (1, false) => ArmOp::Ldrb { rd, addr },
+            (1, true) => ArmOp::Ldrsb { rd, addr },
+            (2, false) => ArmOp::Ldrh { rd, addr },
+            (2, true) => ArmOp::Ldrsh { rd, addr },
+            _ => ArmOp::Ldr { rd, addr }, // fallback to word load
+        };
+
+        match self.bounds_check {
+            BoundsCheckConfig::None => vec![load_op],
+            BoundsCheckConfig::Software => {
+                let temp = Reg::R12;
+                vec![
+                    ArmOp::Add {
+                        rd: temp,
+                        rn: addr_reg,
+                        op2: Operand2::Imm(offset),
+                    },
+                    ArmOp::Cmp {
+                        rn: temp,
+                        op2: Operand2::Reg(Reg::R10),
+                    },
+                    ArmOp::Bhs {
+                        label: "Trap_Handler".to_string(),
+                    },
+                    load_op,
+                ]
+            }
+            BoundsCheckConfig::Masking => {
+                vec![
+                    ArmOp::And {
+                        rd: addr_reg,
+                        rn: addr_reg,
+                        op2: Operand2::Reg(Reg::R10),
+                    },
+                    load_op,
+                ]
+            }
+        }
+    }
+
+    /// Generate a sub-word store with optional bounds checking.
+    /// `access_size`: 1 for byte (STRB), 2 for halfword (STRH).
+    fn generate_subword_store_with_bounds_check(
+        &self,
+        value_reg: Reg,
+        addr_reg: Reg,
+        offset: i32,
+        access_size: u32,
+    ) -> Vec<ArmOp> {
+        let addr = MemAddr::reg_imm(Reg::R11, addr_reg, offset);
+        let store_op = match access_size {
+            1 => ArmOp::Strb {
+                rd: value_reg,
+                addr,
+            },
+            2 => ArmOp::Strh {
+                rd: value_reg,
+                addr,
+            },
+            _ => ArmOp::Str {
+                rd: value_reg,
+                addr,
+            },
+        };
+
+        match self.bounds_check {
+            BoundsCheckConfig::None => vec![store_op],
+            BoundsCheckConfig::Software => {
+                let temp = Reg::R12;
+                vec![
+                    ArmOp::Add {
+                        rd: temp,
+                        rn: addr_reg,
+                        op2: Operand2::Imm(offset),
+                    },
+                    ArmOp::Cmp {
+                        rn: temp,
+                        op2: Operand2::Reg(Reg::R10),
+                    },
+                    ArmOp::Bhs {
+                        label: "Trap_Handler".to_string(),
+                    },
+                    store_op,
+                ]
+            }
+            BoundsCheckConfig::Masking => {
+                vec![
+                    ArmOp::And {
+                        rd: addr_reg,
+                        rn: addr_reg,
+                        op2: Operand2::Reg(Reg::R10),
+                    },
+                    store_op,
+                ]
+            }
+        }
+    }
+
     /// Get statistics about instruction selection
     pub fn get_stats(&self) -> SelectionStats {
         SelectionStats {
@@ -1988,6 +2227,239 @@ impl InstructionSelector {
                     // Store doesn't push anything to stack
                 }
 
+                // Sub-word loads (i32) — like I32Load but with LDRB/LDRSB/LDRH/LDRSH
+                I32Load8S { offset, .. }
+                | I32Load8U { offset, .. }
+                | I32Load16S { offset, .. }
+                | I32Load16U { offset, .. } => {
+                    let addr = stack.pop().unwrap_or(Reg::R0);
+                    let is_return_value = idx == wasm_ops.len() - 1
+                        || (idx + 1 < wasm_ops.len() && matches!(wasm_ops[idx + 1], End));
+                    let dst = if is_return_value {
+                        Reg::R0
+                    } else {
+                        let t = index_to_reg(next_temp);
+                        next_temp = (next_temp + 1) % 13;
+                        t
+                    };
+
+                    let (access_size, sign_extend) = match op {
+                        I32Load8S { .. } => (1, true),
+                        I32Load8U { .. } => (1, false),
+                        I32Load16S { .. } => (2, true),
+                        I32Load16U { .. } => (2, false),
+                        _ => unreachable!(),
+                    };
+
+                    let load_ops = self.generate_subword_load_with_bounds_check(
+                        dst,
+                        addr,
+                        *offset as i32,
+                        access_size,
+                        sign_extend,
+                    );
+                    for arm_op in load_ops {
+                        instructions.push(ArmInstruction {
+                            op: arm_op,
+                            source_line: Some(idx),
+                        });
+                    }
+                    stack.push(dst);
+                }
+
+                // Sub-word stores (i32) — like I32Store but with STRB/STRH
+                I32Store8 { offset, .. } | I32Store16 { offset, .. } => {
+                    let value = stack.pop().unwrap_or(Reg::R1);
+                    let addr = stack.pop().unwrap_or(Reg::R0);
+
+                    let access_size = match op {
+                        I32Store8 { .. } => 1,
+                        I32Store16 { .. } => 2,
+                        _ => unreachable!(),
+                    };
+
+                    let store_ops = self.generate_subword_store_with_bounds_check(
+                        value,
+                        addr,
+                        *offset as i32,
+                        access_size,
+                    );
+                    for arm_op in store_ops {
+                        instructions.push(ArmInstruction {
+                            op: arm_op,
+                            source_line: Some(idx),
+                        });
+                    }
+                }
+
+                // i64 sub-word loads — load sub-word, extend to i64 (register pair)
+                I64Load8S { offset, .. }
+                | I64Load8U { offset, .. }
+                | I64Load16S { offset, .. }
+                | I64Load16U { offset, .. }
+                | I64Load32S { offset, .. }
+                | I64Load32U { offset, .. } => {
+                    let addr = stack.pop().unwrap_or(Reg::R0);
+                    let dst_lo = Reg::R0;
+                    let dst_hi = Reg::R1;
+
+                    let ops: Vec<ArmOp> = match op {
+                        I64Load8S { .. } => {
+                            let mut v = self.generate_subword_load_with_bounds_check(
+                                dst_lo,
+                                addr,
+                                *offset as i32,
+                                1,
+                                true,
+                            );
+                            v.push(ArmOp::Asr {
+                                rd: dst_hi,
+                                rn: dst_lo,
+                                shift: 31,
+                            });
+                            v
+                        }
+                        I64Load8U { .. } => {
+                            let mut v = self.generate_subword_load_with_bounds_check(
+                                dst_lo,
+                                addr,
+                                *offset as i32,
+                                1,
+                                false,
+                            );
+                            v.push(ArmOp::Mov {
+                                rd: dst_hi,
+                                op2: Operand2::Imm(0),
+                            });
+                            v
+                        }
+                        I64Load16S { .. } => {
+                            let mut v = self.generate_subword_load_with_bounds_check(
+                                dst_lo,
+                                addr,
+                                *offset as i32,
+                                2,
+                                true,
+                            );
+                            v.push(ArmOp::Asr {
+                                rd: dst_hi,
+                                rn: dst_lo,
+                                shift: 31,
+                            });
+                            v
+                        }
+                        I64Load16U { .. } => {
+                            let mut v = self.generate_subword_load_with_bounds_check(
+                                dst_lo,
+                                addr,
+                                *offset as i32,
+                                2,
+                                false,
+                            );
+                            v.push(ArmOp::Mov {
+                                rd: dst_hi,
+                                op2: Operand2::Imm(0),
+                            });
+                            v
+                        }
+                        I64Load32S { .. } => {
+                            let mut v = self.generate_load_with_bounds_check(
+                                dst_lo,
+                                addr,
+                                *offset as i32,
+                                4,
+                            );
+                            v.push(ArmOp::Asr {
+                                rd: dst_hi,
+                                rn: dst_lo,
+                                shift: 31,
+                            });
+                            v
+                        }
+                        I64Load32U { .. } => {
+                            let mut v = self.generate_load_with_bounds_check(
+                                dst_lo,
+                                addr,
+                                *offset as i32,
+                                4,
+                            );
+                            v.push(ArmOp::Mov {
+                                rd: dst_hi,
+                                op2: Operand2::Imm(0),
+                            });
+                            v
+                        }
+                        _ => unreachable!(),
+                    };
+
+                    for arm_op in ops {
+                        instructions.push(ArmInstruction {
+                            op: arm_op,
+                            source_line: Some(idx),
+                        });
+                    }
+                    // i64 on 32-bit ARM uses register pair; push low register
+                    stack.push(dst_lo);
+                }
+
+                // i64 sub-word stores
+                I64Store8 { offset, .. }
+                | I64Store16 { offset, .. }
+                | I64Store32 { offset, .. } => {
+                    // Pop i64 value (lo register) and address
+                    let value_lo = stack.pop().unwrap_or(Reg::R1);
+                    let addr = stack.pop().unwrap_or(Reg::R0);
+
+                    let ops: Vec<ArmOp> = match op {
+                        I64Store8 { .. } => self.generate_subword_store_with_bounds_check(
+                            value_lo,
+                            addr,
+                            *offset as i32,
+                            1,
+                        ),
+                        I64Store16 { .. } => self.generate_subword_store_with_bounds_check(
+                            value_lo,
+                            addr,
+                            *offset as i32,
+                            2,
+                        ),
+                        I64Store32 { .. } => {
+                            self.generate_store_with_bounds_check(value_lo, addr, *offset as i32, 4)
+                        }
+                        _ => unreachable!(),
+                    };
+
+                    for arm_op in ops {
+                        instructions.push(ArmInstruction {
+                            op: arm_op,
+                            source_line: Some(idx),
+                        });
+                    }
+                }
+
+                // Memory management
+                MemorySize(_mem_idx) => {
+                    let dst = index_to_reg(next_temp);
+                    next_temp = (next_temp + 1) % 13;
+                    instructions.push(ArmInstruction {
+                        op: ArmOp::MemorySize { rd: dst },
+                        source_line: Some(idx),
+                    });
+                    stack.push(dst);
+                }
+
+                MemoryGrow(_mem_idx) => {
+                    // Pop the requested number of pages from stack
+                    let pages = stack.pop().unwrap_or(Reg::R0);
+                    let dst = index_to_reg(next_temp);
+                    next_temp = (next_temp + 1) % 13;
+                    instructions.push(ArmInstruction {
+                        op: ArmOp::MemoryGrow { rd: dst, rn: pages },
+                        source_line: Some(idx),
+                    });
+                    stack.push(dst);
+                }
+
                 // =========================================================
                 // Control flow operations
                 // =========================================================
@@ -5516,4 +5988,473 @@ mod tests {
             );
         }
     }
+
+    // =========================================================================
+    // Sub-word load/store tests
+    // =========================================================================
+
+    #[test]
+    fn test_i32_load8_u() {
+        let db = RuleDatabase::new();
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+
+        let wasm_ops = vec![WasmOp::I32Load8U {
+            offset: 0,
+            align: 1,
+        }];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+
+        assert_eq!(arm_instrs.len(), 1);
+        match &arm_instrs[0].op {
+            ArmOp::Ldrb { rd, addr } => {
+                assert_eq!(*rd, Reg::R0);
+                assert_eq!(addr.base, Reg::R11);
+                assert_eq!(addr.offset, 0);
+            }
+            other => panic!("Expected Ldrb, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn test_i32_load8_s() {
+        let db = RuleDatabase::new();
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+
+        let wasm_ops = vec![WasmOp::I32Load8S {
+            offset: 4,
+            align: 1,
+        }];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+
+        assert_eq!(arm_instrs.len(), 1);
+        match &arm_instrs[0].op {
+            ArmOp::Ldrsb { rd, addr } => {
+                assert_eq!(*rd, Reg::R0);
+                assert_eq!(addr.base, Reg::R11);
+                assert_eq!(addr.offset, 4);
+            }
+            other => panic!("Expected Ldrsb, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn test_i32_load16_u() {
+        let db = RuleDatabase::new();
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+
+        let wasm_ops = vec![WasmOp::I32Load16U {
+            offset: 8,
+            align: 2,
+        }];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+
+        assert_eq!(arm_instrs.len(), 1);
+        match &arm_instrs[0].op {
+            ArmOp::Ldrh { rd, addr } => {
+                assert_eq!(*rd, Reg::R0);
+                assert_eq!(addr.base, Reg::R11);
+                assert_eq!(addr.offset, 8);
+            }
+            other => panic!("Expected Ldrh, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn test_i32_load16_s() {
+        let db = RuleDatabase::new();
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+
+        let wasm_ops = vec![WasmOp::I32Load16S {
+            offset: 0,
+            align: 2,
+        }];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+
+        assert_eq!(arm_instrs.len(), 1);
+        match &arm_instrs[0].op {
+            ArmOp::Ldrsh { rd, addr } => {
+                assert_eq!(*rd, Reg::R0);
+                assert_eq!(addr.base, Reg::R11);
+            }
+            other => panic!("Expected Ldrsh, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn test_i32_store8() {
+        let db = RuleDatabase::new();
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+
+        let wasm_ops = vec![WasmOp::I32Store8 {
+            offset: 0,
+            align: 1,
+        }];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+
+        assert_eq!(arm_instrs.len(), 1);
+        match &arm_instrs[0].op {
+            ArmOp::Strb { rd, addr } => {
+                assert_eq!(*rd, Reg::R0);
+                assert_eq!(addr.base, Reg::R11);
+            }
+            other => panic!("Expected Strb, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn test_i32_store16() {
+        let db = RuleDatabase::new();
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+
+        let wasm_ops = vec![WasmOp::I32Store16 {
+            offset: 4,
+            align: 2,
+        }];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+
+        assert_eq!(arm_instrs.len(), 1);
+        match &arm_instrs[0].op {
+            ArmOp::Strh { rd, addr } => {
+                assert_eq!(*rd, Reg::R0);
+                assert_eq!(addr.base, Reg::R11);
+                assert_eq!(addr.offset, 4);
+            }
+            other => panic!("Expected Strh, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn test_i32_subword_loads_with_bounds_checking() {
+        let db = RuleDatabase::new();
+        let mut selector = InstructionSelector::with_bounds_check(
+            db.rules().to_vec(),
+            BoundsCheckConfig::Software,
+        );
+
+        let wasm_ops = vec![WasmOp::I32Load8U {
+            offset: 4,
+            align: 1,
+        }];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+
+        // With software bounds checking: ADD + CMP + BHS + LDRB
+        assert_eq!(arm_instrs.len(), 4);
+        assert!(matches!(&arm_instrs[0].op, ArmOp::Add { .. }));
+        assert!(matches!(&arm_instrs[1].op, ArmOp::Cmp { .. }));
+        assert!(matches!(&arm_instrs[2].op, ArmOp::Bhs { .. }));
+        assert!(matches!(&arm_instrs[3].op, ArmOp::Ldrb { .. }));
+    }
+
+    #[test]
+    fn test_i32_subword_stores_with_bounds_checking() {
+        let db = RuleDatabase::new();
+        let mut selector = InstructionSelector::with_bounds_check(
+            db.rules().to_vec(),
+            BoundsCheckConfig::Software,
+        );
+
+        let wasm_ops = vec![WasmOp::I32Store16 {
+            offset: 0,
+            align: 2,
+        }];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+
+        // With software bounds checking: ADD + CMP + BHS + STRH
+        assert_eq!(arm_instrs.len(), 4);
+        assert!(matches!(&arm_instrs[3].op, ArmOp::Strh { .. }));
+    }
+
+    #[test]
+    fn test_i64_subword_loads() {
+        let db = RuleDatabase::new();
+
+        // i64.load8_s: LDRSB + ASR (sign-extend hi from lo)
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+        let wasm_ops = vec![WasmOp::I64Load8S {
+            offset: 0,
+            align: 1,
+        }];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+        assert_eq!(arm_instrs.len(), 2);
+        assert!(matches!(&arm_instrs[0].op, ArmOp::Ldrsb { .. }));
+        assert!(matches!(
+            &arm_instrs[1].op,
+            ArmOp::Asr {
+                rd: Reg::R1,
+                rn: Reg::R0,
+                shift: 31
+            }
+        ));
+
+        // i64.load8_u: LDRB + MOV R1, #0
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+        let wasm_ops = vec![WasmOp::I64Load8U {
+            offset: 0,
+            align: 1,
+        }];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+        assert_eq!(arm_instrs.len(), 2);
+        assert!(matches!(&arm_instrs[0].op, ArmOp::Ldrb { .. }));
+        assert!(matches!(
+            &arm_instrs[1].op,
+            ArmOp::Mov {
+                rd: Reg::R1,
+                op2: Operand2::Imm(0)
+            }
+        ));
+
+        // i64.load32_s: LDR + ASR
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+        let wasm_ops = vec![WasmOp::I64Load32S {
+            offset: 0,
+            align: 4,
+        }];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+        assert_eq!(arm_instrs.len(), 2);
+        assert!(matches!(&arm_instrs[0].op, ArmOp::Ldr { .. }));
+        assert!(matches!(
+            &arm_instrs[1].op,
+            ArmOp::Asr {
+                rd: Reg::R1,
+                rn: Reg::R0,
+                shift: 31
+            }
+        ));
+
+        // i64.load32_u: LDR + MOV R1, #0
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+        let wasm_ops = vec![WasmOp::I64Load32U {
+            offset: 0,
+            align: 4,
+        }];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+        assert_eq!(arm_instrs.len(), 2);
+        assert!(matches!(&arm_instrs[0].op, ArmOp::Ldr { .. }));
+        assert!(matches!(
+            &arm_instrs[1].op,
+            ArmOp::Mov {
+                rd: Reg::R1,
+                op2: Operand2::Imm(0)
+            }
+        ));
+    }
+
+    #[test]
+    fn test_i64_subword_stores() {
+        let db = RuleDatabase::new();
+
+        // i64.store8: STRB
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+        let wasm_ops = vec![WasmOp::I64Store8 {
+            offset: 0,
+            align: 1,
+        }];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+        assert_eq!(arm_instrs.len(), 1);
+        assert!(matches!(&arm_instrs[0].op, ArmOp::Strb { .. }));
+
+        // i64.store16: STRH
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+        let wasm_ops = vec![WasmOp::I64Store16 {
+            offset: 0,
+            align: 2,
+        }];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+        assert_eq!(arm_instrs.len(), 1);
+        assert!(matches!(&arm_instrs[0].op, ArmOp::Strh { .. }));
+
+        // i64.store32: STR
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+        let wasm_ops = vec![WasmOp::I64Store32 {
+            offset: 0,
+            align: 4,
+        }];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+        assert_eq!(arm_instrs.len(), 1);
+        assert!(matches!(&arm_instrs[0].op, ArmOp::Str { .. }));
+    }
+
+    // =========================================================================
+    // memory.size / memory.grow tests
+    // =========================================================================
+
+    #[test]
+    fn test_memory_size() {
+        let db = RuleDatabase::new();
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+
+        let wasm_ops = vec![WasmOp::MemorySize(0)];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+
+        assert_eq!(arm_instrs.len(), 1);
+        assert!(matches!(&arm_instrs[0].op, ArmOp::MemorySize { .. }));
+    }
+
+    #[test]
+    fn test_memory_grow() {
+        let db = RuleDatabase::new();
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+
+        let wasm_ops = vec![WasmOp::MemoryGrow(0)];
+        let arm_instrs = selector.select(&wasm_ops).unwrap();
+
+        assert_eq!(arm_instrs.len(), 1);
+        assert!(matches!(&arm_instrs[0].op, ArmOp::MemoryGrow { .. }));
+    }
+
+    #[test]
+    fn test_all_subword_ops_succeed() {
+        let db = RuleDatabase::new();
+
+        let all_subword_ops = vec![
+            WasmOp::I32Load8S {
+                offset: 0,
+                align: 1,
+            },
+            WasmOp::I32Load8U {
+                offset: 0,
+                align: 1,
+            },
+            WasmOp::I32Load16S {
+                offset: 0,
+                align: 2,
+            },
+            WasmOp::I32Load16U {
+                offset: 0,
+                align: 2,
+            },
+            WasmOp::I32Store8 {
+                offset: 0,
+                align: 1,
+            },
+            WasmOp::I32Store16 {
+                offset: 0,
+                align: 2,
+            },
+            WasmOp::I64Load8S {
+                offset: 0,
+                align: 1,
+            },
+            WasmOp::I64Load8U {
+                offset: 0,
+                align: 1,
+            },
+            WasmOp::I64Load16S {
+                offset: 0,
+                align: 2,
+            },
+            WasmOp::I64Load16U {
+                offset: 0,
+                align: 2,
+            },
+            WasmOp::I64Load32S {
+                offset: 0,
+                align: 4,
+            },
+            WasmOp::I64Load32U {
+                offset: 0,
+                align: 4,
+            },
+            WasmOp::I64Store8 {
+                offset: 0,
+                align: 1,
+            },
+            WasmOp::I64Store16 {
+                offset: 0,
+                align: 2,
+            },
+            WasmOp::I64Store32 {
+                offset: 0,
+                align: 4,
+            },
+            WasmOp::MemorySize(0),
+            WasmOp::MemoryGrow(0),
+        ];
+
+        for op in &all_subword_ops {
+            let mut selector = InstructionSelector::new(db.rules().to_vec());
+            let result = selector.select(std::slice::from_ref(op));
+            assert!(
+                result.is_ok(),
+                "Sub-word/memory operation {op:?} should succeed but got error: {:?}",
+                result.err()
+            );
+        }
+    }
+
+    #[test]
+    fn test_subword_load_stack_mode() {
+        let db = RuleDatabase::new();
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+
+        // Test i32.load8_u in stack mode: local.get 0; i32.load8_u; end
+        let wasm_ops = vec![
+            WasmOp::LocalGet(0),
+            WasmOp::I32Load8U {
+                offset: 0,
+                align: 1,
+            },
+            WasmOp::End,
+        ];
+        let arm_instrs = selector.select_with_stack(&wasm_ops, 1).unwrap();
+
+        // Should contain at least one Ldrb instruction
+        let has_ldrb = arm_instrs
+            .iter()
+            .any(|i| matches!(&i.op, ArmOp::Ldrb { .. }));
+        assert!(has_ldrb, "Should contain LDRB instruction");
+    }
+
+    #[test]
+    fn test_subword_store_stack_mode() {
+        let db = RuleDatabase::new();
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+
+        // Test i32.store8 in stack mode: local.get 0; i32.const 42; i32.store8; end
+        let wasm_ops = vec![
+            WasmOp::LocalGet(0),
+            WasmOp::I32Const(42),
+            WasmOp::I32Store8 {
+                offset: 0,
+                align: 1,
+            },
+            WasmOp::End,
+        ];
+        let arm_instrs = selector.select_with_stack(&wasm_ops, 1).unwrap();
+
+        // Should contain a Strb instruction
+        let has_strb = arm_instrs
+            .iter()
+            .any(|i| matches!(&i.op, ArmOp::Strb { .. }));
+        assert!(has_strb, "Should contain STRB instruction");
+    }
+
+    #[test]
+    fn test_memory_size_stack_mode() {
+        let db = RuleDatabase::new();
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+
+        let wasm_ops = vec![WasmOp::MemorySize(0), WasmOp::End];
+        let arm_instrs = selector.select_with_stack(&wasm_ops, 0).unwrap();
+
+        let has_mem_size = arm_instrs
+            .iter()
+            .any(|i| matches!(&i.op, ArmOp::MemorySize { .. }));
+        assert!(has_mem_size, "Should contain MemorySize instruction");
+    }
+
+    #[test]
+    fn test_memory_grow_stack_mode() {
+        let db = RuleDatabase::new();
+        let mut selector = InstructionSelector::new(db.rules().to_vec());
+
+        // memory.grow pops 1 value (requested pages) from stack
+        let wasm_ops = vec![WasmOp::I32Const(1), WasmOp::MemoryGrow(0), WasmOp::End];
+        let arm_instrs = selector.select_with_stack(&wasm_ops, 0).unwrap();
+
+        let has_mem_grow = arm_instrs
+            .iter()
+            .any(|i| matches!(&i.op, ArmOp::MemoryGrow { .. }));
+        assert!(has_mem_grow, "Should contain MemoryGrow instruction");
+    }
 }
diff --git a/crates/synth-synthesis/src/rules.rs b/crates/synth-synthesis/src/rules.rs
index 77615ff..ef26804 100644
--- a/crates/synth-synthesis/src/rules.rs
+++ b/crates/synth-synthesis/src/rules.rs
@@ -248,6 +248,43 @@ pub enum ArmOp {
         addr: MemAddr,
     },
 
+    // Sub-word loads
+    Ldrb {
+        rd: Reg,
+        addr: MemAddr,
+    }, // Load byte, zero-extend
+    Ldrsb {
+        rd: Reg,
+        addr: MemAddr,
+    }, // Load byte, sign-extend
+    Ldrh {
+        rd: Reg,
+        addr: MemAddr,
+    }, // Load halfword, zero-extend
+    Ldrsh {
+        rd: Reg,
+        addr: MemAddr,
+    }, // Load halfword, sign-extend
+
+    // Sub-word stores
+    Strb {
+        rd: Reg,
+        addr: MemAddr,
+    }, // Store byte
+    Strh {
+        rd: Reg,
+        addr: MemAddr,
+    }, // Store halfword
+
+    // Memory management
+    MemorySize {
+        rd: Reg,
+    }, // Return current memory size in pages
+    MemoryGrow {
+        rd: Reg,
+        rn: Reg,
+    }, // Attempt to grow memory by rn pages, result in rd
+
     /// Label pseudo-instruction — marks a branch target position.
     /// Emits no machine code; used for branch offset resolution.
     Label {