Explore prefetching to try to fix #446

mratsim · mratsim · commit b7f9e1017abd · 2025-01-27T23:25:35.000+01:00
diff --git a/benchmarks/bench_blueprint.nim b/benchmarks/bench_blueprint.nim
@@ -41,7 +41,7 @@ proc warmup*() =
   let stop = cpuTime()
   echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
 
-warmup()
+# warmup()
 
 when defined(gcc):
   echo "\nCompiled with GCC"
diff --git a/benchmarks/bench_ec_g1.nim b/benchmarks/bench_ec_g1.nim
@@ -26,54 +26,54 @@ import
 # ############################################################
 
 
-const Iters = 10_000
+const Iters = 10_000_000
 const MulIters = 100
 const AvailableCurves = [
   # P224,
-  BN254_Nogami,
-  BN254_Snarks,
+  # BN254_Nogami,
+  # BN254_Snarks,
   # Edwards25519,
   # P256,
   Secp256k1,
-  Pallas,
-  Vesta,
-  BLS12_377,
-  BLS12_381,
+  # Pallas,
+  # Vesta,
+  # BLS12_377,
+  # BLS12_381,
 ]
 
 proc main() =
   separator()
   staticFor i, 0, AvailableCurves.len:
     const curve = AvailableCurves[i]
     addBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
-    addBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
-    addBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
-    mixedAddBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
-    mixedAddBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
-    mixedAddBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
-    doublingBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
-    doublingBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
-    doublingBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
-    separator()
-    affFromProjBench(EC_ShortW_Prj[Fp[curve], G1], MulIters)
-    affFromJacBench(EC_ShortW_Jac[Fp[curve], G1], MulIters)
-    separator()
-    for numPoints in [10, 100, 1000, 10000]:
-      let batchIters = max(1, Iters div numPoints)
-      affFromProjBatchBench(EC_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = false, batchIters)
-    separator()
-    for numPoints in [10, 100, 1000, 10000]:
-      let batchIters = max(1, Iters div numPoints)
-      affFromProjBatchBench(EC_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = true, batchIters)
-    separator()
-    for numPoints in [10, 100, 1000, 10000]:
-      let batchIters = max(1, Iters div numPoints)
-      affFromJacBatchBench(EC_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = false, batchIters)
-    separator()
-    for numPoints in [10, 100, 1000, 10000]:
-      let batchIters = max(1, Iters div numPoints)
-      affFromJacBatchBench(EC_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = true, batchIters)
-    separator()
+    # addBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
+    # addBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
+    # mixedAddBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
+    # mixedAddBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
+    # mixedAddBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
+    # doublingBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
+    # doublingBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
+    # doublingBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
+    # separator()
+    # affFromProjBench(EC_ShortW_Prj[Fp[curve], G1], MulIters)
+    # affFromJacBench(EC_ShortW_Jac[Fp[curve], G1], MulIters)
+    # separator()
+    # for numPoints in [10, 100, 1000, 10000]:
+    #   let batchIters = max(1, Iters div numPoints)
+    #   affFromProjBatchBench(EC_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = false, batchIters)
+    # separator()
+    # for numPoints in [10, 100, 1000, 10000]:
+    #   let batchIters = max(1, Iters div numPoints)
+    #   affFromProjBatchBench(EC_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = true, batchIters)
+    # separator()
+    # for numPoints in [10, 100, 1000, 10000]:
+    #   let batchIters = max(1, Iters div numPoints)
+    #   affFromJacBatchBench(EC_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = false, batchIters)
+    # separator()
+    # for numPoints in [10, 100, 1000, 10000]:
+    #   let batchIters = max(1, Iters div numPoints)
+    #   affFromJacBatchBench(EC_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = true, batchIters)
+    # separator()
     separator()
 
 main()
diff --git a/benchmarks/bench_elliptic_template.nim b/benchmarks/bench_elliptic_template.nim
@@ -84,9 +84,9 @@ proc addBench*(EC: typedesc, iters: int) {.noinline.} =
     block:
       bench("EC Add " & $EC.G, EC, iters):
         r.sum(P, Q)
-    block:
-      bench("EC Add vartime " & $EC.G, EC, iters):
-        r.sum_vartime(P, Q)
+    # block:
+    #   bench("EC Add vartime " & $EC.G, EC, iters):
+    #     r.sum_vartime(P, Q)
 
 proc mixedAddBench*(EC: typedesc, iters: int) {.noinline.} =
   var r {.noInit.}: EC
diff --git a/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim b/constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim
@@ -41,6 +41,9 @@ proc finalSubNoOverflowImpl*(
   if not a_in_scratch:
     ctx.mov scratch[0], a[0]
   ctx.sub scratch[0], M[0]
+  # Combat cache-misses
+  # https://github.com/mratsim/constantine/issues/446#issuecomment-2254258024
+  ctx.prefetchw r
   for i in 1 ..< N:
     if not a_in_scratch:
       ctx.mov scratch[i], a[i]
@@ -75,6 +78,9 @@ proc finalSubMayOverflowImpl*(
   if not a_in_scratch:
     ctx.mov scratch[0], a[0]
   ctx.sub scratch[0], M[0]
+  # Combat cache-misses
+  # https://github.com/mratsim/constantine/issues/446#issuecomment-2254258024
+  ctx.prefetchw r
   for i in 1 ..< N:
     if not a_in_scratch:
       ctx.mov scratch[i], a[i]
@@ -156,6 +162,9 @@ macro addmod_gen[N: static int](r_PIR: var Limbs[N], a_PIR, b_PIR, M_MEM: Limbs[
   # Addition
   ctx.add u[0], b[0]
   ctx.mov v[0], u[0]
+  # Combat cache-misses
+  # https://github.com/mratsim/constantine/issues/446#issuecomment-2254258024
+  ctx.prefetcht0 M
   for i in 1 ..< N:
     ctx.adc u[i], b[i]
     # Interleaved copy in a second buffer as well
@@ -214,6 +223,10 @@ macro submod_gen[N: static int](r_PIR: var Limbs[N], a_PIR, b_PIR, M_MEM: Limbs[
   let underflowed = b.reuseRegister()
   ctx.sbb underflowed, underflowed
 
+  # Combat cache-misses
+  # https://github.com/mratsim/constantine/issues/446#issuecomment-2254258024
+  ctx.prefetchw r
+
   # Now mask the adder, with 0 or the modulus limbs
   for i in 0 ..< N:
     ctx.`and` v[i], underflowed
@@ -264,6 +277,10 @@ macro negmod_gen[N: static int](r_PIR: var Limbs[N], a_MEM, M_MEM: Limbs[N]): un
     ctx.mov u[i], M[i]
     ctx.sbb u[i], a[i]
 
+  # Combat cache-misses
+  # https://github.com/mratsim/constantine/issues/446#issuecomment-2254258024
+  ctx.prefetchw r
+
   # Deal with a == 0
   ctx.mov isZero, a[0]
   for i in 1 ..< N:
diff --git a/constantine/math/arithmetic/limbs_crandall.nim b/constantine/math/arithmetic/limbs_crandall.nim
@@ -189,10 +189,10 @@ func mulCranPartialReduce[N: static int](
         m: static int, c: static SecretWord) {.inline.} =
   when UseASM_X86_64 and a.len in {3..6}:
     # ADX implies BMI2
-    if ({.noSideEffect.}: hasAdx()):
+    # if ({.noSideEffect.}: hasAdx()):
       r.mulCranPartialReduce_asm_adx(a, b, m, c)
-    else:
-      r.mulCranPartialReduce_asm(a, b, m, c)
+    # else:
+    #   r.mulCranPartialReduce_asm(a, b, m, c)
   else:
     var r2 {.noInit.}: Limbs[2*N]
     r2.prod(a, b)
@@ -208,10 +208,10 @@ func mulCran*[N: static int](
     r.mulCranPartialReduce(a, b, m, c)
   elif UseASM_X86_64 and a.len in {3..6}:
     # ADX implies BMI2
-    if ({.noSideEffect.}: hasAdx()):
+    # if ({.noSideEffect.}: hasAdx()):
       r.mulCran_asm_adx(a, b, p, m, c)
-    else:
-      r.mulCran_asm(a, b, p, m, c)
+    # else:
+    #   r.mulCran_asm(a, b, p, m, c)
   else:
     var r2 {.noInit.}: Limbs[2*N]
     r2.prod(a, b)
@@ -224,10 +224,10 @@ func squareCranPartialReduce[N: static int](
         m: static int, c: static SecretWord) {.inline.} =
   when UseASM_X86_64 and a.len in {3..6}:
     # ADX implies BMI2
-    if ({.noSideEffect.}: hasAdx()):
+    # if ({.noSideEffect.}: hasAdx()):
       r.squareCranPartialReduce_asm_adx(a, m, c)
-    else:
-      r.squareCranPartialReduce_asm(a, m, c)
+    # else:
+    #   r.squareCranPartialReduce_asm(a, m, c)
   else:
     var r2 {.noInit.}: Limbs[2*N]
     r2.square(a)
@@ -243,10 +243,10 @@ func squareCran*[N: static int](
     r.squareCranPartialReduce(a, m, c)
   elif UseASM_X86_64 and a.len in {3..6}:
     # ADX implies BMI2
-    if ({.noSideEffect.}: hasAdx()):
+    # if ({.noSideEffect.}: hasAdx()):
       r.squareCran_asm_adx(a, p, m, c)
-    else:
-      r.squareCran_asm(a, p, m, c)
+    # else:
+    #   r.squareCran_asm(a, p, m, c)
   else:
     var r2 {.noInit.}: Limbs[2*N]
     r2.square(a)
diff --git a/constantine/platforms/isa_x86/macro_assembler_x86_att.nim b/constantine/platforms/isa_x86/macro_assembler_x86_att.nim
@@ -821,6 +821,18 @@ func setc*(a: var Assembler_x86, dst: Register) =
   a.code &= "setc " & Reg8Low[dst] & '\n'
   # No flags affected
 
+func prefetcht0*(a: var Assembler_x86, mem: Operand or OperandArray) =
+  ## Append a `prefetcht0` instruction on `mem[0]` to `a.code`: a hint to fetch that memory into all cache levels for reading
+  let loc = a.getStrOffset(mem[0])  # operand text for the first element of `mem` only
+  a.code &= "prefetcht0 " & loc & '\n'
+  # No flags affected
+
+func prefetchw*(a: var Assembler_x86, mem: Operand or OperandArray) =
+  ## Append a `prefetchw` instruction on `mem[0]` to `a.code`: a hint to fetch that memory in anticipation of a write
+  let loc = a.getStrOffset(mem[0])  # operand text for the first element of `mem` only
+  a.code &= "prefetchw " & loc & '\n'
+  # No flags affected
+
 func add*(a: var Assembler_x86, dst, src: Operand) =
   ## Does: dst <- dst + src
   doAssert dst.isOutput()
diff --git a/constantine/platforms/isa_x86/macro_assembler_x86_intel.nim b/constantine/platforms/isa_x86/macro_assembler_x86_intel.nim
@@ -788,6 +788,31 @@ func setc*(a: var Assembler_x86, dst: Register) =
   a.code &= "setc " & Reg8Low[dst] & '\n'
   # No flags affected
 
+func getPrefetchLoc(mem: Operand or OperandArray): string = # Build the "BYTE ptr ..." operand text used by the prefetch emitters
+  let mem = mem[0]  # only the first element is used to form the location
+  if mem.desc.rm in {Mem, MemOffsettable}:
+    return "BYTE ptr %" & mem.desc.asmId  # operand id used as-is, no brackets added
+  elif mem.desc.rm == PointerInReg or
+      mem.desc.rm in SpecificRegisters or
+      (mem.desc.rm == ElemsInReg and mem.kind == kFromArray):
+    return "BYTE ptr [%" & mem.desc.asmId & "]"  # operand id wrapped in brackets
+  elif mem.desc.rm == ClobberedReg:
+    return "BYTE ptr [" & mem.desc.asmId & "]"  # bracketed without the `%` prefix
+  else:
+    error("Unsupported memory operand type for prefetch: " & mem.repr)  # every other `rm` kind is rejected
+
+func prefetcht0*(a: var Assembler_x86, mem: Operand or OperandArray) =
+  ## Append a `prefetcht0` instruction on `mem[0]` to `a.code`: a hint to fetch that memory into all cache levels for reading
+  let loc = getPrefetchLoc(mem)  # "BYTE ptr ..." operand text built from the first element of `mem`
+  a.code &= "prefetcht0 " & loc & '\n'
+  # No flags affected
+
+func prefetchw*(a: var Assembler_x86, mem: Operand or OperandArray) =
+  ## Append a `prefetchw` instruction on `mem[0]` to `a.code`: a hint to fetch that memory in anticipation of a write
+  let loc = getPrefetchLoc(mem)  # "BYTE ptr ..." operand text built from the first element of `mem`
+  a.code &= "prefetchw " & loc & '\n'
+  # No flags affected
+
 func add*(a: var Assembler_x86, dst, src: Operand) =
   ## Does: dst <- dst + src
   doAssert dst.isOutput()