Skip to content

Commit b7f9e10

Browse files
committed
Explore prefetching to try to fix #446
1 parent 1a536d9 commit b7f9e10

File tree

7 files changed

+105
-51
lines changed

7 files changed

+105
-51
lines changed

benchmarks/bench_blueprint.nim

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ proc warmup*() =
4141
let stop = cpuTime()
4242
echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
4343

44-
warmup()
44+
# warmup()
4545

4646
when defined(gcc):
4747
echo "\nCompiled with GCC"

benchmarks/bench_ec_g1.nim

Lines changed: 35 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -26,54 +26,54 @@ import
2626
# ############################################################
2727

2828

29-
const Iters = 10_000
29+
const Iters = 10_000_000
3030
const MulIters = 100
3131
const AvailableCurves = [
3232
# P224,
33-
BN254_Nogami,
34-
BN254_Snarks,
33+
# BN254_Nogami,
34+
# BN254_Snarks,
3535
# Edwards25519,
3636
# P256,
3737
Secp256k1,
38-
Pallas,
39-
Vesta,
40-
BLS12_377,
41-
BLS12_381,
38+
# Pallas,
39+
# Vesta,
40+
# BLS12_377,
41+
# BLS12_381,
4242
]
4343

4444
proc main() =
4545
separator()
4646
staticFor i, 0, AvailableCurves.len:
4747
const curve = AvailableCurves[i]
4848
addBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
49-
addBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
50-
addBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
51-
mixedAddBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
52-
mixedAddBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
53-
mixedAddBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
54-
doublingBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
55-
doublingBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
56-
doublingBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
57-
separator()
58-
affFromProjBench(EC_ShortW_Prj[Fp[curve], G1], MulIters)
59-
affFromJacBench(EC_ShortW_Jac[Fp[curve], G1], MulIters)
60-
separator()
61-
for numPoints in [10, 100, 1000, 10000]:
62-
let batchIters = max(1, Iters div numPoints)
63-
affFromProjBatchBench(EC_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = false, batchIters)
64-
separator()
65-
for numPoints in [10, 100, 1000, 10000]:
66-
let batchIters = max(1, Iters div numPoints)
67-
affFromProjBatchBench(EC_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = true, batchIters)
68-
separator()
69-
for numPoints in [10, 100, 1000, 10000]:
70-
let batchIters = max(1, Iters div numPoints)
71-
affFromJacBatchBench(EC_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = false, batchIters)
72-
separator()
73-
for numPoints in [10, 100, 1000, 10000]:
74-
let batchIters = max(1, Iters div numPoints)
75-
affFromJacBatchBench(EC_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = true, batchIters)
76-
separator()
49+
# addBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
50+
# addBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
51+
# mixedAddBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
52+
# mixedAddBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
53+
# mixedAddBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
54+
# doublingBench(EC_ShortW_Prj[Fp[curve], G1], Iters)
55+
# doublingBench(EC_ShortW_Jac[Fp[curve], G1], Iters)
56+
# doublingBench(EC_ShortW_JacExt[Fp[curve], G1], Iters)
57+
# separator()
58+
# affFromProjBench(EC_ShortW_Prj[Fp[curve], G1], MulIters)
59+
# affFromJacBench(EC_ShortW_Jac[Fp[curve], G1], MulIters)
60+
# separator()
61+
# for numPoints in [10, 100, 1000, 10000]:
62+
# let batchIters = max(1, Iters div numPoints)
63+
# affFromProjBatchBench(EC_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = false, batchIters)
64+
# separator()
65+
# for numPoints in [10, 100, 1000, 10000]:
66+
# let batchIters = max(1, Iters div numPoints)
67+
# affFromProjBatchBench(EC_ShortW_Prj[Fp[curve], G1], numPoints, useBatching = true, batchIters)
68+
# separator()
69+
# for numPoints in [10, 100, 1000, 10000]:
70+
# let batchIters = max(1, Iters div numPoints)
71+
# affFromJacBatchBench(EC_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = false, batchIters)
72+
# separator()
73+
# for numPoints in [10, 100, 1000, 10000]:
74+
# let batchIters = max(1, Iters div numPoints)
75+
# affFromJacBatchBench(EC_ShortW_Jac[Fp[curve], G1], numPoints, useBatching = true, batchIters)
76+
# separator()
7777
separator()
7878

7979
main()

benchmarks/bench_elliptic_template.nim

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,9 @@ proc addBench*(EC: typedesc, iters: int) {.noinline.} =
8484
block:
8585
bench("EC Add " & $EC.G, EC, iters):
8686
r.sum(P, Q)
87-
block:
88-
bench("EC Add vartime " & $EC.G, EC, iters):
89-
r.sum_vartime(P, Q)
87+
# block:
88+
# bench("EC Add vartime " & $EC.G, EC, iters):
89+
# r.sum_vartime(P, Q)
9090

9191
proc mixedAddBench*(EC: typedesc, iters: int) {.noinline.} =
9292
var r {.noInit.}: EC

constantine/math/arithmetic/assembly/limbs_asm_modular_x86.nim

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ proc finalSubNoOverflowImpl*(
4141
if not a_in_scratch:
4242
ctx.mov scratch[0], a[0]
4343
ctx.sub scratch[0], M[0]
44+
# Combat cache-misses
45+
# https://github.com/mratsim/constantine/issues/446#issuecomment-2254258024
46+
ctx.prefetchw r
4447
for i in 1 ..< N:
4548
if not a_in_scratch:
4649
ctx.mov scratch[i], a[i]
@@ -75,6 +78,9 @@ proc finalSubMayOverflowImpl*(
7578
if not a_in_scratch:
7679
ctx.mov scratch[0], a[0]
7780
ctx.sub scratch[0], M[0]
81+
# Combat cache-misses
82+
# https://github.com/mratsim/constantine/issues/446#issuecomment-2254258024
83+
ctx.prefetchw r
7884
for i in 1 ..< N:
7985
if not a_in_scratch:
8086
ctx.mov scratch[i], a[i]
@@ -156,6 +162,9 @@ macro addmod_gen[N: static int](r_PIR: var Limbs[N], a_PIR, b_PIR, M_MEM: Limbs[
156162
# Addition
157163
ctx.add u[0], b[0]
158164
ctx.mov v[0], u[0]
165+
# Combat cache-misses
166+
# https://github.com/mratsim/constantine/issues/446#issuecomment-2254258024
167+
ctx.prefetcht0 M
159168
for i in 1 ..< N:
160169
ctx.adc u[i], b[i]
161170
# Interleaved copy in a second buffer as well
@@ -214,6 +223,10 @@ macro submod_gen[N: static int](r_PIR: var Limbs[N], a_PIR, b_PIR, M_MEM: Limbs[
214223
let underflowed = b.reuseRegister()
215224
ctx.sbb underflowed, underflowed
216225

226+
# Combat cache-misses
227+
# https://github.com/mratsim/constantine/issues/446#issuecomment-2254258024
228+
ctx.prefetchw r
229+
217230
# Now mask the adder, with 0 or the modulus limbs
218231
for i in 0 ..< N:
219232
ctx.`and` v[i], underflowed
@@ -264,6 +277,10 @@ macro negmod_gen[N: static int](r_PIR: var Limbs[N], a_MEM, M_MEM: Limbs[N]): un
264277
ctx.mov u[i], M[i]
265278
ctx.sbb u[i], a[i]
266279

280+
# Combat cache-misses
281+
# https://github.com/mratsim/constantine/issues/446#issuecomment-2254258024
282+
ctx.prefetchw r
283+
267284
# Deal with a == 0
268285
ctx.mov isZero, a[0]
269286
for i in 1 ..< N:

constantine/math/arithmetic/limbs_crandall.nim

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -189,10 +189,10 @@ func mulCranPartialReduce[N: static int](
189189
m: static int, c: static SecretWord) {.inline.} =
190190
when UseASM_X86_64 and a.len in {3..6}:
191191
# ADX implies BMI2
192-
if ({.noSideEffect.}: hasAdx()):
192+
# if ({.noSideEffect.}: hasAdx()):
193193
r.mulCranPartialReduce_asm_adx(a, b, m, c)
194-
else:
195-
r.mulCranPartialReduce_asm(a, b, m, c)
194+
# else:
195+
# r.mulCranPartialReduce_asm(a, b, m, c)
196196
else:
197197
var r2 {.noInit.}: Limbs[2*N]
198198
r2.prod(a, b)
@@ -208,10 +208,10 @@ func mulCran*[N: static int](
208208
r.mulCranPartialReduce(a, b, m, c)
209209
elif UseASM_X86_64 and a.len in {3..6}:
210210
# ADX implies BMI2
211-
if ({.noSideEffect.}: hasAdx()):
211+
# if ({.noSideEffect.}: hasAdx()):
212212
r.mulCran_asm_adx(a, b, p, m, c)
213-
else:
214-
r.mulCran_asm(a, b, p, m, c)
213+
# else:
214+
# r.mulCran_asm(a, b, p, m, c)
215215
else:
216216
var r2 {.noInit.}: Limbs[2*N]
217217
r2.prod(a, b)
@@ -224,10 +224,10 @@ func squareCranPartialReduce[N: static int](
224224
m: static int, c: static SecretWord) {.inline.} =
225225
when UseASM_X86_64 and a.len in {3..6}:
226226
# ADX implies BMI2
227-
if ({.noSideEffect.}: hasAdx()):
227+
# if ({.noSideEffect.}: hasAdx()):
228228
r.squareCranPartialReduce_asm_adx(a, m, c)
229-
else:
230-
r.squareCranPartialReduce_asm(a, m, c)
229+
# else:
230+
# r.squareCranPartialReduce_asm(a, m, c)
231231
else:
232232
var r2 {.noInit.}: Limbs[2*N]
233233
r2.square(a)
@@ -243,10 +243,10 @@ func squareCran*[N: static int](
243243
r.squareCranPartialReduce(a, m, c)
244244
elif UseASM_X86_64 and a.len in {3..6}:
245245
# ADX implies BMI2
246-
if ({.noSideEffect.}: hasAdx()):
246+
# if ({.noSideEffect.}: hasAdx()):
247247
r.squareCran_asm_adx(a, p, m, c)
248-
else:
249-
r.squareCran_asm(a, p, m, c)
248+
# else:
249+
# r.squareCran_asm(a, p, m, c)
250250
else:
251251
var r2 {.noInit.}: Limbs[2*N]
252252
r2.square(a)

constantine/platforms/isa_x86/macro_assembler_x86_att.nim

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -821,6 +821,18 @@ func setc*(a: var Assembler_x86, dst: Register) =
821821
a.code &= "setc " & Reg8Low[dst] & '\n'
822822
# No flags affected
823823

824+
func prefetcht0*(a: var Assembler_x86, mem: Operand or OperandArray) =
  ## Appends a `prefetcht0` instruction to the generated assembly.
  ##
  ## The instruction targets the first element of `mem` (`mem[0]`),
  ## rendered through `getStrOffset`.
  ## Per the x86 ISA, `prefetcht0` is a hint to bring the addressed
  ## cache line into all cache levels ahead of a read.
  a.code.add("prefetcht0 " & a.getStrOffset(mem[0]) & '\n')
  # No flags affected
829+
830+
func prefetchw*(a: var Assembler_x86, mem: Operand or OperandArray) =
  ## Appends a `prefetchw` instruction to the generated assembly.
  ##
  ## The instruction targets the first element of `mem` (`mem[0]`),
  ## rendered through `getStrOffset`.
  ## Per the x86 ISA, `prefetchw` is a hint to bring the addressed
  ## cache line into the caches in anticipation of a write.
  a.code.add("prefetchw " & a.getStrOffset(mem[0]) & '\n')
  # No flags affected
835+
824836
func add*(a: var Assembler_x86, dst, src: Operand) =
825837
## Does: dst <- dst + src
826838
doAssert dst.isOutput()

constantine/platforms/isa_x86/macro_assembler_x86_intel.nim

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -788,6 +788,31 @@ func setc*(a: var Assembler_x86, dst: Register) =
788788
a.code &= "setc " & Reg8Low[dst] & '\n'
789789
# No flags affected
790790

791+
func getPrefetchLoc(mem: Operand or OperandArray): string =
  ## Builds the Intel-syntax `BYTE ptr ...` operand text used by the
  ## prefetch instructions, from the first element of `mem` (`mem[0]`).
  ##
  ## The shape of the returned text depends on the operand's `desc.rm`:
  ## - `Mem` / `MemOffsettable`: `BYTE ptr %<asmId>`
  ## - `PointerInReg`, any of `SpecificRegisters`, or `ElemsInReg` for an
  ##   operand of kind `kFromArray`: `BYTE ptr [%<asmId>]`
  ## - `ClobberedReg`: `BYTE ptr [<asmId>]` (no `%` prefix)
  ##
  ## Any other operand description is reported through `error`.
  let first = mem[0]
  let desc = first.desc

  if desc.rm in {Mem, MemOffsettable}:
    result = "BYTE ptr %" & desc.asmId
  elif desc.rm == PointerInReg or
       desc.rm in SpecificRegisters or
       (desc.rm == ElemsInReg and first.kind == kFromArray):
    # The identifier is dereferenced, hence the brackets.
    result = "BYTE ptr [%" & desc.asmId & "]"
  elif desc.rm == ClobberedReg:
    result = "BYTE ptr [" & desc.asmId & "]"
  else:
    error("Unsupported memory operand type for prefetch: " & first.repr)
803+
804+
func prefetcht0*(a: var Assembler_x86, mem: Operand or OperandArray) =
  ## Appends a `prefetcht0` instruction (Intel syntax) to the generated
  ## assembly.
  ##
  ## The memory operand text comes from `getPrefetchLoc(mem)`.
  ## Per the x86 ISA, `prefetcht0` is a hint to bring the addressed
  ## cache line into all cache levels ahead of a read.
  a.code.add("prefetcht0 " & getPrefetchLoc(mem) & '\n')
  # No flags affected
809+
810+
func prefetchw*(a: var Assembler_x86, mem: Operand or OperandArray) =
  ## Appends a `prefetchw` instruction (Intel syntax) to the generated
  ## assembly.
  ##
  ## The memory operand text comes from `getPrefetchLoc(mem)`.
  ## Per the x86 ISA, `prefetchw` is a hint to bring the addressed
  ## cache line into the caches in anticipation of a write.
  a.code.add("prefetchw " & getPrefetchLoc(mem) & '\n')
  # No flags affected
815+
791816
func add*(a: var Assembler_x86, dst, src: Operand) =
792817
## Does: dst <- dst + src
793818
doAssert dst.isOutput()

0 commit comments

Comments
 (0)