mod.gno
14.59 Kb ยท 605 lines
1package uint256
2
3import (
4 "math/bits"
5)
6
7// Some utility functions
8
9// Reciprocal computes a 320-bit value representing 1/m
10//
11// Notes:
12// - specialized for m.arr[3] != 0, hence limited to 2^192 <= m < 2^256
13// - returns zero if m.arr[3] == 0
14// - starts with a 32-bit division, refines with newton-raphson iterations
15func Reciprocal(m *Uint) (mu [5]uint64) {
16 if m.arr[3] == 0 {
17 return mu
18 }
19
20 s := bits.LeadingZeros64(m.arr[3]) // Replace with leadingZeros(m) for general case
21 p := 255 - s // floor(log_2(m)), m>0
22
23 // 0 or a power of 2?
24
25 // Check if at least one bit is set in m.arr[2], m.arr[1] or m.arr[0],
26 // or at least two bits in m.arr[3]
27
28 if m.arr[0]|m.arr[1]|m.arr[2]|(m.arr[3]&(m.arr[3]-1)) == 0 {
29
30 mu[4] = ^uint64(0) >> uint(p&63)
31 mu[3] = ^uint64(0)
32 mu[2] = ^uint64(0)
33 mu[1] = ^uint64(0)
34 mu[0] = ^uint64(0)
35
36 return mu
37 }
38
39 // Maximise division precision by left-aligning divisor
40
41 var (
42 y Uint // left-aligned copy of m
43 r0 uint32 // estimate of 2^31/y
44 )
45
46 y.Lsh(m, uint(s)) // 1/2 < y < 1
47
48 // Extract most significant 32 bits
49
50 yh := uint32(y.arr[3] >> 32)
51
52 if yh == 0x80000000 { // Avoid overflow in division
53 r0 = 0xffffffff
54 } else {
55 r0, _ = bits.Div32(0x80000000, 0, yh)
56 }
57
58 // First iteration: 32 -> 64
59
60 t1 := uint64(r0) // 2^31/y
61 t1 *= t1 // 2^62/y^2
62 t1, _ = bits.Mul64(t1, y.arr[3]) // 2^62/y^2 * 2^64/y / 2^64 = 2^62/y
63
64 r1 := uint64(r0) << 32 // 2^63/y
65 r1 -= t1 // 2^63/y - 2^62/y = 2^62/y
66 r1 *= 2 // 2^63/y
67
68 if (r1 | (y.arr[3] << 1)) == 0 {
69 r1 = ^uint64(0)
70 }
71
72 // Second iteration: 64 -> 128
73
74 // square: 2^126/y^2
75 a2h, a2l := bits.Mul64(r1, r1)
76
77 // multiply by y: e2h:e2l:b2h = 2^126/y^2 * 2^128/y / 2^128 = 2^126/y
78 b2h, _ := bits.Mul64(a2l, y.arr[2])
79 c2h, c2l := bits.Mul64(a2l, y.arr[3])
80 d2h, d2l := bits.Mul64(a2h, y.arr[2])
81 e2h, e2l := bits.Mul64(a2h, y.arr[3])
82
83 b2h, c := bits.Add64(b2h, c2l, 0)
84 e2l, c = bits.Add64(e2l, c2h, c)
85 e2h, _ = bits.Add64(e2h, 0, c)
86
87 _, c = bits.Add64(b2h, d2l, 0)
88 e2l, c = bits.Add64(e2l, d2h, c)
89 e2h, _ = bits.Add64(e2h, 0, c)
90
91 // subtract: t2h:t2l = 2^127/y - 2^126/y = 2^126/y
92 t2l, b := bits.Sub64(0, e2l, 0)
93 t2h, _ := bits.Sub64(r1, e2h, b)
94
95 // double: r2h:r2l = 2^127/y
96 r2l, c := bits.Add64(t2l, t2l, 0)
97 r2h, _ := bits.Add64(t2h, t2h, c)
98
99 if (r2h | r2l | (y.arr[3] << 1)) == 0 {
100 r2h = ^uint64(0)
101 r2l = ^uint64(0)
102 }
103
104 // Third iteration: 128 -> 192
105
106 // square r2 (keep 256 bits): 2^190/y^2
107 a3h, a3l := bits.Mul64(r2l, r2l)
108 b3h, b3l := bits.Mul64(r2l, r2h)
109 c3h, c3l := bits.Mul64(r2h, r2h)
110
111 a3h, c = bits.Add64(a3h, b3l, 0)
112 c3l, c = bits.Add64(c3l, b3h, c)
113 c3h, _ = bits.Add64(c3h, 0, c)
114
115 a3h, c = bits.Add64(a3h, b3l, 0)
116 c3l, c = bits.Add64(c3l, b3h, c)
117 c3h, _ = bits.Add64(c3h, 0, c)
118
119 // multiply by y: q = 2^190/y^2 * 2^192/y / 2^192 = 2^190/y
120
121 x0 := a3l
122 x1 := a3h
123 x2 := c3l
124 x3 := c3h
125
126 var q0, q1, q2, q3, q4, t0 uint64
127
128 q0, _ = bits.Mul64(x2, y.arr[0])
129 q1, t0 = bits.Mul64(x3, y.arr[0])
130 q0, c = bits.Add64(q0, t0, 0)
131 q1, _ = bits.Add64(q1, 0, c)
132
133 t1, _ = bits.Mul64(x1, y.arr[1])
134 q0, c = bits.Add64(q0, t1, 0)
135 q2, t0 = bits.Mul64(x3, y.arr[1])
136 q1, c = bits.Add64(q1, t0, c)
137 q2, _ = bits.Add64(q2, 0, c)
138
139 t1, t0 = bits.Mul64(x2, y.arr[1])
140 q0, c = bits.Add64(q0, t0, 0)
141 q1, c = bits.Add64(q1, t1, c)
142 q2, _ = bits.Add64(q2, 0, c)
143
144 t1, t0 = bits.Mul64(x1, y.arr[2])
145 q0, c = bits.Add64(q0, t0, 0)
146 q1, c = bits.Add64(q1, t1, c)
147 q3, t0 = bits.Mul64(x3, y.arr[2])
148 q2, c = bits.Add64(q2, t0, c)
149 q3, _ = bits.Add64(q3, 0, c)
150
151 t1, _ = bits.Mul64(x0, y.arr[2])
152 q0, c = bits.Add64(q0, t1, 0)
153 t1, t0 = bits.Mul64(x2, y.arr[2])
154 q1, c = bits.Add64(q1, t0, c)
155 q2, c = bits.Add64(q2, t1, c)
156 q3, _ = bits.Add64(q3, 0, c)
157
158 t1, t0 = bits.Mul64(x1, y.arr[3])
159 q1, c = bits.Add64(q1, t0, 0)
160 q2, c = bits.Add64(q2, t1, c)
161 q4, t0 = bits.Mul64(x3, y.arr[3])
162 q3, c = bits.Add64(q3, t0, c)
163 q4, _ = bits.Add64(q4, 0, c)
164
165 t1, t0 = bits.Mul64(x0, y.arr[3])
166 q0, c = bits.Add64(q0, t0, 0)
167 q1, c = bits.Add64(q1, t1, c)
168 t1, t0 = bits.Mul64(x2, y.arr[3])
169 q2, c = bits.Add64(q2, t0, c)
170 q3, c = bits.Add64(q3, t1, c)
171 q4, _ = bits.Add64(q4, 0, c)
172
173 // subtract: t3 = 2^191/y - 2^190/y = 2^190/y
174 _, b = bits.Sub64(0, q0, 0)
175 _, b = bits.Sub64(0, q1, b)
176 t3l, b := bits.Sub64(0, q2, b)
177 t3m, b := bits.Sub64(r2l, q3, b)
178 t3h, _ := bits.Sub64(r2h, q4, b)
179
180 // double: r3 = 2^191/y
181 r3l, c := bits.Add64(t3l, t3l, 0)
182 r3m, c := bits.Add64(t3m, t3m, c)
183 r3h, _ := bits.Add64(t3h, t3h, c)
184
185 // Fourth iteration: 192 -> 320
186
187 // square r3
188
189 a4h, a4l := bits.Mul64(r3l, r3l)
190 b4h, b4l := bits.Mul64(r3l, r3m)
191 c4h, c4l := bits.Mul64(r3l, r3h)
192 d4h, d4l := bits.Mul64(r3m, r3m)
193 e4h, e4l := bits.Mul64(r3m, r3h)
194 f4h, f4l := bits.Mul64(r3h, r3h)
195
196 b4h, c = bits.Add64(b4h, c4l, 0)
197 e4l, c = bits.Add64(e4l, c4h, c)
198 e4h, _ = bits.Add64(e4h, 0, c)
199
200 a4h, c = bits.Add64(a4h, b4l, 0)
201 d4l, c = bits.Add64(d4l, b4h, c)
202 d4h, c = bits.Add64(d4h, e4l, c)
203 f4l, c = bits.Add64(f4l, e4h, c)
204 f4h, _ = bits.Add64(f4h, 0, c)
205
206 a4h, c = bits.Add64(a4h, b4l, 0)
207 d4l, c = bits.Add64(d4l, b4h, c)
208 d4h, c = bits.Add64(d4h, e4l, c)
209 f4l, c = bits.Add64(f4l, e4h, c)
210 f4h, _ = bits.Add64(f4h, 0, c)
211
212 // multiply by y
213
214 x1, x0 = bits.Mul64(d4h, y.arr[0])
215 x3, x2 = bits.Mul64(f4h, y.arr[0])
216 t1, t0 = bits.Mul64(f4l, y.arr[0])
217 x1, c = bits.Add64(x1, t0, 0)
218 x2, c = bits.Add64(x2, t1, c)
219 x3, _ = bits.Add64(x3, 0, c)
220
221 t1, t0 = bits.Mul64(d4h, y.arr[1])
222 x1, c = bits.Add64(x1, t0, 0)
223 x2, c = bits.Add64(x2, t1, c)
224 x4, t0 := bits.Mul64(f4h, y.arr[1])
225 x3, c = bits.Add64(x3, t0, c)
226 x4, _ = bits.Add64(x4, 0, c)
227 t1, t0 = bits.Mul64(d4l, y.arr[1])
228 x0, c = bits.Add64(x0, t0, 0)
229 x1, c = bits.Add64(x1, t1, c)
230 t1, t0 = bits.Mul64(f4l, y.arr[1])
231 x2, c = bits.Add64(x2, t0, c)
232 x3, c = bits.Add64(x3, t1, c)
233 x4, _ = bits.Add64(x4, 0, c)
234
235 t1, t0 = bits.Mul64(a4h, y.arr[2])
236 x0, c = bits.Add64(x0, t0, 0)
237 x1, c = bits.Add64(x1, t1, c)
238 t1, t0 = bits.Mul64(d4h, y.arr[2])
239 x2, c = bits.Add64(x2, t0, c)
240 x3, c = bits.Add64(x3, t1, c)
241 x5, t0 := bits.Mul64(f4h, y.arr[2])
242 x4, c = bits.Add64(x4, t0, c)
243 x5, _ = bits.Add64(x5, 0, c)
244 t1, t0 = bits.Mul64(d4l, y.arr[2])
245 x1, c = bits.Add64(x1, t0, 0)
246 x2, c = bits.Add64(x2, t1, c)
247 t1, t0 = bits.Mul64(f4l, y.arr[2])
248 x3, c = bits.Add64(x3, t0, c)
249 x4, c = bits.Add64(x4, t1, c)
250 x5, _ = bits.Add64(x5, 0, c)
251
252 t1, t0 = bits.Mul64(a4h, y.arr[3])
253 x1, c = bits.Add64(x1, t0, 0)
254 x2, c = bits.Add64(x2, t1, c)
255 t1, t0 = bits.Mul64(d4h, y.arr[3])
256 x3, c = bits.Add64(x3, t0, c)
257 x4, c = bits.Add64(x4, t1, c)
258 x6, t0 := bits.Mul64(f4h, y.arr[3])
259 x5, c = bits.Add64(x5, t0, c)
260 x6, _ = bits.Add64(x6, 0, c)
261 t1, t0 = bits.Mul64(a4l, y.arr[3])
262 x0, c = bits.Add64(x0, t0, 0)
263 x1, c = bits.Add64(x1, t1, c)
264 t1, t0 = bits.Mul64(d4l, y.arr[3])
265 x2, c = bits.Add64(x2, t0, c)
266 x3, c = bits.Add64(x3, t1, c)
267 t1, t0 = bits.Mul64(f4l, y.arr[3])
268 x4, c = bits.Add64(x4, t0, c)
269 x5, c = bits.Add64(x5, t1, c)
270 x6, _ = bits.Add64(x6, 0, c)
271
272 // subtract
273 _, b = bits.Sub64(0, x0, 0)
274 _, b = bits.Sub64(0, x1, b)
275 r4l, b := bits.Sub64(0, x2, b)
276 r4k, b := bits.Sub64(0, x3, b)
277 r4j, b := bits.Sub64(r3l, x4, b)
278 r4i, b := bits.Sub64(r3m, x5, b)
279 r4h, _ := bits.Sub64(r3h, x6, b)
280
281 // Multiply candidate for 1/4y by y, with full precision
282
283 x0 = r4l
284 x1 = r4k
285 x2 = r4j
286 x3 = r4i
287 x4 = r4h
288
289 q1, q0 = bits.Mul64(x0, y.arr[0])
290 q3, q2 = bits.Mul64(x2, y.arr[0])
291 q5, q4 := bits.Mul64(x4, y.arr[0])
292
293 t1, t0 = bits.Mul64(x1, y.arr[0])
294 q1, c = bits.Add64(q1, t0, 0)
295 q2, c = bits.Add64(q2, t1, c)
296 t1, t0 = bits.Mul64(x3, y.arr[0])
297 q3, c = bits.Add64(q3, t0, c)
298 q4, c = bits.Add64(q4, t1, c)
299 q5, _ = bits.Add64(q5, 0, c)
300
301 t1, t0 = bits.Mul64(x0, y.arr[1])
302 q1, c = bits.Add64(q1, t0, 0)
303 q2, c = bits.Add64(q2, t1, c)
304 t1, t0 = bits.Mul64(x2, y.arr[1])
305 q3, c = bits.Add64(q3, t0, c)
306 q4, c = bits.Add64(q4, t1, c)
307 q6, t0 := bits.Mul64(x4, y.arr[1])
308 q5, c = bits.Add64(q5, t0, c)
309 q6, _ = bits.Add64(q6, 0, c)
310
311 t1, t0 = bits.Mul64(x1, y.arr[1])
312 q2, c = bits.Add64(q2, t0, 0)
313 q3, c = bits.Add64(q3, t1, c)
314 t1, t0 = bits.Mul64(x3, y.arr[1])
315 q4, c = bits.Add64(q4, t0, c)
316 q5, c = bits.Add64(q5, t1, c)
317 q6, _ = bits.Add64(q6, 0, c)
318
319 t1, t0 = bits.Mul64(x0, y.arr[2])
320 q2, c = bits.Add64(q2, t0, 0)
321 q3, c = bits.Add64(q3, t1, c)
322 t1, t0 = bits.Mul64(x2, y.arr[2])
323 q4, c = bits.Add64(q4, t0, c)
324 q5, c = bits.Add64(q5, t1, c)
325 q7, t0 := bits.Mul64(x4, y.arr[2])
326 q6, c = bits.Add64(q6, t0, c)
327 q7, _ = bits.Add64(q7, 0, c)
328
329 t1, t0 = bits.Mul64(x1, y.arr[2])
330 q3, c = bits.Add64(q3, t0, 0)
331 q4, c = bits.Add64(q4, t1, c)
332 t1, t0 = bits.Mul64(x3, y.arr[2])
333 q5, c = bits.Add64(q5, t0, c)
334 q6, c = bits.Add64(q6, t1, c)
335 q7, _ = bits.Add64(q7, 0, c)
336
337 t1, t0 = bits.Mul64(x0, y.arr[3])
338 q3, c = bits.Add64(q3, t0, 0)
339 q4, c = bits.Add64(q4, t1, c)
340 t1, t0 = bits.Mul64(x2, y.arr[3])
341 q5, c = bits.Add64(q5, t0, c)
342 q6, c = bits.Add64(q6, t1, c)
343 q8, t0 := bits.Mul64(x4, y.arr[3])
344 q7, c = bits.Add64(q7, t0, c)
345 q8, _ = bits.Add64(q8, 0, c)
346
347 t1, t0 = bits.Mul64(x1, y.arr[3])
348 q4, c = bits.Add64(q4, t0, 0)
349 q5, c = bits.Add64(q5, t1, c)
350 t1, t0 = bits.Mul64(x3, y.arr[3])
351 q6, c = bits.Add64(q6, t0, c)
352 q7, c = bits.Add64(q7, t1, c)
353 q8, _ = bits.Add64(q8, 0, c)
354
355 // Final adjustment
356
357 // subtract q from 1/4
358 _, b = bits.Sub64(0, q0, 0)
359 _, b = bits.Sub64(0, q1, b)
360 _, b = bits.Sub64(0, q2, b)
361 _, b = bits.Sub64(0, q3, b)
362 _, b = bits.Sub64(0, q4, b)
363 _, b = bits.Sub64(0, q5, b)
364 _, b = bits.Sub64(0, q6, b)
365 _, b = bits.Sub64(0, q7, b)
366 _, b = bits.Sub64(uint64(1)<<62, q8, b)
367
368 // decrement the result
369 x0, t := bits.Sub64(r4l, 1, 0)
370 x1, t = bits.Sub64(r4k, 0, t)
371 x2, t = bits.Sub64(r4j, 0, t)
372 x3, t = bits.Sub64(r4i, 0, t)
373 x4, _ = bits.Sub64(r4h, 0, t)
374
375 // commit the decrement if the subtraction underflowed (reciprocal was too large)
376 if b != 0 {
377 r4h, r4i, r4j, r4k, r4l = x4, x3, x2, x1, x0
378 }
379
380 // Shift to correct bit alignment, truncating excess bits
381
382 p = (p & 63) - 1
383
384 x0, c = bits.Add64(r4l, r4l, 0)
385 x1, c = bits.Add64(r4k, r4k, c)
386 x2, c = bits.Add64(r4j, r4j, c)
387 x3, c = bits.Add64(r4i, r4i, c)
388 x4, _ = bits.Add64(r4h, r4h, c)
389
390 if p < 0 {
391 r4h, r4i, r4j, r4k, r4l = x4, x3, x2, x1, x0
392 p = 0 // avoid negative shift below
393 }
394
395 {
396 r := uint(p) // right shift
397 l := uint(64 - r) // left shift
398
399 x0 = (r4l >> r) | (r4k << l)
400 x1 = (r4k >> r) | (r4j << l)
401 x2 = (r4j >> r) | (r4i << l)
402 x3 = (r4i >> r) | (r4h << l)
403 x4 = (r4h >> r)
404 }
405
406 if p > 0 {
407 r4h, r4i, r4j, r4k, r4l = x4, x3, x2, x1, x0
408 }
409
410 mu[0] = r4l
411 mu[1] = r4k
412 mu[2] = r4j
413 mu[3] = r4i
414 mu[4] = r4h
415
416 return mu
417}
418
419// reduce4 computes the least non-negative residue of x modulo m
420//
421// requires a four-word modulus (m.arr[3] > 1) and its inverse (mu)
422func reduce4(x [8]uint64, m *Uint, mu [5]uint64) (z Uint) {
423 // NB: Most variable names in the comments match the pseudocode for
424 // Barrett reduction in the Handbook of Applied Cryptography.
425
426 // q1 = x/2^192
427
428 x0 := x[3]
429 x1 := x[4]
430 x2 := x[5]
431 x3 := x[6]
432 x4 := x[7]
433
434 // q2 = q1 * mu; q3 = q2 / 2^320
435
436 var q0, q1, q2, q3, q4, q5, t0, t1, c uint64
437
438 q0, _ = bits.Mul64(x3, mu[0])
439 q1, t0 = bits.Mul64(x4, mu[0])
440 q0, c = bits.Add64(q0, t0, 0)
441 q1, _ = bits.Add64(q1, 0, c)
442
443 t1, _ = bits.Mul64(x2, mu[1])
444 q0, c = bits.Add64(q0, t1, 0)
445 q2, t0 = bits.Mul64(x4, mu[1])
446 q1, c = bits.Add64(q1, t0, c)
447 q2, _ = bits.Add64(q2, 0, c)
448
449 t1, t0 = bits.Mul64(x3, mu[1])
450 q0, c = bits.Add64(q0, t0, 0)
451 q1, c = bits.Add64(q1, t1, c)
452 q2, _ = bits.Add64(q2, 0, c)
453
454 t1, t0 = bits.Mul64(x2, mu[2])
455 q0, c = bits.Add64(q0, t0, 0)
456 q1, c = bits.Add64(q1, t1, c)
457 q3, t0 = bits.Mul64(x4, mu[2])
458 q2, c = bits.Add64(q2, t0, c)
459 q3, _ = bits.Add64(q3, 0, c)
460
461 t1, _ = bits.Mul64(x1, mu[2])
462 q0, c = bits.Add64(q0, t1, 0)
463 t1, t0 = bits.Mul64(x3, mu[2])
464 q1, c = bits.Add64(q1, t0, c)
465 q2, c = bits.Add64(q2, t1, c)
466 q3, _ = bits.Add64(q3, 0, c)
467
468 t1, _ = bits.Mul64(x0, mu[3])
469 q0, c = bits.Add64(q0, t1, 0)
470 t1, t0 = bits.Mul64(x2, mu[3])
471 q1, c = bits.Add64(q1, t0, c)
472 q2, c = bits.Add64(q2, t1, c)
473 q4, t0 = bits.Mul64(x4, mu[3])
474 q3, c = bits.Add64(q3, t0, c)
475 q4, _ = bits.Add64(q4, 0, c)
476
477 t1, t0 = bits.Mul64(x1, mu[3])
478 q0, c = bits.Add64(q0, t0, 0)
479 q1, c = bits.Add64(q1, t1, c)
480 t1, t0 = bits.Mul64(x3, mu[3])
481 q2, c = bits.Add64(q2, t0, c)
482 q3, c = bits.Add64(q3, t1, c)
483 q4, _ = bits.Add64(q4, 0, c)
484
485 t1, t0 = bits.Mul64(x0, mu[4])
486 _, c = bits.Add64(q0, t0, 0)
487 q1, c = bits.Add64(q1, t1, c)
488 t1, t0 = bits.Mul64(x2, mu[4])
489 q2, c = bits.Add64(q2, t0, c)
490 q3, c = bits.Add64(q3, t1, c)
491 q5, t0 = bits.Mul64(x4, mu[4])
492 q4, c = bits.Add64(q4, t0, c)
493 q5, _ = bits.Add64(q5, 0, c)
494
495 t1, t0 = bits.Mul64(x1, mu[4])
496 q1, c = bits.Add64(q1, t0, 0)
497 q2, c = bits.Add64(q2, t1, c)
498 t1, t0 = bits.Mul64(x3, mu[4])
499 q3, c = bits.Add64(q3, t0, c)
500 q4, c = bits.Add64(q4, t1, c)
501 q5, _ = bits.Add64(q5, 0, c)
502
503 // Drop the fractional part of q3
504
505 q0 = q1
506 q1 = q2
507 q2 = q3
508 q3 = q4
509 q4 = q5
510
511 // r1 = x mod 2^320
512
513 x0 = x[0]
514 x1 = x[1]
515 x2 = x[2]
516 x3 = x[3]
517 x4 = x[4]
518
519 // r2 = q3 * m mod 2^320
520
521 var r0, r1, r2, r3, r4 uint64
522
523 r4, r3 = bits.Mul64(q0, m.arr[3])
524 _, t0 = bits.Mul64(q1, m.arr[3])
525 r4, _ = bits.Add64(r4, t0, 0)
526
527 t1, r2 = bits.Mul64(q0, m.arr[2])
528 r3, c = bits.Add64(r3, t1, 0)
529 _, t0 = bits.Mul64(q2, m.arr[2])
530 r4, _ = bits.Add64(r4, t0, c)
531
532 t1, t0 = bits.Mul64(q1, m.arr[2])
533 r3, c = bits.Add64(r3, t0, 0)
534 r4, _ = bits.Add64(r4, t1, c)
535
536 t1, r1 = bits.Mul64(q0, m.arr[1])
537 r2, c = bits.Add64(r2, t1, 0)
538 t1, t0 = bits.Mul64(q2, m.arr[1])
539 r3, c = bits.Add64(r3, t0, c)
540 r4, _ = bits.Add64(r4, t1, c)
541
542 t1, t0 = bits.Mul64(q1, m.arr[1])
543 r2, c = bits.Add64(r2, t0, 0)
544 r3, c = bits.Add64(r3, t1, c)
545 _, t0 = bits.Mul64(q3, m.arr[1])
546 r4, _ = bits.Add64(r4, t0, c)
547
548 t1, r0 = bits.Mul64(q0, m.arr[0])
549 r1, c = bits.Add64(r1, t1, 0)
550 t1, t0 = bits.Mul64(q2, m.arr[0])
551 r2, c = bits.Add64(r2, t0, c)
552 r3, c = bits.Add64(r3, t1, c)
553 _, t0 = bits.Mul64(q4, m.arr[0])
554 r4, _ = bits.Add64(r4, t0, c)
555
556 t1, t0 = bits.Mul64(q1, m.arr[0])
557 r1, c = bits.Add64(r1, t0, 0)
558 r2, c = bits.Add64(r2, t1, c)
559 t1, t0 = bits.Mul64(q3, m.arr[0])
560 r3, c = bits.Add64(r3, t0, c)
561 r4, _ = bits.Add64(r4, t1, c)
562
563 // r = r1 - r2
564
565 var b uint64
566
567 r0, b = bits.Sub64(x0, r0, 0)
568 r1, b = bits.Sub64(x1, r1, b)
569 r2, b = bits.Sub64(x2, r2, b)
570 r3, b = bits.Sub64(x3, r3, b)
571 r4, b = bits.Sub64(x4, r4, b)
572
573 // if r<0 then r+=m
574
575 if b != 0 {
576 r0, c = bits.Add64(r0, m.arr[0], 0)
577 r1, c = bits.Add64(r1, m.arr[1], c)
578 r2, c = bits.Add64(r2, m.arr[2], c)
579 r3, c = bits.Add64(r3, m.arr[3], c)
580 r4, _ = bits.Add64(r4, 0, c)
581 }
582
583 // while (r>=m) r-=m
584
585 for {
586 // q = r - m
587 q0, b = bits.Sub64(r0, m.arr[0], 0)
588 q1, b = bits.Sub64(r1, m.arr[1], b)
589 q2, b = bits.Sub64(r2, m.arr[2], b)
590 q3, b = bits.Sub64(r3, m.arr[3], b)
591 q4, b = bits.Sub64(r4, 0, b)
592
593 // if borrow break
594 if b != 0 {
595 break
596 }
597
598 // r = q
599 r4, r3, r2, r1, r0 = q4, q3, q2, q1, q0
600 }
601
602 z.arr[3], z.arr[2], z.arr[1], z.arr[0] = r3, r2, r1, r0
603
604 return z
605}