3 * This project is meant to fill in where LibTomMath
4 * falls short. That is speed ;-)
6 * This project is public domain and free for all purposes.
8 * Tom St Denis, tomstdenis@gmail.com
10 #include <tfm_private.h>
12 #if defined(TFM_PRESCOTT) && defined(TFM_SSE2)
19 /* x86-32 optimized */
26 #define COMBA_STORE(x) \
29 #define COMBA_STORE2(x) \
32 #define CARRY_FORWARD \
33 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
37 #define SQRADD(i, j) \
39 "movl %6,%%eax \n\t" \
41 "addl %%eax,%0 \n\t" \
42 "adcl %%edx,%1 \n\t" \
44 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc");
46 #define SQRADD2(i, j) \
48 "movl %6,%%eax \n\t" \
50 "addl %%eax,%0 \n\t" \
51 "adcl %%edx,%1 \n\t" \
53 "addl %%eax,%0 \n\t" \
54 "adcl %%edx,%1 \n\t" \
56 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
58 #define SQRADDSC(i, j) \
60 "movl %3,%%eax \n\t" \
62 "movl %%eax,%0 \n\t" \
63 "movl %%edx,%1 \n\t" \
65 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","cc");
67 #define SQRADDAC(i, j) \
69 "movl %6,%%eax \n\t" \
71 "addl %%eax,%0 \n\t" \
72 "adcl %%edx,%1 \n\t" \
74 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");
84 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
86 #elif defined(TFM_X86_64)
87 /* x86-64 optimized */
94 #define COMBA_STORE(x) \
97 #define COMBA_STORE2(x) \
100 #define CARRY_FORWARD \
101 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
105 #define SQRADD(i, j) \
107 "movq %6,%%rax \n\t" \
109 "addq %%rax,%0 \n\t" \
110 "adcq %%rdx,%1 \n\t" \
112 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "x"(i) :"%rax","%rdx","cc");
114 #define SQRADD2(i, j) \
116 "movq %6,%%rax \n\t" \
118 "addq %%rax,%0 \n\t" \
119 "adcq %%rdx,%1 \n\t" \
121 "addq %%rax,%0 \n\t" \
122 "adcq %%rdx,%1 \n\t" \
124 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
126 #define SQRADDSC(i, j) \
128 "movq %3,%%rax \n\t" \
130 "movq %%rax,%0 \n\t" \
131 "movq %%rdx,%1 \n\t" \
133 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc");
135 #define SQRADDAC(i, j) \
137 "movq %6,%%rax \n\t" \
139 "addq %%rax,%0 \n\t" \
140 "adcq %%rdx,%1 \n\t" \
142 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
152 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
154 #elif defined(TFM_SSE2)
159 #define CLEAR_CARRY \
162 #define COMBA_STORE(x) \
165 #define COMBA_STORE2(x) \
168 #define CARRY_FORWARD \
169 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
174 #define SQRADD(i, j) \
176 "movd %6,%%mm0 \n\t" \
177 "pmuludq %%mm0,%%mm0\n\t" \
178 "movd %%mm0,%%eax \n\t" \
179 "psrlq $32,%%mm0 \n\t" \
180 "addl %%eax,%0 \n\t" \
181 "movd %%mm0,%%eax \n\t" \
182 "adcl %%eax,%1 \n\t" \
184 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc");
186 #define SQRADD2(i, j) \
188 "movd %6,%%mm0 \n\t" \
189 "movd %7,%%mm1 \n\t" \
190 "pmuludq %%mm1,%%mm0\n\t" \
191 "movd %%mm0,%%eax \n\t" \
192 "psrlq $32,%%mm0 \n\t" \
193 "movd %%mm0,%%edx \n\t" \
194 "addl %%eax,%0 \n\t" \
195 "adcl %%edx,%1 \n\t" \
197 "addl %%eax,%0 \n\t" \
198 "adcl %%edx,%1 \n\t" \
200 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
202 #define SQRADDSC(i, j) \
204 "movd %6,%%mm0 \n\t" \
205 "movd %7,%%mm1 \n\t" \
206 "pmuludq %%mm1,%%mm0\n\t" \
207 "movd %%mm0,%0 \n\t" \
208 "psrlq $32,%%mm0 \n\t" \
209 "movd %%mm0,%1 \n\t" \
211 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j));
213 #define SQRADDAC(i, j) \
215 "movd %6,%%mm0 \n\t" \
216 "movd %7,%%mm1 \n\t" \
217 "pmuludq %%mm1,%%mm0\n\t" \
218 "movd %%mm0,%%eax \n\t" \
219 "psrlq $32,%%mm0 \n\t" \
220 "movd %%mm0,%%edx \n\t" \
221 "addl %%eax,%0 \n\t" \
222 "adcl %%edx,%1 \n\t" \
224 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","cc");
234 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
236 #elif defined(TFM_ARM)
242 #define CLEAR_CARRY \
245 #define COMBA_STORE(x) \
248 #define COMBA_STORE2(x) \
251 #define CARRY_FORWARD \
252 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
256 /* multiplies point i and j, updates carry "c1" and digit c2 */
257 #define SQRADD(i, j) \
259 " UMULL r0,r1,%6,%6 \n\t" \
260 " ADDS %0,%0,r0 \n\t" \
261 " ADCS %1,%1,r1 \n\t" \
262 " ADC %2,%2,#0 \n\t" \
263 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");
265 /* for squaring some of the terms are doubled... */
266 #define SQRADD2(i, j) \
268 " UMULL r0,r1,%6,%7 \n\t" \
269 " ADDS %0,%0,r0 \n\t" \
270 " ADCS %1,%1,r1 \n\t" \
271 " ADC %2,%2,#0 \n\t" \
272 " ADDS %0,%0,r0 \n\t" \
273 " ADCS %1,%1,r1 \n\t" \
274 " ADC %2,%2,#0 \n\t" \
275 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
277 #define SQRADDSC(i, j) \
279 " UMULL %0,%1,%6,%7 \n\t" \
280 " SUB %2,%2,%2 \n\t" \
281 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "cc");
283 #define SQRADDAC(i, j) \
285 " UMULL r0,r1,%6,%7 \n\t" \
286 " ADDS %0,%0,r0 \n\t" \
287 " ADCS %1,%1,r1 \n\t" \
288 " ADC %2,%2,#0 \n\t" \
289 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");
293 " ADDS %0,%0,%3 \n\t" \
294 " ADCS %1,%1,%4 \n\t" \
295 " ADC %2,%2,%5 \n\t" \
296 " ADDS %0,%0,%3 \n\t" \
297 " ADCS %1,%1,%4 \n\t" \
298 " ADC %2,%2,%5 \n\t" \
299 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
301 #elif defined(TFM_PPC32)
307 #define CLEAR_CARRY \
310 #define COMBA_STORE(x) \
313 #define COMBA_STORE2(x) \
316 #define CARRY_FORWARD \
317 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
321 /* multiplies point i and j, updates carry "c1" and digit c2 */
322 #define SQRADD(i, j) \
324 " mullw 16,%6,%6 \n\t" \
325 " addc %0,%0,16 \n\t" \
326 " mulhwu 16,%6,%6 \n\t" \
327 " adde %1,%1,16 \n\t" \
328 " addze %2,%2 \n\t" \
329 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");
331 /* for squaring some of the terms are doubled... */
332 #define SQRADD2(i, j) \
334 " mullw 16,%6,%7 \n\t" \
335 " mulhwu 17,%6,%7 \n\t" \
336 " addc %0,%0,16 \n\t" \
337 " adde %1,%1,17 \n\t" \
338 " addze %2,%2 \n\t" \
339 " addc %0,%0,16 \n\t" \
340 " adde %1,%1,17 \n\t" \
341 " addze %2,%2 \n\t" \
342 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");
344 #define SQRADDSC(i, j) \
346 " mullw %0,%6,%7 \n\t" \
347 " mulhwu %1,%6,%7 \n\t" \
348 " xor %2,%2,%2 \n\t" \
349 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
351 #define SQRADDAC(i, j) \
353 " mullw 16,%6,%7 \n\t" \
354 " addc %0,%0,16 \n\t" \
355 " mulhwu 16,%6,%7 \n\t" \
356 " adde %1,%1,16 \n\t" \
357 " addze %2,%2 \n\t" \
358 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");
362 " addc %0,%0,%3 \n\t" \
363 " adde %1,%1,%4 \n\t" \
364 " adde %2,%2,%5 \n\t" \
365 " addc %0,%0,%3 \n\t" \
366 " adde %1,%1,%4 \n\t" \
367 " adde %2,%2,%5 \n\t" \
368 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
370 #elif defined(TFM_PPC64)
375 #define CLEAR_CARRY \
378 #define COMBA_STORE(x) \
381 #define COMBA_STORE2(x) \
384 #define CARRY_FORWARD \
385 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
389 /* multiplies point i and j, updates carry "c1" and digit c2 */
390 #define SQRADD(i, j) \
392 " mulld r16,%6,%6 \n\t" \
393 " addc %0,%0,r16 \n\t" \
394 " mulhdu r16,%6,%6 \n\t" \
395 " adde %1,%1,r16 \n\t" \
396 " addze %2,%2 \n\t" \
397 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r16","cc");
399 /* for squaring some of the terms are doubled... */
400 #define SQRADD2(i, j) \
402 " mulld r16,%6,%7 \n\t" \
403 " mulhdu r17,%6,%7 \n\t" \
404 " addc %0,%0,r16 \n\t" \
405 " adde %1,%1,r17 \n\t" \
406 " addze %2,%2 \n\t" \
407 " addc %0,%0,r16 \n\t" \
408 " adde %1,%1,r17 \n\t" \
409 " addze %2,%2 \n\t" \
410 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16", "r17","cc");
412 #define SQRADDSC(i, j) \
414 " mulld %0,%6,%7 \n\t" \
415 " mulhdu %1,%6,%7 \n\t" \
416 " xor %2,%2,%2 \n\t" \
417 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
419 #define SQRADDAC(i, j) \
421 " mulld r16,%6,%7 \n\t" \
422 " addc %0,%0,r16 \n\t" \
423 " mulhdu r16,%6,%7 \n\t" \
424 " adde %1,%1,r16 \n\t" \
425 " addze %2,%2 \n\t" \
426 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r16", "cc");
430 " addc %0,%0,%3 \n\t" \
431 " adde %1,%1,%4 \n\t" \
432 " adde %2,%2,%5 \n\t" \
433 " addc %0,%0,%3 \n\t" \
434 " adde %1,%1,%4 \n\t" \
435 " adde %2,%2,%5 \n\t" \
436 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
439 #elif defined(TFM_AVR32)
445 #define CLEAR_CARRY \
448 #define COMBA_STORE(x) \
451 #define COMBA_STORE2(x) \
454 #define CARRY_FORWARD \
455 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
459 /* multiplies point i and j, updates carry "c1" and digit c2 */
460 #define SQRADD(i, j) \
462 " mulu.d r2,%6,%6 \n\t" \
463 " add %0,%0,r2 \n\t" \
464 " adc %1,%1,r3 \n\t" \
466 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3");
468 /* for squaring some of the terms are doubled... */
469 #define SQRADD2(i, j) \
471 " mulu.d r2,%6,%7 \n\t" \
472 " add %0,%0,r2 \n\t" \
473 " adc %1,%1,r3 \n\t" \
475 " add %0,%0,r2 \n\t" \
476 " adc %1,%1,r3 \n\t" \
478 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3");
480 #define SQRADDSC(i, j) \
482 " mulu.d r2,%6,%7 \n\t" \
486 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3");
488 #define SQRADDAC(i, j) \
490 " mulu.d r2,%6,%7 \n\t" \
491 " add %0,%0,r2 \n\t" \
492 " adc %1,%1,r3 \n\t" \
494 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3");
498 " add %0,%0,%3 \n\t" \
499 " adc %1,%1,%4 \n\t" \
500 " adc %2,%2,%5 \n\t" \
501 " add %0,%0,%3 \n\t" \
502 " adc %1,%1,%4 \n\t" \
503 " adc %2,%2,%5 \n\t" \
504 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
506 #elif defined(TFM_MIPS)
512 #define CLEAR_CARRY \
515 #define COMBA_STORE(x) \
518 #define COMBA_STORE2(x) \
521 #define CARRY_FORWARD \
522 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
526 /* multiplies point i and j, updates carry "c1" and digit c2 */
527 #define SQRADD(i, j) \
529 " multu %6,%6 \n\t" \
532 " addu %0,%0,$12 \n\t" \
533 " sltu $12,%0,$12 \n\t" \
534 " addu %1,%1,$13 \n\t" \
535 " sltu $13,%1,$13 \n\t" \
536 " addu %1,%1,$12 \n\t" \
537 " sltu $12,%1,$12 \n\t" \
538 " addu %2,%2,$13 \n\t" \
539 " addu %2,%2,$12 \n\t" \
540 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"$12","$13");
542 /* for squaring some of the terms are doubled... */
543 #define SQRADD2(i, j) \
545 " multu %6,%7 \n\t" \
549 " addu %0,%0,$12 \n\t" \
550 " sltu $14,%0,$12 \n\t" \
551 " addu %1,%1,$13 \n\t" \
552 " sltu $15,%1,$13 \n\t" \
553 " addu %1,%1,$14 \n\t" \
554 " sltu $14,%1,$14 \n\t" \
555 " addu %2,%2,$15 \n\t" \
556 " addu %2,%2,$14 \n\t" \
558 " addu %0,%0,$12 \n\t" \
559 " sltu $14,%0,$12 \n\t" \
560 " addu %1,%1,$13 \n\t" \
561 " sltu $15,%1,$13 \n\t" \
562 " addu %1,%1,$14 \n\t" \
563 " sltu $14,%1,$14 \n\t" \
564 " addu %2,%2,$15 \n\t" \
565 " addu %2,%2,$14 \n\t" \
566 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12", "$13", "$14", "$15");
568 #define SQRADDSC(i, j) \
570 " multu %6,%7 \n\t" \
573 " xor %2,%2,%2 \n\t" \
574 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
576 #define SQRADDAC(i, j) \
578 " multu %6,%7 \n\t" \
581 " addu %0,%0,$12 \n\t" \
582 " sltu $12,%0,$12 \n\t" \
583 " addu %1,%1,$13 \n\t" \
584 " sltu $13,%1,$13 \n\t" \
585 " addu %1,%1,$12 \n\t" \
586 " sltu $12,%1,$12 \n\t" \
587 " addu %2,%2,$13 \n\t" \
588 " addu %2,%2,$12 \n\t" \
589 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"$12", "$13", "$14");
593 " addu %0,%0,%3 \n\t" \
594 " sltu $10,%0,%3 \n\t" \
595 " addu %1,%1,$10 \n\t" \
596 " sltu $10,%1,$10 \n\t" \
597 " addu %1,%1,%4 \n\t" \
598 " sltu $11,%1,%4 \n\t" \
599 " addu %2,%2,$10 \n\t" \
600 " addu %2,%2,$11 \n\t" \
601 " addu %2,%2,%5 \n\t" \
603 " addu %0,%0,%3 \n\t" \
604 " sltu $10,%0,%3 \n\t" \
605 " addu %1,%1,$10 \n\t" \
606 " sltu $10,%1,$10 \n\t" \
607 " addu %1,%1,%4 \n\t" \
608 " sltu $11,%1,%4 \n\t" \
609 " addu %2,%2,$10 \n\t" \
610 " addu %2,%2,$11 \n\t" \
611 " addu %2,%2,%5 \n\t" \
612 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "$10", "$11");
618 /* ISO C portable code */
622 #define CLEAR_CARRY \
625 #define COMBA_STORE(x) \
628 #define COMBA_STORE2(x) \
631 #define CARRY_FORWARD \
632 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
636 /* multiplies point i and j, updates carry "c1" and digit c2 */
637 #define SQRADD(i, j) \
639 t = c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \
640 t = c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \
644 /* for squaring some of the terms are doubled... */
645 #define SQRADD2(i, j) \
647 t = ((fp_word)i) * ((fp_word)j); \
648 tt = (fp_word)c0 + t; c0 = tt; \
649 tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \
650 tt = (fp_word)c0 + t; c0 = tt; \
651 tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \
654 #define SQRADDSC(i, j) \
656 t = ((fp_word)i) * ((fp_word)j); \
657 sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0; \
660 #define SQRADDAC(i, j) \
662 t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = t; \
663 t = sc1 + (t >> DIGIT_BIT); sc1 = t; sc2 += t >> DIGIT_BIT; \
668 t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = t; \
669 t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); c1 = t; \
670 c2 = c2 + ((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT); \