/* TomsFastMath, a fast ISO C bignum library. * * This project is meant to fill in where LibTomMath * falls short. That is speed ;-) * * This project is public domain and free for all purposes. * * Tom St Denis, tomstdenis@gmail.com */ /* About this file... */ #include #if defined(TFM_PRESCOTT) && defined(TFM_SSE2) #undef TFM_SSE2 #define TFM_X86 #endif /* these are the combas. Worship them. */ #if defined(TFM_X86) /* Generic x86 optimized code */ /* anything you need at the start */ #define COMBA_START /* clear the chaining variables */ #define COMBA_CLEAR \ c0 = c1 = c2 = 0; /* forward the carry to the next digit */ #define COMBA_FORWARD \ do { c0 = c1; c1 = c2; c2 = 0; } while (0); /* store the first sum */ #define COMBA_STORE(x) \ x = c0; /* store the second sum [carry] */ #define COMBA_STORE2(x) \ x = c1; /* anything you need at the end */ #define COMBA_FINI /* this should multiply i and j */ #define MULADD(i, j) \ asm( \ "movl %6,%%eax \n\t" \ "mull %7 \n\t" \ "addl %%eax,%0 \n\t" \ "adcl %%edx,%1 \n\t" \ "adcl $0,%2 \n\t" \ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc"); #elif defined(TFM_X86_64) /* x86-64 optimized */ /* anything you need at the start */ #define COMBA_START /* clear the chaining variables */ #define COMBA_CLEAR \ c0 = c1 = c2 = 0; /* forward the carry to the next digit */ #define COMBA_FORWARD \ do { c0 = c1; c1 = c2; c2 = 0; } while (0); /* store the first sum */ #define COMBA_STORE(x) \ x = c0; /* store the second sum [carry] */ #define COMBA_STORE2(x) \ x = c1; /* anything you need at the end */ #define COMBA_FINI /* this should multiply i and j */ #define MULADD(i, j) \ asm ( \ "movq %6,%%rax \n\t" \ "mulq %7 \n\t" \ "addq %%rax,%0 \n\t" \ "adcq %%rdx,%1 \n\t" \ "adcq $0,%2 \n\t" \ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc"); #elif defined(TFM_SSE2) /* use SSE2 optimizations */ /* anything you need at the start */ #define COMBA_START /* clear the chaining variables */ #define COMBA_CLEAR \ c0 = c1 = c2 = 0; /* forward the carry to the next digit */ #define COMBA_FORWARD \ do { c0 = c1; c1 = c2; c2 = 0; } while (0); /* store the first sum */ #define COMBA_STORE(x) \ x = c0; /* store the second sum [carry] */ #define COMBA_STORE2(x) \ x = c1; /* anything you need at the end */ #define COMBA_FINI \ asm("emms"); /* this should multiply i and j */ #define MULADD(i, j) \ asm( \ "movd %6,%%mm0 \n\t" \ "movd %7,%%mm1 \n\t" \ "pmuludq %%mm1,%%mm0\n\t" \ "movd %%mm0,%%eax \n\t" \ "psrlq $32,%%mm0 \n\t" \ "addl %%eax,%0 \n\t" \ "movd %%mm0,%%eax \n\t" \ "adcl %%eax,%1 \n\t" \ "adcl $0,%2 \n\t" \ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","cc"); #elif defined(TFM_ARM) /* ARM code */ #define COMBA_START #define COMBA_CLEAR \ c0 = c1 = c2 = 0; #define COMBA_FORWARD \ do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_STORE(x) \ x = c0; #define COMBA_STORE2(x) \ x = c1; #define COMBA_FINI #define MULADD(i, j) \ asm( \ " UMULL r0,r1,%6,%7 \n\t" \ " ADDS %0,%0,r0 \n\t" \ " ADCS %1,%1,r1 \n\t" \ " ADC %2,%2,#0 \n\t" \ :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc"); #elif defined(TFM_PPC32) /* For 32-bit PPC */ #define COMBA_START #define COMBA_CLEAR \ c0 = c1 = c2 = 0; #define COMBA_FORWARD \ do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_STORE(x) \ x = c0; #define COMBA_STORE2(x) \ x = c1; #define COMBA_FINI /* untested: will mulhwu change the flags? Docs say no */ #define MULADD(i, j) \ asm( \ " mullw 16,%6,%7 \n\t" \ " addc %0,%0,16 \n\t" \ " mulhwu 16,%6,%7 \n\t" \ " adde %1,%1,16 \n\t" \ " addze %2,%2 \n\t" \ :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16"); #elif defined(TFM_PPC64) /* For 64-bit PPC */ #define COMBA_START #define COMBA_CLEAR \ c0 = c1 = c2 = 0; #define COMBA_FORWARD \ do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_STORE(x) \ x = c0; #define COMBA_STORE2(x) \ x = c1; #define COMBA_FINI /* untested: will mulhdu change the flags? Docs say no */ #define MULADD(i, j) \ asm( \ " mulld r16,%6,%7 \n\t" \ " addc %0,%0,16 \n\t" \ " mulhdu r16,%6,%7 \n\t" \ " adde %1,%1,16 \n\t" \ " addze %2,%2 \n\t" \ :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16"); #elif defined(TFM_AVR32) /* ISO C code */ #define COMBA_START #define COMBA_CLEAR \ c0 = c1 = c2 = 0; #define COMBA_FORWARD \ do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_STORE(x) \ x = c0; #define COMBA_STORE2(x) \ x = c1; #define COMBA_FINI #define MULADD(i, j) \ asm( \ " mulu.d r2,%6,%7 \n\t"\ " add %0,r2 \n\t"\ " adc %1,%1,r3 \n\t"\ " acr %2 \n\t"\ :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3"); #elif defined(TFM_MIPS) #define COMBA_START #define COMBA_CLEAR \ c0 = c1 = c2 = 0; #define COMBA_FORWARD \ do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_STORE(x) \ x = c0; #define COMBA_STORE2(x) \ x = c1; #define COMBA_FINI #define MULADD(i, j) \ asm( \ " multu %6,%7 \n\t" \ " mflo $12 \n\t" \ " mfhi $13 \n\t" \ " addu %0,%0,$12 \n\t" \ " sltu $12,%0,$12 \n\t" \ " addu %1,%1,$13 \n\t" \ " sltu $13,%1,$13 \n\t" \ " addu %1,%1,$12 \n\t" \ " sltu $12,%1,$12 \n\t" \ " addu %2,%2,$13 \n\t" \ " addu %2,%2,$12 \n\t" \ :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12","$13"); #else /* ISO C code */ #define COMBA_START #define COMBA_CLEAR \ c0 = c1 = c2 = 0; #define COMBA_FORWARD \ do { c0 = c1; c1 = c2; c2 = 0; } while (0); #define COMBA_STORE(x) \ x = c0; #define COMBA_STORE2(x) \ x = c1; #define COMBA_FINI #define MULADD(i, j) \ do { fp_word t; \ t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); \ c0 = t; \ t = (fp_word)c1 + (t >> DIGIT_BIT); \ c1 = t; \ c2 += t >> DIGIT_BIT; \ } while (0); #endif #ifndef TFM_DEFINES /* generic PxQ multiplier */ void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C) { int ix, iy, iz, tx, ty, pa; fp_digit c0, c1, c2, *tmpx, *tmpy; fp_int tmp, *dst; COMBA_START; COMBA_CLEAR; /* get size of output and trim */ pa = A->used + B->used; if (pa >= FP_SIZE) { pa = FP_SIZE-1; } if (A == C || B == C) { fp_zero(&tmp); dst = &tmp; } else { fp_zero(C); dst = C; } for (ix = 0; ix < pa; ix++) { /* get offsets into the two bignums */ ty = MIN(ix, B->used-1); tx = ix - ty; /* setup temp aliases */ tmpx = A->dp + tx; tmpy = B->dp + ty; /* this is the number of times the loop will iterrate, essentially its while (tx++ < a->used && ty-- >= 0) { ... } */ iy = MIN(A->used-tx, ty+1); /* execute loop */ COMBA_FORWARD; for (iz = 0; iz < iy; ++iz) { fp_digit _tmpx = *tmpx++; fp_digit _tmpy = *tmpy--; MULADD(_tmpx, _tmpy); } /* store term */ COMBA_STORE(dst->dp[ix]); } COMBA_FINI; dst->used = pa; dst->sign = A->sign ^ B->sign; fp_clamp(dst); fp_copy(dst, C); } #endif /* $Source$ */ /* $Revision$ */ /* $Date$ */