commit files needed for zpm-fetchurl

[zpackage] / tomsfastmath / src / mul / fp_mul_comba.c
diff --git a/tomsfastmath/src/mul/fp_mul_comba.c b/tomsfastmath/src/mul/fp_mul_comba.c

new file mode 100644 (file)

index 0000000..a4253fb
--- /dev/null
+++ b/tomsfastmath/src/mul/fp_mul_comba.c
@@ -0,0 +1,373 @@
+/* TomsFastMath, a fast ISO C bignum library.
+ *
+ * This project is meant to fill in where LibTomMath
+ * falls short.  That is speed ;-)
+ *
+ * This project is public domain and free for all purposes.
+ *
+ * Tom St Denis, tomstdenis@gmail.com
+ */
+
+/* About this file...
+
+*/
+
+#include <tfm_private.h>
+
+#if defined(TFM_PRESCOTT) && defined(TFM_SSE2)
+   #undef TFM_SSE2
+   #define TFM_X86
+#endif
+
+/* these are the combas.  Worship them. */
+#if defined(TFM_X86)
+/* Generic x86 optimized code */
+
+/* anything you need at the start */
+#define COMBA_START
+
+/* clear the chaining variables */
+#define COMBA_CLEAR \
+   c0 = c1 = c2 = 0;
+
+/* forward the carry to the next digit */
+#define COMBA_FORWARD \
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+/* store the first sum */
+#define COMBA_STORE(x) \
+   x = c0;
+
+/* store the second sum [carry] */
+#define COMBA_STORE2(x) \
+   x = c1;
+
+/* anything you need at the end */
+#define COMBA_FINI
+
+/* this should multiply i and j  */
+#define MULADD(i, j)                                      \
+asm(                                                      \
+     "movl  %6,%%eax     \n\t"                            \
+     "mull  %7           \n\t"                            \
+     "addl  %%eax,%0     \n\t"                            \
+     "adcl  %%edx,%1     \n\t"                            \
+     "adcl  $0,%2        \n\t"                            \
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");
+
+#elif defined(TFM_X86_64)
+/* x86-64 optimized */
+
+/* anything you need at the start */
+#define COMBA_START
+
+/* clear the chaining variables */
+#define COMBA_CLEAR \
+   c0 = c1 = c2 = 0;
+
+/* forward the carry to the next digit */
+#define COMBA_FORWARD \
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+/* store the first sum */
+#define COMBA_STORE(x) \
+   x = c0;
+
+/* store the second sum [carry] */
+#define COMBA_STORE2(x) \
+   x = c1;
+
+/* anything you need at the end */
+#define COMBA_FINI
+
+/* this should multiply i and j  */
+#define MULADD(i, j)                                      \
+asm  (                                                    \
+     "movq  %6,%%rax     \n\t"                            \
+     "mulq  %7           \n\t"                            \
+     "addq  %%rax,%0     \n\t"                            \
+     "adcq  %%rdx,%1     \n\t"                            \
+     "adcq  $0,%2        \n\t"                            \
+     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");
+
+#elif defined(TFM_SSE2)
+/* use SSE2 optimizations */
+
+/* anything you need at the start */
+#define COMBA_START
+
+/* clear the chaining variables */
+#define COMBA_CLEAR \
+   c0 = c1 = c2 = 0;
+
+/* forward the carry to the next digit */
+#define COMBA_FORWARD \
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+/* store the first sum */
+#define COMBA_STORE(x) \
+   x = c0;
+
+/* store the second sum [carry] */
+#define COMBA_STORE2(x) \
+   x = c1;
+
+/* anything you need at the end */
+#define COMBA_FINI \
+   asm("emms");
+
+/* this should multiply i and j  */
+#define MULADD(i, j)                                     \
+asm(                                                     \
+    "movd  %6,%%mm0     \n\t"                            \
+    "movd  %7,%%mm1     \n\t"                            \
+    "pmuludq %%mm1,%%mm0\n\t"                            \
+    "movd  %%mm0,%%eax  \n\t"                            \
+    "psrlq $32,%%mm0    \n\t"                            \
+    "addl  %%eax,%0     \n\t"                            \
+    "movd  %%mm0,%%eax  \n\t"                            \
+    "adcl  %%eax,%1     \n\t"                            \
+    "adcl  $0,%2        \n\t"                            \
+    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","cc");
+
+#elif defined(TFM_ARM)
+/* ARM code */
+
+#define COMBA_START
+
+#define COMBA_CLEAR \
+   c0 = c1 = c2 = 0;
+
+#define COMBA_FORWARD \
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_STORE(x) \
+   x = c0;
+
+#define COMBA_STORE2(x) \
+   x = c1;
+
+#define COMBA_FINI
+
+#define MULADD(i, j)                                          \
+asm(                                                          \
+"  UMULL  r0,r1,%6,%7           \n\t"                         \
+"  ADDS   %0,%0,r0              \n\t"                         \
+"  ADCS   %1,%1,r1              \n\t"                         \
+"  ADC    %2,%2,#0              \n\t"                         \
+:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
+
+#elif defined(TFM_PPC32)
+/* For 32-bit PPC */
+
+#define COMBA_START
+
+#define COMBA_CLEAR \
+   c0 = c1 = c2 = 0;
+
+#define COMBA_FORWARD \
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_STORE(x) \
+   x = c0;
+
+#define COMBA_STORE2(x) \
+   x = c1;
+
+#define COMBA_FINI
+
+/* untested: will mulhwu change the flags?  Docs say no */
+#define MULADD(i, j)              \
+asm(                              \
+   " mullw  16,%6,%7       \n\t" \
+   " addc   %0,%0,16       \n\t" \
+   " mulhwu 16,%6,%7       \n\t" \
+   " adde   %1,%1,16       \n\t" \
+   " addze  %2,%2          \n\t" \
+:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
+
+#elif defined(TFM_PPC64)
+/* For 64-bit PPC */
+
+#define COMBA_START
+
+#define COMBA_CLEAR \
+   c0 = c1 = c2 = 0;
+
+#define COMBA_FORWARD \
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_STORE(x) \
+   x = c0;
+
+#define COMBA_STORE2(x) \
+   x = c1;
+
+#define COMBA_FINI
+
+/* untested: will mulhdu change the flags?  Docs say no */
+#define MULADD(i, j)              \
+asm(                              \
+   " mulld  r16,%6,%7       \n\t" \
+   " addc   %0,%0,16       \n\t" \
+   " mulhdu r16,%6,%7       \n\t" \
+   " adde   %1,%1,16       \n\t" \
+   " addze  %2,%2          \n\t" \
+:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16");
+
+#elif defined(TFM_AVR32)
+
+/* ISO C code */
+
+#define COMBA_START
+
+#define COMBA_CLEAR \
+   c0 = c1 = c2 = 0;
+
+#define COMBA_FORWARD \
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_STORE(x) \
+   x = c0;
+
+#define COMBA_STORE2(x) \
+   x = c1;
+
+#define COMBA_FINI
+
+#define MULADD(i, j)             \
+asm(                             \
+   " mulu.d r2,%6,%7        \n\t"\
+   " add    %0,r2           \n\t"\
+   " adc    %1,%1,r3        \n\t"\
+   " acr    %2              \n\t"\
+:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3");
+
+#elif defined(TFM_MIPS)
+
+#define COMBA_START
+
+#define COMBA_CLEAR \
+   c0 = c1 = c2 = 0;
+
+#define COMBA_FORWARD \
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_STORE(x) \
+   x = c0;
+
+#define COMBA_STORE2(x) \
+   x = c1;
+
+#define COMBA_FINI
+
+#define MULADD(i, j)              \
+asm(                              \
+   " multu  %6,%7          \n\t"  \
+   " mflo   $12            \n\t"  \
+   " mfhi   $13            \n\t"  \
+   " addu    %0,%0,$12     \n\t"  \
+   " sltu   $12,%0,$12     \n\t"  \
+   " addu    %1,%1,$13     \n\t"  \
+   " sltu   $13,%1,$13     \n\t"  \
+   " addu    %1,%1,$12     \n\t"  \
+   " sltu   $12,%1,$12     \n\t"  \
+   " addu    %2,%2,$13     \n\t"  \
+   " addu    %2,%2,$12     \n\t"  \
+:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12","$13");
+
+#else
+/* ISO C code */
+
+#define COMBA_START
+
+#define COMBA_CLEAR \
+   c0 = c1 = c2 = 0;
+
+#define COMBA_FORWARD \
+   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_STORE(x) \
+   x = c0;
+
+#define COMBA_STORE2(x) \
+   x = c1;
+
+#define COMBA_FINI
+
+#define MULADD(i, j)                                    \
+   do { fp_word t;                                      \
+   t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j);       \
+   c0 = t;                                              \
+   t = (fp_word)c1 + (t >> DIGIT_BIT);                  \
+   c1 = t;                                              \
+   c2 += t >> DIGIT_BIT;                                \
+   } while (0);
+
+#endif
+
+#ifndef TFM_DEFINES
+
+/* generic PxQ multiplier */
+void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
+{
+   int       ix, iy, iz, tx, ty, pa;
+   fp_digit  c0, c1, c2, *tmpx, *tmpy;
+   fp_int    tmp, *dst;
+
+   COMBA_START;
+   COMBA_CLEAR;
+
+   /* get size of output and trim */
+   pa = A->used + B->used;
+   if (pa >= FP_SIZE) {
+      pa = FP_SIZE-1;
+   }
+
+   if (A == C || B == C) {
+      fp_zero(&tmp);
+      dst = &tmp;
+   } else {
+      fp_zero(C);
+      dst = C;
+   }
+
+   for (ix = 0; ix < pa; ix++) {
+      /* get offsets into the two bignums */
+      ty = MIN(ix, B->used-1);
+      tx = ix - ty;
+
+      /* setup temp aliases */
+      tmpx = A->dp + tx;
+      tmpy = B->dp + ty;
+
+      /* this is the number of times the loop will iterrate, essentially its
+         while (tx++ < a->used && ty-- >= 0) { ... }
+       */
+      iy = MIN(A->used-tx, ty+1);
+
+      /* execute loop */
+      COMBA_FORWARD;
+      for (iz = 0; iz < iy; ++iz) {
+          fp_digit _tmpx = *tmpx++;
+          fp_digit _tmpy = *tmpy--;
+          MULADD(_tmpx, _tmpy);
+      }
+
+      /* store term */
+      COMBA_STORE(dst->dp[ix]);
+  }
+  COMBA_FINI;
+
+  dst->used = pa;
+  dst->sign = A->sign ^ B->sign;
+  fp_clamp(dst);
+  fp_copy(dst, C);
+}
+
+#endif
+
+/* $Source$ */
+/* $Revision$ */
+/* $Date$ */
+