--- /dev/null
+/* TomsFastMath, a fast ISO C bignum library.
+ *
+ * This project is meant to fill in where LibTomMath
+ * falls short. That is speed ;-)
+ *
+ * This project is public domain and free for all purposes.
+ *
+ * Tom St Denis, tomstdenis@gmail.com
+ */
+#include <tfm_private.h>
+
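+/* Each port below supplies the same small set of primitives that
+ * fp_montgomery_reduce() at the bottom of this file stitches together:
+ *
+ *   MONT_START  - per-call setup (e.g. the SSE2 port loads mp into mm2)
+ *   LOOP_START  - compute mu = c[x] * mp (mod b), b = 2^DIGIT_BIT; since
+ *                 mp == -1/m (mod b) (as produced by fp_montgomery_setup()),
+ *                 adding mu*m clears digit c[x]
+ *   INNERMUL    - one column: _c[0] += mu * (*tmpm++) + cy, with the high
+ *                 half of the product becoming the new cy
+ *   INNERMUL8   - optional eight-column unrolled variant of INNERMUL
+ *   PROPCARRY   - fold the one-digit carry cy into the next digit of _c
+ *   LOOP_END    - per-row teardown (e.g. the SSE2 port spills mm3 to cy)
+ *   MONT_FINI   - per-call teardown (e.g. emms for the SSE2/MMX port)
+ */
+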
+/******************************************************************/
+#if defined(TFM_X86) && !defined(TFM_SSE2)
+/* x86-32 code */
+
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+ mu = c[x] * mp
+
+#define INNERMUL \
+asm( \
+ "movl %5,%%eax \n\t" \
+ "mull %4 \n\t" \
+ "addl %1,%%eax \n\t" \
+ "adcl $0,%%edx \n\t" \
+ "addl %%eax,%0 \n\t" \
+ "adcl $0,%%edx \n\t" \
+ "movl %%edx,%1 \n\t" \
+:"=g"(_c[LO]), "=r"(cy) \
+:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
+: "%eax", "%edx", "cc")
+
+#define PROPCARRY \
+asm( \
+ "addl %1,%0 \n\t" \
+ "setb %%al \n\t" \
+ "movzbl %%al,%1 \n\t" \
+:"=g"(_c[LO]), "=r"(cy) \
+:"0"(_c[LO]), "1"(cy) \
+: "%eax", "cc")
+
+/******************************************************************/
+#elif defined(TFM_X86_64)
+/* x86-64 code */
+
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+ mu = c[x] * mp
+
+#define INNERMUL \
+asm( \
+ "movq %5,%%rax \n\t" \
+ "mulq %4 \n\t" \
+ "addq %1,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "addq %%rax,%0 \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rdx,%1 \n\t" \
+:"=g"(_c[LO]), "=r"(cy) \
+:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
+: "%rax", "%rdx", "cc")
+
+#define INNERMUL8 \
+ asm( \
+ "movq 0(%5),%%rax \n\t" \
+ "movq 0(%2),%%r10 \n\t" \
+ "movq 0x8(%5),%%r11 \n\t" \
+ "mulq %4 \n\t" \
+ "addq %%r10,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq 0x8(%2),%%r10 \n\t" \
+ "addq %3,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rax,0(%0) \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ \
+ "movq %%r11,%%rax \n\t" \
+ "movq 0x10(%5),%%r11 \n\t" \
+ "mulq %4 \n\t" \
+ "addq %%r10,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq 0x10(%2),%%r10 \n\t" \
+ "addq %3,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rax,0x8(%0) \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ \
+ "movq %%r11,%%rax \n\t" \
+ "movq 0x18(%5),%%r11 \n\t" \
+ "mulq %4 \n\t" \
+ "addq %%r10,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq 0x18(%2),%%r10 \n\t" \
+ "addq %3,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rax,0x10(%0) \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ \
+ "movq %%r11,%%rax \n\t" \
+ "movq 0x20(%5),%%r11 \n\t" \
+ "mulq %4 \n\t" \
+ "addq %%r10,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq 0x20(%2),%%r10 \n\t" \
+ "addq %3,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rax,0x18(%0) \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ \
+ "movq %%r11,%%rax \n\t" \
+ "movq 0x28(%5),%%r11 \n\t" \
+ "mulq %4 \n\t" \
+ "addq %%r10,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq 0x28(%2),%%r10 \n\t" \
+ "addq %3,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rax,0x20(%0) \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ \
+ "movq %%r11,%%rax \n\t" \
+ "movq 0x30(%5),%%r11 \n\t" \
+ "mulq %4 \n\t" \
+ "addq %%r10,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq 0x30(%2),%%r10 \n\t" \
+ "addq %3,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rax,0x28(%0) \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ \
+ "movq %%r11,%%rax \n\t" \
+ "movq 0x38(%5),%%r11 \n\t" \
+ "mulq %4 \n\t" \
+ "addq %%r10,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq 0x38(%2),%%r10 \n\t" \
+ "addq %3,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rax,0x30(%0) \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ \
+ "movq %%r11,%%rax \n\t" \
+ "mulq %4 \n\t" \
+ "addq %%r10,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "addq %3,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rax,0x38(%0) \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ \
+:"=r"(_c), "=r"(cy) \
+: "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
+: "%rax", "%rdx", "%r10", "%r11", "cc")
+
+
+#define PROPCARRY \
+asm( \
+ "addq %1,%0 \n\t" \
+ "setb %%al \n\t" \
+ "movzbq %%al,%1 \n\t" \
+:"=g"(_c[LO]), "=r"(cy) \
+:"0"(_c[LO]), "1"(cy) \
+: "%rax", "cc")
+
+/******************************************************************/
+#elif defined(TFM_SSE2)
+/* SSE2 code (assumes 32-bit fp_digits) */
+/* MMX register assignments (the SSE2 integer ops below operate on the
+ * 64-bit mm registers, not the xmm registers):
+ * mm0 *tmpm++, then mu * (*tmpm++)
+ * mm1 c[x], then mu
+ * mm2 mp
+ * mm3 cy
+ * mm4 _c[LO]
+ * (INNERMUL8 additionally uses mm5-mm7 as scratch)
+ */
+
+#define MONT_START \
+ asm("movd %0,%%mm2"::"g"(mp))
+
+#define MONT_FINI \
+ asm("emms")
+
+#define LOOP_START \
+asm( \
+"movd %0,%%mm1 \n\t" \
+"pxor %%mm3,%%mm3 \n\t" \
+"pmuludq %%mm2,%%mm1 \n\t" \
+:: "g"(c[x]))
+
+/* pmuludq on MMX registers does a 32x32->64 multiply. */
+#define INNERMUL \
+asm( \
+ "movd %1,%%mm4 \n\t" \
+ "movd %2,%%mm0 \n\t" \
+ "paddq %%mm4,%%mm3 \n\t" \
+ "pmuludq %%mm1,%%mm0 \n\t" \
+ "paddq %%mm0,%%mm3 \n\t" \
+ "movd %%mm3,%0 \n\t" \
+ "psrlq $32, %%mm3 \n\t" \
+:"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );
+
+#define INNERMUL8 \
+asm( \
+ "movd 0(%1),%%mm4 \n\t" \
+ "movd 0(%2),%%mm0 \n\t" \
+ "paddq %%mm4,%%mm3 \n\t" \
+ "pmuludq %%mm1,%%mm0 \n\t" \
+ "movd 4(%2),%%mm5 \n\t" \
+ "paddq %%mm0,%%mm3 \n\t" \
+ "movd 4(%1),%%mm6 \n\t" \
+ "movd %%mm3,0(%0) \n\t" \
+ "psrlq $32, %%mm3 \n\t" \
+\
+ "paddq %%mm6,%%mm3 \n\t" \
+ "pmuludq %%mm1,%%mm5 \n\t" \
+ "movd 8(%2),%%mm6 \n\t" \
+ "paddq %%mm5,%%mm3 \n\t" \
+ "movd 8(%1),%%mm7 \n\t" \
+ "movd %%mm3,4(%0) \n\t" \
+ "psrlq $32, %%mm3 \n\t" \
+\
+ "paddq %%mm7,%%mm3 \n\t" \
+ "pmuludq %%mm1,%%mm6 \n\t" \
+ "movd 12(%2),%%mm7 \n\t" \
+ "paddq %%mm6,%%mm3 \n\t" \
+ "movd 12(%1),%%mm5 \n\t" \
+ "movd %%mm3,8(%0) \n\t" \
+ "psrlq $32, %%mm3 \n\t" \
+\
+ "paddq %%mm5,%%mm3 \n\t" \
+ "pmuludq %%mm1,%%mm7 \n\t" \
+ "movd 16(%2),%%mm5 \n\t" \
+ "paddq %%mm7,%%mm3 \n\t" \
+ "movd 16(%1),%%mm6 \n\t" \
+ "movd %%mm3,12(%0) \n\t" \
+ "psrlq $32, %%mm3 \n\t" \
+\
+ "paddq %%mm6,%%mm3 \n\t" \
+ "pmuludq %%mm1,%%mm5 \n\t" \
+ "movd 20(%2),%%mm6 \n\t" \
+ "paddq %%mm5,%%mm3 \n\t" \
+ "movd 20(%1),%%mm7 \n\t" \
+ "movd %%mm3,16(%0) \n\t" \
+ "psrlq $32, %%mm3 \n\t" \
+\
+ "paddq %%mm7,%%mm3 \n\t" \
+ "pmuludq %%mm1,%%mm6 \n\t" \
+ "movd 24(%2),%%mm7 \n\t" \
+ "paddq %%mm6,%%mm3 \n\t" \
+ "movd 24(%1),%%mm5 \n\t" \
+ "movd %%mm3,20(%0) \n\t" \
+ "psrlq $32, %%mm3 \n\t" \
+\
+ "paddq %%mm5,%%mm3 \n\t" \
+ "pmuludq %%mm1,%%mm7 \n\t" \
+ "movd 28(%2),%%mm5 \n\t" \
+ "paddq %%mm7,%%mm3 \n\t" \
+ "movd 28(%1),%%mm6 \n\t" \
+ "movd %%mm3,24(%0) \n\t" \
+ "psrlq $32, %%mm3 \n\t" \
+\
+ "paddq %%mm6,%%mm3 \n\t" \
+ "pmuludq %%mm1,%%mm5 \n\t" \
+ "paddq %%mm5,%%mm3 \n\t" \
+ "movd %%mm3,28(%0) \n\t" \
+ "psrlq $32, %%mm3 \n\t" \
+:"=r"(_c) : "0"(_c), "g"(tmpm) );
+
+#define LOOP_END \
+asm( "movd %%mm3,%0 \n" :"=r"(cy))
+
+#define PROPCARRY \
+asm( \
+ "addl %1,%0 \n\t" \
+ "setb %%al \n\t" \
+ "movzbl %%al,%1 \n\t" \
+:"=g"(_c[LO]), "=r"(cy) \
+:"0"(_c[LO]), "1"(cy) \
+: "%eax", "cc")
+
+/******************************************************************/
+#elif defined(TFM_ARM)
+/* ARMv4 code */
+
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+ mu = c[x] * mp
+
+#define INNERMUL \
+asm( \
+ " LDR r0,%1 \n\t" \
+ " ADDS r0,r0,%0 \n\t" \
+ " MOVCS %0,#1 \n\t" \
+ " MOVCC %0,#0 \n\t" \
+ " UMLAL r0,%0,%3,%4 \n\t" \
+ " STR r0,%1 \n\t" \
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");
+
+#define PROPCARRY \
+asm( \
+ " LDR r0,%1 \n\t" \
+ " ADDS r0,r0,%0 \n\t" \
+ " STR r0,%1 \n\t" \
+ " MOVCS %0,#1 \n\t" \
+ " MOVCC %0,#0 \n\t" \
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");
+
+/******************************************************************/
+#elif defined(TFM_PPC32)
+
+/* PPC32 */
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+ mu = c[x] * mp
+
+#define INNERMUL \
+asm( \
+ " mullw 16,%3,%4 \n\t" \
+ " mulhwu 17,%3,%4 \n\t" \
+ " addc 16,16,%2 \n\t" \
+ " addze 17,17 \n\t" \
+ " addc %1,16,%5 \n\t" \
+ " addze %0,17 \n\t" \
+:"=r"(cy),"=r"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "cc"); ++tmpm;
+
+/* %0 (carry out) and %2 (carry in) are tied to one register by the
+ * "0"(cy) constraint, so the xor both zeroes %0 and clears %2 before
+ * the addze picks up the carry bit. */
+#define PROPCARRY \
+asm( \
+ " addc %1,%3,%2 \n\t" \
+ " xor %0,%2,%2 \n\t" \
+ " addze %0,%2 \n\t" \
+:"=r"(cy),"=r"(_c[0]):"0"(cy),"1"(_c[0]):"cc");
+
+/******************************************************************/
+#elif defined(TFM_PPC64)
+
+/* PPC64 */
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+ mu = c[x] * mp
+
+#define INNERMUL \
+asm( \
+ " mulld r16,%3,%4 \n\t" \
+ " mulhdu r17,%3,%4 \n\t" \
+ " addc r16,16,%0 \n\t" \
+ " addze r17,r17 \n\t" \
+ " ldx r18,0,%1 \n\t" \
+ " addc r16,r16,r18 \n\t" \
+ " addze %0,r17 \n\t" \
+ " sdx r16,0,%1 \n\t" \
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"r16", "r17", "r18","cc"); ++tmpm;
+
+#define PROPCARRY \
+asm( \
+ " ldx r16,0,%1 \n\t" \
+ " addc r16,r16,%0 \n\t" \
+ " sdx r16,0,%1 \n\t" \
+ " xor %0,%0,%0 \n\t" \
+ " addze %0,%0 \n\t" \
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","cc");
+
+/******************************************************************/
+#elif defined(TFM_AVR32)
+
+/* AVR32 */
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+ mu = c[x] * mp
+
+#define INNERMUL \
+asm( \
+ " ld.w r2,%1 \n\t" \
+ " add r2,%0 \n\t" \
+ " eor r3,r3 \n\t" \
+ " acr r3 \n\t" \
+ " macu.d r2,%3,%4 \n\t" \
+ " st.w %1,r2 \n\t" \
+ " mov %0,r3 \n\t" \
+:"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");
+
+#define PROPCARRY \
+asm( \
+ " ld.w r2,%1 \n\t" \
+ " add r2,%0 \n\t" \
+ " st.w %1,r2 \n\t" \
+ " eor %0,%0 \n\t" \
+ " acr %0 \n\t" \
+:"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc");
+
+/******************************************************************/
+#elif defined(TFM_MIPS)
+
+/* MIPS */
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+ mu = c[x] * mp
+
+#define INNERMUL \
+asm( \
+ " multu %3,%4 \n\t" \
+ " mflo $12 \n\t" \
+ " mfhi $13 \n\t" \
+ " addu $12,$12,%0 \n\t" \
+ " sltu $10,$12,%0 \n\t" \
+ " addu $13,$13,$10 \n\t" \
+ " lw $10,%1 \n\t" \
+ " addu $12,$12,$10 \n\t" \
+ " sltu $10,$12,$10 \n\t" \
+ " addu %0,$13,$10 \n\t" \
+ " sw $12,%1 \n\t" \
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"$10","$12","$13"); ++tmpm;
+
+#define PROPCARRY \
+asm( \
+ " lw $10,%1 \n\t" \
+ " addu $10,$10,%0 \n\t" \
+ " sw $10,%1 \n\t" \
+ " sltu %0,$10,%0 \n\t" \
+:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"$10");
+
+/******************************************************************/
+#else
+
+/* ISO C code */
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+ mu = c[x] * mp
+
+#define INNERMUL \
+ do { fp_word t; \
+ _c[0] = t = ((fp_word)_c[0] + (fp_word)cy) + \
+ (((fp_word)mu) * ((fp_word)*tmpm++)); \
+ cy = (t >> DIGIT_BIT); \
+ } while (0)
+
+#define PROPCARRY \
+ do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)
+
+#endif
+/******************************************************************/
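+
+/* Reference model of one INNERMUL step, for illustration only: the
+ * TFM_MONT_EXAMPLE guard is hypothetical and never defined by the build,
+ * so this never compiles into the library.  It restates the invariant
+ * every port above maintains: the low DIGIT_BIT bits of
+ * (digit + cy + mu*m_digit) stay in place, and the high bits become the
+ * carry into the next column. */
+#ifdef TFM_MONT_EXAMPLE
+static fp_digit innermul_step(fp_digit *digit, fp_digit cy,
+                              fp_digit mu, fp_digit m_digit)
+{
+   fp_word t = (fp_word)*digit + (fp_word)cy +
+               (fp_word)mu * (fp_word)m_digit;
+   *digit = (fp_digit)t;               /* low digit stays in place   */
+   return (fp_digit)(t >> DIGIT_BIT);  /* high part is the new carry */
+}
+#endif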
+
+
+#define LO 0
+
+#ifdef TFM_SMALL_MONT_SET
+#include "fp_mont_small.i"
+#endif
+
+/* computes x/R == x (mod N) via Montgomery Reduction */
+void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
+{
+ fp_digit c[FP_SIZE], *_c, *tmpm, mu;
+ int oldused, x, y, pa;
+
+ /* bail if too large */
+ if (m->used > (FP_SIZE/2)) {
+ return;
+ }
+
+#ifdef TFM_SMALL_MONT_SET
+ if (m->used <= 16) {
+ fp_montgomery_reduce_small(a, m, mp);
+ return;
+ }
+#endif
+
+#if defined(USE_MEMSET)
+ /* zero the whole working buffer up front */
+ memset(c, 0, sizeof c);
+#endif
+ pa = m->used;
+
+ /* copy the input */
+ oldused = a->used;
+ for (x = 0; x < oldused; x++) {
+ c[x] = a->dp[x];
+ }
+#if !defined(USE_MEMSET)
+ for (; x < 2*pa+1; x++) {
+ c[x] = 0;
+ }
+#endif
+ MONT_START;
+
+ for (x = 0; x < pa; x++) {
+ fp_digit cy = 0;
+ /* get Mu for this round */
+ LOOP_START;
+ _c = c + x;
+ tmpm = m->dp;
+ y = 0;
+ #if defined(INNERMUL8)
+ for (; y < (pa & ~7); y += 8) {
+ INNERMUL8;
+ _c += 8;
+ tmpm += 8;
+ }
+ #endif
+
+ for (; y < pa; y++) {
+ INNERMUL;
+ ++_c;
+ }
+ LOOP_END;
+ while (cy) {
+ PROPCARRY;
+ ++_c;
+ }
+ }
+
+ /* now copy out */
+ _c = c + pa;
+ tmpm = a->dp;
+ for (x = 0; x < pa+1; x++) {
+ *tmpm++ = *_c++;
+ }
+
+ for (; x < oldused; x++) {
+ *tmpm++ = 0;
+ }
+
+ MONT_FINI;
+
+ a->used = pa+1;
+ fp_clamp(a);
+
+ /* if A >= m then A = A - m */
+ if (fp_cmp_mag(a, m) != FP_LT) {
+ s_fp_sub(a, m, a);
+ }
+}
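+
+/* Usage sketch, for illustration only: TFM_MONT_EXAMPLE is a hypothetical
+ * guard that the build never defines.  It shows the companion TFM calls
+ * this reduction is normally paired with: fp_montgomery_setup() derives
+ * mp = -1/m mod b, and fp_montgomery_calc_normalization() computes
+ * R mod m for entering the Montgomery domain.  Return codes are ignored
+ * for brevity. */
+#ifdef TFM_MONT_EXAMPLE
+static void mont_reduce_example(fp_int *x, fp_int *m)
+{
+   fp_digit mp;
+   fp_int   R;
+
+   fp_init(&R);
+   fp_montgomery_setup(m, &mp);             /* mp = -1/m mod b       */
+   fp_montgomery_calc_normalization(&R, m); /* R = b^n mod m         */
+   fp_mulmod(x, &R, m, x);                  /* enter the domain: x*R */
+   /* ... Montgomery-domain arithmetic on x ... */
+   fp_montgomery_reduce(x, m, mp);          /* leave the domain: x/R */
+}
+#endif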