5 Can overlap h with f or g.
8 |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
9 |g| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
12 |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
16 Notes on implementation strategy:
18 Using schoolbook multiplication.
19 Karatsuba would save a little in some cost models.
21 Most multiplications by 2 and 19 are 32-bit precomputations;
22 cheaper than 64-bit postcomputations.
24 There is one remaining multiplication by 19 in the carry chain;
25 one *19 precomputation can be merged into this,
26 but the resulting data flow is considerably less clean.
28 There are 12 carries below.
29 10 of them are 2-way parallelizable and vectorizable.
30 Can get away with 11 carries, but then data flow is much deeper.
32 With tighter constraints on inputs can squeeze carries into int32.
35 void fe_mul(int32_t h[10], int32_t f[10], int32_t g[10]) {
56 int32_t g1_19 = 19 * g1; /* 1.4*2^29 */
57 int32_t g2_19 = 19 * g2; /* 1.4*2^30; still ok */
58 int32_t g3_19 = 19 * g3;
59 int32_t g4_19 = 19 * g4;
60 int32_t g5_19 = 19 * g5;
61 int32_t g6_19 = 19 * g6;
62 int32_t g7_19 = 19 * g7;
63 int32_t g8_19 = 19 * g8;
64 int32_t g9_19 = 19 * g9;
65 int32_t f1_2 = 2 * f1;
66 int32_t f3_2 = 2 * f3;
67 int32_t f5_2 = 2 * f5;
68 int32_t f7_2 = 2 * f7;
69 int32_t f9_2 = 2 * f9;
70 int64_t f0g0 = f0 * (int64_t) g0;
71 int64_t f0g1 = f0 * (int64_t) g1;
72 int64_t f0g2 = f0 * (int64_t) g2;
73 int64_t f0g3 = f0 * (int64_t) g3;
74 int64_t f0g4 = f0 * (int64_t) g4;
75 int64_t f0g5 = f0 * (int64_t) g5;
76 int64_t f0g6 = f0 * (int64_t) g6;
77 int64_t f0g7 = f0 * (int64_t) g7;
78 int64_t f0g8 = f0 * (int64_t) g8;
79 int64_t f0g9 = f0 * (int64_t) g9;
80 int64_t f1g0 = f1 * (int64_t) g0;
81 int64_t f1g1_2 = f1_2 * (int64_t) g1;
82 int64_t f1g2 = f1 * (int64_t) g2;
83 int64_t f1g3_2 = f1_2 * (int64_t) g3;
84 int64_t f1g4 = f1 * (int64_t) g4;
85 int64_t f1g5_2 = f1_2 * (int64_t) g5;
86 int64_t f1g6 = f1 * (int64_t) g6;
87 int64_t f1g7_2 = f1_2 * (int64_t) g7;
88 int64_t f1g8 = f1 * (int64_t) g8;
89 int64_t f1g9_38 = f1_2 * (int64_t) g9_19;
90 int64_t f2g0 = f2 * (int64_t) g0;
91 int64_t f2g1 = f2 * (int64_t) g1;
92 int64_t f2g2 = f2 * (int64_t) g2;
93 int64_t f2g3 = f2 * (int64_t) g3;
94 int64_t f2g4 = f2 * (int64_t) g4;
95 int64_t f2g5 = f2 * (int64_t) g5;
96 int64_t f2g6 = f2 * (int64_t) g6;
97 int64_t f2g7 = f2 * (int64_t) g7;
98 int64_t f2g8_19 = f2 * (int64_t) g8_19;
99 int64_t f2g9_19 = f2 * (int64_t) g9_19;
100 int64_t f3g0 = f3 * (int64_t) g0;
101 int64_t f3g1_2 = f3_2 * (int64_t) g1;
102 int64_t f3g2 = f3 * (int64_t) g2;
103 int64_t f3g3_2 = f3_2 * (int64_t) g3;
104 int64_t f3g4 = f3 * (int64_t) g4;
105 int64_t f3g5_2 = f3_2 * (int64_t) g5;
106 int64_t f3g6 = f3 * (int64_t) g6;
107 int64_t f3g7_38 = f3_2 * (int64_t) g7_19;
108 int64_t f3g8_19 = f3 * (int64_t) g8_19;
109 int64_t f3g9_38 = f3_2 * (int64_t) g9_19;
110 int64_t f4g0 = f4 * (int64_t) g0;
111 int64_t f4g1 = f4 * (int64_t) g1;
112 int64_t f4g2 = f4 * (int64_t) g2;
113 int64_t f4g3 = f4 * (int64_t) g3;
114 int64_t f4g4 = f4 * (int64_t) g4;
115 int64_t f4g5 = f4 * (int64_t) g5;
116 int64_t f4g6_19 = f4 * (int64_t) g6_19;
117 int64_t f4g7_19 = f4 * (int64_t) g7_19;
118 int64_t f4g8_19 = f4 * (int64_t) g8_19;
119 int64_t f4g9_19 = f4 * (int64_t) g9_19;
120 int64_t f5g0 = f5 * (int64_t) g0;
121 int64_t f5g1_2 = f5_2 * (int64_t) g1;
122 int64_t f5g2 = f5 * (int64_t) g2;
123 int64_t f5g3_2 = f5_2 * (int64_t) g3;
124 int64_t f5g4 = f5 * (int64_t) g4;
125 int64_t f5g5_38 = f5_2 * (int64_t) g5_19;
126 int64_t f5g6_19 = f5 * (int64_t) g6_19;
127 int64_t f5g7_38 = f5_2 * (int64_t) g7_19;
128 int64_t f5g8_19 = f5 * (int64_t) g8_19;
129 int64_t f5g9_38 = f5_2 * (int64_t) g9_19;
130 int64_t f6g0 = f6 * (int64_t) g0;
131 int64_t f6g1 = f6 * (int64_t) g1;
132 int64_t f6g2 = f6 * (int64_t) g2;
133 int64_t f6g3 = f6 * (int64_t) g3;
134 int64_t f6g4_19 = f6 * (int64_t) g4_19;
135 int64_t f6g5_19 = f6 * (int64_t) g5_19;
136 int64_t f6g6_19 = f6 * (int64_t) g6_19;
137 int64_t f6g7_19 = f6 * (int64_t) g7_19;
138 int64_t f6g8_19 = f6 * (int64_t) g8_19;
139 int64_t f6g9_19 = f6 * (int64_t) g9_19;
140 int64_t f7g0 = f7 * (int64_t) g0;
141 int64_t f7g1_2 = f7_2 * (int64_t) g1;
142 int64_t f7g2 = f7 * (int64_t) g2;
143 int64_t f7g3_38 = f7_2 * (int64_t) g3_19;
144 int64_t f7g4_19 = f7 * (int64_t) g4_19;
145 int64_t f7g5_38 = f7_2 * (int64_t) g5_19;
146 int64_t f7g6_19 = f7 * (int64_t) g6_19;
147 int64_t f7g7_38 = f7_2 * (int64_t) g7_19;
148 int64_t f7g8_19 = f7 * (int64_t) g8_19;
149 int64_t f7g9_38 = f7_2 * (int64_t) g9_19;
150 int64_t f8g0 = f8 * (int64_t) g0;
151 int64_t f8g1 = f8 * (int64_t) g1;
152 int64_t f8g2_19 = f8 * (int64_t) g2_19;
153 int64_t f8g3_19 = f8 * (int64_t) g3_19;
154 int64_t f8g4_19 = f8 * (int64_t) g4_19;
155 int64_t f8g5_19 = f8 * (int64_t) g5_19;
156 int64_t f8g6_19 = f8 * (int64_t) g6_19;
157 int64_t f8g7_19 = f8 * (int64_t) g7_19;
158 int64_t f8g8_19 = f8 * (int64_t) g8_19;
159 int64_t f8g9_19 = f8 * (int64_t) g9_19;
160 int64_t f9g0 = f9 * (int64_t) g0;
161 int64_t f9g1_38 = f9_2 * (int64_t) g1_19;
162 int64_t f9g2_19 = f9 * (int64_t) g2_19;
163 int64_t f9g3_38 = f9_2 * (int64_t) g3_19;
164 int64_t f9g4_19 = f9 * (int64_t) g4_19;
165 int64_t f9g5_38 = f9_2 * (int64_t) g5_19;
166 int64_t f9g6_19 = f9 * (int64_t) g6_19;
167 int64_t f9g7_38 = f9_2 * (int64_t) g7_19;
168 int64_t f9g8_19 = f9 * (int64_t) g8_19;
169 int64_t f9g9_38 = f9_2 * (int64_t) g9_19;
170 int64_t h0 = f0g0+f1g9_38+f2g8_19+f3g7_38+f4g6_19+f5g5_38+f6g4_19+f7g3_38+f8g2_19+f9g1_38;
171 int64_t h1 = f0g1+f1g0 +f2g9_19+f3g8_19+f4g7_19+f5g6_19+f6g5_19+f7g4_19+f8g3_19+f9g2_19;
172 int64_t h2 = f0g2+f1g1_2 +f2g0 +f3g9_38+f4g8_19+f5g7_38+f6g6_19+f7g5_38+f8g4_19+f9g3_38;
173 int64_t h3 = f0g3+f1g2 +f2g1 +f3g0 +f4g9_19+f5g8_19+f6g7_19+f7g6_19+f8g5_19+f9g4_19;
174 int64_t h4 = f0g4+f1g3_2 +f2g2 +f3g1_2 +f4g0 +f5g9_38+f6g8_19+f7g7_38+f8g6_19+f9g5_38;
175 int64_t h5 = f0g5+f1g4 +f2g3 +f3g2 +f4g1 +f5g0 +f6g9_19+f7g8_19+f8g7_19+f9g6_19;
176 int64_t h6 = f0g6+f1g5_2 +f2g4 +f3g3_2 +f4g2 +f5g1_2 +f6g0 +f7g9_38+f8g8_19+f9g7_38;
177 int64_t h7 = f0g7+f1g6 +f2g5 +f3g4 +f4g3 +f5g2 +f6g1 +f7g0 +f8g9_19+f9g8_19;
178 int64_t h8 = f0g8+f1g7_2 +f2g6 +f3g5_2 +f4g4 +f5g3_2 +f6g2 +f7g1_2 +f8g0 +f9g9_38;
179 int64_t h9 = f0g9+f1g8 +f2g7 +f3g6 +f4g5 +f5g4 +f6g3 +f7g2 +f8g1 +f9g0 ;
192 |h0| <= (1.1*1.1*2^52*(1+19+19+19+19)+1.1*1.1*2^50*(38+38+38+38+38))
193 i.e. |h0| <= 1.2*2^59; narrower ranges for h2, h4, h6, h8
194 |h1| <= (1.1*1.1*2^51*(1+1+19+19+19+19+19+19+19+19))
195 i.e. |h1| <= 1.5*2^58; narrower ranges for h3, h5, h7, h9
198 carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
199 carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
202 /* |h1| <= 1.51*2^58 */
203 /* |h5| <= 1.51*2^58 */
205 carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
206 carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
207 /* |h1| <= 2^24; from now on fits into int32 */
208 /* |h5| <= 2^24; from now on fits into int32 */
209 /* |h2| <= 1.21*2^59 */
210 /* |h6| <= 1.21*2^59 */
212 carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
213 carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
214 /* |h2| <= 2^25; from now on fits into int32 unchanged */
215 /* |h6| <= 2^25; from now on fits into int32 unchanged */
216 /* |h3| <= 1.51*2^58 */
217 /* |h7| <= 1.51*2^58 */
219 carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
220 carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
221 /* |h3| <= 2^24; from now on fits into int32 unchanged */
222 /* |h7| <= 2^24; from now on fits into int32 unchanged */
223 /* |h4| <= 1.52*2^33 */
224 /* |h8| <= 1.52*2^33 */
226 carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
227 carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
228 /* |h4| <= 2^25; from now on fits into int32 unchanged */
229 /* |h8| <= 2^25; from now on fits into int32 unchanged */
230 /* |h5| <= 1.01*2^24 */
231 /* |h9| <= 1.51*2^58 */
233 carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
234 /* |h9| <= 2^24; from now on fits into int32 unchanged */
235 /* |h0| <= 1.8*2^37 */
237 carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
238 /* |h0| <= 2^25; from now on fits into int32 unchanged */
239 /* |h1| <= 1.01*2^24 */