/* asm.c
 *
 * Copyright (C) 2006-2015 wolfSSL Inc.
 *
 * This file is part of wolfSSL. (formerly known as CyaSSL)
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 */

#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif

#include <wolfssl/wolfcrypt/settings.h>

/*
 * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca,
 * http://math.libtomcrypt.com
 */


/******************************************************************/
/* fp_montgomery_reduce.c asm or generic */

/* Each platform needs to query cpuid (leaf 7 here) to see whether the
 * BMI2 and ADX extensions used by the MULX code paths are supported.
 * Also, set up a macro for proper linkage w/o ABI conflicts.
 */

#if defined(HAVE_INTEL_MULX)
#ifndef _MSC_VER
    #define cpuid(reg, leaf, sub)\
            __asm__ __volatile__ ("cpuid":\
             "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
             "a" (leaf), "c"(sub));

    #define XASM_LINK(f) asm(f)
#else

    #include <intrin.h>
    #define cpuid(a,b,c) __cpuidex((int*)a,b,c)

    #define XASM_LINK(f)

#endif /* _MSC_VER */

#define EAX 0
#define EBX 1
#define ECX 2
#define EDX 3

#define CPUID_AVX1   0x1
#define CPUID_AVX2   0x2
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define CPUID_BMI2   0x10   /* MULX, RORX */
#define CPUID_ADX    0x20   /* ADCX, ADOX */

#define IS_INTEL_AVX1       (cpuid_flags&CPUID_AVX1)
#define IS_INTEL_AVX2       (cpuid_flags&CPUID_AVX2)
#define IS_INTEL_BMI2       (cpuid_flags&CPUID_BMI2)
#define IS_INTEL_ADX        (cpuid_flags&CPUID_ADX)
#define IS_INTEL_RDRAND     (cpuid_flags&CPUID_RDRAND)
#define IS_INTEL_RDSEED     (cpuid_flags&CPUID_RDSEED)
#define SET_FLAGS

static word32 cpuid_check = 0 ;
static word32 cpuid_flags = 0 ;

static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
    int got_intel_cpu=0;
    unsigned int reg[5];

    reg[4] = '\0' ;
    cpuid(reg, 0, 0);
    if(memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
                memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
                memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
        got_intel_cpu = 1;
    }
    if (got_intel_cpu) {
        cpuid(reg, leaf, sub);
        return((reg[num]>>bit)&0x1) ;
    }
    return 0 ;
}

INLINE static int set_cpuid_flags(void) {
    if(cpuid_check == 0) {
        if(cpuid_flag(7, 0, EBX, 8)){  cpuid_flags |= CPUID_BMI2 ; }
        if(cpuid_flag(7, 0, EBX,19)){  cpuid_flags |= CPUID_ADX  ; }
        cpuid_check = 1 ;
        return 0 ;
    }
    return 1 ;
}

#define RETURN return
#define IF_HAVE_INTEL_MULX(func, ret)    \
   if(cpuid_check==0)set_cpuid_flags() ; \
   if(IS_INTEL_BMI2 && IS_INTEL_ADX){  func;  ret ;  }

#else
    #define IF_HAVE_INTEL_MULX(func, ret)
#endif
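
/* Illustrative sketch (not part of the build): how the detection above is
 * typically consumed by a caller.  fp_mul_comba_mulx() is a placeholder
 * name for a MULX-accelerated routine; the real dispatch sites live in the
 * bignum code, not in this file. */
#if 0
static void fp_mul_dispatch_sketch(fp_int* A, fp_int* B, fp_int* C)
{
    /* Runs the fast path and returns only if CPUID leaf 7 reported both
     * BMI2 (EBX bit 8) and ADX (EBX bit 19), as probed by set_cpuid_flags. */
    IF_HAVE_INTEL_MULX(fp_mul_comba_mulx(A, B, C), return);
    fp_mul_comba(A, B, C);  /* generic fallback */
}
#endif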

#if defined(TFM_X86) && !defined(TFM_SSE2)
/* x86-32 code */

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL                                          \
__asm__(                                                      \
   "movl %5,%%eax \n\t"                                   \
   "mull %4       \n\t"                                   \
   "addl %1,%%eax \n\t"                                   \
   "adcl $0,%%edx \n\t"                                   \
   "addl %%eax,%0 \n\t"                                   \
   "adcl $0,%%edx \n\t"                                   \
   "movl %%edx,%1 \n\t"                                   \
:"=g"(_c[LO]), "=r"(cy)                                   \
:"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++)              \
: "%eax", "%edx", "cc")

#define PROPCARRY                           \
__asm__(                                        \
   "addl   %1,%0    \n\t"                   \
   "setb   %%al     \n\t"                   \
   "movzbl %%al,%1 \n\t"                    \
:"=g"(_c[LO]), "=r"(cy)                     \
:"0"(_c[LO]), "1"(cy)                       \
: "%eax", "cc")

/******************************************************************/
#elif defined(TFM_X86_64)
/* x86-64 code */

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp;

#define INNERMUL                                          \
__asm__(                                                      \
   "movq %5,%%rax \n\t"                                   \
   "mulq %4       \n\t"                                   \
   "addq %1,%%rax \n\t"                                   \
   "adcq $0,%%rdx \n\t"                                   \
   "addq %%rax,%0 \n\t"                                   \
   "adcq $0,%%rdx \n\t"                                   \
   "movq %%rdx,%1 \n\t"                                   \
:"=g"(_c[LO]), "=r"(cy)                                   \
:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)              \
: "%rax", "%rdx", "cc")

#if defined(HAVE_INTEL_MULX)
#define MULX_INIT(a0, c0, cy)\
    __asm__ volatile(                                     \
             "xorq  %%r10, %%r10\n\t"                     \
             "movq  %1,%%rdx\n\t"                         \
             "addq  %2, %0\n\t"       /* c0+=cy; Set CF, OF */ \
             "adoxq %%r10, %%r10\n\t" /* Reset   OF */    \
             :"+m"(c0):"r"(a0),"r"(cy):"%r8","%r9", "%r10","%r11","%r12","%rdx") ;

#define MULX_INNERMUL_R1(c0, c1, pre, rdx)\
   {                                                      \
    __asm__  volatile (                                   \
         "movq  %3, %%rdx\n\t"                            \
         "mulx  %%r11,%%r9, %%r8 \n\t"                    \
         "movq  %2, %%r12\n\t"                            \
         "adoxq  %%r9,%0     \n\t"                        \
         "adcxq  %%r8,%1     \n\t"                        \
         :"+r"(c0),"+r"(c1):"m"(pre),"r"(rdx):"%r8","%r9", "%r10", "%r11","%r12","%rdx"    \
    ); }


#define MULX_INNERMUL_R2(c0, c1, pre, rdx)\
   {                                                      \
    __asm__  volatile (                                   \
         "movq  %3, %%rdx\n\t"                            \
         "mulx  %%r12,%%r9, %%r8 \n\t"                    \
         "movq  %2, %%r11\n\t"                            \
         "adoxq  %%r9,%0     \n\t"                        \
         "adcxq  %%r8,%1     \n\t"                        \
         :"+r"(c0),"+r"(c1):"m"(pre),"r"(rdx):"%r8","%r9", "%r10", "%r11","%r12","%rdx"    \
    ); }

#define MULX_LOAD_R1(val)\
    __asm__  volatile (                                   \
        "movq %0, %%r11\n\t"\
        ::"m"(val):"%r8","%r9", "%r10", "%r11","%r12","%rdx"\
) ;

#define MULX_INNERMUL_LAST(c0, c1, rdx)\
   {                                                      \
    __asm__  volatile (                                   \
         "movq   %2, %%rdx\n\t"                           \
         "mulx   %%r12,%%r9, %%r8 \n\t"                   \
         "movq   $0, %%r10      \n\t"                     \
         "adoxq  %%r10, %%r9   \n\t"                      \
         "adcq   $0,%%r8       \n\t"                      \
         "addq   %%r9,%0       \n\t"                      \
         "adcq   $0,%%r8       \n\t"                      \
         "movq   %%r8,%1       \n\t"                      \
         :"+m"(c0),"=m"(c1):"r"(rdx):"%r8","%r9","%r10", "%r11", "%r12","%rdx"\
    ); }

#define MULX_INNERMUL8(x,y,z,cy)\
{       word64 rdx = y ;\
        MULX_LOAD_R1(x[0]) ;\
        MULX_INIT(y, _c0, cy) ; /* rdx=y; z0+=cy; */ \
        MULX_INNERMUL_R1(_c0, _c1, x[1], rdx) ;\
        MULX_INNERMUL_R2(_c1, _c2, x[2], rdx) ;\
        MULX_INNERMUL_R1(_c2, _c3, x[3], rdx) ;\
        MULX_INNERMUL_R2(_c3, _c4, x[4], rdx) ;\
        MULX_INNERMUL_R1(_c4, _c5, x[5], rdx) ;\
        MULX_INNERMUL_R2(_c5, _c6, x[6], rdx) ;\
        MULX_INNERMUL_R1(_c6, _c7, x[7], rdx) ;\
        MULX_INNERMUL_LAST(_c7, cy, rdx) ;\
}
#define INNERMUL8_MULX \
{\
    MULX_INNERMUL8(tmpm, mu, _c, cy);\
}
#endif
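
/* Illustrative sketch (not part of the build): what the mulx step in the
 * macros above computes.  mulx is a flag-preserving 64x64->128 multiply,
 * which is what lets adcxq (CF chain) and adoxq (OF chain) run two
 * independent carry chains through the same unrolled loop.  In portable C
 * terms, assuming a compiler that provides unsigned __int128: */
#if 0
static void mulx_sketch(word64 a, word64 b, word64* lo, word64* hi)
{
    unsigned __int128 p = (unsigned __int128)a * b;
    *lo = (word64)p;         /* low half: %%r9 in the macros above  */
    *hi = (word64)(p >> 64); /* high half: %%r8 in the macros above */
}
#endif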

#define INNERMUL8 \
 __asm__(                  \
 "movq 0(%5),%%rax    \n\t"  \
 "movq 0(%2),%%r10    \n\t"  \
 "movq 0x8(%5),%%r11  \n\t"  \
 "mulq %4             \n\t"  \
 "addq %%r10,%%rax    \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq 0x8(%2),%%r10  \n\t"  \
 "addq %3,%%rax       \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq %%rax,0(%0)    \n\t"  \
 "movq %%rdx,%1       \n\t"  \
 \
 "movq %%r11,%%rax    \n\t"  \
 "movq 0x10(%5),%%r11 \n\t"  \
 "mulq %4             \n\t"  \
 "addq %%r10,%%rax    \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq 0x10(%2),%%r10 \n\t"  \
 "addq %3,%%rax       \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq %%rax,0x8(%0)  \n\t"  \
 "movq %%rdx,%1       \n\t"  \
 \
 "movq %%r11,%%rax    \n\t"  \
 "movq 0x18(%5),%%r11 \n\t"  \
 "mulq %4             \n\t"  \
 "addq %%r10,%%rax    \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq 0x18(%2),%%r10 \n\t"  \
 "addq %3,%%rax       \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq %%rax,0x10(%0) \n\t"  \
 "movq %%rdx,%1       \n\t"  \
 \
 "movq %%r11,%%rax    \n\t"  \
 "movq 0x20(%5),%%r11 \n\t"  \
 "mulq %4             \n\t"  \
 "addq %%r10,%%rax    \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq 0x20(%2),%%r10 \n\t"  \
 "addq %3,%%rax       \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq %%rax,0x18(%0) \n\t"  \
 "movq %%rdx,%1       \n\t"  \
 \
 "movq %%r11,%%rax    \n\t"  \
 "movq 0x28(%5),%%r11 \n\t"  \
 "mulq %4             \n\t"  \
 "addq %%r10,%%rax    \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq 0x28(%2),%%r10 \n\t"  \
 "addq %3,%%rax       \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq %%rax,0x20(%0) \n\t"  \
 "movq %%rdx,%1       \n\t"  \
 \
 "movq %%r11,%%rax    \n\t"  \
 "movq 0x30(%5),%%r11 \n\t"  \
 "mulq %4             \n\t"  \
 "addq %%r10,%%rax    \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq 0x30(%2),%%r10 \n\t"  \
 "addq %3,%%rax       \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq %%rax,0x28(%0) \n\t"  \
 "movq %%rdx,%1       \n\t"  \
 \
 "movq %%r11,%%rax    \n\t"  \
 "movq 0x38(%5),%%r11 \n\t"  \
 "mulq %4             \n\t"  \
 "addq %%r10,%%rax    \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq 0x38(%2),%%r10 \n\t"  \
 "addq %3,%%rax       \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq %%rax,0x30(%0) \n\t"  \
 "movq %%rdx,%1       \n\t"  \
 \
 "movq %%r11,%%rax    \n\t"  \
 "mulq %4             \n\t"  \
 "addq %%r10,%%rax    \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "addq %3,%%rax       \n\t"  \
 "adcq $0,%%rdx       \n\t"  \
 "movq %%rax,0x38(%0) \n\t"  \
 "movq %%rdx,%1       \n\t"  \
 \
:"=r"(_c), "=r"(cy)                    \
: "0"(_c),  "1"(cy), "g"(mu), "r"(tmpm)\
: "%rax", "%rdx", "%r10", "%r11", "cc")

#define PROPCARRY                           \
__asm__(                                        \
   "addq   %1,%0    \n\t"                   \
   "setb   %%al     \n\t"                   \
   "movzbq %%al,%1 \n\t"                    \
:"=g"(_c[LO]), "=r"(cy)                     \
:"0"(_c[LO]), "1"(cy)                       \
: "%rax", "cc")

/******************************************************************/
#elif defined(TFM_SSE2)
/* SSE2 code (assumes 32-bit fp_digits) */
/* XMM register assignments:
 * xmm0  *tmpm++, then Mu * (*tmpm++)
 * xmm1  c[x], then Mu
 * xmm2  mp
 * xmm3  cy
 * xmm4  _c[LO]
 */

#define MONT_START \
   __asm__("movd %0,%%mm2"::"g"(mp))

#define MONT_FINI \
   __asm__("emms")

#define LOOP_START          \
__asm__(                        \
"movd %0,%%mm1        \n\t" \
"pxor %%mm3,%%mm3     \n\t" \
"pmuludq %%mm2,%%mm1  \n\t" \
:: "g"(c[x]))

/* pmuludq on mmx registers does a 32x32->64 multiply. */
#define INNERMUL               \
__asm__(                           \
   "movd %1,%%mm4        \n\t" \
   "movd %2,%%mm0        \n\t" \
   "paddq %%mm4,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm0  \n\t" \
   "paddq %%mm0,%%mm3    \n\t" \
   "movd %%mm3,%0        \n\t" \
   "psrlq $32, %%mm3     \n\t" \
:"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );
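
/* Illustrative sketch (not part of the build): the 32x32->64 product that
 * pmuludq computes for the macro above, in portable C (word32/word64 come
 * from wolfCrypt's types.h): */
#if 0
static word64 pmuludq_sketch(word32 a, word32 b)
{
    return (word64)a * (word64)b;  /* full 64-bit product, no truncation */
}
#endif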

#define INNERMUL8 \
__asm__(                           \
   "movd 0(%1),%%mm4     \n\t" \
   "movd 0(%2),%%mm0     \n\t" \
   "paddq %%mm4,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm0  \n\t" \
   "movd 4(%2),%%mm5     \n\t" \
   "paddq %%mm0,%%mm3    \n\t" \
   "movd 4(%1),%%mm6     \n\t" \
   "movd %%mm3,0(%0)     \n\t" \
   "psrlq $32, %%mm3     \n\t" \
\
   "paddq %%mm6,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm5  \n\t" \
   "movd 8(%2),%%mm6     \n\t" \
   "paddq %%mm5,%%mm3    \n\t" \
   "movd 8(%1),%%mm7     \n\t" \
   "movd %%mm3,4(%0)     \n\t" \
   "psrlq $32, %%mm3     \n\t" \
\
   "paddq %%mm7,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm6  \n\t" \
   "movd 12(%2),%%mm7    \n\t" \
   "paddq %%mm6,%%mm3    \n\t" \
   "movd 12(%1),%%mm5    \n\t" \
   "movd %%mm3,8(%0)     \n\t" \
   "psrlq $32, %%mm3     \n\t" \
\
   "paddq %%mm5,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm7  \n\t" \
   "movd 16(%2),%%mm5    \n\t" \
   "paddq %%mm7,%%mm3    \n\t" \
   "movd 16(%1),%%mm6    \n\t" \
   "movd %%mm3,12(%0)    \n\t" \
   "psrlq $32, %%mm3     \n\t" \
\
   "paddq %%mm6,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm5  \n\t" \
   "movd 20(%2),%%mm6    \n\t" \
   "paddq %%mm5,%%mm3    \n\t" \
   "movd 20(%1),%%mm7    \n\t" \
   "movd %%mm3,16(%0)    \n\t" \
   "psrlq $32, %%mm3     \n\t" \
\
   "paddq %%mm7,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm6  \n\t" \
   "movd 24(%2),%%mm7    \n\t" \
   "paddq %%mm6,%%mm3    \n\t" \
   "movd 24(%1),%%mm5    \n\t" \
   "movd %%mm3,20(%0)    \n\t" \
   "psrlq $32, %%mm3     \n\t" \
\
   "paddq %%mm5,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm7  \n\t" \
   "movd 28(%2),%%mm5    \n\t" \
   "paddq %%mm7,%%mm3    \n\t" \
   "movd 28(%1),%%mm6    \n\t" \
   "movd %%mm3,24(%0)    \n\t" \
   "psrlq $32, %%mm3     \n\t" \
\
   "paddq %%mm6,%%mm3    \n\t" \
   "pmuludq %%mm1,%%mm5  \n\t" \
   "paddq %%mm5,%%mm3    \n\t" \
   "movd %%mm3,28(%0)    \n\t" \
   "psrlq $32, %%mm3     \n\t" \
:"=r"(_c) : "0"(_c), "r"(tmpm) );

/* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack
   pointer */

#define LOOP_END \
__asm__( "movd %%mm3,%0  \n" :"=r"(cy))

#define PROPCARRY                           \
__asm__(                                        \
   "addl   %1,%0    \n\t"                   \
   "setb   %%al     \n\t"                   \
   "movzbl %%al,%1 \n\t"                    \
:"=g"(_c[LO]), "=r"(cy)                     \
:"0"(_c[LO]), "1"(cy)                       \
: "%eax", "cc")

/******************************************************************/
#elif defined(TFM_ARM)
   /* ARMv4 code */

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp


#ifdef __thumb__

#define INNERMUL                    \
__asm__(                                \
    " LDR    r0,%1            \n\t" \
    " ADDS   r0,r0,%0         \n\t" \
    " ITE    CS               \n\t" \
    " MOVCS  %0,#1            \n\t" \
    " MOVCC  %0,#0            \n\t" \
    " UMLAL  r0,%0,%3,%4      \n\t" \
    " STR    r0,%1            \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0]):"r0","cc");

#define PROPCARRY                  \
__asm__(                               \
    " LDR   r0,%1            \n\t" \
    " ADDS  r0,r0,%0         \n\t" \
    " STR   r0,%1            \n\t" \
    " ITE   CS               \n\t" \
    " MOVCS %0,#1            \n\t" \
    " MOVCC %0,#0            \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"m"(_c[0]):"r0","cc");


/* TAO thumb mode uses ite (if then else) to detect carry directly
 * fixed unmatched constraint warning by changing 1 to m  */

#else  /* __thumb__ */

#define INNERMUL                    \
__asm__(                                \
    " LDR    r0,%1            \n\t" \
    " ADDS   r0,r0,%0         \n\t" \
    " MOVCS  %0,#1            \n\t" \
    " MOVCC  %0,#0            \n\t" \
    " UMLAL  r0,%0,%3,%4      \n\t" \
    " STR    r0,%1            \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");

#define PROPCARRY                  \
__asm__(                               \
    " LDR   r0,%1            \n\t" \
    " ADDS  r0,r0,%0         \n\t" \
    " STR   r0,%1            \n\t" \
    " MOVCS %0,#1            \n\t" \
    " MOVCC %0,#0            \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");

#endif /* __thumb__ */

#elif defined(TFM_PPC32)

/* PPC32 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL                     \
__asm__(                                 \
   " mullw    16,%3,%4       \n\t"   \
   " mulhwu   17,%3,%4       \n\t"   \
   " addc     16,16,%0       \n\t"   \
   " addze    17,17          \n\t"   \
   " lwz      18,%1          \n\t"   \
   " addc     16,16,18       \n\t"   \
   " addze    %0,17          \n\t"   \
   " stw      16,%1          \n\t"   \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;

#define PROPCARRY                    \
__asm__(                                 \
   " lwz      16,%1         \n\t"    \
   " addc     16,16,%0      \n\t"    \
   " stw      16,%1         \n\t"    \
   " xor      %0,%0,%0      \n\t"    \
   " addze    %0,%0         \n\t"    \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");

#elif defined(TFM_PPC64)

/* PPC64 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL                     \
__asm__(                                 \
   " mulld    16,%3,%4       \n\t"   \
   " mulhdu   17,%3,%4       \n\t"   \
   " addc     16,16,%0       \n\t"   \
   " addze    17,17          \n\t"   \
   " ldx      18,0,%1        \n\t"   \
   " addc     16,16,18       \n\t"   \
   " addze    %0,17          \n\t"   \
   " stdx     16,0,%1        \n\t"   \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;

#define PROPCARRY                    \
__asm__(                                 \
   " ldx      16,0,%1       \n\t"    \
   " addc     16,16,%0      \n\t"    \
   " stdx     16,0,%1       \n\t"    \
   " xor      %0,%0,%0      \n\t"    \
   " addze    %0,%0         \n\t"    \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");

/******************************************************************/

#elif defined(TFM_AVR32)

/* AVR32 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL                    \
__asm__(                                \
    " ld.w   r2,%1            \n\t" \
    " add    r2,%0            \n\t" \
    " eor    r3,r3            \n\t" \
    " acr    r3               \n\t" \
    " macu.d r2,%3,%4         \n\t" \
    " st.w   %1,r2            \n\t" \
    " mov    %0,r3            \n\t" \
:"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");

#define PROPCARRY                    \
__asm__(                                 \
   " ld.w     r2,%1         \n\t"    \
   " add      r2,%0         \n\t"    \
   " st.w     %1,r2         \n\t"    \
   " eor      %0,%0         \n\t"    \
   " acr      %0            \n\t"    \
:"=r"(cy),"=r"(_c):"0"(cy),"1"(_c):"r2","cc");

#else

/* ISO C code */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL                                      \
   do { fp_word t;                                    \
   t  = ((fp_word)_c[0] + (fp_word)cy) +              \
                (((fp_word)mu) * ((fp_word)*tmpm++)); \
   _c[0] = (fp_digit)t;                               \
   cy = (fp_digit)(t >> DIGIT_BIT);                   \
   } while (0)

#define PROPCARRY \
   do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)

#endif
/******************************************************************/


#define LO  0
/* end fp_montgomery_reduce.c asm */
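
/* Illustrative sketch (not part of the build): the loop shape in
 * fp_montgomery_reduce() that the macros above plug into.  Names follow the
 * generic C implementation (c = scratch digits, m = modulus, pa = number of
 * output digits); treat this as a simplified outline, not the exact
 * upstream code. */
#if 0
   MONT_START;
   for (x = 0; x < pa; x++) {
       fp_digit cy = 0;
       LOOP_START;               /* mu = c[x] * mp, the per-digit factor */
       _c   = c + x;
       tmpm = m->dp;
       for (y = 0; y < m->used; y++) {
           INNERMUL;             /* _c[0] += mu * *tmpm++, carry into cy */
           ++_c;
       }
       LOOP_END;                 /* (SSE2 variant moves cy out of mm3)   */
       while (cy) {
           PROPCARRY;            /* ripple the leftover carry upward     */
           ++_c;
       }
   }
   MONT_FINI;
#endif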


/* start fp_sqr_comba.c asm */
#if defined(TFM_X86)

/* x86-32 optimized */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

#define SQRADD(i, j)                                      \
__asm__(                                            \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %%eax        \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc");

#define SQRADD2(i, j)                                     \
__asm__(                                            \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx", "cc");

#define SQRADDSC(i, j)                                    \
__asm__(                                                     \
     "movl  %3,%%eax     \n\t"                            \
     "mull  %4           \n\t"                            \
     "movl  %%eax,%0     \n\t"                            \
     "movl  %%edx,%1     \n\t"                            \
     "xorl  %2,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","cc");

/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */

#define SQRADDAC(i, j)                                    \
__asm__(                                                     \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");

#define SQRADDDB                                          \
__asm__(                                                     \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");

#elif defined(TFM_X86_64)
/* x86-64 optimized */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

#define SQRADD(i, j)                                      \
__asm__(                                                     \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %%rax        \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc");

#define SQRADD2(i, j)                                     \
__asm__(                                                     \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");

#define SQRADDSC(i, j)                                    \
__asm__(                                                     \
     "movq  %3,%%rax     \n\t"                            \
     "mulq  %4           \n\t"                            \
     "movq  %%rax,%0     \n\t"                            \
     "movq  %%rdx,%1     \n\t"                            \
     "xorq  %2,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc");

/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */

#define SQRADDAC(i, j)                                    \
__asm__(                                                     \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");

#define SQRADDDB                                          \
__asm__(                                                     \
     "addq %6,%0         \n\t"                            \
     "adcq %7,%1         \n\t"                            \
     "adcq %8,%2         \n\t"                            \
     "addq %6,%0         \n\t"                            \
     "adcq %7,%1         \n\t"                            \
     "adcq %8,%2         \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");

#elif defined(TFM_SSE2)

/* SSE2 Optimized */
#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI \
   __asm__("emms");

#define SQRADD(i, j)                                      \
__asm__(                                            \
     "movd  %6,%%mm0     \n\t"                            \
     "pmuludq %%mm0,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "adcl  %%eax,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc");

#define SQRADD2(i, j)                                     \
__asm__(                                            \
     "movd  %6,%%mm0     \n\t"                            \
     "movd  %7,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "movd  %%mm0,%%edx  \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");

#define SQRADDSC(i, j)                                    \
__asm__(                                            \
     "movd  %3,%%mm0     \n\t"                            \
     "movd  %4,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
     "movd  %%mm0,%0     \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "movd  %%mm0,%1     \n\t"                            \
     "xorl  %2,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j));

/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */

#define SQRADDAC(i, j)                                    \
__asm__(                                            \
     "movd  %6,%%mm0     \n\t"                            \
     "movd  %7,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "movd  %%mm0,%%edx  \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j)  :"%eax","%edx","cc");

#define SQRADDDB                                          \
__asm__(                                                     \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");

#elif defined(TFM_ARM)

/* ARM code */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j)                                             \
__asm__(                                                             \
"  UMULL  r0,r1,%6,%6              \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");

/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j)                                            \
__asm__(                                                             \
"  UMULL  r0,r1,%6,%7              \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");

#define SQRADDSC(i, j)                                           \
__asm__(                                                             \
"  UMULL  %0,%1,%3,%4              \n\t"                         \
"  SUB    %2,%2,%2                 \n\t"                         \
:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "r"(i), "r"(j) : "cc");

/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */

#define SQRADDAC(i, j)                                           \
__asm__(                                                             \
"  UMULL  r0,r1,%6,%7              \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");

#define SQRADDDB                                                 \
__asm__(                                                             \
"  ADDS  %0,%0,%3                     \n\t"                      \
"  ADCS  %1,%1,%4                     \n\t"                      \
"  ADC   %2,%2,%5                     \n\t"                      \
"  ADDS  %0,%0,%3                     \n\t"                      \
"  ADCS  %1,%1,%4                     \n\t"                      \
"  ADC   %2,%2,%5                     \n\t"                      \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");

#elif defined(TFM_PPC32)

/* PPC32 */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j)             \
__asm__(                             \
   " mullw  16,%6,%6       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhwu 16,%6,%6       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");

/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j)            \
__asm__(                             \
   " mullw  16,%6,%7       \n\t" \
   " mulhwu 17,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " adde   %1,%1,17       \n\t" \
   " addze  %2,%2          \n\t" \
   " addc   %0,%0,16       \n\t" \
   " adde   %1,%1,17       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");

#define SQRADDSC(i, j)            \
__asm__(                              \
   " mullw  %0,%6,%7        \n\t" \
   " mulhwu %1,%6,%7        \n\t" \
   " xor    %2,%2,%2        \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");

#define SQRADDAC(i, j)           \
__asm__(                             \
   " mullw  16,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhwu 16,%6,%7       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");

#define SQRADDDB                  \
__asm__(                              \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");

#elif defined(TFM_PPC64)
/* PPC64 */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j)             \
__asm__(                             \
   " mulld  16,%6,%6       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhdu 16,%6,%6       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");

/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j)            \
__asm__(                             \
   " mulld  16,%6,%7       \n\t" \
   " mulhdu 17,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " adde   %1,%1,17       \n\t" \
   " addze  %2,%2          \n\t" \
   " addc   %0,%0,16       \n\t" \
   " adde   %1,%1,17       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");

#define SQRADDSC(i, j)            \
__asm__(                              \
   " mulld  %0,%6,%7        \n\t" \
   " mulhdu %1,%6,%7        \n\t" \
   " xor    %2,%2,%2        \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");

#define SQRADDAC(i, j)           \
__asm__(                             \
   " mulld  16,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhdu 16,%6,%7       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");

#define SQRADDDB                  \
__asm__(                              \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");


#elif defined(TFM_AVR32)

/* AVR32 */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j)             \
__asm__(                             \
   " mulu.d r2,%6,%6       \n\t" \
   " add    %0,%0,r2       \n\t" \
   " adc    %1,%1,r3       \n\t" \
   " acr    %2             \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3");

/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j)            \
__asm__(                             \
   " mulu.d r2,%6,%7       \n\t" \
   " add    %0,%0,r2       \n\t" \
   " adc    %1,%1,r3       \n\t" \
   " acr    %2             \n\t" \
   " add    %0,%0,r2       \n\t" \
   " adc    %1,%1,r3       \n\t" \
   " acr    %2             \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3");

#define SQRADDSC(i, j)            \
__asm__(                              \
   " mulu.d r2,%6,%7        \n\t" \
   " mov    %0,r2           \n\t" \
   " mov    %1,r3           \n\t" \
   " eor    %2,%2           \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3");

#define SQRADDAC(i, j)           \
__asm__(                             \
   " mulu.d r2,%6,%7       \n\t" \
   " add    %0,%0,r2       \n\t" \
   " adc    %1,%1,r3       \n\t" \
   " acr    %2             \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3");

#define SQRADDDB                  \
__asm__(                              \
   " add    %0,%0,%3        \n\t" \
   " adc    %1,%1,%4        \n\t" \
   " adc    %2,%2,%5        \n\t" \
   " add    %0,%0,%3        \n\t" \
   " adc    %1,%1,%4        \n\t" \
   " adc    %2,%2,%5        \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");


#else

#define TFM_ISO

/* ISO C portable code */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j)                                 \
   do { fp_word t;                                   \
   t = c0 + ((fp_word)i) * ((fp_word)j);  c0 = (fp_digit)t;    \
   t = c1 + (t >> DIGIT_BIT);             c1 = (fp_digit)t;    \
                                          c2 +=(fp_digit) (t >> DIGIT_BIT); \
   } while (0);


/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j)                                                 \
   do { fp_word t;                                                    \
   t  = ((fp_word)i) * ((fp_word)j);                                  \
   tt = (fp_word)c0 + t;                 c0 = (fp_digit)tt;           \
   tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt;           \
                                         c2 +=(fp_digit)( tt >> DIGIT_BIT);    \
   tt = (fp_word)c0 + t;                 c0 = (fp_digit)tt;                    \
   tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt;            \
                                         c2 +=(fp_digit) (tt >> DIGIT_BIT);    \
   } while (0);

#define SQRADDSC(i, j)                                                         \
   do { fp_word t;                                                             \
      t =  ((fp_word)i) * ((fp_word)j);                                        \
      sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0;                      \
   } while (0);

#define SQRADDAC(i, j)                                                         \
   do { fp_word t;                                                             \
   t = sc0 + ((fp_word)i) * ((fp_word)j);  sc0 =  (fp_digit)t;                 \
   t = sc1 + (t >> DIGIT_BIT);             sc1 =  (fp_digit)t;                 \
                                           sc2 += (fp_digit)(t >> DIGIT_BIT);  \
   } while (0);

#define SQRADDDB                                                               \
   do { fp_word t;                                                             \
   t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = (fp_digit)t;                 \
   t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT);                \
                                             c1 = (fp_digit)t;                 \
   c2 = c2 + (fp_digit)(((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT));   \
   } while (0);

#endif
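
/* Illustrative sketch (not part of the build): how the generated
 * fp_sqr_comba_N() bodies (the .i files included below) compose the macros
 * above.  For a 2-digit input a producing a 4-digit result b: */
#if 0
   COMBA_START;
   CLEAR_CARRY;
   SQRADD(a[0], a[0]);        /* column 0: a0^2                      */
   COMBA_STORE(b[0]);
   CARRY_FORWARD;
   SQRADD2(a[0], a[1]);       /* column 1: 2*a0*a1 (doubled term)    */
   COMBA_STORE(b[1]);
   CARRY_FORWARD;
   SQRADD(a[1], a[1]);        /* column 2: a1^2                      */
   COMBA_STORE(b[2]);
   COMBA_STORE2(b[3]);        /* top carry becomes the highest digit */
   COMBA_FINI;
#endif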

#ifdef TFM_SMALL_SET
    #include "fp_sqr_comba_small_set.i"
#endif

#if defined(TFM_SQR3)
    #include "fp_sqr_comba_3.i"
#endif
#if defined(TFM_SQR4)
    #include "fp_sqr_comba_4.i"
#endif
#if defined(TFM_SQR6)
    #include "fp_sqr_comba_6.i"
#endif
#if defined(TFM_SQR7)
    #include "fp_sqr_comba_7.i"
#endif
#if defined(TFM_SQR8)
    #include "fp_sqr_comba_8.i"
#endif
#if defined(TFM_SQR9)
    #include "fp_sqr_comba_9.i"
#endif
#if defined(TFM_SQR12)
    #include "fp_sqr_comba_12.i"
#endif
#if defined(TFM_SQR17)
    #include "fp_sqr_comba_17.i"
#endif
#if defined(TFM_SQR20)
    #include "fp_sqr_comba_20.i"
#endif
#if defined(TFM_SQR24)
    #include "fp_sqr_comba_24.i"
#endif
#if defined(TFM_SQR28)
    #include "fp_sqr_comba_28.i"
#endif
#if defined(TFM_SQR32)
    #include "fp_sqr_comba_32.i"
#endif
#if defined(TFM_SQR48)
    #include "fp_sqr_comba_48.i"
#endif
#if defined(TFM_SQR64)
    #include "fp_sqr_comba_64.i"
#endif
/* end fp_sqr_comba.c asm */

/* start fp_mul_comba.c asm */
/* these are the combas.  Worship them. */
#if defined(TFM_X86)
/* Generic x86 optimized code */

/* anything you need at the start */
#define COMBA_START

/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;

/* anything you need at the end */
#define COMBA_FINI

/* this should multiply i and j  */
#define MULADD(i, j)                                      \
__asm__(                                                      \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");

#elif defined(TFM_X86_64)
/* x86-64 optimized */

/* anything you need at the start */
#define COMBA_START

/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;

/* anything you need at the end */
#define COMBA_FINI

/* this should multiply i and j  */
#define MULADD(i, j)                                      \
__asm__  (                                                    \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");


#if defined(HAVE_INTEL_MULX)
#define MULADD_MULX(b0, c0, c1, rdx)\
    __asm__  volatile (                                   \
         "movq   %3, %%rdx\n\t"                           \
         "mulx  %2,%%r9, %%r8 \n\t"                       \
         "adoxq  %%r9,%0     \n\t"                        \
         "adcxq  %%r8,%1     \n\t"                        \
         :"+r"(c0),"+r"(c1):"r"(b0), "r"(rdx):"%r8","%r9","%r10","%rdx"\
    )


#define MULADD_MULX_ADD_CARRY(c0, c1)\
    __asm__ volatile(\
    "mov $0, %%r10\n\t"\
    "movq %1, %%r8\n\t"\
    "adox %%r10, %0\n\t"\
    "adcx %%r10, %1\n\t"\
    :"+r"(c0),"+r"(c1)::"%r8","%r9","%r10","%rdx") ;

#define MULADD_SET_A(a0)\
    __asm__ volatile("add $0, %%r8\n\t"                   \
             "movq  %0,%%rdx\n\t"                         \
             ::"r"(a0):"%r8","%r9","%r10","%rdx") ;

#define MULADD_BODY(a,b,c)\
    {   word64 rdx = a->dp[ix] ;      \
        cp = &(c->dp[iz]) ;           \
        c0 = cp[0] ; c1 = cp[1];      \
        MULADD_SET_A(rdx) ;           \
        MULADD_MULX(b0, c0, c1, rdx) ;\
        cp[0]=c0; c0=cp[2];           \
        MULADD_MULX(b1, c1, c0, rdx) ;\
        cp[1]=c1; c1=cp[3];           \
        MULADD_MULX(b2, c0, c1, rdx) ;\
        cp[2]=c0; c0=cp[4];           \
        MULADD_MULX(b3, c1, c0, rdx) ;\
        cp[3]=c1; c1=cp[5];           \
        MULADD_MULX_ADD_CARRY(c0, c1);\
        cp[4]=c0; cp[5]=c1;           \
    }

#define TFM_INTEL_MUL_COMBA(a, b, c)\
  for(ix=0; ix<pa; ix++)c->dp[ix]=0 ; \
  for(iy=0; (iy<b->used); iy+=4) {    \
    fp_digit *bp ;                    \
    bp = &(b->dp[iy+0]) ;             \
    fp_digit b0 = bp[0] , b1= bp[1],  \
             b2= bp[2], b3= bp[3];    \
    ix=0, iz=iy;                      \
    while(ix<a->used) {               \
        fp_digit c0, c1;              \
        fp_digit *cp ;                \
        MULADD_BODY(a,b,c);           \
        ix++ ; iz++ ;                 \
    }                                 \
};
#endif
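
/* Note: TFM_INTEL_MUL_COMBA above walks b->dp four digits at a time
 * (b0..b3) after zeroing pa output digits, so it appears to assume operand
 * lengths compatible with that 4-way unrolling; the generic MULADD paths
 * below carry no such restriction. */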

#elif defined(TFM_SSE2)
/* use SSE2 optimizations */

/* anything you need at the start */
#define COMBA_START

/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;

/* anything you need at the end */
#define COMBA_FINI \
   __asm__("emms");

/* this should multiply i and j  */
#define MULADD(i, j)                                     \
__asm__(                                                     \
    "movd  %6,%%mm0     \n\t"                            \
    "movd  %7,%%mm1     \n\t"                            \
    "pmuludq %%mm1,%%mm0\n\t"                            \
    "movd  %%mm0,%%eax  \n\t"                            \
    "psrlq $32,%%mm0    \n\t"                            \
    "addl  %%eax,%0     \n\t"                            \
    "movd  %%mm0,%%eax  \n\t"                            \
    "adcl  %%eax,%1     \n\t"                            \
    "adcl  $0,%2        \n\t"                            \
    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","cc");

#elif defined(TFM_ARM)
/* ARM code */

#define COMBA_START

#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define COMBA_FINI

#define MULADD(i, j)                                          \
__asm__(                                                          \
"  UMULL  r0,r1,%6,%7           \n\t"                         \
"  ADDS   %0,%0,r0              \n\t"                         \
"  ADCS   %1,%1,r1              \n\t"                         \
"  ADC    %2,%2,#0              \n\t"                         \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");

#elif defined(TFM_PPC32)
/* For 32-bit PPC */

#define COMBA_START

#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define COMBA_FINI

/* untested: will mulhwu change the flags?  Docs say no */
#define MULADD(i, j)              \
__asm__(                              \
   " mullw  16,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhwu 16,%6,%7       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");

#elif defined(TFM_PPC64)
/* For 64-bit PPC */

#define COMBA_START

#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define COMBA_FINI

/* untested: will mulhdu change the flags?  Docs say no */
#define MULADD(i, j)              \
__asm__(                              \
   " mulld  16,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhdu 16,%6,%7       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");

#elif defined(TFM_AVR32)

/* AVR32 code */

#define COMBA_START

#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define COMBA_FINI

#define MULADD(i, j)             \
__asm__(                             \
   " mulu.d r2,%6,%7        \n\t"\
   " add    %0,r2           \n\t"\
   " adc    %1,%1,r3        \n\t"\
   " acr    %2              \n\t"\
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3");

#else
/* ISO C code */

#define COMBA_START

#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define COMBA_FINI

#define MULADD(i, j)                                                  \
   do { fp_word t;                                                    \
   t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = (fp_digit)t;   \
   t = (fp_word)c1 + (t >> DIGIT_BIT);                                \
   c1 = (fp_digit)t; c2 += (fp_digit)(t >> DIGIT_BIT);                \
   } while (0);

#endif
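
/* Illustrative sketch (not part of the build): the column pattern the
 * generated fp_mul_comba_N() bodies (the .i files included below) build
 * from MULADD.  For a 2x2-digit product c = a * b: */
#if 0
   COMBA_START;
   COMBA_CLEAR;
   MULADD(a[0], b[0]);                       /* column 0  */
   COMBA_STORE(c[0]);
   COMBA_FORWARD;
   MULADD(a[0], b[1]); MULADD(a[1], b[0]);   /* column 1  */
   COMBA_STORE(c[1]);
   COMBA_FORWARD;
   MULADD(a[1], b[1]);                       /* column 2  */
   COMBA_STORE(c[2]);
   COMBA_STORE2(c[3]);                       /* top carry */
   COMBA_FINI;
#endif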


#ifdef TFM_SMALL_SET
    #include "fp_mul_comba_small_set.i"
#endif

#if defined(TFM_MUL3)
    #include "fp_mul_comba_3.i"
#endif
#if defined(TFM_MUL4)
    #include "fp_mul_comba_4.i"
#endif
#if defined(TFM_MUL6)
    #include "fp_mul_comba_6.i"
#endif
#if defined(TFM_MUL7)
    #include "fp_mul_comba_7.i"
#endif
#if defined(TFM_MUL8)
    #include "fp_mul_comba_8.i"
#endif
#if defined(TFM_MUL9)
    #include "fp_mul_comba_9.i"
#endif
#if defined(TFM_MUL12)
    #include "fp_mul_comba_12.i"
#endif
#if defined(TFM_MUL17)
    #include "fp_mul_comba_17.i"
#endif
#if defined(TFM_MUL20)
    #include "fp_mul_comba_20.i"
#endif
#if defined(TFM_MUL24)
    #include "fp_mul_comba_24.i"
#endif
#if defined(TFM_MUL28)
    #include "fp_mul_comba_28.i"
#endif
#if defined(TFM_MUL32)
    #include "fp_mul_comba_32.i"
#endif
#if defined(TFM_MUL48)
    #include "fp_mul_comba_48.i"
#endif
#if defined(TFM_MUL64)
    #include "fp_mul_comba_64.i"
#endif

/* end fp_mul_comba.c asm */