 * Copyright (C) 2006-2015 wolfSSL Inc.
 *
 * This file is part of wolfSSL. (formerly known as CyaSSL)
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 */

#include <wolfssl/wolfcrypt/settings.h>

/*
 * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca,
 * http://math.libtomcrypt.com
 */
/******************************************************************/
/* fp_montgomery_reduce.c asm or generic */

/* Each platform needs to query cpuid to see which instruction-set extensions
 * (here BMI2/ADX for MULX) are supported. Also, let's set up a macro for
 * proper linkage w/o ABI conflicts; a standalone sketch of the probe follows
 * this block.
 */
#if defined(HAVE_INTEL_MULX)

#define cpuid(reg, leaf, sub)\
    __asm__ __volatile__ ("cpuid":\
        "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
        "a" (leaf), "c"(sub));

#define XASM_LINK(f) asm(f)

#define cpuid(a,b) __cpuid((int*)a,b)

#define CPUID_AVX1   0x1
#define CPUID_AVX2   0x2
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define CPUID_BMI2   0x10   /* MULX, RORX */
#define CPUID_ADX    0x20   /* ADCX, ADOX */

#define IS_INTEL_AVX1   (cpuid_flags & CPUID_AVX1)
#define IS_INTEL_AVX2   (cpuid_flags & CPUID_AVX2)
#define IS_INTEL_BMI2   (cpuid_flags & CPUID_BMI2)
#define IS_INTEL_ADX    (cpuid_flags & CPUID_ADX)
#define IS_INTEL_RDRAND (cpuid_flags & CPUID_RDRAND)
#define IS_INTEL_RDSEED (cpuid_flags & CPUID_RDSEED)

static word32 cpuid_check = 0;
static word32 cpuid_flags = 0;

static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
    if (memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
        memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
        memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
        cpuid(reg, leaf, sub);
        return ((reg[num] >> bit) & 0x1);

INLINE static int set_cpuid_flags(void) {
    if (cpuid_check == 0) {
        if (cpuid_flag(7, 0, EBX, 8))  { cpuid_flags |= CPUID_BMI2; }
        if (cpuid_flag(7, 0, EBX, 19)) { cpuid_flags |= CPUID_ADX;  }

#define RETURN return
#define IF_HAVE_INTEL_MULX(func, ret) \
    if (cpuid_check == 0) set_cpuid_flags(); \
    if (IS_INTEL_BMI2 && IS_INTEL_ADX) { func; ret; }

#define IF_HAVE_INTEL_MULX(func, ret)
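/* A minimal standalone sketch (not compiled into the build) of the same
 * leaf-7/sub-leaf-0 probe used above; the cpuid_bit() helper and main() are
 * hypothetical names used only for illustration. */
#if 0
#include <stdio.h>
static unsigned int cpuid_bit(unsigned int leaf, unsigned int sub,
                              unsigned int idx, unsigned int bit)
{
    unsigned int reg[4];
    __asm__ __volatile__ ("cpuid"
        : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
        : "a"(leaf), "c"(sub));
    return (reg[idx] >> bit) & 0x1;
}
int main(void)
{
    /* leaf 7, sub-leaf 0: EBX bit 8 = BMI2 (MULX), EBX bit 19 = ADX */
    printf("BMI2: %u  ADX: %u\n", cpuid_bit(7, 0, 1, 8), cpuid_bit(7, 0, 1, 19));
    return 0;
}
#endif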
#if defined(TFM_X86) && !defined(TFM_SSE2)

    "movl %5,%%eax \n\t" \
    "addl %1,%%eax \n\t" \
    "adcl $0,%%edx \n\t" \
    "addl %%eax,%0 \n\t" \
    "adcl $0,%%edx \n\t" \
    "movl %%edx,%1 \n\t" \
    :"=g"(_c[LO]), "=r"(cy) \
    :"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \
    : "%eax", "%edx", "cc")

    "movzbl %%al,%1 \n\t" \
    :"=g"(_c[LO]), "=r"(cy) \
    :"0"(_c[LO]), "1"(cy) \

/******************************************************************/
#elif defined(TFM_X86_64)

    "movq %5,%%rax \n\t" \
    "addq %1,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "addq %%rax,%0 \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq %%rdx,%1 \n\t" \
    :"=g"(_c[LO]), "=r"(cy) \
    :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
    : "%rax", "%rdx", "cc")
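/* For reference, one INNERMUL step above is the portable multiply-accumulate
 *     t = (fp_word)_c[0] + (fp_word)cy + (fp_word)mu * (fp_word)*tmpm++;
 *     _c[0] = (fp_digit)t;  cy = (fp_digit)(t >> DIGIT_BIT);
 * (the same computation as the ISO C branch near the end of this section). */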
#if defined(HAVE_INTEL_MULX)
#define MULX_INIT(a0, c0, cy)\
    "xorq  %%r10, %%r10\n\t" \
    "movq  %1,%%rdx\n\t" \
    "addq  %2, %0\n\t"       /* c0 += cy; sets CF, OF */ \
    "adoxq %%r10, %%r10\n\t" /* reset OF */ \
    :"+m"(c0):"r"(a0),"r"(cy):"%r8","%r9","%r10","%r11","%r12","%rdx"); \

#define MULX_INNERMUL_R1(c0, c1, pre, rdx)\
    "movq  %3, %%rdx\n\t" \
    "mulx  %%r11,%%r9, %%r8 \n\t" \
    "movq  %2, %%r12\n\t" \
    "adoxq %%r9,%0 \n\t" \
    "adcxq %%r8,%1 \n\t" \
    :"+r"(c0),"+r"(c1):"m"(pre),"r"(rdx):"%r8","%r9", "%r10", "%r11","%r12","%rdx" \

#define MULX_INNERMUL_R2(c0, c1, pre, rdx)\
    "movq  %3, %%rdx\n\t" \
    "mulx  %%r12,%%r9, %%r8 \n\t" \
    "movq  %2, %%r11\n\t" \
    "adoxq %%r9,%0 \n\t" \
    "adcxq %%r8,%1 \n\t" \
    :"+r"(c0),"+r"(c1):"m"(pre),"r"(rdx):"%r8","%r9", "%r10", "%r11","%r12","%rdx" \

#define MULX_LOAD_R1(val)\
    "movq %0, %%r11\n\t"\
    ::"m"(val):"%r8","%r9", "%r10", "%r11","%r12","%rdx"\

#define MULX_INNERMUL_LAST(c0, c1, rdx)\
    "movq  %2, %%rdx\n\t" \
    "mulx  %%r12,%%r9, %%r8 \n\t" \
    "movq  $0, %%r10 \n\t" \
    "adoxq %%r10, %%r9 \n\t" \
    "adcq  $0,%%r8 \n\t" \
    "addq  %%r9,%0 \n\t" \
    "adcq  $0,%%r8 \n\t" \
    "movq  %%r8,%1 \n\t" \
    :"+m"(c0),"=m"(c1):"r"(rdx):"%r8","%r9","%r10", "%r11", "%r12","%rdx"\

#define MULX_INNERMUL8(x, y, z, cy)\
    MULX_LOAD_R1(x[0]);\
    MULX_INIT(y, _c0, cy); /* rdx = y; z0 += cy; */ \
    MULX_INNERMUL_R1(_c0, _c1, x[1], rdx);\
    MULX_INNERMUL_R2(_c1, _c2, x[2], rdx);\
    MULX_INNERMUL_R1(_c2, _c3, x[3], rdx);\
    MULX_INNERMUL_R2(_c3, _c4, x[4], rdx);\
    MULX_INNERMUL_R1(_c4, _c5, x[5], rdx);\
    MULX_INNERMUL_R2(_c5, _c6, x[6], rdx);\
    MULX_INNERMUL_R1(_c6, _c7, x[7], rdx);\
    MULX_INNERMUL_LAST(_c7, cy, rdx);\

#define INNERMUL8_MULX \
    MULX_INNERMUL8(tmpm, mu, _c, cy);\
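/* MULX writes its product without touching the flags, and ADCX/ADOX consume
 * different flags (CF vs. OF), so the eight multiplies above can drive two
 * independent carry chains in parallel instead of serializing on the single
 * carry flag the way plain ADD/ADC sequences do. */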
    "movq 0(%5),%%rax    \n\t" \
    "movq 0(%2),%%r10    \n\t" \
    "movq 0x8(%5),%%r11  \n\t" \
    "addq %%r10,%%rax    \n\t" \
    "adcq $0,%%rdx       \n\t" \
    "movq 0x8(%2),%%r10  \n\t" \
    "addq %3,%%rax       \n\t" \
    "adcq $0,%%rdx       \n\t" \
    "movq %%rax,0(%0)    \n\t" \
    "movq %%rdx,%1       \n\t" \

    "movq %%r11,%%rax    \n\t" \
    "movq 0x10(%5),%%r11 \n\t" \
    "addq %%r10,%%rax    \n\t" \
    "adcq $0,%%rdx       \n\t" \
    "movq 0x10(%2),%%r10 \n\t" \
    "addq %3,%%rax       \n\t" \
    "adcq $0,%%rdx       \n\t" \
    "movq %%rax,0x8(%0)  \n\t" \
    "movq %%rdx,%1       \n\t" \

    "movq %%r11,%%rax    \n\t" \
    "movq 0x18(%5),%%r11 \n\t" \
    "addq %%r10,%%rax    \n\t" \
    "adcq $0,%%rdx       \n\t" \
    "movq 0x18(%2),%%r10 \n\t" \
    "addq %3,%%rax       \n\t" \
    "adcq $0,%%rdx       \n\t" \
    "movq %%rax,0x10(%0) \n\t" \
    "movq %%rdx,%1       \n\t" \

    "movq %%r11,%%rax    \n\t" \
    "movq 0x20(%5),%%r11 \n\t" \
    "addq %%r10,%%rax    \n\t" \
    "adcq $0,%%rdx       \n\t" \
    "movq 0x20(%2),%%r10 \n\t" \
    "addq %3,%%rax       \n\t" \
    "adcq $0,%%rdx       \n\t" \
    "movq %%rax,0x18(%0) \n\t" \
    "movq %%rdx,%1       \n\t" \

    "movq %%r11,%%rax    \n\t" \
    "movq 0x28(%5),%%r11 \n\t" \
    "addq %%r10,%%rax    \n\t" \
    "adcq $0,%%rdx       \n\t" \
    "movq 0x28(%2),%%r10 \n\t" \
    "addq %3,%%rax       \n\t" \
    "adcq $0,%%rdx       \n\t" \
    "movq %%rax,0x20(%0) \n\t" \
    "movq %%rdx,%1       \n\t" \

    "movq %%r11,%%rax    \n\t" \
    "movq 0x30(%5),%%r11 \n\t" \
    "addq %%r10,%%rax    \n\t" \
    "adcq $0,%%rdx       \n\t" \
    "movq 0x30(%2),%%r10 \n\t" \
    "addq %3,%%rax       \n\t" \
    "adcq $0,%%rdx       \n\t" \
    "movq %%rax,0x28(%0) \n\t" \
    "movq %%rdx,%1       \n\t" \

    "movq %%r11,%%rax    \n\t" \
    "movq 0x38(%5),%%r11 \n\t" \
    "addq %%r10,%%rax    \n\t" \
    "adcq $0,%%rdx       \n\t" \
    "movq 0x38(%2),%%r10 \n\t" \
    "addq %3,%%rax       \n\t" \
    "adcq $0,%%rdx       \n\t" \
    "movq %%rax,0x30(%0) \n\t" \
    "movq %%rdx,%1       \n\t" \

    "movq %%r11,%%rax    \n\t" \
    "addq %%r10,%%rax    \n\t" \
    "adcq $0,%%rdx       \n\t" \
    "addq %3,%%rax       \n\t" \
    "adcq $0,%%rdx       \n\t" \
    "movq %%rax,0x38(%0) \n\t" \
    "movq %%rdx,%1       \n\t" \

    :"=r"(_c), "=r"(cy) \
    : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
    : "%rax", "%rdx", "%r10", "%r11", "cc")\
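/* Portable model of the unrolled INNERMUL8 above (the elided multiply
 * instructions compute mu * tmpm[k]):
 *     for (k = 0; k < 8; k++) {
 *         t = (fp_word)_c[k] + (fp_word)cy + (fp_word)mu * (fp_word)tmpm[k];
 *         _c[k] = (fp_digit)t;  cy = (fp_digit)(t >> DIGIT_BIT);
 *     }
 * with the caller expected to advance _c and tmpm by 8 digits per call. */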
    "movzbq %%al,%1 \n\t" \
    :"=g"(_c[LO]), "=r"(cy) \
    :"0"(_c[LO]), "1"(cy) \

/******************************************************************/
#elif defined(TFM_SSE2)
/* SSE2 code (assumes 32-bit fp_digits) */
/* MMX register assignments (the SSE2 pmuludq form operates on mm0-mm7):
 * mm0  *tmpm++, then Mu * (*tmpm++)
__asm__("movd %0,%%mm2"::"g"(mp))

    "movd %0,%%mm1        \n\t" \
    "pxor %%mm3,%%mm3     \n\t" \
    "pmuludq %%mm2,%%mm1  \n\t" \

/* pmuludq on mmx registers does a 32x32->64 multiply. */

    "movd %1,%%mm4        \n\t" \
    "movd %2,%%mm0        \n\t" \
    "paddq %%mm4,%%mm3    \n\t" \
    "pmuludq %%mm1,%%mm0  \n\t" \
    "paddq %%mm0,%%mm3    \n\t" \
    "movd %%mm3,%0        \n\t" \
    "psrlq $32, %%mm3     \n\t" \
    :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );
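/* Worked pmuludq example: only the low 32 bits of each mm operand enter the
 * multiply, so with operands 0x00000002ffffffff and 0x0000000000000003 the
 * result is 0xffffffff * 3 = 0x00000002fffffffd as a full 64-bit value; any
 * garbage in the high halves is ignored. */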
    "movd 0(%1),%%mm4    \n\t" \
    "movd 0(%2),%%mm0    \n\t" \
    "paddq %%mm4,%%mm3   \n\t" \
    "pmuludq %%mm1,%%mm0 \n\t" \
    "movd 4(%2),%%mm5    \n\t" \
    "paddq %%mm0,%%mm3   \n\t" \
    "movd 4(%1),%%mm6    \n\t" \
    "movd %%mm3,0(%0)    \n\t" \
    "psrlq $32, %%mm3    \n\t" \

    "paddq %%mm6,%%mm3   \n\t" \
    "pmuludq %%mm1,%%mm5 \n\t" \
    "movd 8(%2),%%mm6    \n\t" \
    "paddq %%mm5,%%mm3   \n\t" \
    "movd 8(%1),%%mm7    \n\t" \
    "movd %%mm3,4(%0)    \n\t" \
    "psrlq $32, %%mm3    \n\t" \

    "paddq %%mm7,%%mm3   \n\t" \
    "pmuludq %%mm1,%%mm6 \n\t" \
    "movd 12(%2),%%mm7   \n\t" \
    "paddq %%mm6,%%mm3   \n\t" \
    "movd 12(%1),%%mm5   \n\t" \
    "movd %%mm3,8(%0)    \n\t" \
    "psrlq $32, %%mm3    \n\t" \

    "paddq %%mm5,%%mm3   \n\t" \
    "pmuludq %%mm1,%%mm7 \n\t" \
    "movd 16(%2),%%mm5   \n\t" \
    "paddq %%mm7,%%mm3   \n\t" \
    "movd 16(%1),%%mm6   \n\t" \
    "movd %%mm3,12(%0)   \n\t" \
    "psrlq $32, %%mm3    \n\t" \

    "paddq %%mm6,%%mm3   \n\t" \
    "pmuludq %%mm1,%%mm5 \n\t" \
    "movd 20(%2),%%mm6   \n\t" \
    "paddq %%mm5,%%mm3   \n\t" \
    "movd 20(%1),%%mm7   \n\t" \
    "movd %%mm3,16(%0)   \n\t" \
    "psrlq $32, %%mm3    \n\t" \

    "paddq %%mm7,%%mm3   \n\t" \
    "pmuludq %%mm1,%%mm6 \n\t" \
    "movd 24(%2),%%mm7   \n\t" \
    "paddq %%mm6,%%mm3   \n\t" \
    "movd 24(%1),%%mm5   \n\t" \
    "movd %%mm3,20(%0)   \n\t" \
    "psrlq $32, %%mm3    \n\t" \

    "paddq %%mm5,%%mm3   \n\t" \
    "pmuludq %%mm1,%%mm7 \n\t" \
    "movd 28(%2),%%mm5   \n\t" \
    "paddq %%mm7,%%mm3   \n\t" \
    "movd 28(%1),%%mm6   \n\t" \
    "movd %%mm3,24(%0)   \n\t" \
    "psrlq $32, %%mm3    \n\t" \

    "paddq %%mm6,%%mm3   \n\t" \
    "pmuludq %%mm1,%%mm5 \n\t" \
    "paddq %%mm5,%%mm3   \n\t" \
    "movd %%mm3,28(%0)   \n\t" \
    "psrlq $32, %%mm3    \n\t" \
    :"=r"(_c) : "0"(_c), "r"(tmpm) );
/* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack */

__asm__( "movd %%mm3,%0  \n" :"=r"(cy))
    "movzbl %%al,%1 \n\t" \
    :"=g"(_c[LO]), "=r"(cy) \
    :"0"(_c[LO]), "1"(cy) \

/******************************************************************/
#elif defined(TFM_ARM)

    " ADDS  r0,r0,%0        \n\t" \
    " MOVCS %0,#1           \n\t" \
    " MOVCC %0,#0           \n\t" \
    " UMLAL r0,%0,%3,%4     \n\t" \
    :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0]):"r0","cc");

    " ADDS  r0,r0,%0        \n\t" \
    " MOVCS %0,#1           \n\t" \
    " MOVCC %0,#0           \n\t" \
    :"=r"(cy),"=m"(_c[0]):"0"(cy),"m"(_c[0]):"r0","cc");

/* TAO thumb mode uses ite (if then else) to detect carry directly
 * fixed unmatched constraint warning by changing 1 to m */

#else /* __thumb__ */

    " ADDS  r0,r0,%0        \n\t" \
    " MOVCS %0,#1           \n\t" \
    " MOVCC %0,#0           \n\t" \
    " UMLAL r0,%0,%3,%4     \n\t" \
    :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");

    " ADDS  r0,r0,%0        \n\t" \
    " MOVCS %0,#1           \n\t" \
    " MOVCC %0,#0           \n\t" \
    :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");

#endif /* __thumb__ */
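/* UMLAL rLo,rHi,a,b performs {rHi:rLo} += a*b as a single 64-bit accumulate,
 * which is why the MOVCS/MOVCC pair first reduces the previous carry to 0/1
 * in %0 before %0 is reused as the high half of the accumulator. */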
#elif defined(TFM_PPC32)

    " mullw  16,%3,%4       \n\t" \
    " mulhwu 17,%3,%4       \n\t" \
    " addc   16,16,%0       \n\t" \
    " addze  17,17          \n\t" \
    " addc   16,16,18       \n\t" \
    " addze  %0,17          \n\t" \
    :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;

    " addc   16,16,%0       \n\t" \
    " xor    %0,%0,%0       \n\t" \
    " addze  %0,%0          \n\t" \
    :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");
#elif defined(TFM_PPC64)

    " mulld  16,%3,%4       \n\t" \
    " mulhdu 17,%3,%4       \n\t" \
    " addc   16,16,%0       \n\t" \
    " addze  17,17          \n\t" \
    " ldx    18,0,%1        \n\t" \
    " addc   16,16,18       \n\t" \
    " addze  %0,17          \n\t" \
    " stdx   16,0,%1        \n\t" \
    :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;

    " ldx    16,0,%1        \n\t" \
    " addc   16,16,%0       \n\t" \
    " stdx   16,0,%1        \n\t" \
    " xor    %0,%0,%0       \n\t" \
    " addze  %0,%0          \n\t" \
    :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");
/******************************************************************/

#elif defined(TFM_AVR32)

    " macu.d r2,%3,%4       \n\t" \
    :"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");

    :"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc");

    t = ((fp_word)_c[0] + (fp_word)cy) + \
        (((fp_word)mu) * ((fp_word)*tmpm++)); \
    _c[0] = (fp_digit)t; \
    cy = (fp_digit)(t >> DIGIT_BIT); \

   do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)
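/* A standalone toy demo (not compiled into the build) of the portable
 * INNERMUL step above, using hypothetical 8-bit digits so the split into
 * digit and carry is easy to follow; every name here is local to the sketch. */
#if 0
#include <stdint.h>
#include <stdio.h>
int main(void)
{
    typedef uint8_t  demo_digit;   /* stand-in for fp_digit, DIGIT_BIT = 8 */
    typedef uint16_t demo_word;    /* stand-in for fp_word */
    demo_digit c0 = 0xF0, m0 = 0xAB, mu = 0x37, cy = 0;
    demo_word  t  = (demo_word)c0 + (demo_word)cy + (demo_word)mu * (demo_word)m0;
    c0 = (demo_digit)t;            /* low digit of mu*m0 + c0 + cy */
    cy = (demo_digit)(t >> 8);     /* carry into the next digit    */
    printf("c0=0x%02X cy=0x%02X\n", c0, cy);   /* prints c0=0xAD cy=0x25 */
    return 0;
}
#endif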
/******************************************************************/

/* end fp_montgomery_reduce.c asm */


/* start fp_sqr_comba.c asm */
/* x86-32 optimized */

#define CLEAR_CARRY \

#define COMBA_STORE(x) \

#define COMBA_STORE2(x) \

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define SQRADD(i, j) \
    "movl %6,%%eax \n\t" \
    "addl %%eax,%0 \n\t" \
    "adcl %%edx,%1 \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc");

#define SQRADD2(i, j) \
    "movl %6,%%eax \n\t" \
    "addl %%eax,%0 \n\t" \
    "adcl %%edx,%1 \n\t" \
    "addl %%eax,%0 \n\t" \
    "adcl %%edx,%1 \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx", "cc");

#define SQRADDSC(i, j) \
    "movl %3,%%eax \n\t" \
    "movl %%eax,%0 \n\t" \
    "movl %%edx,%1 \n\t" \
    :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","cc");

/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */

#define SQRADDAC(i, j) \
    "movl %6,%%eax \n\t" \
    "addl %%eax,%0 \n\t" \
    "adcl %%edx,%1 \n\t" \
    :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");

    :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");

#elif defined(TFM_X86_64)
/* x86-64 optimized */

#define CLEAR_CARRY \

#define COMBA_STORE(x) \

#define COMBA_STORE2(x) \

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define SQRADD(i, j) \
    "movq %6,%%rax \n\t" \
    "addq %%rax,%0 \n\t" \
    "adcq %%rdx,%1 \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc");

#define SQRADD2(i, j) \
    "movq %6,%%rax \n\t" \
    "addq %%rax,%0 \n\t" \
    "adcq %%rdx,%1 \n\t" \
    "addq %%rax,%0 \n\t" \
    "adcq %%rdx,%1 \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");

#define SQRADDSC(i, j) \
    "movq %3,%%rax \n\t" \
    "movq %%rax,%0 \n\t" \
    "movq %%rdx,%1 \n\t" \
    :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc");

/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */

#define SQRADDAC(i, j) \
    "movq %6,%%rax \n\t" \
    "addq %%rax,%0 \n\t" \
    "adcq %%rdx,%1 \n\t" \
    :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");

    :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");

#elif defined(TFM_SSE2)

#define CLEAR_CARRY \

#define COMBA_STORE(x) \

#define COMBA_STORE2(x) \

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define SQRADD(i, j) \
    "movd %6,%%mm0      \n\t" \
    "pmuludq %%mm0,%%mm0\n\t" \
    "movd %%mm0,%%eax   \n\t" \
    "psrlq $32,%%mm0    \n\t" \
    "addl %%eax,%0      \n\t" \
    "movd %%mm0,%%eax   \n\t" \
    "adcl %%eax,%1      \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc");

#define SQRADD2(i, j) \
    "movd %6,%%mm0      \n\t" \
    "movd %7,%%mm1      \n\t" \
    "pmuludq %%mm1,%%mm0\n\t" \
    "movd %%mm0,%%eax   \n\t" \
    "psrlq $32,%%mm0    \n\t" \
    "movd %%mm0,%%edx   \n\t" \
    "addl %%eax,%0      \n\t" \
    "adcl %%edx,%1      \n\t" \
    "addl %%eax,%0      \n\t" \
    "adcl %%edx,%1      \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");

#define SQRADDSC(i, j) \
    "movd %3,%%mm0      \n\t" \
    "movd %4,%%mm1      \n\t" \
    "pmuludq %%mm1,%%mm0\n\t" \
    "movd %%mm0,%0      \n\t" \
    "psrlq $32,%%mm0    \n\t" \
    "movd %%mm0,%1      \n\t" \
    :"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j));

/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */

#define SQRADDAC(i, j) \
    "movd %6,%%mm0      \n\t" \
    "movd %7,%%mm1      \n\t" \
    "pmuludq %%mm1,%%mm0\n\t" \
    "movd %%mm0,%%eax   \n\t" \
    "psrlq $32,%%mm0    \n\t" \
    "movd %%mm0,%%edx   \n\t" \
    "addl %%eax,%0      \n\t" \
    "adcl %%edx,%1      \n\t" \
    :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","cc");

    :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");

#elif defined(TFM_ARM)

#define CLEAR_CARRY \

#define COMBA_STORE(x) \

#define COMBA_STORE2(x) \

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j) \
    " UMULL r0,r1,%6,%6 \n\t" \
    " ADDS  %0,%0,r0    \n\t" \
    " ADCS  %1,%1,r1    \n\t" \
    " ADC   %2,%2,#0    \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");

/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j) \
    " UMULL r0,r1,%6,%7 \n\t" \
    " ADDS  %0,%0,r0    \n\t" \
    " ADCS  %1,%1,r1    \n\t" \
    " ADC   %2,%2,#0    \n\t" \
    " ADDS  %0,%0,r0    \n\t" \
    " ADCS  %1,%1,r1    \n\t" \
    " ADC   %2,%2,#0    \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");

#define SQRADDSC(i, j) \
    " UMULL %0,%1,%3,%4 \n\t" \
    " SUB   %2,%2,%2    \n\t" \
    :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "r"(i), "r"(j) : "cc");

/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */

#define SQRADDAC(i, j) \
    " UMULL r0,r1,%6,%7 \n\t" \
    " ADDS  %0,%0,r0    \n\t" \
    " ADCS  %1,%1,r1    \n\t" \
    " ADC   %2,%2,#0    \n\t" \
    :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");

    " ADDS  %0,%0,%3    \n\t" \
    " ADCS  %1,%1,%4    \n\t" \
    " ADC   %2,%2,%5    \n\t" \
    " ADDS  %0,%0,%3    \n\t" \
    " ADCS  %1,%1,%4    \n\t" \
    " ADC   %2,%2,%5    \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
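/* SQRADDDB folds the saved cross product (sc2:sc1:sc0) into the running
 * column twice, i.e. c += 2*sc: the doubled off-diagonal term of the square. */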
#elif defined(TFM_PPC32)

#define CLEAR_CARRY \

#define COMBA_STORE(x) \

#define COMBA_STORE2(x) \

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j) \
    " mullw  16,%6,%6 \n\t" \
    " addc   %0,%0,16 \n\t" \
    " mulhwu 16,%6,%6 \n\t" \
    " adde   %1,%1,16 \n\t" \
    " addze  %2,%2    \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");

/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j) \
    " mullw  16,%6,%7 \n\t" \
    " mulhwu 17,%6,%7 \n\t" \
    " addc   %0,%0,16 \n\t" \
    " adde   %1,%1,17 \n\t" \
    " addze  %2,%2    \n\t" \
    " addc   %0,%0,16 \n\t" \
    " adde   %1,%1,17 \n\t" \
    " addze  %2,%2    \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");

#define SQRADDSC(i, j) \
    " mullw  %0,%6,%7 \n\t" \
    " mulhwu %1,%6,%7 \n\t" \
    " xor    %2,%2,%2 \n\t" \
    :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");

#define SQRADDAC(i, j) \
    " mullw  16,%6,%7 \n\t" \
    " addc   %0,%0,16 \n\t" \
    " mulhwu 16,%6,%7 \n\t" \
    " adde   %1,%1,16 \n\t" \
    " addze  %2,%2    \n\t" \
    :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");

    " addc   %0,%0,%3 \n\t" \
    " adde   %1,%1,%4 \n\t" \
    " adde   %2,%2,%5 \n\t" \
    " addc   %0,%0,%3 \n\t" \
    " adde   %1,%1,%4 \n\t" \
    " adde   %2,%2,%5 \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");

#elif defined(TFM_PPC64)

#define CLEAR_CARRY \

#define COMBA_STORE(x) \

#define COMBA_STORE2(x) \

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j) \
    " mulld  16,%6,%6 \n\t" \
    " addc   %0,%0,16 \n\t" \
    " mulhdu 16,%6,%6 \n\t" \
    " adde   %1,%1,16 \n\t" \
    " addze  %2,%2    \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");

/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j) \
    " mulld  16,%6,%7 \n\t" \
    " mulhdu 17,%6,%7 \n\t" \
    " addc   %0,%0,16 \n\t" \
    " adde   %1,%1,17 \n\t" \
    " addze  %2,%2    \n\t" \
    " addc   %0,%0,16 \n\t" \
    " adde   %1,%1,17 \n\t" \
    " addze  %2,%2    \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");

#define SQRADDSC(i, j) \
    " mulld  %0,%6,%7 \n\t" \
    " mulhdu %1,%6,%7 \n\t" \
    " xor    %2,%2,%2 \n\t" \
    :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");

#define SQRADDAC(i, j) \
    " mulld  16,%6,%7 \n\t" \
    " addc   %0,%0,16 \n\t" \
    " mulhdu 16,%6,%7 \n\t" \
    " adde   %1,%1,16 \n\t" \
    " addze  %2,%2    \n\t" \
    :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");

    " addc   %0,%0,%3 \n\t" \
    " adde   %1,%1,%4 \n\t" \
    " adde   %2,%2,%5 \n\t" \
    " addc   %0,%0,%3 \n\t" \
    " adde   %1,%1,%4 \n\t" \
    " adde   %2,%2,%5 \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");

#elif defined(TFM_AVR32)

#define CLEAR_CARRY \

#define COMBA_STORE(x) \

#define COMBA_STORE2(x) \

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j) \
    " mulu.d r2,%6,%6 \n\t" \
    " add    %0,%0,r2 \n\t" \
    " adc    %1,%1,r3 \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3");

/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j) \
    " mulu.d r2,%6,%7 \n\t" \
    " add    %0,%0,r2 \n\t" \
    " adc    %1,%1,r3 \n\t" \
    " add    %0,%0,r2 \n\t" \
    " adc    %1,%1,r3 \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3");

#define SQRADDSC(i, j) \
    " mulu.d r2,%6,%7 \n\t" \
    :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3");

#define SQRADDAC(i, j) \
    " mulu.d r2,%6,%7 \n\t" \
    " add    %0,%0,r2 \n\t" \
    " adc    %1,%1,r3 \n\t" \
    :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3");

    " add    %0,%0,%3 \n\t" \
    " adc    %1,%1,%4 \n\t" \
    " adc    %2,%2,%5 \n\t" \
    " add    %0,%0,%3 \n\t" \
    " adc    %1,%1,%4 \n\t" \
    " adc    %2,%2,%5 \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");

/* ISO C portable code */

#define CLEAR_CARRY \

#define COMBA_STORE(x) \

#define COMBA_STORE2(x) \

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j) \
   t = c0 + ((fp_word)i) * ((fp_word)j); c0 = (fp_digit)t; \
   t = c1 + (t >> DIGIT_BIT);            c1 = (fp_digit)t; \
   c2 += (fp_digit)(t >> DIGIT_BIT); \
/* for squaring some of the terms are doubled; see the worked example after
 * these macros */
#define SQRADD2(i, j) \
   t  = ((fp_word)i) * ((fp_word)j); \
   tt = (fp_word)c0 + t;                   c0 = (fp_digit)tt; \
   tt = (fp_word)c1 + (tt >> DIGIT_BIT);   c1 = (fp_digit)tt; \
   c2 += (fp_digit)(tt >> DIGIT_BIT); \
   tt = (fp_word)c0 + t;                   c0 = (fp_digit)tt; \
   tt = (fp_word)c1 + (tt >> DIGIT_BIT);   c1 = (fp_digit)tt; \
   c2 += (fp_digit)(tt >> DIGIT_BIT); \

#define SQRADDSC(i, j) \
   t = ((fp_word)i) * ((fp_word)j); \
   sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0; \

#define SQRADDAC(i, j) \
   t = sc0 + ((fp_word)i) * ((fp_word)j);  sc0 = (fp_digit)t; \
   t = sc1 + (t >> DIGIT_BIT);             sc1 = (fp_digit)t; \
   sc2 += (fp_digit)(t >> DIGIT_BIT); \

   t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = (fp_digit)t; \
   t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); \
   c2 = c2 + (fp_digit)(((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT)); \
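/* Worked example of why SQRADD2/SQRADDDB double: squaring a two-digit value
 * (a1*B + a0) gives a0^2 + 2*a0*a1*B + a1^2*B^2, so each off-diagonal product
 * a0*a1 is accumulated twice while the diagonal squares use plain SQRADD.
 * With B = 10: 34^2 = 16 + 2*(3*4)*10 + 9*100 = 16 + 240 + 900 = 1156. */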
#ifdef TFM_SMALL_SET
    #include "fp_sqr_comba_small_set.i"

#if defined(TFM_SQR3)
    #include "fp_sqr_comba_3.i"

#if defined(TFM_SQR4)
    #include "fp_sqr_comba_4.i"

#if defined(TFM_SQR6)
    #include "fp_sqr_comba_6.i"

#if defined(TFM_SQR7)
    #include "fp_sqr_comba_7.i"

#if defined(TFM_SQR8)
    #include "fp_sqr_comba_8.i"

#if defined(TFM_SQR9)
    #include "fp_sqr_comba_9.i"

#if defined(TFM_SQR12)
    #include "fp_sqr_comba_12.i"

#if defined(TFM_SQR17)
    #include "fp_sqr_comba_17.i"

#if defined(TFM_SQR20)
    #include "fp_sqr_comba_20.i"

#if defined(TFM_SQR24)
    #include "fp_sqr_comba_24.i"

#if defined(TFM_SQR28)
    #include "fp_sqr_comba_28.i"

#if defined(TFM_SQR32)
    #include "fp_sqr_comba_32.i"

#if defined(TFM_SQR48)
    #include "fp_sqr_comba_48.i"

#if defined(TFM_SQR64)
    #include "fp_sqr_comba_64.i"

/* end fp_sqr_comba.c asm */


/* start fp_mul_comba.c asm */
/* these are the combas. Worship them. */
#if defined(TFM_X86)
/* Generic x86 optimized code */

/* anything you need at the start */

/* clear the chaining variables */
#define COMBA_CLEAR \

/* forward the carry to the next digit */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum */
#define COMBA_STORE(x) \

/* store the second sum [carry] */
#define COMBA_STORE2(x) \

/* anything you need at the end */

/* this should multiply i and j */
#define MULADD(i, j) \
    "movl %6,%%eax \n\t" \
    "addl %%eax,%0 \n\t" \
    "adcl %%edx,%1 \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");

#elif defined(TFM_X86_64)
/* x86-64 optimized */

/* anything you need at the start */

/* clear the chaining variables */
#define COMBA_CLEAR \

/* forward the carry to the next digit */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum */
#define COMBA_STORE(x) \

/* store the second sum [carry] */
#define COMBA_STORE2(x) \

/* anything you need at the end */

/* this should multiply i and j */
#define MULADD(i, j) \
    "movq %6,%%rax \n\t" \
    "addq %%rax,%0 \n\t" \
    "adcq %%rdx,%1 \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");

#if defined(HAVE_INTEL_MULX)
#define MULADD_MULX(b0, c0, c1, rdx)\
    __asm__ volatile ( \
        "movq  %3, %%rdx\n\t" \
        "mulx  %2,%%r9, %%r8 \n\t" \
        "adoxq %%r9,%0 \n\t" \
        "adcxq %%r8,%1 \n\t" \
        :"+r"(c0),"+r"(c1):"r"(b0), "r"(rdx):"%r8","%r9","%r10","%rdx"\

#define MULADD_MULX_ADD_CARRY(c0, c1)\
    "mov  $0, %%r10\n\t"\
    "movq %1, %%r8\n\t"\
    "adox %%r10, %0\n\t"\
    "adcx %%r10, %1\n\t"\
    :"+r"(c0),"+r"(c1)::"%r8","%r9","%r10","%rdx");

#define MULADD_SET_A(a0)\
    __asm__ volatile("add $0, %%r8\n\t" \
        "movq %0,%%rdx\n\t" \
        ::"r"(a0):"%r8","%r9","%r10","%rdx");

#define MULADD_BODY(a,b,c)\
    { word64 rdx = a->dp[ix]; \
      cp = &(c->dp[iz]); \
      c0 = cp[0]; c1 = cp[1]; \
      MULADD_SET_A(rdx); \
      MULADD_MULX(b0, c0, c1, rdx);\
      cp[0] = c0; c0 = cp[2]; \
      MULADD_MULX(b1, c1, c0, rdx);\
      cp[1] = c1; c1 = cp[3]; \
      MULADD_MULX(b2, c0, c1, rdx);\
      cp[2] = c0; c0 = cp[4]; \
      MULADD_MULX(b3, c1, c0, rdx);\
      cp[3] = c1; c1 = cp[5]; \
      MULADD_MULX_ADD_CARRY(c0, c1);\
      cp[4] = c0; cp[5] = c1; \

#define TFM_INTEL_MUL_COMBA(a, b, c)\
    for (ix = 0; ix < pa; ix++) c->dp[ix] = 0; \
    for (iy = 0; (iy < b->used); iy += 4) { \
        bp = &(b->dp[iy+0]); \
        fp_digit b0 = bp[0], b1 = bp[1], \
                 b2 = bp[2], b3 = bp[3]; \
        while (ix < a->used) { \
            MULADD_BODY(a, b, c); \
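/* Sketch of the strip pattern above: each outer iy step pins four digits of b
 * (b0..b3) in registers and sweeps across a with MULX, updating a six-digit
 * window c->dp[iz..iz+5]; MULADD_MULX_ADD_CARRY then flushes the two pending
 * CF/OF carry chains into the last two digits of the window. */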
#elif defined(TFM_SSE2)
/* use SSE2 optimizations */

/* anything you need at the start */

/* clear the chaining variables */
#define COMBA_CLEAR \

/* forward the carry to the next digit */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum */
#define COMBA_STORE(x) \

/* store the second sum [carry] */
#define COMBA_STORE2(x) \

/* anything you need at the end */
#define COMBA_FINI \

/* this should multiply i and j */
#define MULADD(i, j) \
    "movd %6,%%mm0      \n\t" \
    "movd %7,%%mm1      \n\t" \
    "pmuludq %%mm1,%%mm0\n\t" \
    "movd %%mm0,%%eax   \n\t" \
    "psrlq $32,%%mm0    \n\t" \
    "addl %%eax,%0      \n\t" \
    "movd %%mm0,%%eax   \n\t" \
    "adcl %%eax,%1      \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","cc");

#elif defined(TFM_ARM)

#define COMBA_CLEAR \

#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_STORE(x) \

#define COMBA_STORE2(x) \

#define MULADD(i, j) \
    " UMULL r0,r1,%6,%7 \n\t" \
    " ADDS  %0,%0,r0    \n\t" \
    " ADCS  %1,%1,r1    \n\t" \
    " ADC   %2,%2,#0    \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");

#elif defined(TFM_PPC32)
/* For 32-bit PPC */

#define COMBA_CLEAR \

#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_STORE(x) \

#define COMBA_STORE2(x) \

/* untested: will mulhwu change the flags? Docs say no */
#define MULADD(i, j) \
    " mullw  16,%6,%7 \n\t" \
    " addc   %0,%0,16 \n\t" \
    " mulhwu 16,%6,%7 \n\t" \
    " adde   %1,%1,16 \n\t" \
    " addze  %2,%2    \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
#elif defined(TFM_PPC64)
/* For 64-bit PPC */

#define COMBA_CLEAR \

#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_STORE(x) \

#define COMBA_STORE2(x) \

/* untested: will mulhdu change the flags? Docs say no */
#define MULADD(i, j) \
    " mulld  16,%6,%7 \n\t" \
    " addc   %0,%0,16 \n\t" \
    " mulhdu 16,%6,%7 \n\t" \
    " adde   %1,%1,16 \n\t" \
    " addze  %2,%2    \n\t" \
    :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
#elif defined(TFM_AVR32)

#define COMBA_CLEAR \

#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_STORE(x) \

#define COMBA_STORE2(x) \

#define MULADD(i, j) \
    " mulu.d r2,%6,%7 \n\t"\
    " adc    %1,%1,r3 \n\t"\
    :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3");

#define COMBA_CLEAR \

#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_STORE(x) \

#define COMBA_STORE2(x) \

#define MULADD(i, j) \
   t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = (fp_digit)t; \
   t = (fp_word)c1 + (t >> DIGIT_BIT); \
   c1 = (fp_digit)t; c2 += (fp_digit)(t >> DIGIT_BIT); \
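/* A standalone toy sketch (not compiled into the build) of a 2x2 comba
 * multiply driven by the portable MULADD step above, with hypothetical 8-bit
 * digits; MULADD_DEMO and all other names are local to this sketch. */
#if 0
#include <stdint.h>
#include <stdio.h>
int main(void)
{
    typedef uint8_t  demo_digit;   /* stand-in for fp_digit, DIGIT_BIT = 8 */
    typedef uint16_t demo_word;    /* stand-in for fp_word */
    demo_digit a[2] = { 0x34, 0x12 };  /* 0x1234, least significant first */
    demo_digit b[2] = { 0x78, 0x56 };  /* 0x5678 */
    demo_digit r[4], c0 = 0, c1 = 0, c2 = 0;
    demo_word  t;
#define MULADD_DEMO(i, j) do { \
    t = (demo_word)c0 + (demo_word)(i) * (demo_word)(j); c0 = (demo_digit)t; \
    t = (demo_word)c1 + (t >> 8); \
    c1 = (demo_digit)t; c2 += (demo_digit)(t >> 8); } while (0)
    MULADD_DEMO(a[0], b[0]);                          /* column 0 */
    r[0] = c0; c0 = c1; c1 = c2; c2 = 0;              /* COMBA_FORWARD */
    MULADD_DEMO(a[0], b[1]); MULADD_DEMO(a[1], b[0]); /* column 1 */
    r[1] = c0; c0 = c1; c1 = c2; c2 = 0;
    MULADD_DEMO(a[1], b[1]);                          /* column 2 */
    r[2] = c0; r[3] = c1;
    printf("0x%02X%02X%02X%02X\n", r[3], r[2], r[1], r[0]); /* 0x06260060 */
    return 0;
}
#endif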
#ifdef TFM_SMALL_SET
    #include "fp_mul_comba_small_set.i"

#if defined(TFM_MUL3)
    #include "fp_mul_comba_3.i"

#if defined(TFM_MUL4)
    #include "fp_mul_comba_4.i"

#if defined(TFM_MUL6)
    #include "fp_mul_comba_6.i"

#if defined(TFM_MUL7)
    #include "fp_mul_comba_7.i"

#if defined(TFM_MUL8)
    #include "fp_mul_comba_8.i"

#if defined(TFM_MUL9)
    #include "fp_mul_comba_9.i"

#if defined(TFM_MUL12)
    #include "fp_mul_comba_12.i"

#if defined(TFM_MUL17)
    #include "fp_mul_comba_17.i"

#if defined(TFM_MUL20)
    #include "fp_mul_comba_20.i"

#if defined(TFM_MUL24)
    #include "fp_mul_comba_24.i"

#if defined(TFM_MUL28)
    #include "fp_mul_comba_28.i"

#if defined(TFM_MUL32)
    #include "fp_mul_comba_32.i"

#if defined(TFM_MUL48)
    #include "fp_mul_comba_48.i"

#if defined(TFM_MUL64)
    #include "fp_mul_comba_64.i"

/* end fp_mul_comba.c asm */