/*
 * Copyright (C) 2006-2012 Sawtooth Consulting Ltd.
 *
 * This file is part of CyaSSL.
 *
 * CyaSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * CyaSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 */

/*
 * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca,
 * http://math.libtomcrypt.com
 */
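/* This file supplies the per-architecture inline assembly macro sets that
 * the Montgomery reduction and comba multiply/square routines expand; each
 * TFM_* define selects one implementation, and every section ends with an
 * ISO C fallback. */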
/******************************************************************/
/* fp_montgomery_reduce.c asm or generic */
#if defined(TFM_X86) && !defined(TFM_SSE2)
45 "movl %5,%%eax \n\t" \
47 "addl %1,%%eax \n\t" \
48 "adcl $0,%%edx \n\t" \
49 "addl %%eax,%0 \n\t" \
50 "adcl $0,%%edx \n\t" \
51 "movl %%edx,%1 \n\t" \
52 :"=g"(_c[LO]), "=r"(cy) \
53 :"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \
54 : "%eax", "%edx", "%cc")
60 "movzbl %%al,%1 \n\t" \
61 :"=g"(_c[LO]), "=r"(cy) \
62 :"0"(_c[LO]), "1"(cy) \
/******************************************************************/
#elif defined(TFM_X86_64)
77 "movq %5,%%rax \n\t" \
79 "addq %1,%%rax \n\t" \
80 "adcq $0,%%rdx \n\t" \
81 "addq %%rax,%0 \n\t" \
82 "adcq $0,%%rdx \n\t" \
83 "movq %%rdx,%1 \n\t" \
84 :"=g"(_c[LO]), "=r"(cy) \
85 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
86 : "%rax", "%rdx", "%cc")
90 "movq 0(%5),%%rax \n\t" \
91 "movq 0(%2),%%r10 \n\t" \
92 "movq 0x8(%5),%%r11 \n\t" \
94 "addq %%r10,%%rax \n\t" \
95 "adcq $0,%%rdx \n\t" \
96 "movq 0x8(%2),%%r10 \n\t" \
97 "addq %3,%%rax \n\t" \
98 "adcq $0,%%rdx \n\t" \
99 "movq %%rax,0(%0) \n\t" \
100 "movq %%rdx,%1 \n\t" \
102 "movq %%r11,%%rax \n\t" \
103 "movq 0x10(%5),%%r11 \n\t" \
105 "addq %%r10,%%rax \n\t" \
106 "adcq $0,%%rdx \n\t" \
107 "movq 0x10(%2),%%r10 \n\t" \
108 "addq %3,%%rax \n\t" \
109 "adcq $0,%%rdx \n\t" \
110 "movq %%rax,0x8(%0) \n\t" \
111 "movq %%rdx,%1 \n\t" \
113 "movq %%r11,%%rax \n\t" \
114 "movq 0x18(%5),%%r11 \n\t" \
116 "addq %%r10,%%rax \n\t" \
117 "adcq $0,%%rdx \n\t" \
118 "movq 0x18(%2),%%r10 \n\t" \
119 "addq %3,%%rax \n\t" \
120 "adcq $0,%%rdx \n\t" \
121 "movq %%rax,0x10(%0) \n\t" \
122 "movq %%rdx,%1 \n\t" \
124 "movq %%r11,%%rax \n\t" \
125 "movq 0x20(%5),%%r11 \n\t" \
127 "addq %%r10,%%rax \n\t" \
128 "adcq $0,%%rdx \n\t" \
129 "movq 0x20(%2),%%r10 \n\t" \
130 "addq %3,%%rax \n\t" \
131 "adcq $0,%%rdx \n\t" \
132 "movq %%rax,0x18(%0) \n\t" \
133 "movq %%rdx,%1 \n\t" \
135 "movq %%r11,%%rax \n\t" \
136 "movq 0x28(%5),%%r11 \n\t" \
138 "addq %%r10,%%rax \n\t" \
139 "adcq $0,%%rdx \n\t" \
140 "movq 0x28(%2),%%r10 \n\t" \
141 "addq %3,%%rax \n\t" \
142 "adcq $0,%%rdx \n\t" \
143 "movq %%rax,0x20(%0) \n\t" \
144 "movq %%rdx,%1 \n\t" \
146 "movq %%r11,%%rax \n\t" \
147 "movq 0x30(%5),%%r11 \n\t" \
149 "addq %%r10,%%rax \n\t" \
150 "adcq $0,%%rdx \n\t" \
151 "movq 0x30(%2),%%r10 \n\t" \
152 "addq %3,%%rax \n\t" \
153 "adcq $0,%%rdx \n\t" \
154 "movq %%rax,0x28(%0) \n\t" \
155 "movq %%rdx,%1 \n\t" \
157 "movq %%r11,%%rax \n\t" \
158 "movq 0x38(%5),%%r11 \n\t" \
160 "addq %%r10,%%rax \n\t" \
161 "adcq $0,%%rdx \n\t" \
162 "movq 0x38(%2),%%r10 \n\t" \
163 "addq %3,%%rax \n\t" \
164 "adcq $0,%%rdx \n\t" \
165 "movq %%rax,0x30(%0) \n\t" \
166 "movq %%rdx,%1 \n\t" \
168 "movq %%r11,%%rax \n\t" \
170 "addq %%r10,%%rax \n\t" \
171 "adcq $0,%%rdx \n\t" \
172 "addq %3,%%rax \n\t" \
173 "adcq $0,%%rdx \n\t" \
174 "movq %%rax,0x38(%0) \n\t" \
175 "movq %%rdx,%1 \n\t" \
177 :"=r"(_c), "=r"(cy) \
178 : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
179 : "%rax", "%rdx", "%r10", "%r11", "%cc")
186 "movzbq %%al,%1 \n\t" \
187 :"=g"(_c[LO]), "=r"(cy) \
188 :"0"(_c[LO]), "1"(cy) \
/******************************************************************/
#elif defined(TFM_SSE2)
/* SSE2 code (assumes 32-bit fp_digits) */
/* MMX register assignments:
 * mm0  *tmpm++, then mu * (*tmpm++)
 * mm1  c[x], then mu
 * mm2  mp
 * mm3  cy
 * mm4  _c[LO]
 */
#define MONT_START \
   asm("movd %0,%%mm2"::"g"(mp))
210 "movd %0,%%mm1 \n\t" \
211 "pxor %%mm3,%%mm3 \n\t" \
212 "pmuludq %%mm2,%%mm1 \n\t" \
/* pmuludq on mmx registers does a 32x32->64 multiply. */
218 "movd %1,%%mm4 \n\t" \
219 "movd %2,%%mm0 \n\t" \
220 "paddq %%mm4,%%mm3 \n\t" \
221 "pmuludq %%mm1,%%mm0 \n\t" \
222 "paddq %%mm0,%%mm3 \n\t" \
223 "movd %%mm3,%0 \n\t" \
224 "psrlq $32, %%mm3 \n\t" \
225 :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );
229 "movd 0(%1),%%mm4 \n\t" \
230 "movd 0(%2),%%mm0 \n\t" \
231 "paddq %%mm4,%%mm3 \n\t" \
232 "pmuludq %%mm1,%%mm0 \n\t" \
233 "movd 4(%2),%%mm5 \n\t" \
234 "paddq %%mm0,%%mm3 \n\t" \
235 "movd 4(%1),%%mm6 \n\t" \
236 "movd %%mm3,0(%0) \n\t" \
237 "psrlq $32, %%mm3 \n\t" \
239 "paddq %%mm6,%%mm3 \n\t" \
240 "pmuludq %%mm1,%%mm5 \n\t" \
241 "movd 8(%2),%%mm6 \n\t" \
242 "paddq %%mm5,%%mm3 \n\t" \
243 "movd 8(%1),%%mm7 \n\t" \
244 "movd %%mm3,4(%0) \n\t" \
245 "psrlq $32, %%mm3 \n\t" \
247 "paddq %%mm7,%%mm3 \n\t" \
248 "pmuludq %%mm1,%%mm6 \n\t" \
249 "movd 12(%2),%%mm7 \n\t" \
250 "paddq %%mm6,%%mm3 \n\t" \
251 "movd 12(%1),%%mm5 \n\t" \
252 "movd %%mm3,8(%0) \n\t" \
253 "psrlq $32, %%mm3 \n\t" \
255 "paddq %%mm5,%%mm3 \n\t" \
256 "pmuludq %%mm1,%%mm7 \n\t" \
257 "movd 16(%2),%%mm5 \n\t" \
258 "paddq %%mm7,%%mm3 \n\t" \
259 "movd 16(%1),%%mm6 \n\t" \
260 "movd %%mm3,12(%0) \n\t" \
261 "psrlq $32, %%mm3 \n\t" \
263 "paddq %%mm6,%%mm3 \n\t" \
264 "pmuludq %%mm1,%%mm5 \n\t" \
265 "movd 20(%2),%%mm6 \n\t" \
266 "paddq %%mm5,%%mm3 \n\t" \
267 "movd 20(%1),%%mm7 \n\t" \
268 "movd %%mm3,16(%0) \n\t" \
269 "psrlq $32, %%mm3 \n\t" \
271 "paddq %%mm7,%%mm3 \n\t" \
272 "pmuludq %%mm1,%%mm6 \n\t" \
273 "movd 24(%2),%%mm7 \n\t" \
274 "paddq %%mm6,%%mm3 \n\t" \
275 "movd 24(%1),%%mm5 \n\t" \
276 "movd %%mm3,20(%0) \n\t" \
277 "psrlq $32, %%mm3 \n\t" \
279 "paddq %%mm5,%%mm3 \n\t" \
280 "pmuludq %%mm1,%%mm7 \n\t" \
281 "movd 28(%2),%%mm5 \n\t" \
282 "paddq %%mm7,%%mm3 \n\t" \
283 "movd 28(%1),%%mm6 \n\t" \
284 "movd %%mm3,24(%0) \n\t" \
285 "psrlq $32, %%mm3 \n\t" \
287 "paddq %%mm6,%%mm3 \n\t" \
288 "pmuludq %%mm1,%%mm5 \n\t" \
289 "paddq %%mm5,%%mm3 \n\t" \
290 "movd %%mm3,28(%0) \n\t" \
291 "psrlq $32, %%mm3 \n\t" \
292 :"=r"(_c) : "0"(_c), "r"(tmpm) );
/* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack
   pointer */

#define LOOP_END \
asm( "movd %%mm3,%0  \n" :"=r"(cy))
304 "movzbl %%al,%1 \n\t" \
305 :"=g"(_c[LO]), "=r"(cy) \
306 :"0"(_c[LO]), "1"(cy) \
/******************************************************************/
#elif defined(TFM_ARM)
322 " ADDS r0,r0,%0 \n\t" \
323 " MOVCS %0,#1 \n\t" \
324 " MOVCC %0,#0 \n\t" \
325 " UMLAL r0,%0,%3,%4 \n\t" \
327 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","%cc");
332 " ADDS r0,r0,%0 \n\t" \
334 " MOVCS %0,#1 \n\t" \
335 " MOVCC %0,#0 \n\t" \
336 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","%cc");
#elif defined(TFM_PPC32)
349 " mullw 16,%3,%4 \n\t" \
350 " mulhwu 17,%3,%4 \n\t" \
351 " addc 16,16,%0 \n\t" \
352 " addze 17,17 \n\t" \
354 " addc 16,16,18 \n\t" \
355 " addze %0,17 \n\t" \
357 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","%cc"); ++tmpm;
362 " addc 16,16,%0 \n\t" \
364 " xor %0,%0,%0 \n\t" \
365 " addze %0,%0 \n\t" \
366 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","%cc");
#elif defined(TFM_PPC64)
379 " mulld 16,%3,%4 \n\t" \
380 " mulhdu 17,%3,%4 \n\t" \
381 " addc 16,16,%0 \n\t" \
382 " addze 17,17 \n\t" \
383 " ldx 18,0,%1 \n\t" \
384 " addc 16,16,18 \n\t" \
385 " addze %0,17 \n\t" \
386 " sdx 16,0,%1 \n\t" \
387 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","%cc"); ++tmpm;
391 " ldx 16,0,%1 \n\t" \
392 " addc 16,16,%0 \n\t" \
393 " sdx 16,0,%1 \n\t" \
394 " xor %0,%0,%0 \n\t" \
395 " addze %0,%0 \n\t" \
396 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","%cc");
/******************************************************************/
#elif defined(TFM_AVR32)

#define INNERMUL                     \
asm(                                 \
   " ld.w     r2,%1           \n\t"  \
   " add      r2,%0           \n\t"  \
   " eor      r3,r3           \n\t"  \
   " acr      r3              \n\t"  \
   " macu.d   r2,%3,%4        \n\t"  \
   " st.w     %1,r2           \n\t"  \
   " mov      %0,r3           \n\t"  \
:"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");

#define PROPCARRY                    \
asm(                                 \
   " ld.w     r2,%1           \n\t"  \
   " add      r2,%0           \n\t"  \
   " st.w     %1,r2           \n\t"  \
   " eor      %0,%0           \n\t"  \
   " acr      %0              \n\t"  \
:"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","%cc");
#else

/* ISO C code */

#define INNERMUL                                      \
   do { fp_word t;                                    \
   _c[0] = t = ((fp_word)_c[0] + (fp_word)cy) +       \
               (((fp_word)mu) * ((fp_word)*tmpm++));  \
   cy = (t >> DIGIT_BIT);                             \
   } while (0)

#define PROPCARRY \
   do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)
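/* Illustrative sketch (not verbatim) of how the Montgomery reduction loop
 * expands these macros; names follow fp_montgomery_reduce:
 *
 *   MONT_START;
 *   for (x = 0; x < pa; x++) {
 *       fp_digit cy = 0;
 *       LOOP_START;                    // mu = c[x] * mp
 *       _c   = c + x;
 *       tmpm = m->dp;
 *       for (y = 0; y < m->used; y++) {
 *           INNERMUL;                  // _c[0] += mu * *tmpm++, carry in cy
 *           ++_c;
 *       }
 *       LOOP_END;
 *       while (cy) {
 *           PROPCARRY;                 // ripple cy out through the digits
 *           ++_c;
 *       }
 *   }
 *   MONT_FINI;
 */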
/******************************************************************/
#endif
/* end fp_montgomery_reduce.c asm */
/* start fp_sqr_comba.c asm */
#if defined(TFM_X86)
/* x86-32 optimized */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define SQRADD(i, j)                                      \
asm(                                                      \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %%eax        \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
#define SQRADD2(i, j)                                     \
asm(                                                      \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx", "%cc");
#define SQRADDSC(i, j)                                    \
asm(                                                      \
     "movl  %3,%%eax     \n\t"                            \
     "mull  %4           \n\t"                            \
     "movl  %%eax,%0     \n\t"                            \
     "movl  %%edx,%1     \n\t"                            \
     "xorl  %2,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","%cc");
/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
#define SQRADDAC(i, j)                                    \
asm(                                                      \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
526 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "%cc");
#elif defined(TFM_X86_64)
/* x86-64 optimized */

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define SQRADD(i, j)                                      \
asm(                                                      \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %%rax        \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","%cc");
#define SQRADD2(i, j)                                     \
asm(                                                      \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#define SQRADDSC(i, j)                                    \
asm(                                                      \
     "movq  %3,%%rax     \n\t"                            \
     "mulq  %4           \n\t"                            \
     "movq  %%rax,%0     \n\t"                            \
     "movq  %%rdx,%1     \n\t"                            \
     "xorq  %2,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","%cc");
/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
#define SQRADDAC(i, j)                                    \
asm(                                                      \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
596 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "%cc");
#elif defined(TFM_SSE2)

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
#define SQRADD(i, j)                                      \
asm(                                                      \
     "movd  %6,%%mm0     \n\t"                            \
     "pmuludq %%mm0,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "adcl  %%eax,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%cc");
#define SQRADD2(i, j)                                     \
asm(                                                      \
     "movd  %6,%%mm0     \n\t"                            \
     "movd  %7,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "movd  %%mm0,%%edx  \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#define SQRADDSC(i, j)                                    \
asm(                                                      \
     "movd  %3,%%mm0     \n\t"                            \
     "movd  %4,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
     "movd  %%mm0,%0     \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "movd  %%mm0,%1     \n\t"                            \
     "xorl  %2,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j));
/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
#define SQRADDAC(i, j)                                    \
asm(                                                      \
     "movd  %6,%%mm0     \n\t"                            \
     "movd  %7,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "movd  %%mm0,%%edx  \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","%cc");
680 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "%cc");
#elif defined(TFM_ARM)

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j)                                            \
asm(                                                            \
    " UMULL  r0,r1,%6,%6              \n\t"                     \
    " ADDS   %0,%0,r0                 \n\t"                     \
    " ADCS   %1,%1,r1                 \n\t"                     \
    " ADC    %2,%2,#0                 \n\t"                     \
    :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "%cc");
/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j)                                           \
asm(                                                            \
    " UMULL  r0,r1,%6,%7              \n\t"                     \
    " ADDS   %0,%0,r0                 \n\t"                     \
    " ADCS   %1,%1,r1                 \n\t"                     \
    " ADC    %2,%2,#0                 \n\t"                     \
    " ADDS   %0,%0,r0                 \n\t"                     \
    " ADCS   %1,%1,r1                 \n\t"                     \
    " ADC    %2,%2,#0                 \n\t"                     \
    :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");
#define SQRADDSC(i, j)                                          \
asm(                                                            \
    " UMULL  %0,%1,%6,%7              \n\t"                     \
    " SUB    %2,%2,%2                 \n\t"                     \
    :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "%cc");
#define SQRADDAC(i, j)                                          \
asm(                                                            \
    " UMULL  r0,r1,%6,%7              \n\t"                     \
    " ADDS   %0,%0,r0                 \n\t"                     \
    " ADCS   %1,%1,r1                 \n\t"                     \
    " ADC    %2,%2,#0                 \n\t"                     \
    :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "%cc");
739 " ADDS %0,%0,%3 \n\t" \
740 " ADCS %1,%1,%4 \n\t" \
741 " ADC %2,%2,%5 \n\t" \
742 " ADDS %0,%0,%3 \n\t" \
743 " ADCS %1,%1,%4 \n\t" \
744 " ADC %2,%2,%5 \n\t" \
745 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
#elif defined(TFM_PPC32)

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j)                                            \
asm(                                                            \
   " mullw    16,%6,%6       \n\t"                              \
   " addc     %0,%0,16       \n\t"                              \
   " mulhwu   16,%6,%6       \n\t"                              \
   " adde     %1,%1,16       \n\t"                              \
   " addze    %2,%2          \n\t"                              \
   :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","%cc");
/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j)                                           \
asm(                                                            \
   " mullw    16,%6,%7       \n\t"                              \
   " mulhwu   17,%6,%7       \n\t"                              \
   " addc     %0,%0,16       \n\t"                              \
   " adde     %1,%1,17       \n\t"                              \
   " addze    %2,%2          \n\t"                              \
   " addc     %0,%0,16       \n\t"                              \
   " adde     %1,%1,17       \n\t"                              \
   " addze    %2,%2          \n\t"                              \
   :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","%cc");
#define SQRADDSC(i, j)                                          \
asm(                                                            \
   " mullw    %0,%6,%7       \n\t"                              \
   " mulhwu   %1,%6,%7       \n\t"                              \
   " xor      %2,%2,%2       \n\t"                              \
   :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "%cc");
#define SQRADDAC(i, j)                                          \
asm(                                                            \
   " mullw    16,%6,%7       \n\t"                              \
   " addc     %0,%0,16       \n\t"                              \
   " mulhwu   16,%6,%7       \n\t"                              \
   " adde     %1,%1,16       \n\t"                              \
   " addze    %2,%2          \n\t"                              \
   :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "%cc");
808 " addc %0,%0,%3 \n\t" \
809 " adde %1,%1,%4 \n\t" \
810 " adde %2,%2,%5 \n\t" \
811 " addc %0,%0,%3 \n\t" \
812 " adde %1,%1,%4 \n\t" \
813 " adde %2,%2,%5 \n\t" \
814 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
#elif defined(TFM_PPC64)

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j)                                            \
asm(                                                            \
   " mulld    16,%6,%6       \n\t"                              \
   " addc     %0,%0,16       \n\t"                              \
   " mulhdu   16,%6,%6       \n\t"                              \
   " adde     %1,%1,16       \n\t"                              \
   " addze    %2,%2          \n\t"                              \
   :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","%cc");
/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j)                                           \
asm(                                                            \
   " mulld    16,%6,%7       \n\t"                              \
   " mulhdu   17,%6,%7       \n\t"                              \
   " addc     %0,%0,16       \n\t"                              \
   " adde     %1,%1,17       \n\t"                              \
   " addze    %2,%2          \n\t"                              \
   " addc     %0,%0,16       \n\t"                              \
   " adde     %1,%1,17       \n\t"                              \
   " addze    %2,%2          \n\t"                              \
   :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","%cc");
#define SQRADDSC(i, j)                                          \
asm(                                                            \
   " mulld    %0,%6,%7       \n\t"                              \
   " mulhdu   %1,%6,%7       \n\t"                              \
   " xor      %2,%2,%2       \n\t"                              \
   :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "%cc");
#define SQRADDAC(i, j)                                          \
asm(                                                            \
   " mulld    16,%6,%7       \n\t"                              \
   " addc     %0,%0,16       \n\t"                              \
   " mulhdu   16,%6,%7       \n\t"                              \
   " adde     %1,%1,16       \n\t"                              \
   " addze    %2,%2          \n\t"                              \
   :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "%cc");
876 " addc %0,%0,%3 \n\t" \
877 " adde %1,%1,%4 \n\t" \
878 " adde %2,%2,%5 \n\t" \
879 " addc %0,%0,%3 \n\t" \
880 " adde %1,%1,%4 \n\t" \
881 " adde %2,%2,%5 \n\t" \
882 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
#elif defined(TFM_AVR32)

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j)                                            \
asm(                                                            \
   " mulu.d   r2,%6,%6       \n\t"                              \
   " add      %0,%0,r2       \n\t"                              \
   " adc      %1,%1,r3       \n\t"                              \
   " acr      %2             \n\t"                              \
   :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3");
/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j)                                           \
asm(                                                            \
   " mulu.d   r2,%6,%7       \n\t"                              \
   " add      %0,%0,r2       \n\t"                              \
   " adc      %1,%1,r3       \n\t"                              \
   " acr      %2             \n\t"                              \
   " add      %0,%0,r2       \n\t"                              \
   " adc      %1,%1,r3       \n\t"                              \
   " acr      %2             \n\t"                              \
   :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3");
#define SQRADDSC(i, j)                                          \
asm(                                                            \
   " mulu.d   r2,%6,%7       \n\t"                              \
   " mov      %0,r2          \n\t"                              \
   " mov      %1,r3          \n\t"                              \
   " eor      %2,%2          \n\t"                              \
   :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3");
#define SQRADDAC(i, j)                                          \
asm(                                                            \
   " mulu.d   r2,%6,%7       \n\t"                              \
   " add      %0,%0,r2       \n\t"                              \
   " adc      %1,%1,r3       \n\t"                              \
   " acr      %2             \n\t"                              \
   :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3");
944 " add %0,%0,%3 \n\t" \
945 " adc %1,%1,%4 \n\t" \
946 " adc %2,%2,%5 \n\t" \
947 " add %0,%0,%3 \n\t" \
948 " adc %1,%1,%4 \n\t" \
949 " adc %2,%2,%5 \n\t" \
950 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
#else

/* ISO C portable code */

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j)                                             \
   do { fp_word t;                                               \
   t = c0 + ((fp_word)i) * ((fp_word)j);  c0 = t;                \
   t = c1 + (t >> DIGIT_BIT);  c1 = t;  c2 += t >> DIGIT_BIT;    \
   } while (0);
/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j)                                                    \
   do { fp_word t, tt;                                                   \
   t  = ((fp_word)i) * ((fp_word)j);                                     \
   tt = (fp_word)c0 + t;                 c0 = tt;                        \
   tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \
   tt = (fp_word)c0 + t;                 c0 = tt;                        \
   tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = tt; c2 += tt >> DIGIT_BIT; \
   } while (0);
#define SQRADDSC(i, j)                                           \
   do { fp_word t;                                               \
   t = ((fp_word)i) * ((fp_word)j);                              \
   sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0;           \
   } while (0);
#define SQRADDAC(i, j)                                           \
   do { fp_word t;                                               \
   t = sc0 + ((fp_word)i) * ((fp_word)j);  sc0 = t;              \
   t = sc1 + (t >> DIGIT_BIT);  sc1 = t;  sc2 += t >> DIGIT_BIT; \
   } while (0);
#define SQRADDDB                                                         \
   do { fp_word t;                                                       \
   t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = t;                     \
   t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); c1 = t;  \
   c2 = c2 + ((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT);         \
   } while (0);

#endif
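/* Illustrative sketch (not verbatim) of how the generated fp_sqr_comba_N
 * files use these macros, here for a 3-digit a[] squared into b[]:
 *
 *   COMBA_START; CLEAR_CARRY;
 *   SQRADD(a[0], a[0]);                          COMBA_STORE(b[0]);
 *   CARRY_FORWARD; SQRADD2(a[0], a[1]);          COMBA_STORE(b[1]);
 *   CARRY_FORWARD; SQRADD2(a[0], a[2]);
 *                  SQRADD(a[1], a[1]);           COMBA_STORE(b[2]);
 *   CARRY_FORWARD; SQRADD2(a[1], a[2]);          COMBA_STORE(b[3]);
 *   CARRY_FORWARD; SQRADD(a[2], a[2]);           COMBA_STORE(b[4]);
 *   COMBA_STORE2(b[5]); COMBA_FINI;
 */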
#ifdef TFM_SMALL_SET
#include "fp_sqr_comba_small_set.i"
#include "fp_sqr_comba_3.i"
#include "fp_sqr_comba_4.i"
#include "fp_sqr_comba_6.i"
#include "fp_sqr_comba_7.i"
#include "fp_sqr_comba_8.i"
#include "fp_sqr_comba_9.i"
#include "fp_sqr_comba_12.i"
#include "fp_sqr_comba_17.i"
#include "fp_sqr_comba_20.i"
#include "fp_sqr_comba_24.i"
#include "fp_sqr_comba_28.i"
#include "fp_sqr_comba_32.i"
#include "fp_sqr_comba_48.i"
#include "fp_sqr_comba_64.i"
#endif

/* end fp_sqr_comba.c asm */
/* start fp_mul_comba.c asm */
/* these are the combas. Worship them. */
#if defined(TFM_X86)
/* Generic x86 optimized code */

/* anything you need at the start */
#define COMBA_START

/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;

/* anything you need at the end */
#define COMBA_FINI
/* this should multiply i and j */
#define MULADD(i, j)                                      \
asm(                                                      \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
#elif defined(TFM_X86_64)
/* x86-64 optimized */

/* anything you need at the start */
#define COMBA_START

/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;

/* anything you need at the end */
#define COMBA_FINI
/* this should multiply i and j */
#define MULADD(i, j)                                      \
asm(                                                      \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","%cc");
#elif defined(TFM_SSE2)
/* use SSE2 optimizations */

/* anything you need at the start */
#define COMBA_START

/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;

/* anything you need at the end */
#define COMBA_FINI \
   asm("emms");
/* this should multiply i and j */
#define MULADD(i, j)                                      \
asm(                                                      \
     "movd  %6,%%mm0     \n\t"                            \
     "movd  %7,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "adcl  %%eax,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%cc");
#elif defined(TFM_ARM)

#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;
#define MULADD(i, j)                                            \
asm(                                                            \
    " UMULL  r0,r1,%6,%7              \n\t"                     \
    " ADDS   %0,%0,r0                 \n\t"                     \
    " ADCS   %1,%1,r1                 \n\t"                     \
    " ADC    %2,%2,#0                 \n\t"                     \
    :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");
#elif defined(TFM_PPC32)
/* For 32-bit PPC */

#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;
/* untested: will mulhwu change the flags?  Docs say no */
#define MULADD(i, j)                                            \
asm(                                                            \
   " mullw    16,%6,%7       \n\t"                              \
   " addc     %0,%0,16       \n\t"                              \
   " mulhwu   16,%6,%7       \n\t"                              \
   " adde     %1,%1,16       \n\t"                              \
   " addze    %2,%2          \n\t"                              \
   :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
#elif defined(TFM_PPC64)
/* For 64-bit PPC */

#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;
/* untested: will mulhdu change the flags?  Docs say no */
#define MULADD(i, j)                                            \
asm(                                                            \
   " mulld    16,%6,%7       \n\t"                              \
   " addc     %0,%0,16       \n\t"                              \
   " mulhdu   16,%6,%7       \n\t"                              \
   " adde     %1,%1,16       \n\t"                              \
   " addze    %2,%2          \n\t"                              \
   :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
#elif defined(TFM_AVR32)

#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;
#define MULADD(i, j)                                            \
asm(                                                            \
   " mulu.d   r2,%6,%7       \n\t"                              \
   " add      %0,%0,r2       \n\t"                              \
   " adc      %1,%1,r3       \n\t"                              \
   " acr      %2             \n\t"                              \
   :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3");
#else

/* ISO C code */

#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define MULADD(i, j)                                                   \
   do { fp_word t;                                                     \
   t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j);  c0 = t;             \
   t = (fp_word)c1 + (t >> DIGIT_BIT);  c1 = t;  c2 += t >> DIGIT_BIT; \
   } while (0);

#endif
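/* Illustrative sketch (not verbatim) of the column pattern the generated
 * fp_mul_comba_N files unroll, one output digit per column:
 *
 *   COMBA_START; COMBA_CLEAR;
 *   for (k = 0; k < N + N - 1; k++) {       // unrolled in the .i files
 *       if (k) COMBA_FORWARD;               // move to the next column
 *       for (each pair i + j == k)
 *           MULADD(a->dp[i], b->dp[j]);     // accumulate column k
 *       COMBA_STORE(c->dp[k]);
 *   }
 *   COMBA_STORE2(c->dp[N + N - 1]); COMBA_FINI;
 */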
#ifdef TFM_SMALL_SET
#include "fp_mul_comba_small_set.i"
#include "fp_mul_comba_3.i"
#include "fp_mul_comba_4.i"
#include "fp_mul_comba_6.i"
#include "fp_mul_comba_7.i"
#include "fp_mul_comba_8.i"
#include "fp_mul_comba_9.i"
#include "fp_mul_comba_12.i"
#include "fp_mul_comba_17.i"
#include "fp_mul_comba_20.i"
#include "fp_mul_comba_24.i"
#include "fp_mul_comba_28.i"
#include "fp_mul_comba_32.i"
#include "fp_mul_comba_48.i"
#include "fp_mul_comba_64.i"
#endif

/* end fp_mul_comba.c asm */