std.internal.math.biguintx86 source code

1 /** Optimised asm arbitrary precision arithmetic ('bignum')
2  * routines for X86 processors.
3  *
4  * All functions operate on arrays of uints, stored LSB first.
5  * If there is a destination array, it will be the first parameter.
6  * Currently, all of these functions are subject to change, and are
7  * intended for internal use only.
8  * The symbol [#] indicates an array of machine words which is to be
9  * interpreted as a multi-byte number.
10  */
11 
12 /*          Copyright Don Clugston 2008 - 2010.
13  * Distributed under the Boost Software License, Version 1.0.
14  *    (See accompanying file LICENSE_1_0.txt or copy at
15  *          http://www.boost.org/LICENSE_1_0.txt)
16  */
17 /**
18  * In simple terms, there are 3 modern x86 microarchitectures:
19  * (a) the P6 family (Pentium Pro, PII, PIII, PM, Core), produced by Intel;
20  * (b) the K6, Athlon, and AMD64 families, produced by AMD; and
21  * (c) the Pentium 4, produced by Marketing.
22  *
23  * This code has been optimised for the Intel P6 family.
24  * Generally the code remains near-optimal for Intel Core2/Corei7, after
25  * translating EAX-> RAX, etc, since all these CPUs use essentially the same
26  * pipeline, and are typically limited by memory access.
27  * The code uses techniques described in Agner Fog's superb Pentium manuals
28  * available at www.agner.org.
29  * Not optimised for AMD, which can do two memory loads per cycle (Intel
30  * CPUs can only do one). Despite this, performance is superior on AMD.
31  * Performance is dreadful on P4.
32  *
33  *  Timing results (cycles per int)
34  *              --Intel Pentium--  --AMD--
35  *              PM     P4   Core2   K7
36  *  +,-         2.25  15.6   2.25   1.5
37  *  <<,>>       2.0    6.6   2.0    5.0
38  *    (<< MMX)  1.7    5.3   1.5    1.2
39  *  *           5.0   15.0   4.0    4.3
40  *  mulAdd      5.7   19.0   4.9    4.0
41  *  div        30.0   32.0  32.0   22.4
42  *  mulAcc(32)  6.5   20.0   5.4    4.9
43  *
44  * mulAcc(32) is multiplyAccumulate() for a 32*32 multiply. Thus it includes
45  * function call overhead.
46  * The timing for Div is quite unpredictable, but it's probably too slow
47  * to be useful. On 64-bit processors, these times should
48  * halve if run in 64-bit mode, except for the MMX functions.
49  */
50 
51 module std.internal.math.biguintx86;
52 
53 @system:
54 pure:
55 nothrow:
56 
57 /*
58   Naked asm is used throughout, because:
59   (a) it frees up the EBP register
60   (b) compiler bugs prevent the use of .ptr when a frame pointer is used.
61 */
62 
63 version (D_InlineAsm_X86)
64 {
65 
66 private:
67 
68 /* Duplicate string s, with n times, substituting index for '@'.
69  *
70  * Each instance of '@' in s is replaced by 0,1,...n-1. This is a helper
71  * function for some of the asm routines.
72  */
73 string indexedLoopUnroll(int n, string s) pure @safe
74 {
75     string u;
76     for (int i = 0; i<n; ++i)
77     {
78         string nstr= (i>9 ? ""~ cast(char)('0'+i/10) : "") ~ cast(char)('0' + i%10);
79 
80         int last = 0;
81         for (int j = 0; j<s.length; ++j)
82         {
83             if (s[j]=='@')
84             {
85                 u ~= s[last .. j] ~ nstr;
86                 last = j+1;
87             }
88         }
89         if (last<s.length) u = u ~ s[last..$];
90 
91     }
92     return u;
93 }
94 @safe unittest
95 {
96     assert(indexedLoopUnroll(3, "@*23;")=="0*23;1*23;2*23;");
97 }
98 
99 public:
100 
101 alias BigDigit = uint; // A Bignum is an array of BigDigits. Usually the machine word size.
102 
103 // Limits for when to switch between multiplication algorithms.
104 enum : int { KARATSUBALIMIT = 18 } // Minimum value for which Karatsuba is worthwhile.
105 enum : int { KARATSUBASQUARELIMIT=26 } // Minimum value for which square Karatsuba is worthwhile
106 
107 /** Multi-byte addition or subtraction
108  *    dest[#] = src1[#] + src2[#] + carry (0 or 1).
109  * or dest[#] = src1[#] - src2[#] - carry (0 or 1).
110  * Returns carry or borrow (0 or 1).
111  * Set op == '+' for addition, '-' for subtraction.
112  */
113 uint multibyteAddSub(char op)(uint[] dest, const uint [] src1, const uint []
114         src2, uint carry) pure @safe @nogc
115 {
116     // Timing:
117     // Pentium M: 2.25/int
118     // P6 family, Core2 have a partial flags stall when reading the carry flag in
119     // an ADC, SBB operation after an operation such as INC or DEC which
120     // modifies some, but not all, flags. We avoid this by storing carry into
121     // a resister (AL), and restoring it after the branch.
122 
123     enum { LASTPARAM = 4*4 } // 3* pushes + return address.
124     asm pure nothrow @nogc @trusted {
125         naked;
126         push EDI;
127         push EBX;
128         push ESI;
129         mov ECX, [ESP + LASTPARAM + 4*4]; // dest.length;
130         mov EDX, [ESP + LASTPARAM + 3*4]; // src1.ptr
131         mov ESI, [ESP + LASTPARAM + 1*4]; // src2.ptr
132         mov EDI, [ESP + LASTPARAM + 5*4]; // dest.ptr
133              // Carry is in EAX
134         // Count UP to zero (from -len) to minimize loop overhead.
135         lea EDX, [EDX + 4*ECX]; // EDX = end of src1.
136         lea ESI, [ESI + 4*ECX]; // EBP = end of src2.
137         lea EDI, [EDI + 4*ECX]; // EDI = end of dest.
138 
139         neg ECX;
140         add ECX, 8;
141         jb L2;  // if length < 8 , bypass the unrolled loop.
142 L_unrolled:
143         shr AL, 1; // get carry from EAX
144     }
145     mixin(" asm pure nothrow @nogc @trusted {"
146         ~ indexedLoopUnroll( 8,
147         "mov EAX, [@*4-8*4+EDX+ECX*4];"
148         ~ ( op == '+' ? "adc" : "sbb" ) ~ " EAX, [@*4-8*4+ESI+ECX*4];"
149         ~ "mov [@*4-8*4+EDI+ECX*4], EAX;")
150         ~ "}");
151     asm pure nothrow @nogc @trusted {
152         setc AL; // save carry
153         add ECX, 8;
154         ja L_unrolled;
155 L2:     // Do the residual 1 .. 7 ints.
156 
157         sub ECX, 8;
158         jz done;
159 L_residual:
160         shr AL, 1; // get carry from EAX
161     }
162     mixin(" asm pure nothrow @nogc @trusted {"
163         ~ indexedLoopUnroll( 1,
164         "mov EAX, [@*4+EDX+ECX*4];"
165         ~ ( op == '+' ? "adc" : "sbb" ) ~ " EAX, [@*4+ESI+ECX*4];"
166         ~ "mov [@*4+EDI+ECX*4], EAX;") ~ "}");
167     asm pure nothrow @nogc @trusted {
168         setc AL; // save carry
169         add ECX, 1;
170         jnz L_residual;
171 done:
172         and EAX, 1; // make it O or 1.
173         pop ESI;
174         pop EBX;
175         pop EDI;
176         ret 6*4;
177     }
178 }
179 
180 @safe unittest
181 {
182     uint [] a = new uint[40];
183     uint [] b = new uint[40];
184     uint [] c = new uint[40];
185     for (int i=0; i<a.length; ++i)
186     {
187         if (i&1) a[i]=0x8000_0000 + i;
188         else a[i]=i;
189         b[i]= 0x8000_0003;
190     }
191     c[19]=0x3333_3333;
192     uint carry = multibyteAddSub!('+')(c[0 .. 18], a[0 .. 18], b[0 .. 18], 0);
193     assert(carry == 1);
194     assert(c[0]==0x8000_0003);
195     assert(c[1]==4);
196     assert(c[19]==0x3333_3333); // check for overrun
197     for (int i=0; i<a.length; ++i)
198     {
199         a[i]=b[i]=c[i]=0;
200     }
201     a[8]=0x048D159E;
202     b[8]=0x048D159E;
203     a[10]=0x1D950C84;
204     b[10]=0x1D950C84;
205     a[5] =0x44444444;
206     carry = multibyteAddSub!('-')(a[0 .. 12], a[0 .. 12], b[0 .. 12], 0);
207     assert(a[11]==0);
208     for (int i=0; i<10; ++i) if (i != 5) assert(a[i]==0);
209 
210     for (int q=3; q<36;++q)
211     {
212         for (int i=0; i<a.length; ++i)
213         {
214             a[i]=b[i]=c[i]=0;
215         }
216         a[q-2]=0x040000;
217         b[q-2]=0x040000;
218        carry = multibyteAddSub!('-')(a[0 .. q], a[0 .. q], b[0 .. q], 0);
219        assert(a[q-2]==0);
220     }
221 }
222 
223 /** dest[#] += carry, or dest[#] -= carry.
224  *  op must be '+' or '-'
225  *  Returns final carry or borrow (0 or 1)
226  */
227 uint multibyteIncrementAssign(char op)(uint[] dest, uint carry) pure @safe @nogc
228 {
229     enum { LASTPARAM = 1*4 } // 0* pushes + return address.
230     asm pure nothrow @nogc @trusted {
231         naked;
232         mov ECX, [ESP + LASTPARAM + 0*4]; // dest.length;
233         mov EDX, [ESP + LASTPARAM + 1*4]; // dest.ptr
234         // EAX  = carry
235 L1: ;
236     }
237     static if (op=='+')
238         asm pure nothrow @nogc @trusted { add [EDX], EAX; }
239     else
240         asm pure nothrow @nogc @trusted { sub [EDX], EAX; }
241     asm pure nothrow @nogc @trusted {
242         mov EAX, 1;
243         jnc L2;
244         add EDX, 4;
245         dec ECX;
246         jnz L1;
247         mov EAX, 2;
248 L2:     dec EAX;
249         ret 2*4;
250     }
251 }
252 
253 /** dest[#] = src[#] << numbits
254  *  numbits must be in the range 1 .. 31
255  *  Returns the overflow
256  */
257 uint multibyteShlNoMMX(uint [] dest, const uint [] src, uint numbits) pure @safe @nogc
258 {
259     // Timing: Optimal for P6 family.
260     // 2.0 cycles/int on PPro .. PM (limited by execution port p0)
261     // 5.0 cycles/int on Athlon, which has 7 cycles for SHLD!!
262     enum { LASTPARAM = 4*4 } // 3* pushes + return address.
263     asm pure nothrow @nogc @trusted {
264         naked;
265         push ESI;
266         push EDI;
267         push EBX;
268         mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
269         mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
270         mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
271         mov ECX, EAX; // numbits;
272 
273         mov EAX, [-4+ESI + 4*EBX];
274         mov EDX, 0;
275         shld EDX, EAX, CL;
276         push EDX; // Save return value
277         cmp EBX, 1;
278         jz L_last;
279         mov EDX, [-4+ESI + 4*EBX];
280         test EBX, 1;
281         jz L_odd;
282         sub EBX, 1;
283 L_even:
284         mov EDX, [-4+ ESI + 4*EBX];
285         shld EAX, EDX, CL;
286         mov [EDI+4*EBX], EAX;
287 L_odd:
288         mov EAX, [-8+ESI + 4*EBX];
289         shld EDX, EAX, CL;
290         mov [-4+EDI + 4*EBX], EDX;
291         sub EBX, 2;
292         jg L_even;
293 L_last:
294         shl EAX, CL;
295         mov [EDI], EAX;
296         pop EAX; // pop return value
297         pop EBX;
298         pop EDI;
299         pop ESI;
300         ret 4*4;
301     }
302 }
303 
304 /** dest[#] = src[#] >> numbits
305  *  numbits must be in the range 1 .. 31
306  * This version uses MMX.
307  */
308 uint multibyteShl(uint [] dest, const uint [] src, uint numbits) pure @safe @nogc
309 {
310     // Timing:
311     // K7 1.2/int. PM 1.7/int P4 5.3/int
312     enum { LASTPARAM = 4*4 } // 3* pushes + return address.
313     asm pure nothrow @nogc @trusted {
314         naked;
315         push ESI;
316         push EDI;
317         push EBX;
318         mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
319         mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
320         mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
321 
322         movd MM3, EAX; // numbits = bits to shift left
323         xor EAX, 63;
324         align   16;
325         inc EAX;
326         movd MM4, EAX ; // 64-numbits = bits to shift right
327 
328         // Get the return value into EAX
329         and EAX, 31; // EAX = 32-numbits
330         movd MM2, EAX; // 32-numbits
331         movd MM1, [ESI+4*EBX-4];
332         psrlq MM1, MM2;
333         movd EAX, MM1;  // EAX = return value
334         test EBX, 1;
335         jz L_even;
336 L_odd:
337         cmp EBX, 1;
338         jz L_length1;
339 
340          // deal with odd lengths
341         movq MM1, [ESI+4*EBX-8];
342         psrlq MM1, MM2;
343         movd    [EDI +4*EBX-4], MM1;
344         sub EBX, 1;
345 L_even: // It's either singly or doubly even
346         movq    MM2, [ESI + 4*EBX - 8];
347         psllq   MM2, MM3;
348         sub EBX, 2;
349         jle L_last;
350         movq MM1, MM2;
351         add EBX, 2;
352         test EBX, 2;
353         jz L_onceeven;
354         sub EBX, 2;
355 
356         // MAIN LOOP -- 128 bytes per iteration
357  L_twiceeven:      // here MM2 is the carry
358         movq    MM0, [ESI + 4*EBX-8];
359         psrlq   MM0, MM4;
360         movq    MM1, [ESI + 4*EBX-8];
361         psllq   MM1, MM3;
362         por     MM2, MM0;
363         movq    [EDI +4*EBX], MM2;
364 L_onceeven:        // here MM1 is the carry
365         movq    MM0, [ESI + 4*EBX-16];
366         psrlq   MM0, MM4;
367         movq    MM2, [ESI + 4*EBX-16];
368         por     MM1, MM0;
369         movq    [EDI +4*EBX-8], MM1;
370         psllq   MM2, MM3;
371         sub EBX, 4;
372         jg L_twiceeven;
373 L_last:
374         movq    [EDI +4*EBX], MM2;
375 L_alldone:
376         emms;  // NOTE: costs 6 cycles on Intel CPUs
377         pop EBX;
378         pop EDI;
379         pop ESI;
380         ret 4*4;
381 
382 L_length1:
383         // length 1 is a special case
384         movd MM1, [ESI];
385         psllq MM1, MM3;
386         movd [EDI], MM1;
387         jmp L_alldone;
388     }
389 }
390 
391 void multibyteShr(uint [] dest, const uint [] src, uint numbits) pure @safe @nogc
392 {
393     enum { LASTPARAM = 4*4 } // 3* pushes + return address.
394     asm pure nothrow @nogc @trusted {
395         naked;
396         push ESI;
397         push EDI;
398         push EBX;
399         mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
400         mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
401 align 16;
402         mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
403         lea EDI, [EDI + 4*EBX]; // EDI = end of dest
404         lea ESI, [ESI + 4*EBX]; // ESI = end of src
405         neg EBX;                // count UP to zero.
406 
407         movd MM3, EAX; // numbits = bits to shift right
408         xor EAX, 63;
409         inc EAX;
410         movd MM4, EAX ; // 64-numbits = bits to shift left
411 
412         test EBX, 1;
413         jz L_even;
414 L_odd:
415          // deal with odd lengths
416         and EAX, 31; // EAX = 32-numbits
417         movd MM2, EAX; // 32-numbits
418         cmp EBX, -1;
419         jz L_length1;
420 
421         movq MM0, [ESI+4*EBX];
422         psrlq MM0, MM3;
423         movd    [EDI +4*EBX], MM0;
424         add EBX, 1;
425 L_even:
426         movq    MM2, [ESI + 4*EBX];
427         psrlq   MM2, MM3;
428 
429         movq MM1, MM2;
430         add EBX, 4;
431         cmp EBX, -2+4;
432         jz L_last;
433         // It's either singly or doubly even
434         sub EBX, 2;
435         test EBX, 2;
436         jnz L_onceeven;
437         add EBX, 2;
438 
439         // MAIN LOOP -- 128 bytes per iteration
440  L_twiceeven:      // here MM2 is the carry
441         movq    MM0, [ESI + 4*EBX-8];
442         psllq   MM0, MM4;
443         movq    MM1, [ESI + 4*EBX-8];
444         psrlq   MM1, MM3;
445         por     MM2, MM0;
446         movq    [EDI +4*EBX-16], MM2;
447 L_onceeven:        // here MM1 is the carry
448         movq    MM0, [ESI + 4*EBX];
449         psllq   MM0, MM4;
450         movq    MM2, [ESI + 4*EBX];
451         por     MM1, MM0;
452         movq    [EDI +4*EBX-8], MM1;
453         psrlq   MM2, MM3;
454         add EBX, 4;
455         jl L_twiceeven;
456 L_last:
457         movq    [EDI +4*EBX-16], MM2;
458 L_alldone:
459         emms;  // NOTE: costs 6 cycles on Intel CPUs
460         pop EBX;
461         pop EDI;
462         pop ESI;
463         ret 4*4;
464 
465 L_length1:
466         // length 1 is a special case
467         movd MM1, [ESI+4*EBX];
468         psrlq MM1, MM3;
469         movd    [EDI +4*EBX], MM1;
470         jmp L_alldone;
471 
472     }
473 }
474 
475 /** dest[#] = src[#] >> numbits
476  *  numbits must be in the range 1 .. 31
477  */
478 void multibyteShrNoMMX(uint [] dest, const uint [] src, uint numbits) pure @safe @nogc
479 {
480     // Timing: Optimal for P6 family.
481     // 2.0 cycles/int on PPro .. PM (limited by execution port p0)
482     // Terrible performance on AMD64, which has 7 cycles for SHRD!!
483     enum { LASTPARAM = 4*4 } // 3* pushes + return address.
484     asm pure nothrow @nogc @trusted {
485         naked;
486         push ESI;
487         push EDI;
488         push EBX;
489         mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
490         mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length;
491         mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
492         mov ECX, EAX; // numbits;
493 
494         lea EDI, [EDI + 4*EBX]; // EDI = end of dest
495         lea ESI, [ESI + 4*EBX]; // ESI = end of src
496         neg EBX;                // count UP to zero.
497         mov EAX, [ESI + 4*EBX];
498         cmp EBX, -1;
499         jz L_last;
500         mov EDX, [ESI + 4*EBX];
501         test EBX, 1;
502         jz L_odd;
503         add EBX, 1;
504 L_even:
505         mov EDX, [ ESI + 4*EBX];
506         shrd EAX, EDX, CL;
507         mov [-4 + EDI+4*EBX], EAX;
508 L_odd:
509         mov EAX, [4 + ESI + 4*EBX];
510         shrd EDX, EAX, CL;
511         mov [EDI + 4*EBX], EDX;
512         add EBX, 2;
513         jl L_even;
514 L_last:
515         shr EAX, CL;
516         mov [-4 + EDI], EAX;
517 
518         pop EBX;
519         pop EDI;
520         pop ESI;
521         ret 4*4;
522     }
523 }
524 
525 @safe unittest
526 {
527 
528     uint [] aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
529     multibyteShr(aa[0..$-1], aa, 4);
530     assert(aa[0] == 0x6122_2222 && aa[1]==0xA455_5555
531         && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC);
532 
533     aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
534     multibyteShr(aa[2..$-1], aa[2..$-1], 4);
535     assert(aa[0] == 0x1222_2223 && aa[1]==0x4555_5556
536         && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC);
537 
538     aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
539     multibyteShr(aa[0..$-2], aa, 4);
540     assert(aa[1]==0xA455_5555 && aa[2]==0x0899_9999);
541     assert(aa[0]==0x6122_2222);
542     assert(aa[3]==0xBCCC_CCCD);
543 
544 
545     aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
546     uint r = multibyteShl(aa[2 .. 4], aa[2 .. 4], 4);
547     assert(aa[0] == 0xF0FF_FFFF && aa[1]==0x1222_2223
548         && aa[2]==0x5555_5560 && aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD);
549     assert(r == 8);
550 
551     aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
552     r = multibyteShl(aa[1 .. 4], aa[1 .. 4], 4);
553     assert(aa[0] == 0xF0FF_FFFF
554         && aa[2]==0x5555_5561);
555         assert(aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD);
556     assert(r == 8);
557         assert(aa[1]==0x2222_2230);
558 
559     aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
560     r = multibyteShl(aa[0 .. 4], aa[1 .. 5], 31);
561 }
562 
563 /** dest[#] = src[#] * multiplier + carry.
564  * Returns carry.
565  */
566 uint multibyteMul(uint[] dest, const uint[] src, uint multiplier, uint carry)
567     pure @safe @nogc
568 {
569     // Timing: definitely not optimal.
570     // Pentium M: 5.0 cycles/operation, has 3 resource stalls/iteration
571     // Fastest implementation found was 4.6 cycles/op, but not worth the complexity.
572 
573     enum { LASTPARAM = 4*4 } // 4* pushes + return address.
574     // We'll use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
575     // when initializing variables to zero.
576     version (D_PIC)
577     {
578         enum { zero = 0 }
579     }
580     else
581     {
582         static immutable int zero = 0;
583     }
584     asm pure nothrow @nogc @trusted {
585         naked;
586         push ESI;
587         push EDI;
588         push EBX;
589 
590         mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr
591         mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length
592         mov ESI, [ESP + LASTPARAM + 4*2];  // src.ptr
593         align 16;
594         lea EDI, [EDI + 4*EBX]; // EDI = end of dest
595         lea ESI, [ESI + 4*EBX]; // ESI = end of src
596         mov ECX, EAX; // [carry]; -- last param is in EAX.
597         neg EBX;                // count UP to zero.
598         test EBX, 1;
599         jnz L_odd;
600         add EBX, 1;
601  L1:
602         mov EAX, [-4 + ESI + 4*EBX];
603         mul int ptr [ESP+LASTPARAM]; //[multiplier];
604         add EAX, ECX;
605         mov ECX, zero;
606         mov [-4+EDI + 4*EBX], EAX;
607         adc ECX, EDX;
608 L_odd:
609         mov EAX, [ESI + 4*EBX];  // p2
610         mul int ptr [ESP+LASTPARAM]; //[multiplier]; // p0*3,
611         add EAX, ECX;
612         mov ECX, zero;
613         adc ECX, EDX;
614         mov [EDI + 4*EBX], EAX;
615         add EBX, 2;
616         jl L1;
617 
618         mov EAX, ECX; // get final carry
619 
620         pop EBX;
621         pop EDI;
622         pop ESI;
623         ret 5*4;
624     }
625 }
626 
627 @system unittest
628 {
629     uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
630     multibyteMul(aa[1 .. 4], aa[1 .. 4], 16, 0);
631     assert(aa[0] == 0xF0FF_FFFF && aa[1] == 0x2222_2230 &&
632         aa[2]==0x5555_5561 && aa[3]==0x9999_99A4 && aa[4]==0x0BCCC_CCCD);
633 }
634 
635 // The inner multiply-and-add loop, together with the Even entry point.
636 // Multiples by M_ADDRESS which should be "ESP+LASTPARAM" or "ESP". OP must be "add" or "sub"
637 // This is the most time-critical code in the BigInt library.
638 // It is used by both MulAdd, multiplyAccumulate, and triangleAccumulate
639 string asmMulAdd_innerloop(string OP, string M_ADDRESS) pure @safe {
640     // The bottlenecks in this code are extremely complicated. The MUL, ADD, and ADC
641     // need 4 cycles on each of the ALUs units p0 and p1. So we use memory load
642     // (unit p2) for initializing registers to zero.
643     // There are also dependencies between the instructions, and we run up against the
644     // ROB-read limit (can only read 2 registers per cycle).
645     // We also need the number of uops in the loop to be a multiple of 3.
646     // The only available execution unit for this is p3 (memory write). Unfortunately we can't do that
647         // if Position-Independent Code is required.
648 
649         // Register usage
650     // ESI = end of src
651     // EDI = end of dest
652     // EBX = index. Counts up to zero (in steps of 2).
653     // EDX:EAX = scratch, used in multiply.
654     // ECX = carry1.
655     // EBP = carry2.
656         // ESP = points to the multiplier.
657 
658         // The first member of 'dest' which will be modified is [EDI+4*EBX].
659         // EAX must already contain the first member of 'src', [ESI+4*EBX].
660 
661     version (D_PIC) { bool using_PIC = true; } else { bool using_PIC = false; }
662     return "
663         // Entry point for even length
664         add EBX, 1;
665         mov EBP, ECX; // carry
666 
667         mul int ptr [" ~ M_ADDRESS ~ "]; // M
668         mov ECX, 0;
669 
670         add EBP, EAX;
671         mov EAX, [ESI+4*EBX];
672         adc ECX, EDX;
673 
674         mul int ptr [" ~ M_ADDRESS ~ "]; // M
675         " ~ OP ~ " [-4+EDI+4*EBX], EBP;
676         mov EBP, zero;
677 
678         adc ECX, EAX;
679         mov EAX, [4+ESI+4*EBX];
680 
681         adc EBP, EDX;
682         add EBX, 2;
683         jnl L_done;
684 L1:
685         mul int ptr [" ~ M_ADDRESS ~ "];
686         " ~ OP ~ " [-8+EDI+4*EBX], ECX;
687         adc EBP, EAX;
688         mov ECX, zero;
689         mov EAX, [ESI+4*EBX];
690         adc ECX, EDX;
691 " ~
692         (using_PIC ? "" : "   mov storagenop, EDX; ") // make #uops in loop a multiple of 3, can't do this in PIC mode.
693 ~ "
694         mul int ptr [" ~ M_ADDRESS ~ "];
695         " ~ OP ~ " [-4+EDI+4*EBX], EBP;
696         mov EBP, zero;
697 
698         adc ECX, EAX;
699         mov EAX, [4+ESI+4*EBX];
700 
701         adc EBP, EDX;
702         add EBX, 2;
703         jl L1;
704 L_done: " ~ OP ~ " [-8+EDI+4*EBX], ECX;
705         adc EBP, 0;
706 ";
707                 // final carry is now in EBP
708 }
709 
710 string asmMulAdd_enter_odd(string OP, string M_ADDRESS) pure @safe
711 {
712     return "
713         mul int ptr [" ~M_ADDRESS ~"];
714         mov EBP, zero;
715         add ECX, EAX;
716         mov EAX, [4+ESI+4*EBX];
717 
718         adc EBP, EDX;
719         add EBX, 2;
720         jl L1;
721         jmp L_done;
722 ";
723 }
724 
725 version (D_PIC) {} else
726 {
727     // This cannot be declared inside the relevant functions because __gshared is
728     // not allowed in @safe functions. It cannot be regular shared either since
729     // that gives an access violation on Win32 for some reason.
730     private __gshared int storagenopTriangleAccumulate;
731     private __gshared int storagenopMulAdd;
732     private __gshared int storagenopMultiplyAccumulate;
733 }
734 
735 /**
736  * dest[#] += src[#] * multiplier OP carry(0 .. FFFF_FFFF).
737  * where op == '+' or '-'
738  * Returns carry out of MSB (0 .. FFFF_FFFF).
739  */
740 uint multibyteMulAdd(char op)(uint [] dest, const uint [] src, uint
741         multiplier, uint carry) pure @safe @nogc {
742     // Timing: This is the most time-critical bignum function.
743     // Pentium M: 5.4 cycles/operation, still has 2 resource stalls + 1load block/iteration
744 
745     // The main loop is pipelined and unrolled by 2,
746     //   so entry to the loop is also complicated.
747 
748     // Register usage
749     // EDX:EAX = multiply
750     // EBX = counter
751     // ECX = carry1
752     // EBP = carry2
753     // EDI = dest
754     // ESI = src
755 
756     enum string OP = (op=='+')? "add" : "sub";
757     version (D_PIC)
758     {
759         enum { zero = 0 }
760     }
761     else
762     {
763         // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
764         // when initializing registers to zero.
765         static immutable int zero = 0;
766         // use p3/p4 units
767         alias storagenop = storagenopMulAdd; // write-only
768     }
769 
770     enum { LASTPARAM = 5*4 } // 4* pushes + return address.
771     asm pure nothrow @nogc @trusted {
772         naked;
773 
774         push ESI;
775         push EDI;
776         push EBX;
777         push EBP;
778         mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr
779         mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length
780         align 16;
781         nop;
782         mov ESI, [ESP + LASTPARAM + 4*2];  // src.ptr
783         lea EDI, [EDI + 4*EBX]; // EDI = end of dest
784         lea ESI, [ESI + 4*EBX]; // ESI = end of src
785         mov EBP, 0;
786         mov ECX, EAX; // ECX = input carry.
787         neg EBX;                // count UP to zero.
788         mov EAX, [ESI+4*EBX];
789         test EBX, 1;
790         jnz L_enter_odd;
791     }
792     // Main loop, with entry point for even length
793     mixin("asm pure nothrow @nogc @trusted {" ~ asmMulAdd_innerloop(OP, "ESP+LASTPARAM") ~ "}");
794     asm pure nothrow @nogc @trusted {
795         mov EAX, EBP; // get final carry
796         pop EBP;
797         pop EBX;
798         pop EDI;
799         pop ESI;
800         ret 5*4;
801     }
802 L_enter_odd:
803     mixin("asm pure nothrow @nogc @trusted {" ~ asmMulAdd_enter_odd(OP, "ESP+LASTPARAM") ~ "}");
804 }
805 
806 @system unittest
807 {
808 
809     uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE];
810     uint [] bb = [0x1234_1234, 0xF0F0_F0F0, 0x00C0_C0C0, 0xF0F0_F0F0, 0xC0C0_C0C0];
811     multibyteMulAdd!('+')(bb[1..$-1], aa[1..$-2], 16, 5);
812     assert(bb[0] == 0x1234_1234 && bb[4] == 0xC0C0_C0C0);
813     assert(bb[1] == 0x2222_2230 + 0xF0F0_F0F0+5 && bb[2] == 0x5555_5561+0x00C0_C0C0+1
814          && bb[3] == 0x9999_99A4+0xF0F0_F0F0 );
815 }
816 
817 /**
818    Sets result[#] = result[0 .. left.length] + left[#] * right[#]
819 
820    It is defined in this way to allow cache-efficient multiplication.
821    This function is equivalent to:
822     ----
823     for (int i = 0; i< right.length; ++i)
824     {
825         dest[left.length + i] = multibyteMulAdd(dest[i .. left.length+i],
826                 left, right[i], 0);
827     }
828     ----
829  */
830 void multibyteMultiplyAccumulate(uint [] dest, const uint[] left,
831         const uint [] right) pure @safe @nogc {
832     // Register usage
833     // EDX:EAX = used in multiply
834     // EBX = index
835     // ECX = carry1
836     // EBP = carry2
837     // EDI = end of dest for this pass through the loop. Index for outer loop.
838     // ESI = end of left. never changes
839     // [ESP] = M = right[i] = multiplier for this pass through the loop.
840     // right.length is changed into dest.ptr+dest.length
841     version (D_PIC)
842     {
843         enum { zero = 0 }
844     }
845     else
846     {
847         // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
848         // when initializing registers to zero.
849         static immutable int zero = 0;
850         // use p3/p4 units
851         alias storagenop = storagenopMultiplyAccumulate; // write-only
852     }
853 
854     enum { LASTPARAM = 6*4 } // 4* pushes + local + return address.
855     asm pure nothrow @nogc @trusted {
856         naked;
857 
858         push ESI;
859         push EDI;
860         align 16;
861         push EBX;
862         push EBP;
863         push EAX;    // local variable M
864         mov EDI, [ESP + LASTPARAM + 4*5]; // dest.ptr
865         mov EBX, [ESP + LASTPARAM + 4*2]; // left.length
866         mov ESI, [ESP + LASTPARAM + 4*3];  // left.ptr
867         lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass
868 
869         mov EAX, [ESP + LASTPARAM + 4*0]; // right.length
870         lea EAX, [EDI + 4*EAX];
871         mov [ESP + LASTPARAM + 4*0], EAX; // last value for EDI
872 
873         lea ESI, [ESI + 4*EBX]; // ESI = end of left
874         mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr
875         mov EAX, [EAX];
876         mov [ESP], EAX; // M
877 outer_loop:
878         mov EBP, 0;
879         mov ECX, 0; // ECX = input carry.
880         neg EBX;                // count UP to zero.
881         mov EAX, [ESI+4*EBX];
882         test EBX, 1;
883         jnz L_enter_odd;
884     }
885     // -- Inner loop, with even entry point
886     mixin("asm pure nothrow @nogc @trusted { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}");
887     asm pure nothrow @nogc @trusted {
888         mov [-4+EDI+4*EBX], EBP;
889         add EDI, 4;
890         cmp EDI, [ESP + LASTPARAM + 4*0]; // is EDI = &dest[$]?
891         jz outer_done;
892         mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr
893         mov EAX, [EAX+4];                 // get new M
894         mov [ESP], EAX;                   // save new M
895         add int ptr [ESP + LASTPARAM + 4*1], 4; // right.ptr
896         mov EBX, [ESP + LASTPARAM + 4*2]; // left.length
897         jmp outer_loop;
898 outer_done:
899         pop EAX;
900         pop EBP;
901         pop EBX;
902         pop EDI;
903         pop ESI;
904         ret 6*4;
905     }
906 L_enter_odd:
907     mixin("asm pure nothrow @nogc @trusted {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}");
908 }
909 
910 /**  dest[#] /= divisor.
911  * overflow is the initial remainder, and must be in the range 0 .. divisor-1.
912  * divisor must not be a power of 2 (use right shift for that case;
913  * A division by zero will occur if divisor is a power of 2).
914  * Returns the final remainder
915  *
916  * Based on public domain code by Eric Bainville.
917  * (http://www.bealto.com/) Used with permission.
918  */
919 uint multibyteDivAssign(uint [] dest, uint divisor, uint overflow) pure @safe @nogc
920 {
921     // Timing: limited by a horrible dependency chain.
922     // Pentium M: 18 cycles/op, 8 resource stalls/op.
923     // EAX, EDX = scratch, used by MUL
924     // EDI = dest
925     // CL = shift
926     // ESI = quotient
927     // EBX = remainderhi
928     // EBP = remainderlo
929     // [ESP-4] = mask
930     // [ESP] = kinv (2^64 /divisor)
931     enum { LASTPARAM = 5*4 } // 4* pushes + return address.
932     enum { LOCALS = 2*4} // MASK, KINV
933     asm pure nothrow @nogc @trusted {
934         naked;
935 
936         push ESI;
937         push EDI;
938         push EBX;
939         push EBP;
940 
941         mov EDI, [ESP + LASTPARAM + 4*2]; // dest.ptr
942         mov EBX, [ESP + LASTPARAM + 4*1]; // dest.length
943 
944         // Loop from msb to lsb
945         lea     EDI, [EDI + 4*EBX];
946         mov EBP, EAX; // rem is the input remainder, in 0 .. divisor-1
947         // Build the pseudo-inverse of divisor k: 2^64/k
948         // First determine the shift in ecx to get the max number of bits in kinv
949         xor     ECX, ECX;
950         mov     EAX, [ESP + LASTPARAM]; //divisor;
951         mov     EDX, 1;
952 kinv1:
953         inc     ECX;
954         ror     EDX, 1;
955         shl     EAX, 1;
956         jnc     kinv1;
957         dec     ECX;
958         // Here, ecx is a left shift moving the msb of k to bit 32
959 
960         mov     EAX, 1;
961         shl     EAX, CL;
962         dec     EAX;
963         ror     EAX, CL ; //ecx bits at msb
964         push    EAX; // MASK
965 
966         // Then divide 2^(32+cx) by divisor (edx already ok)
967         xor     EAX, EAX;
968         div     int ptr [ESP + LASTPARAM +  LOCALS-4*1]; //divisor;
969         push    EAX; // kinv
970         align   16;
971 L2:
972         // Get 32 bits of quotient approx, multiplying
973         // most significant word of (rem*2^32+input)
974         mov     EAX, [ESP+4]; //MASK;
975         and     EAX, [EDI - 4];
976         or      EAX, EBP;
977         rol     EAX, CL;
978         mov     EBX, EBP;
979         mov     EBP, [EDI - 4];
980         mul     int ptr [ESP]; //KINV;
981 
982         shl     EAX, 1;
983         rcl     EDX, 1;
984 
985         // Multiply by k and subtract to get remainder
986         // Subtraction must be done on two words
987         mov     EAX, EDX;
988         mov     ESI, EDX; // quot = high word
989         mul     int ptr [ESP + LASTPARAM+LOCALS]; //divisor;
990         sub     EBP, EAX;
991         sbb     EBX, EDX;
992         jz      Lb;  // high word is 0, goto adjust on single word
993 
994         // Adjust quotient and remainder on two words
995 Ld:     inc     ESI;
996         sub     EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
997         sbb     EBX, 0;
998         jnz     Ld;
999 
1000         // Adjust quotient and remainder on single word
1001 Lb:     cmp     EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
1002         jc      Lc; // rem in 0 .. divisor-1, OK
1003         sub     EBP, [ESP + LASTPARAM+LOCALS]; //divisor;
1004         inc     ESI;
1005         jmp     Lb;
1006 
1007         // Store result
1008 Lc:
1009         mov     [EDI - 4], ESI;
1010         lea     EDI, [EDI - 4];
1011         dec     int ptr [ESP + LASTPARAM + 4*1+LOCALS]; // len
1012         jnz    L2;
1013 
1014         pop EAX; // discard kinv
1015         pop EAX; // discard mask
1016 
1017         mov     EAX, EBP; // return final remainder
1018         pop     EBP;
1019         pop     EBX;
1020         pop     EDI;
1021         pop     ESI;
1022         ret     3*4;
1023     }
1024 }
1025 
1026 @safe unittest
1027 {
1028     uint [] aa = new uint[101];
1029     for (int i=0; i<aa.length; ++i) aa[i] = 0x8765_4321 * (i+3);
1030     uint overflow = multibyteMul(aa, aa, 0x8EFD_FCFB, 0x33FF_7461);
1031     uint r = multibyteDivAssign(aa, 0x8EFD_FCFB, overflow);
1032     for (int i=0; i<aa.length-1; ++i) assert(aa[i] == 0x8765_4321 * (i+3));
1033     assert(r == 0x33FF_7461);
1034 }
1035 
1036 // Set dest[2*i .. 2*i+1]+=src[i]*src[i]
1037 void multibyteAddDiagonalSquares(uint [] dest, const uint [] src) pure @safe @nogc
1038 {
1039     /* Unlike mulAdd, the carry is only 1 bit,
1040            since FFFF*FFFF+FFFF_FFFF = 1_0000_0000.
1041            Note also that on the last iteration, no carry can occur.
1042            As for multibyteAdd, we save & restore carry flag through the loop.
1043 
1044            The timing is entirely dictated by the dependency chain. We could
1045            improve it by moving the mov EAX after the adc [EDI], EAX. Probably not worthwhile.
1046     */
1047     enum { LASTPARAM = 4*5 } // 4* pushes + return address.
1048     asm pure nothrow @nogc @trusted {
1049         naked;
1050         push ESI;
1051         push EDI;
1052         push EBX;
1053             push ECX;
1054         mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr;
1055         mov EBX, [ESP + LASTPARAM + 4*0]; //src.length;
1056         mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr;
1057         lea EDI, [EDI + 8*EBX];      // EDI = end of dest
1058         lea ESI, [ESI + 4*EBX];      // ESI = end of src
1059         neg EBX;                     // count UP to zero.
1060         xor ECX, ECX;             // initial carry = 0.
1061 L1:
1062         mov EAX, [ESI + 4*EBX];
1063         mul EAX, EAX;
1064         shr CL, 1;                 // get carry
1065         adc [EDI + 8*EBX], EAX;
1066         adc [EDI + 8*EBX + 4], EDX;
1067         setc CL;                   // save carry
1068         inc EBX;
1069         jnz L1;
1070 
1071         pop ECX;
1072         pop EBX;
1073         pop EDI;
1074         pop ESI;
1075         ret 4*4;
1076     }
1077 }
1078 
1079 @safe unittest
1080 {
1081     uint [] aa = new uint[13];
1082         uint [] bb = new uint[6];
1083     for (int i=0; i<aa.length; ++i) aa[i] = 0x8000_0000;
1084     for (int i=0; i<bb.length; ++i) bb[i] = i;
1085         aa[$-1]= 7;
1086     multibyteAddDiagonalSquares(aa[0..$-1], bb);
1087         assert(aa[$-1]==7);
1088         for (int i=0; i<bb.length; ++i) { assert(aa[2*i]==0x8000_0000+i*i); assert(aa[2*i+1]==0x8000_0000); }
1089 }
1090 
1091 void multibyteTriangleAccumulateD(uint[] dest, uint[] x) pure @safe @nogc
1092 {
1093     for (int i = 0; i < x.length-3; ++i)
1094     {
1095         dest[i+x.length] = multibyteMulAdd!('+')(
1096              dest[i+i+1 .. i+x.length], x[i+1..$], x[i], 0);
1097     }
1098     ulong c = cast(ulong)(x[$-3]) * x[$-2] + dest[$-5];
1099     dest[$-5] = cast(uint) c;
1100     c >>= 32;
1101     c += cast(ulong)(x[$-3]) * x[$-1] + dest[$-4];
1102     dest[$-4] = cast(uint) c;
1103     c >>= 32;
1104 length2:
1105     c += cast(ulong)(x[$-2]) * x[$-1];
1106         dest[$-3] = cast(uint) c;
1107         c >>= 32;
1108         dest[$-2] = cast(uint) c;
1109 }
1110 
1111 //dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1]
1112 // assert(dest.length = src.length*2);
1113 // assert(src.length >= 3);
1114 void multibyteTriangleAccumulateAsm(uint[] dest, const uint[] src) pure @safe @nogc
1115 {
1116     // Register usage
1117     // EDX:EAX = used in multiply
1118     // EBX = index
1119     // ECX = carry1
1120     // EBP = carry2
1121     // EDI = end of dest for this pass through the loop. Index for outer loop.
1122     // ESI = end of src. never changes
1123     // [ESP] = M = src[i] = multiplier for this pass through the loop.
1124     // dest.length is changed into dest.ptr+dest.length
1125     version (D_PIC)
1126     {
1127         enum { zero = 0 }
1128     }
1129     else
1130     {
1131         // use p2 (load unit) instead of the overworked p0 or p1 (ALU units)
1132         // when initializing registers to zero.
1133         static immutable int zero = 0;
1134         // use p3/p4 units
1135         alias storagenop = storagenopTriangleAccumulate; // write-only
1136     }
1137 
1138     enum { LASTPARAM = 6*4 } // 4* pushes + local + return address.
1139     asm pure nothrow @nogc @trusted {
1140         naked;
1141 
1142         push ESI;
1143         push EDI;
1144         align 16;
1145         push EBX;
1146         push EBP;
1147         push EAX;    // local variable M= src[i]
1148         mov EDI, [ESP + LASTPARAM + 4*3]; // dest.ptr
1149         mov EBX, [ESP + LASTPARAM + 4*0]; // src.length
1150         mov ESI, [ESP + LASTPARAM + 4*1];  // src.ptr
1151 
1152         lea ESI, [ESI + 4*EBX]; // ESI = end of left
1153         add int ptr [ESP + LASTPARAM + 4*1], 4; // src.ptr, used for getting M
1154 
1155         // local variable [ESP + LASTPARAM + 4*2] = last value for EDI
1156         lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass
1157 
1158         lea EAX, [EDI + 4*EBX-3*4]; // up to src.length - 3
1159         mov [ESP + LASTPARAM + 4*2], EAX; // last value for EDI  = &dest[src.length*2 -3]
1160 
1161         cmp EBX, 3;
1162         jz length_is_3;
1163 
1164         // We start at src[1], not src[0].
1165         dec EBX;
1166         mov [ESP + LASTPARAM + 4*0], EBX;
1167 
1168 outer_loop:
1169         mov EBX, [ESP + LASTPARAM + 4*0]; // src.length
1170         mov EBP, 0;
1171         mov ECX, 0; // ECX = input carry.
1172         dec int ptr [ESP + LASTPARAM + 4*0]; // Next time, the length will be shorter by 1.
1173         neg EBX;                // count UP to zero.
1174 
1175         mov EAX, [ESI + 4*EBX - 4*1]; // get new M
1176         mov [ESP], EAX;                   // save new M
1177 
1178         mov EAX, [ESI+4*EBX];
1179         test EBX, 1;
1180         jnz L_enter_odd;
1181     }
1182     // -- Inner loop, with even entry point
1183     mixin("asm pure nothrow @nogc @trusted { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}");
1184     asm pure nothrow @nogc @trusted {
1185         mov [-4+EDI+4*EBX], EBP;
1186         add EDI, 4;
1187         cmp EDI, [ESP + LASTPARAM + 4*2]; // is EDI = &dest[$-3]?
1188         jnz outer_loop;
1189 length_is_3:
1190         mov EAX, [ESI - 4*3];
1191         mul EAX, [ESI - 4*2];
1192         mov ECX, 0;
1193         add [EDI-2*4], EAX;  // ECX:dest[$-5] += x[$-3] * x[$-2]
1194         adc ECX, EDX;
1195 
1196         mov EAX, [ESI - 4*3];
1197         mul EAX, [ESI - 4*1]; // x[$-3] * x[$-1]
1198         add EAX, ECX;
1199         mov ECX, 0;
1200         adc EDX, 0;
1201         // now EDX: EAX = c + x[$-3] * x[$-1]
1202         add [EDI-1*4], EAX; // ECX:dest[$-4] += (EDX:EAX)
1203         adc ECX, EDX;  //  ECX holds dest[$-3], it acts as carry for the last row
1204 // do length == 2
1205         mov EAX, [ESI - 4*2];
1206         mul EAX, [ESI - 4*1];
1207         add ECX, EAX;
1208         adc EDX, 0;
1209         mov [EDI - 0*4], ECX; // dest[$-2:$-3] = c + x[$-2] * x[$-1];
1210         mov [EDI + 1*4], EDX;
1211 
1212         pop EAX;
1213         pop EBP;
1214         pop EBX;
1215         pop EDI;
1216         pop ESI;
1217         ret 4*4;
1218     }
1219 L_enter_odd:
1220     mixin("asm pure nothrow @nogc @trusted {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}");
1221 }
1222 
1223 @safe unittest
1224 {
1225    uint [] aa = new uint[200];
1226    uint [] a  = aa[0 .. 100];
1227    uint [] b  = new uint [100];
1228    aa[] = 761;
1229    a[] = 0;
1230    b[] = 0;
1231    a[3] = 6;
1232    b[0]=1;
1233    b[1] = 17;
1234    b[50 .. 100]=78;
1235    multibyteTriangleAccumulateAsm(a, b[0 .. 50]);
1236    uint [] c = new uint[100];
1237    c[] = 0;
1238    c[1] = 17;
1239    c[3] = 6;
1240    assert(a[]==c[]);
1241    assert(a[0]==0);
1242    aa[] = 0xFFFF_FFFF;
1243    a[] = 0;
1244    b[] = 0;
1245    b[0]= 0xbf6a1f01;
1246    b[1]=  0x6e38ed64;
1247    b[2]=  0xdaa797ed;
1248    b[3] = 0;
1249 
1250    multibyteTriangleAccumulateAsm(a[0 .. 8], b[0 .. 4]);
1251    assert(a[1]==0x3a600964);
1252    assert(a[2]==0x339974f6);
1253    assert(a[3]==0x46736fce);
1254    assert(a[4]==0x5e24a2b4);
1255 
1256    b[3] = 0xe93ff9f4;
1257    b[4] = 0x184f03;
1258    a[]=0;
1259    multibyteTriangleAccumulateAsm(a[0 .. 14], b[0 .. 7]);
1260    assert(a[3]==0x79fff5c2);
1261    assert(a[4]==0xcf384241);
1262    assert(a[5]== 0x4a17fc8);
1263    assert(a[6]==0x4d549025);
1264 }
1265 
1266 
1267 void multibyteSquare(BigDigit[] result, const BigDigit [] x) pure @safe @nogc
1268 {
1269     if (x.length < 4)
1270     {
1271         // Special cases, not worth doing triangular.
1272         result[x.length] = multibyteMul(result[0 .. x.length], x, x[0], 0);
1273         multibyteMultiplyAccumulate(result[1..$], x, x[1..$]);
1274         return;
1275     }
1276     //  Do half a square multiply.
1277     //  dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1]
1278     result[x.length] = multibyteMul(result[1 .. x.length], x[1..$], x[0], 0);
1279     multibyteTriangleAccumulateAsm(result[2..$], x[1..$]);
1280     // Multiply by 2
1281     result[$-1] = multibyteShlNoMMX(result[1..$-1], result[1..$-1], 1);
1282     // And add the diagonal elements
1283     result[0] = 0;
1284     multibyteAddDiagonalSquares(result, x);
1285 }
1286 
1287 version (BignumPerformanceTest)
1288 {
1289 import core.stdc.stdio;
1290 int clock() { asm { push EBX; xor EAX, EAX; cpuid; pop EBX; rdtsc; } }
1291 
1292 __gshared uint [2200] X1;
1293 __gshared uint [2200] Y1;
1294 __gshared uint [4000] Z1;
1295 
1296 void testPerformance() pure
1297 {
1298     // The performance results at the top of this file were obtained using
1299     // a Windows device driver to access the CPU performance counters.
1300     // The code below is less accurate but more widely usable.
1301     // The value for division is quite inconsistent.
1302     for (int i=0; i<X1.length; ++i) { X1[i]=i; Y1[i]=i; Z1[i]=i; }
1303     int t, t0;
1304     multibyteShl(Z1[0 .. 2000], X1[0 .. 2000], 7);
1305     t0 = clock();
1306     multibyteShl(Z1[0 .. 1000], X1[0 .. 1000], 7);
1307     t = clock();
1308     multibyteShl(Z1[0 .. 2000], X1[0 .. 2000], 7);
1309     auto shltime = (clock() - t) - (t - t0);
1310     t0 = clock();
1311     multibyteShr(Z1[2 .. 1002], X1[4 .. 1004], 13);
1312     t = clock();
1313     multibyteShr(Z1[2 .. 2002], X1[4 .. 2004], 13);
1314     auto shrtime = (clock() - t) - (t - t0);
1315     t0 = clock();
1316     multibyteAddSub!('+')(Z1[0 .. 1000], X1[0 .. 1000], Y1[0 .. 1000], 0);
1317     t = clock();
1318     multibyteAddSub!('+')(Z1[0 .. 2000], X1[0 .. 2000], Y1[0 .. 2000], 0);
1319     auto addtime = (clock() - t) - (t-t0);
1320     t0 = clock();
1321     multibyteMul(Z1[0 .. 1000], X1[0 .. 1000], 7, 0);
1322     t = clock();
1323     multibyteMul(Z1[0 .. 2000], X1[0 .. 2000], 7, 0);
1324     auto multime = (clock() - t) - (t - t0);
1325     multibyteMulAdd!('+')(Z1[0 .. 2000], X1[0 .. 2000], 217, 0);
1326     t0 = clock();
1327     multibyteMulAdd!('+')(Z1[0 .. 1000], X1[0 .. 1000], 217, 0);
1328     t = clock();
1329     multibyteMulAdd!('+')(Z1[0 .. 2000], X1[0 .. 2000], 217, 0);
1330     auto muladdtime = (clock() - t) - (t - t0);
1331     multibyteMultiplyAccumulate(Z1[0 .. 64], X1[0 .. 32], Y1[0 .. 32]);
1332     t = clock();
1333     multibyteMultiplyAccumulate(Z1[0 .. 64], X1[0 .. 32], Y1[0 .. 32]);
1334     auto accumtime = clock() - t;
1335     t0 = clock();
1336     multibyteDivAssign(Z1[0 .. 2000], 217, 0);
1337     t = clock();
1338     multibyteDivAssign(Z1[0 .. 1000], 37, 0);
1339     auto divtime = (t - t0) - (clock() - t);
1340         t= clock();
1341     multibyteSquare(Z1[0 .. 64], X1[0 .. 32]);
1342     auto squaretime = clock() - t;
1343 
1344     printf("-- BigInt asm performance (cycles/int) --\n");
1345     printf("Add:        %.2f\n", addtime/1000.0);
1346     printf("Shl:        %.2f\n", shltime/1000.0);
1347     printf("Shr:        %.2f\n", shrtime/1000.0);
1348     printf("Mul:        %.2f\n", multime/1000.0);
1349     printf("MulAdd:     %.2f\n", muladdtime/1000.0);
1350     printf("Div:        %.2f\n", divtime/1000.0);
1351     printf("MulAccum32: %.2f*n*n (total %d)\n", accumtime/(32.0*32.0), accumtime);
1352     printf("Square32: %.2f*n*n (total %d)\n\n", squaretime/(32.0*32.0), squaretime);
1353 }
1354 
1355 static this()
1356 {
1357     testPerformance();
1358 }
1359 }
1360 
1361 } // version (D_InlineAsm_X86)