1 /** Optimised asm arbitrary precision arithmetic ('bignum') 2 * routines for X86 processors. 3 * 4 * All functions operate on arrays of uints, stored LSB first. 5 * If there is a destination array, it will be the first parameter. 6 * Currently, all of these functions are subject to change, and are 7 * intended for internal use only. 8 * The symbol [#] indicates an array of machine words which is to be 9 * interpreted as a multi-byte number. 10 */ 11 12 /* Copyright Don Clugston 2008 - 2010. 13 * Distributed under the Boost Software License, Version 1.0. 14 * (See accompanying file LICENSE_1_0.txt or copy at 15 * http://www.boost.org/LICENSE_1_0.txt) 16 */ 17 /** 18 * In simple terms, there are 3 modern x86 microarchitectures: 19 * (a) the P6 family (Pentium Pro, PII, PIII, PM, Core), produced by Intel; 20 * (b) the K6, Athlon, and AMD64 families, produced by AMD; and 21 * (c) the Pentium 4, produced by Marketing. 22 * 23 * This code has been optimised for the Intel P6 family. 24 * Generally the code remains near-optimal for Intel Core2/Corei7, after 25 * translating EAX-> RAX, etc, since all these CPUs use essentially the same 26 * pipeline, and are typically limited by memory access. 27 * The code uses techniques described in Agner Fog's superb Pentium manuals 28 * available at www.agner.org. 29 * Not optimised for AMD, which can do two memory loads per cycle (Intel 30 * CPUs can only do one). Despite this, performance is superior on AMD. 31 * Performance is dreadful on P4. 32 * 33 * Timing results (cycles per int) 34 * --Intel Pentium-- --AMD-- 35 * PM P4 Core2 K7 36 * +,- 2.25 15.6 2.25 1.5 37 * <<,>> 2.0 6.6 2.0 5.0 38 * (<< MMX) 1.7 5.3 1.5 1.2 39 * * 5.0 15.0 4.0 4.3 40 * mulAdd 5.7 19.0 4.9 4.0 41 * div 30.0 32.0 32.0 22.4 42 * mulAcc(32) 6.5 20.0 5.4 4.9 43 * 44 * mulAcc(32) is multiplyAccumulate() for a 32*32 multiply. Thus it includes 45 * function call overhead. 46 * The timing for Div is quite unpredictable, but it's probably too slow 47 * to be useful. On 64-bit processors, these times should 48 * halve if run in 64-bit mode, except for the MMX functions. 49 */ 50 51 module std.internal.math.biguintx86; 52 53 @system: 54 pure: 55 nothrow: 56 57 /* 58 Naked asm is used throughout, because: 59 (a) it frees up the EBP register 60 (b) compiler bugs prevent the use of .ptr when a frame pointer is used. 61 */ 62 63 version (D_InlineAsm_X86) 64 { 65 66 private: 67 68 /* Duplicate string s, with n times, substituting index for '@'. 69 * 70 * Each instance of '@' in s is replaced by 0,1,...n-1. This is a helper 71 * function for some of the asm routines. 72 */ 73 string indexedLoopUnroll(int n, string s) pure @safe 74 { 75 string u; 76 for (int i = 0; i<n; ++i) 77 { 78 string nstr= (i>9 ? ""~ cast(char)('0'+i/10) : "") ~ cast(char)('0' + i%10); 79 80 int last = 0; 81 for (int j = 0; j<s.length; ++j) 82 { 83 if (s[j]=='@') 84 { 85 u ~= s[last .. j] ~ nstr; 86 last = j+1; 87 } 88 } 89 if (last<s.length) u = u ~ s[last..$]; 90 91 } 92 return u; 93 } 94 @safe unittest 95 { 96 assert(indexedLoopUnroll(3, "@*23;")=="0*23;1*23;2*23;"); 97 } 98 99 public: 100 101 alias BigDigit = uint; // A Bignum is an array of BigDigits. Usually the machine word size. 102 103 // Limits for when to switch between multiplication algorithms. 104 enum : int { KARATSUBALIMIT = 18 } // Minimum value for which Karatsuba is worthwhile. 105 enum : int { KARATSUBASQUARELIMIT=26 } // Minimum value for which square Karatsuba is worthwhile 106 107 /** Multi-byte addition or subtraction 108 * dest[#] = src1[#] + src2[#] + carry (0 or 1). 109 * or dest[#] = src1[#] - src2[#] - carry (0 or 1). 110 * Returns carry or borrow (0 or 1). 111 * Set op == '+' for addition, '-' for subtraction. 112 */ 113 uint multibyteAddSub(char op)(uint[] dest, const uint [] src1, const uint [] 114 src2, uint carry) pure @safe @nogc 115 { 116 // Timing: 117 // Pentium M: 2.25/int 118 // P6 family, Core2 have a partial flags stall when reading the carry flag in 119 // an ADC, SBB operation after an operation such as INC or DEC which 120 // modifies some, but not all, flags. We avoid this by storing carry into 121 // a resister (AL), and restoring it after the branch. 122 123 enum { LASTPARAM = 4*4 } // 3* pushes + return address. 124 asm pure nothrow @nogc @trusted { 125 naked; 126 push EDI; 127 push EBX; 128 push ESI; 129 mov ECX, [ESP + LASTPARAM + 4*4]; // dest.length; 130 mov EDX, [ESP + LASTPARAM + 3*4]; // src1.ptr 131 mov ESI, [ESP + LASTPARAM + 1*4]; // src2.ptr 132 mov EDI, [ESP + LASTPARAM + 5*4]; // dest.ptr 133 // Carry is in EAX 134 // Count UP to zero (from -len) to minimize loop overhead. 135 lea EDX, [EDX + 4*ECX]; // EDX = end of src1. 136 lea ESI, [ESI + 4*ECX]; // EBP = end of src2. 137 lea EDI, [EDI + 4*ECX]; // EDI = end of dest. 138 139 neg ECX; 140 add ECX, 8; 141 jb L2; // if length < 8 , bypass the unrolled loop. 142 L_unrolled: 143 shr AL, 1; // get carry from EAX 144 } 145 mixin(" asm pure nothrow @nogc @trusted {" 146 ~ indexedLoopUnroll( 8, 147 "mov EAX, [@*4-8*4+EDX+ECX*4];" 148 ~ ( op == '+' ? "adc" : "sbb" ) ~ " EAX, [@*4-8*4+ESI+ECX*4];" 149 ~ "mov [@*4-8*4+EDI+ECX*4], EAX;") 150 ~ "}"); 151 asm pure nothrow @nogc @trusted { 152 setc AL; // save carry 153 add ECX, 8; 154 ja L_unrolled; 155 L2: // Do the residual 1 .. 7 ints. 156 157 sub ECX, 8; 158 jz done; 159 L_residual: 160 shr AL, 1; // get carry from EAX 161 } 162 mixin(" asm pure nothrow @nogc @trusted {" 163 ~ indexedLoopUnroll( 1, 164 "mov EAX, [@*4+EDX+ECX*4];" 165 ~ ( op == '+' ? "adc" : "sbb" ) ~ " EAX, [@*4+ESI+ECX*4];" 166 ~ "mov [@*4+EDI+ECX*4], EAX;") ~ "}"); 167 asm pure nothrow @nogc @trusted { 168 setc AL; // save carry 169 add ECX, 1; 170 jnz L_residual; 171 done: 172 and EAX, 1; // make it O or 1. 173 pop ESI; 174 pop EBX; 175 pop EDI; 176 ret 6*4; 177 } 178 } 179 180 @safe unittest 181 { 182 uint [] a = new uint[40]; 183 uint [] b = new uint[40]; 184 uint [] c = new uint[40]; 185 for (int i=0; i<a.length; ++i) 186 { 187 if (i&1) a[i]=0x8000_0000 + i; 188 else a[i]=i; 189 b[i]= 0x8000_0003; 190 } 191 c[19]=0x3333_3333; 192 uint carry = multibyteAddSub!('+')(c[0 .. 18], a[0 .. 18], b[0 .. 18], 0); 193 assert(carry == 1); 194 assert(c[0]==0x8000_0003); 195 assert(c[1]==4); 196 assert(c[19]==0x3333_3333); // check for overrun 197 for (int i=0; i<a.length; ++i) 198 { 199 a[i]=b[i]=c[i]=0; 200 } 201 a[8]=0x048D159E; 202 b[8]=0x048D159E; 203 a[10]=0x1D950C84; 204 b[10]=0x1D950C84; 205 a[5] =0x44444444; 206 carry = multibyteAddSub!('-')(a[0 .. 12], a[0 .. 12], b[0 .. 12], 0); 207 assert(a[11]==0); 208 for (int i=0; i<10; ++i) if (i != 5) assert(a[i]==0); 209 210 for (int q=3; q<36;++q) 211 { 212 for (int i=0; i<a.length; ++i) 213 { 214 a[i]=b[i]=c[i]=0; 215 } 216 a[q-2]=0x040000; 217 b[q-2]=0x040000; 218 carry = multibyteAddSub!('-')(a[0 .. q], a[0 .. q], b[0 .. q], 0); 219 assert(a[q-2]==0); 220 } 221 } 222 223 /** dest[#] += carry, or dest[#] -= carry. 224 * op must be '+' or '-' 225 * Returns final carry or borrow (0 or 1) 226 */ 227 uint multibyteIncrementAssign(char op)(uint[] dest, uint carry) pure @safe @nogc 228 { 229 enum { LASTPARAM = 1*4 } // 0* pushes + return address. 230 asm pure nothrow @nogc @trusted { 231 naked; 232 mov ECX, [ESP + LASTPARAM + 0*4]; // dest.length; 233 mov EDX, [ESP + LASTPARAM + 1*4]; // dest.ptr 234 // EAX = carry 235 L1: ; 236 } 237 static if (op=='+') 238 asm pure nothrow @nogc @trusted { add [EDX], EAX; } 239 else 240 asm pure nothrow @nogc @trusted { sub [EDX], EAX; } 241 asm pure nothrow @nogc @trusted { 242 mov EAX, 1; 243 jnc L2; 244 add EDX, 4; 245 dec ECX; 246 jnz L1; 247 mov EAX, 2; 248 L2: dec EAX; 249 ret 2*4; 250 } 251 } 252 253 /** dest[#] = src[#] << numbits 254 * numbits must be in the range 1 .. 31 255 * Returns the overflow 256 */ 257 uint multibyteShlNoMMX(uint [] dest, const uint [] src, uint numbits) pure @safe @nogc 258 { 259 // Timing: Optimal for P6 family. 260 // 2.0 cycles/int on PPro .. PM (limited by execution port p0) 261 // 5.0 cycles/int on Athlon, which has 7 cycles for SHLD!! 262 enum { LASTPARAM = 4*4 } // 3* pushes + return address. 263 asm pure nothrow @nogc @trusted { 264 naked; 265 push ESI; 266 push EDI; 267 push EBX; 268 mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr; 269 mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length; 270 mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr; 271 mov ECX, EAX; // numbits; 272 273 mov EAX, [-4+ESI + 4*EBX]; 274 mov EDX, 0; 275 shld EDX, EAX, CL; 276 push EDX; // Save return value 277 cmp EBX, 1; 278 jz L_last; 279 mov EDX, [-4+ESI + 4*EBX]; 280 test EBX, 1; 281 jz L_odd; 282 sub EBX, 1; 283 L_even: 284 mov EDX, [-4+ ESI + 4*EBX]; 285 shld EAX, EDX, CL; 286 mov [EDI+4*EBX], EAX; 287 L_odd: 288 mov EAX, [-8+ESI + 4*EBX]; 289 shld EDX, EAX, CL; 290 mov [-4+EDI + 4*EBX], EDX; 291 sub EBX, 2; 292 jg L_even; 293 L_last: 294 shl EAX, CL; 295 mov [EDI], EAX; 296 pop EAX; // pop return value 297 pop EBX; 298 pop EDI; 299 pop ESI; 300 ret 4*4; 301 } 302 } 303 304 /** dest[#] = src[#] >> numbits 305 * numbits must be in the range 1 .. 31 306 * This version uses MMX. 307 */ 308 uint multibyteShl(uint [] dest, const uint [] src, uint numbits) pure @safe @nogc 309 { 310 // Timing: 311 // K7 1.2/int. PM 1.7/int P4 5.3/int 312 enum { LASTPARAM = 4*4 } // 3* pushes + return address. 313 asm pure nothrow @nogc @trusted { 314 naked; 315 push ESI; 316 push EDI; 317 push EBX; 318 mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr; 319 mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length; 320 mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr; 321 322 movd MM3, EAX; // numbits = bits to shift left 323 xor EAX, 63; 324 align 16; 325 inc EAX; 326 movd MM4, EAX ; // 64-numbits = bits to shift right 327 328 // Get the return value into EAX 329 and EAX, 31; // EAX = 32-numbits 330 movd MM2, EAX; // 32-numbits 331 movd MM1, [ESI+4*EBX-4]; 332 psrlq MM1, MM2; 333 movd EAX, MM1; // EAX = return value 334 test EBX, 1; 335 jz L_even; 336 L_odd: 337 cmp EBX, 1; 338 jz L_length1; 339 340 // deal with odd lengths 341 movq MM1, [ESI+4*EBX-8]; 342 psrlq MM1, MM2; 343 movd [EDI +4*EBX-4], MM1; 344 sub EBX, 1; 345 L_even: // It's either singly or doubly even 346 movq MM2, [ESI + 4*EBX - 8]; 347 psllq MM2, MM3; 348 sub EBX, 2; 349 jle L_last; 350 movq MM1, MM2; 351 add EBX, 2; 352 test EBX, 2; 353 jz L_onceeven; 354 sub EBX, 2; 355 356 // MAIN LOOP -- 128 bytes per iteration 357 L_twiceeven: // here MM2 is the carry 358 movq MM0, [ESI + 4*EBX-8]; 359 psrlq MM0, MM4; 360 movq MM1, [ESI + 4*EBX-8]; 361 psllq MM1, MM3; 362 por MM2, MM0; 363 movq [EDI +4*EBX], MM2; 364 L_onceeven: // here MM1 is the carry 365 movq MM0, [ESI + 4*EBX-16]; 366 psrlq MM0, MM4; 367 movq MM2, [ESI + 4*EBX-16]; 368 por MM1, MM0; 369 movq [EDI +4*EBX-8], MM1; 370 psllq MM2, MM3; 371 sub EBX, 4; 372 jg L_twiceeven; 373 L_last: 374 movq [EDI +4*EBX], MM2; 375 L_alldone: 376 emms; // NOTE: costs 6 cycles on Intel CPUs 377 pop EBX; 378 pop EDI; 379 pop ESI; 380 ret 4*4; 381 382 L_length1: 383 // length 1 is a special case 384 movd MM1, [ESI]; 385 psllq MM1, MM3; 386 movd [EDI], MM1; 387 jmp L_alldone; 388 } 389 } 390 391 void multibyteShr(uint [] dest, const uint [] src, uint numbits) pure @safe @nogc 392 { 393 enum { LASTPARAM = 4*4 } // 3* pushes + return address. 394 asm pure nothrow @nogc @trusted { 395 naked; 396 push ESI; 397 push EDI; 398 push EBX; 399 mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr; 400 mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length; 401 align 16; 402 mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr; 403 lea EDI, [EDI + 4*EBX]; // EDI = end of dest 404 lea ESI, [ESI + 4*EBX]; // ESI = end of src 405 neg EBX; // count UP to zero. 406 407 movd MM3, EAX; // numbits = bits to shift right 408 xor EAX, 63; 409 inc EAX; 410 movd MM4, EAX ; // 64-numbits = bits to shift left 411 412 test EBX, 1; 413 jz L_even; 414 L_odd: 415 // deal with odd lengths 416 and EAX, 31; // EAX = 32-numbits 417 movd MM2, EAX; // 32-numbits 418 cmp EBX, -1; 419 jz L_length1; 420 421 movq MM0, [ESI+4*EBX]; 422 psrlq MM0, MM3; 423 movd [EDI +4*EBX], MM0; 424 add EBX, 1; 425 L_even: 426 movq MM2, [ESI + 4*EBX]; 427 psrlq MM2, MM3; 428 429 movq MM1, MM2; 430 add EBX, 4; 431 cmp EBX, -2+4; 432 jz L_last; 433 // It's either singly or doubly even 434 sub EBX, 2; 435 test EBX, 2; 436 jnz L_onceeven; 437 add EBX, 2; 438 439 // MAIN LOOP -- 128 bytes per iteration 440 L_twiceeven: // here MM2 is the carry 441 movq MM0, [ESI + 4*EBX-8]; 442 psllq MM0, MM4; 443 movq MM1, [ESI + 4*EBX-8]; 444 psrlq MM1, MM3; 445 por MM2, MM0; 446 movq [EDI +4*EBX-16], MM2; 447 L_onceeven: // here MM1 is the carry 448 movq MM0, [ESI + 4*EBX]; 449 psllq MM0, MM4; 450 movq MM2, [ESI + 4*EBX]; 451 por MM1, MM0; 452 movq [EDI +4*EBX-8], MM1; 453 psrlq MM2, MM3; 454 add EBX, 4; 455 jl L_twiceeven; 456 L_last: 457 movq [EDI +4*EBX-16], MM2; 458 L_alldone: 459 emms; // NOTE: costs 6 cycles on Intel CPUs 460 pop EBX; 461 pop EDI; 462 pop ESI; 463 ret 4*4; 464 465 L_length1: 466 // length 1 is a special case 467 movd MM1, [ESI+4*EBX]; 468 psrlq MM1, MM3; 469 movd [EDI +4*EBX], MM1; 470 jmp L_alldone; 471 472 } 473 } 474 475 /** dest[#] = src[#] >> numbits 476 * numbits must be in the range 1 .. 31 477 */ 478 void multibyteShrNoMMX(uint [] dest, const uint [] src, uint numbits) pure @safe @nogc 479 { 480 // Timing: Optimal for P6 family. 481 // 2.0 cycles/int on PPro .. PM (limited by execution port p0) 482 // Terrible performance on AMD64, which has 7 cycles for SHRD!! 483 enum { LASTPARAM = 4*4 } // 3* pushes + return address. 484 asm pure nothrow @nogc @trusted { 485 naked; 486 push ESI; 487 push EDI; 488 push EBX; 489 mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr; 490 mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length; 491 mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr; 492 mov ECX, EAX; // numbits; 493 494 lea EDI, [EDI + 4*EBX]; // EDI = end of dest 495 lea ESI, [ESI + 4*EBX]; // ESI = end of src 496 neg EBX; // count UP to zero. 497 mov EAX, [ESI + 4*EBX]; 498 cmp EBX, -1; 499 jz L_last; 500 mov EDX, [ESI + 4*EBX]; 501 test EBX, 1; 502 jz L_odd; 503 add EBX, 1; 504 L_even: 505 mov EDX, [ ESI + 4*EBX]; 506 shrd EAX, EDX, CL; 507 mov [-4 + EDI+4*EBX], EAX; 508 L_odd: 509 mov EAX, [4 + ESI + 4*EBX]; 510 shrd EDX, EAX, CL; 511 mov [EDI + 4*EBX], EDX; 512 add EBX, 2; 513 jl L_even; 514 L_last: 515 shr EAX, CL; 516 mov [-4 + EDI], EAX; 517 518 pop EBX; 519 pop EDI; 520 pop ESI; 521 ret 4*4; 522 } 523 } 524 525 @safe unittest 526 { 527 528 uint [] aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; 529 multibyteShr(aa[0..$-1], aa, 4); 530 assert(aa[0] == 0x6122_2222 && aa[1]==0xA455_5555 531 && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC); 532 533 aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; 534 multibyteShr(aa[2..$-1], aa[2..$-1], 4); 535 assert(aa[0] == 0x1222_2223 && aa[1]==0x4555_5556 536 && aa[2]==0xD899_9999 && aa[3]==0x0BCC_CCCC); 537 538 aa = [0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; 539 multibyteShr(aa[0..$-2], aa, 4); 540 assert(aa[1]==0xA455_5555 && aa[2]==0x0899_9999); 541 assert(aa[0]==0x6122_2222); 542 assert(aa[3]==0xBCCC_CCCD); 543 544 545 aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; 546 uint r = multibyteShl(aa[2 .. 4], aa[2 .. 4], 4); 547 assert(aa[0] == 0xF0FF_FFFF && aa[1]==0x1222_2223 548 && aa[2]==0x5555_5560 && aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD); 549 assert(r == 8); 550 551 aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; 552 r = multibyteShl(aa[1 .. 4], aa[1 .. 4], 4); 553 assert(aa[0] == 0xF0FF_FFFF 554 && aa[2]==0x5555_5561); 555 assert(aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD); 556 assert(r == 8); 557 assert(aa[1]==0x2222_2230); 558 559 aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; 560 r = multibyteShl(aa[0 .. 4], aa[1 .. 5], 31); 561 } 562 563 /** dest[#] = src[#] * multiplier + carry. 564 * Returns carry. 565 */ 566 uint multibyteMul(uint[] dest, const uint[] src, uint multiplier, uint carry) 567 pure @safe @nogc 568 { 569 // Timing: definitely not optimal. 570 // Pentium M: 5.0 cycles/operation, has 3 resource stalls/iteration 571 // Fastest implementation found was 4.6 cycles/op, but not worth the complexity. 572 573 enum { LASTPARAM = 4*4 } // 4* pushes + return address. 574 // We'll use p2 (load unit) instead of the overworked p0 or p1 (ALU units) 575 // when initializing variables to zero. 576 version (D_PIC) 577 { 578 enum { zero = 0 } 579 } 580 else 581 { 582 static immutable int zero = 0; 583 } 584 asm pure nothrow @nogc @trusted { 585 naked; 586 push ESI; 587 push EDI; 588 push EBX; 589 590 mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr 591 mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length 592 mov ESI, [ESP + LASTPARAM + 4*2]; // src.ptr 593 align 16; 594 lea EDI, [EDI + 4*EBX]; // EDI = end of dest 595 lea ESI, [ESI + 4*EBX]; // ESI = end of src 596 mov ECX, EAX; // [carry]; -- last param is in EAX. 597 neg EBX; // count UP to zero. 598 test EBX, 1; 599 jnz L_odd; 600 add EBX, 1; 601 L1: 602 mov EAX, [-4 + ESI + 4*EBX]; 603 mul int ptr [ESP+LASTPARAM]; //[multiplier]; 604 add EAX, ECX; 605 mov ECX, zero; 606 mov [-4+EDI + 4*EBX], EAX; 607 adc ECX, EDX; 608 L_odd: 609 mov EAX, [ESI + 4*EBX]; // p2 610 mul int ptr [ESP+LASTPARAM]; //[multiplier]; // p0*3, 611 add EAX, ECX; 612 mov ECX, zero; 613 adc ECX, EDX; 614 mov [EDI + 4*EBX], EAX; 615 add EBX, 2; 616 jl L1; 617 618 mov EAX, ECX; // get final carry 619 620 pop EBX; 621 pop EDI; 622 pop ESI; 623 ret 5*4; 624 } 625 } 626 627 @system unittest 628 { 629 uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; 630 multibyteMul(aa[1 .. 4], aa[1 .. 4], 16, 0); 631 assert(aa[0] == 0xF0FF_FFFF && aa[1] == 0x2222_2230 && 632 aa[2]==0x5555_5561 && aa[3]==0x9999_99A4 && aa[4]==0x0BCCC_CCCD); 633 } 634 635 // The inner multiply-and-add loop, together with the Even entry point. 636 // Multiples by M_ADDRESS which should be "ESP+LASTPARAM" or "ESP". OP must be "add" or "sub" 637 // This is the most time-critical code in the BigInt library. 638 // It is used by both MulAdd, multiplyAccumulate, and triangleAccumulate 639 string asmMulAdd_innerloop(string OP, string M_ADDRESS) pure @safe { 640 // The bottlenecks in this code are extremely complicated. The MUL, ADD, and ADC 641 // need 4 cycles on each of the ALUs units p0 and p1. So we use memory load 642 // (unit p2) for initializing registers to zero. 643 // There are also dependencies between the instructions, and we run up against the 644 // ROB-read limit (can only read 2 registers per cycle). 645 // We also need the number of uops in the loop to be a multiple of 3. 646 // The only available execution unit for this is p3 (memory write). Unfortunately we can't do that 647 // if Position-Independent Code is required. 648 649 // Register usage 650 // ESI = end of src 651 // EDI = end of dest 652 // EBX = index. Counts up to zero (in steps of 2). 653 // EDX:EAX = scratch, used in multiply. 654 // ECX = carry1. 655 // EBP = carry2. 656 // ESP = points to the multiplier. 657 658 // The first member of 'dest' which will be modified is [EDI+4*EBX]. 659 // EAX must already contain the first member of 'src', [ESI+4*EBX]. 660 661 version (D_PIC) { bool using_PIC = true; } else { bool using_PIC = false; } 662 return " 663 // Entry point for even length 664 add EBX, 1; 665 mov EBP, ECX; // carry 666 667 mul int ptr [" ~ M_ADDRESS ~ "]; // M 668 mov ECX, 0; 669 670 add EBP, EAX; 671 mov EAX, [ESI+4*EBX]; 672 adc ECX, EDX; 673 674 mul int ptr [" ~ M_ADDRESS ~ "]; // M 675 " ~ OP ~ " [-4+EDI+4*EBX], EBP; 676 mov EBP, zero; 677 678 adc ECX, EAX; 679 mov EAX, [4+ESI+4*EBX]; 680 681 adc EBP, EDX; 682 add EBX, 2; 683 jnl L_done; 684 L1: 685 mul int ptr [" ~ M_ADDRESS ~ "]; 686 " ~ OP ~ " [-8+EDI+4*EBX], ECX; 687 adc EBP, EAX; 688 mov ECX, zero; 689 mov EAX, [ESI+4*EBX]; 690 adc ECX, EDX; 691 " ~ 692 (using_PIC ? "" : " mov storagenop, EDX; ") // make #uops in loop a multiple of 3, can't do this in PIC mode. 693 ~ " 694 mul int ptr [" ~ M_ADDRESS ~ "]; 695 " ~ OP ~ " [-4+EDI+4*EBX], EBP; 696 mov EBP, zero; 697 698 adc ECX, EAX; 699 mov EAX, [4+ESI+4*EBX]; 700 701 adc EBP, EDX; 702 add EBX, 2; 703 jl L1; 704 L_done: " ~ OP ~ " [-8+EDI+4*EBX], ECX; 705 adc EBP, 0; 706 "; 707 // final carry is now in EBP 708 } 709 710 string asmMulAdd_enter_odd(string OP, string M_ADDRESS) pure @safe 711 { 712 return " 713 mul int ptr [" ~M_ADDRESS ~"]; 714 mov EBP, zero; 715 add ECX, EAX; 716 mov EAX, [4+ESI+4*EBX]; 717 718 adc EBP, EDX; 719 add EBX, 2; 720 jl L1; 721 jmp L_done; 722 "; 723 } 724 725 version (D_PIC) {} else 726 { 727 // This cannot be declared inside the relevant functions because __gshared is 728 // not allowed in @safe functions. It cannot be regular shared either since 729 // that gives an access violation on Win32 for some reason. 730 private __gshared int storagenopTriangleAccumulate; 731 private __gshared int storagenopMulAdd; 732 private __gshared int storagenopMultiplyAccumulate; 733 } 734 735 /** 736 * dest[#] += src[#] * multiplier OP carry(0 .. FFFF_FFFF). 737 * where op == '+' or '-' 738 * Returns carry out of MSB (0 .. FFFF_FFFF). 739 */ 740 uint multibyteMulAdd(char op)(uint [] dest, const uint [] src, uint 741 multiplier, uint carry) pure @safe @nogc { 742 // Timing: This is the most time-critical bignum function. 743 // Pentium M: 5.4 cycles/operation, still has 2 resource stalls + 1load block/iteration 744 745 // The main loop is pipelined and unrolled by 2, 746 // so entry to the loop is also complicated. 747 748 // Register usage 749 // EDX:EAX = multiply 750 // EBX = counter 751 // ECX = carry1 752 // EBP = carry2 753 // EDI = dest 754 // ESI = src 755 756 enum string OP = (op=='+')? "add" : "sub"; 757 version (D_PIC) 758 { 759 enum { zero = 0 } 760 } 761 else 762 { 763 // use p2 (load unit) instead of the overworked p0 or p1 (ALU units) 764 // when initializing registers to zero. 765 static immutable int zero = 0; 766 // use p3/p4 units 767 alias storagenop = storagenopMulAdd; // write-only 768 } 769 770 enum { LASTPARAM = 5*4 } // 4* pushes + return address. 771 asm pure nothrow @nogc @trusted { 772 naked; 773 774 push ESI; 775 push EDI; 776 push EBX; 777 push EBP; 778 mov EDI, [ESP + LASTPARAM + 4*4]; // dest.ptr 779 mov EBX, [ESP + LASTPARAM + 4*3]; // dest.length 780 align 16; 781 nop; 782 mov ESI, [ESP + LASTPARAM + 4*2]; // src.ptr 783 lea EDI, [EDI + 4*EBX]; // EDI = end of dest 784 lea ESI, [ESI + 4*EBX]; // ESI = end of src 785 mov EBP, 0; 786 mov ECX, EAX; // ECX = input carry. 787 neg EBX; // count UP to zero. 788 mov EAX, [ESI+4*EBX]; 789 test EBX, 1; 790 jnz L_enter_odd; 791 } 792 // Main loop, with entry point for even length 793 mixin("asm pure nothrow @nogc @trusted {" ~ asmMulAdd_innerloop(OP, "ESP+LASTPARAM") ~ "}"); 794 asm pure nothrow @nogc @trusted { 795 mov EAX, EBP; // get final carry 796 pop EBP; 797 pop EBX; 798 pop EDI; 799 pop ESI; 800 ret 5*4; 801 } 802 L_enter_odd: 803 mixin("asm pure nothrow @nogc @trusted {" ~ asmMulAdd_enter_odd(OP, "ESP+LASTPARAM") ~ "}"); 804 } 805 806 @system unittest 807 { 808 809 uint [] aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; 810 uint [] bb = [0x1234_1234, 0xF0F0_F0F0, 0x00C0_C0C0, 0xF0F0_F0F0, 0xC0C0_C0C0]; 811 multibyteMulAdd!('+')(bb[1..$-1], aa[1..$-2], 16, 5); 812 assert(bb[0] == 0x1234_1234 && bb[4] == 0xC0C0_C0C0); 813 assert(bb[1] == 0x2222_2230 + 0xF0F0_F0F0+5 && bb[2] == 0x5555_5561+0x00C0_C0C0+1 814 && bb[3] == 0x9999_99A4+0xF0F0_F0F0 ); 815 } 816 817 /** 818 Sets result[#] = result[0 .. left.length] + left[#] * right[#] 819 820 It is defined in this way to allow cache-efficient multiplication. 821 This function is equivalent to: 822 ---- 823 for (int i = 0; i< right.length; ++i) 824 { 825 dest[left.length + i] = multibyteMulAdd(dest[i .. left.length+i], 826 left, right[i], 0); 827 } 828 ---- 829 */ 830 void multibyteMultiplyAccumulate(uint [] dest, const uint[] left, 831 const uint [] right) pure @safe @nogc { 832 // Register usage 833 // EDX:EAX = used in multiply 834 // EBX = index 835 // ECX = carry1 836 // EBP = carry2 837 // EDI = end of dest for this pass through the loop. Index for outer loop. 838 // ESI = end of left. never changes 839 // [ESP] = M = right[i] = multiplier for this pass through the loop. 840 // right.length is changed into dest.ptr+dest.length 841 version (D_PIC) 842 { 843 enum { zero = 0 } 844 } 845 else 846 { 847 // use p2 (load unit) instead of the overworked p0 or p1 (ALU units) 848 // when initializing registers to zero. 849 static immutable int zero = 0; 850 // use p3/p4 units 851 alias storagenop = storagenopMultiplyAccumulate; // write-only 852 } 853 854 enum { LASTPARAM = 6*4 } // 4* pushes + local + return address. 855 asm pure nothrow @nogc @trusted { 856 naked; 857 858 push ESI; 859 push EDI; 860 align 16; 861 push EBX; 862 push EBP; 863 push EAX; // local variable M 864 mov EDI, [ESP + LASTPARAM + 4*5]; // dest.ptr 865 mov EBX, [ESP + LASTPARAM + 4*2]; // left.length 866 mov ESI, [ESP + LASTPARAM + 4*3]; // left.ptr 867 lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass 868 869 mov EAX, [ESP + LASTPARAM + 4*0]; // right.length 870 lea EAX, [EDI + 4*EAX]; 871 mov [ESP + LASTPARAM + 4*0], EAX; // last value for EDI 872 873 lea ESI, [ESI + 4*EBX]; // ESI = end of left 874 mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr 875 mov EAX, [EAX]; 876 mov [ESP], EAX; // M 877 outer_loop: 878 mov EBP, 0; 879 mov ECX, 0; // ECX = input carry. 880 neg EBX; // count UP to zero. 881 mov EAX, [ESI+4*EBX]; 882 test EBX, 1; 883 jnz L_enter_odd; 884 } 885 // -- Inner loop, with even entry point 886 mixin("asm pure nothrow @nogc @trusted { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}"); 887 asm pure nothrow @nogc @trusted { 888 mov [-4+EDI+4*EBX], EBP; 889 add EDI, 4; 890 cmp EDI, [ESP + LASTPARAM + 4*0]; // is EDI = &dest[$]? 891 jz outer_done; 892 mov EAX, [ESP + LASTPARAM + 4*1]; // right.ptr 893 mov EAX, [EAX+4]; // get new M 894 mov [ESP], EAX; // save new M 895 add int ptr [ESP + LASTPARAM + 4*1], 4; // right.ptr 896 mov EBX, [ESP + LASTPARAM + 4*2]; // left.length 897 jmp outer_loop; 898 outer_done: 899 pop EAX; 900 pop EBP; 901 pop EBX; 902 pop EDI; 903 pop ESI; 904 ret 6*4; 905 } 906 L_enter_odd: 907 mixin("asm pure nothrow @nogc @trusted {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}"); 908 } 909 910 /** dest[#] /= divisor. 911 * overflow is the initial remainder, and must be in the range 0 .. divisor-1. 912 * divisor must not be a power of 2 (use right shift for that case; 913 * A division by zero will occur if divisor is a power of 2). 914 * Returns the final remainder 915 * 916 * Based on public domain code by Eric Bainville. 917 * (http://www.bealto.com/) Used with permission. 918 */ 919 uint multibyteDivAssign(uint [] dest, uint divisor, uint overflow) pure @safe @nogc 920 { 921 // Timing: limited by a horrible dependency chain. 922 // Pentium M: 18 cycles/op, 8 resource stalls/op. 923 // EAX, EDX = scratch, used by MUL 924 // EDI = dest 925 // CL = shift 926 // ESI = quotient 927 // EBX = remainderhi 928 // EBP = remainderlo 929 // [ESP-4] = mask 930 // [ESP] = kinv (2^64 /divisor) 931 enum { LASTPARAM = 5*4 } // 4* pushes + return address. 932 enum { LOCALS = 2*4} // MASK, KINV 933 asm pure nothrow @nogc @trusted { 934 naked; 935 936 push ESI; 937 push EDI; 938 push EBX; 939 push EBP; 940 941 mov EDI, [ESP + LASTPARAM + 4*2]; // dest.ptr 942 mov EBX, [ESP + LASTPARAM + 4*1]; // dest.length 943 944 // Loop from msb to lsb 945 lea EDI, [EDI + 4*EBX]; 946 mov EBP, EAX; // rem is the input remainder, in 0 .. divisor-1 947 // Build the pseudo-inverse of divisor k: 2^64/k 948 // First determine the shift in ecx to get the max number of bits in kinv 949 xor ECX, ECX; 950 mov EAX, [ESP + LASTPARAM]; //divisor; 951 mov EDX, 1; 952 kinv1: 953 inc ECX; 954 ror EDX, 1; 955 shl EAX, 1; 956 jnc kinv1; 957 dec ECX; 958 // Here, ecx is a left shift moving the msb of k to bit 32 959 960 mov EAX, 1; 961 shl EAX, CL; 962 dec EAX; 963 ror EAX, CL ; //ecx bits at msb 964 push EAX; // MASK 965 966 // Then divide 2^(32+cx) by divisor (edx already ok) 967 xor EAX, EAX; 968 div int ptr [ESP + LASTPARAM + LOCALS-4*1]; //divisor; 969 push EAX; // kinv 970 align 16; 971 L2: 972 // Get 32 bits of quotient approx, multiplying 973 // most significant word of (rem*2^32+input) 974 mov EAX, [ESP+4]; //MASK; 975 and EAX, [EDI - 4]; 976 or EAX, EBP; 977 rol EAX, CL; 978 mov EBX, EBP; 979 mov EBP, [EDI - 4]; 980 mul int ptr [ESP]; //KINV; 981 982 shl EAX, 1; 983 rcl EDX, 1; 984 985 // Multiply by k and subtract to get remainder 986 // Subtraction must be done on two words 987 mov EAX, EDX; 988 mov ESI, EDX; // quot = high word 989 mul int ptr [ESP + LASTPARAM+LOCALS]; //divisor; 990 sub EBP, EAX; 991 sbb EBX, EDX; 992 jz Lb; // high word is 0, goto adjust on single word 993 994 // Adjust quotient and remainder on two words 995 Ld: inc ESI; 996 sub EBP, [ESP + LASTPARAM+LOCALS]; //divisor; 997 sbb EBX, 0; 998 jnz Ld; 999 1000 // Adjust quotient and remainder on single word 1001 Lb: cmp EBP, [ESP + LASTPARAM+LOCALS]; //divisor; 1002 jc Lc; // rem in 0 .. divisor-1, OK 1003 sub EBP, [ESP + LASTPARAM+LOCALS]; //divisor; 1004 inc ESI; 1005 jmp Lb; 1006 1007 // Store result 1008 Lc: 1009 mov [EDI - 4], ESI; 1010 lea EDI, [EDI - 4]; 1011 dec int ptr [ESP + LASTPARAM + 4*1+LOCALS]; // len 1012 jnz L2; 1013 1014 pop EAX; // discard kinv 1015 pop EAX; // discard mask 1016 1017 mov EAX, EBP; // return final remainder 1018 pop EBP; 1019 pop EBX; 1020 pop EDI; 1021 pop ESI; 1022 ret 3*4; 1023 } 1024 } 1025 1026 @safe unittest 1027 { 1028 uint [] aa = new uint[101]; 1029 for (int i=0; i<aa.length; ++i) aa[i] = 0x8765_4321 * (i+3); 1030 uint overflow = multibyteMul(aa, aa, 0x8EFD_FCFB, 0x33FF_7461); 1031 uint r = multibyteDivAssign(aa, 0x8EFD_FCFB, overflow); 1032 for (int i=0; i<aa.length-1; ++i) assert(aa[i] == 0x8765_4321 * (i+3)); 1033 assert(r == 0x33FF_7461); 1034 } 1035 1036 // Set dest[2*i .. 2*i+1]+=src[i]*src[i] 1037 void multibyteAddDiagonalSquares(uint [] dest, const uint [] src) pure @safe @nogc 1038 { 1039 /* Unlike mulAdd, the carry is only 1 bit, 1040 since FFFF*FFFF+FFFF_FFFF = 1_0000_0000. 1041 Note also that on the last iteration, no carry can occur. 1042 As for multibyteAdd, we save & restore carry flag through the loop. 1043 1044 The timing is entirely dictated by the dependency chain. We could 1045 improve it by moving the mov EAX after the adc [EDI], EAX. Probably not worthwhile. 1046 */ 1047 enum { LASTPARAM = 4*5 } // 4* pushes + return address. 1048 asm pure nothrow @nogc @trusted { 1049 naked; 1050 push ESI; 1051 push EDI; 1052 push EBX; 1053 push ECX; 1054 mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr; 1055 mov EBX, [ESP + LASTPARAM + 4*0]; //src.length; 1056 mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr; 1057 lea EDI, [EDI + 8*EBX]; // EDI = end of dest 1058 lea ESI, [ESI + 4*EBX]; // ESI = end of src 1059 neg EBX; // count UP to zero. 1060 xor ECX, ECX; // initial carry = 0. 1061 L1: 1062 mov EAX, [ESI + 4*EBX]; 1063 mul EAX, EAX; 1064 shr CL, 1; // get carry 1065 adc [EDI + 8*EBX], EAX; 1066 adc [EDI + 8*EBX + 4], EDX; 1067 setc CL; // save carry 1068 inc EBX; 1069 jnz L1; 1070 1071 pop ECX; 1072 pop EBX; 1073 pop EDI; 1074 pop ESI; 1075 ret 4*4; 1076 } 1077 } 1078 1079 @safe unittest 1080 { 1081 uint [] aa = new uint[13]; 1082 uint [] bb = new uint[6]; 1083 for (int i=0; i<aa.length; ++i) aa[i] = 0x8000_0000; 1084 for (int i=0; i<bb.length; ++i) bb[i] = i; 1085 aa[$-1]= 7; 1086 multibyteAddDiagonalSquares(aa[0..$-1], bb); 1087 assert(aa[$-1]==7); 1088 for (int i=0; i<bb.length; ++i) { assert(aa[2*i]==0x8000_0000+i*i); assert(aa[2*i+1]==0x8000_0000); } 1089 } 1090 1091 void multibyteTriangleAccumulateD(uint[] dest, uint[] x) pure @safe @nogc 1092 { 1093 for (int i = 0; i < x.length-3; ++i) 1094 { 1095 dest[i+x.length] = multibyteMulAdd!('+')( 1096 dest[i+i+1 .. i+x.length], x[i+1..$], x[i], 0); 1097 } 1098 ulong c = cast(ulong)(x[$-3]) * x[$-2] + dest[$-5]; 1099 dest[$-5] = cast(uint) c; 1100 c >>= 32; 1101 c += cast(ulong)(x[$-3]) * x[$-1] + dest[$-4]; 1102 dest[$-4] = cast(uint) c; 1103 c >>= 32; 1104 length2: 1105 c += cast(ulong)(x[$-2]) * x[$-1]; 1106 dest[$-3] = cast(uint) c; 1107 c >>= 32; 1108 dest[$-2] = cast(uint) c; 1109 } 1110 1111 //dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1] 1112 // assert(dest.length = src.length*2); 1113 // assert(src.length >= 3); 1114 void multibyteTriangleAccumulateAsm(uint[] dest, const uint[] src) pure @safe @nogc 1115 { 1116 // Register usage 1117 // EDX:EAX = used in multiply 1118 // EBX = index 1119 // ECX = carry1 1120 // EBP = carry2 1121 // EDI = end of dest for this pass through the loop. Index for outer loop. 1122 // ESI = end of src. never changes 1123 // [ESP] = M = src[i] = multiplier for this pass through the loop. 1124 // dest.length is changed into dest.ptr+dest.length 1125 version (D_PIC) 1126 { 1127 enum { zero = 0 } 1128 } 1129 else 1130 { 1131 // use p2 (load unit) instead of the overworked p0 or p1 (ALU units) 1132 // when initializing registers to zero. 1133 static immutable int zero = 0; 1134 // use p3/p4 units 1135 alias storagenop = storagenopTriangleAccumulate; // write-only 1136 } 1137 1138 enum { LASTPARAM = 6*4 } // 4* pushes + local + return address. 1139 asm pure nothrow @nogc @trusted { 1140 naked; 1141 1142 push ESI; 1143 push EDI; 1144 align 16; 1145 push EBX; 1146 push EBP; 1147 push EAX; // local variable M= src[i] 1148 mov EDI, [ESP + LASTPARAM + 4*3]; // dest.ptr 1149 mov EBX, [ESP + LASTPARAM + 4*0]; // src.length 1150 mov ESI, [ESP + LASTPARAM + 4*1]; // src.ptr 1151 1152 lea ESI, [ESI + 4*EBX]; // ESI = end of left 1153 add int ptr [ESP + LASTPARAM + 4*1], 4; // src.ptr, used for getting M 1154 1155 // local variable [ESP + LASTPARAM + 4*2] = last value for EDI 1156 lea EDI, [EDI + 4*EBX]; // EDI = end of dest for first pass 1157 1158 lea EAX, [EDI + 4*EBX-3*4]; // up to src.length - 3 1159 mov [ESP + LASTPARAM + 4*2], EAX; // last value for EDI = &dest[src.length*2 -3] 1160 1161 cmp EBX, 3; 1162 jz length_is_3; 1163 1164 // We start at src[1], not src[0]. 1165 dec EBX; 1166 mov [ESP + LASTPARAM + 4*0], EBX; 1167 1168 outer_loop: 1169 mov EBX, [ESP + LASTPARAM + 4*0]; // src.length 1170 mov EBP, 0; 1171 mov ECX, 0; // ECX = input carry. 1172 dec int ptr [ESP + LASTPARAM + 4*0]; // Next time, the length will be shorter by 1. 1173 neg EBX; // count UP to zero. 1174 1175 mov EAX, [ESI + 4*EBX - 4*1]; // get new M 1176 mov [ESP], EAX; // save new M 1177 1178 mov EAX, [ESI+4*EBX]; 1179 test EBX, 1; 1180 jnz L_enter_odd; 1181 } 1182 // -- Inner loop, with even entry point 1183 mixin("asm pure nothrow @nogc @trusted { " ~ asmMulAdd_innerloop("add", "ESP") ~ "}"); 1184 asm pure nothrow @nogc @trusted { 1185 mov [-4+EDI+4*EBX], EBP; 1186 add EDI, 4; 1187 cmp EDI, [ESP + LASTPARAM + 4*2]; // is EDI = &dest[$-3]? 1188 jnz outer_loop; 1189 length_is_3: 1190 mov EAX, [ESI - 4*3]; 1191 mul EAX, [ESI - 4*2]; 1192 mov ECX, 0; 1193 add [EDI-2*4], EAX; // ECX:dest[$-5] += x[$-3] * x[$-2] 1194 adc ECX, EDX; 1195 1196 mov EAX, [ESI - 4*3]; 1197 mul EAX, [ESI - 4*1]; // x[$-3] * x[$-1] 1198 add EAX, ECX; 1199 mov ECX, 0; 1200 adc EDX, 0; 1201 // now EDX: EAX = c + x[$-3] * x[$-1] 1202 add [EDI-1*4], EAX; // ECX:dest[$-4] += (EDX:EAX) 1203 adc ECX, EDX; // ECX holds dest[$-3], it acts as carry for the last row 1204 // do length == 2 1205 mov EAX, [ESI - 4*2]; 1206 mul EAX, [ESI - 4*1]; 1207 add ECX, EAX; 1208 adc EDX, 0; 1209 mov [EDI - 0*4], ECX; // dest[$-2:$-3] = c + x[$-2] * x[$-1]; 1210 mov [EDI + 1*4], EDX; 1211 1212 pop EAX; 1213 pop EBP; 1214 pop EBX; 1215 pop EDI; 1216 pop ESI; 1217 ret 4*4; 1218 } 1219 L_enter_odd: 1220 mixin("asm pure nothrow @nogc @trusted {" ~ asmMulAdd_enter_odd("add", "ESP") ~ "}"); 1221 } 1222 1223 @safe unittest 1224 { 1225 uint [] aa = new uint[200]; 1226 uint [] a = aa[0 .. 100]; 1227 uint [] b = new uint [100]; 1228 aa[] = 761; 1229 a[] = 0; 1230 b[] = 0; 1231 a[3] = 6; 1232 b[0]=1; 1233 b[1] = 17; 1234 b[50 .. 100]=78; 1235 multibyteTriangleAccumulateAsm(a, b[0 .. 50]); 1236 uint [] c = new uint[100]; 1237 c[] = 0; 1238 c[1] = 17; 1239 c[3] = 6; 1240 assert(a[]==c[]); 1241 assert(a[0]==0); 1242 aa[] = 0xFFFF_FFFF; 1243 a[] = 0; 1244 b[] = 0; 1245 b[0]= 0xbf6a1f01; 1246 b[1]= 0x6e38ed64; 1247 b[2]= 0xdaa797ed; 1248 b[3] = 0; 1249 1250 multibyteTriangleAccumulateAsm(a[0 .. 8], b[0 .. 4]); 1251 assert(a[1]==0x3a600964); 1252 assert(a[2]==0x339974f6); 1253 assert(a[3]==0x46736fce); 1254 assert(a[4]==0x5e24a2b4); 1255 1256 b[3] = 0xe93ff9f4; 1257 b[4] = 0x184f03; 1258 a[]=0; 1259 multibyteTriangleAccumulateAsm(a[0 .. 14], b[0 .. 7]); 1260 assert(a[3]==0x79fff5c2); 1261 assert(a[4]==0xcf384241); 1262 assert(a[5]== 0x4a17fc8); 1263 assert(a[6]==0x4d549025); 1264 } 1265 1266 1267 void multibyteSquare(BigDigit[] result, const BigDigit [] x) pure @safe @nogc 1268 { 1269 if (x.length < 4) 1270 { 1271 // Special cases, not worth doing triangular. 1272 result[x.length] = multibyteMul(result[0 .. x.length], x, x[0], 0); 1273 multibyteMultiplyAccumulate(result[1..$], x, x[1..$]); 1274 return; 1275 } 1276 // Do half a square multiply. 1277 // dest += src[0]*src[1...$] + src[1]*src[2..$] + ... + src[$-3]*src[$-2..$]+ src[$-2]*src[$-1] 1278 result[x.length] = multibyteMul(result[1 .. x.length], x[1..$], x[0], 0); 1279 multibyteTriangleAccumulateAsm(result[2..$], x[1..$]); 1280 // Multiply by 2 1281 result[$-1] = multibyteShlNoMMX(result[1..$-1], result[1..$-1], 1); 1282 // And add the diagonal elements 1283 result[0] = 0; 1284 multibyteAddDiagonalSquares(result, x); 1285 } 1286 1287 version (BignumPerformanceTest) 1288 { 1289 import core.stdc.stdio; 1290 int clock() { asm { push EBX; xor EAX, EAX; cpuid; pop EBX; rdtsc; } } 1291 1292 __gshared uint [2200] X1; 1293 __gshared uint [2200] Y1; 1294 __gshared uint [4000] Z1; 1295 1296 void testPerformance() pure 1297 { 1298 // The performance results at the top of this file were obtained using 1299 // a Windows device driver to access the CPU performance counters. 1300 // The code below is less accurate but more widely usable. 1301 // The value for division is quite inconsistent. 1302 for (int i=0; i<X1.length; ++i) { X1[i]=i; Y1[i]=i; Z1[i]=i; } 1303 int t, t0; 1304 multibyteShl(Z1[0 .. 2000], X1[0 .. 2000], 7); 1305 t0 = clock(); 1306 multibyteShl(Z1[0 .. 1000], X1[0 .. 1000], 7); 1307 t = clock(); 1308 multibyteShl(Z1[0 .. 2000], X1[0 .. 2000], 7); 1309 auto shltime = (clock() - t) - (t - t0); 1310 t0 = clock(); 1311 multibyteShr(Z1[2 .. 1002], X1[4 .. 1004], 13); 1312 t = clock(); 1313 multibyteShr(Z1[2 .. 2002], X1[4 .. 2004], 13); 1314 auto shrtime = (clock() - t) - (t - t0); 1315 t0 = clock(); 1316 multibyteAddSub!('+')(Z1[0 .. 1000], X1[0 .. 1000], Y1[0 .. 1000], 0); 1317 t = clock(); 1318 multibyteAddSub!('+')(Z1[0 .. 2000], X1[0 .. 2000], Y1[0 .. 2000], 0); 1319 auto addtime = (clock() - t) - (t-t0); 1320 t0 = clock(); 1321 multibyteMul(Z1[0 .. 1000], X1[0 .. 1000], 7, 0); 1322 t = clock(); 1323 multibyteMul(Z1[0 .. 2000], X1[0 .. 2000], 7, 0); 1324 auto multime = (clock() - t) - (t - t0); 1325 multibyteMulAdd!('+')(Z1[0 .. 2000], X1[0 .. 2000], 217, 0); 1326 t0 = clock(); 1327 multibyteMulAdd!('+')(Z1[0 .. 1000], X1[0 .. 1000], 217, 0); 1328 t = clock(); 1329 multibyteMulAdd!('+')(Z1[0 .. 2000], X1[0 .. 2000], 217, 0); 1330 auto muladdtime = (clock() - t) - (t - t0); 1331 multibyteMultiplyAccumulate(Z1[0 .. 64], X1[0 .. 32], Y1[0 .. 32]); 1332 t = clock(); 1333 multibyteMultiplyAccumulate(Z1[0 .. 64], X1[0 .. 32], Y1[0 .. 32]); 1334 auto accumtime = clock() - t; 1335 t0 = clock(); 1336 multibyteDivAssign(Z1[0 .. 2000], 217, 0); 1337 t = clock(); 1338 multibyteDivAssign(Z1[0 .. 1000], 37, 0); 1339 auto divtime = (t - t0) - (clock() - t); 1340 t= clock(); 1341 multibyteSquare(Z1[0 .. 64], X1[0 .. 32]); 1342 auto squaretime = clock() - t; 1343 1344 printf("-- BigInt asm performance (cycles/int) --\n"); 1345 printf("Add: %.2f\n", addtime/1000.0); 1346 printf("Shl: %.2f\n", shltime/1000.0); 1347 printf("Shr: %.2f\n", shrtime/1000.0); 1348 printf("Mul: %.2f\n", multime/1000.0); 1349 printf("MulAdd: %.2f\n", muladdtime/1000.0); 1350 printf("Div: %.2f\n", divtime/1000.0); 1351 printf("MulAccum32: %.2f*n*n (total %d)\n", accumtime/(32.0*32.0), accumtime); 1352 printf("Square32: %.2f*n*n (total %d)\n\n", squaretime/(32.0*32.0), squaretime); 1353 } 1354 1355 static this() 1356 { 1357 testPerformance(); 1358 } 1359 } 1360 1361 } // version (D_InlineAsm_X86)