The world's most popular open source database
00001 /* integer.cpp 00002 * 00003 * Copyright (C) 2003 Sawtooth Consulting Ltd. 00004 * 00005 * This file is part of yaSSL. 00006 * 00007 * yaSSL is free software; you can redistribute it and/or modify 00008 * it under the terms of the GNU General Public License as published by 00009 * the Free Software Foundation; either version 2 of the License, or 00010 * (at your option) any later version. 00011 * 00012 * There are special exceptions to the terms and conditions of the GPL as it 00013 * is applied to yaSSL. View the full text of the exception in the file 00014 * FLOSS-EXCEPTIONS in the directory of this software distribution. 00015 * 00016 * yaSSL is distributed in the hope that it will be useful, 00017 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00018 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00019 * GNU General Public License for more details. 00020 * 00021 * You should have received a copy of the GNU General Public License 00022 * along with this program; if not, write to the Free Software 00023 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA 00024 */ 00025 00026 00027 00028 /* based on Wei Dai's integer.cpp from CryptoPP */ 00029 00030 #include "runtime.hpp" 00031 #include "integer.hpp" 00032 #include "modarith.hpp" 00033 #include "asn.hpp" 00034 00035 00036 00037 #ifdef __DECCXX 00038 #include <c_asm.h> // for asm overflow assembly 00039 #endif 00040 00041 00042 // 64bit multiply overflow intrinsic 00043 #if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER) && \ 00044 !defined(TAOCRYPT_NATIVE_DWORD_AVAILABLE) 00045 #ifdef __ia64__ 00046 #define myUMULH __UMULH 00047 #elif __x86_64__ 00048 #define myUMULH __umulh 00049 #else 00050 #error unknown 64bit windows 00051 #endif 00052 00053 extern "C" word myUMULH(word, word); 00054 00055 #pragma intrinsic (myUMULH) 00056 #endif 00057 00058 00059 #ifdef SSE2_INTRINSICS_AVAILABLE 00060 #ifdef __GNUC__ 00061 #include <xmmintrin.h> 00062 #include <signal.h> 00063 #include <setjmp.h> 00064 #ifdef TAOCRYPT_MEMALIGN_AVAILABLE 00065 #include <malloc.h> 00066 #else 00067 #include <stdlib.h> 00068 #endif 00069 #else 00070 #include <emmintrin.h> 00071 #endif 00072 #elif defined(_MSC_VER) && defined(_M_IX86) 00073 #pragma message("You do not seem to have the Visual C++ Processor Pack ") 00074 #pragma message("installed, so use of SSE2 intrinsics will be disabled.") 00075 #elif defined(__GNUC__) && defined(__i386__) 00076 /* #warning You do not have GCC 3.3 or later, or did not specify the -msse2 \ 00077 compiler option. Use of SSE2 intrinsics will be disabled. 00078 */ 00079 #endif 00080 00081 00082 namespace TaoCrypt { 00083 00084 00085 #ifdef SSE2_INTRINSICS_AVAILABLE 00086 00087 template <class T> 00088 CPP_TYPENAME AllocatorBase<T>::pointer AlignedAllocator<T>::allocate( 00089 size_type n, const void *) 00090 { 00091 CheckSize(n); 00092 if (n == 0) 00093 return 0; 00094 if (n >= 4) 00095 { 00096 void* p; 00097 #ifdef TAOCRYPT_MM_MALLOC_AVAILABLE 00098 p = _mm_malloc(sizeof(T)*n, 16); 00099 #elif defined(TAOCRYPT_MEMALIGN_AVAILABLE) 00100 p = memalign(16, sizeof(T)*n); 00101 #elif defined(TAOCRYPT_MALLOC_ALIGNMENT_IS_16) 00102 p = malloc(sizeof(T)*n); 00103 #else 00104 p = (byte *)malloc(sizeof(T)*n + 8); 00105 // assume malloc alignment is at least 8 00106 #endif 00107 00108 #ifdef TAOCRYPT_NO_ALIGNED_ALLOC 00109 assert(m_pBlock == 0); 00110 m_pBlock = p; 00111 if (!IsAlignedOn(p, 16)) 00112 { 00113 assert(IsAlignedOn(p, 8)); 00114 p = (byte *)p + 8; 00115 } 00116 #endif 00117 00118 assert(IsAlignedOn(p, 16)); 00119 return (T*)p; 00120 } 00121 return NEW_TC T[n]; 00122 } 00123 00124 00125 template <class T> 00126 void AlignedAllocator<T>::deallocate(void* p, size_type n) 00127 { 00128 memset(p, 0, n*sizeof(T)); 00129 if (n >= 4) 00130 { 00131 #ifdef TAOCRYPT_MM_MALLOC_AVAILABLE 00132 _mm_free(p); 00133 #elif defined(TAOCRYPT_NO_ALIGNED_ALLOC) 00134 assert(m_pBlock == p || (byte*)m_pBlock+8 == p); 00135 free(m_pBlock); 00136 m_pBlock = 0; 00137 #else 00138 free(p); 00139 #endif 00140 } 00141 else 00142 tcArrayDelete((T *)p); 00143 } 00144 00145 #endif // SSE2 00146 00147 00148 // ******** start of integer needs 00149 00150 // start 5.2.1 adds DWord and Word ******** 00151 00152 // ******************************************************** 00153 00154 class DWord { 00155 public: 00156 DWord() {} 00157 00158 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE 00159 explicit DWord(word low) 00160 { 00161 whole_ = low; 00162 } 00163 #else 00164 explicit DWord(word low) 00165 { 00166 halfs_.low = low; 00167 halfs_.high = 0; 00168 } 00169 #endif 00170 00171 DWord(word low, word high) 00172 { 00173 halfs_.low = low; 00174 halfs_.high = high; 00175 } 00176 00177 static DWord Multiply(word a, word b) 00178 { 00179 DWord r; 00180 00181 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE 00182 r.whole_ = (dword)a * b; 00183 00184 #elif defined(_MSC_VER) 00185 r.halfs_.low = a*b; 00186 r.halfs_.high = myUMULH(a,b); 00187 00188 #elif defined(__alpha__) 00189 r.halfs_.low = a*b; 00190 #ifdef __GNUC__ 00191 __asm__("umulh %1,%2,%0" : "=r" (r.halfs_.high) 00192 : "r" (a), "r" (b)); 00193 #elif defined(__DECCXX) 00194 r.halfs_.high = asm("umulh %a0, %a1, %v0", a, b); 00195 #else 00196 #error unknown alpha compiler 00197 #endif 00198 00199 #elif defined(__ia64__) 00200 r.halfs_.low = a*b; 00201 __asm__("xmpy.hu %0=%1,%2" : "=f" (r.halfs_.high) 00202 : "f" (a), "f" (b)); 00203 00204 #elif defined(_ARCH_PPC64) 00205 r.halfs_.low = a*b; 00206 __asm__("mulhdu %0,%1,%2" : "=r" (r.halfs_.high) 00207 : "r" (a), "r" (b) : "cc"); 00208 00209 #elif defined(__x86_64__) 00210 __asm__("mulq %3" : "=d" (r.halfs_.high), "=a" (r.halfs_.low) : 00211 "a" (a), "rm" (b) : "cc"); 00212 00213 #elif defined(__mips64) 00214 __asm__("dmultu %2,%3" : "=h" (r.halfs_.high), "=l" (r.halfs_.low) 00215 : "r" (a), "r" (b)); 00216 00217 #elif defined(_M_IX86) 00218 // for testing 00219 word64 t = (word64)a * b; 00220 r.halfs_.high = ((word32 *)(&t))[1]; 00221 r.halfs_.low = (word32)t; 00222 #else 00223 #error can not implement DWord 00224 #endif 00225 00226 return r; 00227 } 00228 00229 static DWord MultiplyAndAdd(word a, word b, word c) 00230 { 00231 DWord r = Multiply(a, b); 00232 return r += c; 00233 } 00234 00235 DWord & operator+=(word a) 00236 { 00237 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE 00238 whole_ = whole_ + a; 00239 #else 00240 halfs_.low += a; 00241 halfs_.high += (halfs_.low < a); 00242 #endif 00243 return *this; 00244 } 00245 00246 DWord operator+(word a) 00247 { 00248 DWord r; 00249 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE 00250 r.whole_ = whole_ + a; 00251 #else 00252 r.halfs_.low = halfs_.low + a; 00253 r.halfs_.high = halfs_.high + (r.halfs_.low < a); 00254 #endif 00255 return r; 00256 } 00257 00258 DWord operator-(DWord a) 00259 { 00260 DWord r; 00261 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE 00262 r.whole_ = whole_ - a.whole_; 00263 #else 00264 r.halfs_.low = halfs_.low - a.halfs_.low; 00265 r.halfs_.high = halfs_.high - a.halfs_.high - 00266 (r.halfs_.low > halfs_.low); 00267 #endif 00268 return r; 00269 } 00270 00271 DWord operator-(word a) 00272 { 00273 DWord r; 00274 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE 00275 r.whole_ = whole_ - a; 00276 #else 00277 r.halfs_.low = halfs_.low - a; 00278 r.halfs_.high = halfs_.high - (r.halfs_.low > halfs_.low); 00279 #endif 00280 return r; 00281 } 00282 00283 // returns quotient, which must fit in a word 00284 word operator/(word divisor); 00285 00286 word operator%(word a); 00287 00288 bool operator!() const 00289 { 00290 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE 00291 return !whole_; 00292 #else 00293 return !halfs_.high && !halfs_.low; 00294 #endif 00295 } 00296 00297 word GetLowHalf() const {return halfs_.low;} 00298 word GetHighHalf() const {return halfs_.high;} 00299 word GetHighHalfAsBorrow() const {return 0-halfs_.high;} 00300 00301 private: 00302 union 00303 { 00304 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE 00305 dword whole_; 00306 #endif 00307 struct 00308 { 00309 #ifdef LITTLE_ENDIAN_ORDER 00310 word low; 00311 word high; 00312 #else 00313 word high; 00314 word low; 00315 #endif 00316 } halfs_; 00317 }; 00318 }; 00319 00320 00321 class Word { 00322 public: 00323 Word() {} 00324 00325 Word(word value) 00326 { 00327 whole_ = value; 00328 } 00329 00330 Word(hword low, hword high) 00331 { 00332 whole_ = low | (word(high) << (WORD_BITS/2)); 00333 } 00334 00335 static Word Multiply(hword a, hword b) 00336 { 00337 Word r; 00338 r.whole_ = (word)a * b; 00339 return r; 00340 } 00341 00342 Word operator-(Word a) 00343 { 00344 Word r; 00345 r.whole_ = whole_ - a.whole_; 00346 return r; 00347 } 00348 00349 Word operator-(hword a) 00350 { 00351 Word r; 00352 r.whole_ = whole_ - a; 00353 return r; 00354 } 00355 00356 // returns quotient, which must fit in a word 00357 hword operator/(hword divisor) 00358 { 00359 return hword(whole_ / divisor); 00360 } 00361 00362 bool operator!() const 00363 { 00364 return !whole_; 00365 } 00366 00367 word GetWhole() const {return whole_;} 00368 hword GetLowHalf() const {return hword(whole_);} 00369 hword GetHighHalf() const {return hword(whole_>>(WORD_BITS/2));} 00370 hword GetHighHalfAsBorrow() const {return 0-hword(whole_>>(WORD_BITS/2));} 00371 00372 private: 00373 word whole_; 00374 }; 00375 00376 00377 // dummy is VC60 compiler bug workaround 00378 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A 00379 template <class S, class D> 00380 S DivideThreeWordsByTwo(S* A, S B0, S B1, D* dummy_VC6_WorkAround = 0) 00381 { 00382 // assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a S 00383 assert(A[2] < B1 || (A[2]==B1 && A[1] < B0)); 00384 00385 // estimate the quotient: do a 2 S by 1 S divide 00386 S Q; 00387 if (S(B1+1) == 0) 00388 Q = A[2]; 00389 else 00390 Q = D(A[1], A[2]) / S(B1+1); 00391 00392 // now subtract Q*B from A 00393 D p = D::Multiply(B0, Q); 00394 D u = (D) A[0] - p.GetLowHalf(); 00395 A[0] = u.GetLowHalf(); 00396 u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - 00397 D::Multiply(B1, Q); 00398 A[1] = u.GetLowHalf(); 00399 A[2] += u.GetHighHalf(); 00400 00401 // Q <= actual quotient, so fix it 00402 while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0)) 00403 { 00404 u = (D) A[0] - B0; 00405 A[0] = u.GetLowHalf(); 00406 u = (D) A[1] - B1 - u.GetHighHalfAsBorrow(); 00407 A[1] = u.GetLowHalf(); 00408 A[2] += u.GetHighHalf(); 00409 Q++; 00410 assert(Q); // shouldn't overflow 00411 } 00412 00413 return Q; 00414 } 00415 00416 00417 // do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1 00418 template <class S, class D> 00419 inline D DivideFourWordsByTwo(S *T, const D &Al, const D &Ah, const D &B) 00420 { 00421 if (!B) // if divisor is 0, we assume divisor==2**(2*WORD_BITS) 00422 return D(Ah.GetLowHalf(), Ah.GetHighHalf()); 00423 else 00424 { 00425 S Q[2]; 00426 T[0] = Al.GetLowHalf(); 00427 T[1] = Al.GetHighHalf(); 00428 T[2] = Ah.GetLowHalf(); 00429 T[3] = Ah.GetHighHalf(); 00430 Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(), 00431 B.GetHighHalf()); 00432 Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf()); 00433 return D(Q[0], Q[1]); 00434 } 00435 } 00436 00437 00438 // returns quotient, which must fit in a word 00439 inline word DWord::operator/(word a) 00440 { 00441 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE 00442 return word(whole_ / a); 00443 #else 00444 hword r[4]; 00445 return DivideFourWordsByTwo<hword, Word>(r, halfs_.low, 00446 halfs_.high, a).GetWhole(); 00447 #endif 00448 } 00449 00450 inline word DWord::operator%(word a) 00451 { 00452 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE 00453 return word(whole_ % a); 00454 #else 00455 if (a < (word(1) << (WORD_BITS/2))) 00456 { 00457 hword h = hword(a); 00458 word r = halfs_.high % h; 00459 r = ((halfs_.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h; 00460 return hword((hword(halfs_.low) + (r << (WORD_BITS/2))) % h); 00461 } 00462 else 00463 { 00464 hword r[4]; 00465 DivideFourWordsByTwo<hword, Word>(r, halfs_.low, halfs_.high, a); 00466 return Word(r[0], r[1]).GetWhole(); 00467 } 00468 #endif 00469 } 00470 00471 00472 00473 // end 5.2.1 DWord and Word adds 00474 00475 00476 00477 00478 00479 static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8}; 00480 00481 static inline unsigned int RoundupSize(unsigned int n) 00482 { 00483 if (n<=8) 00484 return RoundupSizeTable[n]; 00485 else if (n<=16) 00486 return 16; 00487 else if (n<=32) 00488 return 32; 00489 else if (n<=64) 00490 return 64; 00491 else return 1U << BitPrecision(n-1); 00492 } 00493 00494 00495 static int Compare(const word *A, const word *B, unsigned int N) 00496 { 00497 while (N--) 00498 if (A[N] > B[N]) 00499 return 1; 00500 else if (A[N] < B[N]) 00501 return -1; 00502 00503 return 0; 00504 } 00505 00506 static word Increment(word *A, unsigned int N, word B=1) 00507 { 00508 assert(N); 00509 word t = A[0]; 00510 A[0] = t+B; 00511 if (A[0] >= t) 00512 return 0; 00513 for (unsigned i=1; i<N; i++) 00514 if (++A[i]) 00515 return 0; 00516 return 1; 00517 } 00518 00519 static word Decrement(word *A, unsigned int N, word B=1) 00520 { 00521 assert(N); 00522 word t = A[0]; 00523 A[0] = t-B; 00524 if (A[0] <= t) 00525 return 0; 00526 for (unsigned i=1; i<N; i++) 00527 if (A[i]--) 00528 return 0; 00529 return 1; 00530 } 00531 00532 static void TwosComplement(word *A, unsigned int N) 00533 { 00534 Decrement(A, N); 00535 for (unsigned i=0; i<N; i++) 00536 A[i] = ~A[i]; 00537 } 00538 00539 00540 static word LinearMultiply(word *C, const word *A, word B, unsigned int N) 00541 { 00542 word carry=0; 00543 for(unsigned i=0; i<N; i++) 00544 { 00545 DWord p = DWord::MultiplyAndAdd(A[i], B, carry); 00546 C[i] = p.GetLowHalf(); 00547 carry = p.GetHighHalf(); 00548 } 00549 return carry; 00550 } 00551 00552 00553 static word AtomicInverseModPower2(word A) 00554 { 00555 assert(A%2==1); 00556 00557 word R=A%8; 00558 00559 for (unsigned i=3; i<WORD_BITS; i*=2) 00560 R = R*(2-R*A); 00561 00562 assert(word(R*A)==1); 00563 return R; 00564 } 00565 00566 00567 // ******************************************************** 00568 00569 class Portable 00570 { 00571 public: 00572 static word Add(word *C, const word *A, const word *B, unsigned int N); 00573 static word Subtract(word *C, const word *A, const word*B, unsigned int N); 00574 00575 static void Multiply2(word *C, const word *A, const word *B); 00576 static word Multiply2Add(word *C, const word *A, const word *B); 00577 static void Multiply4(word *C, const word *A, const word *B); 00578 static void Multiply8(word *C, const word *A, const word *B); 00579 static unsigned int MultiplyRecursionLimit() {return 8;} 00580 00581 static void Multiply2Bottom(word *C, const word *A, const word *B); 00582 static void Multiply4Bottom(word *C, const word *A, const word *B); 00583 static void Multiply8Bottom(word *C, const word *A, const word *B); 00584 static unsigned int MultiplyBottomRecursionLimit() {return 8;} 00585 00586 static void Square2(word *R, const word *A); 00587 static void Square4(word *R, const word *A); 00588 static void Square8(word *R, const word *A) {assert(false);} 00589 static unsigned int SquareRecursionLimit() {return 4;} 00590 }; 00591 00592 word Portable::Add(word *C, const word *A, const word *B, unsigned int N) 00593 { 00594 assert (N%2 == 0); 00595 00596 DWord u(0, 0); 00597 for (unsigned int i = 0; i < N; i+=2) 00598 { 00599 u = DWord(A[i]) + B[i] + u.GetHighHalf(); 00600 C[i] = u.GetLowHalf(); 00601 u = DWord(A[i+1]) + B[i+1] + u.GetHighHalf(); 00602 C[i+1] = u.GetLowHalf(); 00603 } 00604 return u.GetHighHalf(); 00605 } 00606 00607 word Portable::Subtract(word *C, const word *A, const word *B, unsigned int N) 00608 { 00609 assert (N%2 == 0); 00610 00611 DWord u(0, 0); 00612 for (unsigned int i = 0; i < N; i+=2) 00613 { 00614 u = (DWord) A[i] - B[i] - u.GetHighHalfAsBorrow(); 00615 C[i] = u.GetLowHalf(); 00616 u = (DWord) A[i+1] - B[i+1] - u.GetHighHalfAsBorrow(); 00617 C[i+1] = u.GetLowHalf(); 00618 } 00619 return 0-u.GetHighHalf(); 00620 } 00621 00622 void Portable::Multiply2(word *C, const word *A, const word *B) 00623 { 00624 /* 00625 word s; 00626 dword d; 00627 00628 if (A1 >= A0) 00629 if (B0 >= B1) 00630 { 00631 s = 0; 00632 d = (dword)(A1-A0)*(B0-B1); 00633 } 00634 else 00635 { 00636 s = (A1-A0); 00637 d = (dword)s*(word)(B0-B1); 00638 } 00639 else 00640 if (B0 > B1) 00641 { 00642 s = (B0-B1); 00643 d = (word)(A1-A0)*(dword)s; 00644 } 00645 else 00646 { 00647 s = 0; 00648 d = (dword)(A0-A1)*(B1-B0); 00649 } 00650 */ 00651 // this segment is the branchless equivalent of above 00652 word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]}; 00653 unsigned int ai = A[1] < A[0]; 00654 unsigned int bi = B[0] < B[1]; 00655 unsigned int di = ai & bi; 00656 DWord d = DWord::Multiply(D[di], D[di+2]); 00657 D[1] = D[3] = 0; 00658 unsigned int si = ai + !bi; 00659 word s = D[si]; 00660 00661 DWord A0B0 = DWord::Multiply(A[0], B[0]); 00662 C[0] = A0B0.GetLowHalf(); 00663 00664 DWord A1B1 = DWord::Multiply(A[1], B[1]); 00665 DWord t = (DWord) A0B0.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() 00666 + A1B1.GetLowHalf(); 00667 C[1] = t.GetLowHalf(); 00668 00669 t = A1B1 + t.GetHighHalf() + A0B0.GetHighHalf() + d.GetHighHalf() 00670 + A1B1.GetHighHalf() - s; 00671 C[2] = t.GetLowHalf(); 00672 C[3] = t.GetHighHalf(); 00673 } 00674 00675 void Portable::Multiply2Bottom(word *C, const word *A, const word *B) 00676 { 00677 DWord t = DWord::Multiply(A[0], B[0]); 00678 C[0] = t.GetLowHalf(); 00679 C[1] = t.GetHighHalf() + A[0]*B[1] + A[1]*B[0]; 00680 } 00681 00682 word Portable::Multiply2Add(word *C, const word *A, const word *B) 00683 { 00684 word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]}; 00685 unsigned int ai = A[1] < A[0]; 00686 unsigned int bi = B[0] < B[1]; 00687 unsigned int di = ai & bi; 00688 DWord d = DWord::Multiply(D[di], D[di+2]); 00689 D[1] = D[3] = 0; 00690 unsigned int si = ai + !bi; 00691 word s = D[si]; 00692 00693 DWord A0B0 = DWord::Multiply(A[0], B[0]); 00694 DWord t = A0B0 + C[0]; 00695 C[0] = t.GetLowHalf(); 00696 00697 DWord A1B1 = DWord::Multiply(A[1], B[1]); 00698 t = (DWord) t.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + 00699 A1B1.GetLowHalf() + C[1]; 00700 C[1] = t.GetLowHalf(); 00701 00702 t = (DWord) t.GetHighHalf() + A1B1.GetLowHalf() + A0B0.GetHighHalf() + 00703 d.GetHighHalf() + A1B1.GetHighHalf() - s + C[2]; 00704 C[2] = t.GetLowHalf(); 00705 00706 t = (DWord) t.GetHighHalf() + A1B1.GetHighHalf() + C[3]; 00707 C[3] = t.GetLowHalf(); 00708 return t.GetHighHalf(); 00709 } 00710 00711 00712 #define MulAcc(x, y) \ 00713 p = DWord::MultiplyAndAdd(A[x], B[y], c); \ 00714 c = p.GetLowHalf(); \ 00715 p = (DWord) d + p.GetHighHalf(); \ 00716 d = p.GetLowHalf(); \ 00717 e += p.GetHighHalf(); 00718 00719 #define SaveMulAcc(s, x, y) \ 00720 R[s] = c; \ 00721 p = DWord::MultiplyAndAdd(A[x], B[y], d); \ 00722 c = p.GetLowHalf(); \ 00723 p = (DWord) e + p.GetHighHalf(); \ 00724 d = p.GetLowHalf(); \ 00725 e = p.GetHighHalf(); 00726 00727 #define SquAcc(x, y) \ 00728 q = DWord::Multiply(A[x], A[y]); \ 00729 p = q + c; \ 00730 c = p.GetLowHalf(); \ 00731 p = (DWord) d + p.GetHighHalf(); \ 00732 d = p.GetLowHalf(); \ 00733 e += p.GetHighHalf(); \ 00734 p = q + c; \ 00735 c = p.GetLowHalf(); \ 00736 p = (DWord) d + p.GetHighHalf(); \ 00737 d = p.GetLowHalf(); \ 00738 e += p.GetHighHalf(); 00739 00740 #define SaveSquAcc(s, x, y) \ 00741 R[s] = c; \ 00742 q = DWord::Multiply(A[x], A[y]); \ 00743 p = q + d; \ 00744 c = p.GetLowHalf(); \ 00745 p = (DWord) e + p.GetHighHalf(); \ 00746 d = p.GetLowHalf(); \ 00747 e = p.GetHighHalf(); \ 00748 p = q + c; \ 00749 c = p.GetLowHalf(); \ 00750 p = (DWord) d + p.GetHighHalf(); \ 00751 d = p.GetLowHalf(); \ 00752 e += p.GetHighHalf(); 00753 00754 00755 void Portable::Multiply4(word *R, const word *A, const word *B) 00756 { 00757 DWord p; 00758 word c, d, e; 00759 00760 p = DWord::Multiply(A[0], B[0]); 00761 R[0] = p.GetLowHalf(); 00762 c = p.GetHighHalf(); 00763 d = e = 0; 00764 00765 MulAcc(0, 1); 00766 MulAcc(1, 0); 00767 00768 SaveMulAcc(1, 2, 0); 00769 MulAcc(1, 1); 00770 MulAcc(0, 2); 00771 00772 SaveMulAcc(2, 0, 3); 00773 MulAcc(1, 2); 00774 MulAcc(2, 1); 00775 MulAcc(3, 0); 00776 00777 SaveMulAcc(3, 3, 1); 00778 MulAcc(2, 2); 00779 MulAcc(1, 3); 00780 00781 SaveMulAcc(4, 2, 3); 00782 MulAcc(3, 2); 00783 00784 R[5] = c; 00785 p = DWord::MultiplyAndAdd(A[3], B[3], d); 00786 R[6] = p.GetLowHalf(); 00787 R[7] = e + p.GetHighHalf(); 00788 } 00789 00790 void Portable::Square2(word *R, const word *A) 00791 { 00792 DWord p, q; 00793 word c, d, e; 00794 00795 p = DWord::Multiply(A[0], A[0]); 00796 R[0] = p.GetLowHalf(); 00797 c = p.GetHighHalf(); 00798 d = e = 0; 00799 00800 SquAcc(0, 1); 00801 00802 R[1] = c; 00803 p = DWord::MultiplyAndAdd(A[1], A[1], d); 00804 R[2] = p.GetLowHalf(); 00805 R[3] = e + p.GetHighHalf(); 00806 } 00807 00808 void Portable::Square4(word *R, const word *A) 00809 { 00810 #ifdef _MSC_VER 00811 // VC60 workaround: MSVC 6.0 has an optimization bug that makes 00812 // (dword)A*B where either A or B has been cast to a dword before 00813 // very expensive. Revisit this function when this 00814 // bug is fixed. 00815 Multiply4(R, A, A); 00816 #else 00817 const word *B = A; 00818 DWord p, q; 00819 word c, d, e; 00820 00821 p = DWord::Multiply(A[0], A[0]); 00822 R[0] = p.GetLowHalf(); 00823 c = p.GetHighHalf(); 00824 d = e = 0; 00825 00826 SquAcc(0, 1); 00827 00828 SaveSquAcc(1, 2, 0); 00829 MulAcc(1, 1); 00830 00831 SaveSquAcc(2, 0, 3); 00832 SquAcc(1, 2); 00833 00834 SaveSquAcc(3, 3, 1); 00835 MulAcc(2, 2); 00836 00837 SaveSquAcc(4, 2, 3); 00838 00839 R[5] = c; 00840 p = DWord::MultiplyAndAdd(A[3], A[3], d); 00841 R[6] = p.GetLowHalf(); 00842 R[7] = e + p.GetHighHalf(); 00843 #endif 00844 } 00845 00846 void Portable::Multiply8(word *R, const word *A, const word *B) 00847 { 00848 DWord p; 00849 word c, d, e; 00850 00851 p = DWord::Multiply(A[0], B[0]); 00852 R[0] = p.GetLowHalf(); 00853 c = p.GetHighHalf(); 00854 d = e = 0; 00855 00856 MulAcc(0, 1); 00857 MulAcc(1, 0); 00858 00859 SaveMulAcc(1, 2, 0); 00860 MulAcc(1, 1); 00861 MulAcc(0, 2); 00862 00863 SaveMulAcc(2, 0, 3); 00864 MulAcc(1, 2); 00865 MulAcc(2, 1); 00866 MulAcc(3, 0); 00867 00868 SaveMulAcc(3, 0, 4); 00869 MulAcc(1, 3); 00870 MulAcc(2, 2); 00871 MulAcc(3, 1); 00872 MulAcc(4, 0); 00873 00874 SaveMulAcc(4, 0, 5); 00875 MulAcc(1, 4); 00876 MulAcc(2, 3); 00877 MulAcc(3, 2); 00878 MulAcc(4, 1); 00879 MulAcc(5, 0); 00880 00881 SaveMulAcc(5, 0, 6); 00882 MulAcc(1, 5); 00883 MulAcc(2, 4); 00884 MulAcc(3, 3); 00885 MulAcc(4, 2); 00886 MulAcc(5, 1); 00887 MulAcc(6, 0); 00888 00889 SaveMulAcc(6, 0, 7); 00890 MulAcc(1, 6); 00891 MulAcc(2, 5); 00892 MulAcc(3, 4); 00893 MulAcc(4, 3); 00894 MulAcc(5, 2); 00895 MulAcc(6, 1); 00896 MulAcc(7, 0); 00897 00898 SaveMulAcc(7, 1, 7); 00899 MulAcc(2, 6); 00900 MulAcc(3, 5); 00901 MulAcc(4, 4); 00902 MulAcc(5, 3); 00903 MulAcc(6, 2); 00904 MulAcc(7, 1); 00905 00906 SaveMulAcc(8, 2, 7); 00907 MulAcc(3, 6); 00908 MulAcc(4, 5); 00909 MulAcc(5, 4); 00910 MulAcc(6, 3); 00911 MulAcc(7, 2); 00912 00913 SaveMulAcc(9, 3, 7); 00914 MulAcc(4, 6); 00915 MulAcc(5, 5); 00916 MulAcc(6, 4); 00917 MulAcc(7, 3); 00918 00919 SaveMulAcc(10, 4, 7); 00920 MulAcc(5, 6); 00921 MulAcc(6, 5); 00922 MulAcc(7, 4); 00923 00924 SaveMulAcc(11, 5, 7); 00925 MulAcc(6, 6); 00926 MulAcc(7, 5); 00927 00928 SaveMulAcc(12, 6, 7); 00929 MulAcc(7, 6); 00930 00931 R[13] = c; 00932 p = DWord::MultiplyAndAdd(A[7], B[7], d); 00933 R[14] = p.GetLowHalf(); 00934 R[15] = e + p.GetHighHalf(); 00935 } 00936 00937 void Portable::Multiply4Bottom(word *R, const word *A, const word *B) 00938 { 00939 DWord p; 00940 word c, d, e; 00941 00942 p = DWord::Multiply(A[0], B[0]); 00943 R[0] = p.GetLowHalf(); 00944 c = p.GetHighHalf(); 00945 d = e = 0; 00946 00947 MulAcc(0, 1); 00948 MulAcc(1, 0); 00949 00950 SaveMulAcc(1, 2, 0); 00951 MulAcc(1, 1); 00952 MulAcc(0, 2); 00953 00954 R[2] = c; 00955 R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0]; 00956 } 00957 00958 void Portable::Multiply8Bottom(word *R, const word *A, const word *B) 00959 { 00960 DWord p; 00961 word c, d, e; 00962 00963 p = DWord::Multiply(A[0], B[0]); 00964 R[0] = p.GetLowHalf(); 00965 c = p.GetHighHalf(); 00966 d = e = 0; 00967 00968 MulAcc(0, 1); 00969 MulAcc(1, 0); 00970 00971 SaveMulAcc(1, 2, 0); 00972 MulAcc(1, 1); 00973 MulAcc(0, 2); 00974 00975 SaveMulAcc(2, 0, 3); 00976 MulAcc(1, 2); 00977 MulAcc(2, 1); 00978 MulAcc(3, 0); 00979 00980 SaveMulAcc(3, 0, 4); 00981 MulAcc(1, 3); 00982 MulAcc(2, 2); 00983 MulAcc(3, 1); 00984 MulAcc(4, 0); 00985 00986 SaveMulAcc(4, 0, 5); 00987 MulAcc(1, 4); 00988 MulAcc(2, 3); 00989 MulAcc(3, 2); 00990 MulAcc(4, 1); 00991 MulAcc(5, 0); 00992 00993 SaveMulAcc(5, 0, 6); 00994 MulAcc(1, 5); 00995 MulAcc(2, 4); 00996 MulAcc(3, 3); 00997 MulAcc(4, 2); 00998 MulAcc(5, 1); 00999 MulAcc(6, 0); 01000 01001 R[6] = c; 01002 R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] + 01003 A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0]; 01004 } 01005 01006 01007 #undef MulAcc 01008 #undef SaveMulAcc 01009 #undef SquAcc 01010 #undef SaveSquAcc 01011 01012 // optimized 01013 01014 #ifdef TAOCRYPT_X86ASM_AVAILABLE 01015 01016 // ************** x86 feature detection *************** 01017 01018 static bool s_sse2Enabled = true; 01019 01020 static void CpuId(word32 input, word32 *output) 01021 { 01022 #ifdef __GNUC__ 01023 __asm__ 01024 ( 01025 // save ebx in case -fPIC is being used 01026 "push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx" 01027 : "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d"(output[3]) 01028 : "a" (input) 01029 ); 01030 #else 01031 __asm 01032 { 01033 mov eax, input 01034 cpuid 01035 mov edi, output 01036 mov [edi], eax 01037 mov [edi+4], ebx 01038 mov [edi+8], ecx 01039 mov [edi+12], edx 01040 } 01041 #endif 01042 } 01043 01044 #ifdef SSE2_INTRINSICS_AVAILABLE 01045 #ifndef _MSC_VER 01046 static jmp_buf s_env; 01047 static void SigIllHandler(int) 01048 { 01049 longjmp(s_env, 1); 01050 } 01051 #endif 01052 01053 static bool HasSSE2() 01054 { 01055 if (!s_sse2Enabled) 01056 return false; 01057 01058 word32 cpuid[4]; 01059 CpuId(1, cpuid); 01060 if ((cpuid[3] & (1 << 26)) == 0) 01061 return false; 01062 01063 #ifdef _MSC_VER 01064 __try 01065 { 01066 __asm xorpd xmm0, xmm0 // executing SSE2 instruction 01067 } 01068 __except (1) 01069 { 01070 return false; 01071 } 01072 return true; 01073 #else 01074 typedef void (*SigHandler)(int); 01075 01076 SigHandler oldHandler = signal(SIGILL, SigIllHandler); 01077 if (oldHandler == SIG_ERR) 01078 return false; 01079 01080 bool result = true; 01081 if (setjmp(s_env)) 01082 result = false; 01083 else 01084 __asm __volatile ("xorps %xmm0, %xmm0"); 01085 01086 signal(SIGILL, oldHandler); 01087 return result; 01088 #endif 01089 } 01090 #endif 01091 01092 static bool IsP4() 01093 { 01094 word32 cpuid[4]; 01095 01096 CpuId(0, cpuid); 01097 mySTL::swap(cpuid[2], cpuid[3]); 01098 if (memcmp(cpuid+1, "GenuineIntel", 12) != 0) 01099 return false; 01100 01101 CpuId(1, cpuid); 01102 return ((cpuid[0] >> 8) & 0xf) == 0xf; 01103 } 01104 01105 // ************** Pentium/P4 optimizations *************** 01106 01107 class PentiumOptimized : public Portable 01108 { 01109 public: 01110 static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B, 01111 unsigned int N); 01112 static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B, 01113 unsigned int N); 01114 static void TAOCRYPT_CDECL Multiply4(word *C, const word *A, 01115 const word *B); 01116 static void TAOCRYPT_CDECL Multiply8(word *C, const word *A, 01117 const word *B); 01118 static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A, 01119 const word *B); 01120 }; 01121 01122 class P4Optimized 01123 { 01124 public: 01125 static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B, 01126 unsigned int N); 01127 static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B, 01128 unsigned int N); 01129 #ifdef SSE2_INTRINSICS_AVAILABLE 01130 static void TAOCRYPT_CDECL Multiply4(word *C, const word *A, 01131 const word *B); 01132 static void TAOCRYPT_CDECL Multiply8(word *C, const word *A, 01133 const word *B); 01134 static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A, 01135 const word *B); 01136 #endif 01137 }; 01138 01139 typedef word (TAOCRYPT_CDECL * PAddSub)(word *C, const word *A, const word *B, 01140 unsigned int N); 01141 typedef void (TAOCRYPT_CDECL * PMul)(word *C, const word *A, const word *B); 01142 01143 static PAddSub s_pAdd, s_pSub; 01144 #ifdef SSE2_INTRINSICS_AVAILABLE 01145 static PMul s_pMul4, s_pMul8, s_pMul8B; 01146 #endif 01147 01148 static void SetPentiumFunctionPointers() 01149 { 01150 if (IsP4()) 01151 { 01152 s_pAdd = &P4Optimized::Add; 01153 s_pSub = &P4Optimized::Subtract; 01154 } 01155 else 01156 { 01157 s_pAdd = &PentiumOptimized::Add; 01158 s_pSub = &PentiumOptimized::Subtract; 01159 } 01160 01161 #ifdef SSE2_INTRINSICS_AVAILABLE 01162 if (HasSSE2()) 01163 { 01164 s_pMul4 = &P4Optimized::Multiply4; 01165 s_pMul8 = &P4Optimized::Multiply8; 01166 s_pMul8B = &P4Optimized::Multiply8Bottom; 01167 } 01168 else 01169 { 01170 s_pMul4 = &PentiumOptimized::Multiply4; 01171 s_pMul8 = &PentiumOptimized::Multiply8; 01172 s_pMul8B = &PentiumOptimized::Multiply8Bottom; 01173 } 01174 #endif 01175 } 01176 01177 static const char s_RunAtStartupSetPentiumFunctionPointers = 01178 (SetPentiumFunctionPointers(), 0); 01179 01180 void DisableSSE2() 01181 { 01182 s_sse2Enabled = false; 01183 SetPentiumFunctionPointers(); 01184 } 01185 01186 class LowLevel : public PentiumOptimized 01187 { 01188 public: 01189 inline static word Add(word *C, const word *A, const word *B, 01190 unsigned int N) 01191 {return s_pAdd(C, A, B, N);} 01192 inline static word Subtract(word *C, const word *A, const word *B, 01193 unsigned int N) 01194 {return s_pSub(C, A, B, N);} 01195 inline static void Square4(word *R, const word *A) 01196 {Multiply4(R, A, A);} 01197 #ifdef SSE2_INTRINSICS_AVAILABLE 01198 inline static void Multiply4(word *C, const word *A, const word *B) 01199 {s_pMul4(C, A, B);} 01200 inline static void Multiply8(word *C, const word *A, const word *B) 01201 {s_pMul8(C, A, B);} 01202 inline static void Multiply8Bottom(word *C, const word *A, const word *B) 01203 {s_pMul8B(C, A, B);} 01204 #endif 01205 }; 01206 01207 // use some tricks to share assembly code between MSVC and GCC 01208 #ifdef _MSC_VER 01209 #define TAOCRYPT_NAKED __declspec(naked) 01210 #define AS1(x) __asm x 01211 #define AS2(x, y) __asm x, y 01212 #define AddPrologue \ 01213 __asm push ebp \ 01214 __asm push ebx \ 01215 __asm push esi \ 01216 __asm push edi \ 01217 __asm mov ecx, [esp+20] \ 01218 __asm mov edx, [esp+24] \ 01219 __asm mov ebx, [esp+28] \ 01220 __asm mov esi, [esp+32] 01221 #define AddEpilogue \ 01222 __asm pop edi \ 01223 __asm pop esi \ 01224 __asm pop ebx \ 01225 __asm pop ebp \ 01226 __asm ret 01227 #define MulPrologue \ 01228 __asm push ebp \ 01229 __asm push ebx \ 01230 __asm push esi \ 01231 __asm push edi \ 01232 __asm mov ecx, [esp+28] \ 01233 __asm mov esi, [esp+24] \ 01234 __asm push [esp+20] 01235 #define MulEpilogue \ 01236 __asm add esp, 4 \ 01237 __asm pop edi \ 01238 __asm pop esi \ 01239 __asm pop ebx \ 01240 __asm pop ebp \ 01241 __asm ret 01242 #else 01243 #define TAOCRYPT_NAKED 01244 #define AS1(x) #x ";" 01245 #define AS2(x, y) #x ", " #y ";" 01246 #define AddPrologue \ 01247 __asm__ __volatile__ \ 01248 ( \ 01249 "push %%ebx;" /* save this manually, in case of -fPIC */ \ 01250 "mov %2, %%ebx;" \ 01251 ".intel_syntax noprefix;" \ 01252 "push ebp;" 01253 #define AddEpilogue \ 01254 "pop ebp;" \ 01255 ".att_syntax prefix;" \ 01256 "pop %%ebx;" \ 01257 : \ 01258 : "c" (C), "d" (A), "m" (B), "S" (N) \ 01259 : "%edi", "memory", "cc" \ 01260 ); 01261 #define MulPrologue \ 01262 __asm__ __volatile__ \ 01263 ( \ 01264 "push %%ebx;" /* save this manually, in case of -fPIC */ \ 01265 "push %%ebp;" \ 01266 "push %0;" \ 01267 ".intel_syntax noprefix;" 01268 #define MulEpilogue \ 01269 "add esp, 4;" \ 01270 "pop ebp;" \ 01271 "pop ebx;" \ 01272 ".att_syntax prefix;" \ 01273 : \ 01274 : "rm" (Z), "S" (X), "c" (Y) \ 01275 : "%eax", "%edx", "%edi", "memory", "cc" \ 01276 ); 01277 #endif 01278 01279 TAOCRYPT_NAKED word PentiumOptimized::Add(word *C, const word *A, 01280 const word *B, unsigned int N) 01281 { 01282 AddPrologue 01283 01284 // now: ebx = B, ecx = C, edx = A, esi = N 01285 AS2( sub ecx, edx) // hold the distance between C & A so we 01286 // can add this to A to get C 01287 AS2( xor eax, eax) // clear eax 01288 01289 AS2( sub eax, esi) // eax is a negative index from end of B 01290 AS2( lea ebx, [ebx+4*esi]) // ebx is end of B 01291 01292 AS2( sar eax, 1) // unit of eax is now dwords; this also 01293 // clears the carry flag 01294 AS1( jz loopendAdd) // if no dwords then nothing to do 01295 01296 AS1(loopstartAdd:) 01297 AS2( mov esi,[edx]) // load lower word of A 01298 AS2( mov ebp,[edx+4]) // load higher word of A 01299 01300 AS2( mov edi,[ebx+8*eax]) // load lower word of B 01301 AS2( lea edx,[edx+8]) // advance A and C 01302 01303 AS2( adc esi,edi) // add lower words 01304 AS2( mov edi,[ebx+8*eax+4]) // load higher word of B 01305 01306 AS2( adc ebp,edi) // add higher words 01307 AS1( inc eax) // advance B 01308 01309 AS2( mov [edx+ecx-8],esi) // store lower word result 01310 AS2( mov [edx+ecx-4],ebp) // store higher word result 01311 01312 AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero 01313 01314 AS1(loopendAdd:) 01315 AS2( adc eax, 0) // store carry into eax (return result register) 01316 01317 AddEpilogue 01318 } 01319 01320 TAOCRYPT_NAKED word PentiumOptimized::Subtract(word *C, const word *A, 01321 const word *B, unsigned int N) 01322 { 01323 AddPrologue 01324 01325 // now: ebx = B, ecx = C, edx = A, esi = N 01326 AS2( sub ecx, edx) // hold the distance between C & A so we 01327 // can add this to A to get C 01328 AS2( xor eax, eax) // clear eax 01329 01330 AS2( sub eax, esi) // eax is a negative index from end of B 01331 AS2( lea ebx, [ebx+4*esi]) // ebx is end of B 01332 01333 AS2( sar eax, 1) // unit of eax is now dwords; this also 01334 // clears the carry flag 01335 AS1( jz loopendSub) // if no dwords then nothing to do 01336 01337 AS1(loopstartSub:) 01338 AS2( mov esi,[edx]) // load lower word of A 01339 AS2( mov ebp,[edx+4]) // load higher word of A 01340 01341 AS2( mov edi,[ebx+8*eax]) // load lower word of B 01342 AS2( lea edx,[edx+8]) // advance A and C 01343 01344 AS2( sbb esi,edi) // subtract lower words 01345 AS2( mov edi,[ebx+8*eax+4]) // load higher word of B 01346 01347 AS2( sbb ebp,edi) // subtract higher words 01348 AS1( inc eax) // advance B 01349 01350 AS2( mov [edx+ecx-8],esi) // store lower word result 01351 AS2( mov [edx+ecx-4],ebp) // store higher word result 01352 01353 AS1( jnz loopstartSub) // loop until eax overflows and becomes zero 01354 01355 AS1(loopendSub:) 01356 AS2( adc eax, 0) // store carry into eax (return result register) 01357 01358 AddEpilogue 01359 } 01360 01361 // On Pentium 4, the adc and sbb instructions are very expensive, so avoid them. 01362 01363 TAOCRYPT_NAKED word P4Optimized::Add(word *C, const word *A, const word *B, 01364 unsigned int N) 01365 { 01366 AddPrologue 01367 01368 // now: ebx = B, ecx = C, edx = A, esi = N 01369 AS2( xor eax, eax) 01370 AS1( neg esi) 01371 AS1( jz loopendAddP4) // if no dwords then nothing to do 01372 01373 AS2( mov edi, [edx]) 01374 AS2( mov ebp, [ebx]) 01375 AS1( jmp carry1AddP4) 01376 01377 AS1(loopstartAddP4:) 01378 AS2( mov edi, [edx+8]) 01379 AS2( add ecx, 8) 01380 AS2( add edx, 8) 01381 AS2( mov ebp, [ebx]) 01382 AS2( add edi, eax) 01383 AS1( jc carry1AddP4) 01384 AS2( xor eax, eax) 01385 01386 AS1(carry1AddP4:) 01387 AS2( add edi, ebp) 01388 AS2( mov ebp, 1) 01389 AS2( mov [ecx], edi) 01390 AS2( mov edi, [edx+4]) 01391 AS2( cmovc eax, ebp) 01392 AS2( mov ebp, [ebx+4]) 01393 AS2( add ebx, 8) 01394 AS2( add edi, eax) 01395 AS1( jc carry2AddP4) 01396 AS2( xor eax, eax) 01397 01398 AS1(carry2AddP4:) 01399 AS2( add edi, ebp) 01400 AS2( mov ebp, 1) 01401 AS2( cmovc eax, ebp) 01402 AS2( mov [ecx+4], edi) 01403 AS2( add esi, 2) 01404 AS1( jnz loopstartAddP4) 01405 01406 AS1(loopendAddP4:) 01407 01408 AddEpilogue 01409 } 01410 01411 TAOCRYPT_NAKED word P4Optimized::Subtract(word *C, const word *A, 01412 const word *B, unsigned int N) 01413 { 01414 AddPrologue 01415 01416 // now: ebx = B, ecx = C, edx = A, esi = N 01417 AS2( xor eax, eax) 01418 AS1( neg esi) 01419 AS1( jz loopendSubP4) // if no dwords then nothing to do 01420 01421 AS2( mov edi, [edx]) 01422 AS2( mov ebp, [ebx]) 01423 AS1( jmp carry1SubP4) 01424 01425 AS1(loopstartSubP4:) 01426 AS2( mov edi, [edx+8]) 01427 AS2( add edx, 8) 01428 AS2( add ecx, 8) 01429 AS2( mov ebp, [ebx]) 01430 AS2( sub edi, eax) 01431 AS1( jc carry1SubP4) 01432 AS2( xor eax, eax) 01433 01434 AS1(carry1SubP4:) 01435 AS2( sub edi, ebp) 01436 AS2( mov ebp, 1) 01437 AS2( mov [ecx], edi) 01438 AS2( mov edi, [edx+4]) 01439 AS2( cmovc eax, ebp) 01440 AS2( mov ebp, [ebx+4]) 01441 AS2( add ebx, 8) 01442 AS2( sub edi, eax) 01443 AS1( jc carry2SubP4) 01444 AS2( xor eax, eax) 01445 01446 AS1(carry2SubP4:) 01447 AS2( sub edi, ebp) 01448 AS2( mov ebp, 1) 01449 AS2( cmovc eax, ebp) 01450 AS2( mov [ecx+4], edi) 01451 AS2( add esi, 2) 01452 AS1( jnz loopstartSubP4) 01453 01454 AS1(loopendSubP4:) 01455 01456 AddEpilogue 01457 } 01458 01459 // multiply assembly code originally contributed by Leonard Janke 01460 01461 #define MulStartup \ 01462 AS2(xor ebp, ebp) \ 01463 AS2(xor edi, edi) \ 01464 AS2(xor ebx, ebx) 01465 01466 #define MulShiftCarry \ 01467 AS2(mov ebp, edx) \ 01468 AS2(mov edi, ebx) \ 01469 AS2(xor ebx, ebx) 01470 01471 #define MulAccumulateBottom(i,j) \ 01472 AS2(mov eax, [ecx+4*j]) \ 01473 AS2(imul eax, dword ptr [esi+4*i]) \ 01474 AS2(add ebp, eax) 01475 01476 #define MulAccumulate(i,j) \ 01477 AS2(mov eax, [ecx+4*j]) \ 01478 AS1(mul dword ptr [esi+4*i]) \ 01479 AS2(add ebp, eax) \ 01480 AS2(adc edi, edx) \ 01481 AS2(adc bl, bh) 01482 01483 #define MulStoreDigit(i) \ 01484 AS2(mov edx, edi) \ 01485 AS2(mov edi, [esp]) \ 01486 AS2(mov [edi+4*i], ebp) 01487 01488 #define MulLastDiagonal(digits) \ 01489 AS2(mov eax, [ecx+4*(digits-1)]) \ 01490 AS1(mul dword ptr [esi+4*(digits-1)]) \ 01491 AS2(add ebp, eax) \ 01492 AS2(adc edx, edi) \ 01493 AS2(mov edi, [esp]) \ 01494 AS2(mov [edi+4*(2*digits-2)], ebp) \ 01495 AS2(mov [edi+4*(2*digits-1)], edx) 01496 01497 TAOCRYPT_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X, 01498 const word* Y) 01499 { 01500 MulPrologue 01501 // now: [esp] = Z, esi = X, ecx = Y 01502 MulStartup 01503 MulAccumulate(0,0) 01504 MulStoreDigit(0) 01505 MulShiftCarry 01506 01507 MulAccumulate(1,0) 01508 MulAccumulate(0,1) 01509 MulStoreDigit(1) 01510 MulShiftCarry 01511 01512 MulAccumulate(2,0) 01513 MulAccumulate(1,1) 01514 MulAccumulate(0,2) 01515 MulStoreDigit(2) 01516 MulShiftCarry 01517 01518 MulAccumulate(3,0) 01519 MulAccumulate(2,1) 01520 MulAccumulate(1,2) 01521 MulAccumulate(0,3) 01522 MulStoreDigit(3) 01523 MulShiftCarry 01524 01525 MulAccumulate(3,1) 01526 MulAccumulate(2,2) 01527 MulAccumulate(1,3) 01528 MulStoreDigit(4) 01529 MulShiftCarry 01530 01531 MulAccumulate(3,2) 01532 MulAccumulate(2,3) 01533 MulStoreDigit(5) 01534 MulShiftCarry 01535 01536 MulLastDiagonal(4) 01537 MulEpilogue 01538 } 01539 01540 TAOCRYPT_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X, 01541 const word* Y) 01542 { 01543 MulPrologue 01544 // now: [esp] = Z, esi = X, ecx = Y 01545 MulStartup 01546 MulAccumulate(0,0) 01547 MulStoreDigit(0) 01548 MulShiftCarry 01549 01550 MulAccumulate(1,0) 01551 MulAccumulate(0,1) 01552 MulStoreDigit(1) 01553 MulShiftCarry 01554 01555 MulAccumulate(2,0) 01556 MulAccumulate(1,1) 01557 MulAccumulate(0,2) 01558 MulStoreDigit(2) 01559 MulShiftCarry 01560 01561 MulAccumulate(3,0) 01562 MulAccumulate(2,1) 01563 MulAccumulate(1,2) 01564 MulAccumulate(0,3) 01565 MulStoreDigit(3) 01566 MulShiftCarry 01567 01568 MulAccumulate(4,0) 01569 MulAccumulate(3,1) 01570 MulAccumulate(2,2) 01571 MulAccumulate(1,3) 01572 MulAccumulate(0,4) 01573 MulStoreDigit(4) 01574 MulShiftCarry 01575 01576 MulAccumulate(5,0) 01577 MulAccumulate(4,1) 01578 MulAccumulate(3,2) 01579 MulAccumulate(2,3) 01580 MulAccumulate(1,4) 01581 MulAccumulate(0,5) 01582 MulStoreDigit(5) 01583 MulShiftCarry 01584 01585 MulAccumulate(6,0) 01586 MulAccumulate(5,1) 01587 MulAccumulate(4,2) 01588 MulAccumulate(3,3) 01589 MulAccumulate(2,4) 01590 MulAccumulate(1,5) 01591 MulAccumulate(0,6) 01592 MulStoreDigit(6) 01593 MulShiftCarry 01594 01595 MulAccumulate(7,0) 01596 MulAccumulate(6,1) 01597 MulAccumulate(5,2) 01598 MulAccumulate(4,3) 01599 MulAccumulate(3,4) 01600 MulAccumulate(2,5) 01601 MulAccumulate(1,6) 01602 MulAccumulate(0,7) 01603 MulStoreDigit(7) 01604 MulShiftCarry 01605 01606 MulAccumulate(7,1) 01607 MulAccumulate(6,2) 01608 MulAccumulate(5,3) 01609 MulAccumulate(4,4) 01610 MulAccumulate(3,5) 01611 MulAccumulate(2,6) 01612 MulAccumulate(1,7) 01613 MulStoreDigit(8) 01614 MulShiftCarry 01615 01616 MulAccumulate(7,2) 01617 MulAccumulate(6,3) 01618 MulAccumulate(5,4) 01619 MulAccumulate(4,5) 01620 MulAccumulate(3,6) 01621 MulAccumulate(2,7) 01622 MulStoreDigit(9) 01623 MulShiftCarry 01624 01625 MulAccumulate(7,3) 01626 MulAccumulate(6,4) 01627 MulAccumulate(5,5) 01628 MulAccumulate(4,6) 01629 MulAccumulate(3,7) 01630 MulStoreDigit(10) 01631 MulShiftCarry 01632 01633 MulAccumulate(7,4) 01634 MulAccumulate(6,5) 01635 MulAccumulate(5,6) 01636 MulAccumulate(4,7) 01637 MulStoreDigit(11) 01638 MulShiftCarry 01639 01640 MulAccumulate(7,5) 01641 MulAccumulate(6,6) 01642 MulAccumulate(5,7) 01643 MulStoreDigit(12) 01644 MulShiftCarry 01645 01646 MulAccumulate(7,6) 01647 MulAccumulate(6,7) 01648 MulStoreDigit(13) 01649 MulShiftCarry 01650 01651 MulLastDiagonal(8) 01652 MulEpilogue 01653 } 01654 01655 TAOCRYPT_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X, 01656 const word* Y) 01657 { 01658 MulPrologue 01659 // now: [esp] = Z, esi = X, ecx = Y 01660 MulStartup 01661 MulAccumulate(0,0) 01662 MulStoreDigit(0) 01663 MulShiftCarry 01664 01665 MulAccumulate(1,0) 01666 MulAccumulate(0,1) 01667 MulStoreDigit(1) 01668 MulShiftCarry 01669 01670 MulAccumulate(2,0) 01671 MulAccumulate(1,1) 01672 MulAccumulate(0,2) 01673 MulStoreDigit(2) 01674 MulShiftCarry 01675 01676 MulAccumulate(3,0) 01677 MulAccumulate(2,1) 01678 MulAccumulate(1,2) 01679 MulAccumulate(0,3) 01680 MulStoreDigit(3) 01681 MulShiftCarry 01682 01683 MulAccumulate(4,0) 01684 MulAccumulate(3,1) 01685 MulAccumulate(2,2) 01686 MulAccumulate(1,3) 01687 MulAccumulate(0,4) 01688 MulStoreDigit(4) 01689 MulShiftCarry 01690 01691 MulAccumulate(5,0) 01692 MulAccumulate(4,1) 01693 MulAccumulate(3,2) 01694 MulAccumulate(2,3) 01695 MulAccumulate(1,4) 01696 MulAccumulate(0,5) 01697 MulStoreDigit(5) 01698 MulShiftCarry 01699 01700 MulAccumulate(6,0) 01701 MulAccumulate(5,1) 01702 MulAccumulate(4,2) 01703 MulAccumulate(3,3) 01704 MulAccumulate(2,4) 01705 MulAccumulate(1,5) 01706 MulAccumulate(0,6) 01707 MulStoreDigit(6) 01708 MulShiftCarry 01709 01710 MulAccumulateBottom(7,0) 01711 MulAccumulateBottom(6,1) 01712 MulAccumulateBottom(5,2) 01713 MulAccumulateBottom(4,3) 01714 MulAccumulateBottom(3,4) 01715 MulAccumulateBottom(2,5) 01716 MulAccumulateBottom(1,6) 01717 MulAccumulateBottom(0,7) 01718 MulStoreDigit(7) 01719 MulEpilogue 01720 } 01721 01722 #undef AS1 01723 #undef AS2 01724 01725 #else // not x86 - no processor specific code at this layer 01726 01727 typedef Portable LowLevel; 01728 01729 #endif 01730 01731 #ifdef SSE2_INTRINSICS_AVAILABLE 01732 01733 #ifdef __GNUC__ 01734 #define TAOCRYPT_FASTCALL 01735 #else 01736 #define TAOCRYPT_FASTCALL __fastcall 01737 #endif 01738 01739 static void TAOCRYPT_FASTCALL P4_Mul(__m128i *C, const __m128i *A, 01740 const __m128i *B) 01741 { 01742 __m128i a3210 = _mm_load_si128(A); 01743 __m128i b3210 = _mm_load_si128(B); 01744 01745 __m128i sum; 01746 01747 __m128i z = _mm_setzero_si128(); 01748 __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210); 01749 C[0] = a2b2_a0b0; 01750 01751 __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0)); 01752 __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1)); 01753 __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021); 01754 __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z); 01755 __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z); 01756 C[1] = _mm_add_epi64(a1b0, a0b1); 01757 01758 __m128i a31 = _mm_srli_epi64(a3210, 32); 01759 __m128i b31 = _mm_srli_epi64(b3210, 32); 01760 __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31); 01761 C[6] = a3b3_a1b1; 01762 01763 __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z); 01764 __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2)); 01765 __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012); 01766 __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z); 01767 __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z); 01768 sum = _mm_add_epi64(a1b1, a0b2); 01769 C[2] = _mm_add_epi64(sum, a2b0); 01770 01771 __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1)); 01772 __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3)); 01773 __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012); 01774 __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103); 01775 __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z); 01776 __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z); 01777 __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z); 01778 __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z); 01779 __m128i sum1 = _mm_add_epi64(a3b0, a1b2); 01780 sum = _mm_add_epi64(a2b1, a0b3); 01781 C[3] = _mm_add_epi64(sum, sum1); 01782 01783 __m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103); 01784 __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z); 01785 __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z); 01786 __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z); 01787 sum = _mm_add_epi64(a2b2, a3b1); 01788 C[4] = _mm_add_epi64(sum, a1b3); 01789 01790 __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2)); 01791 __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3)); 01792 __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203); 01793 __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z); 01794 __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z); 01795 C[5] = _mm_add_epi64(a3b2, a2b3); 01796 } 01797 01798 void P4Optimized::Multiply4(word *C, const word *A, const word *B) 01799 { 01800 __m128i temp[7]; 01801 const word *w = (word *)temp; 01802 const __m64 *mw = (__m64 *)w; 01803 01804 P4_Mul(temp, (__m128i *)A, (__m128i *)B); 01805 01806 C[0] = w[0]; 01807 01808 __m64 s1, s2; 01809 01810 __m64 w1 = _mm_cvtsi32_si64(w[1]); 01811 __m64 w4 = mw[2]; 01812 __m64 w6 = mw[3]; 01813 __m64 w8 = mw[4]; 01814 __m64 w10 = mw[5]; 01815 __m64 w12 = mw[6]; 01816 __m64 w14 = mw[7]; 01817 __m64 w16 = mw[8]; 01818 __m64 w18 = mw[9]; 01819 __m64 w20 = mw[10]; 01820 __m64 w22 = mw[11]; 01821 __m64 w26 = _mm_cvtsi32_si64(w[26]); 01822 01823 s1 = _mm_add_si64(w1, w4); 01824 C[1] = _mm_cvtsi64_si32(s1); 01825 s1 = _mm_srli_si64(s1, 32); 01826 01827 s2 = _mm_add_si64(w6, w8); 01828 s1 = _mm_add_si64(s1, s2); 01829 C[2] = _mm_cvtsi64_si32(s1); 01830 s1 = _mm_srli_si64(s1, 32); 01831 01832 s2 = _mm_add_si64(w10, w12); 01833 s1 = _mm_add_si64(s1, s2); 01834 C[3] = _mm_cvtsi64_si32(s1); 01835 s1 = _mm_srli_si64(s1, 32); 01836 01837 s2 = _mm_add_si64(w14, w16); 01838 s1 = _mm_add_si64(s1, s2); 01839 C[4] = _mm_cvtsi64_si32(s1); 01840 s1 = _mm_srli_si64(s1, 32); 01841 01842 s2 = _mm_add_si64(w18, w20); 01843 s1 = _mm_add_si64(s1, s2); 01844 C[5] = _mm_cvtsi64_si32(s1); 01845 s1 = _mm_srli_si64(s1, 32); 01846 01847 s2 = _mm_add_si64(w22, w26); 01848 s1 = _mm_add_si64(s1, s2); 01849 C[6] = _mm_cvtsi64_si32(s1); 01850 s1 = _mm_srli_si64(s1, 32); 01851 01852 C[7] = _mm_cvtsi64_si32(s1) + w[27]; 01853 _mm_empty(); 01854 } 01855 01856 void P4Optimized::Multiply8(word *C, const word *A, const word *B) 01857 { 01858 __m128i temp[28]; 01859 const word *w = (word *)temp; 01860 const __m64 *mw = (__m64 *)w; 01861 const word *x = (word *)temp+7*4; 01862 const __m64 *mx = (__m64 *)x; 01863 const word *y = (word *)temp+7*4*2; 01864 const __m64 *my = (__m64 *)y; 01865 const word *z = (word *)temp+7*4*3; 01866 const __m64 *mz = (__m64 *)z; 01867 01868 P4_Mul(temp, (__m128i *)A, (__m128i *)B); 01869 01870 P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B); 01871 01872 P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1); 01873 01874 P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1); 01875 01876 C[0] = w[0]; 01877 01878 __m64 s1, s2, s3, s4; 01879 01880 __m64 w1 = _mm_cvtsi32_si64(w[1]); 01881 __m64 w4 = mw[2]; 01882 __m64 w6 = mw[3]; 01883 __m64 w8 = mw[4]; 01884 __m64 w10 = mw[5]; 01885 __m64 w12 = mw[6]; 01886 __m64 w14 = mw[7]; 01887 __m64 w16 = mw[8]; 01888 __m64 w18 = mw[9]; 01889 __m64 w20 = mw[10]; 01890 __m64 w22 = mw[11]; 01891 __m64 w26 = _mm_cvtsi32_si64(w[26]); 01892 __m64 w27 = _mm_cvtsi32_si64(w[27]); 01893 01894 __m64 x0 = _mm_cvtsi32_si64(x[0]); 01895 __m64 x1 = _mm_cvtsi32_si64(x[1]); 01896 __m64 x4 = mx[2]; 01897 __m64 x6 = mx[3]; 01898 __m64 x8 = mx[4]; 01899 __m64 x10 = mx[5]; 01900 __m64 x12 = mx[6]; 01901 __m64 x14 = mx[7]; 01902 __m64 x16 = mx[8]; 01903 __m64 x18 = mx[9]; 01904 __m64 x20 = mx[10]; 01905 __m64 x22 = mx[11]; 01906 __m64 x26 = _mm_cvtsi32_si64(x[26]); 01907 __m64 x27 = _mm_cvtsi32_si64(x[27]); 01908 01909 __m64 y0 = _mm_cvtsi32_si64(y[0]); 01910 __m64 y1 = _mm_cvtsi32_si64(y[1]); 01911 __m64 y4 = my[2]; 01912 __m64 y6 = my[3]; 01913 __m64 y8 = my[4]; 01914 __m64 y10 = my[5]; 01915 __m64 y12 = my[6]; 01916 __m64 y14 = my[7]; 01917 __m64 y16 = my[8]; 01918 __m64 y18 = my[9]; 01919 __m64 y20 = my[10]; 01920 __m64 y22 = my[11]; 01921 __m64 y26 = _mm_cvtsi32_si64(y[26]); 01922 __m64 y27 = _mm_cvtsi32_si64(y[27]); 01923 01924 __m64 z0 = _mm_cvtsi32_si64(z[0]); 01925 __m64 z1 = _mm_cvtsi32_si64(z[1]); 01926 __m64 z4 = mz[2]; 01927 __m64 z6 = mz[3]; 01928 __m64 z8 = mz[4]; 01929 __m64 z10 = mz[5]; 01930 __m64 z12 = mz[6]; 01931 __m64 z14 = mz[7]; 01932 __m64 z16 = mz[8]; 01933 __m64 z18 = mz[9]; 01934 __m64 z20 = mz[10]; 01935 __m64 z22 = mz[11]; 01936 __m64 z26 = _mm_cvtsi32_si64(z[26]); 01937 01938 s1 = _mm_add_si64(w1, w4); 01939 C[1] = _mm_cvtsi64_si32(s1); 01940 s1 = _mm_srli_si64(s1, 32); 01941 01942 s2 = _mm_add_si64(w6, w8); 01943 s1 = _mm_add_si64(s1, s2); 01944 C[2] = _mm_cvtsi64_si32(s1); 01945 s1 = _mm_srli_si64(s1, 32); 01946 01947 s2 = _mm_add_si64(w10, w12); 01948 s1 = _mm_add_si64(s1, s2); 01949 C[3] = _mm_cvtsi64_si32(s1); 01950 s1 = _mm_srli_si64(s1, 32); 01951 01952 s3 = _mm_add_si64(x0, y0); 01953 s2 = _mm_add_si64(w14, w16); 01954 s1 = _mm_add_si64(s1, s3); 01955 s1 = _mm_add_si64(s1, s2); 01956 C[4] = _mm_cvtsi64_si32(s1); 01957 s1 = _mm_srli_si64(s1, 32); 01958 01959 s3 = _mm_add_si64(x1, y1); 01960 s4 = _mm_add_si64(x4, y4); 01961 s1 = _mm_add_si64(s1, w18); 01962 s3 = _mm_add_si64(s3, s4); 01963 s1 = _mm_add_si64(s1, w20); 01964 s1 = _mm_add_si64(s1, s3); 01965 C[5] = _mm_cvtsi64_si32(s1); 01966 s1 = _mm_srli_si64(s1, 32); 01967 01968 s3 = _mm_add_si64(x6, y6); 01969 s4 = _mm_add_si64(x8, y8); 01970 s1 = _mm_add_si64(s1, w22); 01971 s3 = _mm_add_si64(s3, s4); 01972 s1 = _mm_add_si64(s1, w26); 01973 s1 = _mm_add_si64(s1, s3); 01974 C[6] = _mm_cvtsi64_si32(s1); 01975 s1 = _mm_srli_si64(s1, 32); 01976 01977 s3 = _mm_add_si64(x10, y10); 01978 s4 = _mm_add_si64(x12, y12); 01979 s1 = _mm_add_si64(s1, w27); 01980 s3 = _mm_add_si64(s3, s4); 01981 s1 = _mm_add_si64(s1, s3); 01982 C[7] = _mm_cvtsi64_si32(s1); 01983 s1 = _mm_srli_si64(s1, 32); 01984 01985 s3 = _mm_add_si64(x14, y14); 01986 s4 = _mm_add_si64(x16, y16); 01987 s1 = _mm_add_si64(s1, z0); 01988 s3 = _mm_add_si64(s3, s4); 01989 s1 = _mm_add_si64(s1, s3); 01990 C[8] = _mm_cvtsi64_si32(s1); 01991 s1 = _mm_srli_si64(s1, 32); 01992 01993 s3 = _mm_add_si64(x18, y18); 01994 s4 = _mm_add_si64(x20, y20); 01995 s1 = _mm_add_si64(s1, z1); 01996 s3 = _mm_add_si64(s3, s4); 01997 s1 = _mm_add_si64(s1, z4); 01998 s1 = _mm_add_si64(s1, s3); 01999 C[9] = _mm_cvtsi64_si32(s1); 02000 s1 = _mm_srli_si64(s1, 32); 02001 02002 s3 = _mm_add_si64(x22, y22); 02003 s4 = _mm_add_si64(x26, y26); 02004 s1 = _mm_add_si64(s1, z6); 02005 s3 = _mm_add_si64(s3, s4); 02006 s1 = _mm_add_si64(s1, z8); 02007 s1 = _mm_add_si64(s1, s3); 02008 C[10] = _mm_cvtsi64_si32(s1); 02009 s1 = _mm_srli_si64(s1, 32); 02010 02011 s3 = _mm_add_si64(x27, y27); 02012 s1 = _mm_add_si64(s1, z10); 02013 s1 = _mm_add_si64(s1, z12); 02014 s1 = _mm_add_si64(s1, s3); 02015 C[11] = _mm_cvtsi64_si32(s1); 02016 s1 = _mm_srli_si64(s1, 32); 02017 02018 s3 = _mm_add_si64(z14, z16); 02019 s1 = _mm_add_si64(s1, s3); 02020 C[12] = _mm_cvtsi64_si32(s1); 02021 s1 = _mm_srli_si64(s1, 32); 02022 02023 s3 = _mm_add_si64(z18, z20); 02024 s1 = _mm_add_si64(s1, s3); 02025 C[13] = _mm_cvtsi64_si32(s1); 02026 s1 = _mm_srli_si64(s1, 32); 02027 02028 s3 = _mm_add_si64(z22, z26); 02029 s1 = _mm_add_si64(s1, s3); 02030 C[14] = _mm_cvtsi64_si32(s1); 02031 s1 = _mm_srli_si64(s1, 32); 02032 02033 C[15] = z[27] + _mm_cvtsi64_si32(s1); 02034 _mm_empty(); 02035 } 02036 02037 void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B) 02038 { 02039 __m128i temp[21]; 02040 const word *w = (word *)temp; 02041 const __m64 *mw = (__m64 *)w; 02042 const word *x = (word *)temp+7*4; 02043 const __m64 *mx = (__m64 *)x; 02044 const word *y = (word *)temp+7*4*2; 02045 const __m64 *my = (__m64 *)y; 02046 02047 P4_Mul(temp, (__m128i *)A, (__m128i *)B); 02048 02049 P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B); 02050 02051 P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1); 02052 02053 C[0] = w[0]; 02054 02055 __m64 s1, s2, s3, s4; 02056 02057 __m64 w1 = _mm_cvtsi32_si64(w[1]); 02058 __m64 w4 = mw[2]; 02059 __m64 w6 = mw[3]; 02060 __m64 w8 = mw[4]; 02061 __m64 w10 = mw[5]; 02062 __m64 w12 = mw[6]; 02063 __m64 w14 = mw[7]; 02064 __m64 w16 = mw[8]; 02065 __m64 w18 = mw[9]; 02066 __m64 w20 = mw[10]; 02067 __m64 w22 = mw[11]; 02068 __m64 w26 = _mm_cvtsi32_si64(w[26]); 02069 02070 __m64 x0 = _mm_cvtsi32_si64(x[0]); 02071 __m64 x1 = _mm_cvtsi32_si64(x[1]); 02072 __m64 x4 = mx[2]; 02073 __m64 x6 = mx[3]; 02074 __m64 x8 = mx[4]; 02075 02076 __m64 y0 = _mm_cvtsi32_si64(y[0]); 02077 __m64 y1 = _mm_cvtsi32_si64(y[1]); 02078 __m64 y4 = my[2]; 02079 __m64 y6 = my[3]; 02080 __m64 y8 = my[4]; 02081 02082 s1 = _mm_add_si64(w1, w4); 02083 C[1] = _mm_cvtsi64_si32(s1); 02084 s1 = _mm_srli_si64(s1, 32); 02085 02086 s2 = _mm_add_si64(w6, w8); 02087 s1 = _mm_add_si64(s1, s2); 02088 C[2] = _mm_cvtsi64_si32(s1); 02089 s1 = _mm_srli_si64(s1, 32); 02090 02091 s2 = _mm_add_si64(w10, w12); 02092 s1 = _mm_add_si64(s1, s2); 02093 C[3] = _mm_cvtsi64_si32(s1); 02094 s1 = _mm_srli_si64(s1, 32); 02095 02096 s3 = _mm_add_si64(x0, y0); 02097 s2 = _mm_add_si64(w14, w16); 02098 s1 = _mm_add_si64(s1, s3); 02099 s1 = _mm_add_si64(s1, s2); 02100 C[4] = _mm_cvtsi64_si32(s1); 02101 s1 = _mm_srli_si64(s1, 32); 02102 02103 s3 = _mm_add_si64(x1, y1); 02104 s4 = _mm_add_si64(x4, y4); 02105 s1 = _mm_add_si64(s1, w18); 02106 s3 = _mm_add_si64(s3, s4); 02107 s1 = _mm_add_si64(s1, w20); 02108 s1 = _mm_add_si64(s1, s3); 02109 C[5] = _mm_cvtsi64_si32(s1); 02110 s1 = _mm_srli_si64(s1, 32); 02111 02112 s3 = _mm_add_si64(x6, y6); 02113 s4 = _mm_add_si64(x8, y8); 02114 s1 = _mm_add_si64(s1, w22); 02115 s3 = _mm_add_si64(s3, s4); 02116 s1 = _mm_add_si64(s1, w26); 02117 s1 = _mm_add_si64(s1, s3); 02118 C[6] = _mm_cvtsi64_si32(s1); 02119 s1 = _mm_srli_si64(s1, 32); 02120 02121 C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12]; 02122 _mm_empty(); 02123 } 02124 02125 #endif // #ifdef SSE2_INTRINSICS_AVAILABLE 02126 02127 // end optimized 02128 02129 // ******************************************************** 02130 02131 #define A0 A 02132 #define A1 (A+N2) 02133 #define B0 B 02134 #define B1 (B+N2) 02135 02136 #define T0 T 02137 #define T1 (T+N2) 02138 #define T2 (T+N) 02139 #define T3 (T+N+N2) 02140 02141 #define R0 R 02142 #define R1 (R+N2) 02143 #define R2 (R+N) 02144 #define R3 (R+N+N2) 02145 02146 //VC60 workaround: compiler bug triggered without the extra dummy parameters 02147 02148 // R[2*N] - result = A*B 02149 // T[2*N] - temporary work space 02150 // A[N] --- multiplier 02151 // B[N] --- multiplicant 02152 02153 02154 void RecursiveMultiply(word *R, word *T, const word *A, const word *B, 02155 unsigned int N) 02156 { 02157 assert(N>=2 && N%2==0); 02158 02159 if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8) 02160 LowLevel::Multiply8(R, A, B); 02161 else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4) 02162 LowLevel::Multiply4(R, A, B); 02163 else if (N==2) 02164 LowLevel::Multiply2(R, A, B); 02165 else 02166 { 02167 const unsigned int N2 = N/2; 02168 int carry; 02169 02170 int aComp = Compare(A0, A1, N2); 02171 int bComp = Compare(B0, B1, N2); 02172 02173 switch (2*aComp + aComp + bComp) 02174 { 02175 case -4: 02176 LowLevel::Subtract(R0, A1, A0, N2); 02177 LowLevel::Subtract(R1, B0, B1, N2); 02178 RecursiveMultiply(T0, T2, R0, R1, N2); 02179 LowLevel::Subtract(T1, T1, R0, N2); 02180 carry = -1; 02181 break; 02182 case -2: 02183 LowLevel::Subtract(R0, A1, A0, N2); 02184 LowLevel::Subtract(R1, B0, B1, N2); 02185 RecursiveMultiply(T0, T2, R0, R1, N2); 02186 carry = 0; 02187 break; 02188 case 2: 02189 LowLevel::Subtract(R0, A0, A1, N2); 02190 LowLevel::Subtract(R1, B1, B0, N2); 02191 RecursiveMultiply(T0, T2, R0, R1, N2); 02192 carry = 0; 02193 break; 02194 case 4: 02195 LowLevel::Subtract(R0, A1, A0, N2); 02196 LowLevel::Subtract(R1, B0, B1, N2); 02197 RecursiveMultiply(T0, T2, R0, R1, N2); 02198 LowLevel::Subtract(T1, T1, R1, N2); 02199 carry = -1; 02200 break; 02201 default: 02202 SetWords(T0, 0, N); 02203 carry = 0; 02204 } 02205 02206 RecursiveMultiply(R0, T2, A0, B0, N2); 02207 RecursiveMultiply(R2, T2, A1, B1, N2); 02208 02209 // now T[01] holds (A1-A0)*(B0-B1),R[01] holds A0*B0, R[23] holds A1*B1 02210 02211 carry += LowLevel::Add(T0, T0, R0, N); 02212 carry += LowLevel::Add(T0, T0, R2, N); 02213 carry += LowLevel::Add(R1, R1, T0, N); 02214 02215 assert (carry >= 0 && carry <= 2); 02216 Increment(R3, N2, carry); 02217 } 02218 } 02219 02220 02221 void RecursiveSquare(word *R, word *T, const word *A, unsigned int N) 02222 { 02223 assert(N && N%2==0); 02224 if (LowLevel::SquareRecursionLimit() >= 8 && N==8) 02225 LowLevel::Square8(R, A); 02226 if (LowLevel::SquareRecursionLimit() >= 4 && N==4) 02227 LowLevel::Square4(R, A); 02228 else if (N==2) 02229 LowLevel::Square2(R, A); 02230 else 02231 { 02232 const unsigned int N2 = N/2; 02233 02234 RecursiveSquare(R0, T2, A0, N2); 02235 RecursiveSquare(R2, T2, A1, N2); 02236 RecursiveMultiply(T0, T2, A0, A1, N2); 02237 02238 word carry = LowLevel::Add(R1, R1, T0, N); 02239 carry += LowLevel::Add(R1, R1, T0, N); 02240 Increment(R3, N2, carry); 02241 } 02242 } 02243 02244 02245 // R[N] - bottom half of A*B 02246 // T[N] - temporary work space 02247 // A[N] - multiplier 02248 // B[N] - multiplicant 02249 02250 02251 void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B, 02252 unsigned int N) 02253 { 02254 assert(N>=2 && N%2==0); 02255 if (LowLevel::MultiplyBottomRecursionLimit() >= 8 && N==8) 02256 LowLevel::Multiply8Bottom(R, A, B); 02257 else if (LowLevel::MultiplyBottomRecursionLimit() >= 4 && N==4) 02258 LowLevel::Multiply4Bottom(R, A, B); 02259 else if (N==2) 02260 LowLevel::Multiply2Bottom(R, A, B); 02261 else 02262 { 02263 const unsigned int N2 = N/2; 02264 02265 RecursiveMultiply(R, T, A0, B0, N2); 02266 RecursiveMultiplyBottom(T0, T1, A1, B0, N2); 02267 LowLevel::Add(R1, R1, T0, N2); 02268 RecursiveMultiplyBottom(T0, T1, A0, B1, N2); 02269 LowLevel::Add(R1, R1, T0, N2); 02270 } 02271 } 02272 02273 02274 void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, 02275 const word *B, unsigned int N) 02276 { 02277 assert(N>=2 && N%2==0); 02278 02279 if (N==4) 02280 { 02281 LowLevel::Multiply4(T, A, B); 02282 memcpy(R, T+4, 4*WORD_SIZE); 02283 } 02284 else if (N==2) 02285 { 02286 LowLevel::Multiply2(T, A, B); 02287 memcpy(R, T+2, 2*WORD_SIZE); 02288 } 02289 else 02290 { 02291 const unsigned int N2 = N/2; 02292 int carry; 02293 02294 int aComp = Compare(A0, A1, N2); 02295 int bComp = Compare(B0, B1, N2); 02296 02297 switch (2*aComp + aComp + bComp) 02298 { 02299 case -4: 02300 LowLevel::Subtract(R0, A1, A0, N2); 02301 LowLevel::Subtract(R1, B0, B1, N2); 02302 RecursiveMultiply(T0, T2, R0, R1, N2); 02303 LowLevel::Subtract(T1, T1, R0, N2); 02304 carry = -1; 02305 break; 02306 case -2: 02307 LowLevel::Subtract(R0, A1, A0, N2); 02308 LowLevel::Subtract(R1, B0, B1, N2); 02309 RecursiveMultiply(T0, T2, R0, R1, N2); 02310 carry = 0; 02311 break; 02312 case 2: 02313 LowLevel::Subtract(R0, A0, A1, N2); 02314 LowLevel::Subtract(R1, B1, B0, N2); 02315 RecursiveMultiply(T0, T2, R0, R1, N2); 02316 carry = 0; 02317 break; 02318 case 4: 02319 LowLevel::Subtract(R0, A1, A0, N2); 02320 LowLevel::Subtract(R1, B0, B1, N2); 02321 RecursiveMultiply(T0, T2, R0, R1, N2); 02322 LowLevel::Subtract(T1, T1, R1, N2); 02323 carry = -1; 02324 break; 02325 default: 02326 SetWords(T0, 0, N); 02327 carry = 0; 02328 } 02329 02330 RecursiveMultiply(T2, R0, A1, B1, N2); 02331 02332 // now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1 02333 02334 word c2 = LowLevel::Subtract(R0, L+N2, L, N2); 02335 c2 += LowLevel::Subtract(R0, R0, T0, N2); 02336 word t = (Compare(R0, T2, N2) == -1); 02337 02338 carry += t; 02339 carry += Increment(R0, N2, c2+t); 02340 carry += LowLevel::Add(R0, R0, T1, N2); 02341 carry += LowLevel::Add(R0, R0, T3, N2); 02342 assert (carry >= 0 && carry <= 2); 02343 02344 CopyWords(R1, T3, N2); 02345 Increment(R1, N2, carry); 02346 } 02347 } 02348 02349 02350 inline word Add(word *C, const word *A, const word *B, unsigned int N) 02351 { 02352 return LowLevel::Add(C, A, B, N); 02353 } 02354 02355 inline word Subtract(word *C, const word *A, const word *B, unsigned int N) 02356 { 02357 return LowLevel::Subtract(C, A, B, N); 02358 } 02359 02360 inline void Multiply(word *R, word *T, const word *A, const word *B, 02361 unsigned int N) 02362 { 02363 RecursiveMultiply(R, T, A, B, N); 02364 } 02365 02366 inline void Square(word *R, word *T, const word *A, unsigned int N) 02367 { 02368 RecursiveSquare(R, T, A, N); 02369 } 02370 02371 02372 void AsymmetricMultiply(word *R, word *T, const word *A, unsigned int NA, 02373 const word *B, unsigned int NB) 02374 { 02375 if (NA == NB) 02376 { 02377 if (A == B) 02378 Square(R, T, A, NA); 02379 else 02380 Multiply(R, T, A, B, NA); 02381 02382 return; 02383 } 02384 02385 if (NA > NB) 02386 { 02387 mySTL::swap(A, B); 02388 mySTL::swap(NA, NB); 02389 } 02390 02391 assert(NB % NA == 0); 02392 assert((NB/NA)%2 == 0); // NB is an even multiple of NA 02393 02394 if (NA==2 && !A[1]) 02395 { 02396 switch (A[0]) 02397 { 02398 case 0: 02399 SetWords(R, 0, NB+2); 02400 return; 02401 case 1: 02402 CopyWords(R, B, NB); 02403 R[NB] = R[NB+1] = 0; 02404 return; 02405 default: 02406 R[NB] = LinearMultiply(R, B, A[0], NB); 02407 R[NB+1] = 0; 02408 return; 02409 } 02410 } 02411 02412 Multiply(R, T, A, B, NA); 02413 CopyWords(T+2*NA, R+NA, NA); 02414 02415 unsigned i; 02416 02417 for (i=2*NA; i<NB; i+=2*NA) 02418 Multiply(T+NA+i, T, A, B+i, NA); 02419 for (i=NA; i<NB; i+=2*NA) 02420 Multiply(R+i, T, A, B+i, NA); 02421 02422 if (Add(R+NA, R+NA, T+2*NA, NB-NA)) 02423 Increment(R+NB, NA); 02424 } 02425 02426 02427 void PositiveMultiply(Integer& product, const Integer& a, const Integer& b) 02428 { 02429 unsigned int aSize = RoundupSize(a.WordCount()); 02430 unsigned int bSize = RoundupSize(b.WordCount()); 02431 02432 product.reg_.CleanNew(RoundupSize(aSize + bSize)); 02433 product.sign_ = Integer::POSITIVE; 02434 02435 AlignedWordBlock workspace(aSize + bSize); 02436 AsymmetricMultiply(product.reg_.get_buffer(), workspace.get_buffer(), 02437 a.reg_.get_buffer(), aSize, b.reg_.get_buffer(), bSize); 02438 } 02439 02440 void Multiply(Integer &product, const Integer &a, const Integer &b) 02441 { 02442 PositiveMultiply(product, a, b); 02443 02444 if (a.NotNegative() != b.NotNegative()) 02445 product.Negate(); 02446 } 02447 02448 02449 static inline unsigned int EvenWordCount(const word *X, unsigned int N) 02450 { 02451 while (N && X[N-2]==0 && X[N-1]==0) 02452 N-=2; 02453 return N; 02454 } 02455 02456 02457 unsigned int AlmostInverse(word *R, word *T, const word *A, unsigned int NA, 02458 const word *M, unsigned int N) 02459 { 02460 assert(NA<=N && N && N%2==0); 02461 02462 word *b = T; 02463 word *c = T+N; 02464 word *f = T+2*N; 02465 word *g = T+3*N; 02466 unsigned int bcLen=2, fgLen=EvenWordCount(M, N); 02467 unsigned int k=0, s=0; 02468 02469 SetWords(T, 0, 3*N); 02470 b[0]=1; 02471 CopyWords(f, A, NA); 02472 CopyWords(g, M, N); 02473 02474 while (1) 02475 { 02476 word t=f[0]; 02477 while (!t) 02478 { 02479 if (EvenWordCount(f, fgLen)==0) 02480 { 02481 SetWords(R, 0, N); 02482 return 0; 02483 } 02484 02485 ShiftWordsRightByWords(f, fgLen, 1); 02486 if (c[bcLen-1]) bcLen+=2; 02487 assert(bcLen <= N); 02488 ShiftWordsLeftByWords(c, bcLen, 1); 02489 k+=WORD_BITS; 02490 t=f[0]; 02491 } 02492 02493 unsigned int i=0; 02494 while (t%2 == 0) 02495 { 02496 t>>=1; 02497 i++; 02498 } 02499 k+=i; 02500 02501 if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2) 02502 { 02503 if (s%2==0) 02504 CopyWords(R, b, N); 02505 else 02506 Subtract(R, M, b, N); 02507 return k; 02508 } 02509 02510 ShiftWordsRightByBits(f, fgLen, i); 02511 t=ShiftWordsLeftByBits(c, bcLen, i); 02512 if (t) 02513 { 02514 c[bcLen] = t; 02515 bcLen+=2; 02516 assert(bcLen <= N); 02517 } 02518 02519 if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0) 02520 fgLen-=2; 02521 02522 if (Compare(f, g, fgLen)==-1) 02523 { 02524 mySTL::swap(f, g); 02525 mySTL::swap(b, c); 02526 s++; 02527 } 02528 02529 Subtract(f, f, g, fgLen); 02530 02531 if (Add(b, b, c, bcLen)) 02532 { 02533 b[bcLen] = 1; 02534 bcLen+=2; 02535 assert(bcLen <= N); 02536 } 02537 } 02538 } 02539 02540 // R[N] - result = A/(2^k) mod M 02541 // A[N] - input 02542 // M[N] - modulus 02543 02544 void DivideByPower2Mod(word *R, const word *A, unsigned int k, const word *M, 02545 unsigned int N) 02546 { 02547 CopyWords(R, A, N); 02548 02549 while (k--) 02550 { 02551 if (R[0]%2==0) 02552 ShiftWordsRightByBits(R, N, 1); 02553 else 02554 { 02555 word carry = Add(R, R, M, N); 02556 ShiftWordsRightByBits(R, N, 1); 02557 R[N-1] += carry<<(WORD_BITS-1); 02558 } 02559 } 02560 } 02561 02562 // R[N] - result = A*(2^k) mod M 02563 // A[N] - input 02564 // M[N] - modulus 02565 02566 void MultiplyByPower2Mod(word *R, const word *A, unsigned int k, const word *M, 02567 unsigned int N) 02568 { 02569 CopyWords(R, A, N); 02570 02571 while (k--) 02572 if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0) 02573 Subtract(R, R, M, N); 02574 } 02575 02576 02577 // ********** end of integer needs 02578 02579 02580 Integer::Integer() 02581 : reg_(2), sign_(POSITIVE) 02582 { 02583 reg_[0] = reg_[1] = 0; 02584 } 02585 02586 02587 Integer::Integer(const Integer& t) 02588 : reg_(RoundupSize(t.WordCount())), sign_(t.sign_) 02589 { 02590 CopyWords(reg_.get_buffer(), t.reg_.get_buffer(), reg_.size()); 02591 } 02592 02593 02594 Integer::Integer(signed long value) 02595 : reg_(2) 02596 { 02597 if (value >= 0) 02598 sign_ = POSITIVE; 02599 else 02600 { 02601 sign_ = NEGATIVE; 02602 value = -value; 02603 } 02604 reg_[0] = word(value); 02605 reg_[1] = word(SafeRightShift<WORD_BITS, unsigned long>(value)); 02606 } 02607 02608 02609 Integer::Integer(Sign s, word high, word low) 02610 : reg_(2), sign_(s) 02611 { 02612 reg_[0] = low; 02613 reg_[1] = high; 02614 } 02615 02616 02617 Integer::Integer(word value, unsigned int length) 02618 : reg_(RoundupSize(length)), sign_(POSITIVE) 02619 { 02620 reg_[0] = value; 02621 SetWords(reg_ + 1, 0, reg_.size() - 1); 02622 } 02623 02624 02625 Integer::Integer(const byte *encodedInteger, unsigned int byteCount, 02626 Signedness s) 02627 { 02628 Decode(encodedInteger, byteCount, s); 02629 } 02630 02631 class BadBER {}; 02632 02633 // BER Decode Source 02634 Integer::Integer(Source& source) 02635 : reg_(2), sign_(POSITIVE) 02636 { 02637 Decode(source); 02638 } 02639 02640 void Integer::Decode(Source& source) 02641 { 02642 byte b = source.next(); 02643 if (b != INTEGER) { 02644 source.SetError(INTEGER_E); 02645 return; 02646 } 02647 02648 word32 length = GetLength(source); 02649 02650 if ( (b = source.next()) == 0x00) 02651 length--; 02652 else 02653 source.prev(); 02654 02655 unsigned int words = (length + WORD_SIZE - 1) / WORD_SIZE; 02656 words = RoundupSize(words); 02657 if (words > reg_.size()) reg_.CleanNew(words); 02658 02659 for (int j = length; j > 0; j--) { 02660 b = source.next(); 02661 reg_ [(j-1) / WORD_SIZE] |= (word)b << ((j-1) % WORD_SIZE) * 8; 02662 } 02663 } 02664 02665 02666 void Integer::Decode(const byte* input, unsigned int inputLen, Signedness s) 02667 { 02668 unsigned int idx(0); 02669 byte b = input[idx++]; 02670 sign_ = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE; 02671 02672 while (inputLen>0 && (sign_==POSITIVE ? b==0 : b==0xff)) 02673 { 02674 inputLen--; 02675 b = input[idx++]; 02676 } 02677 02678 reg_.CleanNew(RoundupSize(BytesToWords(inputLen))); 02679 02680 --idx; 02681 for (unsigned int i=inputLen; i > 0; i--) 02682 { 02683 b = input[idx++]; 02684 reg_[(i-1)/WORD_SIZE] |= (word)b << ((i-1)%WORD_SIZE)*8; 02685 } 02686 02687 if (sign_ == NEGATIVE) 02688 { 02689 for (unsigned i=inputLen; i<reg_.size()*WORD_SIZE; i++) 02690 reg_[i/WORD_SIZE] |= (word)0xff << (i%WORD_SIZE)*8; 02691 TwosComplement(reg_.get_buffer(), reg_.size()); 02692 } 02693 } 02694 02695 02696 unsigned int Integer::Encode(byte* output, unsigned int outputLen, 02697 Signedness signedness) const 02698 { 02699 unsigned int idx(0); 02700 if (signedness == UNSIGNED || NotNegative()) 02701 { 02702 for (unsigned int i=outputLen; i > 0; i--) 02703 output[idx++] = GetByte(i-1); 02704 } 02705 else 02706 { 02707 // take two's complement of *this 02708 Integer temp = Integer::Power2(8*max(ByteCount(), outputLen)) + *this; 02709 for (unsigned i=0; i<outputLen; i++) 02710 output[idx++] = temp.GetByte(outputLen-i-1); 02711 } 02712 return outputLen; 02713 } 02714 02715 02716 static Integer* zero = 0; 02717 02718 const Integer &Integer::Zero() 02719 { 02720 if (!zero) 02721 zero = NEW_TC Integer; 02722 return *zero; 02723 } 02724 02725 02726 static Integer* one = 0; 02727 02728 const Integer &Integer::One() 02729 { 02730 if (!one) 02731 one = NEW_TC Integer(1,2); 02732 return *one; 02733 } 02734 02735 02736 // Clean up static singleton holders, not a leak, but helpful to have gone 02737 // when checking for leaks 02738 void CleanUp() 02739 { 02740 tcDelete(one); 02741 tcDelete(zero); 02742 02743 // In case user calls more than once, prevent seg fault 02744 one = 0; 02745 zero = 0; 02746 } 02747 02748 Integer::Integer(RandomNumberGenerator& rng, const Integer& min, 02749 const Integer& max) 02750 { 02751 Randomize(rng, min, max); 02752 } 02753 02754 02755 void Integer::Randomize(RandomNumberGenerator& rng, unsigned int nbits) 02756 { 02757 const unsigned int nbytes = nbits/8 + 1; 02758 ByteBlock buf(nbytes); 02759 rng.GenerateBlock(buf.get_buffer(), nbytes); 02760 if (nbytes) 02761 buf[0] = (byte)Crop(buf[0], nbits % 8); 02762 Decode(buf.get_buffer(), nbytes, UNSIGNED); 02763 } 02764 02765 void Integer::Randomize(RandomNumberGenerator& rng, const Integer& min, 02766 const Integer& max) 02767 { 02768 assert(min <= max); 02769 02770 Integer range = max - min; 02771 const unsigned int nbits = range.BitCount(); 02772 02773 do 02774 { 02775 Randomize(rng, nbits); 02776 } 02777 while (*this > range); 02778 02779 *this += min; 02780 } 02781 02782 02783 Integer Integer::Power2(unsigned int e) 02784 { 02785 Integer r((word)0, BitsToWords(e + 1)); 02786 r.SetBit(e); 02787 return r; 02788 } 02789 02790 02791 void Integer::SetBit(unsigned int n, bool value) 02792 { 02793 if (value) 02794 { 02795 reg_.CleanGrow(RoundupSize(BitsToWords(n + 1))); 02796 reg_[n / WORD_BITS] |= (word(1) << (n % WORD_BITS)); 02797 } 02798 else 02799 { 02800 if (n / WORD_BITS < reg_.size()) 02801 reg_[n / WORD_BITS] &= ~(word(1) << (n % WORD_BITS)); 02802 } 02803 } 02804 02805 02806 void Integer::SetByte(unsigned int n, byte value) 02807 { 02808 reg_.CleanGrow(RoundupSize(BytesToWords(n+1))); 02809 reg_[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE)); 02810 reg_[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE)); 02811 } 02812 02813 02814 void Integer::Negate() 02815 { 02816 if (!!(*this)) // don't flip sign if *this==0 02817 sign_ = Sign(1 - sign_); 02818 } 02819 02820 02821 bool Integer::operator!() const 02822 { 02823 return IsNegative() ? false : (reg_[0]==0 && WordCount()==0); 02824 } 02825 02826 02827 Integer& Integer::operator=(const Integer& t) 02828 { 02829 if (this != &t) 02830 { 02831 reg_.New(RoundupSize(t.WordCount())); 02832 CopyWords(reg_.get_buffer(), t.reg_.get_buffer(), reg_.size()); 02833 sign_ = t.sign_; 02834 } 02835 return *this; 02836 } 02837 02838 02839 Integer& Integer::operator+=(const Integer& t) 02840 { 02841 reg_.CleanGrow(t.reg_.size()); 02842 if (NotNegative()) 02843 { 02844 if (t.NotNegative()) 02845 PositiveAdd(*this, *this, t); 02846 else 02847 PositiveSubtract(*this, *this, t); 02848 } 02849 else 02850 { 02851 if (t.NotNegative()) 02852 PositiveSubtract(*this, t, *this); 02853 else 02854 { 02855 PositiveAdd(*this, *this, t); 02856 sign_ = Integer::NEGATIVE; 02857 } 02858 } 02859 return *this; 02860 } 02861 02862 02863 Integer Integer::operator-() const 02864 { 02865 Integer result(*this); 02866 result.Negate(); 02867 return result; 02868 } 02869 02870 02871 Integer& Integer::operator-=(const Integer& t) 02872 { 02873 reg_.CleanGrow(t.reg_.size()); 02874 if (NotNegative()) 02875 { 02876 if (t.NotNegative()) 02877 PositiveSubtract(*this, *this, t); 02878 else 02879 PositiveAdd(*this, *this, t); 02880 } 02881 else 02882 { 02883 if (t.NotNegative()) 02884 { 02885 PositiveAdd(*this, *this, t); 02886 sign_ = Integer::NEGATIVE; 02887 } 02888 else 02889 PositiveSubtract(*this, t, *this); 02890 } 02891 return *this; 02892 } 02893 02894 02895 Integer& Integer::operator++() 02896 { 02897 if (NotNegative()) 02898 { 02899 if (Increment(reg_.get_buffer(), reg_.size())) 02900 { 02901 reg_.CleanGrow(2*reg_.size()); 02902 reg_[reg_.size()/2]=1; 02903 } 02904 } 02905 else 02906 { 02907 word borrow = Decrement(reg_.get_buffer(), reg_.size()); 02908 assert(!borrow); 02909 if (WordCount()==0) 02910 *this = Zero(); 02911 } 02912 return *this; 02913 } 02914 02915 Integer& Integer::operator--() 02916 { 02917 if (IsNegative()) 02918 { 02919 if (Increment(reg_.get_buffer(), reg_.size())) 02920 { 02921 reg_.CleanGrow(2*reg_.size()); 02922 reg_[reg_.size()/2]=1; 02923 } 02924 } 02925 else 02926 { 02927 if (Decrement(reg_.get_buffer(), reg_.size())) 02928 *this = -One(); 02929 } 02930 return *this; 02931 } 02932 02933 02934 Integer& Integer::operator<<=(unsigned int n) 02935 { 02936 const unsigned int wordCount = WordCount(); 02937 const unsigned int shiftWords = n / WORD_BITS; 02938 const unsigned int shiftBits = n % WORD_BITS; 02939 02940 reg_.CleanGrow(RoundupSize(wordCount+BitsToWords(n))); 02941 ShiftWordsLeftByWords(reg_.get_buffer(), wordCount + shiftWords, 02942 shiftWords); 02943 ShiftWordsLeftByBits(reg_+shiftWords, wordCount+BitsToWords(shiftBits), 02944 shiftBits); 02945 return *this; 02946 } 02947 02948 Integer& Integer::operator>>=(unsigned int n) 02949 { 02950 const unsigned int wordCount = WordCount(); 02951 const unsigned int shiftWords = n / WORD_BITS; 02952 const unsigned int shiftBits = n % WORD_BITS; 02953 02954 ShiftWordsRightByWords(reg_.get_buffer(), wordCount, shiftWords); 02955 if (wordCount > shiftWords) 02956 ShiftWordsRightByBits(reg_.get_buffer(), wordCount-shiftWords, 02957 shiftBits); 02958 if (IsNegative() && WordCount()==0) // avoid -0 02959 *this = Zero(); 02960 return *this; 02961 } 02962 02963 02964 void PositiveAdd(Integer& sum, const Integer& a, const Integer& b) 02965 { 02966 word carry; 02967 if (a.reg_.size() == b.reg_.size()) 02968 carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(), 02969 b.reg_.get_buffer(), a.reg_.size()); 02970 else if (a.reg_.size() > b.reg_.size()) 02971 { 02972 carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(), 02973 b.reg_.get_buffer(), b.reg_.size()); 02974 CopyWords(sum.reg_+b.reg_.size(), a.reg_+b.reg_.size(), 02975 a.reg_.size()-b.reg_.size()); 02976 carry = Increment(sum.reg_+b.reg_.size(), a.reg_.size()-b.reg_.size(), 02977 carry); 02978 } 02979 else 02980 { 02981 carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(), 02982 b.reg_.get_buffer(), a.reg_.size()); 02983 CopyWords(sum.reg_+a.reg_.size(), b.reg_+a.reg_.size(), 02984 b.reg_.size()-a.reg_.size()); 02985 carry = Increment(sum.reg_+a.reg_.size(), b.reg_.size()-a.reg_.size(), 02986 carry); 02987 } 02988 02989 if (carry) 02990 { 02991 sum.reg_.CleanGrow(2*sum.reg_.size()); 02992 sum.reg_[sum.reg_.size()/2] = 1; 02993 } 02994 sum.sign_ = Integer::POSITIVE; 02995 } 02996 02997 void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b) 02998 { 02999 unsigned aSize = a.WordCount(); 03000 aSize += aSize%2; 03001 unsigned bSize = b.WordCount(); 03002 bSize += bSize%2; 03003 03004 if (aSize == bSize) 03005 { 03006 if (Compare(a.reg_.get_buffer(), b.reg_.get_buffer(), aSize) >= 0) 03007 { 03008 Subtract(diff.reg_.get_buffer(), a.reg_.get_buffer(), 03009 b.reg_.get_buffer(), aSize); 03010 diff.sign_ = Integer::POSITIVE; 03011 } 03012 else 03013 { 03014 Subtract(diff.reg_.get_buffer(), b.reg_.get_buffer(), 03015 a.reg_.get_buffer(), aSize); 03016 diff.sign_ = Integer::NEGATIVE; 03017 } 03018 } 03019 else if (aSize > bSize) 03020 { 03021 word borrow = Subtract(diff.reg_.get_buffer(), a.reg_.get_buffer(), 03022 b.reg_.get_buffer(), bSize); 03023 CopyWords(diff.reg_+bSize, a.reg_+bSize, aSize-bSize); 03024 borrow = Decrement(diff.reg_+bSize, aSize-bSize, borrow); 03025 assert(!borrow); 03026 diff.sign_ = Integer::POSITIVE; 03027 } 03028 else 03029 { 03030 word borrow = Subtract(diff.reg_.get_buffer(), b.reg_.get_buffer(), 03031 a.reg_.get_buffer(), aSize); 03032 CopyWords(diff.reg_+aSize, b.reg_+aSize, bSize-aSize); 03033 borrow = Decrement(diff.reg_+aSize, bSize-aSize, borrow); 03034 assert(!borrow); 03035 diff.sign_ = Integer::NEGATIVE; 03036 } 03037 } 03038 03039 03040 unsigned int Integer::MinEncodedSize(Signedness signedness) const 03041 { 03042 unsigned int outputLen = max(1U, ByteCount()); 03043 if (signedness == UNSIGNED) 03044 return outputLen; 03045 if (NotNegative() && (GetByte(outputLen-1) & 0x80)) 03046 outputLen++; 03047 if (IsNegative() && *this < -Power2(outputLen*8-1)) 03048 outputLen++; 03049 return outputLen; 03050 } 03051 03052 03053 int Integer::Compare(const Integer& t) const 03054 { 03055 if (NotNegative()) 03056 { 03057 if (t.NotNegative()) 03058 return PositiveCompare(t); 03059 else 03060 return 1; 03061 } 03062 else 03063 { 03064 if (t.NotNegative()) 03065 return -1; 03066 else 03067 return -PositiveCompare(t); 03068 } 03069 } 03070 03071 03072 int Integer::PositiveCompare(const Integer& t) const 03073 { 03074 unsigned size = WordCount(), tSize = t.WordCount(); 03075 03076 if (size == tSize) 03077 return TaoCrypt::Compare(reg_.get_buffer(), t.reg_.get_buffer(), size); 03078 else 03079 return size > tSize ? 1 : -1; 03080 } 03081 03082 03083 bool Integer::GetBit(unsigned int n) const 03084 { 03085 if (n/WORD_BITS >= reg_.size()) 03086 return 0; 03087 else 03088 return bool((reg_[n/WORD_BITS] >> (n % WORD_BITS)) & 1); 03089 } 03090 03091 03092 unsigned long Integer::GetBits(unsigned int i, unsigned int n) const 03093 { 03094 assert(n <= sizeof(unsigned long)*8); 03095 unsigned long v = 0; 03096 for (unsigned int j=0; j<n; j++) 03097 v |= GetBit(i+j) << j; 03098 return v; 03099 } 03100 03101 03102 byte Integer::GetByte(unsigned int n) const 03103 { 03104 if (n/WORD_SIZE >= reg_.size()) 03105 return 0; 03106 else 03107 return byte(reg_[n/WORD_SIZE] >> ((n%WORD_SIZE)*8)); 03108 } 03109 03110 03111 unsigned int Integer::BitCount() const 03112 { 03113 unsigned wordCount = WordCount(); 03114 if (wordCount) 03115 return (wordCount-1)*WORD_BITS + BitPrecision(reg_[wordCount-1]); 03116 else 03117 return 0; 03118 } 03119 03120 03121 unsigned int Integer::ByteCount() const 03122 { 03123 unsigned wordCount = WordCount(); 03124 if (wordCount) 03125 return (wordCount-1)*WORD_SIZE + BytePrecision(reg_[wordCount-1]); 03126 else 03127 return 0; 03128 } 03129 03130 03131 unsigned int Integer::WordCount() const 03132 { 03133 return CountWords(reg_.get_buffer(), reg_.size()); 03134 } 03135 03136 03137 bool Integer::IsConvertableToLong() const 03138 { 03139 if (ByteCount() > sizeof(long)) 03140 return false; 03141 03142 unsigned long value = reg_[0]; 03143 value += SafeLeftShift<WORD_BITS, unsigned long>(reg_[1]); 03144 03145 if (sign_ == POSITIVE) 03146 return (signed long)value >= 0; 03147 else 03148 return -(signed long)value < 0; 03149 } 03150 03151 03152 signed long Integer::ConvertToLong() const 03153 { 03154 assert(IsConvertableToLong()); 03155 03156 unsigned long value = reg_[0]; 03157 value += SafeLeftShift<WORD_BITS, unsigned long>(reg_[1]); 03158 return sign_ == POSITIVE ? value : -(signed long)value; 03159 } 03160 03161 03162 void Integer::Swap(Integer& a) 03163 { 03164 reg_.Swap(a.reg_); 03165 mySTL::swap(sign_, a.sign_); 03166 } 03167 03168 03169 Integer Integer::Plus(const Integer& b) const 03170 { 03171 Integer sum((word)0, max(reg_.size(), b.reg_.size())); 03172 if (NotNegative()) 03173 { 03174 if (b.NotNegative()) 03175 PositiveAdd(sum, *this, b); 03176 else 03177 PositiveSubtract(sum, *this, b); 03178 } 03179 else 03180 { 03181 if (b.NotNegative()) 03182 PositiveSubtract(sum, b, *this); 03183 else 03184 { 03185 PositiveAdd(sum, *this, b); 03186 sum.sign_ = Integer::NEGATIVE; 03187 } 03188 } 03189 return sum; 03190 } 03191 03192 03193 Integer Integer::Minus(const Integer& b) const 03194 { 03195 Integer diff((word)0, max(reg_.size(), b.reg_.size())); 03196 if (NotNegative()) 03197 { 03198 if (b.NotNegative()) 03199 PositiveSubtract(diff, *this, b); 03200 else 03201 PositiveAdd(diff, *this, b); 03202 } 03203 else 03204 { 03205 if (b.NotNegative()) 03206 { 03207 PositiveAdd(diff, *this, b); 03208 diff.sign_ = Integer::NEGATIVE; 03209 } 03210 else 03211 PositiveSubtract(diff, b, *this); 03212 } 03213 return diff; 03214 } 03215 03216 03217 Integer Integer::Times(const Integer &b) const 03218 { 03219 Integer product; 03220 Multiply(product, *this, b); 03221 return product; 03222 } 03223 03224 03225 #undef A0 03226 #undef A1 03227 #undef B0 03228 #undef B1 03229 03230 #undef T0 03231 #undef T1 03232 #undef T2 03233 #undef T3 03234 03235 #undef R0 03236 #undef R1 03237 #undef R2 03238 #undef R3 03239 03240 03241 static inline void AtomicDivide(word *Q, const word *A, const word *B) 03242 { 03243 word T[4]; 03244 DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]), 03245 DWord(A[2], A[3]), DWord(B[0], B[1])); 03246 Q[0] = q.GetLowHalf(); 03247 Q[1] = q.GetHighHalf(); 03248 03249 #ifndef NDEBUG 03250 if (B[0] || B[1]) 03251 { 03252 // multiply quotient and divisor and add remainder, make sure it 03253 // equals dividend 03254 assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0]))); 03255 word P[4]; 03256 Portable::Multiply2(P, Q, B); 03257 Add(P, P, T, 4); 03258 assert(memcmp(P, A, 4*WORD_SIZE)==0); 03259 } 03260 #endif 03261 } 03262 03263 03264 // for use by Divide(), corrects the underestimated quotient {Q1,Q0} 03265 static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, 03266 unsigned int N) 03267 { 03268 assert(N && N%2==0); 03269 03270 if (Q[1]) 03271 { 03272 T[N] = T[N+1] = 0; 03273 unsigned i; 03274 for (i=0; i<N; i+=4) 03275 LowLevel::Multiply2(T+i, Q, B+i); 03276 for (i=2; i<N; i+=4) 03277 if (LowLevel::Multiply2Add(T+i, Q, B+i)) 03278 T[i+5] += (++T[i+4]==0); 03279 } 03280 else 03281 { 03282 T[N] = LinearMultiply(T, B, Q[0], N); 03283 T[N+1] = 0; 03284 } 03285 03286 word borrow = Subtract(R, R, T, N+2); 03287 assert(!borrow && !R[N+1]); 03288 03289 while (R[N] || Compare(R, B, N) >= 0) 03290 { 03291 R[N] -= Subtract(R, R, B, N); 03292 Q[1] += (++Q[0]==0); 03293 assert(Q[0] || Q[1]); // no overflow 03294 } 03295 } 03296 03297 // R[NB] -------- remainder = A%B 03298 // Q[NA-NB+2] --- quotient = A/B 03299 // T[NA+2*NB+4] - temp work space 03300 // A[NA] -------- dividend 03301 // B[NB] -------- divisor 03302 03303 03304 void Divide(word* R, word* Q, word* T, const word* A, unsigned int NA, 03305 const word* B, unsigned int NB) 03306 { 03307 assert(NA && NB && NA%2==0 && NB%2==0); 03308 assert(B[NB-1] || B[NB-2]); 03309 assert(NB <= NA); 03310 03311 // set up temporary work space 03312 word *const TA=T; 03313 word *const TB=T+NA+2; 03314 word *const TP=T+NA+2+NB; 03315 03316 // copy B into TB and normalize it so that TB has highest bit set to 1 03317 unsigned shiftWords = (B[NB-1]==0); 03318 TB[0] = TB[NB-1] = 0; 03319 CopyWords(TB+shiftWords, B, NB-shiftWords); 03320 unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]); 03321 assert(shiftBits < WORD_BITS); 03322 ShiftWordsLeftByBits(TB, NB, shiftBits); 03323 03324 // copy A into TA and normalize it 03325 TA[0] = TA[NA] = TA[NA+1] = 0; 03326 CopyWords(TA+shiftWords, A, NA); 03327 ShiftWordsLeftByBits(TA, NA+2, shiftBits); 03328 03329 if (TA[NA+1]==0 && TA[NA] <= 1) 03330 { 03331 Q[NA-NB+1] = Q[NA-NB] = 0; 03332 while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0) 03333 { 03334 TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB); 03335 ++Q[NA-NB]; 03336 } 03337 } 03338 else 03339 { 03340 NA+=2; 03341 assert(Compare(TA+NA-NB, TB, NB) < 0); 03342 } 03343 03344 word BT[2]; 03345 BT[0] = TB[NB-2] + 1; 03346 BT[1] = TB[NB-1] + (BT[0]==0); 03347 03348 // start reducing TA mod TB, 2 words at a time 03349 for (unsigned i=NA-2; i>=NB; i-=2) 03350 { 03351 AtomicDivide(Q+i-NB, TA+i-2, BT); 03352 CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB); 03353 } 03354 03355 // copy TA into R, and denormalize it 03356 CopyWords(R, TA+shiftWords, NB); 03357 ShiftWordsRightByBits(R, NB, shiftBits); 03358 } 03359 03360 03361 void PositiveDivide(Integer& remainder, Integer& quotient, 03362 const Integer& a, const Integer& b) 03363 { 03364 unsigned aSize = a.WordCount(); 03365 unsigned bSize = b.WordCount(); 03366 03367 assert(bSize); 03368 03369 if (a.PositiveCompare(b) == -1) 03370 { 03371 remainder = a; 03372 remainder.sign_ = Integer::POSITIVE; 03373 quotient = Integer::Zero(); 03374 return; 03375 } 03376 03377 aSize += aSize%2; // round up to next even number 03378 bSize += bSize%2; 03379 03380 remainder.reg_.CleanNew(RoundupSize(bSize)); 03381 remainder.sign_ = Integer::POSITIVE; 03382 quotient.reg_.CleanNew(RoundupSize(aSize-bSize+2)); 03383 quotient.sign_ = Integer::POSITIVE; 03384 03385 AlignedWordBlock T(aSize+2*bSize+4); 03386 Divide(remainder.reg_.get_buffer(), quotient.reg_.get_buffer(), 03387 T.get_buffer(), a.reg_.get_buffer(), aSize, b.reg_.get_buffer(), 03388 bSize); 03389 } 03390 03391 void Integer::Divide(Integer &remainder, Integer "ient, 03392 const Integer ÷nd, const Integer &divisor) 03393 { 03394 PositiveDivide(remainder, quotient, dividend, divisor); 03395 03396 if (dividend.IsNegative()) 03397 { 03398 quotient.Negate(); 03399 if (remainder.NotZero()) 03400 { 03401 --quotient; 03402 remainder = divisor.AbsoluteValue() - remainder; 03403 } 03404 } 03405 03406 if (divisor.IsNegative()) 03407 quotient.Negate(); 03408 } 03409 03410 void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a, 03411 unsigned int n) 03412 { 03413 q = a; 03414 q >>= n; 03415 03416 const unsigned int wordCount = BitsToWords(n); 03417 if (wordCount <= a.WordCount()) 03418 { 03419 r.reg_.resize(RoundupSize(wordCount)); 03420 CopyWords(r.reg_.get_buffer(), a.reg_.get_buffer(), wordCount); 03421 SetWords(r.reg_+wordCount, 0, r.reg_.size()-wordCount); 03422 if (n % WORD_BITS != 0) 03423 r.reg_[wordCount-1] %= (1 << (n % WORD_BITS)); 03424 } 03425 else 03426 { 03427 r.reg_.resize(RoundupSize(a.WordCount())); 03428 CopyWords(r.reg_.get_buffer(), a.reg_.get_buffer(), r.reg_.size()); 03429 } 03430 r.sign_ = POSITIVE; 03431 03432 if (a.IsNegative() && r.NotZero()) 03433 { 03434 --q; 03435 r = Power2(n) - r; 03436 } 03437 } 03438 03439 Integer Integer::DividedBy(const Integer &b) const 03440 { 03441 Integer remainder, quotient; 03442 Integer::Divide(remainder, quotient, *this, b); 03443 return quotient; 03444 } 03445 03446 Integer Integer::Modulo(const Integer &b) const 03447 { 03448 Integer remainder, quotient; 03449 Integer::Divide(remainder, quotient, *this, b); 03450 return remainder; 03451 } 03452 03453 void Integer::Divide(word &remainder, Integer "ient, 03454 const Integer ÷nd, word divisor) 03455 { 03456 assert(divisor); 03457 03458 if ((divisor & (divisor-1)) == 0) // divisor is a power of 2 03459 { 03460 quotient = dividend >> (BitPrecision(divisor)-1); 03461 remainder = dividend.reg_[0] & (divisor-1); 03462 return; 03463 } 03464 03465 unsigned int i = dividend.WordCount(); 03466 quotient.reg_.CleanNew(RoundupSize(i)); 03467 remainder = 0; 03468 while (i--) 03469 { 03470 quotient.reg_[i] = DWord(dividend.reg_[i], remainder) / divisor; 03471 remainder = DWord(dividend.reg_[i], remainder) % divisor; 03472 } 03473 03474 if (dividend.NotNegative()) 03475 quotient.sign_ = POSITIVE; 03476 else 03477 { 03478 quotient.sign_ = NEGATIVE; 03479 if (remainder) 03480 { 03481 --quotient; 03482 remainder = divisor - remainder; 03483 } 03484 } 03485 } 03486 03487 Integer Integer::DividedBy(word b) const 03488 { 03489 word remainder; 03490 Integer quotient; 03491 Integer::Divide(remainder, quotient, *this, b); 03492 return quotient; 03493 } 03494 03495 word Integer::Modulo(word divisor) const 03496 { 03497 assert(divisor); 03498 03499 word remainder; 03500 03501 if ((divisor & (divisor-1)) == 0) // divisor is a power of 2 03502 remainder = reg_[0] & (divisor-1); 03503 else 03504 { 03505 unsigned int i = WordCount(); 03506 03507 if (divisor <= 5) 03508 { 03509 DWord sum(0, 0); 03510 while (i--) 03511 sum += reg_[i]; 03512 remainder = sum % divisor; 03513 } 03514 else 03515 { 03516 remainder = 0; 03517 while (i--) 03518 remainder = DWord(reg_[i], remainder) % divisor; 03519 } 03520 } 03521 03522 if (IsNegative() && remainder) 03523 remainder = divisor - remainder; 03524 03525 return remainder; 03526 } 03527 03528 03529 Integer Integer::AbsoluteValue() const 03530 { 03531 Integer result(*this); 03532 result.sign_ = POSITIVE; 03533 return result; 03534 } 03535 03536 03537 Integer Integer::SquareRoot() const 03538 { 03539 if (!IsPositive()) 03540 return Zero(); 03541 03542 // overestimate square root 03543 Integer x, y = Power2((BitCount()+1)/2); 03544 assert(y*y >= *this); 03545 03546 do 03547 { 03548 x = y; 03549 y = (x + *this/x) >> 1; 03550 } while (y<x); 03551 03552 return x; 03553 } 03554 03555 bool Integer::IsSquare() const 03556 { 03557 Integer r = SquareRoot(); 03558 return *this == r.Squared(); 03559 } 03560 03561 bool Integer::IsUnit() const 03562 { 03563 return (WordCount() == 1) && (reg_[0] == 1); 03564 } 03565 03566 Integer Integer::MultiplicativeInverse() const 03567 { 03568 return IsUnit() ? *this : Zero(); 03569 } 03570 03571 Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m) 03572 { 03573 return x*y%m; 03574 } 03575 03576 Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m) 03577 { 03578 ModularArithmetic mr(m); 03579 return mr.Exponentiate(x, e); 03580 } 03581 03582 Integer Integer::Gcd(const Integer &a, const Integer &b) 03583 { 03584 return EuclideanDomainOf().Gcd(a, b); 03585 } 03586 03587 Integer Integer::InverseMod(const Integer &m) const 03588 { 03589 assert(m.NotNegative()); 03590 03591 if (IsNegative() || *this>=m) 03592 return (*this%m).InverseMod(m); 03593 03594 if (m.IsEven()) 03595 { 03596 if (!m || IsEven()) 03597 return Zero(); // no inverse 03598 if (*this == One()) 03599 return One(); 03600 03601 Integer u = m.InverseMod(*this); 03602 return !u ? Zero() : (m*(*this-u)+1)/(*this); 03603 } 03604 03605 AlignedWordBlock T(m.reg_.size() * 4); 03606 Integer r((word)0, m.reg_.size()); 03607 unsigned k = AlmostInverse(r.reg_.get_buffer(), T.get_buffer(), 03608 reg_.get_buffer(), reg_.size(), 03609 m.reg_.get_buffer(), m.reg_.size()); 03610 DivideByPower2Mod(r.reg_.get_buffer(), r.reg_.get_buffer(), k, 03611 m.reg_.get_buffer(), m.reg_.size()); 03612 return r; 03613 } 03614 03615 word Integer::InverseMod(const word mod) const 03616 { 03617 word g0 = mod, g1 = *this % mod; 03618 word v0 = 0, v1 = 1; 03619 word y; 03620 03621 while (g1) 03622 { 03623 if (g1 == 1) 03624 return v1; 03625 y = g0 / g1; 03626 g0 = g0 % g1; 03627 v0 += y * v1; 03628 03629 if (!g0) 03630 break; 03631 if (g0 == 1) 03632 return mod-v0; 03633 y = g1 / g0; 03634 g1 = g1 % g0; 03635 v1 += y * v0; 03636 } 03637 return 0; 03638 } 03639 03640 // ********* ModArith stuff 03641 03642 const Integer& ModularArithmetic::Half(const Integer &a) const 03643 { 03644 if (a.reg_.size()==modulus.reg_.size()) 03645 { 03646 TaoCrypt::DivideByPower2Mod(result.reg_.begin(), a.reg_.begin(), 1, 03647 modulus.reg_.begin(), a.reg_.size()); 03648 return result; 03649 } 03650 else 03651 return result1 = (a.IsEven() ? (a >> 1) : ((a+modulus) >> 1)); 03652 } 03653 03654 const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const 03655 { 03656 if (a.reg_.size()==modulus.reg_.size() && 03657 b.reg_.size()==modulus.reg_.size()) 03658 { 03659 if (TaoCrypt::Add(result.reg_.begin(), a.reg_.begin(), b.reg_.begin(), 03660 a.reg_.size()) 03661 || Compare(result.reg_.get_buffer(), modulus.reg_.get_buffer(), 03662 a.reg_.size()) >= 0) 03663 { 03664 TaoCrypt::Subtract(result.reg_.begin(), result.reg_.begin(), 03665 modulus.reg_.begin(), a.reg_.size()); 03666 } 03667 return result; 03668 } 03669 else 03670 { 03671 result1 = a+b; 03672 if (result1 >= modulus) 03673 result1 -= modulus; 03674 return result1; 03675 } 03676 } 03677 03678 Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const 03679 { 03680 if (a.reg_.size()==modulus.reg_.size() && 03681 b.reg_.size()==modulus.reg_.size()) 03682 { 03683 if (TaoCrypt::Add(a.reg_.get_buffer(), a.reg_.get_buffer(), 03684 b.reg_.get_buffer(), a.reg_.size()) 03685 || Compare(a.reg_.get_buffer(), modulus.reg_.get_buffer(), 03686 a.reg_.size()) >= 0) 03687 { 03688 TaoCrypt::Subtract(a.reg_.get_buffer(), a.reg_.get_buffer(), 03689 modulus.reg_.get_buffer(), a.reg_.size()); 03690 } 03691 } 03692 else 03693 { 03694 a+=b; 03695 if (a>=modulus) 03696 a-=modulus; 03697 } 03698 03699 return a; 03700 } 03701 03702 const Integer& ModularArithmetic::Subtract(const Integer &a, 03703 const Integer &b) const 03704 { 03705 if (a.reg_.size()==modulus.reg_.size() && 03706 b.reg_.size()==modulus.reg_.size()) 03707 { 03708 if (TaoCrypt::Subtract(result.reg_.begin(), a.reg_.begin(), 03709 b.reg_.begin(), a.reg_.size())) 03710 TaoCrypt::Add(result.reg_.begin(), result.reg_.begin(), 03711 modulus.reg_.begin(), a.reg_.size()); 03712 return result; 03713 } 03714 else 03715 { 03716 result1 = a-b; 03717 if (result1.IsNegative()) 03718 result1 += modulus; 03719 return result1; 03720 } 03721 } 03722 03723 Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const 03724 { 03725 if (a.reg_.size()==modulus.reg_.size() && 03726 b.reg_.size()==modulus.reg_.size()) 03727 { 03728 if (TaoCrypt::Subtract(a.reg_.get_buffer(), a.reg_.get_buffer(), 03729 b.reg_.get_buffer(), a.reg_.size())) 03730 TaoCrypt::Add(a.reg_.get_buffer(), a.reg_.get_buffer(), 03731 modulus.reg_.get_buffer(), a.reg_.size()); 03732 } 03733 else 03734 { 03735 a-=b; 03736 if (a.IsNegative()) 03737 a+=modulus; 03738 } 03739 03740 return a; 03741 } 03742 03743 const Integer& ModularArithmetic::Inverse(const Integer &a) const 03744 { 03745 if (!a) 03746 return a; 03747 03748 CopyWords(result.reg_.begin(), modulus.reg_.begin(), modulus.reg_.size()); 03749 if (TaoCrypt::Subtract(result.reg_.begin(), result.reg_.begin(), 03750 a.reg_.begin(), a.reg_.size())) 03751 Decrement(result.reg_.begin()+a.reg_.size(), 1, 03752 modulus.reg_.size()-a.reg_.size()); 03753 03754 return result; 03755 } 03756 03757 Integer ModularArithmetic::CascadeExponentiate(const Integer &x, 03758 const Integer &e1, const Integer &y, const Integer &e2) const 03759 { 03760 if (modulus.IsOdd()) 03761 { 03762 MontgomeryRepresentation dr(modulus); 03763 return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1, 03764 dr.ConvertIn(y), e2)); 03765 } 03766 else 03767 return AbstractRing::CascadeExponentiate(x, e1, y, e2); 03768 } 03769 03770 void ModularArithmetic::SimultaneousExponentiate(Integer *results, 03771 const Integer &base, const Integer *exponents, 03772 unsigned int exponentsCount) const 03773 { 03774 if (modulus.IsOdd()) 03775 { 03776 MontgomeryRepresentation dr(modulus); 03777 dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents, 03778 exponentsCount); 03779 for (unsigned int i=0; i<exponentsCount; i++) 03780 results[i] = dr.ConvertOut(results[i]); 03781 } 03782 else 03783 AbstractRing::SimultaneousExponentiate(results, base, 03784 exponents, exponentsCount); 03785 } 03786 03787 03788 // ******************************************************** 03789 03790 #define A0 A 03791 #define A1 (A+N2) 03792 #define B0 B 03793 #define B1 (B+N2) 03794 03795 #define T0 T 03796 #define T1 (T+N2) 03797 #define T2 (T+N) 03798 #define T3 (T+N+N2) 03799 03800 #define R0 R 03801 #define R1 (R+N2) 03802 #define R2 (R+N) 03803 #define R3 (R+N+N2) 03804 03805 03806 inline void MultiplyBottom(word *R, word *T, const word *A, const word *B, 03807 unsigned int N) 03808 { 03809 RecursiveMultiplyBottom(R, T, A, B, N); 03810 } 03811 03812 inline void MultiplyTop(word *R, word *T, const word *L, const word *A, 03813 const word *B, unsigned int N) 03814 { 03815 RecursiveMultiplyTop(R, T, L, A, B, N); 03816 } 03817 03818 03819 // R[N] --- result = X/(2**(WORD_BITS*N)) mod M 03820 // T[3*N] - temporary work space 03821 // X[2*N] - number to be reduced 03822 // M[N] --- modulus 03823 // U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N) 03824 03825 void MontgomeryReduce(word *R, word *T, const word *X, const word *M, 03826 const word *U, unsigned int N) 03827 { 03828 MultiplyBottom(R, T, X, U, N); 03829 MultiplyTop(T, T+N, X, R, M, N); 03830 word borrow = Subtract(T, X+N, T, N); 03831 // defend against timing attack by doing this Add even when not needed 03832 word carry = Add(T+N, T, M, N); 03833 assert(carry || !borrow); 03834 CopyWords(R, T + (borrow ? N : 0), N); 03835 } 03836 03837 // R[N] ----- result = A inverse mod 2**(WORD_BITS*N) 03838 // T[3*N/2] - temporary work space 03839 // A[N] ----- an odd number as input 03840 03841 void RecursiveInverseModPower2(word *R, word *T, const word *A, unsigned int N) 03842 { 03843 if (N==2) 03844 { 03845 T[0] = AtomicInverseModPower2(A[0]); 03846 T[1] = 0; 03847 LowLevel::Multiply2Bottom(T+2, T, A); 03848 TwosComplement(T+2, 2); 03849 Increment(T+2, 2, 2); 03850 LowLevel::Multiply2Bottom(R, T, T+2); 03851 } 03852 else 03853 { 03854 const unsigned int N2 = N/2; 03855 RecursiveInverseModPower2(R0, T0, A0, N2); 03856 T0[0] = 1; 03857 SetWords(T0+1, 0, N2-1); 03858 MultiplyTop(R1, T1, T0, R0, A0, N2); 03859 MultiplyBottom(T0, T1, R0, A1, N2); 03860 Add(T0, R1, T0, N2); 03861 TwosComplement(T0, N2); 03862 MultiplyBottom(R1, T1, R0, T0, N2); 03863 } 03864 } 03865 03866 03867 #undef A0 03868 #undef A1 03869 #undef B0 03870 #undef B1 03871 03872 #undef T0 03873 #undef T1 03874 #undef T2 03875 #undef T3 03876 03877 #undef R0 03878 #undef R1 03879 #undef R2 03880 #undef R3 03881 03882 03883 // modulus must be odd 03884 MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m) 03885 : ModularArithmetic(m), 03886 u((word)0, modulus.reg_.size()), 03887 workspace(5*modulus.reg_.size()) 03888 { 03889 assert(modulus.IsOdd()); 03890 RecursiveInverseModPower2(u.reg_.get_buffer(), workspace.get_buffer(), 03891 modulus.reg_.get_buffer(), modulus.reg_.size()); 03892 } 03893 03894 const Integer& MontgomeryRepresentation::Multiply(const Integer &a, 03895 const Integer &b) const 03896 { 03897 word *const T = workspace.begin(); 03898 word *const R = result.reg_.begin(); 03899 const unsigned int N = modulus.reg_.size(); 03900 assert(a.reg_.size()<=N && b.reg_.size()<=N); 03901 03902 AsymmetricMultiply(T, T+2*N, a.reg_.get_buffer(), a.reg_.size(), 03903 b.reg_.get_buffer(), b.reg_.size()); 03904 SetWords(T+a.reg_.size()+b.reg_.size(),0, 2*N-a.reg_.size()-b.reg_.size()); 03905 MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(), 03906 u.reg_.get_buffer(), N); 03907 return result; 03908 } 03909 03910 const Integer& MontgomeryRepresentation::Square(const Integer &a) const 03911 { 03912 word *const T = workspace.begin(); 03913 word *const R = result.reg_.begin(); 03914 const unsigned int N = modulus.reg_.size(); 03915 assert(a.reg_.size()<=N); 03916 03917 TaoCrypt::Square(T, T+2*N, a.reg_.get_buffer(), a.reg_.size()); 03918 SetWords(T+2*a.reg_.size(), 0, 2*N-2*a.reg_.size()); 03919 MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(), 03920 u.reg_.get_buffer(), N); 03921 return result; 03922 } 03923 03924 Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const 03925 { 03926 word *const T = workspace.begin(); 03927 word *const R = result.reg_.begin(); 03928 const unsigned int N = modulus.reg_.size(); 03929 assert(a.reg_.size()<=N); 03930 03931 CopyWords(T, a.reg_.get_buffer(), a.reg_.size()); 03932 SetWords(T+a.reg_.size(), 0, 2*N-a.reg_.size()); 03933 MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(), 03934 u.reg_.get_buffer(), N); 03935 return result; 03936 } 03937 03938 const Integer& MontgomeryRepresentation::MultiplicativeInverse( 03939 const Integer &a) const 03940 { 03941 // return (EuclideanMultiplicativeInverse(a, modulus)<< 03942 // (2*WORD_BITS*modulus.reg_.size()))%modulus; 03943 word *const T = workspace.begin(); 03944 word *const R = result.reg_.begin(); 03945 const unsigned int N = modulus.reg_.size(); 03946 assert(a.reg_.size()<=N); 03947 03948 CopyWords(T, a.reg_.get_buffer(), a.reg_.size()); 03949 SetWords(T+a.reg_.size(), 0, 2*N-a.reg_.size()); 03950 MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(), 03951 u.reg_.get_buffer(), N); 03952 unsigned k = AlmostInverse(R, T, R, N, modulus.reg_.get_buffer(), N); 03953 03954 // cout << "k=" << k << " N*32=" << 32*N << endl; 03955 03956 if (k>N*WORD_BITS) 03957 DivideByPower2Mod(R, R, k-N*WORD_BITS, modulus.reg_.get_buffer(), N); 03958 else 03959 MultiplyByPower2Mod(R, R, N*WORD_BITS-k, modulus.reg_.get_buffer(), N); 03960 03961 return result; 03962 } 03963 03964 03965 // mod Root stuff 03966 Integer ModularRoot(const Integer &a, const Integer &dp, const Integer &dq, 03967 const Integer &p, const Integer &q, const Integer &u) 03968 { 03969 Integer p2 = ModularExponentiation((a % p), dp, p); 03970 Integer q2 = ModularExponentiation((a % q), dq, q); 03971 return CRT(p2, p, q2, q, u); 03972 } 03973 03974 Integer CRT(const Integer &xp, const Integer &p, const Integer &xq, 03975 const Integer &q, const Integer &u) 03976 { 03977 // isn't operator overloading great? 03978 return p * (u * (xq-xp) % q) + xp; 03979 } 03980 03981 03982 #ifdef HAVE_EXPLICIT_TEMPLATE_INSTANTIATION 03983 #ifndef TAOCRYPT_NATIVE_DWORD_AVAILABLE 03984 template hword DivideThreeWordsByTwo<hword, Word>(hword*, hword, hword, Word*); 03985 #endif 03986 template word DivideThreeWordsByTwo<word, DWord>(word*, word, word, DWord*); 03987 #endif 03988 03989 03990 } // namespace 03991