The world's most popular open source database

MySQL 5.1: mysql/src/5.1-dbg/extra/yassl/taocrypt/src/integer.cpp Source File
mysql/src/5.1-dbg/extra/yassl/taocrypt/src/integer.cpp

00001 /* integer.cpp                                
00002  *
00003  * Copyright (C) 2003 Sawtooth Consulting Ltd.
00004  *
00005  * This file is part of yaSSL.
00006  *
00007  * yaSSL is free software; you can redistribute it and/or modify
00008  * it under the terms of the GNU General Public License as published by
00009  * the Free Software Foundation; either version 2 of the License, or
00010  * (at your option) any later version.
00011  *
00012  * There are special exceptions to the terms and conditions of the GPL as it
00013  * is applied to yaSSL. View the full text of the exception in the file
00014  * FLOSS-EXCEPTIONS in the directory of this software distribution.
00015  *
00016  * yaSSL is distributed in the hope that it will be useful,
00017  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00018  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00019  * GNU General Public License for more details.
00020  *
00021  * You should have received a copy of the GNU General Public License
00022  * along with this program; if not, write to the Free Software
00023  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
00024  */
00025 
00026 
00027 
00028 /* based on Wei Dai's integer.cpp from CryptoPP */
00029 
00030 #include "runtime.hpp"
00031 #include "integer.hpp"
00032 #include "modarith.hpp"
00033 #include "asn.hpp"
00034 
00035 
00036 
00037 #ifdef __DECCXX
00038     #include <c_asm.h>  // for asm overflow assembly
00039 #endif
00040 
00041 
00042 // 64bit multiply overflow intrinsic
00043 #if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER) && \
00044    !defined(TAOCRYPT_NATIVE_DWORD_AVAILABLE)
00045     #ifdef __ia64__
00046         #define myUMULH __UMULH
00047     #elif  __x86_64__
00048         #define myUMULH __umulh
00049     #else
00050         #error unknown 64bit windows
00051     #endif
00052 
00053 extern "C" word myUMULH(word, word); 
00054 
00055 #pragma intrinsic (myUMULH)
00056 #endif
00057 
00058 
00059 #ifdef SSE2_INTRINSICS_AVAILABLE
00060     #ifdef __GNUC__
00061         #include <xmmintrin.h>
00062         #include <signal.h>
00063         #include <setjmp.h>
00064         #ifdef TAOCRYPT_MEMALIGN_AVAILABLE
00065             #include <malloc.h>
00066         #else
00067             #include <stdlib.h>
00068         #endif
00069     #else
00070         #include <emmintrin.h>
00071     #endif
00072 #elif defined(_MSC_VER) && defined(_M_IX86)
00073     #pragma message("You do not seem to have the Visual C++ Processor Pack ")
00074     #pragma message("installed, so use of SSE2 intrinsics will be disabled.")
00075 #elif defined(__GNUC__) && defined(__i386__)
00076 /*   #warning You do not have GCC 3.3 or later, or did not specify the -msse2 \
00077              compiler option. Use of SSE2 intrinsics will be disabled.
00078 */
00079 #endif
00080 
00081 
00082 namespace TaoCrypt {
00083 
00084 
00085 #ifdef SSE2_INTRINSICS_AVAILABLE
00086 
00087 template <class T>
00088 CPP_TYPENAME AllocatorBase<T>::pointer AlignedAllocator<T>::allocate(
00089                                            size_type n, const void *)
00090 {
00091     CheckSize(n);
00092     if (n == 0)
00093         return 0;
00094     if (n >= 4)
00095     {
00096         void* p;
00097     #ifdef TAOCRYPT_MM_MALLOC_AVAILABLE
00098         p = _mm_malloc(sizeof(T)*n, 16);
00099     #elif defined(TAOCRYPT_MEMALIGN_AVAILABLE)
00100         p = memalign(16, sizeof(T)*n);
00101     #elif defined(TAOCRYPT_MALLOC_ALIGNMENT_IS_16)
00102         p = malloc(sizeof(T)*n);
00103     #else
00104         p = (byte *)malloc(sizeof(T)*n + 8);
00105         // assume malloc alignment is at least 8
00106     #endif
00107 
00108     #ifdef TAOCRYPT_NO_ALIGNED_ALLOC
00109         assert(m_pBlock == 0);
00110         m_pBlock = p;
00111         if (!IsAlignedOn(p, 16))
00112         {
00113             assert(IsAlignedOn(p, 8));
00114             p = (byte *)p + 8;
00115         }
00116     #endif
00117 
00118         assert(IsAlignedOn(p, 16));
00119         return (T*)p;
00120     }
00121     return NEW_TC T[n];
00122 }
00123 
00124 
00125 template <class T>
00126 void AlignedAllocator<T>::deallocate(void* p, size_type n)
00127 {
00128     memset(p, 0, n*sizeof(T));
00129     if (n >= 4)
00130     {
00131         #ifdef TAOCRYPT_MM_MALLOC_AVAILABLE
00132             _mm_free(p);
00133         #elif defined(TAOCRYPT_NO_ALIGNED_ALLOC)
00134             assert(m_pBlock == p || (byte*)m_pBlock+8 == p);
00135             free(m_pBlock);
00136             m_pBlock = 0;
00137         #else
00138             free(p);
00139         #endif
00140     }
00141     else
00142         tcArrayDelete((T *)p);
00143 }
00144 
00145 #endif  // SSE2
00146 
00147 
00148 // ********  start of integer needs
00149 
00150 // start 5.2.1 adds DWord and Word ********
00151 
00152 // ********************************************************
00153 
00154 class DWord {
00155 public:
00156 DWord() {}
00157 
00158 #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
00159     explicit DWord(word low)
00160     {
00161         whole_ = low;
00162     }
00163 #else
00164     explicit DWord(word low)
00165     {
00166         halfs_.low = low;
00167         halfs_.high = 0;
00168     }
00169 #endif
00170 
00171     DWord(word low, word high)
00172     {
00173         halfs_.low = low;
00174         halfs_.high = high;
00175     }
00176 
00177     static DWord Multiply(word a, word b)
00178     {
00179         DWord r;
00180 
00181         #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
00182             r.whole_ = (dword)a * b;
00183 
00184         #elif defined(_MSC_VER)
00185             r.halfs_.low = a*b;
00186             r.halfs_.high = myUMULH(a,b);
00187 
00188         #elif defined(__alpha__)
00189             r.halfs_.low = a*b;
00190             #ifdef __GNUC__
00191                 __asm__("umulh %1,%2,%0" : "=r" (r.halfs_.high)
00192                     : "r" (a), "r" (b));
00193             #elif defined(__DECCXX)
00194                 r.halfs_.high = asm("umulh %a0, %a1, %v0", a, b);
00195             #else
00196                 #error unknown alpha compiler
00197             #endif
00198 
00199         #elif defined(__ia64__)
00200             r.halfs_.low = a*b;
00201             __asm__("xmpy.hu %0=%1,%2" : "=f" (r.halfs_.high)
00202                 : "f" (a), "f" (b));
00203 
00204         #elif defined(_ARCH_PPC64)
00205             r.halfs_.low = a*b;
00206             __asm__("mulhdu %0,%1,%2" : "=r" (r.halfs_.high)
00207                 : "r" (a), "r" (b) : "cc");
00208 
00209         #elif defined(__x86_64__)
00210             __asm__("mulq %3" : "=d" (r.halfs_.high), "=a" (r.halfs_.low) :
00211                 "a" (a), "rm" (b) : "cc");
00212 
00213         #elif defined(__mips64)
00214             __asm__("dmultu %2,%3" : "=h" (r.halfs_.high), "=l" (r.halfs_.low)
00215                 : "r" (a), "r" (b));
00216 
00217         #elif defined(_M_IX86)
00218             // for testing
00219             word64 t = (word64)a * b;
00220             r.halfs_.high = ((word32 *)(&t))[1];
00221             r.halfs_.low = (word32)t;
00222         #else
00223             #error can not implement DWord
00224         #endif
00225 
00226         return r;
00227     }
00228 
00229     static DWord MultiplyAndAdd(word a, word b, word c)
00230     {
00231         DWord r = Multiply(a, b);
00232         return r += c;
00233     }
00234 
00235     DWord & operator+=(word a)
00236     {
00237         #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
00238             whole_ = whole_ + a;
00239         #else
00240             halfs_.low += a;
00241             halfs_.high += (halfs_.low < a);
00242         #endif
00243         return *this;
00244     }
00245 
00246     DWord operator+(word a)
00247     {
00248         DWord r;
00249         #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
00250             r.whole_ = whole_ + a;
00251         #else
00252             r.halfs_.low = halfs_.low + a;
00253             r.halfs_.high = halfs_.high + (r.halfs_.low < a);
00254         #endif
00255         return r;
00256     }
00257 
00258     DWord operator-(DWord a)
00259     {
00260         DWord r;
00261         #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
00262             r.whole_ = whole_ - a.whole_;
00263         #else
00264             r.halfs_.low = halfs_.low - a.halfs_.low;
00265             r.halfs_.high = halfs_.high - a.halfs_.high -
00266                              (r.halfs_.low > halfs_.low);
00267         #endif
00268         return r;
00269     }
00270 
00271     DWord operator-(word a)
00272     {
00273         DWord r;
00274         #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
00275             r.whole_ = whole_ - a;
00276         #else
00277             r.halfs_.low = halfs_.low - a;
00278             r.halfs_.high = halfs_.high - (r.halfs_.low > halfs_.low);
00279         #endif
00280         return r;
00281     }
00282 
00283     // returns quotient, which must fit in a word
00284     word operator/(word divisor);
00285 
00286     word operator%(word a);
00287 
00288     bool operator!() const
00289     {
00290     #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
00291         return !whole_;
00292     #else
00293         return !halfs_.high && !halfs_.low;
00294     #endif
00295     }
00296 
00297     word GetLowHalf() const {return halfs_.low;}
00298     word GetHighHalf() const {return halfs_.high;}
00299     word GetHighHalfAsBorrow() const {return 0-halfs_.high;}
00300 
00301 private:
00302     union
00303     {
00304     #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
00305         dword whole_;
00306     #endif
00307         struct
00308         {
00309         #ifdef LITTLE_ENDIAN_ORDER
00310             word low;
00311             word high;
00312         #else
00313             word high;
00314             word low;
00315         #endif
00316         } halfs_;
00317     };
00318 };
00319 
00320 
00321 class Word {
00322 public:
00323     Word() {}
00324 
00325     Word(word value)
00326     {
00327         whole_ = value;
00328     }
00329 
00330     Word(hword low, hword high)
00331     {
00332         whole_ = low | (word(high) << (WORD_BITS/2));
00333     }
00334 
00335     static Word Multiply(hword a, hword b)
00336     {
00337         Word r;
00338         r.whole_ = (word)a * b;
00339         return r;
00340     }
00341 
00342     Word operator-(Word a)
00343     {
00344         Word r;
00345         r.whole_ = whole_ - a.whole_;
00346         return r;
00347     }
00348 
00349     Word operator-(hword a)
00350     {
00351         Word r;
00352         r.whole_ = whole_ - a;
00353         return r;
00354     }
00355 
00356     // returns quotient, which must fit in a word
00357     hword operator/(hword divisor)
00358     {
00359         return hword(whole_ / divisor);
00360     }
00361 
00362     bool operator!() const
00363     {
00364         return !whole_;
00365     }
00366 
00367     word GetWhole() const {return whole_;}
00368     hword GetLowHalf() const {return hword(whole_);}
00369     hword GetHighHalf() const {return hword(whole_>>(WORD_BITS/2));}
00370     hword GetHighHalfAsBorrow() const {return 0-hword(whole_>>(WORD_BITS/2));}
00371 
00372 private:
00373     word whole_;
00374 };
00375 
00376 
00377 // dummy is VC60 compiler bug workaround
00378 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A
00379 template <class S, class D>
00380 S DivideThreeWordsByTwo(S* A, S B0, S B1, D* dummy_VC6_WorkAround = 0)
00381 {
00382     // assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a S
00383     assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
00384 
00385     // estimate the quotient: do a 2 S by 1 S divide
00386     S Q;
00387     if (S(B1+1) == 0)
00388         Q = A[2];
00389     else
00390         Q = D(A[1], A[2]) / S(B1+1);
00391 
00392     // now subtract Q*B from A
00393     D p = D::Multiply(B0, Q);
00394     D u = (D) A[0] - p.GetLowHalf();
00395     A[0] = u.GetLowHalf();
00396     u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - 
00397             D::Multiply(B1, Q);
00398     A[1] = u.GetLowHalf();
00399     A[2] += u.GetHighHalf();
00400 
00401     // Q <= actual quotient, so fix it
00402     while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
00403     {
00404         u = (D) A[0] - B0;
00405         A[0] = u.GetLowHalf();
00406         u = (D) A[1] - B1 - u.GetHighHalfAsBorrow();
00407         A[1] = u.GetLowHalf();
00408         A[2] += u.GetHighHalf();
00409         Q++;
00410         assert(Q);      // shouldn't overflow
00411     }
00412 
00413     return Q;
00414 }
00415 
00416 
00417 // do a 4 word by 2 word divide, returns 2 word quotient in Q0 and Q1
00418 template <class S, class D>
00419 inline D DivideFourWordsByTwo(S *T, const D &Al, const D &Ah, const D &B)
00420 {
00421     if (!B) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
00422         return D(Ah.GetLowHalf(), Ah.GetHighHalf());
00423     else
00424     {
00425         S Q[2];
00426         T[0] = Al.GetLowHalf();
00427         T[1] = Al.GetHighHalf(); 
00428         T[2] = Ah.GetLowHalf();
00429         T[3] = Ah.GetHighHalf();
00430         Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(),
00431                                                 B.GetHighHalf());
00432         Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf());
00433         return D(Q[0], Q[1]);
00434     }
00435 }
00436 
00437 
00438 // returns quotient, which must fit in a word
00439 inline word DWord::operator/(word a)
00440 {
00441     #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
00442         return word(whole_ / a);
00443     #else
00444         hword r[4];
00445         return DivideFourWordsByTwo<hword, Word>(r, halfs_.low,
00446                                                     halfs_.high, a).GetWhole();
00447     #endif
00448 }
00449 
00450 inline word DWord::operator%(word a)
00451 {
00452     #ifdef TAOCRYPT_NATIVE_DWORD_AVAILABLE
00453         return word(whole_ % a);
00454     #else
00455         if (a < (word(1) << (WORD_BITS/2)))
00456         {
00457             hword h = hword(a);
00458             word r = halfs_.high % h;
00459             r = ((halfs_.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h;
00460             return hword((hword(halfs_.low) + (r << (WORD_BITS/2))) % h);
00461         }
00462         else
00463         {
00464             hword r[4];
00465             DivideFourWordsByTwo<hword, Word>(r, halfs_.low, halfs_.high, a);
00466             return Word(r[0], r[1]).GetWhole();
00467         }
00468     #endif
00469 }
00470 
00471 
00472 
00473 // end 5.2.1 DWord and Word adds
00474 
00475 
00476 
00477 
00478 
00479 static const unsigned int RoundupSizeTable[] = {2, 2, 2, 4, 4, 8, 8, 8, 8};
00480 
00481 static inline unsigned int RoundupSize(unsigned int n)
00482 {
00483     if (n<=8)
00484         return RoundupSizeTable[n];
00485     else if (n<=16)
00486         return 16;
00487     else if (n<=32)
00488         return 32;
00489     else if (n<=64)
00490         return 64;
00491     else return 1U << BitPrecision(n-1);
00492 }
00493 
00494 
00495 static int Compare(const word *A, const word *B, unsigned int N)
00496 {
00497     while (N--)
00498         if (A[N] > B[N])
00499             return 1;
00500         else if (A[N] < B[N])
00501             return -1;
00502 
00503     return 0;
00504 }
00505 
00506 static word Increment(word *A, unsigned int N, word B=1)
00507 {
00508     assert(N);
00509     word t = A[0];
00510     A[0] = t+B;
00511     if (A[0] >= t)
00512         return 0;
00513     for (unsigned i=1; i<N; i++)
00514         if (++A[i])
00515             return 0;
00516     return 1;
00517 }
00518 
00519 static word Decrement(word *A, unsigned int N, word B=1)
00520 {
00521     assert(N);
00522     word t = A[0];
00523     A[0] = t-B;
00524     if (A[0] <= t)
00525         return 0;
00526     for (unsigned i=1; i<N; i++)
00527         if (A[i]--)
00528             return 0;
00529     return 1;
00530 }
00531 
00532 static void TwosComplement(word *A, unsigned int N)
00533 {
00534     Decrement(A, N);
00535     for (unsigned i=0; i<N; i++)
00536         A[i] = ~A[i];
00537 }
00538 
00539 
00540 static word LinearMultiply(word *C, const word *A, word B, unsigned int N)
00541 {
00542     word carry=0;
00543     for(unsigned i=0; i<N; i++)
00544     {
00545         DWord p = DWord::MultiplyAndAdd(A[i], B, carry);
00546         C[i] = p.GetLowHalf();
00547         carry = p.GetHighHalf();
00548     }
00549     return carry;
00550 }
00551 
00552 
00553 static word AtomicInverseModPower2(word A)
00554 {
00555     assert(A%2==1);
00556 
00557     word R=A%8;
00558 
00559     for (unsigned i=3; i<WORD_BITS; i*=2)
00560         R = R*(2-R*A);
00561 
00562     assert(word(R*A)==1);
00563     return R;
00564 }
00565 
00566 
00567 // ********************************************************
00568 
00569 class Portable
00570 {
00571 public:
00572     static word Add(word *C, const word *A, const word *B, unsigned int N);
00573     static word Subtract(word *C, const word *A, const word*B, unsigned int N);
00574 
00575     static void Multiply2(word *C, const word *A, const word *B);
00576     static word Multiply2Add(word *C, const word *A, const word *B);
00577     static void Multiply4(word *C, const word *A, const word *B);
00578     static void Multiply8(word *C, const word *A, const word *B);
00579     static unsigned int MultiplyRecursionLimit() {return 8;}
00580 
00581     static void Multiply2Bottom(word *C, const word *A, const word *B);
00582     static void Multiply4Bottom(word *C, const word *A, const word *B);
00583     static void Multiply8Bottom(word *C, const word *A, const word *B);
00584     static unsigned int MultiplyBottomRecursionLimit() {return 8;}
00585 
00586     static void Square2(word *R, const word *A);
00587     static void Square4(word *R, const word *A);
00588     static void Square8(word *R, const word *A) {assert(false);}
00589     static unsigned int SquareRecursionLimit() {return 4;}
00590 };
00591 
00592 word Portable::Add(word *C, const word *A, const word *B, unsigned int N)
00593 {
00594     assert (N%2 == 0);
00595 
00596     DWord u(0, 0);
00597     for (unsigned int i = 0; i < N; i+=2)
00598     {
00599         u = DWord(A[i]) + B[i] + u.GetHighHalf();
00600         C[i] = u.GetLowHalf();
00601         u = DWord(A[i+1]) + B[i+1] + u.GetHighHalf();
00602         C[i+1] = u.GetLowHalf();
00603     }
00604     return u.GetHighHalf();
00605 }
00606 
00607 word Portable::Subtract(word *C, const word *A, const word *B, unsigned int N)
00608 {
00609     assert (N%2 == 0);
00610 
00611     DWord u(0, 0);
00612     for (unsigned int i = 0; i < N; i+=2)
00613     {
00614         u = (DWord) A[i] - B[i] - u.GetHighHalfAsBorrow();
00615         C[i] = u.GetLowHalf();
00616         u = (DWord) A[i+1] - B[i+1] - u.GetHighHalfAsBorrow();
00617         C[i+1] = u.GetLowHalf();
00618     }
00619     return 0-u.GetHighHalf();
00620 }
00621 
00622 void Portable::Multiply2(word *C, const word *A, const word *B)
00623 {
00624 /*
00625     word s;
00626     dword d;
00627 
00628     if (A1 >= A0)
00629         if (B0 >= B1)
00630         {
00631             s = 0;
00632             d = (dword)(A1-A0)*(B0-B1);
00633         }
00634         else
00635         {
00636             s = (A1-A0);
00637             d = (dword)s*(word)(B0-B1);
00638         }
00639     else
00640         if (B0 > B1)
00641         {
00642             s = (B0-B1);
00643             d = (word)(A1-A0)*(dword)s;
00644         }
00645         else
00646         {
00647             s = 0;
00648             d = (dword)(A0-A1)*(B1-B0);
00649         }
00650 */
00651     // this segment is the branchless equivalent of above
00652     word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
00653     unsigned int ai = A[1] < A[0];
00654     unsigned int bi = B[0] < B[1];
00655     unsigned int di = ai & bi;
00656     DWord d = DWord::Multiply(D[di], D[di+2]);
00657     D[1] = D[3] = 0;
00658     unsigned int si = ai + !bi;
00659     word s = D[si];
00660 
00661     DWord A0B0 = DWord::Multiply(A[0], B[0]);
00662     C[0] = A0B0.GetLowHalf();
00663 
00664     DWord A1B1 = DWord::Multiply(A[1], B[1]);
00665     DWord t = (DWord) A0B0.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf()
00666                        + A1B1.GetLowHalf();
00667     C[1] = t.GetLowHalf();
00668 
00669     t = A1B1 + t.GetHighHalf() + A0B0.GetHighHalf() + d.GetHighHalf()
00670              + A1B1.GetHighHalf() - s;
00671     C[2] = t.GetLowHalf();
00672     C[3] = t.GetHighHalf();
00673 }
00674 
00675 void Portable::Multiply2Bottom(word *C, const word *A, const word *B)
00676 {
00677     DWord t = DWord::Multiply(A[0], B[0]);
00678     C[0] = t.GetLowHalf();
00679     C[1] = t.GetHighHalf() + A[0]*B[1] + A[1]*B[0];
00680 }
00681 
00682 word Portable::Multiply2Add(word *C, const word *A, const word *B)
00683 {
00684     word D[4] = {A[1]-A[0], A[0]-A[1], B[0]-B[1], B[1]-B[0]};
00685     unsigned int ai = A[1] < A[0];
00686     unsigned int bi = B[0] < B[1];
00687     unsigned int di = ai & bi;
00688     DWord d = DWord::Multiply(D[di], D[di+2]);
00689     D[1] = D[3] = 0;
00690     unsigned int si = ai + !bi;
00691     word s = D[si];
00692 
00693     DWord A0B0 = DWord::Multiply(A[0], B[0]);
00694     DWord t = A0B0 + C[0];
00695     C[0] = t.GetLowHalf();
00696 
00697     DWord A1B1 = DWord::Multiply(A[1], B[1]);
00698     t = (DWord) t.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() +
00699         A1B1.GetLowHalf() + C[1];
00700     C[1] = t.GetLowHalf();
00701 
00702     t = (DWord) t.GetHighHalf() + A1B1.GetLowHalf() + A0B0.GetHighHalf() +
00703         d.GetHighHalf() + A1B1.GetHighHalf() - s + C[2];
00704     C[2] = t.GetLowHalf();
00705 
00706     t = (DWord) t.GetHighHalf() + A1B1.GetHighHalf() + C[3];
00707     C[3] = t.GetLowHalf();
00708     return t.GetHighHalf();
00709 }
00710 
00711 
00712 #define MulAcc(x, y)                                \
00713     p = DWord::MultiplyAndAdd(A[x], B[y], c);       \
00714     c = p.GetLowHalf();                             \
00715     p = (DWord) d + p.GetHighHalf();                \
00716     d = p.GetLowHalf();                             \
00717     e += p.GetHighHalf();
00718 
00719 #define SaveMulAcc(s, x, y)                         \
00720     R[s] = c;                                       \
00721     p = DWord::MultiplyAndAdd(A[x], B[y], d);       \
00722     c = p.GetLowHalf();                             \
00723     p = (DWord) e + p.GetHighHalf();                \
00724     d = p.GetLowHalf();                             \
00725     e = p.GetHighHalf();
00726 
00727 #define SquAcc(x, y)                                \
00728     q = DWord::Multiply(A[x], A[y]);                \
00729     p = q + c;                                      \
00730     c = p.GetLowHalf();                             \
00731     p = (DWord) d + p.GetHighHalf();                \
00732     d = p.GetLowHalf();                             \
00733     e += p.GetHighHalf();                           \
00734     p = q + c;                                      \
00735     c = p.GetLowHalf();                             \
00736     p = (DWord) d + p.GetHighHalf();                \
00737     d = p.GetLowHalf();                             \
00738     e += p.GetHighHalf();
00739 
00740 #define SaveSquAcc(s, x, y)                         \
00741     R[s] = c;                                       \
00742     q = DWord::Multiply(A[x], A[y]);                \
00743     p = q + d;                                      \
00744     c = p.GetLowHalf();                             \
00745     p = (DWord) e + p.GetHighHalf();                \
00746     d = p.GetLowHalf();                             \
00747     e = p.GetHighHalf();                            \
00748     p = q + c;                                      \
00749     c = p.GetLowHalf();                             \
00750     p = (DWord) d + p.GetHighHalf();                \
00751     d = p.GetLowHalf();                             \
00752     e += p.GetHighHalf();
00753 
00754 
00755 void Portable::Multiply4(word *R, const word *A, const word *B)
00756 {
00757     DWord p;
00758     word c, d, e;
00759 
00760     p = DWord::Multiply(A[0], B[0]);
00761     R[0] = p.GetLowHalf();
00762     c = p.GetHighHalf();
00763     d = e = 0;
00764 
00765     MulAcc(0, 1);
00766     MulAcc(1, 0);
00767 
00768     SaveMulAcc(1, 2, 0);
00769     MulAcc(1, 1);
00770     MulAcc(0, 2);
00771 
00772     SaveMulAcc(2, 0, 3);
00773     MulAcc(1, 2);
00774     MulAcc(2, 1);
00775     MulAcc(3, 0);
00776 
00777     SaveMulAcc(3, 3, 1);
00778     MulAcc(2, 2);
00779     MulAcc(1, 3);
00780 
00781     SaveMulAcc(4, 2, 3);
00782     MulAcc(3, 2);
00783 
00784     R[5] = c;
00785     p = DWord::MultiplyAndAdd(A[3], B[3], d);
00786     R[6] = p.GetLowHalf();
00787     R[7] = e + p.GetHighHalf();
00788 }
00789 
00790 void Portable::Square2(word *R, const word *A)
00791 {
00792     DWord p, q;
00793     word c, d, e;
00794 
00795     p = DWord::Multiply(A[0], A[0]);
00796     R[0] = p.GetLowHalf();
00797     c = p.GetHighHalf();
00798     d = e = 0;
00799 
00800     SquAcc(0, 1);
00801 
00802     R[1] = c;
00803     p = DWord::MultiplyAndAdd(A[1], A[1], d);
00804     R[2] = p.GetLowHalf();
00805     R[3] = e + p.GetHighHalf();
00806 }
00807 
00808 void Portable::Square4(word *R, const word *A)
00809 {
00810 #ifdef _MSC_VER
00811     // VC60 workaround: MSVC 6.0 has an optimization bug that makes
00812     // (dword)A*B where either A or B has been cast to a dword before
00813     // very expensive. Revisit this function when this
00814     // bug is fixed.
00815     Multiply4(R, A, A);
00816 #else
00817     const word *B = A;
00818     DWord p, q;
00819     word c, d, e;
00820 
00821     p = DWord::Multiply(A[0], A[0]);
00822     R[0] = p.GetLowHalf();
00823     c = p.GetHighHalf();
00824     d = e = 0;
00825 
00826     SquAcc(0, 1);
00827 
00828     SaveSquAcc(1, 2, 0);
00829     MulAcc(1, 1);
00830 
00831     SaveSquAcc(2, 0, 3);
00832     SquAcc(1, 2);
00833 
00834     SaveSquAcc(3, 3, 1);
00835     MulAcc(2, 2);
00836 
00837     SaveSquAcc(4, 2, 3);
00838 
00839     R[5] = c;
00840     p = DWord::MultiplyAndAdd(A[3], A[3], d);
00841     R[6] = p.GetLowHalf();
00842     R[7] = e + p.GetHighHalf();
00843 #endif
00844 }
00845 
00846 void Portable::Multiply8(word *R, const word *A, const word *B)
00847 {
00848     DWord p;
00849     word c, d, e;
00850 
00851     p = DWord::Multiply(A[0], B[0]);
00852     R[0] = p.GetLowHalf();
00853     c = p.GetHighHalf();
00854     d = e = 0;
00855 
00856     MulAcc(0, 1);
00857     MulAcc(1, 0);
00858 
00859     SaveMulAcc(1, 2, 0);
00860     MulAcc(1, 1);
00861     MulAcc(0, 2);
00862 
00863     SaveMulAcc(2, 0, 3);
00864     MulAcc(1, 2);
00865     MulAcc(2, 1);
00866     MulAcc(3, 0);
00867 
00868     SaveMulAcc(3, 0, 4);
00869     MulAcc(1, 3);
00870     MulAcc(2, 2);
00871     MulAcc(3, 1);
00872     MulAcc(4, 0);
00873 
00874     SaveMulAcc(4, 0, 5);
00875     MulAcc(1, 4);
00876     MulAcc(2, 3);
00877     MulAcc(3, 2);
00878     MulAcc(4, 1);
00879     MulAcc(5, 0);
00880 
00881     SaveMulAcc(5, 0, 6);
00882     MulAcc(1, 5);
00883     MulAcc(2, 4);
00884     MulAcc(3, 3);
00885     MulAcc(4, 2);
00886     MulAcc(5, 1);
00887     MulAcc(6, 0);
00888 
00889     SaveMulAcc(6, 0, 7);
00890     MulAcc(1, 6);
00891     MulAcc(2, 5);
00892     MulAcc(3, 4);
00893     MulAcc(4, 3);
00894     MulAcc(5, 2);
00895     MulAcc(6, 1);
00896     MulAcc(7, 0);
00897 
00898     SaveMulAcc(7, 1, 7);
00899     MulAcc(2, 6);
00900     MulAcc(3, 5);
00901     MulAcc(4, 4);
00902     MulAcc(5, 3);
00903     MulAcc(6, 2);
00904     MulAcc(7, 1);
00905 
00906     SaveMulAcc(8, 2, 7);
00907     MulAcc(3, 6);
00908     MulAcc(4, 5);
00909     MulAcc(5, 4);
00910     MulAcc(6, 3);
00911     MulAcc(7, 2);
00912 
00913     SaveMulAcc(9, 3, 7);
00914     MulAcc(4, 6);
00915     MulAcc(5, 5);
00916     MulAcc(6, 4);
00917     MulAcc(7, 3);
00918 
00919     SaveMulAcc(10, 4, 7);
00920     MulAcc(5, 6);
00921     MulAcc(6, 5);
00922     MulAcc(7, 4);
00923 
00924     SaveMulAcc(11, 5, 7);
00925     MulAcc(6, 6);
00926     MulAcc(7, 5);
00927 
00928     SaveMulAcc(12, 6, 7);
00929     MulAcc(7, 6);
00930 
00931     R[13] = c;
00932     p = DWord::MultiplyAndAdd(A[7], B[7], d);
00933     R[14] = p.GetLowHalf();
00934     R[15] = e + p.GetHighHalf();
00935 }
00936 
00937 void Portable::Multiply4Bottom(word *R, const word *A, const word *B)
00938 {
00939     DWord p;
00940     word c, d, e;
00941 
00942     p = DWord::Multiply(A[0], B[0]);
00943     R[0] = p.GetLowHalf();
00944     c = p.GetHighHalf();
00945     d = e = 0;
00946 
00947     MulAcc(0, 1);
00948     MulAcc(1, 0);
00949 
00950     SaveMulAcc(1, 2, 0);
00951     MulAcc(1, 1);
00952     MulAcc(0, 2);
00953 
00954     R[2] = c;
00955     R[3] = d + A[0] * B[3] + A[1] * B[2] + A[2] * B[1] + A[3] * B[0];
00956 }
00957 
00958 void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
00959 {
00960     DWord p;
00961     word c, d, e;
00962 
00963     p = DWord::Multiply(A[0], B[0]);
00964     R[0] = p.GetLowHalf();
00965     c = p.GetHighHalf();
00966     d = e = 0;
00967 
00968     MulAcc(0, 1);
00969     MulAcc(1, 0);
00970 
00971     SaveMulAcc(1, 2, 0);
00972     MulAcc(1, 1);
00973     MulAcc(0, 2);
00974 
00975     SaveMulAcc(2, 0, 3);
00976     MulAcc(1, 2);
00977     MulAcc(2, 1);
00978     MulAcc(3, 0);
00979 
00980     SaveMulAcc(3, 0, 4);
00981     MulAcc(1, 3);
00982     MulAcc(2, 2);
00983     MulAcc(3, 1);
00984     MulAcc(4, 0);
00985 
00986     SaveMulAcc(4, 0, 5);
00987     MulAcc(1, 4);
00988     MulAcc(2, 3);
00989     MulAcc(3, 2);
00990     MulAcc(4, 1);
00991     MulAcc(5, 0);
00992 
00993     SaveMulAcc(5, 0, 6);
00994     MulAcc(1, 5);
00995     MulAcc(2, 4);
00996     MulAcc(3, 3);
00997     MulAcc(4, 2);
00998     MulAcc(5, 1);
00999     MulAcc(6, 0);
01000 
01001     R[6] = c;
01002     R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
01003                A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
01004 }
01005 
01006 
01007 #undef MulAcc
01008 #undef SaveMulAcc
01009 #undef SquAcc
01010 #undef SaveSquAcc
01011 
01012 // optimized
01013 
01014 #ifdef TAOCRYPT_X86ASM_AVAILABLE
01015 
01016 // ************** x86 feature detection ***************
01017 
01018 static bool s_sse2Enabled = true;
01019 
01020 static void CpuId(word32 input, word32 *output)
01021 {
01022 #ifdef __GNUC__
01023     __asm__
01024     (
01025         // save ebx in case -fPIC is being used
01026         "push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
01027         : "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d"(output[3])
01028         : "a" (input)
01029     );
01030 #else
01031     __asm
01032     {
01033         mov eax, input
01034         cpuid
01035         mov edi, output
01036         mov [edi], eax
01037         mov [edi+4], ebx
01038         mov [edi+8], ecx
01039         mov [edi+12], edx
01040     }
01041 #endif
01042 }
01043 
01044 #ifdef SSE2_INTRINSICS_AVAILABLE
01045 #ifndef _MSC_VER
01046 static jmp_buf s_env;
01047 static void SigIllHandler(int)
01048 {
01049     longjmp(s_env, 1);
01050 }
01051 #endif
01052 
01053 static bool HasSSE2()
01054 {
01055     if (!s_sse2Enabled)
01056         return false;
01057 
01058     word32 cpuid[4];
01059     CpuId(1, cpuid);
01060     if ((cpuid[3] & (1 << 26)) == 0)
01061         return false;
01062 
01063 #ifdef _MSC_VER
01064     __try
01065     {
01066         __asm xorpd xmm0, xmm0        // executing SSE2 instruction
01067     }
01068     __except (1)
01069     {
01070         return false;
01071     }
01072     return true;
01073 #else
01074     typedef void (*SigHandler)(int);
01075 
01076     SigHandler oldHandler = signal(SIGILL, SigIllHandler);
01077     if (oldHandler == SIG_ERR)
01078         return false;
01079 
01080     bool result = true;
01081     if (setjmp(s_env))
01082         result = false;
01083     else
01084         __asm __volatile ("xorps %xmm0, %xmm0");
01085 
01086     signal(SIGILL, oldHandler);
01087     return result;
01088 #endif
01089 }
01090 #endif
01091 
01092 static bool IsP4()
01093 {
01094     word32 cpuid[4];
01095 
01096     CpuId(0, cpuid);
01097     mySTL::swap(cpuid[2], cpuid[3]);
01098     if (memcmp(cpuid+1, "GenuineIntel", 12) != 0)
01099         return false;
01100 
01101     CpuId(1, cpuid);
01102     return ((cpuid[0] >> 8) & 0xf) == 0xf;
01103 }
01104 
01105 // ************** Pentium/P4 optimizations ***************
01106 
01107 class PentiumOptimized : public Portable
01108 {
01109 public:
01110     static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
01111                                    unsigned int N);
01112     static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B,
01113                                         unsigned int N);
01114     static void TAOCRYPT_CDECL Multiply4(word *C, const word *A,
01115                                          const word *B);
01116     static void TAOCRYPT_CDECL Multiply8(word *C, const word *A,
01117                                          const word *B);
01118     static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
01119                                                const word *B);
01120 };
01121 
01122 class P4Optimized
01123 {
01124 public:
01125     static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
01126                                    unsigned int N);
01127     static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B,
01128                                         unsigned int N);
01129 #ifdef SSE2_INTRINSICS_AVAILABLE
01130     static void TAOCRYPT_CDECL Multiply4(word *C, const word *A,
01131                                          const word *B);
01132     static void TAOCRYPT_CDECL Multiply8(word *C, const word *A,
01133                                          const word *B);
01134     static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
01135                                                const word *B);
01136 #endif
01137 };
01138 
01139 typedef word (TAOCRYPT_CDECL * PAddSub)(word *C, const word *A, const word *B,
01140                                         unsigned int N);
01141 typedef void (TAOCRYPT_CDECL * PMul)(word *C, const word *A, const word *B);
01142 
01143 static PAddSub s_pAdd, s_pSub;
01144 #ifdef SSE2_INTRINSICS_AVAILABLE
01145 static PMul s_pMul4, s_pMul8, s_pMul8B;
01146 #endif
01147 
01148 static void SetPentiumFunctionPointers()
01149 {
01150     if (IsP4())
01151     {
01152         s_pAdd = &P4Optimized::Add;
01153         s_pSub = &P4Optimized::Subtract;
01154     }
01155     else
01156     {
01157         s_pAdd = &PentiumOptimized::Add;
01158         s_pSub = &PentiumOptimized::Subtract;
01159     }
01160 
01161 #ifdef SSE2_INTRINSICS_AVAILABLE
01162     if (HasSSE2())
01163     {
01164         s_pMul4 = &P4Optimized::Multiply4;
01165         s_pMul8 = &P4Optimized::Multiply8;
01166         s_pMul8B = &P4Optimized::Multiply8Bottom;
01167     }
01168     else
01169     {
01170         s_pMul4 = &PentiumOptimized::Multiply4;
01171         s_pMul8 = &PentiumOptimized::Multiply8;
01172         s_pMul8B = &PentiumOptimized::Multiply8Bottom;
01173     }
01174 #endif
01175 }
01176 
01177 static const char s_RunAtStartupSetPentiumFunctionPointers =
01178     (SetPentiumFunctionPointers(), 0);
01179 
01180 void DisableSSE2()
01181 {
01182     s_sse2Enabled = false;
01183     SetPentiumFunctionPointers();
01184 }
01185 
01186 class LowLevel : public PentiumOptimized
01187 {
01188 public:
01189     inline static word Add(word *C, const word *A, const word *B,
01190                            unsigned int N)
01191         {return s_pAdd(C, A, B, N);}
01192     inline static word Subtract(word *C, const word *A, const word *B,
01193                                 unsigned int N)
01194         {return s_pSub(C, A, B, N);}
01195     inline static void Square4(word *R, const word *A)
01196         {Multiply4(R, A, A);}
01197 #ifdef SSE2_INTRINSICS_AVAILABLE
01198     inline static void Multiply4(word *C, const word *A, const word *B)
01199         {s_pMul4(C, A, B);}
01200     inline static void Multiply8(word *C, const word *A, const word *B)
01201         {s_pMul8(C, A, B);}
01202     inline static void Multiply8Bottom(word *C, const word *A, const word *B)
01203         {s_pMul8B(C, A, B);}
01204 #endif
01205 };
01206 
01207 // use some tricks to share assembly code between MSVC and GCC
01208 #ifdef _MSC_VER
01209     #define TAOCRYPT_NAKED __declspec(naked)
01210     #define AS1(x) __asm x
01211     #define AS2(x, y) __asm x, y
01212     #define AddPrologue \
01213         __asm   push ebp \
01214         __asm   push ebx \
01215         __asm   push esi \
01216         __asm   push edi \
01217         __asm   mov             ecx, [esp+20] \
01218         __asm   mov             edx, [esp+24] \
01219         __asm   mov             ebx, [esp+28] \
01220         __asm   mov             esi, [esp+32]
01221     #define AddEpilogue \
01222         __asm   pop edi \
01223         __asm   pop esi \
01224         __asm   pop ebx \
01225         __asm   pop ebp \
01226         __asm   ret
01227     #define MulPrologue \
01228         __asm   push ebp \
01229         __asm   push ebx \
01230         __asm   push esi \
01231         __asm   push edi \
01232         __asm   mov ecx, [esp+28] \
01233         __asm   mov esi, [esp+24] \
01234         __asm   push [esp+20]
01235     #define MulEpilogue \
01236         __asm   add esp, 4 \
01237         __asm   pop edi \
01238         __asm   pop esi \
01239         __asm   pop ebx \
01240         __asm   pop ebp \
01241         __asm   ret
01242 #else
01243     #define TAOCRYPT_NAKED
01244     #define AS1(x) #x ";"
01245     #define AS2(x, y) #x ", " #y ";"
01246     #define AddPrologue \
01247         __asm__ __volatile__ \
01248         ( \
01249             "push %%ebx;"       /* save this manually, in case of -fPIC */ \
01250             "mov %2, %%ebx;" \
01251             ".intel_syntax noprefix;" \
01252             "push ebp;"
01253     #define AddEpilogue \
01254             "pop ebp;" \
01255             ".att_syntax prefix;" \
01256             "pop %%ebx;" \
01257                     : \
01258                     : "c" (C), "d" (A), "m" (B), "S" (N) \
01259                     : "%edi", "memory", "cc" \
01260         );
01261     #define MulPrologue \
01262         __asm__ __volatile__ \
01263         ( \
01264             "push %%ebx;"       /* save this manually, in case of -fPIC */ \
01265             "push %%ebp;" \
01266             "push %0;" \
01267             ".intel_syntax noprefix;"
01268     #define MulEpilogue \
01269             "add esp, 4;" \
01270             "pop ebp;" \
01271             "pop ebx;" \
01272             ".att_syntax prefix;" \
01273             : \
01274             : "rm" (Z), "S" (X), "c" (Y) \
01275             : "%eax", "%edx", "%edi", "memory", "cc" \
01276         );
01277 #endif
01278 
01279 TAOCRYPT_NAKED word PentiumOptimized::Add(word *C, const word *A,
01280                                           const word *B, unsigned int N)
01281 {
01282     AddPrologue
01283 
01284     // now: ebx = B, ecx = C, edx = A, esi = N
01285     AS2(    sub ecx, edx)           // hold the distance between C & A so we
01286                                     // can add this to A to get C
01287     AS2(    xor eax, eax)           // clear eax
01288 
01289     AS2(    sub eax, esi)           // eax is a negative index from end of B
01290     AS2(    lea ebx, [ebx+4*esi])   // ebx is end of B
01291 
01292     AS2(    sar eax, 1)             // unit of eax is now dwords; this also
01293                                     // clears the carry flag
01294     AS1(    jz  loopendAdd)         // if no dwords then nothing to do
01295 
01296     AS1(loopstartAdd:)
01297     AS2(    mov    esi,[edx])           // load lower word of A
01298     AS2(    mov    ebp,[edx+4])         // load higher word of A
01299 
01300     AS2(    mov    edi,[ebx+8*eax])     // load lower word of B
01301     AS2(    lea    edx,[edx+8])         // advance A and C
01302 
01303     AS2(    adc    esi,edi)             // add lower words
01304     AS2(    mov    edi,[ebx+8*eax+4])   // load higher word of B
01305 
01306     AS2(    adc    ebp,edi)             // add higher words
01307     AS1(    inc    eax)                 // advance B
01308 
01309     AS2(    mov    [edx+ecx-8],esi)     // store lower word result
01310     AS2(    mov    [edx+ecx-4],ebp)     // store higher word result
01311 
01312     AS1(    jnz    loopstartAdd)   // loop until eax overflows and becomes zero
01313 
01314     AS1(loopendAdd:)
01315     AS2(    adc eax, 0)     // store carry into eax (return result register)
01316 
01317     AddEpilogue
01318 }
01319 
01320 TAOCRYPT_NAKED word PentiumOptimized::Subtract(word *C, const word *A,
01321                                                const word *B, unsigned int N)
01322 {
01323     AddPrologue
01324 
01325     // now: ebx = B, ecx = C, edx = A, esi = N
01326     AS2(    sub ecx, edx)           // hold the distance between C & A so we
01327                                     // can add this to A to get C
01328     AS2(    xor eax, eax)           // clear eax
01329 
01330     AS2(    sub eax, esi)           // eax is a negative index from end of B
01331     AS2(    lea ebx, [ebx+4*esi])   // ebx is end of B
01332 
01333     AS2(    sar eax, 1)             // unit of eax is now dwords; this also
01334                                     // clears the carry flag
01335     AS1(    jz  loopendSub)         // if no dwords then nothing to do
01336 
01337     AS1(loopstartSub:)
01338     AS2(    mov    esi,[edx])           // load lower word of A
01339     AS2(    mov    ebp,[edx+4])         // load higher word of A
01340 
01341     AS2(    mov    edi,[ebx+8*eax])     // load lower word of B
01342     AS2(    lea    edx,[edx+8])         // advance A and C
01343 
01344     AS2(    sbb    esi,edi)             // subtract lower words
01345     AS2(    mov    edi,[ebx+8*eax+4])   // load higher word of B
01346 
01347     AS2(    sbb    ebp,edi)             // subtract higher words
01348     AS1(    inc    eax)                 // advance B
01349 
01350     AS2(    mov    [edx+ecx-8],esi)     // store lower word result
01351     AS2(    mov    [edx+ecx-4],ebp)     // store higher word result
01352 
01353     AS1(    jnz    loopstartSub)   // loop until eax overflows and becomes zero
01354 
01355     AS1(loopendSub:)
01356     AS2(    adc eax, 0)     // store carry into eax (return result register)
01357 
01358     AddEpilogue
01359 }
01360 
01361 // On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
01362 
01363 TAOCRYPT_NAKED word P4Optimized::Add(word *C, const word *A, const word *B,
01364                                      unsigned int N)
01365 {
01366     AddPrologue
01367 
01368     // now: ebx = B, ecx = C, edx = A, esi = N
01369     AS2(    xor     eax, eax)
01370     AS1(    neg     esi)
01371     AS1(    jz      loopendAddP4)       // if no dwords then nothing to do
01372 
01373     AS2(    mov     edi, [edx])
01374     AS2(    mov     ebp, [ebx])
01375     AS1(    jmp     carry1AddP4)
01376 
01377     AS1(loopstartAddP4:)
01378     AS2(    mov     edi, [edx+8])
01379     AS2(    add     ecx, 8)
01380     AS2(    add     edx, 8)
01381     AS2(    mov     ebp, [ebx])
01382     AS2(    add     edi, eax)
01383     AS1(    jc      carry1AddP4)
01384     AS2(    xor     eax, eax)
01385 
01386     AS1(carry1AddP4:)
01387     AS2(    add     edi, ebp)
01388     AS2(    mov     ebp, 1)
01389     AS2(    mov     [ecx], edi)
01390     AS2(    mov     edi, [edx+4])
01391     AS2(    cmovc   eax, ebp)
01392     AS2(    mov     ebp, [ebx+4])
01393     AS2(    add     ebx, 8)
01394     AS2(    add     edi, eax)
01395     AS1(    jc      carry2AddP4)
01396     AS2(    xor     eax, eax)
01397 
01398     AS1(carry2AddP4:)
01399     AS2(    add     edi, ebp)
01400     AS2(    mov     ebp, 1)
01401     AS2(    cmovc   eax, ebp)
01402     AS2(    mov     [ecx+4], edi)
01403     AS2(    add     esi, 2)
01404     AS1(    jnz     loopstartAddP4)
01405 
01406     AS1(loopendAddP4:)
01407 
01408     AddEpilogue
01409 }
01410 
01411 TAOCRYPT_NAKED word P4Optimized::Subtract(word *C, const word *A,
01412                                           const word *B, unsigned int N)
01413 {
01414     AddPrologue
01415 
01416     // now: ebx = B, ecx = C, edx = A, esi = N
01417     AS2(    xor     eax, eax)
01418     AS1(    neg     esi)
01419     AS1(    jz      loopendSubP4)       // if no dwords then nothing to do
01420 
01421     AS2(    mov     edi, [edx])
01422     AS2(    mov     ebp, [ebx])
01423     AS1(    jmp     carry1SubP4)
01424 
01425     AS1(loopstartSubP4:)
01426     AS2(    mov     edi, [edx+8])
01427     AS2(    add     edx, 8)
01428     AS2(    add     ecx, 8)
01429     AS2(    mov     ebp, [ebx])
01430     AS2(    sub     edi, eax)
01431     AS1(    jc      carry1SubP4)
01432     AS2(    xor     eax, eax)
01433 
01434     AS1(carry1SubP4:)
01435     AS2(    sub     edi, ebp)
01436     AS2(    mov     ebp, 1)
01437     AS2(    mov     [ecx], edi)
01438     AS2(    mov     edi, [edx+4])
01439     AS2(    cmovc   eax, ebp)
01440     AS2(    mov     ebp, [ebx+4])
01441     AS2(    add     ebx, 8)
01442     AS2(    sub     edi, eax)
01443     AS1(    jc      carry2SubP4)
01444     AS2(    xor     eax, eax)
01445 
01446     AS1(carry2SubP4:)
01447     AS2(    sub     edi, ebp)
01448     AS2(    mov     ebp, 1)
01449     AS2(    cmovc   eax, ebp)
01450     AS2(    mov     [ecx+4], edi)
01451     AS2(    add     esi, 2)
01452     AS1(    jnz     loopstartSubP4)
01453 
01454     AS1(loopendSubP4:)
01455 
01456     AddEpilogue
01457 }
01458 
01459 // multiply assembly code originally contributed by Leonard Janke
01460 
01461 #define MulStartup \
01462     AS2(xor ebp, ebp) \
01463     AS2(xor edi, edi) \
01464     AS2(xor ebx, ebx) 
01465 
01466 #define MulShiftCarry \
01467     AS2(mov ebp, edx) \
01468     AS2(mov edi, ebx) \
01469     AS2(xor ebx, ebx)
01470 
01471 #define MulAccumulateBottom(i,j) \
01472     AS2(mov eax, [ecx+4*j]) \
01473     AS2(imul eax, dword ptr [esi+4*i]) \
01474     AS2(add ebp, eax)
01475 
01476 #define MulAccumulate(i,j) \
01477     AS2(mov eax, [ecx+4*j]) \
01478     AS1(mul dword ptr [esi+4*i]) \
01479     AS2(add ebp, eax) \
01480     AS2(adc edi, edx) \
01481     AS2(adc bl, bh)
01482 
01483 #define MulStoreDigit(i)  \
01484     AS2(mov edx, edi) \
01485     AS2(mov edi, [esp]) \
01486     AS2(mov [edi+4*i], ebp)
01487 
01488 #define MulLastDiagonal(digits) \
01489     AS2(mov eax, [ecx+4*(digits-1)]) \
01490     AS1(mul dword ptr [esi+4*(digits-1)]) \
01491     AS2(add ebp, eax) \
01492     AS2(adc edx, edi) \
01493     AS2(mov edi, [esp]) \
01494     AS2(mov [edi+4*(2*digits-2)], ebp) \
01495     AS2(mov [edi+4*(2*digits-1)], edx)
01496 
01497 TAOCRYPT_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X,
01498                                                 const word* Y)
01499 {
01500     MulPrologue
01501     // now: [esp] = Z, esi = X, ecx = Y
01502     MulStartup
01503     MulAccumulate(0,0)
01504     MulStoreDigit(0)
01505     MulShiftCarry
01506 
01507     MulAccumulate(1,0)
01508     MulAccumulate(0,1)
01509     MulStoreDigit(1)
01510     MulShiftCarry
01511 
01512     MulAccumulate(2,0)
01513     MulAccumulate(1,1)
01514     MulAccumulate(0,2)
01515     MulStoreDigit(2)
01516     MulShiftCarry
01517 
01518     MulAccumulate(3,0)
01519     MulAccumulate(2,1)
01520     MulAccumulate(1,2)
01521     MulAccumulate(0,3)
01522     MulStoreDigit(3)
01523     MulShiftCarry
01524 
01525     MulAccumulate(3,1)
01526     MulAccumulate(2,2)
01527     MulAccumulate(1,3)
01528     MulStoreDigit(4)
01529     MulShiftCarry
01530 
01531     MulAccumulate(3,2)
01532     MulAccumulate(2,3)
01533     MulStoreDigit(5)
01534     MulShiftCarry
01535 
01536     MulLastDiagonal(4)
01537     MulEpilogue
01538 }
01539 
01540 TAOCRYPT_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X,
01541                                                 const word* Y)
01542 {
01543     MulPrologue
01544     // now: [esp] = Z, esi = X, ecx = Y
01545     MulStartup
01546     MulAccumulate(0,0)
01547     MulStoreDigit(0)
01548     MulShiftCarry
01549 
01550     MulAccumulate(1,0)
01551     MulAccumulate(0,1)
01552     MulStoreDigit(1)
01553     MulShiftCarry
01554 
01555     MulAccumulate(2,0)
01556     MulAccumulate(1,1)
01557     MulAccumulate(0,2)
01558     MulStoreDigit(2)
01559     MulShiftCarry
01560 
01561     MulAccumulate(3,0)
01562     MulAccumulate(2,1)
01563     MulAccumulate(1,2)
01564     MulAccumulate(0,3)
01565     MulStoreDigit(3)
01566     MulShiftCarry
01567 
01568     MulAccumulate(4,0)
01569     MulAccumulate(3,1)
01570     MulAccumulate(2,2)
01571     MulAccumulate(1,3)
01572     MulAccumulate(0,4)
01573     MulStoreDigit(4)
01574     MulShiftCarry
01575 
01576     MulAccumulate(5,0)
01577     MulAccumulate(4,1)
01578     MulAccumulate(3,2)
01579     MulAccumulate(2,3)
01580     MulAccumulate(1,4)
01581     MulAccumulate(0,5)
01582     MulStoreDigit(5)
01583     MulShiftCarry
01584 
01585     MulAccumulate(6,0)
01586     MulAccumulate(5,1)
01587     MulAccumulate(4,2)
01588     MulAccumulate(3,3)
01589     MulAccumulate(2,4)
01590     MulAccumulate(1,5)
01591     MulAccumulate(0,6)
01592     MulStoreDigit(6)
01593     MulShiftCarry
01594 
01595     MulAccumulate(7,0)
01596     MulAccumulate(6,1)
01597     MulAccumulate(5,2)
01598     MulAccumulate(4,3)
01599     MulAccumulate(3,4)
01600     MulAccumulate(2,5)
01601     MulAccumulate(1,6)
01602     MulAccumulate(0,7)
01603     MulStoreDigit(7)
01604     MulShiftCarry
01605 
01606     MulAccumulate(7,1)
01607     MulAccumulate(6,2)
01608     MulAccumulate(5,3)
01609     MulAccumulate(4,4)
01610     MulAccumulate(3,5)
01611     MulAccumulate(2,6)
01612     MulAccumulate(1,7)
01613     MulStoreDigit(8)
01614     MulShiftCarry
01615 
01616     MulAccumulate(7,2)
01617     MulAccumulate(6,3)
01618     MulAccumulate(5,4)
01619     MulAccumulate(4,5)
01620     MulAccumulate(3,6)
01621     MulAccumulate(2,7)
01622     MulStoreDigit(9)
01623     MulShiftCarry
01624 
01625     MulAccumulate(7,3)
01626     MulAccumulate(6,4)
01627     MulAccumulate(5,5)
01628     MulAccumulate(4,6)
01629     MulAccumulate(3,7)
01630     MulStoreDigit(10)
01631     MulShiftCarry
01632 
01633     MulAccumulate(7,4)
01634     MulAccumulate(6,5)
01635     MulAccumulate(5,6)
01636     MulAccumulate(4,7)
01637     MulStoreDigit(11)
01638     MulShiftCarry
01639 
01640     MulAccumulate(7,5)
01641     MulAccumulate(6,6)
01642     MulAccumulate(5,7)
01643     MulStoreDigit(12)
01644     MulShiftCarry
01645 
01646     MulAccumulate(7,6)
01647     MulAccumulate(6,7)
01648     MulStoreDigit(13)
01649     MulShiftCarry
01650 
01651     MulLastDiagonal(8)
01652     MulEpilogue
01653 }
01654 
01655 TAOCRYPT_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X,
01656                                                       const word* Y)
01657 {
01658     MulPrologue
01659     // now: [esp] = Z, esi = X, ecx = Y
01660     MulStartup
01661     MulAccumulate(0,0)
01662     MulStoreDigit(0)
01663     MulShiftCarry
01664 
01665     MulAccumulate(1,0)
01666     MulAccumulate(0,1)
01667     MulStoreDigit(1)
01668     MulShiftCarry
01669 
01670     MulAccumulate(2,0)
01671     MulAccumulate(1,1)
01672     MulAccumulate(0,2)
01673     MulStoreDigit(2)
01674     MulShiftCarry
01675 
01676     MulAccumulate(3,0)
01677     MulAccumulate(2,1)
01678     MulAccumulate(1,2)
01679     MulAccumulate(0,3)
01680     MulStoreDigit(3)
01681     MulShiftCarry
01682 
01683     MulAccumulate(4,0)
01684     MulAccumulate(3,1)
01685     MulAccumulate(2,2)
01686     MulAccumulate(1,3)
01687     MulAccumulate(0,4)
01688     MulStoreDigit(4)
01689     MulShiftCarry
01690 
01691     MulAccumulate(5,0)
01692     MulAccumulate(4,1)
01693     MulAccumulate(3,2)
01694     MulAccumulate(2,3)
01695     MulAccumulate(1,4)
01696     MulAccumulate(0,5)
01697     MulStoreDigit(5)
01698     MulShiftCarry
01699 
01700     MulAccumulate(6,0)
01701     MulAccumulate(5,1)
01702     MulAccumulate(4,2)
01703     MulAccumulate(3,3)
01704     MulAccumulate(2,4)
01705     MulAccumulate(1,5)
01706     MulAccumulate(0,6)
01707     MulStoreDigit(6)
01708     MulShiftCarry
01709 
01710     MulAccumulateBottom(7,0)
01711     MulAccumulateBottom(6,1)
01712     MulAccumulateBottom(5,2)
01713     MulAccumulateBottom(4,3)
01714     MulAccumulateBottom(3,4)
01715     MulAccumulateBottom(2,5)
01716     MulAccumulateBottom(1,6)
01717     MulAccumulateBottom(0,7)
01718     MulStoreDigit(7)
01719     MulEpilogue
01720 }
01721 
01722 #undef AS1
01723 #undef AS2
01724 
01725 #else   // not x86 - no processor specific code at this layer
01726 
01727 typedef Portable LowLevel;
01728 
01729 #endif
01730 
01731 #ifdef SSE2_INTRINSICS_AVAILABLE
01732 
01733 #ifdef __GNUC__
01734 #define TAOCRYPT_FASTCALL
01735 #else
01736 #define TAOCRYPT_FASTCALL __fastcall
01737 #endif
01738 
01739 static void TAOCRYPT_FASTCALL P4_Mul(__m128i *C, const __m128i *A,
01740                                      const __m128i *B)
01741 {
01742     __m128i a3210 = _mm_load_si128(A);
01743     __m128i b3210 = _mm_load_si128(B);
01744 
01745     __m128i sum;
01746 
01747     __m128i z = _mm_setzero_si128();
01748     __m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210);
01749     C[0] = a2b2_a0b0;
01750 
01751     __m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0));
01752     __m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1));
01753     __m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021);
01754     __m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z);
01755     __m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z);
01756     C[1] = _mm_add_epi64(a1b0, a0b1);
01757 
01758     __m128i a31 = _mm_srli_epi64(a3210, 32);
01759     __m128i b31 = _mm_srli_epi64(b3210, 32);
01760     __m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31);
01761     C[6] = a3b3_a1b1;
01762 
01763     __m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z);
01764     __m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2));
01765     __m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012);
01766     __m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z);
01767     __m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z);
01768     sum = _mm_add_epi64(a1b1, a0b2);
01769     C[2] = _mm_add_epi64(sum, a2b0);
01770 
01771     __m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1));
01772     __m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3));
01773     __m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012);
01774     __m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103);
01775     __m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z);
01776     __m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z);
01777     __m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z);
01778     __m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z);
01779     __m128i sum1 = _mm_add_epi64(a3b0, a1b2);
01780     sum = _mm_add_epi64(a2b1, a0b3);
01781     C[3] = _mm_add_epi64(sum, sum1);
01782 
01783     __m128i     a3b1_a1b3 = _mm_mul_epu32(a2301, b2103);
01784     __m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z);
01785     __m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z);
01786     __m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z);
01787     sum = _mm_add_epi64(a2b2, a3b1);
01788     C[4] = _mm_add_epi64(sum, a1b3);
01789 
01790     __m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2));
01791     __m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3));
01792     __m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203);
01793     __m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z);
01794     __m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z);
01795     C[5] = _mm_add_epi64(a3b2, a2b3);
01796 }
01797 
01798 void P4Optimized::Multiply4(word *C, const word *A, const word *B)
01799 {
01800     __m128i temp[7];
01801     const word *w = (word *)temp;
01802     const __m64 *mw = (__m64 *)w;
01803 
01804     P4_Mul(temp, (__m128i *)A, (__m128i *)B);
01805 
01806     C[0] = w[0];
01807 
01808     __m64 s1, s2;
01809 
01810     __m64 w1 = _mm_cvtsi32_si64(w[1]);
01811     __m64 w4 = mw[2];
01812     __m64 w6 = mw[3];
01813     __m64 w8 = mw[4];
01814     __m64 w10 = mw[5];
01815     __m64 w12 = mw[6];
01816     __m64 w14 = mw[7];
01817     __m64 w16 = mw[8];
01818     __m64 w18 = mw[9];
01819     __m64 w20 = mw[10];
01820     __m64 w22 = mw[11];
01821     __m64 w26 = _mm_cvtsi32_si64(w[26]);
01822 
01823     s1 = _mm_add_si64(w1, w4);
01824     C[1] = _mm_cvtsi64_si32(s1);
01825     s1 = _mm_srli_si64(s1, 32);
01826 
01827     s2 = _mm_add_si64(w6, w8);
01828     s1 = _mm_add_si64(s1, s2);
01829     C[2] = _mm_cvtsi64_si32(s1);
01830     s1 = _mm_srli_si64(s1, 32);
01831 
01832     s2 = _mm_add_si64(w10, w12);
01833     s1 = _mm_add_si64(s1, s2);
01834     C[3] = _mm_cvtsi64_si32(s1);
01835     s1 = _mm_srli_si64(s1, 32);
01836 
01837     s2 = _mm_add_si64(w14, w16);
01838     s1 = _mm_add_si64(s1, s2);
01839     C[4] = _mm_cvtsi64_si32(s1);
01840     s1 = _mm_srli_si64(s1, 32);
01841 
01842     s2 = _mm_add_si64(w18, w20);
01843     s1 = _mm_add_si64(s1, s2);
01844     C[5] = _mm_cvtsi64_si32(s1);
01845     s1 = _mm_srli_si64(s1, 32);
01846 
01847     s2 = _mm_add_si64(w22, w26);
01848     s1 = _mm_add_si64(s1, s2);
01849     C[6] = _mm_cvtsi64_si32(s1);
01850     s1 = _mm_srli_si64(s1, 32);
01851 
01852     C[7] = _mm_cvtsi64_si32(s1) + w[27];
01853     _mm_empty();
01854 }
01855 
01856 void P4Optimized::Multiply8(word *C, const word *A, const word *B)
01857 {
01858     __m128i temp[28];
01859     const word *w = (word *)temp;
01860     const __m64 *mw = (__m64 *)w;
01861     const word *x = (word *)temp+7*4;
01862     const __m64 *mx = (__m64 *)x;
01863     const word *y = (word *)temp+7*4*2;
01864     const __m64 *my = (__m64 *)y;
01865     const word *z = (word *)temp+7*4*3;
01866     const __m64 *mz = (__m64 *)z;
01867 
01868     P4_Mul(temp, (__m128i *)A, (__m128i *)B);
01869 
01870     P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
01871 
01872     P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
01873 
01874     P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);
01875 
01876     C[0] = w[0];
01877 
01878     __m64 s1, s2, s3, s4;
01879 
01880     __m64 w1 = _mm_cvtsi32_si64(w[1]);
01881     __m64 w4 = mw[2];
01882     __m64 w6 = mw[3];
01883     __m64 w8 = mw[4];
01884     __m64 w10 = mw[5];
01885     __m64 w12 = mw[6];
01886     __m64 w14 = mw[7];
01887     __m64 w16 = mw[8];
01888     __m64 w18 = mw[9];
01889     __m64 w20 = mw[10];
01890     __m64 w22 = mw[11];
01891     __m64 w26 = _mm_cvtsi32_si64(w[26]);
01892     __m64 w27 = _mm_cvtsi32_si64(w[27]);
01893 
01894     __m64 x0 = _mm_cvtsi32_si64(x[0]);
01895     __m64 x1 = _mm_cvtsi32_si64(x[1]);
01896     __m64 x4 = mx[2];
01897     __m64 x6 = mx[3];
01898     __m64 x8 = mx[4];
01899     __m64 x10 = mx[5];
01900     __m64 x12 = mx[6];
01901     __m64 x14 = mx[7];
01902     __m64 x16 = mx[8];
01903     __m64 x18 = mx[9];
01904     __m64 x20 = mx[10];
01905     __m64 x22 = mx[11];
01906     __m64 x26 = _mm_cvtsi32_si64(x[26]);
01907     __m64 x27 = _mm_cvtsi32_si64(x[27]);
01908 
01909     __m64 y0 = _mm_cvtsi32_si64(y[0]);
01910     __m64 y1 = _mm_cvtsi32_si64(y[1]);
01911     __m64 y4 = my[2];
01912     __m64 y6 = my[3];
01913     __m64 y8 = my[4];
01914     __m64 y10 = my[5];
01915     __m64 y12 = my[6];
01916     __m64 y14 = my[7];
01917     __m64 y16 = my[8];
01918     __m64 y18 = my[9];
01919     __m64 y20 = my[10];
01920     __m64 y22 = my[11];
01921     __m64 y26 = _mm_cvtsi32_si64(y[26]);
01922     __m64 y27 = _mm_cvtsi32_si64(y[27]);
01923 
01924     __m64 z0 = _mm_cvtsi32_si64(z[0]);
01925     __m64 z1 = _mm_cvtsi32_si64(z[1]);
01926     __m64 z4 = mz[2];
01927     __m64 z6 = mz[3];
01928     __m64 z8 = mz[4];
01929     __m64 z10 = mz[5];
01930     __m64 z12 = mz[6];
01931     __m64 z14 = mz[7];
01932     __m64 z16 = mz[8];
01933     __m64 z18 = mz[9];
01934     __m64 z20 = mz[10];
01935     __m64 z22 = mz[11];
01936     __m64 z26 = _mm_cvtsi32_si64(z[26]);
01937 
01938     s1 = _mm_add_si64(w1, w4);
01939     C[1] = _mm_cvtsi64_si32(s1);
01940     s1 = _mm_srli_si64(s1, 32);
01941 
01942     s2 = _mm_add_si64(w6, w8);
01943     s1 = _mm_add_si64(s1, s2);
01944     C[2] = _mm_cvtsi64_si32(s1);
01945     s1 = _mm_srli_si64(s1, 32);
01946 
01947     s2 = _mm_add_si64(w10, w12);
01948     s1 = _mm_add_si64(s1, s2);
01949     C[3] = _mm_cvtsi64_si32(s1);
01950     s1 = _mm_srli_si64(s1, 32);
01951 
01952     s3 = _mm_add_si64(x0, y0);
01953     s2 = _mm_add_si64(w14, w16);
01954     s1 = _mm_add_si64(s1, s3);
01955     s1 = _mm_add_si64(s1, s2);
01956     C[4] = _mm_cvtsi64_si32(s1);
01957     s1 = _mm_srli_si64(s1, 32);
01958 
01959     s3 = _mm_add_si64(x1, y1);
01960     s4 = _mm_add_si64(x4, y4);
01961     s1 = _mm_add_si64(s1, w18);
01962     s3 = _mm_add_si64(s3, s4);
01963     s1 = _mm_add_si64(s1, w20);
01964     s1 = _mm_add_si64(s1, s3);
01965     C[5] = _mm_cvtsi64_si32(s1);
01966     s1 = _mm_srli_si64(s1, 32);
01967 
01968     s3 = _mm_add_si64(x6, y6);
01969     s4 = _mm_add_si64(x8, y8);
01970     s1 = _mm_add_si64(s1, w22);
01971     s3 = _mm_add_si64(s3, s4);
01972     s1 = _mm_add_si64(s1, w26);
01973     s1 = _mm_add_si64(s1, s3);
01974     C[6] = _mm_cvtsi64_si32(s1);
01975     s1 = _mm_srli_si64(s1, 32);
01976 
01977     s3 = _mm_add_si64(x10, y10);
01978     s4 = _mm_add_si64(x12, y12);
01979     s1 = _mm_add_si64(s1, w27);
01980     s3 = _mm_add_si64(s3, s4);
01981     s1 = _mm_add_si64(s1, s3);
01982     C[7] = _mm_cvtsi64_si32(s1);
01983     s1 = _mm_srli_si64(s1, 32);
01984 
01985     s3 = _mm_add_si64(x14, y14);
01986     s4 = _mm_add_si64(x16, y16);
01987     s1 = _mm_add_si64(s1, z0);
01988     s3 = _mm_add_si64(s3, s4);
01989     s1 = _mm_add_si64(s1, s3);
01990     C[8] = _mm_cvtsi64_si32(s1);
01991     s1 = _mm_srli_si64(s1, 32);
01992 
01993     s3 = _mm_add_si64(x18, y18);
01994     s4 = _mm_add_si64(x20, y20);
01995     s1 = _mm_add_si64(s1, z1);
01996     s3 = _mm_add_si64(s3, s4);
01997     s1 = _mm_add_si64(s1, z4);
01998     s1 = _mm_add_si64(s1, s3);
01999     C[9] = _mm_cvtsi64_si32(s1);
02000     s1 = _mm_srli_si64(s1, 32);
02001 
02002     s3 = _mm_add_si64(x22, y22);
02003     s4 = _mm_add_si64(x26, y26);
02004     s1 = _mm_add_si64(s1, z6);
02005     s3 = _mm_add_si64(s3, s4);
02006     s1 = _mm_add_si64(s1, z8);
02007     s1 = _mm_add_si64(s1, s3);
02008     C[10] = _mm_cvtsi64_si32(s1);
02009     s1 = _mm_srli_si64(s1, 32);
02010 
02011     s3 = _mm_add_si64(x27, y27);
02012     s1 = _mm_add_si64(s1, z10);
02013     s1 = _mm_add_si64(s1, z12);
02014     s1 = _mm_add_si64(s1, s3);
02015     C[11] = _mm_cvtsi64_si32(s1);
02016     s1 = _mm_srli_si64(s1, 32);
02017 
02018     s3 = _mm_add_si64(z14, z16);
02019     s1 = _mm_add_si64(s1, s3);
02020     C[12] = _mm_cvtsi64_si32(s1);
02021     s1 = _mm_srli_si64(s1, 32);
02022 
02023     s3 = _mm_add_si64(z18, z20);
02024     s1 = _mm_add_si64(s1, s3);
02025     C[13] = _mm_cvtsi64_si32(s1);
02026     s1 = _mm_srli_si64(s1, 32);
02027 
02028     s3 = _mm_add_si64(z22, z26);
02029     s1 = _mm_add_si64(s1, s3);
02030     C[14] = _mm_cvtsi64_si32(s1);
02031     s1 = _mm_srli_si64(s1, 32);
02032 
02033     C[15] = z[27] + _mm_cvtsi64_si32(s1);
02034     _mm_empty();
02035 }
02036 
02037 void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
02038 {
02039     __m128i temp[21];
02040     const word *w = (word *)temp;
02041     const __m64 *mw = (__m64 *)w;
02042     const word *x = (word *)temp+7*4;
02043     const __m64 *mx = (__m64 *)x;
02044     const word *y = (word *)temp+7*4*2;
02045     const __m64 *my = (__m64 *)y;
02046 
02047     P4_Mul(temp, (__m128i *)A, (__m128i *)B);
02048 
02049     P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
02050 
02051     P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
02052 
02053     C[0] = w[0];
02054 
02055     __m64 s1, s2, s3, s4;
02056 
02057     __m64 w1 = _mm_cvtsi32_si64(w[1]);
02058     __m64 w4 = mw[2];
02059     __m64 w6 = mw[3];
02060     __m64 w8 = mw[4];
02061     __m64 w10 = mw[5];
02062     __m64 w12 = mw[6];
02063     __m64 w14 = mw[7];
02064     __m64 w16 = mw[8];
02065     __m64 w18 = mw[9];
02066     __m64 w20 = mw[10];
02067     __m64 w22 = mw[11];
02068     __m64 w26 = _mm_cvtsi32_si64(w[26]);
02069 
02070     __m64 x0 = _mm_cvtsi32_si64(x[0]);
02071     __m64 x1 = _mm_cvtsi32_si64(x[1]);
02072     __m64 x4 = mx[2];
02073     __m64 x6 = mx[3];
02074     __m64 x8 = mx[4];
02075 
02076     __m64 y0 = _mm_cvtsi32_si64(y[0]);
02077     __m64 y1 = _mm_cvtsi32_si64(y[1]);
02078     __m64 y4 = my[2];
02079     __m64 y6 = my[3];
02080     __m64 y8 = my[4];
02081 
02082     s1 = _mm_add_si64(w1, w4);
02083     C[1] = _mm_cvtsi64_si32(s1);
02084     s1 = _mm_srli_si64(s1, 32);
02085 
02086     s2 = _mm_add_si64(w6, w8);
02087     s1 = _mm_add_si64(s1, s2);
02088     C[2] = _mm_cvtsi64_si32(s1);
02089     s1 = _mm_srli_si64(s1, 32);
02090 
02091     s2 = _mm_add_si64(w10, w12);
02092     s1 = _mm_add_si64(s1, s2);
02093     C[3] = _mm_cvtsi64_si32(s1);
02094     s1 = _mm_srli_si64(s1, 32);
02095 
02096     s3 = _mm_add_si64(x0, y0);
02097     s2 = _mm_add_si64(w14, w16);
02098     s1 = _mm_add_si64(s1, s3);
02099     s1 = _mm_add_si64(s1, s2);
02100     C[4] = _mm_cvtsi64_si32(s1);
02101     s1 = _mm_srli_si64(s1, 32);
02102 
02103     s3 = _mm_add_si64(x1, y1);
02104     s4 = _mm_add_si64(x4, y4);
02105     s1 = _mm_add_si64(s1, w18);
02106     s3 = _mm_add_si64(s3, s4);
02107     s1 = _mm_add_si64(s1, w20);
02108     s1 = _mm_add_si64(s1, s3);
02109     C[5] = _mm_cvtsi64_si32(s1);
02110     s1 = _mm_srli_si64(s1, 32);
02111 
02112     s3 = _mm_add_si64(x6, y6);
02113     s4 = _mm_add_si64(x8, y8);
02114     s1 = _mm_add_si64(s1, w22);
02115     s3 = _mm_add_si64(s3, s4);
02116     s1 = _mm_add_si64(s1, w26);
02117     s1 = _mm_add_si64(s1, s3);
02118     C[6] = _mm_cvtsi64_si32(s1);
02119     s1 = _mm_srli_si64(s1, 32);
02120 
02121     C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
02122     _mm_empty();
02123 }
02124 
02125 #endif  // #ifdef SSE2_INTRINSICS_AVAILABLE
02126 
02127 // end optimized
02128 
02129 // ********************************************************
02130 
02131 #define A0      A
02132 #define A1      (A+N2)
02133 #define B0      B
02134 #define B1      (B+N2)
02135 
02136 #define T0      T
02137 #define T1      (T+N2)
02138 #define T2      (T+N)
02139 #define T3      (T+N+N2)
02140 
02141 #define R0      R
02142 #define R1      (R+N2)
02143 #define R2      (R+N)
02144 #define R3      (R+N+N2)
02145 
02146 //VC60 workaround: compiler bug triggered without the extra dummy parameters
02147 
02148 // R[2*N] - result = A*B
02149 // T[2*N] - temporary work space
02150 // A[N] --- multiplier
02151 // B[N] --- multiplicant
02152 
02153 
02154 void RecursiveMultiply(word *R, word *T, const word *A, const word *B,
02155                        unsigned int N)
02156 {
02157     assert(N>=2 && N%2==0);
02158 
02159     if (LowLevel::MultiplyRecursionLimit() >= 8 && N==8)
02160         LowLevel::Multiply8(R, A, B);
02161     else if (LowLevel::MultiplyRecursionLimit() >= 4 && N==4)
02162         LowLevel::Multiply4(R, A, B);
02163     else if (N==2)
02164         LowLevel::Multiply2(R, A, B);
02165     else
02166     {
02167         const unsigned int N2 = N/2;
02168         int carry;
02169 
02170         int aComp = Compare(A0, A1, N2);
02171         int bComp = Compare(B0, B1, N2);
02172 
02173         switch (2*aComp + aComp + bComp)
02174         {
02175         case -4:
02176             LowLevel::Subtract(R0, A1, A0, N2);
02177             LowLevel::Subtract(R1, B0, B1, N2);
02178             RecursiveMultiply(T0, T2, R0, R1, N2);
02179             LowLevel::Subtract(T1, T1, R0, N2);
02180             carry = -1;
02181             break;
02182         case -2:
02183             LowLevel::Subtract(R0, A1, A0, N2);
02184             LowLevel::Subtract(R1, B0, B1, N2);
02185             RecursiveMultiply(T0, T2, R0, R1, N2);
02186             carry = 0;
02187             break;
02188         case 2:
02189             LowLevel::Subtract(R0, A0, A1, N2);
02190             LowLevel::Subtract(R1, B1, B0, N2);
02191             RecursiveMultiply(T0, T2, R0, R1, N2);
02192             carry = 0;
02193             break;
02194         case 4:
02195             LowLevel::Subtract(R0, A1, A0, N2);
02196             LowLevel::Subtract(R1, B0, B1, N2);
02197             RecursiveMultiply(T0, T2, R0, R1, N2);
02198             LowLevel::Subtract(T1, T1, R1, N2);
02199             carry = -1;
02200             break;
02201         default:
02202             SetWords(T0, 0, N);
02203             carry = 0;
02204         }
02205 
02206         RecursiveMultiply(R0, T2, A0, B0, N2);
02207         RecursiveMultiply(R2, T2, A1, B1, N2);
02208 
02209         // now T[01] holds (A1-A0)*(B0-B1),R[01] holds A0*B0, R[23] holds A1*B1
02210 
02211         carry += LowLevel::Add(T0, T0, R0, N);
02212         carry += LowLevel::Add(T0, T0, R2, N);
02213         carry += LowLevel::Add(R1, R1, T0, N);
02214 
02215         assert (carry >= 0 && carry <= 2);
02216         Increment(R3, N2, carry);
02217     }
02218 }
02219 
02220 
02221 void RecursiveSquare(word *R, word *T, const word *A, unsigned int N)                     
02222 {
02223     assert(N && N%2==0);
02224     if (LowLevel::SquareRecursionLimit() >= 8 && N==8)
02225         LowLevel::Square8(R, A);
02226     if (LowLevel::SquareRecursionLimit() >= 4 && N==4)
02227         LowLevel::Square4(R, A);
02228     else if (N==2)
02229         LowLevel::Square2(R, A);
02230     else
02231     {
02232         const unsigned int N2 = N/2;
02233 
02234         RecursiveSquare(R0, T2, A0, N2);
02235         RecursiveSquare(R2, T2, A1, N2);
02236         RecursiveMultiply(T0, T2, A0, A1, N2);
02237 
02238         word carry = LowLevel::Add(R1, R1, T0, N);
02239         carry += LowLevel::Add(R1, R1, T0, N);
02240         Increment(R3, N2, carry);
02241     }
02242 }
02243 
02244 
02245 // R[N] - bottom half of A*B
02246 // T[N] - temporary work space
02247 // A[N] - multiplier
02248 // B[N] - multiplicant
02249 
02250 
02251 void RecursiveMultiplyBottom(word *R, word *T, const word *A, const word *B,
02252                              unsigned int N)
02253 {
02254     assert(N>=2 && N%2==0);
02255     if (LowLevel::MultiplyBottomRecursionLimit() >= 8 && N==8)
02256         LowLevel::Multiply8Bottom(R, A, B);
02257     else if (LowLevel::MultiplyBottomRecursionLimit() >= 4 && N==4)
02258         LowLevel::Multiply4Bottom(R, A, B);
02259     else if (N==2)
02260         LowLevel::Multiply2Bottom(R, A, B);
02261     else
02262     {
02263         const unsigned int N2 = N/2;
02264 
02265         RecursiveMultiply(R, T, A0, B0, N2);
02266         RecursiveMultiplyBottom(T0, T1, A1, B0, N2);
02267         LowLevel::Add(R1, R1, T0, N2);
02268         RecursiveMultiplyBottom(T0, T1, A0, B1, N2);
02269         LowLevel::Add(R1, R1, T0, N2);
02270     }
02271 }
02272 
02273 
02274 void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A,
02275                           const word *B, unsigned int N)
02276 {
02277     assert(N>=2 && N%2==0);
02278 
02279     if (N==4)
02280     {
02281         LowLevel::Multiply4(T, A, B);
02282         memcpy(R, T+4, 4*WORD_SIZE);
02283     }
02284     else if (N==2)
02285     {
02286         LowLevel::Multiply2(T, A, B);
02287         memcpy(R, T+2, 2*WORD_SIZE);
02288     }
02289     else
02290     {
02291         const unsigned int N2 = N/2;
02292         int carry;
02293 
02294         int aComp = Compare(A0, A1, N2);
02295         int bComp = Compare(B0, B1, N2);
02296 
02297         switch (2*aComp + aComp + bComp)
02298         {
02299         case -4:
02300             LowLevel::Subtract(R0, A1, A0, N2);
02301             LowLevel::Subtract(R1, B0, B1, N2);
02302             RecursiveMultiply(T0, T2, R0, R1, N2);
02303             LowLevel::Subtract(T1, T1, R0, N2);
02304             carry = -1;
02305             break;
02306         case -2:
02307             LowLevel::Subtract(R0, A1, A0, N2);
02308             LowLevel::Subtract(R1, B0, B1, N2);
02309             RecursiveMultiply(T0, T2, R0, R1, N2);
02310             carry = 0;
02311             break;
02312         case 2:
02313             LowLevel::Subtract(R0, A0, A1, N2);
02314             LowLevel::Subtract(R1, B1, B0, N2);
02315             RecursiveMultiply(T0, T2, R0, R1, N2);
02316             carry = 0;
02317             break;
02318         case 4:
02319             LowLevel::Subtract(R0, A1, A0, N2);
02320             LowLevel::Subtract(R1, B0, B1, N2);
02321             RecursiveMultiply(T0, T2, R0, R1, N2);
02322             LowLevel::Subtract(T1, T1, R1, N2);
02323             carry = -1;
02324             break;
02325         default:
02326             SetWords(T0, 0, N);
02327             carry = 0;
02328         }
02329 
02330         RecursiveMultiply(T2, R0, A1, B1, N2);
02331 
02332         // now T[01] holds (A1-A0)*(B0-B1), T[23] holds A1*B1
02333 
02334         word c2 = LowLevel::Subtract(R0, L+N2, L, N2);
02335         c2 += LowLevel::Subtract(R0, R0, T0, N2);
02336         word t = (Compare(R0, T2, N2) == -1);
02337 
02338         carry += t;
02339         carry += Increment(R0, N2, c2+t);
02340         carry += LowLevel::Add(R0, R0, T1, N2);
02341         carry += LowLevel::Add(R0, R0, T3, N2);
02342         assert (carry >= 0 && carry <= 2);
02343 
02344         CopyWords(R1, T3, N2);
02345         Increment(R1, N2, carry);
02346     }
02347 }
02348 
02349 
02350 inline word Add(word *C, const word *A, const word *B, unsigned int N)
02351 {
02352     return LowLevel::Add(C, A, B, N);
02353 }
02354 
02355 inline word Subtract(word *C, const word *A, const word *B, unsigned int N)
02356 {
02357     return LowLevel::Subtract(C, A, B, N);
02358 }
02359 
02360 inline void Multiply(word *R, word *T, const word *A, const word *B,
02361                      unsigned int N)
02362 {
02363     RecursiveMultiply(R, T, A, B, N);
02364 }
02365 
02366 inline void Square(word *R, word *T, const word *A, unsigned int N)
02367 {
02368     RecursiveSquare(R, T, A, N);
02369 }
02370 
02371 
02372 void AsymmetricMultiply(word *R, word *T, const word *A, unsigned int NA,
02373                         const word *B, unsigned int NB)
02374 {
02375     if (NA == NB)
02376     {
02377         if (A == B)
02378             Square(R, T, A, NA);
02379         else
02380             Multiply(R, T, A, B, NA);
02381 
02382         return;
02383     }
02384 
02385     if (NA > NB)
02386     {
02387         mySTL::swap(A, B);
02388         mySTL::swap(NA, NB);
02389     }
02390 
02391     assert(NB % NA == 0);
02392     assert((NB/NA)%2 == 0);     // NB is an even multiple of NA
02393 
02394     if (NA==2 && !A[1])
02395     {
02396         switch (A[0])
02397         {
02398         case 0:
02399             SetWords(R, 0, NB+2);
02400             return;
02401         case 1:
02402             CopyWords(R, B, NB);
02403             R[NB] = R[NB+1] = 0;
02404             return;
02405         default:
02406             R[NB] = LinearMultiply(R, B, A[0], NB);
02407             R[NB+1] = 0;
02408             return;
02409         }
02410     }
02411 
02412     Multiply(R, T, A, B, NA);
02413     CopyWords(T+2*NA, R+NA, NA);
02414 
02415     unsigned i;
02416 
02417     for (i=2*NA; i<NB; i+=2*NA)
02418         Multiply(T+NA+i, T, A, B+i, NA);
02419     for (i=NA; i<NB; i+=2*NA)
02420         Multiply(R+i, T, A, B+i, NA);
02421 
02422     if (Add(R+NA, R+NA, T+2*NA, NB-NA))
02423         Increment(R+NB, NA);
02424 }
02425 
02426 
02427 void PositiveMultiply(Integer& product, const Integer& a, const Integer& b)
02428 {
02429     unsigned int aSize = RoundupSize(a.WordCount());
02430     unsigned int bSize = RoundupSize(b.WordCount());
02431 
02432     product.reg_.CleanNew(RoundupSize(aSize + bSize));
02433     product.sign_ = Integer::POSITIVE;
02434 
02435     AlignedWordBlock workspace(aSize + bSize);
02436     AsymmetricMultiply(product.reg_.get_buffer(), workspace.get_buffer(),
02437                        a.reg_.get_buffer(), aSize, b.reg_.get_buffer(), bSize);
02438 }
02439 
02440 void Multiply(Integer &product, const Integer &a, const Integer &b)
02441 {
02442     PositiveMultiply(product, a, b);
02443 
02444     if (a.NotNegative() != b.NotNegative())
02445         product.Negate();
02446 }
02447 
02448 
02449 static inline unsigned int EvenWordCount(const word *X, unsigned int N)
02450 {
02451     while (N && X[N-2]==0 && X[N-1]==0)
02452         N-=2;
02453     return N;
02454 }
02455 
02456 
02457 unsigned int AlmostInverse(word *R, word *T, const word *A, unsigned int NA,
02458                            const word *M, unsigned int N)
02459 {
02460     assert(NA<=N && N && N%2==0);
02461 
02462     word *b = T;
02463     word *c = T+N;
02464     word *f = T+2*N;
02465     word *g = T+3*N;
02466     unsigned int bcLen=2, fgLen=EvenWordCount(M, N);
02467     unsigned int k=0, s=0;
02468 
02469     SetWords(T, 0, 3*N);
02470     b[0]=1;
02471     CopyWords(f, A, NA);
02472     CopyWords(g, M, N);
02473 
02474     while (1)
02475     {
02476         word t=f[0];
02477         while (!t)
02478         {
02479             if (EvenWordCount(f, fgLen)==0)
02480             {
02481                 SetWords(R, 0, N);
02482                 return 0;
02483             }
02484 
02485             ShiftWordsRightByWords(f, fgLen, 1);
02486             if (c[bcLen-1]) bcLen+=2;
02487             assert(bcLen <= N);
02488             ShiftWordsLeftByWords(c, bcLen, 1);
02489             k+=WORD_BITS;
02490             t=f[0];
02491         }
02492 
02493         unsigned int i=0;
02494         while (t%2 == 0)
02495         {
02496             t>>=1;
02497             i++;
02498         }
02499         k+=i;
02500 
02501         if (t==1 && f[1]==0 && EvenWordCount(f, fgLen)==2)
02502         {
02503             if (s%2==0)
02504                 CopyWords(R, b, N);
02505             else
02506                 Subtract(R, M, b, N);
02507             return k;
02508         }
02509 
02510         ShiftWordsRightByBits(f, fgLen, i);
02511         t=ShiftWordsLeftByBits(c, bcLen, i);
02512         if (t)
02513         {
02514             c[bcLen] = t;
02515             bcLen+=2;
02516             assert(bcLen <= N);
02517         }
02518 
02519         if (f[fgLen-2]==0 && g[fgLen-2]==0 && f[fgLen-1]==0 && g[fgLen-1]==0)
02520             fgLen-=2;
02521 
02522         if (Compare(f, g, fgLen)==-1)
02523         {
02524             mySTL::swap(f, g);
02525             mySTL::swap(b, c);
02526             s++;
02527         }
02528 
02529         Subtract(f, f, g, fgLen);
02530 
02531         if (Add(b, b, c, bcLen))
02532         {
02533             b[bcLen] = 1;
02534             bcLen+=2;
02535             assert(bcLen <= N);
02536         }
02537     }
02538 }
02539 
02540 // R[N] - result = A/(2^k) mod M
02541 // A[N] - input
02542 // M[N] - modulus
02543 
02544 void DivideByPower2Mod(word *R, const word *A, unsigned int k, const word *M,
02545                        unsigned int N)
02546 {
02547     CopyWords(R, A, N);
02548 
02549     while (k--)
02550     {
02551         if (R[0]%2==0)
02552             ShiftWordsRightByBits(R, N, 1);
02553         else
02554         {
02555             word carry = Add(R, R, M, N);
02556             ShiftWordsRightByBits(R, N, 1);
02557             R[N-1] += carry<<(WORD_BITS-1);
02558         }
02559     }
02560 }
02561 
02562 // R[N] - result = A*(2^k) mod M
02563 // A[N] - input
02564 // M[N] - modulus
02565 
02566 void MultiplyByPower2Mod(word *R, const word *A, unsigned int k, const word *M,
02567                          unsigned int N)
02568 {
02569     CopyWords(R, A, N);
02570 
02571     while (k--)
02572         if (ShiftWordsLeftByBits(R, N, 1) || Compare(R, M, N)>=0)
02573             Subtract(R, R, M, N);
02574 }
02575 
02576 
02577 // ********** end of integer needs
02578 
02579 
02580 Integer::Integer()
02581     : reg_(2), sign_(POSITIVE)
02582 {
02583     reg_[0] = reg_[1] = 0;
02584 }
02585 
02586 
02587 Integer::Integer(const Integer& t)
02588     : reg_(RoundupSize(t.WordCount())), sign_(t.sign_)
02589 {
02590     CopyWords(reg_.get_buffer(), t.reg_.get_buffer(), reg_.size());
02591 }
02592 
02593 
02594 Integer::Integer(signed long value)
02595     : reg_(2)
02596 {
02597     if (value >= 0)
02598         sign_ = POSITIVE;
02599     else
02600     {
02601         sign_ = NEGATIVE;
02602         value = -value;
02603     }
02604     reg_[0] = word(value);
02605     reg_[1] = word(SafeRightShift<WORD_BITS, unsigned long>(value));
02606 }
02607 
02608 
02609 Integer::Integer(Sign s, word high, word low)
02610     : reg_(2), sign_(s)
02611 {
02612     reg_[0] = low;
02613     reg_[1] = high;
02614 }
02615 
02616 
02617 Integer::Integer(word value, unsigned int length)
02618     : reg_(RoundupSize(length)), sign_(POSITIVE)
02619 {
02620     reg_[0] = value;
02621     SetWords(reg_ + 1, 0, reg_.size() - 1);
02622 }
02623 
02624 
02625 Integer::Integer(const byte *encodedInteger, unsigned int byteCount,
02626                  Signedness s)
02627 {
02628     Decode(encodedInteger, byteCount, s);
02629 }
02630 
02631 class BadBER {};
02632 
02633 // BER Decode Source
02634 Integer::Integer(Source& source)
02635     : reg_(2), sign_(POSITIVE)
02636 {
02637     Decode(source);
02638 }
02639 
02640 void Integer::Decode(Source& source)
02641 {
02642     byte b = source.next();
02643     if (b != INTEGER) {
02644         source.SetError(INTEGER_E);
02645         return;
02646     }
02647 
02648     word32 length = GetLength(source);
02649 
02650     if ( (b = source.next()) == 0x00)
02651         length--;
02652     else
02653         source.prev();
02654  
02655     unsigned int words = (length + WORD_SIZE - 1) / WORD_SIZE;
02656     words = RoundupSize(words);
02657     if (words > reg_.size()) reg_.CleanNew(words);
02658 
02659     for (int j = length; j > 0; j--) {
02660         b = source.next();
02661         reg_ [(j-1) / WORD_SIZE] |= (word)b << ((j-1) % WORD_SIZE) * 8;
02662     }
02663 }
02664 
02665 
02666 void Integer::Decode(const byte* input, unsigned int inputLen, Signedness s)
02667 {
02668     unsigned int idx(0);
02669     byte b = input[idx++];
02670     sign_  = ((s==SIGNED) && (b & 0x80)) ? NEGATIVE : POSITIVE;
02671 
02672     while (inputLen>0 && (sign_==POSITIVE ? b==0 : b==0xff))
02673     {
02674         inputLen--;
02675         b = input[idx++];
02676     }
02677 
02678     reg_.CleanNew(RoundupSize(BytesToWords(inputLen)));
02679 
02680     --idx;
02681     for (unsigned int i=inputLen; i > 0; i--)
02682     {
02683         b = input[idx++];
02684         reg_[(i-1)/WORD_SIZE] |= (word)b << ((i-1)%WORD_SIZE)*8;
02685     }
02686 
02687     if (sign_ == NEGATIVE)
02688     {
02689         for (unsigned i=inputLen; i<reg_.size()*WORD_SIZE; i++)
02690             reg_[i/WORD_SIZE] |= (word)0xff << (i%WORD_SIZE)*8;
02691         TwosComplement(reg_.get_buffer(), reg_.size());
02692     }
02693 }
02694 
02695 
02696 unsigned int Integer::Encode(byte* output, unsigned int outputLen,
02697                        Signedness signedness) const
02698 {
02699     unsigned int idx(0);
02700     if (signedness == UNSIGNED || NotNegative())
02701     {
02702         for (unsigned int i=outputLen; i > 0; i--)
02703             output[idx++] = GetByte(i-1);
02704     }
02705     else
02706     {
02707         // take two's complement of *this
02708         Integer temp = Integer::Power2(8*max(ByteCount(), outputLen)) + *this;
02709         for (unsigned i=0; i<outputLen; i++)
02710             output[idx++] = temp.GetByte(outputLen-i-1);
02711     }
02712     return outputLen;
02713 }
02714 
02715 
02716 static Integer* zero = 0;
02717 
02718 const Integer &Integer::Zero()
02719 {
02720     if (!zero)
02721         zero = NEW_TC Integer;
02722     return *zero;
02723 }
02724 
02725 
02726 static Integer* one = 0;
02727 
02728 const Integer &Integer::One()
02729 {
02730     if (!one)
02731         one = NEW_TC Integer(1,2);
02732     return *one;
02733 }
02734 
02735 
02736 // Clean up static singleton holders, not a leak, but helpful to have gone
02737 // when checking for leaks
02738 void CleanUp()
02739 {
02740     tcDelete(one);
02741     tcDelete(zero);
02742 
02743     // In case user calls more than once, prevent seg fault
02744     one  = 0;
02745     zero = 0;
02746 }
02747 
02748 Integer::Integer(RandomNumberGenerator& rng, const Integer& min,
02749                  const Integer& max)
02750 {
02751     Randomize(rng, min, max);
02752 }
02753 
02754 
02755 void Integer::Randomize(RandomNumberGenerator& rng, unsigned int nbits)
02756 {
02757     const unsigned int nbytes = nbits/8 + 1;
02758     ByteBlock buf(nbytes);
02759     rng.GenerateBlock(buf.get_buffer(), nbytes);
02760     if (nbytes)
02761         buf[0] = (byte)Crop(buf[0], nbits % 8);
02762     Decode(buf.get_buffer(), nbytes, UNSIGNED);
02763 }
02764 
02765 void Integer::Randomize(RandomNumberGenerator& rng, const Integer& min,
02766                         const Integer& max)
02767 {
02768     assert(min <= max);
02769 
02770     Integer range = max - min;
02771     const unsigned int nbits = range.BitCount();
02772 
02773     do
02774     {
02775         Randomize(rng, nbits);
02776     }
02777     while (*this > range);
02778 
02779     *this += min;
02780 }
02781 
02782 
02783 Integer Integer::Power2(unsigned int e)
02784 {
02785     Integer r((word)0, BitsToWords(e + 1));
02786     r.SetBit(e);
02787     return r;
02788 }
02789 
02790 
02791 void Integer::SetBit(unsigned int n, bool value)
02792 {
02793     if (value)
02794     {
02795         reg_.CleanGrow(RoundupSize(BitsToWords(n + 1)));
02796         reg_[n / WORD_BITS] |= (word(1) << (n % WORD_BITS));
02797     }
02798     else
02799     {
02800         if (n / WORD_BITS < reg_.size())
02801             reg_[n / WORD_BITS] &= ~(word(1) << (n % WORD_BITS));
02802     }
02803 }
02804 
02805 
02806 void Integer::SetByte(unsigned int n, byte value)
02807 {
02808     reg_.CleanGrow(RoundupSize(BytesToWords(n+1)));
02809     reg_[n/WORD_SIZE] &= ~(word(0xff) << 8*(n%WORD_SIZE));
02810     reg_[n/WORD_SIZE] |= (word(value) << 8*(n%WORD_SIZE));
02811 }
02812 
02813 
02814 void Integer::Negate()
02815 {
02816     if (!!(*this))      // don't flip sign if *this==0
02817         sign_ = Sign(1 - sign_);
02818 }
02819 
02820 
02821 bool Integer::operator!() const
02822 {
02823     return IsNegative() ? false : (reg_[0]==0 && WordCount()==0);
02824 }
02825 
02826 
02827 Integer& Integer::operator=(const Integer& t)
02828 {
02829     if (this != &t)
02830     {
02831         reg_.New(RoundupSize(t.WordCount()));
02832         CopyWords(reg_.get_buffer(), t.reg_.get_buffer(), reg_.size());
02833         sign_ = t.sign_;
02834     }
02835     return *this;
02836 }
02837 
02838 
02839 Integer& Integer::operator+=(const Integer& t)
02840 {
02841     reg_.CleanGrow(t.reg_.size());
02842     if (NotNegative())
02843     {
02844         if (t.NotNegative())
02845             PositiveAdd(*this, *this, t);
02846         else
02847             PositiveSubtract(*this, *this, t);
02848     }
02849     else
02850     {
02851         if (t.NotNegative())
02852             PositiveSubtract(*this, t, *this);
02853         else
02854         {
02855             PositiveAdd(*this, *this, t);
02856             sign_ = Integer::NEGATIVE;
02857         }
02858     }
02859     return *this;
02860 }
02861 
02862 
02863 Integer Integer::operator-() const
02864 {
02865     Integer result(*this);
02866     result.Negate();
02867     return result;
02868 }
02869 
02870 
02871 Integer& Integer::operator-=(const Integer& t)
02872 {
02873     reg_.CleanGrow(t.reg_.size());
02874     if (NotNegative())
02875     {
02876         if (t.NotNegative())
02877             PositiveSubtract(*this, *this, t);
02878         else
02879             PositiveAdd(*this, *this, t);
02880     }
02881     else
02882     {
02883         if (t.NotNegative())
02884         {
02885             PositiveAdd(*this, *this, t);
02886             sign_ = Integer::NEGATIVE;
02887         }
02888         else
02889             PositiveSubtract(*this, t, *this);
02890     }
02891     return *this;
02892 }
02893 
02894 
02895 Integer& Integer::operator++()
02896 {
02897     if (NotNegative())
02898     {
02899         if (Increment(reg_.get_buffer(), reg_.size()))
02900         {
02901             reg_.CleanGrow(2*reg_.size());
02902             reg_[reg_.size()/2]=1;
02903         }
02904     }
02905     else
02906     {
02907         word borrow = Decrement(reg_.get_buffer(), reg_.size());
02908         assert(!borrow);
02909         if (WordCount()==0)
02910             *this = Zero();
02911     }
02912     return *this;
02913 }
02914 
02915 Integer& Integer::operator--()
02916 {
02917     if (IsNegative())
02918     {
02919         if (Increment(reg_.get_buffer(), reg_.size()))
02920         {
02921             reg_.CleanGrow(2*reg_.size());
02922             reg_[reg_.size()/2]=1;
02923         }
02924     }
02925     else
02926     {
02927         if (Decrement(reg_.get_buffer(), reg_.size()))
02928             *this = -One();
02929     }
02930     return *this;
02931 }
02932 
02933 
02934 Integer& Integer::operator<<=(unsigned int n)
02935 {
02936     const unsigned int wordCount = WordCount();
02937     const unsigned int shiftWords = n / WORD_BITS;
02938     const unsigned int shiftBits = n % WORD_BITS;
02939 
02940     reg_.CleanGrow(RoundupSize(wordCount+BitsToWords(n)));
02941     ShiftWordsLeftByWords(reg_.get_buffer(), wordCount + shiftWords,
02942                           shiftWords);
02943     ShiftWordsLeftByBits(reg_+shiftWords, wordCount+BitsToWords(shiftBits),
02944                          shiftBits);
02945     return *this;
02946 }
02947 
02948 Integer& Integer::operator>>=(unsigned int n)
02949 {
02950     const unsigned int wordCount = WordCount();
02951     const unsigned int shiftWords = n / WORD_BITS;
02952     const unsigned int shiftBits = n % WORD_BITS;
02953 
02954     ShiftWordsRightByWords(reg_.get_buffer(), wordCount, shiftWords);
02955     if (wordCount > shiftWords)
02956         ShiftWordsRightByBits(reg_.get_buffer(), wordCount-shiftWords,
02957                               shiftBits);
02958     if (IsNegative() && WordCount()==0)   // avoid -0
02959         *this = Zero();
02960     return *this;
02961 }
02962 
02963 
02964 void PositiveAdd(Integer& sum, const Integer& a, const Integer& b)
02965 {
02966     word carry;
02967     if (a.reg_.size() == b.reg_.size())
02968         carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
02969                     b.reg_.get_buffer(), a.reg_.size());
02970     else if (a.reg_.size() > b.reg_.size())
02971     {
02972         carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
02973                     b.reg_.get_buffer(), b.reg_.size());
02974         CopyWords(sum.reg_+b.reg_.size(), a.reg_+b.reg_.size(),
02975                   a.reg_.size()-b.reg_.size());
02976         carry = Increment(sum.reg_+b.reg_.size(), a.reg_.size()-b.reg_.size(),
02977                           carry);
02978     }
02979     else
02980     {
02981         carry = Add(sum.reg_.get_buffer(), a.reg_.get_buffer(),
02982                     b.reg_.get_buffer(), a.reg_.size());
02983         CopyWords(sum.reg_+a.reg_.size(), b.reg_+a.reg_.size(),
02984                   b.reg_.size()-a.reg_.size());
02985         carry = Increment(sum.reg_+a.reg_.size(), b.reg_.size()-a.reg_.size(),
02986                           carry);
02987     }
02988 
02989     if (carry)
02990     {
02991         sum.reg_.CleanGrow(2*sum.reg_.size());
02992         sum.reg_[sum.reg_.size()/2] = 1;
02993     }
02994     sum.sign_ = Integer::POSITIVE;
02995 }
02996 
02997 void PositiveSubtract(Integer &diff, const Integer &a, const Integer& b)
02998 {
02999     unsigned aSize = a.WordCount();
03000     aSize += aSize%2;
03001     unsigned bSize = b.WordCount();
03002     bSize += bSize%2;
03003 
03004     if (aSize == bSize)
03005     {
03006         if (Compare(a.reg_.get_buffer(), b.reg_.get_buffer(), aSize) >= 0)
03007         {
03008             Subtract(diff.reg_.get_buffer(), a.reg_.get_buffer(),
03009                      b.reg_.get_buffer(), aSize);
03010             diff.sign_ = Integer::POSITIVE;
03011         }
03012         else
03013         {
03014             Subtract(diff.reg_.get_buffer(), b.reg_.get_buffer(),
03015                      a.reg_.get_buffer(), aSize);
03016             diff.sign_ = Integer::NEGATIVE;
03017         }
03018     }
03019     else if (aSize > bSize)
03020     {
03021         word borrow = Subtract(diff.reg_.get_buffer(), a.reg_.get_buffer(),
03022                                b.reg_.get_buffer(), bSize);
03023         CopyWords(diff.reg_+bSize, a.reg_+bSize, aSize-bSize);
03024         borrow = Decrement(diff.reg_+bSize, aSize-bSize, borrow);
03025         assert(!borrow);
03026         diff.sign_ = Integer::POSITIVE;
03027     }
03028     else
03029     {
03030         word borrow = Subtract(diff.reg_.get_buffer(), b.reg_.get_buffer(),
03031                                a.reg_.get_buffer(), aSize);
03032         CopyWords(diff.reg_+aSize, b.reg_+aSize, bSize-aSize);
03033         borrow = Decrement(diff.reg_+aSize, bSize-aSize, borrow);
03034         assert(!borrow);
03035         diff.sign_ = Integer::NEGATIVE;
03036     }
03037 }
03038 
03039 
03040 unsigned int Integer::MinEncodedSize(Signedness signedness) const
03041 {
03042     unsigned int outputLen = max(1U, ByteCount());
03043     if (signedness == UNSIGNED)
03044         return outputLen;
03045     if (NotNegative() && (GetByte(outputLen-1) & 0x80))
03046         outputLen++;
03047     if (IsNegative() && *this < -Power2(outputLen*8-1))
03048         outputLen++;
03049     return outputLen;
03050 }
03051 
03052 
03053 int Integer::Compare(const Integer& t) const
03054 {
03055     if (NotNegative())
03056     {
03057         if (t.NotNegative())
03058             return PositiveCompare(t);
03059         else
03060             return 1;
03061     }
03062     else
03063     {
03064         if (t.NotNegative())
03065             return -1;
03066         else
03067             return -PositiveCompare(t);
03068     }
03069 }
03070 
03071 
03072 int Integer::PositiveCompare(const Integer& t) const
03073 {
03074     unsigned size = WordCount(), tSize = t.WordCount();
03075 
03076     if (size == tSize)
03077         return TaoCrypt::Compare(reg_.get_buffer(), t.reg_.get_buffer(), size);
03078     else
03079         return size > tSize ? 1 : -1;
03080 }
03081 
03082 
03083 bool Integer::GetBit(unsigned int n) const
03084 {
03085     if (n/WORD_BITS >= reg_.size())
03086         return 0;
03087     else
03088         return bool((reg_[n/WORD_BITS] >> (n % WORD_BITS)) & 1);
03089 }
03090 
03091 
03092 unsigned long Integer::GetBits(unsigned int i, unsigned int n) const
03093 {
03094     assert(n <= sizeof(unsigned long)*8);
03095     unsigned long v = 0;
03096     for (unsigned int j=0; j<n; j++)
03097         v |= GetBit(i+j) << j;
03098     return v;
03099 }
03100 
03101 
03102 byte Integer::GetByte(unsigned int n) const
03103 {
03104     if (n/WORD_SIZE >= reg_.size())
03105         return 0;
03106     else
03107         return byte(reg_[n/WORD_SIZE] >> ((n%WORD_SIZE)*8));
03108 }
03109 
03110 
03111 unsigned int Integer::BitCount() const
03112 {
03113     unsigned wordCount = WordCount();
03114     if (wordCount)
03115         return (wordCount-1)*WORD_BITS + BitPrecision(reg_[wordCount-1]);
03116     else
03117         return 0;
03118 }
03119 
03120 
03121 unsigned int Integer::ByteCount() const
03122 {
03123     unsigned wordCount = WordCount();
03124     if (wordCount)
03125         return (wordCount-1)*WORD_SIZE + BytePrecision(reg_[wordCount-1]);
03126     else
03127         return 0;
03128 }
03129 
03130 
03131 unsigned int Integer::WordCount() const
03132 {
03133     return CountWords(reg_.get_buffer(), reg_.size());
03134 }
03135 
03136 
03137 bool Integer::IsConvertableToLong() const
03138 {
03139     if (ByteCount() > sizeof(long))
03140         return false;
03141 
03142     unsigned long value = reg_[0];
03143     value += SafeLeftShift<WORD_BITS, unsigned long>(reg_[1]);
03144 
03145     if (sign_ == POSITIVE)
03146         return (signed long)value >= 0;
03147     else
03148         return -(signed long)value < 0;
03149 }
03150 
03151 
03152 signed long Integer::ConvertToLong() const
03153 {
03154     assert(IsConvertableToLong());
03155 
03156     unsigned long value = reg_[0];
03157     value += SafeLeftShift<WORD_BITS, unsigned long>(reg_[1]);
03158     return sign_ == POSITIVE ? value : -(signed long)value;
03159 }
03160 
03161 
03162 void Integer::Swap(Integer& a)
03163 {
03164     reg_.Swap(a.reg_);
03165     mySTL::swap(sign_, a.sign_);
03166 }
03167 
03168 
03169 Integer Integer::Plus(const Integer& b) const
03170 {
03171     Integer sum((word)0, max(reg_.size(), b.reg_.size()));
03172     if (NotNegative())
03173     {
03174         if (b.NotNegative())
03175             PositiveAdd(sum, *this, b);
03176         else
03177             PositiveSubtract(sum, *this, b);
03178     }
03179     else
03180     {
03181         if (b.NotNegative())
03182             PositiveSubtract(sum, b, *this);
03183         else
03184         {
03185             PositiveAdd(sum, *this, b);
03186             sum.sign_ = Integer::NEGATIVE;
03187         }
03188     }
03189     return sum;
03190 }
03191 
03192 
03193 Integer Integer::Minus(const Integer& b) const
03194 {
03195     Integer diff((word)0, max(reg_.size(), b.reg_.size()));
03196     if (NotNegative())
03197     {
03198         if (b.NotNegative())
03199             PositiveSubtract(diff, *this, b);
03200         else
03201             PositiveAdd(diff, *this, b);
03202     }
03203     else
03204     {
03205         if (b.NotNegative())
03206         {
03207             PositiveAdd(diff, *this, b);
03208             diff.sign_ = Integer::NEGATIVE;
03209         }
03210         else
03211             PositiveSubtract(diff, b, *this);
03212     }
03213     return diff;
03214 }
03215 
03216 
03217 Integer Integer::Times(const Integer &b) const
03218 {
03219     Integer product;
03220     Multiply(product, *this, b);
03221     return product;
03222 }
03223 
03224 
03225 #undef A0
03226 #undef A1
03227 #undef B0
03228 #undef B1
03229 
03230 #undef T0
03231 #undef T1
03232 #undef T2
03233 #undef T3
03234 
03235 #undef R0
03236 #undef R1
03237 #undef R2
03238 #undef R3
03239 
03240 
03241 static inline void AtomicDivide(word *Q, const word *A, const word *B)
03242 {
03243     word T[4];
03244     DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]),
03245                                          DWord(A[2], A[3]), DWord(B[0], B[1]));
03246     Q[0] = q.GetLowHalf();
03247     Q[1] = q.GetHighHalf();
03248 
03249 #ifndef NDEBUG
03250     if (B[0] || B[1])
03251     {
03252         // multiply quotient and divisor and add remainder, make sure it 
03253         // equals dividend
03254         assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
03255         word P[4];
03256         Portable::Multiply2(P, Q, B);
03257         Add(P, P, T, 4);
03258         assert(memcmp(P, A, 4*WORD_SIZE)==0);
03259     }
03260 #endif
03261 }
03262 
03263 
03264 // for use by Divide(), corrects the underestimated quotient {Q1,Q0}
03265 static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B,
03266                                     unsigned int N)
03267 {
03268     assert(N && N%2==0);
03269 
03270     if (Q[1])
03271     {
03272         T[N] = T[N+1] = 0;
03273         unsigned i;
03274         for (i=0; i<N; i+=4)
03275             LowLevel::Multiply2(T+i, Q, B+i);
03276         for (i=2; i<N; i+=4)
03277             if (LowLevel::Multiply2Add(T+i, Q, B+i))
03278                 T[i+5] += (++T[i+4]==0);
03279     }
03280     else
03281     {
03282         T[N] = LinearMultiply(T, B, Q[0], N);
03283         T[N+1] = 0;
03284     }
03285 
03286     word borrow = Subtract(R, R, T, N+2);
03287     assert(!borrow && !R[N+1]);
03288 
03289     while (R[N] || Compare(R, B, N) >= 0)
03290     {
03291         R[N] -= Subtract(R, R, B, N);
03292         Q[1] += (++Q[0]==0);
03293         assert(Q[0] || Q[1]); // no overflow
03294     }
03295 }
03296 
03297 // R[NB] -------- remainder = A%B
03298 // Q[NA-NB+2] --- quotient      = A/B
03299 // T[NA+2*NB+4] - temp work space
03300 // A[NA] -------- dividend
03301 // B[NB] -------- divisor
03302 
03303 
03304 void Divide(word* R, word* Q, word* T, const word* A, unsigned int NA,
03305             const word* B, unsigned int NB)
03306 {
03307     assert(NA && NB && NA%2==0 && NB%2==0);
03308     assert(B[NB-1] || B[NB-2]);
03309     assert(NB <= NA);
03310 
03311     // set up temporary work space
03312     word *const TA=T;
03313     word *const TB=T+NA+2;
03314     word *const TP=T+NA+2+NB;
03315 
03316     // copy B into TB and normalize it so that TB has highest bit set to 1
03317     unsigned shiftWords = (B[NB-1]==0);
03318     TB[0] = TB[NB-1] = 0;
03319     CopyWords(TB+shiftWords, B, NB-shiftWords);
03320     unsigned shiftBits = WORD_BITS - BitPrecision(TB[NB-1]);
03321     assert(shiftBits < WORD_BITS);
03322     ShiftWordsLeftByBits(TB, NB, shiftBits);
03323 
03324     // copy A into TA and normalize it
03325     TA[0] = TA[NA] = TA[NA+1] = 0;
03326     CopyWords(TA+shiftWords, A, NA);
03327     ShiftWordsLeftByBits(TA, NA+2, shiftBits);
03328 
03329     if (TA[NA+1]==0 && TA[NA] <= 1)
03330     {
03331         Q[NA-NB+1] = Q[NA-NB] = 0;
03332         while (TA[NA] || Compare(TA+NA-NB, TB, NB) >= 0)
03333         {
03334             TA[NA] -= Subtract(TA+NA-NB, TA+NA-NB, TB, NB);
03335             ++Q[NA-NB];
03336         }
03337     }
03338     else
03339     {
03340         NA+=2;
03341         assert(Compare(TA+NA-NB, TB, NB) < 0);
03342     }
03343 
03344     word BT[2];
03345     BT[0] = TB[NB-2] + 1;
03346     BT[1] = TB[NB-1] + (BT[0]==0);
03347 
03348     // start reducing TA mod TB, 2 words at a time
03349     for (unsigned i=NA-2; i>=NB; i-=2)
03350     {
03351         AtomicDivide(Q+i-NB, TA+i-2, BT);
03352         CorrectQuotientEstimate(TA+i-NB, TP, Q+i-NB, TB, NB);
03353     }
03354 
03355     // copy TA into R, and denormalize it
03356     CopyWords(R, TA+shiftWords, NB);
03357     ShiftWordsRightByBits(R, NB, shiftBits);
03358 }
03359 
03360 
03361 void PositiveDivide(Integer& remainder, Integer& quotient,
03362                    const Integer& a, const Integer& b)
03363 {
03364     unsigned aSize = a.WordCount();
03365     unsigned bSize = b.WordCount();
03366 
03367     assert(bSize);
03368 
03369     if (a.PositiveCompare(b) == -1)
03370     {
03371         remainder = a;
03372         remainder.sign_ = Integer::POSITIVE;
03373         quotient = Integer::Zero();
03374         return;
03375     }
03376 
03377     aSize += aSize%2;   // round up to next even number
03378     bSize += bSize%2;
03379 
03380     remainder.reg_.CleanNew(RoundupSize(bSize));
03381     remainder.sign_ = Integer::POSITIVE;
03382     quotient.reg_.CleanNew(RoundupSize(aSize-bSize+2));
03383     quotient.sign_ = Integer::POSITIVE;
03384 
03385     AlignedWordBlock T(aSize+2*bSize+4);
03386     Divide(remainder.reg_.get_buffer(), quotient.reg_.get_buffer(),
03387            T.get_buffer(), a.reg_.get_buffer(), aSize, b.reg_.get_buffer(),
03388            bSize);
03389 }
03390 
03391 void Integer::Divide(Integer &remainder, Integer &quotient,
03392                      const Integer &dividend, const Integer &divisor)
03393 {
03394     PositiveDivide(remainder, quotient, dividend, divisor);
03395 
03396     if (dividend.IsNegative())
03397     {
03398         quotient.Negate();
03399         if (remainder.NotZero())
03400         {
03401             --quotient;
03402             remainder = divisor.AbsoluteValue() - remainder;
03403         }
03404     }
03405 
03406     if (divisor.IsNegative())
03407         quotient.Negate();
03408 }
03409 
03410 void Integer::DivideByPowerOf2(Integer &r, Integer &q, const Integer &a,
03411                                unsigned int n)
03412 {
03413     q = a;
03414     q >>= n;
03415 
03416     const unsigned int wordCount = BitsToWords(n);
03417     if (wordCount <= a.WordCount())
03418     {
03419         r.reg_.resize(RoundupSize(wordCount));
03420         CopyWords(r.reg_.get_buffer(), a.reg_.get_buffer(), wordCount);
03421         SetWords(r.reg_+wordCount, 0, r.reg_.size()-wordCount);
03422         if (n % WORD_BITS != 0)
03423             r.reg_[wordCount-1] %= (1 << (n % WORD_BITS));
03424     }
03425     else
03426     {
03427         r.reg_.resize(RoundupSize(a.WordCount()));
03428         CopyWords(r.reg_.get_buffer(), a.reg_.get_buffer(), r.reg_.size());
03429     }
03430     r.sign_ = POSITIVE;
03431 
03432     if (a.IsNegative() && r.NotZero())
03433     {
03434         --q;
03435         r = Power2(n) - r;
03436     }
03437 }
03438 
03439 Integer Integer::DividedBy(const Integer &b) const
03440 {
03441     Integer remainder, quotient;
03442     Integer::Divide(remainder, quotient, *this, b);
03443     return quotient;
03444 }
03445 
03446 Integer Integer::Modulo(const Integer &b) const
03447 {
03448     Integer remainder, quotient;
03449     Integer::Divide(remainder, quotient, *this, b);
03450     return remainder;
03451 }
03452 
03453 void Integer::Divide(word &remainder, Integer &quotient,
03454                      const Integer &dividend, word divisor)
03455 {
03456     assert(divisor);
03457 
03458     if ((divisor & (divisor-1)) == 0)   // divisor is a power of 2
03459     {
03460         quotient = dividend >> (BitPrecision(divisor)-1);
03461         remainder = dividend.reg_[0] & (divisor-1);
03462         return;
03463     }
03464 
03465     unsigned int i = dividend.WordCount();
03466     quotient.reg_.CleanNew(RoundupSize(i));
03467     remainder = 0;
03468     while (i--)
03469     {
03470         quotient.reg_[i] = DWord(dividend.reg_[i], remainder) / divisor;
03471         remainder = DWord(dividend.reg_[i], remainder) % divisor;
03472     }
03473 
03474     if (dividend.NotNegative())
03475         quotient.sign_ = POSITIVE;
03476     else
03477     {
03478         quotient.sign_ = NEGATIVE;
03479         if (remainder)
03480         {
03481             --quotient;
03482             remainder = divisor - remainder;
03483         }
03484     }
03485 }
03486 
03487 Integer Integer::DividedBy(word b) const
03488 {
03489     word remainder;
03490     Integer quotient;
03491     Integer::Divide(remainder, quotient, *this, b);
03492     return quotient;
03493 }
03494 
03495 word Integer::Modulo(word divisor) const
03496 {
03497     assert(divisor);
03498 
03499     word remainder;
03500 
03501     if ((divisor & (divisor-1)) == 0)   // divisor is a power of 2
03502         remainder = reg_[0] & (divisor-1);
03503     else
03504     {
03505         unsigned int i = WordCount();
03506 
03507         if (divisor <= 5)
03508         {
03509             DWord sum(0, 0);
03510             while (i--)
03511                 sum += reg_[i];
03512             remainder = sum % divisor;
03513         }
03514         else
03515         {
03516             remainder = 0;
03517             while (i--)
03518                 remainder = DWord(reg_[i], remainder) % divisor;
03519         }
03520     }
03521 
03522     if (IsNegative() && remainder)
03523         remainder = divisor - remainder;
03524 
03525     return remainder;
03526 }
03527 
03528 
03529 Integer Integer::AbsoluteValue() const
03530 {
03531     Integer result(*this);
03532     result.sign_ = POSITIVE;
03533     return result;
03534 }
03535 
03536 
03537 Integer Integer::SquareRoot() const
03538 {
03539     if (!IsPositive())
03540         return Zero();
03541 
03542     // overestimate square root
03543     Integer x, y = Power2((BitCount()+1)/2);
03544     assert(y*y >= *this);
03545 
03546     do
03547     {
03548         x = y;
03549         y = (x + *this/x) >> 1;
03550     } while (y<x);
03551 
03552     return x;
03553 }
03554 
03555 bool Integer::IsSquare() const
03556 {
03557     Integer r = SquareRoot();
03558     return *this == r.Squared();
03559 }
03560 
03561 bool Integer::IsUnit() const
03562 {
03563     return (WordCount() == 1) && (reg_[0] == 1);
03564 }
03565 
03566 Integer Integer::MultiplicativeInverse() const
03567 {
03568     return IsUnit() ? *this : Zero();
03569 }
03570 
03571 Integer a_times_b_mod_c(const Integer &x, const Integer& y, const Integer& m)
03572 {
03573     return x*y%m;
03574 }
03575 
03576 Integer a_exp_b_mod_c(const Integer &x, const Integer& e, const Integer& m)
03577 {
03578     ModularArithmetic mr(m);
03579     return mr.Exponentiate(x, e);
03580 }
03581 
03582 Integer Integer::Gcd(const Integer &a, const Integer &b)
03583 {
03584     return EuclideanDomainOf().Gcd(a, b);
03585 }
03586 
03587 Integer Integer::InverseMod(const Integer &m) const
03588 {
03589     assert(m.NotNegative());
03590 
03591     if (IsNegative() || *this>=m)
03592         return (*this%m).InverseMod(m);
03593 
03594     if (m.IsEven())
03595     {
03596         if (!m || IsEven())
03597             return Zero();      // no inverse
03598         if (*this == One())
03599             return One();
03600 
03601         Integer u = m.InverseMod(*this);
03602         return !u ? Zero() : (m*(*this-u)+1)/(*this);
03603     }
03604 
03605     AlignedWordBlock T(m.reg_.size() * 4);
03606     Integer r((word)0, m.reg_.size());
03607     unsigned k = AlmostInverse(r.reg_.get_buffer(), T.get_buffer(),
03608                                reg_.get_buffer(), reg_.size(),
03609                                m.reg_.get_buffer(), m.reg_.size());
03610     DivideByPower2Mod(r.reg_.get_buffer(), r.reg_.get_buffer(), k,
03611                       m.reg_.get_buffer(), m.reg_.size());
03612     return r;
03613 }
03614 
03615 word Integer::InverseMod(const word mod) const
03616 {
03617     word g0 = mod, g1 = *this % mod;
03618     word v0 = 0, v1 = 1;
03619     word y;
03620 
03621     while (g1)
03622     {
03623         if (g1 == 1)
03624             return v1;
03625         y = g0 / g1;
03626         g0 = g0 % g1;
03627         v0 += y * v1;
03628 
03629         if (!g0)
03630             break;
03631         if (g0 == 1)
03632             return mod-v0;
03633         y = g1 / g0;
03634         g1 = g1 % g0;
03635         v1 += y * v0;
03636     }
03637     return 0;
03638 }
03639 
03640 // ********* ModArith stuff
03641 
03642 const Integer& ModularArithmetic::Half(const Integer &a) const
03643 {
03644     if (a.reg_.size()==modulus.reg_.size())
03645     {
03646         TaoCrypt::DivideByPower2Mod(result.reg_.begin(), a.reg_.begin(), 1,
03647                                     modulus.reg_.begin(), a.reg_.size());
03648         return result;
03649     }
03650     else
03651         return result1 = (a.IsEven() ? (a >> 1) : ((a+modulus) >> 1));
03652 }
03653 
03654 const Integer& ModularArithmetic::Add(const Integer &a, const Integer &b) const
03655 {
03656     if (a.reg_.size()==modulus.reg_.size() && 
03657         b.reg_.size()==modulus.reg_.size())
03658     {
03659         if (TaoCrypt::Add(result.reg_.begin(), a.reg_.begin(), b.reg_.begin(),
03660                           a.reg_.size())
03661             || Compare(result.reg_.get_buffer(), modulus.reg_.get_buffer(),
03662                        a.reg_.size()) >= 0)
03663         {
03664             TaoCrypt::Subtract(result.reg_.begin(), result.reg_.begin(),
03665                                modulus.reg_.begin(), a.reg_.size());
03666         }
03667         return result;
03668     }
03669     else
03670     {
03671         result1 = a+b;
03672         if (result1 >= modulus)
03673             result1 -= modulus;
03674         return result1;
03675     }
03676 }
03677 
03678 Integer& ModularArithmetic::Accumulate(Integer &a, const Integer &b) const
03679 {
03680     if (a.reg_.size()==modulus.reg_.size() && 
03681         b.reg_.size()==modulus.reg_.size())
03682     {
03683         if (TaoCrypt::Add(a.reg_.get_buffer(), a.reg_.get_buffer(),
03684                           b.reg_.get_buffer(), a.reg_.size())
03685             || Compare(a.reg_.get_buffer(), modulus.reg_.get_buffer(),
03686                        a.reg_.size()) >= 0)
03687         {
03688             TaoCrypt::Subtract(a.reg_.get_buffer(), a.reg_.get_buffer(),
03689                                modulus.reg_.get_buffer(), a.reg_.size());
03690         }
03691     }
03692     else
03693     {
03694         a+=b;
03695         if (a>=modulus)
03696             a-=modulus;
03697     }
03698 
03699     return a;
03700 }
03701 
03702 const Integer& ModularArithmetic::Subtract(const Integer &a,
03703                                            const Integer &b) const
03704 {
03705     if (a.reg_.size()==modulus.reg_.size() && 
03706         b.reg_.size()==modulus.reg_.size())
03707     {
03708         if (TaoCrypt::Subtract(result.reg_.begin(), a.reg_.begin(),
03709                                b.reg_.begin(), a.reg_.size()))
03710             TaoCrypt::Add(result.reg_.begin(), result.reg_.begin(),
03711                           modulus.reg_.begin(), a.reg_.size());
03712         return result;
03713     }
03714     else
03715     {
03716         result1 = a-b;
03717         if (result1.IsNegative())
03718             result1 += modulus;
03719         return result1;
03720     }
03721 }
03722 
03723 Integer& ModularArithmetic::Reduce(Integer &a, const Integer &b) const
03724 {
03725     if (a.reg_.size()==modulus.reg_.size() && 
03726         b.reg_.size()==modulus.reg_.size())
03727     {
03728         if (TaoCrypt::Subtract(a.reg_.get_buffer(), a.reg_.get_buffer(),
03729                                b.reg_.get_buffer(), a.reg_.size()))
03730             TaoCrypt::Add(a.reg_.get_buffer(), a.reg_.get_buffer(),
03731                           modulus.reg_.get_buffer(), a.reg_.size());
03732     }
03733     else
03734     {
03735         a-=b;
03736         if (a.IsNegative())
03737             a+=modulus;
03738     }
03739 
03740     return a;
03741 }
03742 
03743 const Integer& ModularArithmetic::Inverse(const Integer &a) const
03744 {
03745     if (!a)
03746         return a;
03747 
03748     CopyWords(result.reg_.begin(), modulus.reg_.begin(), modulus.reg_.size());
03749     if (TaoCrypt::Subtract(result.reg_.begin(), result.reg_.begin(),
03750                            a.reg_.begin(), a.reg_.size()))
03751         Decrement(result.reg_.begin()+a.reg_.size(), 1,
03752                   modulus.reg_.size()-a.reg_.size());
03753 
03754     return result;
03755 }
03756 
03757 Integer ModularArithmetic::CascadeExponentiate(const Integer &x,
03758                   const Integer &e1, const Integer &y, const Integer &e2) const
03759 {
03760     if (modulus.IsOdd())
03761     {
03762         MontgomeryRepresentation dr(modulus);
03763         return dr.ConvertOut(dr.CascadeExponentiate(dr.ConvertIn(x), e1,
03764                                                     dr.ConvertIn(y), e2));
03765     }
03766     else
03767         return AbstractRing::CascadeExponentiate(x, e1, y, e2);
03768 }
03769 
03770 void ModularArithmetic::SimultaneousExponentiate(Integer *results,
03771         const Integer &base, const Integer *exponents,
03772         unsigned int exponentsCount) const
03773 {
03774     if (modulus.IsOdd())
03775     {
03776         MontgomeryRepresentation dr(modulus);
03777         dr.SimultaneousExponentiate(results, dr.ConvertIn(base), exponents,
03778                                     exponentsCount);
03779         for (unsigned int i=0; i<exponentsCount; i++)
03780             results[i] = dr.ConvertOut(results[i]);
03781     }
03782     else
03783         AbstractRing::SimultaneousExponentiate(results, base,
03784                                                     exponents, exponentsCount);
03785 }
03786 
03787 
03788 // ********************************************************
03789 
03790 #define A0      A
03791 #define A1      (A+N2)
03792 #define B0      B
03793 #define B1      (B+N2)
03794 
03795 #define T0      T
03796 #define T1      (T+N2)
03797 #define T2      (T+N)
03798 #define T3      (T+N+N2)
03799 
03800 #define R0      R
03801 #define R1      (R+N2)
03802 #define R2      (R+N)
03803 #define R3      (R+N+N2)
03804 
03805 
03806 inline void MultiplyBottom(word *R, word *T, const word *A, const word *B,
03807                            unsigned int N)
03808 {
03809     RecursiveMultiplyBottom(R, T, A, B, N);
03810 }
03811 
03812 inline void MultiplyTop(word *R, word *T, const word *L, const word *A,
03813                         const word *B, unsigned int N)
03814 {
03815     RecursiveMultiplyTop(R, T, L, A, B, N);
03816 }
03817 
03818 
03819 // R[N] --- result = X/(2**(WORD_BITS*N)) mod M
03820 // T[3*N] - temporary work space
03821 // X[2*N] - number to be reduced
03822 // M[N] --- modulus
03823 // U[N] --- multiplicative inverse of M mod 2**(WORD_BITS*N)
03824 
03825 void MontgomeryReduce(word *R, word *T, const word *X, const word *M,
03826                       const word *U, unsigned int N)
03827 {
03828     MultiplyBottom(R, T, X, U, N);
03829     MultiplyTop(T, T+N, X, R, M, N);
03830     word borrow = Subtract(T, X+N, T, N);
03831     // defend against timing attack by doing this Add even when not needed
03832     word carry = Add(T+N, T, M, N);
03833     assert(carry || !borrow);
03834     CopyWords(R, T + (borrow ? N : 0), N);
03835 }
03836 
03837 // R[N] ----- result = A inverse mod 2**(WORD_BITS*N)
03838 // T[3*N/2] - temporary work space
03839 // A[N] ----- an odd number as input
03840 
03841 void RecursiveInverseModPower2(word *R, word *T, const word *A, unsigned int N)
03842 {
03843     if (N==2)
03844     {
03845         T[0] = AtomicInverseModPower2(A[0]);
03846         T[1] = 0;
03847         LowLevel::Multiply2Bottom(T+2, T, A);
03848         TwosComplement(T+2, 2);
03849         Increment(T+2, 2, 2);
03850         LowLevel::Multiply2Bottom(R, T, T+2);
03851     }
03852     else
03853     {
03854         const unsigned int N2 = N/2;
03855         RecursiveInverseModPower2(R0, T0, A0, N2);
03856         T0[0] = 1;
03857         SetWords(T0+1, 0, N2-1);
03858         MultiplyTop(R1, T1, T0, R0, A0, N2);
03859         MultiplyBottom(T0, T1, R0, A1, N2);
03860         Add(T0, R1, T0, N2);
03861         TwosComplement(T0, N2);
03862         MultiplyBottom(R1, T1, R0, T0, N2);
03863     }
03864 }
03865 
03866 
03867 #undef A0
03868 #undef A1
03869 #undef B0
03870 #undef B1
03871 
03872 #undef T0
03873 #undef T1
03874 #undef T2
03875 #undef T3
03876 
03877 #undef R0
03878 #undef R1
03879 #undef R2
03880 #undef R3
03881 
03882 
03883 // modulus must be odd
03884 MontgomeryRepresentation::MontgomeryRepresentation(const Integer &m)
03885     : ModularArithmetic(m),
03886       u((word)0, modulus.reg_.size()),
03887       workspace(5*modulus.reg_.size())
03888 {
03889     assert(modulus.IsOdd());
03890     RecursiveInverseModPower2(u.reg_.get_buffer(), workspace.get_buffer(),
03891                               modulus.reg_.get_buffer(), modulus.reg_.size());
03892 }
03893 
03894 const Integer& MontgomeryRepresentation::Multiply(const Integer &a,
03895                                                   const Integer &b) const
03896 {
03897     word *const T = workspace.begin();
03898     word *const R = result.reg_.begin();
03899     const unsigned int N = modulus.reg_.size();
03900     assert(a.reg_.size()<=N && b.reg_.size()<=N);
03901 
03902     AsymmetricMultiply(T, T+2*N, a.reg_.get_buffer(), a.reg_.size(),
03903                        b.reg_.get_buffer(), b.reg_.size());
03904     SetWords(T+a.reg_.size()+b.reg_.size(),0, 2*N-a.reg_.size()-b.reg_.size());
03905     MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
03906                      u.reg_.get_buffer(), N);
03907     return result;
03908 }
03909 
03910 const Integer& MontgomeryRepresentation::Square(const Integer &a) const
03911 {
03912     word *const T = workspace.begin();
03913     word *const R = result.reg_.begin();
03914     const unsigned int N = modulus.reg_.size();
03915     assert(a.reg_.size()<=N);
03916 
03917     TaoCrypt::Square(T, T+2*N, a.reg_.get_buffer(), a.reg_.size());
03918     SetWords(T+2*a.reg_.size(), 0, 2*N-2*a.reg_.size());
03919     MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
03920                      u.reg_.get_buffer(), N);
03921     return result;
03922 }
03923 
03924 Integer MontgomeryRepresentation::ConvertOut(const Integer &a) const
03925 {
03926     word *const T = workspace.begin();
03927     word *const R = result.reg_.begin();
03928     const unsigned int N = modulus.reg_.size();
03929     assert(a.reg_.size()<=N);
03930 
03931     CopyWords(T, a.reg_.get_buffer(), a.reg_.size());
03932     SetWords(T+a.reg_.size(), 0, 2*N-a.reg_.size());
03933     MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
03934                      u.reg_.get_buffer(), N);
03935     return result;
03936 }
03937 
03938 const Integer& MontgomeryRepresentation::MultiplicativeInverse(
03939                                                         const Integer &a) const
03940 {
03941 //  return (EuclideanMultiplicativeInverse(a, modulus)<<
03942 //      (2*WORD_BITS*modulus.reg_.size()))%modulus;
03943     word *const T = workspace.begin();
03944     word *const R = result.reg_.begin();
03945     const unsigned int N = modulus.reg_.size();
03946     assert(a.reg_.size()<=N);
03947 
03948     CopyWords(T, a.reg_.get_buffer(), a.reg_.size());
03949     SetWords(T+a.reg_.size(), 0, 2*N-a.reg_.size());
03950     MontgomeryReduce(R, T+2*N, T, modulus.reg_.get_buffer(),
03951                      u.reg_.get_buffer(), N);
03952     unsigned k = AlmostInverse(R, T, R, N, modulus.reg_.get_buffer(), N);
03953 
03954 //  cout << "k=" << k << " N*32=" << 32*N << endl;
03955 
03956     if (k>N*WORD_BITS)
03957         DivideByPower2Mod(R, R, k-N*WORD_BITS, modulus.reg_.get_buffer(), N);
03958     else
03959         MultiplyByPower2Mod(R, R, N*WORD_BITS-k, modulus.reg_.get_buffer(), N);
03960 
03961     return result;
03962 }
03963 
03964 
03965 //  mod Root stuff
03966 Integer ModularRoot(const Integer &a, const Integer &dp, const Integer &dq,
03967                     const Integer &p, const Integer &q, const Integer &u)
03968 {
03969     Integer p2 = ModularExponentiation((a % p), dp, p);
03970     Integer q2 = ModularExponentiation((a % q), dq, q);
03971     return CRT(p2, p, q2, q, u);
03972 }
03973 
03974 Integer CRT(const Integer &xp, const Integer &p, const Integer &xq,
03975             const Integer &q, const Integer &u)
03976 {
03977     // isn't operator overloading great?
03978     return p * (u * (xq-xp) % q) + xp;
03979 }
03980 
03981 
03982 #ifdef HAVE_EXPLICIT_TEMPLATE_INSTANTIATION
03983 #ifndef TAOCRYPT_NATIVE_DWORD_AVAILABLE
03984 template hword DivideThreeWordsByTwo<hword, Word>(hword*, hword, hword, Word*);
03985 #endif
03986 template word DivideThreeWordsByTwo<word, DWord>(word*, word, word, DWord*);
03987 #endif
03988 
03989 
03990 } // namespace
03991
Generated on Sat Sep 9 00:49:49 2006 for MySQL 5.1 by
1.4.7