As a learning experience I decided to cook up my own bigint routines for computing modular exponentiation (modexp), in C. Code below, but to summarize: it's repeated squaring with Montgomery multiplication. And it takes forever to run. On my virtual machine, which has 1 GB of RAM, anything bigger than around a 128-bit prime modulus causes the program to be killed, and smaller moduli still take an appreciable amount of time. Meanwhile, in Python, I did
def modexp ( g, u, p ):
s = 1
while u != 0:
if u & 1:
s = (s * g)%p
u >>= 1
g = (g * g)%p;
return s
p = 0xffffffffffffffffc90fdaa22168c234c4c6628b80dc1cd129024e088a67cc74020bbea63b139b22514a08798e3404ddef9519b3cd3a431b302b0a6df25f14374fe1356d6d51c245e485b576625e7ec6f44c42e9a637ed6b0bff5cb6f406b7edee386bfb5a899fa5ae9f24117c4b1fe649286651ece45b3dc2007cb8a163bf0598da48361c55d39a69163fa8fd24cf5f83655d23dca3ad961c62f356208552bb9ed529077096966d670c354e4abc9804f1746c08ca237327ffffffffffffffff
g = 0x02
a = int(urandom(1000).encode('hex'),base=16) % p
modexp(g,a,p)
And it's, like, instant. Does python just have a ton of stuff precomputed for dealing with bigints, or is there something very deep going on, or is it the most likely outcome once again, that is to say, I just have no idea what I'm doing?
Here's some of the relevant C code, not that I'm asking anyone to read through it all, but may as well include it...
Bigints are stored in a typedef'd struct called string, which is just a wrapper for a char * together with len and sign fields.
/* Computes a^e mod n by left-to-right square-and-multiply, with every
 * modular multiplication performed as a Montgomery product (__monPro).
 * Requires n odd (otherwise Montgomery reduction does not exist).
 * No intermediate bigint is ever freed here or in the callees, so each
 * call leaks memory — a likely contributor to the reported OOM kills. */
string __modExpOdd(string a, string e, string n){//Assumes the modulus is odd. Repeated squaring with Montgomery multiplication.
    string r = charToS(0x01);
    int rpow=0;
    int ebits = countSignificantBits(e);
    /* R = 2^rpow: the smallest power of two with R > n. */
    while(__bigIntComp(r,n)==-1){
        r = stringLeftShift(r,1);
        rpow++;
    }
    /* extendedEuclidean(r, n, 1) yields t with r*s + n*t = 1, i.e. t is an
     * inverse of n mod r.  Montgomery reduction needs n' = -n^{-1} mod r;
     * forcing the sign positive only produces that value when t comes back
     * negative.  NOTE(review): verify t is in fact always negative here. */
    string nprime = extendedEuclidean(r,n,charToS(0x01))[1];
    nprime.sign = 1;
    string abar = bigIntDivide(bigIntMultiply(a,r),n)[1]; /* a*R mod n: a in Montgomery form */
    string xbar = bigIntDivide(r,n)[1];                   /* R mod n: 1 in Montgomery form */
    for(int i = ebits-1; i>=0; i--){
        xbar = __monPro(xbar, xbar, n, nprime, rpow);     /* square */
        /* Test bit i of the exponent (shift right by i, check parity). */
        if(bigIntParity(stringRightShift(e,i))){
            xbar = __monPro(abar, xbar, n, nprime, rpow); /* multiply */
        }
    }
    /* Montgomery-multiply by 1 to convert the result out of Montgomery form. */
    return __monPro(xbar, charToS(0x01), n, nprime, rpow);
}
Debugging tells me that worst bottleneck is the division step in the main loop of the extended euclidean algorithm:
/* Extended Euclidean algorithm on bigints: returns a heap-allocated pair
 * {x, y} with A*x + B*y = C and x normalised into [0, B).  Returns NULL
 * when gcd(A,B) does not divide C (no solution).  Caller owns the array.
 * Per the author's profiling, the bigIntDivide call in the main loop is
 * the dominant cost of the whole modexp.  No intermediate bigint is ever
 * freed, so every call leaks memory. */
string *extendedEuclidean(string A, string B, string C){
    //returns x, y, with 0 <= x < B such that Ax + By = C
    string S,s,T,t,R,r,q,temp;
    /* Standard iterative extended Euclid.  Loop invariant:
     * s*A + t*B = r (previous remainder) and S*A + T*B = R (current one). */
    S=charToS(0x00);
    s=charToS(0x01);
    T=charToS(0x01);
    t=charToS(0x00);
    R=bigIntCopy(B);
    r=bigIntCopy(A);
    int revflag = 0;
    if(bigIntComp(A,B)==-1){// need to have A>B for this to work.
        temp = R;
        R = r;
        r = temp;
        revflag = 1; /* roles swapped; swap the coefficients back at the end */
    }
    while(bigIntComp(R,charToS(0x00))!=0){
        /* q = floor(r / R) — the profiled hot spot of the whole program. */
        q = bigIntDivide(r,R)[0];
        /* Each memcpy is a struct assignment used to rotate the pair:
         * (r, R) <- (R, r - q*R), and likewise for (s, S) and (t, T). */
        memcpy(&temp, &R, sizeof(string)); R = bigIntSubtract(r, bigIntMultiply(q, R)); memcpy(&r, &temp, sizeof(string));
        memcpy(&temp, &S, sizeof(string)); S = bigIntSubtract(s, bigIntMultiply(q, S)); memcpy(&s, &temp, sizeof(string));
        memcpy(&temp, &T, sizeof(string)); T = bigIntSubtract(t, bigIntMultiply(q, T)); memcpy(&t, &temp, sizeof(string));
    }
    //at this point, s*A + t*B = r = gcd(A,B), and T,S are quotients of A,B by the gcd.
    string *qr = bigIntDivide(C,r);
    string ratio = bigIntCopy(qr[0]);
    /* If C is not an exact multiple of gcd(A,B) there is no solution. */
    if(bigIntComp(qr[1],charToS(0x00))!=0) return NULL;
    if(revflag==1){
        temp = s;
        s = t;
        t = temp;
    }
    //normalise s, t so that 0 <= s < B
    if(s.sign==-1){
        /* Add ceil(|s| / |B|) multiples of B to s (and subtract the same
         * multiples of A from t) to bring s into [0, B) while preserving
         * s*A + t*B. */
        qr = bigIntDivide(bigIntAbs(s),bigIntAbs(B));
        /* NOTE: this q intentionally shadows the outer q. */
        string q = (bigIntComp(qr[1],charToS(0x00))==0) ? qr[0] : bigIntIncr(qr[0]);
        s = bigIntAdd(s,bigIntMultiply(q,bigIntAbs(B)));
        t = bigIntSubtract(t, bigIntMultiply(q,bigIntAbs(A)));
    }
    //multiply to get the correct coefficients
    s = bigIntMultiply(s,ratio);
    t = bigIntMultiply(t,ratio);
    string *ret = calloc(2,sizeof(string));
    ret[0] = s;
    ret[1] = t;
    return ret;
}
And you might be curious to see the division routine, which is just long division:
/* Long division of non-negative bigints: returns a heap-allocated pair
 * {quotient, remainder} with a = q*b + rem and 0 <= rem < b, or NULL on
 * division by zero.  Caller owns the returned array.
 *
 * Fixes relative to the previous version:
 *  - when a < b the remainder returned was a copy of b; the remainder of
 *    a / b with a < b is a itself;
 *  - the long-division loop consumed extra dividend bytes without emitting
 *    the corresponding 0x00 quotient digits, silently dropping trailing
 *    zero digits (e.g. 0x010001 / 2 gave 0x80 instead of 0x8000). */
string *__bigIntDivide(string a, string b){ //assumes a, b positive
    string *qr = calloc(2, sizeof(string));
    if(__bigIntComp(b,charToS(0x00))==0){
        return NULL; /* division by zero */
    }
    if(__bigIntComp(a,b)==-1){ /* a < b  =>  q = 0, rem = a (was: copy of b) */
        qr[0] = charToS(0x00);
        qr[1] = bigIntCopy(a);
        return qr;
    }
    if(__bigIntComp(a,b)==0){ /* a == b  =>  q = 1, rem = 0 */
        qr[0] = charToS(0x01);
        qr[1] = charToS(0x00);
        return qr;
    }
    /* Power-of-two divisor: shift instead of dividing. */
    int pow = ispowerof2(b);
    if(pow!=-1){
        qr[0] = stringRightShift(a,pow);
        qr[1] = getSignificantBits(a,pow);
        qr[0].sign=1;
        qr[1].sign=1;
        return qr;
    }
    //long division, in base-256
    string divisor = charToS(a.c[0]); /* running partial remainder */
    string quotient = NULLSTRING;
    int emitted = 0; /* has at least one quotient digit been produced yet? */
    int i=1;
    do{
        while(__bigIntComp(divisor,b)==-1 && i<a.len){
            divisor = stringCat(divisor, charToS(a.c[i]));
            i++;
            /* Once the first digit is out, every byte pulled down while the
             * partial remainder is still smaller than b must contribute a
             * zero digit to the quotient. */
            if(emitted && __bigIntComp(divisor,b)==-1){
                quotient = stringCat(quotient, charToS(0x00));
            }
        }
        if(i == a.len && __bigIntComp(divisor, b) == -1){
            break; /* dividend exhausted; divisor now holds the remainder */
        }
        /* One base-256 digit: divisor < b*256 here, so the digit fits in 0xFF. */
        qr = __findQuotient(divisor, b, charToS(0x00), charToS(0xFF));
        quotient = stringCat(quotient, qr[0]);
        divisor = qr[1];
        emitted = 1;
    }while(i<a.len);
    qr[0] = bigIntCopy(quotient);
    qr[1] = bigIntCopy(divisor);
    return qr;
}
/* Binary-search division step: finds the largest q in [minq, maxq] with
 * dividend*q <= divisor and returns a heap pair {q, divisor - dividend*q}.
 * (Despite the parameter names, `divisor` is the value being divided and
 * `dividend` is what it is divided by, mirroring the caller.)
 *
 * Fixes: removed a leftover debugging fflush(stdout) from the hot loop,
 * and each iteration now performs one bigint comparison instead of three. */
string *__findQuotient(string divisor, string dividend, string minq, string maxq){ //an O(log n) division routine, finds q by a binary search instead of linear
    string q = stringRightShift(bigIntAdd(minq,maxq),1);
    string diff = stringRightShift(bigIntSubtract(maxq,minq),1);
    string p = bigIntMultiply(dividend, q);
    string *qr=calloc(2,sizeof(string));
    while( __bigIntComp(diff, charToS(0x00)) == 1){
        int cmp = __bigIntComp(divisor, p); /* compare once per iteration */
        if(cmp == 0){ /* exact hit: remainder is zero */
            qr[0] = q;
            qr[1] = charToS(0x00);
            return qr;
        }
        if(cmp == 1){ /* divisor > dividend*q: q is too small */
            q = bigIntAdd(q, diff);
        }else{        /* divisor < dividend*q: q is too big */
            q = bigIntSubtract(q, diff);
        }
        p = bigIntMultiply(dividend, q);
        diff = stringRightShift(diff,1);
    }
    /* Linear fix-up: the search can finish a step or two off. */
    while(__bigIntComp(divisor, p)==1){ /* q too small: increment (may overshoot by 1) */
        q = bigIntIncr(q);
        p = bigIntAdd(p, dividend);
    }
    while(__bigIntComp(divisor, p) == -1){ /* q too big: decrement */
        q = bigIntDecr(q);
        p = bigIntSubtract(p,dividend);
    }
    qr[0] = q;
    qr[1] = bigIntSubtract(divisor, p);
    return qr;
}
And the Montgomery product function, which is just directly from the paper where it was first described:
/* Montgomery product: returns abar * bbar * R^{-1} mod n, where R = 2^rpow
 * and nprime satisfies n * nprime == -1 (mod R).  Result lies in [0, n). */
string __monPro(string abar, string bbar, string n, string nprime, int rpow){
    string prod = bigIntMultiply(abar, bbar);
    /* m = (prod * n') mod R — keep only the low rpow bits. */
    string m = getSignificantBits(bigIntMultiply(prod, nprime), rpow);
    /* u = (prod + m*n) / R — exact division, done as a right shift. */
    string u = bigIntAdd(prod, bigIntMultiply(m, n));
    u = stringRightShift(u, rpow);
    /* Single conditional subtraction brings the result into [0, n). */
    return (bigIntComp(u, n) >= 0) ? bigIntSubtract(u, n) : u;
}
So I'm just curious if it's all precomputed in python, or if I should be looking for a memory leak? I haven't tested it with any of the C bigint libraries because I've been lazy, and because I assume python is just using one of those internally anyway. Thanks for taking a look.