How to speed up this C++ code (especially factorial and pow part)

Question

I was doing this problem called arithmetic progression on Hackerrank.

https://www.hackerrank.com/challenges/arithmetic-progressions

My solution passed all first six tests. I have tried every possible way of optimizing my code, including caching, more efficient operation. I still could not pass the test. The two places I think I failed is factorial function and pow function.

Basically, I used unordered_map to store all of the previous results. If the argument is one of the key, I will just returned the result right away.

Here is my code:

#include <cstdio>
#include <vector>
#include <iostream>
#include <algorithm>
#include <unordered_map>
#include <utility>
#include <ctime>
#include <sys/time.h>
using namespace std;


#define mod(m) (m > 1000003) ? m % 1000003 : m

//used for hashing the pair
namespace std {

template<typename a, typename b>
struct hash< pair<a, b> > {
private:
   const hash<a> ah;
   const hash<b> bh;
public:
   hash() : ah(), bh() {};
   size_t operator()(const std::pair<a, b> &p) const {
      return ah(p.first) ^ bh(p.second);
   }
};
} // namespaces

//the code below is used for collecting statistics
/*
typedef unsigned long long timestamp_t;

static timestamp_t get_timestamp ()
{
    struct timeval now;
    gettimeofday (&now, NULL);
    return  now.tv_usec + (timestamp_t)now.tv_sec * 1000000;
}
*/

//note that the number could get really large, that's why I use unsigned long long for most of data type
static unsigned long long cache_D[100000];
static unsigned long long cache_d[100000];
static unsigned long long cache_p[100000];
static unordered_map<unsigned long long, unsigned long long> cache_F;           //use       unordered_map to store the factorial, avg insert, lookup O(1)
static unordered_map<pair<unsigned long long, unsigned long long>, unsigned long long>   cache_P; //use unordered_map to store the pow
//static double pow_sec = 0;
//static double fac_sec = 0;


/**
 * Use the fast pow algorithm. On top of that, I add caching (stored in unordered map) to     speed up the pow
 * @param  x base
 * @param  y exponent
 * @return   x ^ y
 */
unsigned long long pow(unsigned long long x, unsigned long long y)
{
    //timestamp_t t0 = get_timestamp();
    pair<unsigned long long, unsigned long long> curr(x, y);
    if(cache_P.find(curr) != cache_P.end())
    {
        //timestamp_t t1 = get_timestamp();
        //pow_sec += (t1 - t0) / 1000000.0L;
        return cache_P[curr];
    }
    unsigned long long result = 1;
    unsigned long long mod_x =  mod(x);
    //unsigned long long count = 0;

    while( y )
    {
        if ( y & 1 )
        {
            unsigned long long temp = result *  mod_x;
            result = mod(temp);
        }
        y >>= 1;
        unsigned long long temp = mod_x *  mod_x;
        mod_x =  mod(temp);
    }
    cache_P[curr] = result;
    //timestamp_t t1 = get_timestamp();
    //pow_sec += (t1 - t0) / 1000000.0L;
    return result;
}

/**
 * same idea as pow, caching whenever I can
 * @param  x number to be factorialized
 * @return   x!
 */
unsigned long long factorial(unsigned long long x)
{
    //timestamp_t t0 = get_timestamp();
    if (cache_F.find(x) != cache_F.end())
    {
        //timestamp_t t1 = get_timestamp();
        //fac_sec += (t1 - t0) / 1000000.0L;
        return cache_F[x];
    }
    else
    {
        unsigned long long result = 1;
        //here we go from x to 1 since we could speed up operation as soon as we have x - 1 or x - 2 or x - 3 in our caching (just x * (x - 1)! )
        for(unsigned long long i = x; i >= 1; i--)
        {
            if(cache_F.find(i) != cache_F.end())
            {
                unsigned long long temp1 = result * cache_F[i];
                result = mod(temp1);
                cache_F[x] = result;
                //timestamp_t t1 = get_timestamp();
                //fac_sec += (t1 - t0) / 1000000.0L;
                return result;
            }
            unsigned long long mod_i = mod(i);
            unsigned long long temp2 = result * mod_i;
            result = mod(temp2);
        }
        cache_F[x] = result;
        //timestamp_t t1 = get_timestamp();
        //fac_sec += (t1 - t0) / 1000000.0L;
        return result;
    } 
}
void query(int from, int to)
{
    unsigned long long k = 0;
    unsigned long long constant = 1;

    for(int i = from - 1; i < to; i++)
    {
        k += cache_p[i];
        unsigned long long temp = constant * cache_D[i];
        constant = mod(temp);
    }
    unsigned long long temp = constant * factorial(k);
    constant = mod(temp);
    printf("%llu %llu\n", k, constant);
}


void update(int from, int to, int how_much)
{
    for(int i = from - 1; i < to; i++)
    {
         cache_p[i] += how_much;
         unsigned long long temp = cache_D[i] * pow(cache_d[i], (unsigned long long)how_much);
         cache_D[i] = mod(temp);
    }
}

int main() {
    int num_vec, num_operations;
    FILE *pFile = fopen("input.txt", "r");

    fscanf(pFile, "%d", &num_vec);
    for(int i = 0; i < num_vec; i++)
    {
        unsigned long long a, d, q;
        fscanf(pFile, "%llu %llu %llu", &a, &d, &q);
        cache_d[i] = d;
        cache_p[i] = q;
        cache_D[i] = pow(d, q);
    }
    fscanf(pFile, "%d", &num_operations);
    for(int i = 0; i < num_operations; i++)
    {
        int what_operation, from, to;
        fscanf(pFile, "%d %d %d", &what_operation, &from, &to);
        if(what_operation == 0)
        {
             query(from, to);
        }
        else if (what_operation == 1)
        {
            int add_q;
            fscanf(pFile, "%d", &add_q);
            update(from, to, add_q);
        }
    }
    printf("sec for pow: %f\n sec for fac: %f", pow_sec, fac_sec);
    return 0;
}

It would be really helpful if anyone knows how to further optimize my code. Thanks!

And what about I/O operations? Maybe you should read a file in blocks with `fread`, not with `fscanf`. And process the data after reading the file. — jacek.ciach, May 27 '13 at 21:45
In the code I have written, I use scanf but when I test it I read input from a file. Sorry for the confusion! — Alan Liu, May 27 '13 at 22:15
The pow and fac is slow for really large cases. I haven't done profiled in more detail (probably should) — Alan Liu, May 27 '13 at 22:17
Inside your modPow function, you should be able to get rid of branching (the if statement and the while condition). Quite likely that will give you a speed-up since there won't be branch prediction required. — Xantix, May 28 '13 at 21:26

score 5 · Accepted Answer · edited May 23 '17 at 12:12

Regarding Factorial.

"unsigned long long" has a range from 0 to

18,446,744,073,709,551,615

the factorial of 21 is:

51,090,942,171,709,440,000

So, this means your function can only return the correct results for factorial of 0 through 20.

It will be a lot easier to start with a pre-built cache with 21 elements (zero through 20).

unsigned long long fact_cache [21] = {

 1                      ,
 1                      ,
 2                      ,
 6                      ,
 24                     ,
 120                    ,
 720                    ,
 5040                   ,
 40320                  ,
 362880                 ,
 3628800                ,
 39916800               ,
 479001600              ,  
 6227020800             ,
 87178291200            ,
 1307674368000          ,
 20922789888000         ,
 355687428096000        ,
 6402373705728000       ,
 121645100408832000     ,
 2432902008176640000                      

};

Now your factorial function can just look up the right value in the array (checking bounds if you like).

edit: (Realized the OP intended 'factorial mod 1000003")

I started by reading the answers to:

Fast way to calculate n! mod m where m is prime?

Based on the second answer, with code examples in python, and more information here:

http://www.algorithmist.com/index.php/Modular_inverse

He gave this Python code, as well as a possible improvement.

def factorialMod(n, modulus):
    ans=1
    if n <= modulus//2:
        #calculate the factorial normally (right argument of range() is exclusive)
        for i in range(1,n+1):
            ans = (ans * i) % modulus   
    else:
        #Fancypants method for large n
        for i in range(n+1,modulus):
            ans = (ans * i) % modulus
        ans = modinv(ans, modulus)
        ans = -1*ans + modulus
    return ans % modulus

This has some advantages, over the original method when n is larger than the modulus divided by 2, since we can reduce the number of multiplications by solving for the modular inverse.

Because we know the modulus (1000003), we can solve for the modinv by using its totient (1000002).

In pseudo-code we get something like:

long factorial_mod( long n ) {
    if( n > 1000003 )
         return 0;  //from Thomas's comment

    if( cache[n] != 0 ) //if the cache has the answer
        return cache[n];

    long result = 1;
    if ( n <= 500001 )
    {
        for( long i = 1; i <= n; i++ )
        {
            result = (result * i) % 1000003;
            cache[i] = result;
        }
    }
    else
    {
        for( long i = n+1; i <= 1000003; i++)
        {
            result = (result * i) % 1000003;
        }
        result = modinv(result, 1000003);
        result = -1*result + 1000003; 
    }

    result = result % 1000003; 
    cache[n] = result;
    return result;
}

long modinv( long a, int modulus ) {
    return modPow( a, 1000002, modulus); // ( (a to the totient) mod the modulus )
}

If we didn't want to compute the totient, we could have used extended euler GCD to solve for the modular inverse. (of course the totient of primes is very easy to compute...just subtract one).

The thing is that since the number could get really large, the problem states that we could use mod 1000003. So technically x could be larger than 21 but its result is still in the range of unsigned long long. — Alan Liu, May 27 '13 at 22:28
Good point. In the description/comments about the factorial function, it didn't mention the "mod" so I must have missed that. — Xantix, May 27 '13 at 22:50
@AllenLiu How much memory are you allowed? Storing an entire list of factorial remainders from 0 to 1000002 (everything after that is zero by definition) should fit in about 4 megabytes of memory. You can precalculate them once and for all quickly instead of constantly recalculating everything and messing around with caches. — Thomas, May 27 '13 at 23:17
@Thomas are you sure everything after 10000002 is zero by definition? I don't think mod works that way (either that or I am misunderstanding the problem) — Jeremy Friesner, May 28 '13 at 05:49
that's the case. 1*2*3*4*5*...*1000002*1000003*1000004*...etc must be divisible by 1000003 since we multiplied by that number. Therefore, it has no remainder when we divide by it. Or in other words, its mod is zero. — Xantix, May 28 '13 at 05:55
@JeremyFriesner I meant the factorial. When it is taken modulo N, and factorial above and including N has a remainder of zero when divided by N. — Thomas, May 28 '13 at 06:05

How to speed up this C++ code (especially factorial and pow part)

1 Answers1