How much can you quickly multiply?

Python 2.7

42.575 = ( 6,812,000 / 160,000 ) approx


Code:

import gmpy2

def fac1(n):
    m=lambda(L):([]if len(L)%2==0 else[L.pop()])+map(lambda(l):l[0]*l[1],zip(L[1::2],L[-2::-2]))
    L=map(gmpy2.mpz,xrange(1,n+1))
    Number = (len(L)-1).bit_length()
    while Number:Number-=1;L=m(L)
    return L[0]

def fac2(n):
    global E; E=0
    def f(i):
        global E; E+=i//2
        return[]if i==1 else f(i//2)+range(3,i,2)+[[1,i][i%2]]
    m=lambda(L):([]if len(L)%2==0 else[L.pop()])+map(lambda(l):l[0]*l[1],zip(L[1::2],L[-2::-2]))
    L=map(gmpy2.mpz,f(n))
    N=(len(L)-1).bit_length()
    while N: N-=1;L=m(L)
    return L[0]<<E

Test:

import time

start = time.time()
baseline(160000)
print time.time()-start

start = time.time()
fac1(6811000)
print time.time()-start

start = time.time()
fac2(6812000)
print time.time()-start

start = time.time()
gmpy2.fac(26000000)
print time.time()-start

Output:

10.0069999695
10.0729999542
10.0360000134
9.98699998856

How it works:

Bigger multiplications take more time, thus we want to do as many small multiplications as possible. This is especially true in Python where for numbers less that 2^64 we use hardware arithmetic, and above that we use software. So, in m(L), we start with a list L; if it's odd length we remove one number from consideration to make it even again. Then we multiply element 1 with element -2, element 3 with -4, etc, so that

m([1,2,3,4,5,6,7,8]) = [2*7, 4*5, 6*3, 8*1] = [14, 20, 18, 8]
m([10,12,6]) = [360,112]
m([120,6]) = [40320]

This approach ensures we're using hardware arithmetic for as long as possible, following which we switch onto the efficient gmc arithmetic library.

In fac2, we take a more classic divide and conquer approach as well, where we split out every multiple of 2 and bitshift them at the end for a trivial performance boost. I've included it here because it's usually around 0.5% faster than fac1.

Golfed Version of fac1 (because I can), 220B

import gmpy2
def f(n):
    m=lambda(L):([]if len(L)%2==0 else[L.pop()])+map(lambda(l):l[0]*l[1],zip(L[1::2],L[-2::-2]))
    L=map(gmpy2.mpz,xrange(1,n+1));N=(len(L)-1).bit_length()
    while N:N-=1;L=m(L)
return L[0]

C++ with GMP, score = 55.55 (10,000,000 / 180,000)

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <vector>
#include <iostream>
#include <queue>
#include <gmpxx.h>

int main(int argc, char *argv[]) {
  uint64_t n = atoi(argv[1]);

  // Iterate through 1..n.  Strip off powers of 2.  Multiply
  // remainders together into <= 64 bit chunks.
  uint64_t twos = 0;
  std::vector<uint64_t> terms;
  uint64_t m = 1;
  for(uint64_t i = 1; i <= n; i++) {
    uint64_t j = __builtin_ctzll(i);
    twos += j;
    uint64_t k = i >> j;
    if(__builtin_clzll(m) + __builtin_clzll(k) >= 64) {
      m *= k;
    } else {
      terms.push_back(m);
      m = k;
    }
  }
  if(m != 1) terms.push_back(m);

  // convert to gmp
  // why isn't there a 64-bit constructor?
  std::queue<mpz_class> gmpterms;
  for(int i = 0; i < terms.size(); i++) {
    mpz_class x = (uint32_t)(terms[i] >> 32);
    x <<= 32;
    x += (uint32_t)terms[i];
    gmpterms.push(x);
  }

  // pop two from the bottom, multiply them, push on the end.
  while(gmpterms.size() > 1) {
    mpz_class a = gmpterms.front();
    gmpterms.pop();
    mpz_class b = gmpterms.front();
    gmpterms.pop();
    gmpterms.push(a * b);
  }

  mpz_class r = gmpterms.front();
  r <<= twos;
  //std::cout << r << std::endl;
}

Java - 125.15 (21,400,000 / 171,000)

Also shamelessly copied from Peter Luschny's Github repo (thanks @semi-extrinsic) and licensed under the MIT license, this uses the "prime factorization nested squaring" algorithm as proposed by Albert Schönhage et al. (according to Luschny's factorial algorithms description page).

I slightly adapted the algorithm to use Java's BigInteger and to not use a lookup table for n < 20.

Compiled with gcj, which uses GMP for its BigInteger implementation, and ran on Linux 3.12.4 (Gentoo), on a Core i7 4700MQ at 2.40GHz

import java.math.BigInteger;

public class PrimeSieveFactorialSchoenhage {

    private static int[] primeList, multiList;

    public static BigInteger factorial(int n) {
        int log2n = 31 - Integer.numberOfLeadingZeros(n);
        int piN = log2n < 2 ? 1 : 2 + (15 * n) / (8 * (log2n - 1));

        primeList = new int[piN];
        multiList = new int[piN];

        int len = primeFactors(n);
        return nestedSquare(len).shiftLeft(n - Integer.bitCount(n));
    }

    private static BigInteger nestedSquare(int len) {
        if (len == 0) {
            return BigInteger.ONE;
        }

        int i = 0, mult = multiList[0];

        while (mult > 1) {
            if ((mult & 1) == 1) { // is mult odd ?
                primeList[len++] = primeList[i];
            }

            multiList[i++] = mult / 2;
            mult = multiList[i];
        }
        BigInteger ns = nestedSquare(i);
        if (len <= i) {
            return ns.multiply(ns);
        }

        return product(primeList, i, len - i).multiply(ns.multiply(ns));
    }

    private static BigInteger product(int[] a, int start, int length) {
        if (length == 0) {
            return BigInteger.ONE;
        }

        int len = (length + 1) / 2;
        long[] b = new long[len];

        int i, j, k;

        for (k = 0, i = start, j = start + length - 1; i < j; i++, k++, j--) {
            b[k] = a[i] * (long) a[j];
        }

        if (i == j) {
            b[k++] = a[j];
        }

        return recProduct(b, 0, k - 1);
    }

    private static BigInteger recProduct(long[] s, int n, int m) {
        if (n > m) {
            return BigInteger.ONE;
        }
        if (n == m) {
            return BigInteger.valueOf(s[n]);
        }
        int k = (n + m) >> 1;
        return recProduct(s, n, k).multiply(recProduct(s, k + 1, m));
    }

    private static int primeFactors(int n) {
        int[] primes = new int[n < 17 ? 6 : (int) Math.floor(n / (Math.log(n) - 1.5))];
        int numPrimes = makePrimeList(n, primes);

        int maxBound = n / 2, count = 0;

        int start = indexOf(primes, 2, 0, numPrimes - 1);
        int end = indexOf(primes, n, start, numPrimes);

        for (int i = start; i < end; i++) {
            int prime = primes[i];
            int m = prime > maxBound ? 1 : 0;

            if (prime <= maxBound) {
                int q = n;
                while (q >= prime) {
                    m += q /= prime;
                }
            }

            primeList[count] = prime;
            multiList[count++] = m;
        }
        return count;
    }

    private static int indexOf(final int[] data, int value, int low, int high) {
        while (low < high) {
            int mid = (low + high) >>> 1;

            if (data[mid] < value) {
                low = mid + 1;
            } else {
                high = mid;
            }
        }

        if (low >= data.length) {
            return low;
        }

        if (data[low] == value) {
            low++;
        }

        return low;
    }

    private static int makePrimeList(int n, int[] prime) {
        boolean[] composite = new boolean[n / 3];

        sieveOfEratosthenes(composite);

        boolean toggle = false;
        int p = 5, i = 0, j = 2;

        prime[0] = 2;
        prime[1] = 3;

        while (p <= n) {
            if (!composite[i++]) {
                prime[j++] = p;
            }
            // -- never mind, it's ok.
            p += (toggle = !toggle) ? 2 : 4;
        }

        return j; // number of primes
    }

    private static void sieveOfEratosthenes(final boolean[] composite) {
        int d1 = 8;
        int d2 = 8;
        int p1 = 3;
        int p2 = 7;
        int s1 = 7;
        int s2 = 3;
        int n = 0;
        int len = composite.length;
        boolean toggle = false;

        while (s1 < len) { // -- scan sieve
            if (!composite[n++]) { // -- if a prime is found, cancel its multiples
                int inc = p1 + p2;

                for (int k = s1; k < len; k += inc) {
                    composite[k] = true;
                }

                for (int k = s1 + s2; k < len; k += inc) {
                    composite[k] = true;
                }
            }

            if (toggle = !toggle) { // Never mind, it's ok.
                s1 += d2;
                d1 += 16;
                p1 += 2;
                p2 += 2;
                s2 = p2;
            } else {
                s1 += d1;
                d2 += 8;
                p1 += 2;
                p2 += 6;
                s2 = p1;
            }
        }
    }

    public static void main(String[] args) {
        int n = Integer.parseInt(args[0]);
        long nanos = System.nanoTime();
        BigInteger fact = factorial(n);
        nanos = System.nanoTime() - nanos;
        // Commented out because it takes ages to print
        //System.out.println(fact);
        System.out.println(nanos / 1e9);
    }
}