Digit sum of central binomial coefficients

C++ (GMP) - (287,000,000 / 422,000) = 680.09

Shamelessly combine Kummer's Theorem by xnor and GMP by qwr. Still not even close to the Go solution, not sure why.

Edit: Thanks Keith Randall for the reminder that multiplication is faster if the number is similar in size. I implemented multi-level multiplication, similar to memory coalescing concept on memory management. And the result is impressive. What used to take 51s, now takes only 0.5s (i.e., 100-fold improvement!!)

OLD CODE (n=14,000,000)
Done sieving in 0.343s
Done calculating binom in 51.929s
Done summing in 0.901s
14000000: 18954729

real    0m53.194s
user    0m53.116s
sys 0m0.060s

NEW CODE (n=14,000,000)
Done sieving in 0.343s
Done calculating binom in 0.552s
Done summing in 0.902s
14000000: 18954729

real    0m1.804s
user    0m1.776s
sys 0m0.023s

The run for n=287,000,000

Done sieving in 4.211s
Done calculating binom in 17.934s
Done summing in 37.677s
287000000: 388788354

real    0m59.928s
user    0m58.759s
sys 0m1.116s

The code. Compile with -lgmp -lgmpxx -O3

#include <gmpxx.h>
#include <iostream>
#include <time.h>
#include <cstdio>

const int MAX=287000000;
const int PRIME_COUNT=15700000;

int primes[PRIME_COUNT], factors[PRIME_COUNT], count;
bool sieve[MAX];
int max_idx=0;

void run_sieve(){
    sieve[2] = true;
    primes[0] = 2;
    count = 1;
    for(int i=3; i<MAX; i+=2){
        sieve[i] = true;
    }
    for(int i=3; i<17000; i+=2){
        if(!sieve[i]) continue;
        for(int j = i*i; j<MAX; j+=i){
            sieve[j] = false;
        }
    }
    for(int i=3; i<MAX; i+=2){
        if(sieve[i]) primes[count++] = i;
    }
}

mpz_class sum_digits(mpz_class n){
    clock_t t = clock();
    char* str = mpz_get_str(NULL, 10, n.get_mpz_t());
    int result = 0;
    for(int i=0;str[i]>0;i++){
        result+=str[i]-48;
    }
    printf("Done summing in %.3fs\n", ((float)(clock()-t))/CLOCKS_PER_SEC);
    return result;
}

mpz_class nc2_fast(const mpz_class &x){
    clock_t t = clock();
    int prime;
    const unsigned int n = mpz_get_ui(x.get_mpz_t());
    const unsigned int n2 = n/2;
    unsigned int m;
    unsigned int digit;
    unsigned int carry=0;
    unsigned int carries=0;
    mpz_class result = 1;
    mpz_class prime_prods = 1;
    mpz_class tmp;
    mpz_class tmp_prods[32], tmp_prime_prods[32];
    for(int i=0; i<32; i++){
        tmp_prods[i] = (mpz_class)NULL;
        tmp_prime_prods[i] = (mpz_class)NULL;
    }
    for(int i=0; i< count; i++){
        prime = primes[i];
        carry=0;
        carries=0;
        if(prime > n) break;
        if(prime > n2){
            tmp = prime;
            for(int j=0; j<32; j++){
                if(tmp_prime_prods[j] == NULL){
                    tmp_prime_prods[j] = tmp;
                    break;
                } else {
                    mpz_mul(tmp.get_mpz_t(), tmp.get_mpz_t(), tmp_prime_prods[j].get_mpz_t());
                    tmp_prime_prods[j] = (mpz_class)NULL;
                }
            }
            continue;
        }
        m=n2;
        while(m>0){
            digit = m%prime;
            carry = (2*digit + carry >= prime) ? 1 : 0;
            carries += carry;
            m/=prime;
        }
        if(carries>0){
            tmp = 0;
            mpz_ui_pow_ui(tmp.get_mpz_t(), prime, carries);
            for(int j=0; j<32; j++){
                if(tmp_prods[j] == NULL){
                    tmp_prods[j] = tmp;
                    break;
                } else {
                    mpz_mul(tmp.get_mpz_t(), tmp.get_mpz_t(), tmp_prods[j].get_mpz_t());
                    tmp_prods[j] = (mpz_class)NULL;
                }
            }
        }
    }
    result = 1;
    prime_prods = 1;
    for(int j=0; j<32; j++){
        if(tmp_prods[j] != NULL){
            mpz_mul(result.get_mpz_t(), result.get_mpz_t(), tmp_prods[j].get_mpz_t());
        }
        if(tmp_prime_prods[j] != NULL){
            mpz_mul(prime_prods.get_mpz_t(), prime_prods.get_mpz_t(), tmp_prime_prods[j].get_mpz_t());
        }
    }
    mpz_mul(result.get_mpz_t(), result.get_mpz_t(), prime_prods.get_mpz_t());
    printf("Done calculating binom in %.3fs\n", ((float)(clock()-t))/CLOCKS_PER_SEC);
    return result;
}

int main(int argc, char* argv[]){
    const mpz_class n = atoi(argv[1]);
    clock_t t = clock();
    run_sieve();
    printf("Done sieving in %.3fs\n", ((float)(clock()-t))/CLOCKS_PER_SEC);
    std::cout << n << ": " << sum_digits(nc2_fast(n)) << std::endl;
    return 0;
}

Go, 33.96 = (16300000 / 480000)

package main

import "math/big"

const n = 16300000

var (
    sieve     [n + 1]bool
    remaining [n + 1]int
    count     [n + 1]int
)

func main() {
    println("finding primes")
    for p := 2; p <= n; p++ {
        if sieve[p] {
            continue
        }
        for i := p * p; i <= n; i += p {
            sieve[i] = true
        }
    }

    // count net number of times each prime appears in the result.
    println("counting factors")
    for i := 2; i <= n; i++ {
        remaining[i] = i
    }
    for p := 2; p <= n; p++ {
        if sieve[p] {
            continue
        }

        for i := p; i <= n; i += p {
            for remaining[i]%p == 0 { // may have multiple factors of p
                remaining[i] /= p

                // count positive for n!
                count[p]++
                // count negative twice for ((n/2)!)^2
                if i <= n/2 {
                    count[p] -= 2
                }
            }
        }
    }

    // ignore all the trailing zeros
    count[2] -= count[5]
    count[5] = 0

    println("listing factors")
    var m []uint64
    for i := 0; i <= n; i++ {
        for count[i] > 0 {
            m = append(m, uint64(i))
            count[i]--
        }
    }

    println("grouping factors")
    m = group(m)

    println("multiplying")
    x := mul(m)

    println("converting to base 10")
    d := 0
    for _, c := range x.String() {
        d += int(c - '0')
    }
    println("sum of digits:", d)
}

// Return product of elements in a.
func mul(a []uint64) *big.Int {
    if len(a) == 1 {
        x := big.NewInt(0)
        x.SetUint64(a[0])
        return x
    }
    m := len(a) / 2
    x := mul(a[:m])
    y := mul(a[m:])
    x.Mul(x, y) // fast because x and y are about the same length
    return x
}

// return a slice whose members have the same product
// as the input slice, but hopefully shorter.
func group(a []uint64) []uint64 {
    var g []uint64
    r := uint64(1)
    b := 1
    for _, x := range a {
        c := bits(x)
        if b+c <= 64 {
            r *= x
            b += c
        } else {
            g = append(g, r)
            r = x
            b = c
        }
    }
    g = append(g, r)
    return g
}

// bits returns the number of bits in the representation of x
func bits(x uint64) int {
    n := 0
    for x != 0 {
        n++
        x >>= 1
    }
    return n
}

Works by counting all the prime factors in the numerator and denominator and canceling matching factors. Multiplies the leftovers to get the result.

More than 80% of the time is spent in converting to base 10. There's got to be a better way to do that...


Python 3 (8.8 = 2.2 million / 0.25 million)

This is in Python, which isn't known for speed, so you can probably do better porting this to another language.

Primes generator taken from this StackOverflow contest.

import numpy
import time

def primesfrom2to(n):
    """ Input n>=6, Returns a array of primes, 2 <= p < n """
    sieve = numpy.ones(n//3 + (n%6==2), dtype=numpy.bool)
    for i in range(1,int(n**0.5)//3+1):
        if sieve[i]:
            k=3*i+1|1
            sieve[       k*k/3     ::2*k] = False
            sieve[k*(k-2*(i&1)+4)/3::2*k] = False
    return numpy.r_[2,3,((3*numpy.nonzero(sieve)[0][1:]+1)|1)]

t0 = time.clock()

N=220*10**4
n=N//2

print("N = %d" % N)
print()

print("Generating primes.")
primes = primesfrom2to(N)

t1 = time.clock()
print ("Time taken: %f" % (t1-t0))

print("Computing product.")
product = 1

for p in primes:
    p=int(p)
    carries = 0 
    carry = 0

    if p>n:
        product*=p
        continue

    m=n

    #Count carries of n+n in base p as per Kummer's Theorem
    while m:
        digit = m%p
        carry = (2*digit + carry >= p)
        carries += carry
        m//=p

    if carries >0:
        for _ in range(carries):
            product *= p

    #print(p,carries,product)

t2 = time.clock()
print ("Time taken: %f" % (t2-t1))

print("Converting number to string.")

# digit_sum = 0
# result=product

# while result:
    # digit_sum+=result%10
    # result//=10

digit_sum = 0
digit_string = str(product)

t3 = time.clock()
print ("Time taken: %f" % (t3-t2))

print("Summing digits.")
for d in str(digit_string):digit_sum+=int(d)

t4 = time.clock()
print ("Time taken: %f" % (t4-t3))
print ()

print ("Total time: %f" % (t4-t0))
print()
print("Sum of digits = %d" % digit_sum)

The main idea of the algorithm is to use Kummer's Theorem to get the prime-factorization of the binomial. For each prime, we learn the highest power of it that divides the answer, and multiply the running product by that power of the prime. In this way, we only need to multiply once for each prime in the prime-factorization of the answer.

Output showing time breakdown:

N = 2200000
Generating primes.
Time taken: 0.046408
Computing product.
Time taken: 17.931472
Converting number to string.
Time taken: 39.083390
Summing digits.
Time taken: 1.502393

Total time: 58.563664

Sum of digits = 2980107

Surprisingly, most of the time is spent converting the number to a string to sum its digits. Also surprisingly, converting to a string was much faster than getting digits from repeated %10 and //10, even though the whole string must presumably be kept in memory.

Generating the primes takes negligible time (and hence I don't feel unfair copying existing code). Summing digits is fast. The actual multiplication takes one third of the time.

Given that digit summing seems to be the limiting factor, perhaps an algorithm to multiply numbers in decimal representation would save time in total by shortcutting the binary/decimal conversion.