Calculate the permanent as quickly as possible

gcc C++ n ≈ 36 (57 seconds on my system)

Uses Glynn formula with a Gray code for updates if all column sums are even, otherwise uses Ryser's method. Threaded and vectorized. Optimized for AVX, so don't expect much on older processors. Don't bother with n>=35 for a matrix with only +1's even if your system is fast enough since the signed 128 bit accumulator will overflow. For random matrices you will probably not hit the overflow. For n>=37 the internal multipliers will start to overflow for an all 1/-1 matrix. So only use this program for n<=36.

Just give the matrix elements on STDIN separated by any kind of whitespace

permanent
1 2
3 4
^D

permanent.cpp:

/*
  Compile using something like:
    g++ -Wall -O3 -march=native -fstrict-aliasing -std=c++11 -pthread -s permanent.cpp -o permanent
*/

#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <cstdint>
#include <climits>
#include <array>
#include <vector>
#include <thread>
#include <future>
#include <ctgmath>
#include <immintrin.h>

using namespace std;

bool const DEBUG = false;
int const CACHE = 64;

using Index  = int_fast32_t;
Index glynn;
// Number of elements in our vectors
Index const POW   = 3;
Index const ELEMS = 1 << POW;
// Over how many floats we distribute each row
Index const WIDTH = 9;
// Number of bits in the fraction part of a floating point number
int const FLOAT_MANTISSA = 23;
// Type to use for the first add/multiply phase
using Sum  = float;
using SumN = __restrict__ Sum __attribute__((vector_size(ELEMS*sizeof(Sum))));
// Type to convert to between the first and second phase
using ProdN = __restrict__ int32_t __attribute__((vector_size(ELEMS*sizeof(int32_t))));
// Type to use for the third and last multiply phase.
// Also used for the final accumulator
using Value = __int128;
using UValue = unsigned __int128;

// Wrap Value so C++ doesn't really see it and we can put it in vectors etc.
// Needed since C++ doesn't fully support __int128
struct Number {
    Number& operator+=(Number const& right) {
        value += right.value;
        return *this;
    }
    // Output the value
    void print(ostream& os, bool dbl = false) const;
    friend ostream& operator<<(ostream& os, Number const& number) {
        number.print(os);
        return os;
    }

    Value value;
};

using ms = chrono::milliseconds;

auto nr_threads = thread::hardware_concurrency();
vector<Sum> input;

// Allocate cache aligned datastructures
template<typename T>
T* alloc(size_t n) {
    T* mem = static_cast<T*>(aligned_alloc(CACHE, sizeof(T) * n));
    if (mem == nullptr) throw(bad_alloc());
    return mem;
}

// Work assigned to thread k of nr_threads threads
Number permanent_part(Index n, Index k, SumN** more) {
    uint64_t loops = (UINT64_C(1) << n) / nr_threads;
    if (glynn) loops /= 2;
    Index l = loops < ELEMS ? loops : ELEMS;
    loops /= l;
    auto from = loops * k;
    auto to   = loops * (k+1);

    if (DEBUG) cout << "From=" << from << "\n";
    uint64_t old_gray = from ^ from/2;
    uint64_t bit = 1;
    bool bits = (to-from) & 1;

    Index nn = (n+WIDTH-1)/WIDTH;
    Index ww = nn * WIDTH;
    auto column = alloc<SumN>(ww);
    for (Index i=0; i<n; ++i)
        for (Index j=0; j<ELEMS; ++j) column[i][j] = 0;
    for (Index i=n; i<ww; ++i)
        for (Index j=0; j<ELEMS; ++j) column[i][j] = 1;
    Index b;
    if (glynn) {
        b = n > POW+1 ? n - POW - 1: 0;
        auto c = n-1-b;
        for (Index k=0; k<l; k++) {
            Index gray = k ^ k/2;
            for (Index j=0; j< c; ++j)
                if (gray & 1 << j)
                    for (Index i=0; i<n; ++i)
                        column[i][k] -= input[(b+j)*n+i];
                else
                    for (Index i=0; i<n; ++i)
                        column[i][k] += input[(b+j)*n+i];
        }
        for (Index i=0; i<n; ++i)
            for (Index k=0; k<l; k++)
                column[i][k] += input[n*(n-1)+i];

        for (Index k=1; k<l; k+=2)
            column[0][k] = -column[0][k];

        for (Index i=0; i<b; ++i, bit <<= 1) {
            if (old_gray & bit) {
                bits = bits ^ 1;
                for (Index j=0; j<ww; ++j)
                    column[j] -= more[i][j];
            } else {
                for (Index j=0; j<ww; ++j)
                    column[j] += more[i][j];
            }
        }

        for (Index i=0; i<n; ++i)
            for (Index k=0; k<l; k++)
                column[i][k] /= 2;
    } else {
        b = n > POW ? n - POW : 0;
        auto c = n-b;
        for (Index k=0; k<l; k++) {
            Index gray = k ^ k/2;
            for (Index j=0; j<c; ++j)
                if (gray & 1 << j)
                    for (Index i=0; i<n; ++i)
                        column[i][k] -= input[(b+j)*n+i];
        }

        for (Index k=1; k<l; k+=2)
            column[0][k] = -column[0][k];

        for (Index i=0; i<b; ++i, bit <<= 1) {
            if (old_gray & bit) {
                bits = bits ^ 1;
                for (Index j=0; j<ww; ++j)
                    column[j] -= more[i][j];
            }
        }
    }

    if (DEBUG) {
        for (Index i=0; i<ww; ++i) {
            cout << "Column[" << i << "]=";
            for (Index j=0; j<ELEMS; ++j) cout << " " << column[i][j];
            cout << "\n";
        }
    }

    --more;
    old_gray = (from ^ from/2) | UINT64_C(1) << b;
    Value total = 0;
    SumN accu[WIDTH];
    for (auto p=from; p<to; ++p) {
        uint64_t new_gray = p ^ p/2;
        uint64_t bit = old_gray ^ new_gray;
        Index i = __builtin_ffsl(bit);
        auto diff = more[i];
        auto c = column;
        if (new_gray > old_gray) {
            // Phase 1 add/multiply.
            // Uses floats until just before loss of precision
            for (Index i=0; i<WIDTH; ++i) accu[i] = *c++ -= *diff++;

            for (Index j=1; j < nn; ++j)
                for (Index i=0; i<WIDTH; ++i) accu[i] *= *c++ -= *diff++;
        } else {
            // Phase 1 add/multiply.
            // Uses floats until just before loss of precision
            for (Index i=0; i<WIDTH; ++i) accu[i] = *c++ += *diff++;

            for (Index j=1; j < nn; ++j)
                for (Index i=0; i<WIDTH; ++i) accu[i] *= *c++ += *diff++;
        }

        if (DEBUG) {
            cout << "p=" << p << "\n";
            for (Index i=0; i<ww; ++i) {
                cout << "Column[" << i << "]=";
                for (Index j=0; j<ELEMS; ++j) cout << " " << column[i][j];
                cout << "\n";
            }
        }

        // Convert floats to int32_t
        ProdN prod32[WIDTH] __attribute__((aligned (32)));
        for (Index i=0; i<WIDTH; ++i)
            // Unfortunately gcc doesn't recognize the static_cast<int32_t>
            // as a vector pattern, so force it with an intrinsic
#ifdef __AVX__
            //prod32[i] = static_cast<ProdN>(accu[i]);
            reinterpret_cast<__m256i&>(prod32[i]) = _mm256_cvttps_epi32(accu[i]);
#else   // __AVX__
            for (Index j=0; j<ELEMS; ++j)
                prod32[i][j] = static_cast<int32_t>(accu[i][j]);
#endif  // __AVX__

        // Phase 2 multiply. Uses int64_t until just before overflow
        int64_t prod64[3][ELEMS];
        for (Index i=0; i<3; ++i) {
            for (Index j=0; j<ELEMS; ++j)
                prod64[i][j] = static_cast<int64_t>(prod32[i][j]) * prod32[i+3][j] * prod32[i+6][j];
        }
        // Phase 3 multiply. Collect into __int128. For large matrices this will
        // actually overflow but that's ok as long as all 128 low bits are
        // correct. Terms will cancel and the final sum can fit into 128 bits
        // (This will start to fail at n=35 for the all 1 matrix)
        // Strictly speaking this needs the -fwrapv gcc option
        for (Index j=0; j<ELEMS; ++j) {
            auto value = static_cast<Value>(prod64[0][j]) * prod64[1][j] * prod64[2][j];
            if (DEBUG) cout << "value[" << j << "]=" << static_cast<double>(value) << "\n";
            total += value;
        }
        total = -total;

        old_gray = new_gray;
    }

    return bits ? Number{-total} : Number{total};
}

// Prepare datastructures, Assign work to threads
Number permanent(Index n) {
    Index nn = (n+WIDTH-1)/WIDTH;
    Index ww = nn*WIDTH;

    Index rows  = n > (POW+glynn) ? n-POW-glynn : 0;
    auto data = alloc<SumN>(ww*(rows+1));
    auto pointers = alloc<SumN *>(rows+1);
    auto more = &pointers[0];
    for (Index i=0; i<rows; ++i)
        more[i] = &data[ww*i];
    more[rows] = &data[ww*rows];
    for (Index j=0; j<ww; ++j)
        for (Index i=0; i<ELEMS; ++i)
            more[rows][j][i] = 0;

    Index loops = n >= POW+glynn ? ELEMS : 1 << (n-glynn);
    auto a = &input[0];
    for (Index r=0; r<rows; ++r) {
        for (Index j=0; j<n; ++j) {
            for (Index i=0; i<loops; ++i)
                more[r][j][i] = j == 0 && i %2 ? -*a : *a;
            for (Index i=loops; i<ELEMS; ++i)
                more[r][j][i] = 0;
            ++a;
        }
        for (Index j=n; j<ww; ++j)
            for (Index i=0; i<ELEMS; ++i)
                more[r][j][i] = 0;
    }

    if (DEBUG)
        for (Index r=0; r<=rows; ++r)
            for (Index j=0; j<ww; ++j) {
                cout << "more[" << r << "][" << j << "]=";
                for (Index i=0; i<ELEMS; ++i)
                    cout << " " << more[r][j][i];
                cout << "\n";
            }

    // Send work to threads...
    vector<future<Number>> results;
    for (auto i=1U; i < nr_threads; ++i)
        results.emplace_back(async(DEBUG ? launch::deferred: launch::async, permanent_part, n, i, more));
    // And collect results
    auto r = permanent_part(n, 0, more);
    for (auto& result: results)
        r += result.get();

    free(data);
    free(pointers);

    // For glynn we should double the result, but we will only do this during
    // the final print. This allows n=34 for an all 1 matrix to work
    // if (glynn) r *= 2;
    return r;
}

// Print 128 bit number
void Number::print(ostream& os, bool dbl) const {
    const UValue BILLION = 1000000000;

    UValue val;
    if (value < 0) {
        os << "-";
        val = -value;
    } else
        val = value;
    if (dbl) val *= 2;

    uint32_t output[5];
    for (int i=0; i<5; ++i) {
        output[i] = val % BILLION;
        val /= BILLION;
    }
    bool print = false;
    for (int i=4; i>=0; --i) {
        if (print) {
            os << setfill('0') << setw(9) << output[i];
        } else if (output[i] || i == 0) {
            print = true;
            os << output[i];
        }
    }
}

// Read matrix, check for sanity
void my_main() {
    Sum a;
    while (cin >> a)
        input.push_back(a);

    size_t n = sqrt(input.size());
    if (input.size() != n*n)
        throw(logic_error("Read " + to_string(input.size()) +
                          " elements which does not make a square matrix"));

    vector<double> columns_pos(n, 0);
    vector<double> columns_neg(n, 0);
    Sum *p = &input[0];
    for (size_t i=0; i<n; ++i)
        for (size_t j=0; j<n; ++j, ++p) {
            if (*p >= 0) columns_pos[j] += *p;
            else         columns_neg[j] -= *p;
        }
    std::array<double,WIDTH> prod;
    prod.fill(1);

    int32_t odd = 0;
    for (size_t j=0; j<n; ++j) {
        prod[j%WIDTH] *= max(columns_pos[j], columns_neg[j]);
        auto sum = static_cast<int32_t>(columns_pos[j] - columns_neg[j]);
        odd |= sum;
    }
    glynn = (odd & 1) ^ 1;
    for (Index i=0; i<WIDTH; ++i)
        // A float has an implicit 1. in front of the fraction so it can
        // represent 1 bit more than the mantissa size. And 1 << (mantissa+1)
        // itself is in fact representable
        if (prod[i] && log2(prod[i]) > FLOAT_MANTISSA+1)
            throw(range_error("Values in matrix are too large. A subproduct reaches " + to_string(prod[i]) + " which doesn't fit in a float without loss of precision"));

    for (Index i=0; i<3; ++i) {
        auto prod3 = prod[i] * prod[i+3] * prod[i+6];
        if (log2(prod3) >= CHAR_BIT*sizeof(int64_t)-1)
            throw(range_error("Values in matrix are too large. A subproduct reaches " + to_string(prod3) + " which doesn't fit in an int64"));
    }

    nr_threads = pow(2, ceil(log2(static_cast<float>(nr_threads))));
    uint64_t loops = UINT64_C(1) << n;
    if (glynn) loops /= 2;
    if (nr_threads * ELEMS > loops)
        nr_threads = max(loops / ELEMS, UINT64_C(1));
    // if (DEBUG) nr_threads = 1;

    cout << n << " x " << n << " matrix, method " << (glynn ? "Glynn" : "Ryser") << ", " << nr_threads << " threads" << endl;

    // Go for the actual calculation
    auto start = chrono::steady_clock::now();
    auto perm = permanent(n);
    auto end = chrono::steady_clock::now();
    auto elapsed = chrono::duration_cast<ms>(end-start).count();

    cout << "Permanent=";
    perm.print(cout, glynn);
    cout << " (" << elapsed / 1000. << " s)" << endl;
}

// Wrapper to print any exceptions
int main() {
    try {
        my_main();
    } catch(exception& e) {
        cerr << "Error: " << e.what() << endl;
        exit(EXIT_FAILURE);
    }
    exit(EXIT_SUCCESS);
}

C99, n ≈ 33 (35 seconds)

#include <stdint.h>
#include <stdio.h>

#define CHUNK_SIZE 12
#define NUM_THREADS 8

#define popcnt __builtin_popcountll
#define BILLION (1000 * 1000 * 1000)
#define UPDATE_ROW_PPROD() \
    update_row_pprod(row_pprod, row, rows, row_sums, mask, mask_popcnt)

typedef __int128 int128_t;

static inline int64_t update_row_pprod
(
    int64_t* row_pprod, int64_t row, int64_t* rows,
    int64_t* row_sums, int64_t mask, int64_t mask_popcnt
)
{
    int64_t temp = 2 * popcnt(rows[row] & mask) - mask_popcnt;

    row_pprod[0] *= temp;
    temp -= 1;
    row_pprod[1] *= temp;
    temp -= row_sums[row];
    row_pprod[2] *= temp;
    temp += 1;
    row_pprod[3] *= temp;

    return row + 1;
}

int main(int argc, char* argv[])
{
    int64_t size = argc - 1, rows[argc - 1];
    int64_t row_sums[argc - 1];
    int128_t permanent = 0, sign = size & 1 ? -1 : 1;

    if (argc == 2)
    {
        printf("%d\n", argv[1][0] == '-' ? -1 : 1);
        return 0;
    }

    for (int64_t row = 0; row < size; row++)
    {
        char positive = argv[row + 1][0] == '+' ? '-' : '+';

        sign *= ',' - positive;
        rows[row] = row_sums[row] = 0;

        for (char* p = &argv[row + 1][1]; *p; p++)
        {
            rows[row] <<= 1;
            rows[row] |= *p == positive;
            row_sums[row] += *p == positive;
        }

        row_sums[row] = 2 * row_sums[row] - size;
    }

    #pragma omp parallel for reduction(+:permanent) num_threads(NUM_THREADS)
    for (int64_t mask = 1; mask < 1LL << (size - 1); mask += 2)
    {
        int64_t mask_popcnt = popcnt(mask);
        int64_t row = 0;
        int128_t row_prod = 1 - 2 * (mask_popcnt & 1);
        int128_t row_prod_high = -row_prod;
        int128_t row_prod_inv = row_prod;
        int128_t row_prod_inv_high = -row_prod;

        for (int64_t chunk = 0; chunk < size / CHUNK_SIZE; chunk++)
        {
            int64_t row_pprod[4] = {1, 1, 1, 1};

            for (int64_t i = 0; i < CHUNK_SIZE; i++)
                row = UPDATE_ROW_PPROD();

            row_prod *= row_pprod[0], row_prod_high *= row_pprod[1];
            row_prod_inv *= row_pprod[3], row_prod_inv_high *= row_pprod[2];
        }

        int64_t row_pprod[4] = {1, 1, 1, 1};

        while (row < size)
            row = UPDATE_ROW_PPROD();

        row_prod *= row_pprod[0], row_prod_high *= row_pprod[1];
        row_prod_inv *= row_pprod[3], row_prod_inv_high *= row_pprod[2];
        permanent += row_prod + row_prod_high + row_prod_inv + row_prod_inv_high;
    }

    permanent *= sign;

    if (permanent < 0)
        printf("-"), permanent *= -1;

    int32_t output[5], print = 0;

    output[0] = permanent % BILLION, permanent /= BILLION;
    output[1] = permanent % BILLION, permanent /= BILLION;
    output[2] = permanent % BILLION, permanent /= BILLION;
    output[3] = permanent % BILLION, permanent /= BILLION;
    output[4] = permanent % BILLION;

    if (output[4])
        printf("%u", output[4]), print = 1;
    if (print)
        printf("%09u", output[3]);
    else if (output[3])
        printf("%u", output[3]), print = 1;
    if (print)
        printf("%09u", output[2]);
    else if (output[2])
        printf("%u", output[2]), print = 1;
    if (print)
        printf("%09u", output[1]);
    else if (output[1])
        printf("%u", output[1]), print = 1;
    if (print)
        printf("%09u\n", output[0]);
    else
        printf("%u\n", output[0]);
}

Input is currently a bit cumbersome; it is taken with rows as command line arguments, where each entry is represented by its sign, i.e., + indicates a 1 and - indicates a -1.

Test run

$ gcc -Wall -std=c99 -march=native -Ofast -fopenmp -fwrapv -o permanent permanent.c
$ ./permanent +--+ ---+ -+-+ +--+
-4
$ ./permanent ---- -+-- +--- +-+-
0
$ ./permanent +-+----- --++-++- +----+++ ---+-+++ +--++++- -+-+-++- +-+-+-+- --+-++++
192
$ ./permanent +-+--+++----++++-++- +-+++++-+--+++--+++- --+++----+-+++---+-- ---++-++++++------+- -+++-+++---+-+-+++++ +-++--+-++++-++-+--- +--+---+-++++---+++- +--+-++-+++-+-+++-++ +-----+++-----++-++- --+-+-++-+-++++++-++ -------+----++++---- ++---++--+-++-++++++ -++-----++++-----+-+ ++---+-+----+-++-+-+ +++++---+++-+-+++-++ +--+----+--++-+----- -+++-++--+++--++--++ ++--++-++-+++-++-+-+ +++---+--++---+----+ -+++-------++-++-+--
1021509632
$ time ./permanent +++++++++++++++++++++++++++++++{,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,}     # 31
8222838654177922817725562880000000

real    0m8.365s
user    1m6.504s
sys     0m0.000s
$ time ./permanent ++++++++++++++++++++++++++++++++{,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,}   # 32
263130836933693530167218012160000000

real    0m17.013s
user    2m15.226s
sys     0m0.001s
$ time ./permanent +++++++++++++++++++++++++++++++++{,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,} # 33
8683317618811886495518194401280000000

real    0m34.592s
user    4m35.354s
sys     0m0.001s

Python 2, n ≈ 28

from operator import mul

def fast_glynn_perm(M):
    row_comb = [sum(c) for c in zip(*M)]
    n=len(M)

    total = 0
    old_grey = 0 
    sign = +1

    binary_power_dict = {2**i:i for i in range(n)}
    num_loops = 2**(n-1)

    for bin_index in xrange(1, num_loops + 1):  
        total += sign * reduce(mul, row_comb)

        new_grey = bin_index^(bin_index/2)
        grey_diff = old_grey ^ new_grey
        grey_diff_index = binary_power_dict[grey_diff]

        new_vector = M[grey_diff_index]
        direction = 2 * cmp(old_grey,new_grey)      

        for i in range(n):
            row_comb[i] += new_vector[i] * direction

        sign = -sign
        old_grey = new_grey

    return total/num_loops

Uses the Glynn formula with a Gray code for updates. Runs up to n=23 in a minute on my machine. One can surely do better implementing this in a faster language and with better data structures. This doesn't use that the matrix is ±1-valued.

A Ryser formula implementation is very similar, summing over all 0/1 vectors of coefficients rather than ±1-vectors. It takes about twice as long as Glynn's formula because adds over all 2^n such vectors, whereas Glynn's halves that using symmetry to only those starting with +1.

from operator import mul

def fast_ryser_perm(M):
    n=len(M)
    row_comb = [0] * n

    total = 0
    old_grey = 0 
    sign = +1

    binary_power_dict = {2**i:i for i in range(n)}
    num_loops = 2**n

    for bin_index in range(1, num_loops) + [0]: 
        total += sign * reduce(mul, row_comb)

        new_grey = bin_index^(bin_index/2)
        grey_diff = old_grey ^ new_grey
        grey_diff_index = binary_power_dict[grey_diff]

        new_vector = M[grey_diff_index]
        direction = cmp(old_grey, new_grey)

        for i in range(n):
            row_comb[i] += new_vector[i] * direction

        sign = -sign
        old_grey = new_grey

    return total * (-1)**n