Find the number of n-by-n (-1, 0, 1) matrices with zero permanent as quickly as possible

Rust, \$A(5) = 186481694371\$ in 0.2 s, \$A(6) = 19733690332538577\$ in 580 s

(Unofficial times on a Core i7-10710U with 6 cores/12 threads.)

src/main.rs

use itertools::Itertools;
use num_integer::gcd;
use rayon::prelude::*;
use std::cmp::Ordering;
use std::collections::HashMap;
use std::convert::TryFrom;
use std::sync::Mutex;

type Count = u64;
type Coeff = i32;

// Compute a canonical ordering of the 2n "signed columns" (a column index plus
// a sign-flip bit) for the given state, by iterative partition refinement, so
// that states equivalent under column permutations and sign flips tend to
// collapse to the same representative.
fn sort_scols(n: u32, i: u32, sets: &[u32], state: &[Coeff]) -> Vec<u32> {
    let mut scol_ranks: Vec<u32> = (0..2 * n).map(|scol| (scol >> 1 > i) as u32).collect();
    let mut sorted_scols: Vec<u32> = (0..2 * n).collect();
    let mut scol_descs = vec![vec![]; 2 * n as usize];
    for (set_index, (&set, &coeff)) in sets.iter().zip(state).enumerate() {
        if coeff != 0 {
            for col in 0..n {
                if set & 1 << col != 0 {
                    scol_descs[(col as usize) << 1].push(set_index);
                    scol_descs[(col as usize) << 1 | 1].push(set_index);
                }
            }
        }
    }
    let mut sorted_set_indexes: Vec<_> = (0..sets.len()).collect();
    let mut set_ranks: Vec<u32> = vec![!0; sets.len()];
    let mut set_descs = vec![(0, vec![]); sets.len()];

    // do-while loop: keep refining the signed-column ranking until it stabilizes.
    while {
        for (&coeff, set_desc) in state.iter().zip(&mut set_descs) {
            set_desc.0 = coeff;
            set_desc.1.clear();
        }
        let mut seen = 0;
        for &scol in &sorted_scols {
            if seen & 1 << (scol >> 1) == 0 {
                seen |= 1 << (scol >> 1);
                if scol & 1 == 0 {
                    for (&set, set_desc) in sets.iter().zip(&mut set_descs) {
                        if set_desc.0 != 0 && set & 1 << (scol >> 1) != 0 {
                            set_desc.1.push(scol_ranks[scol as usize]);
                        }
                    }
                } else {
                    for (&set, set_desc) in sets.iter().zip(&mut set_descs) {
                        if set_desc.0 != 0 && set & 1 << (scol >> 1) != 0 {
                            set_desc.0 = -set_desc.0;
                            set_desc.1.push(scol_ranks[scol as usize]);
                        }
                    }
                }
            }
        }

        let set_index_key =
            |&set_index: &usize| (set_descs[set_index].0.abs(), &set_descs[set_index].1);
        sorted_set_indexes.sort_by_key(set_index_key);
        if !sets.is_empty() {
            let mut next_set_rank = 0;
            set_ranks[sorted_set_indexes[0]] = next_set_rank;
            for (&set_index0, &set_index1) in sorted_set_indexes.iter().tuple_windows() {
                if set_index_key(&set_index0) < set_index_key(&set_index1) {
                    next_set_rank += 1;
                }
                set_ranks[set_index1] = next_set_rank;
            }
        }

        let scol_set_index_key = |scol: u32| {
            let sign = 1 - 2 * (scol & 1) as Coeff;
            let set_ranks = &set_ranks[..];
            let set_descs = &set_descs[..];
            move |&set_index: &usize| (set_ranks[set_index], set_descs[set_index].0 * sign)
        };
        for (scol, scol_desc) in scol_descs.iter_mut().enumerate() {
            scol_desc.sort_by_key(scol_set_index_key(scol as u32));
        }
        let scol_cmp = |&scol0: &u32, &scol1: &u32| {
            scol_ranks[scol0 as usize]
                .cmp(&scol_ranks[scol1 as usize])
                .then_with(|| {
                    scol_descs[scol0 as usize]
                        .iter()
                        .map(scol_set_index_key(scol0))
                        .cmp(
                            scol_descs[scol1 as usize]
                                .iter()
                                .map(scol_set_index_key(scol1)),
                        )
                })
        };
        sorted_scols.sort_by(scol_cmp);

        let mut new_scol_ranks = vec![!0; 2 * n as usize];
        let mut next_rank = 0;
        let mut changed = false;
        new_scol_ranks[sorted_scols[0] as usize] = next_rank;
        for (&scol0, &scol1) in sorted_scols.iter().tuple_windows() {
            if scol_ranks[scol0 as usize] < scol_ranks[scol1 as usize] {
                next_rank += 1;
            } else if scol_cmp(&scol0, &scol1) == Ordering::Less {
                changed = true;
                next_rank += 1;
            }
            new_scol_ranks[scol1 as usize] = next_rank;
        }
        scol_ranks = new_scol_ranks;

        changed
    } {}

    let mut seen = 0;
    sorted_scols.retain(|scol| {
        seen & 1 << (scol >> 1) == 0 && {
            seen |= 1 << (scol >> 1);
            true
        }
    });
    assert_eq!(seen, !(!0 << n));

    sorted_scols
}

// Count n-by-n (-1, 0, 1) matrices with zero permanent by dynamic programming
// over coefficient vectors of the partially expanded permanent.
fn count(n: u32) -> Count {
    // Overflow guards: panic up front if the answer (at most 3^(n*n)) or the
    // coefficients (partial permanents, bounded in magnitude by n!) could overflow.
    static ERR: &str = "n is too large";
    Count::checked_pow(3, TryFrom::try_from(n * n).expect(ERR)).expect(ERR);
    (1..=Coeff::try_from(n).expect(ERR)).fold(1, |x: Coeff, y| x.checked_mul(y).expect(ERR));

    let mut cur_sets = vec![0];
    let mut cur_set_indexes = vec![0];

    let mut cur_states = HashMap::new();
    cur_states.insert(vec![1], 1);

    for k in 0..n {
        // Fill matrix element a[k][i].
        for i in 0..n {
            // c_S with |S| = k stays live if some column after i can still
            // extend it; c_S with |S| = k + 1 stays live if it avoids column i.
            let keep_set = |&set: &u32| {
                if set.count_ones() == k {
                    !set & !1 << i & !(!0 << n) != 0
                } else {
                    set & 1 << i == 0
                }
            };
            let enlarge_set = |&set: &u32| set.count_ones() == k && set & 1 << i == 0;
            let new_sets: Vec<u32> = cur_sets
                .iter()
                .copied()
                .filter(keep_set)
                .chain(
                    cur_sets
                        .iter()
                        .copied()
                        .filter(enlarge_set)
                        .map(|set| set | 1 << i),
                )
                .collect();
            let mut new_set_indexes = vec![!0; 1 << n];
            for (set_index, &set) in new_sets.iter().enumerate() {
                new_set_indexes[set as usize] = set_index;
            }

            let new_states = Mutex::new(HashMap::new());
            cur_states.into_par_iter().for_each(|(cur_state, count)| {
                for elt in -1..=1 {
                    let mut new_state: Vec<Coeff> = cur_sets
                        .iter()
                        .copied()
                        .zip(&cur_state)
                        .filter(|(set, _)| keep_set(set))
                        .map(|(_, &coeff)| coeff)
                        .chain(
                            cur_sets
                                .iter()
                                .copied()
                                .zip(&cur_state)
                                .filter(|(set, _)| enlarge_set(set))
                                .map(|(set, &coeff)| {
                                    let mut new_coeff = coeff * elt;
                                    if set & !(!0 << i) != 0 {
                                        new_coeff +=
                                            cur_state[cur_set_indexes[set as usize | 1 << i]];
                                    }
                                    new_coeff
                                }),
                        )
                        .collect();

                    // Normalize by the gcd: rescaling the whole coefficient
                    // vector doesn't change whether the permanent is zero.
                    let factor = new_state.iter().copied().fold(0, gcd);
                    if factor > 1 {
                        for coeff in &mut new_state {
                            *coeff /= factor;
                        }
                    }

                    let sorted_scols = sort_scols(n, i, &new_sets, &new_state);

                    // Permute columns and apply sign flips to bring the
                    // coefficient vector into the canonical order.
                    let new_state: Vec<Coeff> = new_sets
                        .iter()
                        .map(|&set| {
                            let mut permuted_set = 0;
                            let mut sign = 0;
                            for (col, &permuted_scol) in sorted_scols.iter().enumerate() {
                                sign ^= set >> col & permuted_scol & 1;
                                permuted_set |= (set >> col & 1) << (permuted_scol >> 1);
                            }
                            (1 - 2 * sign as Coeff)
                                * new_state[new_set_indexes[permuted_set as usize] as usize]
                        })
                        .collect();

                    *new_states.lock().unwrap().entry(new_state).or_insert(0) += count;
                }
            });

            cur_sets = new_sets;
            cur_set_indexes = new_set_indexes;
            cur_states = new_states.into_inner().unwrap();
        }
    }

    // After all n^2 elements are filled, the state is the single normalized
    // coefficient of the full permanent; [0] means the permanent is zero.
    *cur_states.get(&vec![0]).unwrap_or(&0)
}

fn main() {
    for n in 1..=6 {
        println!("{} {}", n, count(n));
    }
}

Cargo.toml

[package]
name = "permanent"
version = "0.1.0"
authors = ["Anders Kaseorg <[email protected]>"]
edition = "2018"

[dependencies]
num-integer = "0.1.41"
rayon = "1.2.1"
itertools = "0.8.2"

Building and running

$ cargo build --release
$ time target/release/permanent
1 1
2 33
3 7555
4 13482049
5 186481694371
6 19733690332538577
5995.30user 282.67system 9:42.97elapsed 1076%CPU (0avgtext+0avgdata 6479028maxresident)k
0inputs+0outputs (0major+3259222minor)pagefaults 0swaps

How it works

This counts matrices using dynamic programming, so we don’t need to enumerate all of them directly. The state of a partially filled matrix is represented by the coefficients of the permanent viewed as a polynomial in the remaining unfilled elements. Specifically, it corresponds to a partial execution of the following algorithm for calculating the permanent:

\$\operatorname{perm} A = c_{\{0, 1, \dotsc, n - 1\}}\$, where \$c_\varnothing = 1\$, \$c_S = \sum_{i \in S} c_{S \smallsetminus \{i\}}a_{|S| - 1, i}\$ for \$S \ne \varnothing\$.
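
In code, the same recurrence gives the textbook \$O(2^n n)\$ evaluation of the permanent over column subsets. Here is a minimal sketch for illustration (the `permanent` function is mine, not part of the solver):

fn permanent(n: usize, a: &[Vec<i64>]) -> i64 {
    // c[s] is the coefficient c_S for the column subset encoded by bitmask s.
    let mut c = vec![0i64; 1 << n];
    c[0] = 1; // c_∅ = 1
    for s in 1usize..1 << n {
        let k = s.count_ones() as usize - 1; // entries come from row |S| - 1
        let mut acc = 0;
        for i in 0..n {
            if s & 1 << i != 0 {
                acc += c[s & !(1 << i)] * a[k][i];
            }
        }
        c[s] = acc;
    }
    c[(1 << n) - 1] // c_{0, 1, ..., n-1} = perm A
}

The solver effectively runs this computation one matrix element at a time, keeping only the c_S values that are still live.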

For example, here’s the \$n = 4\$ calculation written out explicitly:

c = 1

c0 = c * a00
c1 = c * a01
c2 = c * a02
c3 = c * a03

c01  = c1 * a10; c02  = c2 * a10; c03  = c3 * a10
c01 += c0 * a11; c12  = c2 * a11; c13  = c3 * a11
c02 += c0 * a12; c12 += c1 * a12; c23  = c3 * a12
c03 += c0 * a13; c13 += c1 * a13; c23 += c2 * a13

c012  = c12 * a20; c013  = c13 * a20; c023  = c23 * a20
c012 += c02 * a21; c013 += c03 * a21; c123  = c23 * a21
c012 += c01 * a22; c023 += c03 * a22; c123 += c13 * a22
c013 += c01 * a23; c023 += c02 * a23; c123 += c12 * a23

c0123  = c123 * a30
c0123 += c023 * a31
c0123 += c013 * a32
c0123 += c012 * a33

return c0123

After executing each line, we can forget the filled elements a… and represent the current state with just the c… variables that are still live. This representation collapses many different partially filled matrices into the same coefficient vector. For example, swapping any two filled rows results in the same coefficient vector.

The sort_scols function further collapses the state space by using the symmetries of column permutations and column sign flipping to try to find a canonical representation for each state. It doesn’t do a perfect job, but this imperfection only impacts performance, not correctness.
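
The underlying identities: for a permutation matrix \$P\$ and a diagonal sign matrix \$D = \operatorname{diag}(d_0, \dotsc, d_{n-1})\$ with \$d_i \in \{1, -1\}\$, we have \$\operatorname{perm}(AP) = \operatorname{perm}(A)\$ and \$\operatorname{perm}(AD) = \left(\prod_i d_i\right) \operatorname{perm}(A)\$, so neither operation changes whether the permanent is zero.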


C# (\$\textrm{A330134}(5)=186481694371\$ in under 10 seconds)

using System;
using System.Collections.Generic;
using System.Linq;
using System.Numerics;

namespace Sandbox
{
    // See https://codegolf.stackexchange.com/q/196712
    static class Permanent
    {
        const int numThreads = 12;

        static void Main(string[] args)
        {
            var sw = new System.Diagnostics.Stopwatch();
            sw.Start();
            for (int n = 1; n < 8; n++)
            {
                Console.WriteLine($"{n}\t{_PPCG196712_LaplaceMT(n)}\t{sw.ElapsedMilliseconds}ms");
            }
        }

        internal static BigInteger _PPCG196712_LaplaceMT(int n)
        {
            // If we have a zero row then the permanent is zero.
            // If the first zero row is number i then the previous i rows have (3^n)-1 possibilities each, and the remaining ones have 3^n.
            BigInteger result = 0;
            int threePowN = (int)BigInteger.Pow(3, n);
            for (int i = 0; i < n; i++)
            {
                result += BigInteger.Pow(threePowN - 1, i) * BigInteger.Pow(threePowN, n - i - 1);
            }

            // Let's call a row containing a permutation of 1^1 0^{n-1} a "singleton row".
            // Under Laplace expansion it simplifies to a single {n-1}x{n-1} permanent.
            // If we have two singleton rows with the same non-zero index, that {n-1}x{n-1} permanent has a zero row, so the nxn permanent must be zero.
            // If we have multiple distinct singleton rows, we still need to worry about double-counting.
            // But all this effort would give roughly a 1.5% improvement at n=6, so is probably not worth it.

            // Now the interesting rows. We cluster similar rows as a technique to optimise column permutations.
            var clusters = new List<Cluster>();
            for (int ones = 1; ones <= n; ones++)
                for (int negs = 0; negs <= ones && ones + negs <= n; negs++)
                    clusters.Add(new Cluster(n, ones, negs));

            // Consider partitions of n-1 by cluster.
            foreach (var partition in PartitionAssignments(clusters, n-1))
            {
                var subclusters = partition.OrderBy(tuple =>
                {
                    var (cluster, k) = tuple;
                    // What's the speedup factor of pivoting on this cluster?
                    return k < cluster.DistinctCounts.Count ? cluster.DistinctCounts[k] / (double)cluster.NaiveCounts[k] : 1;
                }).ToList();

                // We pivot on the first cluster, and take a Cartesian product.
                result += Enumerable.Range(0, numThreads).AsParallel().
                    Select(threadId => CountByPartition(n, subclusters, 0, new sbyte[n - 1][], 0, 1, threadId)).
                    Aggregate(BigInteger.Zero, (a, b) => a + b);
            }

            return result;
        }

        private static BigInteger CountByPartition(int n, IReadOnlyList<(Cluster cluster, int repeat)> clusters, int clusterIdx, sbyte[][] matrix, int rowIdx, BigInteger weight, int threadId)
        {
            if (clusterIdx == clusters.Count) return weight * Count_Laplace(matrix) << (n - 1);

            int k = clusters[clusterIdx].repeat;
            if (clusterIdx == 0 && clusters[0].repeat < clusters[0].cluster.DistinctCounts.Count)
            {
                // Pivot
                weight *= Binom(n - 1, k) * Factorial(n - 1 - k); // Choose the pivoted rows and then permute the others freely

                BigInteger sum = 0;
                var reps = clusters[0].cluster.Representatives[k];

                int delta = reps.Count >= numThreads || clusterIdx == clusters.Count - 1 ? numThreads : 1;
                int skip = delta == 1 ? 0 : threadId;

                for (int repIdx = skip; repIdx < reps.Count; repIdx += delta)
                {
                    var (prefixWeight, prefix) = reps[repIdx];
                    for (int i = 0; i < k; i++) matrix[rowIdx + i] = prefix[i];
                    sum += CountByPartition(n, clusters, clusterIdx + 1, matrix, rowIdx + k, weight * prefixWeight, delta == 1 ? threadId : -1);
                }
                return sum;
            }

            {
                if (clusterIdx == 0) weight = Factorial(n - 1); // Permute all rows freely

                var rows = clusters[clusterIdx].cluster.Rows;
                var indices = new int[k];
                BigInteger sum = 0;

                int skip = threadId > 0 ? threadId : 0;
                int delta = threadId >= 0 ? numThreads : 1;

                while (true)
                {
                    if (skip == 0)
                    {
                        for (int i = 0; i < k; i++)
                        {
                            var j = indices[i];
                            var inRow = rows[j];
                            matrix[rowIdx + i] = inRow;
                        }

                        // Adjust the weight to account for duplicates in this cluster.
                        var recurWeight = weight;
                        int currRun = 1;
                        for (int i = 1; i < k; i++)
                        {
                            if (indices[i] == indices[i - 1]) currRun++;
                            else { recurWeight /= Factorial(currRun); currRun = 1; }
                        }
                        recurWeight /= Factorial(currRun);

                        sum += CountByPartition(n, clusters, clusterIdx + 1, matrix, rowIdx + k, recurWeight, -1);
                    }

                    skip--;
                    if (skip < 0) skip += delta;

                    if (indices[0] == rows.Count - 1) break;
                    NextOrderedMultiset(indices, rows.Count);
                }

                return sum;
            }
        }

        // Enumerates the ways to split n into unordered parts, assigning each
        // part to a distinct element of elts.
        private static IEnumerable<IReadOnlyList<(T, int)>> PartitionAssignments<T>(IReadOnlyList<T> elts, int n)
        {
            IEnumerable<IReadOnlyList<(T, int)>> Inner(int m, int maxPart, int off, List<(T, int)> prefix, ISet<int> indicesUsed)
            {
                if (m == 0)
                {
                    yield return new List<(T, int)>(prefix);
                    yield break;
                }

                for (int part = Math.Min(m, maxPart); part > 0; part--)
                {
                    // When we reduce the part, we re-enable skipped elts.
                    for (int i = part == maxPart ? off : 0; i < elts.Count; i++)
                    {
                        if (indicesUsed.Contains(i)) continue;

                        prefix.Add((elts[i], part));
                        indicesUsed.Add(i);

                        foreach (var soln in Inner(m - part, part, i + 1, prefix, indicesUsed)) yield return soln;

                        indicesUsed.Remove(i);
                        prefix.RemoveAt(prefix.Count - 1);
                    }
                }
            }

            return Inner(n, n, 0, new List<(T, int)>(), new HashSet<int>());
        }

        // Advances indices to the next weakly increasing tuple over {0, ..., base-1}.
        private static void NextOrderedMultiset(int[] indices, int @base)
        {
            int j = indices.Length - 1;
            while (indices[j] == @base - 1) j--;
            indices[j]++;
            j++;
            while (j < indices.Length) { indices[j] = indices[j - 1]; j++; }
        }

        // Ryser-style Gray-code sweep over column subsets of the (n-1)-row matrix,
        // producing a per-column weight for the Laplace-expanded last row; then
        // counts the last rows that make the permanent zero via subset sums.
        private static BigInteger Count_Laplace(sbyte[][] matrix)
        {
            int m = matrix.Length;
            int n = m + 1;
            BigInteger[] sums = new BigInteger[m];
            uint gray = 0;
            int sign = 1;
            BigInteger[] weights = new BigInteger[n];
            for (uint i = 0; i < 1u << n; i++)
            {
                BigInteger term = 1;
                foreach (var sum in sums) term *= sum;

                for (int j = 0; j < n; j++)
                {
                    if (((gray >> j) & 1) == 1) weights[j] += sign * term;
                }

                // Gray code update
                int flipPos = (i + 1).CountTrailingZeros();
                if (flipPos == n) break;

                uint bit = 1u << flipPos;
                if ((gray & bit) == bit)
                {
                    for (int j = 0; j < m; j++) sums[j] -= matrix[j][flipPos];
                }
                else
                {
                    for (int j = 0; j < m; j++) sums[j] += matrix[j][flipPos];
                }
                gray ^= bit;
                sign = -sign;
            }

            // Count weighted subsets which total zero, optimising with meet-in-the-middle
            var distribL = SubsetSums(weights, 0, n >> 1);
            var distribR = SubsetSums(weights, n >> 1, n - (n >> 1));
            BigInteger total = -1; // Discount the all-zero last row (the empty column choice)
            foreach (var kvp in distribL)
            {
                if (distribR.TryGetValue(-kvp.Key, out var v2)) total += kvp.Value * v2;
            }

            return total;
        }

        private static Dictionary<BigInteger, BigInteger> SubsetSums(BigInteger[] weights, int off, int len)
        {
            var distrib = new Dictionary<BigInteger, BigInteger> { [0] = 1 };
            for (int i = 0; i < len; i++)
            {
                var weight = weights[off + i];
                var nextDistrib = new Dictionary<BigInteger, BigInteger>(distrib);
                foreach (var kvp in distrib)
                {
                    nextDistrib.TryGetValue(kvp.Key - weight, out var tmp);
                    nextDistrib[kvp.Key - weight] = tmp + kvp.Value;

                    nextDistrib.TryGetValue(kvp.Key + weight, out tmp);
                    nextDistrib[kvp.Key + weight] = tmp + kvp.Value;
                }

                distrib = nextDistrib;
            }

            return distrib;
        }

        class Cluster
        {
            internal readonly IReadOnlyList<sbyte[]> Rows;

            internal readonly IReadOnlyList<int> NaiveCounts;
            internal readonly IReadOnlyList<int> DistinctCounts;
            internal readonly IReadOnlyList<IReadOnlyList<(int weight, IReadOnlyList<sbyte[]> prefix)>> Representatives;

            internal Cluster(int n, int ones, int negs)
            {
                if (ones == 0) throw new ArgumentOutOfRangeException(nameof(ones));
                if (negs > ones) throw new ArgumentOutOfRangeException(nameof(negs));

                Rows = BuildRows(n, ones, negs);

                // Build pairs and triples, and calculate their advantage factors.
                var naiveCounts = new List<int> { 1, Rows.Count };
                var distinctCounts = new List<int> { 1, 1 };
                var representatives = new List<IReadOnlyList<(int, IReadOnlyList<sbyte[]>)>>
                {
                    new List<(int, IReadOnlyList<sbyte[]>)>{ (1, new List<sbyte[]>()) },
                    new List<(int, IReadOnlyList<sbyte[]>)>{ (Rows.Count, new List<sbyte[]> { Rows.First() }) }
                };

                for (int k = 2; k < 4; k++)
                {
                    var counts = new Dictionary<BigInteger, int>();
                    var reps = new Dictionary<BigInteger, IReadOnlyList<sbyte[]>>();
                    int uncollapsed = 0;

                    var indices = new int[k];
                    while (true)
                    {
                        var subset = new sbyte[k][];
                        for (int i = 0; i < k; i++) subset[i] = Rows[indices[i]];

                        BigInteger count = Factorial(k);
                        int currRun = 1;
                        for (int i = 1; i < k; i++)
                        {
                            if (indices[i] == indices[i - 1]) currRun++;
                            else { count /= Factorial(currRun); currRun = 1; }
                        }
                        count /= Factorial(currRun);

                        uncollapsed += (int)count;
                        var encoding = CanonicalEncoding(subset);
                        counts.TryGetValue(encoding, out var oldCount);
                        counts[encoding] = oldCount + (int)count;
                        if (oldCount == 0) reps[encoding] = (sbyte[][])subset.Clone();

                        if (indices[0] == Rows.Count - 1) break;
                        NextOrderedMultiset(indices, Rows.Count);
                    }

                    naiveCounts.Add(uncollapsed);
                    distinctCounts.Add(reps.Count);
                    representatives.Add(counts.Select(kvp => (kvp.Value, reps[kvp.Key])).ToList());
                }

                NaiveCounts = naiveCounts;
                DistinctCounts = distinctCounts;
                Representatives = representatives;
            }

            private static List<sbyte[]> BuildRows(int n, int ones, int negs)
            {
                var rows = new List<sbyte[]>();
                void Append(sbyte[] row)
                {
                    // Ensure that we don't have both a row and its negative by testing canonicity.
                    if (ones > negs || Array.IndexOf(row, (sbyte)1) < Array.IndexOf(row, (sbyte)-1)) rows.Add((sbyte[])row.Clone());
                }

                var perm = new sbyte[n];
                for (int i = 0; i < negs; i++) perm[i] = -1;
                for (int i = n - ones; i < n; i++) perm[i] = 1;

                while (true)
                {
                    Append(perm);

                    // Next permutation
                    int k;
                    for (k = n - 2; k >= 0; k--)
                    {
                        if (perm[k] < perm[k + 1]) break;
                    }
                    if (k == -1) return rows;

                    int l;
                    for (l = n - 1; l > k; l--)
                    {
                        if (perm[k] < perm[l]) break;
                    }

                    var tmp = perm[k]; perm[k] = perm[l]; perm[l] = tmp;

                    for (int i = k + 1, j = n - 1; i < j; i++, j--)
                    {
                        tmp = perm[i]; perm[i] = perm[j]; perm[j] = tmp;
                    }
                }
            }

            private static BigInteger CanonicalEncoding(sbyte[][] rows)
            {
                return rows.Permutations().Select(_CanonicalEncodingInner).Min();
            }

            private static BigInteger _CanonicalEncodingInner(sbyte[][] rows)
            {
                BigInteger @base = BigInteger.Pow(3, rows.Length);
                BigInteger[] digits = new BigInteger[rows[0].Length];
                for (int i = 0; i < digits.Length; i++)
                {
                    for (int j = 0; j < rows.Length; j++)
                    {
                        digits[i] = digits[i] * 3 + rows[j][i] + 1;
                    }
                }
                return digits.OrderBy(x => x).Aggregate(BigInteger.Zero, (accum, digit) => accum * @base + digit);
            }
        }

        private static readonly int[] _DeBruijn32 = new int[]
        {
            0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
            31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
        };

        // Returns 32 for input 0.
        public static int CountTrailingZeros(this uint i)
        {
            if (i == 0) return 32;

            int j = (int)i;
            i = (uint)(j & -j);
            var idx = (i * 0x077CB531U) >> 27;
            return _DeBruijn32[idx];
        }

        public static IEnumerable<T[]> Permutations<T>(this IEnumerable<T> elts)
        {
            T[] arr = elts.ToArray();
            if (arr.Length == 0)
            {
                yield return arr;
                yield break;
            }

            int[] indices = Enumerable.Range(0, arr.Length).ToArray();
            while (true)
            {
                yield return arr.ToArray();
                // Next permutation
                int i = indices.Length - 1;
                while (i > 0 && indices[i - 1] > indices[i]) i--;

                if (i == 0) yield break;

                int j = indices.Length - 1;
                while (indices[j] < indices[i - 1]) j--;

                _DoubleSwap(arr, indices, i - 1, j);
                for (j = indices.Length - 1; i < j; i++, j--) _DoubleSwap(arr, indices, i, j);
            }
        }

        private static void _DoubleSwap<T>(T[] elts, int[] indices, int i, int j)
        {
            var t1 = elts[i];
            elts[i] = elts[j];
            elts[j] = t1;

            var t2 = indices[i];
            indices[i] = indices[j];
            indices[j] = t2;
        }

        private static BigInteger Factorial(int n) => n < 2 ? BigInteger.One : n * Factorial(n - 1);

        private static BigInteger Binom(int n, int k)
        {
            BigInteger result = 1;
            for (int i = 0; i < k; i++) result = result * (n - i) / (i + 1);
            return result;
        }
    }
}

Online demo

This employs the following tricks:

  • If there's a zero row, the permanent is zero. There are \$3^{n^2} - (3^n - 1)^n\$ such matrices (the code sums a geometric series over the index of the first zero row), so they can be counted very fast, and then we can assume there's no zero row.
  • Multiplying a row by \$-1\$ inverts the sign of the permanent. So we can assume a canonical sign for each row and then multiply by \$2^n\$.
  • Permuting the rows leaves the permanent unchanged, so we can assume a canonical permutation of the rows and then multiply by a suitable factor.
  • The permanent can be computed in better-than-brute-force time using Ryser's formula with Gray code enumeration of subsets of columns (see the first sketch after this list).
  • Laplace expansion of the last row turns a factor of \$\frac{3^n - 1}{2}\$ into an instance of the subset sum problem, which can be tackled using dynamic programming with meet-in-the-middle (see the second sketch after this list).
  • We can also permute columns, which is what Cluster is for. I think I've pushed the Cluster approach about as far as it can go, but I would like to find an efficient way of replacing it, because if we take the first row [1, 1, 0, -1, -1, -1] (with weight 60) then it would be nice to exploit the symmetry of the first two columns and the last three columns in the next cluster. The problem is that doing this naïvely adds a factor of \$n!\$, which is more expensive than the saving.
  • The permutations are independent, so after a bit of pre-processing we have an embarrassingly parallel problem.
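
As an illustration of the Ryser bullet above, here is a minimal self-contained sketch (in Rust rather than C#, with names of my own choosing) of Ryser's formula \$\operatorname{perm} A = (-1)^n \sum_{S \subseteq \{0, \dotsc, n-1\}} (-1)^{|S|} \prod_i \sum_{j \in S} a_{ij}\$ with Gray-code updates, so each subset costs \$O(n)\$ work instead of \$O(n^2)\$:

fn ryser(n: usize, a: &[Vec<i64>]) -> i64 {
    let mut sums = vec![0i64; n]; // row sums over the current column subset S
    let mut gray = 0usize; // bitmask of S, visited in Gray-code order
    // Running sign (-1)^n * (-1)^{|S|}, starting from |S| = 0.
    let mut sign: i64 = if n % 2 == 0 { 1 } else { -1 };
    let mut total = 0i64;
    for i in 1usize..1 << n {
        // Each Gray-code step toggles exactly one column in or out of S.
        let j = i.trailing_zeros() as usize;
        let bit = 1usize << j;
        if gray & bit != 0 {
            for r in 0..n {
                sums[r] -= a[r][j];
            }
        } else {
            for r in 0..n {
                sums[r] += a[r][j];
            }
        }
        gray ^= bit;
        sign = -sign;
        total += sign * sums.iter().product::<i64>();
    }
    total
}

Count_Laplace follows the same Gray-code pattern over the \$(n-1) \times n\$ matrix, but accumulates a per-column weight for the Laplace-expanded last row instead of a single total.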
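
And for the subset-sum bullet, a sketch of the meet-in-the-middle count (again Rust, names mine, machine integers instead of BigInteger for brevity): each column weight enters the last row with coefficient \$-1\$, \$0\$, or \$1\$, and we count the combinations summing to zero by splitting the weights in half and matching opposite partial sums:

use std::collections::HashMap;

// Counts choices of a coefficient in {-1, 0, 1} per weight with total zero,
// including the all-zero choice (which the C# code discounts via total = -1).
fn zero_sums(weights: &[i64]) -> u64 {
    // Distribution of achievable sums over one half of the weights.
    fn sums(ws: &[i64]) -> HashMap<i64, u64> {
        let mut d = HashMap::from([(0i64, 1u64)]);
        for &w in ws {
            let mut next = d.clone(); // coefficient 0: keep each sum as is
            for (&s, &c) in &d {
                *next.entry(s - w).or_insert(0) += c; // coefficient -1
                *next.entry(s + w).or_insert(0) += c; // coefficient +1
            }
            d = next;
        }
        d
    }
    let (l, r) = weights.split_at(weights.len() / 2);
    let (dl, dr) = (sums(l), sums(r));
    // A zero total pairs each left sum s with the right sums equal to -s.
    dl.iter()
        .map(|(&s, &c)| c * dr.get(&-s).copied().unwrap_or(0))
        .sum()
}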

I compiled with VS2017 Community, targeting .NET Framework 4.7.1, in "Release" mode with default settings. You may prefer to use .NET Core.

\$n=6\$ is still technically out of range: I calculated it in just under 3 hours on a 4-core AMD with numThreads = 8.