Longest non-repeating Game-of-Life sequence

C++ up to N=6

3x3 answer 3: 111 000 000                                                                                        
4x4 answer 10: 1110 0010 1100 0000                                                                               
5x5 answer 52: 11010 10000 11011 10100 00000                                                                     
6x6 answer 91: 100011 010100 110011 110100 101000 100000                                                         

This technique is centered around a fast next state function. Each board is represented as a bitmask of N^2 bits, and bit-twiddly tricks are used to compute the next state for all cells at once. next compiles down to about 200 100 assembly instructions for N <= 8. Then we just do the standard try-all-states-until-they-repeat and pick the longest one.

Takes a few seconds up to 5x5, a few hours for 6x6.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
using namespace std;

#define N 6

typedef uint64_t board;

// board is N bits by N bits, with indexes like this (N=4):                                                        
//  0  1  2  3                                                                                                     
//  4  5  6  7                                                                                                     
//  8  9 10 11                                                                                                     
// 12 13 14 15                                                                                                     

#if N==3
#define LEFT_COL (1 + (1<<3) + (1<<6))
#define RIGHT_COL ((1<<2) + (1<<5) + (1<<8))
#define ALL 0x1ffULL
#elif N==4
#define LEFT_COL 0x1111ULL
#define RIGHT_COL 0x8888ULL
#define ALL 0xffffULL
#elif N==5
#define LEFT_COL (1ULL + (1<<5) + (1<<10) + (1<<15) + (1<<20))
#define RIGHT_COL ((1ULL<<4) + (1<<9) + (1<<14) + (1<<19) + (1<<24))
#define ALL 0x1ffffffULL
#elif N==6
#define LEFT_COL (1 + (1<<6) + (1<<12) + (1<<18) + (1<<24) + (1ULL<<30))
#define RIGHT_COL ((1<<5) + (1<<11) + (1<<17) + (1<<23) + (1<<29) + (1ULL<<35))
#define ALL 0xfffffffffULL
#else
#error "bad N"
#endif

static inline board north(board b) {
  return (b >> N) + (b << N*N-N);
}
static inline board south(board b) {
  return (b << N) + (b >> N*N-N);
}
static inline board west(board b) {
  return ((b & ~LEFT_COL) >> 1) + ((b & LEFT_COL) << N-1);
}
static inline board east(board b) {
  return ((b & ~RIGHT_COL) << 1) + ((b & RIGHT_COL) >> N-1);
}

board next(board b) {
  board n1 = north(b);
  board n2 = south(b);
  board n3 = west(b);
  board n4 = east(b);
  board n5 = north(n3);
  board n6 = north(n4);
  board n7 = south(n3);
  board n8 = south(n4);

  // add all the bits bitparallel-y to a 2-bit accumulator with overflow
  board a0 = 0;
  board a1 = 0;
  board overflow = 0;
#define ADD(x) overflow |= a0 & a1 & x; a1 ^= a0 & x; a0 ^= x;

  a0 = n1; // no carry yet
  a1 ^= a0 & n2; a0 ^= n2; // no overflow yet
  a1 ^= a0 & n3; a0 ^= n3; // no overflow yet
  ADD(n4);
  ADD(n5);
  ADD(n6);
  ADD(n7);
  ADD(n8);
  return (a1 & (b | a0)) & ~overflow & ALL;
}
void print(board b) {
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      printf("%d", (int)(b >> i*N+j & 1));
    }
    printf(" ");
  }
  if (b >> N*N) printf("*");
  printf("\n");
}

int main(int argc, char *argv[]) {
  // Somewhere in the starting pattern there are a 1 and 0 together.  Using translational                          
  // and rotational symmetry, we can put these in the first two bits.  So we need only consider                    
  // 1 mod 4 boards.                                                                                               

  board steps[10000];
  int maxsteps = -1;
  for (board b = 1; b < 1ULL << N*N; b += 4) {
    int nsteps = 0;
    board x = b;
    while (true) {
      int repeat = steps + nsteps - find(steps, steps + nsteps, x);
      if (repeat > 0) {
        if (repeat == 1 && nsteps > maxsteps) {
          printf("%d: ", nsteps);
          print(b);
          maxsteps = nsteps;
        }
        break;
      }
      steps[nsteps++] = x;
      x = next(x);
    }
  }
}

I see the following possible solution approaches:

  • Heavy theory. I know there is some literature on Life on a torus, but I haven't read much of it.
  • Brute force forwards: for every possible board, check its score. This is basically what Matthew and Keith's approaches do, although Keith reduces the number of boards to check by a factor of 4.
    • Optimisation: canonical representation. If we can check whether a board is in canonical representation much quicker than it takes to evaluate its score, we get a speed-up of a factor of about 8N^2. (There are also partial approaches with smaller equivalence classes).
    • Optimisation: DP. Cache the score for each board, so that rather than walking them through until they converge or diverge we just walk until we find a board we've seen before. In principle this would give a speed-up by a factor of the average score / cycle length (maybe 20 or more), but in practice we're likely to be swapping heavily. E.g. for N=6 we'd need capacity for 2^36 scores, which at a byte per score is 16GB, and we need random access so we can't expect good cache locality.
    • Combine the two. For N=6, the full canonical representation would allow us to reduce the DP cache to about 60 mega-scores. This is a promising approach.
  • Brute force backwards. This seems odd at first, but if we assume that we can easily find still lifes and that we can easily invert the Next(board) function, we see that it has some benefits, although not as many as I originally hoped.
    • We don't bother with diverging boards at all. Not much of a saving, because they are quite rare.
    • We don't need to store scores for all of the boards, so there should be less memory pressure than the forward DP approach.
    • Working backwards is actually quite easy by varying a technique I saw in the literature in the context of enumerating still lifes. It works by treating each column as a letter in an alphabet and then observing that a sequence of three letters determines the middle one in the next generation. The parallel with enumerating still lifes is so close that I've refactored them together into an only slightly awkward method, Prev2.
    • It might seem that we can just canonicalise the still lifes, and get something like the 8N^2 speed-up for very little cost. However, empirically we still get a big reduction in the number of boards considered if we canonicalise at each step.
    • A surprisingly high proportion of boards have a score of 2 or 3, so there is still memory pressure. I found it necessary to canonicalise on the fly rather than building the previous generation and then canonicalising. It might be interesting to reduce memory usage by doing depth-first rather than breadth-first search, but doing so without overflowing the stack and without doing redundant calculations requires a co-routine / continuation approach to enumerating the previous boards.

I don't think that micro-optimisation will let me catch up with Keith's code, but for the sake of interest I'll post what I have. This solves N=5 in about a minute on a 2GHz machine using Mono 2.4 or .Net (without PLINQ) and in about 20 seconds using PLINQ; N=6 runs for many hours.

using System;
using System.Collections.Generic;
using System.Linq;

namespace Sandbox {
    class Codegolf9393 {
        internal static void Main() {
            new Codegolf9393(4).Solve();
        }

        private readonly int _Size;
        private readonly uint _AlphabetSize;
        private readonly uint[] _Transitions;
        private readonly uint[][] _PrevData1;
        private readonly uint[][] _PrevData2;
        private readonly uint[,,] _CanonicalData;

        private Codegolf9393(int size) {
            if (size > 8) throw new NotImplementedException("We need to fit the bits in a ulong");

            _Size = size;
            _AlphabetSize = 1u << _Size;

            _Transitions = new uint[_AlphabetSize * _AlphabetSize * _AlphabetSize];
            _PrevData1 = new uint[_AlphabetSize * _AlphabetSize][];
            _PrevData2 = new uint[_AlphabetSize * _AlphabetSize * _AlphabetSize][];
            _CanonicalData = new uint[_Size, 2, _AlphabetSize];

            InitTransitions();
        }

        private void InitTransitions() {
            HashSet<uint>[] tmpPrev1 = new HashSet<uint>[_AlphabetSize * _AlphabetSize];
            HashSet<uint>[] tmpPrev2 = new HashSet<uint>[_AlphabetSize * _AlphabetSize * _AlphabetSize];
            for (int i = 0; i < tmpPrev1.Length; i++) tmpPrev1[i] = new HashSet<uint>();
            for (int i = 0; i < tmpPrev2.Length; i++) tmpPrev2[i] = new HashSet<uint>();

            for (uint i = 0; i < _AlphabetSize; i++) {
                for (uint j = 0; j < _AlphabetSize; j++) {
                    uint prefix = Pack(i, j);
                    for (uint k = 0; k < _AlphabetSize; k++) {
                        // Build table for forwards checking
                        uint jprime = 0;
                        for (int l = 0; l < _Size; l++) {
                            uint count = GetBit(i, l-1) + GetBit(i, l) + GetBit(i, l+1) + GetBit(j, l-1) + GetBit(j, l+1) + GetBit(k, l-1) + GetBit(k, l) + GetBit(k, l+1);
                            uint alive = GetBit(j, l);
                            jprime = SetBit(jprime, l, (count == 3 || (alive + count == 3)) ? 1u : 0u);
                        }
                        _Transitions[Pack(prefix, k)] = jprime;

                        // Build tables for backwards possibilities
                        tmpPrev1[Pack(jprime, j)].Add(k);
                        tmpPrev2[Pack(jprime, i, j)].Add(k);
                    }
                }
            }

            for (int i = 0; i < tmpPrev1.Length; i++) _PrevData1[i] = tmpPrev1[i].ToArray();
            for (int i = 0; i < tmpPrev2.Length; i++) _PrevData2[i] = tmpPrev2[i].ToArray();

            for (uint col = 0; col < _AlphabetSize; col++) {
                _CanonicalData[0, 0, col] = col;
                _CanonicalData[0, 1, col] = VFlip(col);
                for (int rot = 1; rot < _Size; rot++) {
                    _CanonicalData[rot, 0, col] = VRotate(_CanonicalData[rot - 1, 0, col]);
                    _CanonicalData[rot, 1, col] = VRotate(_CanonicalData[rot - 1, 1, col]);
                }
            }
        }

        private ICollection<ulong> Prev2(bool stillLife, ulong next, ulong prev, int idx, ICollection<ulong> accum) {
            if (stillLife) next = prev;

            if (idx == 0) {
                for (uint a = 0; a < _AlphabetSize; a++) Prev2(stillLife, next, SetColumn(0, idx, a), idx + 1, accum);
            }
            else if (idx < _Size) {
                uint i = GetColumn(prev, idx - 2), j = GetColumn(prev, idx - 1);
                uint jprime = GetColumn(next, idx - 1);
                uint[] succ = idx == 1 ? _PrevData1[Pack(jprime, j)] : _PrevData2[Pack(jprime, i, j)];
                foreach (uint b in succ) Prev2(stillLife, next, SetColumn(prev, idx, b), idx + 1, accum);
            }
            else {
                // Final checks: does the loop round work?
                uint a0 = GetColumn(prev, 0), a1 = GetColumn(prev, 1);
                uint am = GetColumn(prev, _Size - 2), an = GetColumn(prev, _Size - 1);
                if (_Transitions[Pack(am, an, a0)] == GetColumn(next, _Size - 1) &&
                    _Transitions[Pack(an, a0, a1)] == GetColumn(next, 0)) {
                    accum.Add(Canonicalise(prev));
                }
            }

            return accum;
        }

        internal void Solve() {
            DateTime start = DateTime.UtcNow;
            ICollection<ulong> gen = Prev2(true, 0, 0, 0, new HashSet<ulong>());
            for (int depth = 1; gen.Count > 0; depth++) {
                Console.WriteLine("Length {0}: {1}", depth, gen.Count);
                ICollection<ulong> nextGen;

                #if NET_40
                nextGen = new HashSet<ulong>(gen.AsParallel().SelectMany(board => Prev2(false, board, 0, 0, new HashSet<ulong>())));
                #else
                nextGen = new HashSet<ulong>();
                foreach (ulong board in gen) Prev2(false, board, 0, 0, nextGen);
                #endif

                // We don't want the still lifes to persist or we'll loop for ever
                if (depth == 1) {
                    foreach (ulong stilllife in gen) nextGen.Remove(stilllife);
                }

                gen = nextGen;
            }
            Console.WriteLine("Time taken: {0}", DateTime.UtcNow - start);
        }

        private ulong Canonicalise(ulong board)
        {
            // Find the minimum board under rotation and reflection using something akin to radix sort.
            Isomorphism canonical = new Isomorphism(0, 1, 0, 1);
            for (int xoff = 0; xoff < _Size; xoff++) {
                for (int yoff = 0; yoff < _Size; yoff++) {
                    for (int xdir = -1; xdir <= 1; xdir += 2) {
                        for (int ydir = 0; ydir <= 1; ydir++) {
                            Isomorphism candidate = new Isomorphism(xoff, xdir, yoff, ydir);

                            for (int col = 0; col < _Size; col++) {
                                uint a = canonical.Column(this, board, col);
                                uint b = candidate.Column(this, board, col);

                                if (b < a) canonical = candidate;
                                if (a != b) break;
                            }
                        }
                    }
                }
            }

            ulong canonicalValue = 0;
            for (int i = 0; i < _Size; i++) canonicalValue = SetColumn(canonicalValue, i, canonical.Column(this, board, i));
            return canonicalValue;
        }

        struct Isomorphism {
            int xoff, xdir, yoff, ydir;

            internal Isomorphism(int xoff, int xdir, int yoff, int ydir) {
                this.xoff = xoff;
                this.xdir = xdir;
                this.yoff = yoff;
                this.ydir = ydir;
            }

            internal uint Column(Codegolf9393 _this, ulong board, int col) {
                uint basic = _this.GetColumn(board, xoff + col * xdir);
                return _this._CanonicalData[yoff, ydir, basic];
            }
        }

        private uint VRotate(uint col) {
            return ((col << 1) | (col >> (_Size - 1))) & (_AlphabetSize - 1);
        }

        private uint VFlip(uint col) {
            uint replacement = 0;
            for (int row = 0; row < _Size; row++)
                replacement = SetBit(replacement, row, GetBit(col, _Size - row - 1));
            return replacement;
        }

        private uint GetBit(uint n, int bit) {
            bit %= _Size;
            if (bit < 0) bit += _Size;

            return (n >> bit) & 1;
        }

        private uint SetBit(uint n, int bit, uint value) {
            bit %= _Size;
            if (bit < 0) bit += _Size;

            uint mask = 1u << bit;
            return (n & ~mask) | (value == 0 ? 0 : mask);
        }

        private uint Pack(uint a, uint b) { return (a << _Size) | b; }
        private uint Pack(uint a, uint b, uint c) {
            return (((a << _Size) | b) << _Size) | c;
        }

        private uint GetColumn(ulong n, int col) {
            col %= _Size;
            if (col < 0) col += _Size;
            return (_AlphabetSize - 1) & (uint)(n >> (col * _Size));
        }

        private ulong SetColumn(ulong n, int col, uint value) {
            col %= _Size;
            if (col < 0) col += _Size;

            ulong mask = (_AlphabetSize - 1) << (col * _Size);
            return (n & ~mask) | (((ulong)value) << (col * _Size));
        }
    }
}