Another Euler Brick in the Wall

C using Saunderson's parameterisation and Berggren's tree

In 1740, Saunderson found the parameterisation that if (a, b, c) is a Pythagorean triple (i.e. a^2 + b^2 = c^2) then (a(2b-c)(2b+c), b(2a-c)(2a+c), 4abc) is a rational cuboid (this was before Euler studied them) with diagonals (c^3, a(5b^2+a^2), b(5a^2+b^2)). This doesn't generate all Euler bricks. Note that the brick is primitive iff the Pythagorean triple is primitive.

If (x, y, z) is an Euler brick then so is (xy, xz, yz). Applying this once to a Saunderson brick creates a new brick which isn't Saunderson and which can be made primitive with division by ab. Applying the same transformation a second time gives the non-primitive xyz(x, y, z).

In 1934, Berggren discovered that the primitive Pythagorean triples form a ternary tree rooted at (3, 4, 5) which can be generated by a simple matrix multiplication and which contains each triple precisely once.

This implementation generates primitive Pythagorean triples and stores them in a stack, limiting the memory usage to 1.5GB. An earlier version used a circular buffer, but the stack gives much better results. This is probably a combination of fewer values being discarded and better cache locality.

Thanks to Steve Verrill for pointing out some bugs and for suggesting the use of a stack instead of a circular buffer. I've also followed him in removing the conjugate bricks for speed: previously the search space (minus the discarded branches) was exhausted within the two minutes, and there was value in spending time to get a few conjugates. But the new version can easily run for 5 minutes (as I discovered in the process of learning the value of the volatile keyword!), and most of the conjugates overflow, so on average the tests are wasted time.

#include <stdlib.h>
#include <inttypes.h>
#include <stdio.h>
#include <signal.h>
#include <string.h>

// Stack capacity, counted in triples. 2^25 entries of three 128-bit scalars
// (48 bytes per entry) come to 1.5GB of memory.
#define N 33554432

// All arithmetic is done in unsigned 128-bit integers (GCC/Clang extension),
// so overflow wraps modulo 2^128 instead of being undefined.
typedef unsigned __int128 scalar;

// One stack entry: the three components of a triple (a, b, c).
typedef struct {
    scalar a, b, c;
} triple;

// Stack operations. Both macros rely on a variable named `stp` (index of the
// first empty slot) being in scope at the point of use; `offer` additionally
// relies on the stack array being named `st`.
//
// take(st): pops and yields the top entry. It is a plain parenthesised
// expression with no trailing semicolon, so it can be used inside larger
// expressions as well as in `triple t = take(st);`.
#define take(st) ((st)[--stp])
// offer(A,B,C): pushes the triple (A, B, C) if there is room and silently
// drops it when the stack already holds N entries. It is intentionally left
// as a bare `if` block (not do/while(0)) because existing call sites invoke it
// without a trailing semicolon; do not follow it directly with `else`.
#define offer(A,B,C) if (stp < N) { st[stp].a=(A); st[stp].b=(B); st[stp].c=(C); stp++; }

// Stop request flag. It is written from the signal handler below and read by
// the main loop, hence volatile sig_atomic_t.
volatile sig_atomic_t stop;

// SA_SIGINFO-style signal handler: records that a stop was requested and does
// nothing else. None of the three arguments is consulted.
void handler(int sig, siginfo_t *siginfo, void *context) {
    (void)sig;
    (void)siginfo;
    (void)context;
    stop = 1;
}

int main(int argc, char **argv) {
    // Register interrupt handler
    stop = 0;
    struct sigaction act;
    memset(&act, 0, sizeof(act));
    act.sa_sigaction = &handler;
    act.sa_flags = SA_SIGINFO;
    if (sigaction(SIGINT, &act, NULL) < 0) return 1;

    // L = 10^30
    const scalar L = ((scalar)0xc9f2c9cd0UL << 64) + 0x4674edea40000000UL;
    const scalar T = (scalar)1 << 64;   // used in an anti-overflow test

    // To avoid the optimiser removing the calculation of the bricks.
    int writeToStdOut = argc > 1;

    triple *st = (triple *)malloc(N * sizeof(triple));
    if (!st) return 2;

    uint32_t stp = 1; // Index of first empty space in stack
    st[0].a = 3; st[0].b = 4; st[0].c = 5;
    uint64_t count = 0;

    while (stp > 0) {
        if (stop) break;
        triple t = take(st);

        // Extend queue of Pythagorean triples
        offer(t.a + ((t.c - t.b) << 1), ((t.a + t.c) << 1) - t.b, ((t.a - t.b + t.c) << 1) + t.c)
        offer(t.a + ((t.c + t.b) << 1), ((t.a + t.c) << 1) + t.b, ((t.a + t.b + t.c) << 1) + t.c)
        offer(-t.a + ((t.c + t.b) << 1), ((-t.a + t.c) << 1) + t.b, ((-t.a + t.b + t.c) << 1) + t.c)

        // Saunderson brick
        scalar x = t.b * ( ((t.a << 1) >= t.c) ? ((t.a << 1) - t.c) : (t.c - (t.a << 1)) ) * ((t.a << 1) + t.c);
        scalar y = t.a * ((t.b << 1) - t.c) * ((t.b << 1) + t.c);
        scalar z = t.a * t.b * t.c << 2;
        scalar u = t.c * t.c * t.c;
        scalar v = t.b * (t.b * t.b + 5 * t.a * t.a);
        scalar w = t.a * (t.a * t.a + 5 * t.b * t.b);
        count++;
        if (writeToStdOut) {
            printf("Sides: 0x%016"PRIx64"%016"PRIx64, (uint64_t)(x >> 64), (uint64_t)x);
            printf(", 0x%016"PRIx64"%016"PRIx64, (uint64_t)(y >> 64), (uint64_t)y);
            printf(", 0x%016"PRIx64"%016"PRIx64, (uint64_t)(z >> 64), (uint64_t)z);
            printf("; diags 0x%016"PRIx64"%016"PRIx64, (uint64_t)(u >> 64), (uint64_t)u);
            printf(", 0x%016"PRIx64"%016"PRIx64, (uint64_t)(v >> 64), (uint64_t)v);
            printf(", 0x%016"PRIx64"%016"PRIx64"\n", (uint64_t)(w >> 64), (uint64_t)w);
        }
    }

    printf("%ld\n", count);

    return 0;
}

Compiles with gcc -O3 euler_brick.c -o euler_brick and handles SIGINT (Ctrl-C). On my computer the earlier circular buffer version ran out of usably small Pythagorean triples after 85 seconds having found 843035517 (0.843 thousand million) Euler bricks. The new version runs to the two minutes (tested with timeout -sINT 2m ./euler_brick) and finds about 9.1 thousand million bricks. Exact numbers vary from run to run; the highest I've observed is 9113459216.

Note that this is single-threaded code. The process is embarrassingly parallel - just start each thread going down a different branch of the tree - and so the limitation to scaling it to multicore would be the shared memory bus.


C using Saunderson's parameterisation and a different tree generation method.

This is heavily influenced by Peter Taylor's answer, and by the following page: https://sites.google.com/site/tpiezas/0021. For consistency, my variable naming convention largely follows Peter's.

Instead of using a buffer, I use recursion to generate the three children of each node of the tree. The tree I generate is equivalent to Berggren's tree, but I use a two-parameter method (mainly for interest and variation). Any integers p > q > 0 generate a Pythagorean triple a=p^2-q^2, b=2pq, c=p^2+q^2. If we want to limit this to primitive triples only, we start with the vector (2,1) and multiply by the 2x2 matrices below. This is equivalent to Berggren's / Barning's method, in which we start with the vector (3,4,5) and multiply by the 3x3 matrices below. See http://en.wikipedia.org/wiki/Tree_of_primitive_Pythagorean_triples .

Barning's matrices                       2x2 matrices
1 -2  2      1  2  2     -1  2  2        2 -1    2  1    1  2
2 -1  2      2  1  2     -2  1  2        1  0    1  0    0  1
2 -2  3      2  2  3     -2  2  3

It can be seen from Barning's matrices that the hypotenuse c of the second child of each node will be between 3 and 7 times larger than that of its parent (a more advanced treatment would narrow this range). The growth rates of the other branches are rather slower.

In practice it was necessary to limit the depth of recursion in order to avoid stack overflow (segmentation fault.) However, it seems likely that only a few bricks were lost. Peter's program runs on my machine in about 6 minutes and generates 843035517 Euler bricks. My program takes just over twice as long and generates 1688135295 bricks with recursion depth limited to 5000, or 1671621046 bricks with depth limited to 1000. Therefore the rate of generation is about the same. With depth limited to 40, 1134616080 bricks are generated, of which 1131433861 are Saunderson and only 3182219 are conjugate. And the program runs about twice as fast with the code for generating conjugate bricks commented out!

Instead of generating the conjugate brick of (x,y,z) as (yz,xz,xy) I generate a conjugate brick ab times smaller. To do this intermediate values x',y',u',v' are stored during the calculation of x,y,z,u,v,w.

#include <inttypes.h>
#include <stdio.h>

/* Non-zero when every brick found should be printed; set by main. */
int writeStdOut;
/* Running total of bricks found (Saunderson bricks plus their conjugates). */
int count=0;

typedef __int128 scalar;

/* L = 10^30 is the size limit tested against brick sides.
 * T = 2^124 is the bound used by the overflow guard before a conjugate brick
 * is built. */
scalar L=(scalar)1000000*1000000*1000000*1000000*1000000,T=(scalar)1<<124;

/* Prints one brick: `label`, then the three sides and the three face
 * diagonals as 128-bit hex values, followed by a newline. */
static void print_brick(const char *label,
                        scalar x, scalar y, scalar z,
                        scalar u, scalar v, scalar w){
  printf("%s: 0x%016"PRIx64"%016"PRIx64, label, (uint64_t)(x >> 64), (uint64_t)x);
  printf(", 0x%016"PRIx64"%016"PRIx64, (uint64_t)(y >> 64), (uint64_t)y);
  printf(", 0x%016"PRIx64"%016"PRIx64, (uint64_t)(z >> 64), (uint64_t)z);
  printf("; diags 0x%016"PRIx64"%016"PRIx64, (uint64_t)(u >> 64), (uint64_t)u);
  printf(", 0x%016"PRIx64"%016"PRIx64, (uint64_t)(v >> 64), (uint64_t)v);
  printf(", 0x%016"PRIx64"%016"PRIx64"\n", (uint64_t)(w >> 64), (uint64_t)w);
}

/*
 * Processes the tree node with generator pair (p, q) and, while depth < 40,
 * recurses into its three children.
 *
 * The pair yields the triple a = p^2 - q^2, b = 2pq, c = p^2 + q^2. If the
 * Saunderson side z = 4abc is below L, the Saunderson brick is counted (and
 * printed when writeStdOut is set); its conjugate, scaled down by ab, is also
 * counted when the overflow guard passes and its x and y sides are below L.
 * Nodes with z >= L are neither counted nor expanded.
 *
 * p, q   generator pair; p > q > 0 is required, anything else is ignored
 *        (it would otherwise make a or b non-positive and divide by zero in
 *        the overflow guard).
 * depth  current recursion depth; children are visited only while depth < 40.
 *
 * Always returns 0. The return type is int because the function was
 * originally declared with implicit int; callers ignore the value.
 */
int f(scalar p, scalar q, int depth){
  if(q<=0 || p<=q) return 0;

  const scalar p2=p*p, q2=q*q;
  const scalar a=p2-q2, b=p*q<<1, c=p2+q2;

  scalar z=a*b*c<<2;
  if(z>=L) return 0;

  const scalar a2=a*a, b2=b*b, c2=c*c;

  /* |4b^2 - c^2| and |4a^2 - c^2|: either difference can be negative. */
  scalar xprime=(b2<<2)-c2;
  scalar yprime=(a2<<2)-c2;
  if(xprime<0) xprime=-xprime;
  if(yprime<0) yprime=-yprime;
  scalar x=a*xprime;
  scalar y=b*yprime;

  const scalar uprime=b2*5+a2, vprime=a2*5+b2;
  scalar u=a*uprime;
  scalar v=b*vprime;
  scalar w=c2*c;

  count++;
  if(writeStdOut) print_brick("Sides",x,y,z,u,v,w);

  /* Conjugate brick, built from the stored intermediate values. The division
   * test keeps a*c*xprime and b*c*yprime below T = 2^124 so that the shifts
   * below stay inside the signed 128-bit range. */
  if(T/a/c>xprime && T/b/c>yprime){
    x=a*c*xprime<<2;
    y=b*c*yprime<<2;
    if(x<0 || y<0) puts("overflow");
    if(x<L && y<L){
      z=xprime*yprime;
      u=uprime*yprime;
      v=vprime*xprime;
      w=c2*c2<<2;
      count++;
      if(writeStdOut) print_brick("SIDES",x,y,z,u,v,w);
    }
  }

  /* Children via the three 2x2 matrices applied to (p, q). */
  if(depth<40){
    f((p<<1)-q,p,depth+1);
    f((p<<1)+q,p,depth+1);
    f(p+(q<<1),q,depth+1);
  }
  return 0;
}

/*
 * Entry point of the recursive generator: enables brick printing when any
 * command-line argument is given, walks the tree from the root pair (2, 1)
 * at depth 0, then prints the total number of bricks counted.
 * Returns 0.
 */
int main(int argc, char **argv){
  (void)argv;
  writeStdOut=argc>1;
  f(2,1,0);
  printf(" %d bricks",count);
  return 0;
}