Gimli, make it even shorter?

C (gcc), 237 bytes

#define P(a,l)x=a;a=S[c=l>>r%4*2&3];S[c]=x;
r,c,x,y,z;G(unsigned*S){
for(r=24;r;*S^=r--%4?0:0x9e377901+r){
for(c=4;c--;*S++=z^y^8*(x&y))
x=*S<<24|*S>>8,y=S[4]<<9|S[4]>>23,z=S[8],S[8]=x^2*z^4*(y&z),S[4]=y^x^2*(x|z);
S-=4;P(*S,33)P(S[3],222)}}

I probably gained bytes with my swapping method, but it's too cute not to use.


C, 268 chars (268 bytes) using uint32_t

NB Since the original code uses <stdint.h> and types S as uint32_t *, I think the use of long is a cheat to get into 280 chars at the cost of the portability which is the reason for using uint32_t in the first place. If for fairness of comparison we require consistent use of uint32_t and the explicit signature void gimli(uint32_t *), the original code is really 284 chars, and orlp's code is 276 chars.

#include<stdint.h>
#define R(V)x=S[V],S[V]=S[V^y],S[V^y]=x,
void gimli(uint32_t*S){for(uint32_t r=24,x,y,z,*T;r--;y=72>>r%4*2&3,R(0)R(3)*S^=y&1?0x9e377901+r:0)for(T=S+4;T-->S;*T=z^y^8*(x&y),T[4]=y^x^2*(x|z),T[8]=x^2*z^4*(y&z))x=*T<<24|*T>>8,y=T[4]<<9|T[4]>>23,z=T[8];}

This can be split into two tweets with continuation markers as

#include<stdint.h>
#define R(V)x=S[V],S[V]=S[V^y],S[V^y]=x,
void gimli(uint32_t*S){for(uint32_t r=24,x,y,z,*T;r--;y=72>>r%4*2&3,R(0)R(3)// 1

and

*S^=y&1?0x9e377901+r:0)for(T=S+4;T-->S;*T=z^y^8*(x&y),T[4]=y^x^2*(x|z),T[8]=x^2*z^4*(y&z))x=*T<<24|*T>>8,y=T[4]<<9|T[4]>>23,z=T[8];}// 2/2

CJam (114 chars)

{24{[4/z{[8ZT].{8\#*G8#:Mmd+}__)2*\+.^W%\[_~;&8*\~@1$|2*@@&4*].^Mf%}%z([7TGT]R=4e!=\f=(2654435608R-_4%!*^\@]e_}fR}

This is an anonymous block (function): if you want to name it G then append :G. In CJam assigned names can only be single upper-case letters. There's space to append a comment e# Gimli in CJam and have characters left in a single tweet.

Online test

Dissection

{                                e# Define a block
  24{                            e# For R=0 to 23...
    [                            e#   Collect values in an array
      4/z                        e#     Transpose to columns
      {                          e#     Map over each column
        [8ZT].{8\#*G8#:Mmd+}     e#       Rotations, giving [x y z]
        __)2*\+.^W%\             e#       => [y^z x^y x^z*2] [x y z]
        [_~;&8*\~@1$|2*@@&4*].^  e#       => [x' y' z']
        Mf%                      e#       Map out any bits which overflowed
      }%
      z                          e#    Transpose to rows
      ([7TGT]R=4e!=\f=           e#    Permute first row
      (2654435608R-_4%!*^        e#    Apply round constant to first element
      \@                         e#    Put the parts in the right order
    ]e_                          e#  Finish collecting in array and flatten
  }fR
}