Smallest Bytecode Interpreter/VM

C, 752 (589+163 for define flags) * 0.5 (JIT) * 0.9 (extensions) * (0.75 optimization) * (0.64 seconds/2) = 81.216

C[S],J[S],i,j,k,y,c,X,Y,Z;char*R,a,b[9];main(x){R=mmap(0,S,6,34,-1,0);N=85;while(scanf("%c%s%*[ R]%d%*[ R]%d",&a,b,&x,&y)&&~getchar())a-=65,a-1?a-15?a-9?a?a-2?a-3?a-11?a-12?a-17?(N=41,v):(N=137,v):(N=137,u,N=247,g(H,4),N=139,u):(y?N=189+x,s(y):(N=51,g(G,G))):(N=137,u,N=247,g(H,6),N=139,u):(N=57,v,s(0xFC8A9F),--j):(N=1,v):(N=233,J[k++]=i,s(x)):b[1]-80?N=85+x:(N=93+x):(c=b[5],s(0x0F9EE78A),N=(c-69?c-71?c-76?1:8:11:0)+132,J[k++]=i,s(x)),C[++i]=j;U(E8,X)U(F0,Y)U(F8,Z)s(50013);i=j;while(k--)j=C[J[k]]+1,R[j-1]-233&&(j+=4),s(C[*(int*)(R+j)]-j-4);((int(*)())R)();printf("%u %u %u\n",X,Y,Z);}

Takes code (LOAD R0, etc), no trailing characters, single space, no blank lines in the middle, no comments, etc. Trailing newline required.

This is then converted to 80386 bytecode and executed.

Loading 0 into a register is replaced by xoring the register with itself instead of moving 0 into the register, which is three bytes shorter in the generated bytecode, and may be very marginally faster.

Compile with:

gcc -m32 -D"g(a,b)=(N=192|b<<3|a)"-D"s(b)=(*(int*)(R+j)=b,j+=4)"-DN=R[j++]-D"G=((x+1)|4)"
-D"H=((y+1)|4)"-DS=9999-D"u=g(0,G)"-D"v=g(G,H)"-D"U(b,c)=s(0xA3##b##89),--j,s(&c);"
bytecode.c -o bytecode

POSIX-compliant OS required.

Input is read from STDIN (use ./bytecode < file to pipe from a file).

Resulting bytecode for the test program:

; start
 0:   55                      push   %ebp
; LOAD R0 0
 1:   33 ed                   xor    %ebp,%ebp
; LOAD R2 0
 3:   33 ff                   xor    %edi,%edi
; LOAD R1 0
 5:   33 f6                   xor    %esi,%esi
; PUSH $1
 7:   56                      push   %esi
; MUL R1 R2
 8:   89 f0                   mov    %esi,%eax
 a:   f7 e7                   mul    %edi
 c:   8b f0                   mov    %eax,%esi
; PUSH R2
 e:   57                      push   %edi
; LOAD R2 3
 f:   bf 03 00 00 00          mov    $0x3,%edi
; DIV R1 R2
14:   89 f0                   mov    %esi,%eax
16:   f7 f7                   div    %edi
18:   8b f0                   mov    %eax,%esi
; POP R2
1a:   5f                      pop    %edi
; ADD R0 R1
1b:   01 f5                   add    %esi,%ebp
; POP R1
1d:   5e                      pop    %esi
; PUSH R2
1e:   57                      push   %edi
; LOAD R2 1
1f:   bf 01 00 00 00          mov    $0x1,%edi
; ADD R1 R2
24:   01 fe                   add    %edi,%esi
; POP R2
26:   5f                      pop    %edi
; PUSH R2
27:   57                      push   %edi
; LOAD R2 10000
28:   bf 10 27 00 00          mov    $0x2710,%ed
; CMP R1 R2
2d:   39 fe                   cmp    %edi,%esi
2f:   9f                      lahf
30:   8a fc                   mov    %ah,%bh
; POP R2
32:   5f                      pop    %edi
; BRANCHLT 3
33:   8a e7                   mov    %bh,%ah
35:   9e                      sahf
36:   0f 8c cb ff ff ff       jl     0x7
; LOAD R1 1
3c:   be 01 00 00 00          mov    $0x1,%esi
; ADD R2 R1
41:   01 f7                   add    %esi,%edi
; LOAD R1 10000
43:   be 10 27 00 00          mov    $0x2710,%es
; CMP R2 R1
48:   39 f7                   cmp    %esi,%edi
4a:   9f                      lahf
4b:   8a fc                   mov    %ah,%bh
; LOAD R1 0
4d:   33 f6                   xor    %esi,%esi
; BRANCHLT 2
4f:   8a e7                   mov    %bh,%ah
51:   9e                      sahf
52:   0f 8c ad ff ff ff       jl     0x5
; copy R0 to X
58:   89 e8                   mov    %ebp,%eax
5a:   a3 28 5b 42 00          mov    %eax,0x425b
; copy R1 to Y
5f:   89 f0                   mov    %esi,%eax
61:   a3 38 55 44 00          mov    %eax,0x4455
; copy R2 to Z
66:   89 f8                   mov    %edi,%eax
68:   a3 40 55 44 00          mov    %eax,0x4455
; exit
6d:   5d                      pop    %ebp
6e:   c3                      ret

Ungolfed:

C[9999],J[9999],i,j,k,y,c,X,Y,Z;
char *R,a,b[9];
main(x){
    // 6 is PROC_WRITE|PROC_EXEC
    // 34 is MAP_ANON|MAP_PRIVATE
    R=mmap(0,'~~',6,34,-1,0);

    N=0x55;
    while(scanf("%c%s%*[ R]%d%*[ R]%d",&a,b,&x,&y)&&~getchar())
        a-=65,
        a-1? // B[RANCH**]
            a-15? // P[USH/OP]
                a-9? // J[MP]
                    a? // A[DD]
                        a-2? // C[MP]
                            a-3? // D[IV]
                                a-11? // L[OAD]
                                    a-12? // M[UL]
                                        a-17? // R[LOAD]
                                            // SUB
                                            (N=0x29,g(G,H))
                                        :(N=0x89,g(G,H))
                                    :(N=0x89,g(0,G),N=0xF7,g(H,4),N=0x8B,g(0,G))
                                :(y?N=0xBD+x,s(y):(N=0x33,g(G,G)))
                            :(N=0x89,g(0,G),N=0xF7,g(H,6),N=0x8B,g(0,G))
                        :(N=0x39,g(G,H),s(0xfc8a9f),--j)
                    :(N=0x1,g(G,H))
                :(N=0xE9,J[k++]=i,s(x))
            :b[1]-80? 
                N=0x55+x // PUSH
            :(N=0x5D+x) // POP
        :(c=b[5],s(0x0f9ee78a),N=(
        c-69? // EQ
            c-71? // GT
                c-76? // LT
                    1 // NE
                :8
            :11
        :0
        )+0x84,J[k++]=i,s(x)),
        C[++i]=j
        ;
    // transfer registers to X,Y,Z
    s(0xA3E889),--j,s(&X);
    s(0xA3F089),--j,s(&Y);
    s(0xA3F889),--j,s(&Z);

    // pop and ret
    s(0xC35D);

    i=j;
    // fix distances for jmp/branch**
    while(k--)
        j=C[J[k]]+1,R[j-1]-0xE9&&(j+=4),
        s(C[*(int*)(R+j)]-j-4);

    // call
    ((int(*)())R)();

    // output
    printf("%u %u %u\n",X,Y,Z);
}

C, Score = 854byte × (~0.8sec / 2) × 0.5 [JIT] × 0.9 [Extensions] = ~154byte sec

#define G getchar()
#define L for(i=0;i<3;++i)
#define N*(int*)
#define M(x)"P\x8a\xe7\x9e\xf"#x"    KL"
*T[1<<20],**t=T,*F[1<<20],**f=F,R[3],r[]={1,6,7};char*I[]={"L\xb8    GGJH","I\x8b\xc0HHGH","H\x50GG","H\x58GG","I\3\xc0HHGH","I\53\xc0HHGH","M\x8b\xc0\xf7\xe0\x8b\xc0IHLGJ","O\63\xd2\x8b\xc0\xf7\xf0\x8b\xc0IJNGL","L\xe9    KH","L\73\xc0\x9f\x8a\xfcHHGH",M(\x82),M(\x84),M(\x87),M(\x85)},C[1<<24],*c=C;main(i,o,l,g){N c=0xb7ec8b60;c[4]=70;c+=5;while((o=G)>=0){char*s=I[o];l=*s-'G';memcpy(c,s+1,l);for(s+=l+1;o=*s++;){o-='G';if(o<3){g=r[G];c[*s++-'G']|=g<<3*(o&1);if(o>1)c[*s++-'G']|=g<<3;}else{if(o>3)*f++=c+*s-'G';for(i=4;i;--i)c[*s-'G'+i-1]=G;++s;}}*t++=c;c+=l;}*t=c;while(f>F)--f,**f=(int)T[**f]-(int)*f-4;L N&c[7*i]=0x5893e|r[i]<<19,N&c[3+7*i]=R+i;N&c[21]=0xc361e58b;mprotect((int)C>>12<<12,1<<24,7);((void(*)())C)();L printf("R%d %u\n",i,R[i]);}

Compile with gcc vm.c -ovm -m32 -w on a x86 POSIX compliant OS.
Run with ./vm < program, where program is a binary program file.


Going for the speed. The program performs a pretty straight-forward translation of the input program to x86 machine code and lets the CPU do the rest.

For instance, here's the translation of the test program. ecx, esi and edi correspond to R0, R1 and R2, respectively; bh holds the status flags; eax and edx are scratch registers; the call-stack corresponds to the VM's stack:

# Prologue
     0:   60                      pusha
     1:   8b ec                   mov    ebp,esp
     3:   b7 46                   mov    bh,0x46
# LOAD R0 0
     5:   b9 00 00 00 00          mov    ecx,0x0
# LOAD R2 0 <--outer loop value
     a:   bf 00 00 00 00          mov    edi,0x0
# LOAD R1 0 <--inner loop value
     f:   be 00 00 00 00          mov    esi,0x0
#      --Begin inner loop--
# PUSH R1 <--push inner loop value to the stack
    14:   56                      push   esi
# MUL R1 R2 <--(i*j)
    15:   8b c6                   mov    eax,esi
    15:   f7 e7                   mul    edi
    19:   8b f0                   mov    esi,eax
# PUSH R2
    1b:   57                      push   edi
# LOAD R2 3
    1c:   bf 03 00 00 00          mov    edi,0x3
# DIV R1 R2 <-- / 3
    21:   33 d2                   xor    edx,edx
    23:   8b c6                   mov    eax,esi
    25:   f7 f7                   div    edi
    27:   8b f0                   mov    esi,eax
# POP R2
    29:   5f                      pop    edi
# ADD R0 R1 <-- s+=
    2a:   03 ce                   add    ecx,esi
# POP R1
    2c:   5e                      pop    esi
# PUSH R2
    2d:   57                      push   edi
# LOAD R2 1
    2e:   bf 01 00 00 00          mov    edi,0x1
# ADD R1 R2 <--j++
    33:   03 f7                   add    esi,edi
# POP R2
    35:   5f                      pop    edi
# PUSH R2
    36:   57                      push   edi
# LOAD R2 10000
    37:   bf 10 27 00 00          mov    edi,0x2710
# CMP R1 R2 <-- j < 10000
    3c:   3b f7                   cmp    esi,edi
    3e:   9f                      lahf
    3f:   8a fc                   mov    bh,ah
# POP R2
    41:   5f                      pop    edi
# BRANCHLT 4 <--Go back to beginning inner loop
    42:   8a e7                   mov    ah,bh
    44:   9e                      sahf
    45:   0f 82 c9 ff ff ff       jb     0x14
# --Drop To outer loop--
# LOAD R1 1
    4b:   be 01 00 00 00          mov    esi,0x1
# ADD R2 R1 <--i++
    50:   03 fe                   add    edi,esi
# LOAD R1 10000
    52:   be 10 27 00 00          mov    esi,0x2710
# CMP R2 R1 <-- i < 10000
    57:   3b fe                   cmp    edi,esi
    59:   9f                      lahf
    5a:   8a fc                   mov    bh,ah
# LOAD R1 0 <--Reset inner loop
    5c:   be 00 00 00 00          mov    esi,0x0
# BRANCHLT 3
    61:   8a e7                   mov    ah,bh
    63:   9e                      sahf
    64:   0f 82 a5 ff ff ff       jb     0xf
# Epilogue
    6a:   3e 89 0d 60 ac 04 09    mov    DWORD PTR ds:0x904ac60,ecx
    71:   3e 89 35 64 ac 04 09    mov    DWORD PTR ds:0x904ac64,esi
    78:   3e 89 3d 68 ac 04 09    mov    DWORD PTR ds:0x904ac68,edi
    7f:   8b e5                   mov    esp,ebp
    81:   61                      popa
    82:   c3                      ret

Ungolfed

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>

#define MAX_INST_COUNT (1 << 20)
#define MAX_CODE_SIZE (8 * MAX_INST_COUNT)
#define PAGE_SIZE (1 << 12)

uintptr_t *targets[MAX_INST_COUNT], **targets_ptr = targets;
int32_t *fixups[MAX_INST_COUNT], **fixups_ptr = fixups;
char code[MAX_CODE_SIZE], *code_ptr = code;
uint32_t R[3];

const char* inst_defs[] = {
    /* LOAD     */ "\5\xb8    \1\0\4\1",
    /* RLOAD    */ "\2\x8b\xc0\2\1\1\1",
    /* PUSH     */ "\1\x50\1\0",
    /* POP      */ "\1\x58\1\0",
    /* ADD      */ "\2\x03\xc0\2\1\1\1",
    /* SUB      */ "\2\x2b\xc0\2\1\1\1",
    /* MUL      */ "\6\x8b\xc0\xf7\xe0\x8b\xc0\3\1\5\1\3",
    /* DIV      */ "\x8\x33\xd2\x8b\xc0\xf7\xf0\x8b\xc0\3\3\7\1\5",
    /* JMP      */ "\5\xe9    \5\1",
    /* CMP      */ "\5\x3b\xc0\x9f\x8a\xfc\2\1\1\1",
    /* BRANCHLT */ "\x9\x8a\xe7\x9e\x0f\x82    \5\5",
    /* BRANCHEQ */ "\x9\x8a\xe7\x9e\x0f\x84    \5\5",
    /* BRANCHGT */ "\x9\x8a\xe7\x9e\x0f\x87    \5\5",
    /* BRANCHNE */ "\x9\x8a\xe7\x9e\x0f\x85    \5\5"
};

int main() {
    int i;

    {
        const char prologue[] =
            /* PUSHAD       */ "\x60"
            /* MOV EBP, ESP */ "\x8b\xec"
            /* MOV BH, 46h  */ "\xb7\x46"
        ;
        memcpy(code_ptr, prologue, sizeof(prologue) - 1);
        code_ptr += sizeof(prologue) - 1;
    }

    {
        const char reg[] = {1, 6, 7};
        char r;
        int op;
        while ((op = getchar()) != EOF) {
            const char* def = inst_defs[op];
            int len = def[0];
            memcpy(code_ptr, def + 1, len);
            for (def += len + 1; *def; ) {
                switch (*def++) {
                    case 1: code_ptr[*def++] |= reg[getchar()]; break;
                    case 2: code_ptr[*def++] |= reg[getchar()] << 3; break;
                    case 3:
                        r = reg[getchar()];
                        code_ptr[*def++] |= r; code_ptr[*def++] |= r << 3;
                        break;
                    case 5: *fixups_ptr = code_ptr + *def; ++fixups_ptr;
                    case 4:
                        for (i = 4; i; --i) code_ptr[*def + i - 1] = getchar();
                        ++def;
                        break;
                }
            }
            *targets_ptr = code_ptr; ++targets_ptr;
            code_ptr += len;
        }
        *targets_ptr = code_ptr; ++targets_ptr;

        while (fixups_ptr != fixups) {
            --fixups_ptr;
            **fixups_ptr = (char*)targets[**fixups_ptr] - (char*)*fixups_ptr - 4;
        }
    }

    {
        const char epilogue[] =
            /* MOV R[0], ECX */ "\x3e\x89\x0d    "
            /* MOV R[1], ESI */ "\x3e\x89\x35    "
            /* MOV R[2], EDI */ "\x3e\x89\x3d    "
            /* MOV ESP, EBP  */ "\x8b\xe5"
            /* POPAD         */ "\x61"
            /* RET           */ "\xc3"
        ;
        memcpy(code_ptr, epilogue, sizeof(epilogue) - 1);
        for (i = 0; i < 3; ++i) *(uintptr_t*)&code_ptr[3 + 7 * i] = &R[i];
        code_ptr += sizeof(epilogue) - 1;
    }

    mprotect(
        (uintptr_t)code / PAGE_SIZE * PAGE_SIZE, MAX_CODE_SIZE,
        PROT_READ | PROT_WRITE | PROT_EXEC
    );
    { void (*program)(void) = code; program(); }
    for (i = 0; i < 3; ++i) printf("R%d %u\n", i, R[i]);

    return 0;
}


CJam, 222 187 185 bytes * (too slow / 2)

I just wanted to see how short I can get a bytecode VM by writing it in CJam. Less than 200 bytes seems pretty decent. It's damn slow though, because CJam itself is interpreted. It takes ages to run the test program.

304402480 6b:P;q:iD-);{(_P=@/(\L*@@+\}h;]:P;TTT]:R;{_Rf=~}:Q;{4G#%R@0=@t:R;}:O;{TP=("R\(\GG*bt:R;  ~R= R\~@t:R; Q+O Q4G#+-O Q*O Q/O ~(:T; Rf=~-:U; GG*bU0<{(:T}*;"S/=~T):TP,<}g3,{'R\_S\R=N}/

To run it, download the Java interpreter at this sourceforge link, save the code in vm.cjam and run it with

java -jar cjam-0.6.2.jar vm.cjam

The program expects the bytecode on STDIN. I haven't found a way yet to pipe binary data into a program, without PowerShell adding a trailing line break and converting 0x0a to 0x0d 0x0a, which is really annoying. The code includes 4 bytes to fix that (D-);), which I haven't included in the total count, because it's not something the program should need to do if it actually received the bytecode itself on STDIN, instead of some weirdly encoded version of it. If someone knows a fix for that, please let me know.

Slightly ungolfed:

304402480 6b:P; "Create lookup table for instruction sizes. Store in P.";
q:i             "Read program and convert bytes to integers.";
D-);            "Remove spurious carriage returns. This shouldn't be necessary.";
{(_P=@/(\L*@@+\}h;]:P; "Split into instructions. Store in P.";
"We'll use T for the instruction pointer as it's initialised to 0.";
"Likewise, we'll use U for the CMP flag.";
TTT]:R; "Store [0 0 0] in R for the registers.";
{_Rf=~}:Q; "Register lookup block.";
{4G#%R@0=@t:R;}:O; "Save in register block.";
{TP=("R\(\GG*bt:R;

~R=
R\~@t:R;
Q+O
Q4G#+-O
Q*O
Q/O
~(:T;
Rf=~-:U;
GG*bU0<{(:T}*;"N/=~T):TP,<}g "Run program.";
3,{'R\_S\R=N}/

I'll add a proper explanation tomorrow.

In short, I'm storing all the registers, the instruction pointer and the comparison flag in variables, so that I can keep CJam's stack free to use as the VM's stack.