A Basic Pyth-like Syntax Checker

CJam, 65 bytes

q'"/La+2%{0s*}:Z~_,:L')*+{L{4{)_Z+s/Z}/L{'(\Z')++/Z}/}*')-}2*0s-!

Gosh, I wish CJam had Regex, this could have been completed in less than 50 bytes then

The main idea is to keep reducing stuff to 0 i.e. 10 to 0, 200 to 0 and so on. Once that is done, we reduce all matched brackets to 0, i.e. () to 0, (0) to 0, (00) to 0 and so on. The we repeat the cycle L times, where L is the input length.

The input string initially goes through extra processing where we adjust for unmatched " and add lots of ) to compensate for unmatched (

This ensures that after all the iterations, we should only have 0 (and no-op )) left in the string.

Update - fixed a bug where top level ) no-op are considered harmful

Code expansion

q'"/La+2%{0s*}:Z~_,:L')*      "Part 1 - Preprocessing";
q'"/                          "Read the input and split it on quote";
    La+                       "Add an extra empty array to the split array to compensate";
                              "for unmatched ending quote";
        2%                    "Take every other splitted part, ignoring the contents";
                              "of the strings in the code";
          {0s*}:Z~            "Join the parts by string 0. Also create a function Z";
                              "that does that for future use. We are now done with";
                              "reducing "XYZ" to 0 ";
                  _,:L        "Store the length of the remaining code in L";
                      ')*     "Append L ) to the string to compensate for unmatched (";

{L{4{)_Z+s/Z}/L{'(\Z')++/Z}/}*')-}2*   "Part 2 - the reducing loop";
 L{                         }*         "Run the reduction logic L times";
   4{)_Z+s/Z}/                         "Part 2a - reducing arities";
   4{       }/                         "Run the iteration for 0, 1, 2 and 3";
     )_                                "Increment the iteration number and make a copy";
       Z+s                             "Get that many 0 and append it to the number";
          /Z                           "Split the code onto this number and convert it";
                                       "to 0. This successfully reduces 10, 200, ";
                                       "3000 and 4000 to 0";
              L{'(\Z')++/Z}/           "Part 2b - reducing groups";
              L{          }/           "Run the iteration for 0 through L - 1";
                '(\Z')++               "Get the string '(<iteration number of 0>)'";
                        /Z             "split on it and join by 0. This successfully";
                                       "reduces (), (0), (00) and so on .. to 0";
                              ')-      "After running the part 2 loop L times, all";
                                       "reducible arities and brackets should be reduced";
                                       "except for cases like '30)00' where the no-op )";
                                       "breaks the flow. So we remove ) from the code";
                                       "and run Part 2 L times again";

0s-!                          "Now, if the code was valid, we should only have 0 in the";
                              "remaining code. If not, the code was invalid";

Try it online here or run the whole suite


Regex, PCRE flavor, 83 bytes

^(?3)*\)*$(()((?(2)|\)*)(\((?1)*(\)|$)|0|"[^"]*.?|(1|(2|(3|4(?3))(?3))(?3))(?3))))?

Try it here.

Regex, PCRE flavor, 85 bytes

^((?(3)|\)*)(?>\((?2)*(()(?1))?(\)|$)|0|"[^"]*.?|(1|(2|(3|4(?1))(?1))(?1))(?1)))*\)*$

Try it here.

Used some ideas in this dan1111's answer.

Some explanations about (?2)*(()(?1))?.


80386 Assembler, 97 bytes

Hex dump:

0000000: 8b54 2404 5589 e531 c96a ff8a 022c 303c  .T$.U..1.j...,0<
0000010: f275 04f6 d530 c084 ed75 263c f875 0141  .u...0...u&<.u.A
0000020: 3cf9 750f 84c9 7419 4958 3c00 7c03 50eb  <.u...t.IX<.|.P.
0000030: 2430 c084 c075 0958 483c ff7f 0140 ebf3  $0...u.XH<...@..
0000040: 5042 8a02 84c0 75c5 4a84 edb0 2275 be84  PB....u.J..."u..
0000050: c9b0 2975 b85a 31c0 39e5 7501 4089 ec5d  ..)u.Z1.9.u.@..]
0000060: c3                                       .

This runs through the input once, pushing numbers greater than zero onto the stack and decrementing them when a zero is processed. Unboundeds are processed as a -1.

Function prototype (in C) (function returns 0 if invalid and 1 if valid):

int __cdecl test(char *in);

Equivalent assembly (NASM):

bits 32
; get input pointer into edx
mov edx, [esp+4]                ; 8B 54 24 04

; save ebp; set ebp = esp
push ebp                        ; 55
mov ebp, esp                    ; 89 E5

; clear ecx
xor ecx, ecx                    ; 31 C9

; push base -1
push byte(-1)                   ; 6A FF

; get top char
mov al, [edx]                   ; 8A 02

    sub al, 0x30                ; 2C 30

    ; if al == quote
    cmp al, 0xF2                ; 3C F2
    jne $+6                     ; 75 04
        ; set ch (in quote?) to opposite
        not ch                  ; F6 D5
        ; set value to 0
        xor al, al              ; 30 C0

    ; if in quote, continue
    test ch, ch                 ; 84 ED
    jnz $+40                    ; 75 26

    cmp al, 0xF8                ; 3C F8
    jne $+3                     ; 75 01
        ; increment cl=depth
        inc ecx                 ; 41

    cmp al, 0xF9                ; 3C F9
    jne $+17                    ; 75 0F
        ; check depth = 0
        test cl, cl             ; 84 C9
        jz $+27                 ; 74 19
        ; decrement cl=depth
        dec ecx                 ; 49
        ; pop and check -1
        pop eax                 ; 58
        cmp al, 0               ; 3C 00
        jl $+5                  ; 7C 03
            push eax            ; 50
            jmp $+38            ; EB 24
        xor al, al              ; 30 C0

    test al, al                 ; 84 C0
    jnz $+11                    ; 75 09
        pop eax                 ; 58
        dec eax                 ; 48
        cmp al, -1              ; 3C FF
        jg $+3                  ; 7F 01
            inc eax             ; 40
        jmp $-11                ; EB F3
    push eax                    ; 50

    inc edx                     ; 42
    mov al, [edx]               ; 8A 02
    test al, al                 ; 84 C0
    jnz $-57                    ; 75 C5

    dec edx                     ; 4A

    ; in quote?
    test ch, ch                 ; 84 ED
    mov al, 0x22                ; B0 22
    jnz $-64                    ; 75 BE

    ; depth not zero?
    test cl, cl                 ; 84 C9
    mov al, 0x29                ; B0 29
    jnz $-70                    ; 75 B8

; pop base -1
pop edx                         ; 5A

; set return value based on ebp/esp comparison
xor eax, eax                    ; 31 C0
cmp ebp, esp                    ; 39 E5
jne $+3                         ; 75 01
inc eax                         ; 40
; restore esp
mov esp, ebp                    ; 89 EC
; restore ebp
pop ebp                         ; 5D
; return
ret                             ; C3

The following code in C can be used with GCC on a POSIX system to test:

#include <sys/mman.h>
#include <stdio.h>
#include <string.h>

int main(){
    char code[] = {
        0x8b, 0x54, 0x24, 0x04, 0x55, 0x89, 0xe5, 0x31, 0xc9, 0x6a, 0xff,
        0x8a, 0x02, 0x2c, 0x30, 0x3c, 0xf2, 0x75, 0x04, 0xf6, 0xd5, 0x30, 
        0xc0, 0x84, 0xed, 0x75, 0x26, 0x3c, 0xf8, 0x75, 0x01, 0x41, 0x3c, 
        0xf9, 0x75, 0x0f, 0x84, 0xc9, 0x74, 0x19, 0x49, 0x58, 0x3c, 0x00, 
        0x7c, 0x03, 0x50, 0xeb, 0x24, 0x30, 0xc0, 0x84, 0xc0, 0x75, 0x09, 
        0x58, 0x48, 0x3c, 0xff, 0x7f, 0x01, 0x40, 0xeb, 0xf3, 0x50, 0x42, 
        0x8a, 0x02, 0x84, 0xc0, 0x75, 0xc5, 0x4a, 0x84, 0xed, 0xb0, 0x22, 
        0x75, 0xbe, 0x84, 0xc9, 0xb0, 0x29, 0x75, 0xb8, 0x5a, 0x31, 0xc0, 
        0x39, 0xe5, 0x75, 0x01, 0x40, 0x89, 0xec, 0x5d, 0xc3,
    };
    void *mem = mmap(0, sizeof(code), PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, -1, 0);
    memcpy(mem, code, sizeof(code));
    int __cdecl (*test)(char *) = (int __cdecl (*)(char *)) mem;

    #define TRY(s) printf(s ": %d\n", test(s))

    printf("Truthy tests:\n");
    TRY("0");
    TRY(")");
    TRY("(");
    TRY("\"");
    TRY("()");
    TRY("\"\"");
    TRY("10");
    TRY("400010");
    TRY("210010");
    TRY("(\"\")00");
    TRY("3\"\"\"\"\"");
    TRY("(0)))0)1)0");
    TRY("2(2(2(0)0)0)0");
    TRY("2\"010\"\"44)()4\"");
    TRY(")31(0)0(201000100");
    TRY("())2)1))0\"3())\"))");
    TRY("3(\"4321(\"301(0)21100\"4\")\"123\"00)40\"121\"31000\"\"01010");

    printf("\nFalsy tests:\n");
    TRY("1");
    TRY("1(310");
    TRY("(1)0)");
    TRY("12(010");
    TRY("4\"00010\"");
    TRY("3120102100");
    TRY("20(2((0)(0)))");
    TRY("2(2(2(0)0)0)01)");
    TRY("4(0102)00)00000");
    TRY("2\"00\"(\"00\"2(\"\"))");

    munmap(mem, sizeof(code));
    return 0;
}