Compile Regexes

C, 627 characters

This program treats its first command-line argument as the input and generates C code as its output.

#define A(v) F[v]+strlen(F[v])
#define S sprintf
char*B="&&f%d(s)||f%d(s)",*J="&&f%d(s+%d)",*r,F[256][65536];h=2;e(f,n,o,R,C,O,t,g){for(C=O=f;*r;++r)switch(*r){case 40:r++;e(g=h++,C=h++,0,0);r[1]^42?t=g:(t=C,S(A(t),B,g,C=h++),r++);o=!S(A(O),J,t,o);O=C;break;case 41:goto L;case'|':S(A(C),J,n,o);o=!S(A(O=f),"||1");break;default:r[1]^42?S(A(C),"&&s[%d]==%d",o++,*r,O^f||R++):(o=!S(A(O),J,t=h++,o),O=C=h++,g=h++,S(A(g),"&&*s==%d&&f%d(s+1)",*r++,t),S(A(t),B,g,C));}L:S(A(C),J,n,o);}main(int c,char**v){r=v[1];for(e(1,!S(*F,"&&!*s"),0,0);h--;)printf("f%d(char*s){return 1%s;}",h,F[h]);puts("main(int c,char**v){exit(f1(v[1]));}");}

Here is its output for (0|1(0|1)*)(|A(0|1)*1) (with newlines added):

f11(char*s){return 1&&s[0]==49&&f7(s+1);}
f10(char*s){return 1&&s[0]==48&&f9(s+1)||1&&s[0]==49&&f9(s+1);}
f9(char*s){return 1&&f10(s)||f11(s);}
f8(char*s){return 1&&f7(s+0)||1&&s[0]==65&&f9(s+1);}
f7(char*s){return 1&&f0(s+0);}
f6(char*s){return 1&&f2(s+0);}
f5(char*s){return 1&&s[0]==48&&f4(s+1)||1&&s[0]==49&&f4(s+1);}
f4(char*s){return 1&&f5(s)||f6(s);}
f3(char*s){return 1&&s[0]==48&&f2(s+1)||1&&s[0]==49&&f4(s+1);}
f2(char*s){return 1&&f8(s+0);}
f1(char*s){return 1&&f3(s+0);}
f0(char*s){return 1&&!*s;}
main(int c,char**v){exit(f1(v[1]));}

If you provide valid input as its first command-line argument, it returns exit status 1. Otherwise, it returns exit status 0.

$ ./regexcompiler '(0|1(0|1)*)(|A(0|1)*1)' > floatprog.c
$ gcc -o floatprog floatprog.c
floatprog.c: In function ‘main’:
floatprog.c:1:519: warning: incompatible implicit declaration of built-in function ‘exit’ [enabled by default]
$ ./floatprog '1A00001' && echo invalid || echo valid
valid
$ ./floatprog '100A010' && echo invalid || echo valid
invalid

Either program, if you fail to provide the command-line argument, dereferences a null pointer, causing a segmentation fault. A sufficiently long regex will overflow the buffers of this submission, and the size of input to a generated program is limited by the size of the stack. However, all the test cases work.

Note that e(g=h++,C=h++,0,0); introduces undefined behavior. If, for example, generated programs do not compile, you can try replacing the statement with h+=2;e(g=h-1,C=h-2,0,0);, which is five characters longer.


Ruby, 641 651 543 characters

H=Hash.new{|h,k|[k]}
d=[[i=0,0,[]]]
o=[?(]
L="t,u,v=d.pop;q,r,s=d.pop;o.pop<?|&&(H[r]<<=t)||(H[q]<<=t;H[r]<<=u);d<<[q,u,s+v]"
gets.chop.chars.map{|c|c==?*&&(q,r,s=d.pop;H[r]|=[q,i+=1];d<<=[r,i,s];next)
eval(L)while/[|)]/=~c ?o[-1]>?(:o[-1]==?.
/[|(]/=~c&&d<<[i+=1,i,o<<c&&[]]||c!=?)&&d<<[i+=1,i+1,["s==#{o<<?.;i}&&c=='#{c}'&&#{i+=1}"]]||o[-1]=?.}
eval(L)while o.size>1
H.map{H.map{|k,v|v.map{|v|H[k]|=H[v]}}}
t,u,v=d[0]
$><<"s=#{H[t]};gets.chop.chars.map{|c|s=s.map{|s|#{v*'||'}}-[!0];#{H.map{|k,v|"s&[#{k}]!=[]&&s|=#{v}"}*?;}};p s&[#{u}]!=[]"

This ruby version became quite long because of several corner cases in the regex parser (maybe I should try a different approach). It expects the regular expression on STDIN and outputs the corresponding ruby code for the matcher to STDOUT.

The program directly generates code for a NFA-ε which is then executed in the matcher.

Test case 1: (output includes additional newlines and indentation)

>>>

s=[0];
gets.chop.chars.map{|c|
  s=s.map{|s|}-[!0];
};
p s&[0]!=[]

Test case 2:

>>> (b|)(ab)*(a|)

s=[0, 1, 2, 4, 9, 5, 10, 6, 11, 12, 14];
gets.chop.chars.map{|c|
  s=s.map{|s|s==2&&c=='b'&&3||s==6&&c=='a'&&7||s==8&&c=='b'&&9||s==12&&c=='a'&&13}-[!0];
  s&[1]!=[]&&s|=[1, 2, 4, 9, 5, 10, 6, 11, 12, 14];
  s&[3]!=[]&&s|=[3, 4, 9, 5, 10, 6, 11, 12, 14];
  s&[0]!=[]&&s|=[0, 1, 2, 4, 9, 5, 10, 6, 11, 12, 14];
  s&[5]!=[]&&s|=[5, 6];
  s&[7]!=[]&&s|=[7, 8];
  s&[9]!=[]&&s|=[9, 5, 10, 6, 11, 12, 14];
  s&[4]!=[]&&s|=[4, 9, 5, 10, 6, 11, 12, 14];
  s&[11]!=[]&&s|=[11, 12, 14];
  s&[13]!=[]&&s|=[13, 14];
  s&[10]!=[]&&s|=[10, 11, 12, 14]
};
p s&[14]!=[]

Another example:

>>> a|bc

s=[0, 1, 3, 4];
gets.chop.chars.map{|c|
  s=s.map{|s|s==1&&c=='a'&&2||s==4&&c=='b'&&5||s==6&&c=='c'&&7}-[!0];
  s&[0]!=[]&&s|=[0, 1, 3, 4];
  s&[3]!=[]&&s|=[3, 4];
  s&[5]!=[]&&s|=[5, 6];
  s&[2]!=[]&&s|=[2, 7]
};
p s&[7]!=[]

Edit: Added a transition to fix the bug PleaseStand noted in the comments. Also changed initialisation of state.