Regex in reverse - decompose regular expressions

Python v2.7 1069 1036 950 925 897 884 871 833 822

This answer seems rather long for a code golf, but there are a lot operators that need to be handled and I know what purpose each byte in this answer does. Since there is no existing answer I submit this as a target for other users to beat. See if you can make a shorter answer :).

The two main functions are f which parses the regex starting at the ith character, and d which generates the matching strings, using r the sub-regexes we can recursed into, 'a' the array representing the part of the current sub-regex not yet processed, and a string suffix s which represents the part of the string generated so far.

Also check out the sample output and a test harness.

import sys;V=sys.argv;n=int(V[2]);r=V[1];S=len;R=range;C=R(32,127)
Z=[];z=-1;D='d(r,p,';F='for j in '
def f(i,a):
 if i>=S(r):return a,i
 c=r[i];x=0;I="|)]".find(c)
 if c in"([|":x,i=f(i+1,Z)
 if I+1:return([c,a,x],[a],[c,a])[I],i
 if'\\'==c:i+=1;x=c+r[i]
 return f(i+1,a+[x or c])
def d(r,a,s):
 if S(s)>n:return
 while a==Z:
        if r==Z:print s;return
        a=r[z];r=r[:z]
 e=a[z];p=a[0:z]
 if'|'==a[0]:d(r,a[1],s);d(r,a[2],s)
 elif']'==a[0]:
        g=a[1];N=g[0]=='^';g=(g,g[1:])[N];B=[0]*127;O=[ord(c[z])for c in g]
        for i in R(0,S(g)):
         if'-'==g[i]:exec F+'R(O[i-1],O[i+1]):B[j]=1'
         else:B[O[i]]=1
        for c in C:N^B[c]<1or d(r,Z,chr(c)+s)
 elif' '>e:d(r+[p],e,s)
 else:c=p[:z];exec{'.':F+'C:'+D+'chr(j)+s)','?':D+'s);d(r,p[:z],s)','*':F+'R(0,n+1):d(r,c,s);c+=[p[z]]','+':"d(r,p+['*',p[z]],s)"}.get(e,D+'e[z]+s)')
d(Z,f(0,Z)[0],"")

Note that tabs in the original solution have been expanded. To count the number of characters in the original use unexpand < regex.py | wc.


Haskell 757 705 700 692 679 667

import Data.List
data R=L Char|A R R|T R R|E
h=[' '..'~']
k(']':s)a=(a,s)
k('^':s)_=l$k[]s
k('-':c:s)(a:b)=k([a..c]++b)s
k('\\':c:s)a=k s$c:a
k(c:s)a=k s$c:a
l(a,b)=(h\\a,b)
c#E=L c
c#r=A(L c)r
o(a,b)=(foldr(#)E a,b)
t%0=E
t%n=A(t%(n-1))$T t$t%(n-1)
d s n=m(fst$r s)[[]] where{m E a=a;m(L c)a=[b++[c]|b<-a,length b<n];m(A r s)x=nub$(m r x)++m s x;m(T r s)a=m s$m r a;r s=w$e s E;w(u,'|':v)=(\(a,b)->(A u a,b))$r v;w x=x;e(')':xs)t=(t,xs);e s@('|':_)t=(t,s);e s@(c:_)t=g t$f$b s;e[]t=(t,[]);g t(u,v)=e v$T t u;f(t,'*':s)=(t%n,s);f(t,'+':s)=(T t$t%n,s);f(t,'?':s)=(A t E,s);f(t,s)=(t,s);b('(':s)=r s;b('\\':s:t)=(L s,t);b('.':s)=o(h,s);b('[':s)=o$k s[];b(s:t)=(L s,t)}

output:

ghci> d ".*" 1
[""," ","!","\"","#","$","%","&","'","(",")","*","+",",","-",".","/","0","1","2","3","4","5","6","7","8","9",":",";","<","=",">","?","@","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z","[","\\","]","^","_","`","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","{","|","}","~"]
ghci> d "w\\w+" 3
["ww","www"]
ghci> d "[abx-z][^ -}][\\\\]" 3
["x~\\","y~\\","z~\\","b~\\","a~\\"]
ghci> d "ab*a|c[de]*" 3
["aa","aba","c","ce","cd","cee","cde","ced","cdd"]
ghci> d "(foo)+(bar)?!?" 6
["foo!","foobar","foo","foofoo"]
ghci> d "(a+|b*c)d" 4
["ad","aad","aaad","cd","bcd","bbcd"]
ghci> d "p+cg" 4
["pcg","ppcg"]
ghci> d "a{3}" 4
["a{3}"]

Explanation: this one's a textbook regex implementation. R is the regex type, with constructors A (alternate), L (literal), T (then) and E (empty/epsilon). The usual 'Star' doesn't appear because I inline it as alternates during the parse (see '%'). 'm' runs the simulation. The parser (start at 'r s=....') is just recursive descent; 'k' parses ranges. The function '#' expands ranges into alternations.


Prolog (SWI), 586 bytes

The built in backtracking ability of Prolog makes it an awesome choice for this challenge. By taking advantage of backtracking, generating strings for a regex becomes exactly the same task as testing if a string is matched by a regex. Unfortunately, much of my golfing effort went into writing a shorter regex parsers. The actual task of decomposing a regular expression we easily golfed.

R-L-S:-R*A,-(B,A,[]),setof(Z,(0/L/M,length(C,M),C+B+[],Z*C),S).
-R-->e+S,S-R.
R-T-->{R=T};"|",e+S,u+R+S-T.
Z+Y-->(".",{setof(C,32/126/C,R)};"[^",!,\E,"]",{setof(C,(32/126/C,\+C^E),R)};"[",\R,"]";"(",-R,")";{R=[C]},([C],{\+C^`\\.[|+?*(`};[92,C])),("*",{S=k*R};"+",{S=c+R+k*R};"?",{S=u+e+R};{S=R}),({Y=c+Z+S};c+Z+S+Y).
\C-->{C=[H|T]},+H,\T;{C=[]};+A,"-",+B,\T,{setof(C,A/B/C,H),append(H,T,C)}.
+C-->[92,C];[C],{\+C^`\\]-`}.
S+e+S.
[C|S]+D+S:-C^D.
S+(B+L+R)+T:-B=c,!,S+L+U,U+R+T;S+L+T;S+R+T.
S+k*K+U:-S=U;S+K+T,S\=T,T+k*K+U.
A/B/C:-between(A,B,C).
A^B:-member(A,B).
A*B:-string_codes(A,B).

Try it online!

Ungolfed Code

generate_string(R, L, S) :-
    % parse regex
    string_codes(R, RC),
    regex_union(RE, RC, []),

    % bound string length
    between(0, L, M),
    length(SC, M),

    % find string
    match(SC, RE, []),

    string_codes(S, SC).

% Parsers
%%%%%%%%%  

regex_union(R) -->regex_concat(S), regex_union1(S, R).

regex_union1(R,T) --> [124], regex_concat(S), regex_union1(regex_union(R,S), T).
regex_union1(R, R) --> [].

regex_concat(R) --> regex_op(S), regex_concat1(S, R).

regex_concat1(R, T) --> regex_op(S), regex_concat1(regex_concat(R,S), T).
regex_concat1(R, R) --> [].

regex_op(regex_kleene(R)) --> regex_lit(R), [42].
regex_op(regex_concat(R,regex_kleene(R))) --> regex_lit(R), [43].
regex_op(regex_union(regex_empty,R)) --> regex_lit(R), [63].
regex_op(R) --> regex_lit(R).

regex_lit(regex_char([C])) --> [C], {\+ regex_ctrl(C)}.
regex_lit(regex_char([C])) --> [92], [C].

regex_lit(regex_char(CS)) --> [46], {findall(C, between(32,126, C), CS)}.

regex_lit(regex_char(DS)) --> 
    [91], [94], !, class_body(CS), [93],
    {findall(C, (between(32, 126, C), \+ member(C, CS)), DS)}.
regex_lit(regex_char(CS)) --> [91], class_body(CS), [93].

regex_lit(R) --> [40], regex_union(R), [41].

class_body([C|T]) --> class_lit(C),class_body(T).
class_body(CS) -->
    class_lit(C0), [45], class_lit(C1), class_body(T),
    {findall(C, between(C0, C1, C), H), append(H,T,CS)}.
class_body([]) --> [].

class_lit(C) --> [C], {\+ class_ctrl(C)}.
class_lit(C) --> [92], [C].

class_ctrl(C) :- string_codes("\\[]-", CS), member(C, CS).
regex_ctrl(C) :- string_codes("\\.[]|+?*()", CS), member(C, CS).

% Regex Engine
%%%%%%%%%%%%%% 

% The regex empty matches any string without consuming any characters.
match(S, regex_empty, S).

% A regex consisting of a single character matches any string starting with
% that character. The chanter is consumed.
match([C|S], regex_char(CS), S) :- member(C, CS).

% A union of two regex only needs to satisify one of the branches.
match(S, regex_union(L,R), T) :- match(S, L, T); match(S, R, T).     

% A concat of two regex must satisfy the left and then the right.
match(S, regex_concat(L, R), U) :- match(S, L, T), match(T, R, U).

% The kleene closure of a regex can match the regex 0 times or it can the regex
% once before matching the kleene closure again.
match(S, regex_kleene(_), S).
match(S, regex_kleene(K), U) :- match(S, K, T), S \= T, match(T, regex_kleene(K), U).