Klaus Neuner said:
I need a function that converts a list into a set of regexes. Like so:
string_list = ["blaa", "blab", "raaa", "rabb"]
print string_list2regexes(string_list)
This should return something like:
["bla(a|b)", "ra(aa|bb)"]
The program below does exactly this... Run it like this
$ ./words-to-regexp.pl 5
Extracting 5 letter words
blaa
blab
raaa
rabb
(Giving the list of words to stdin)
It produced this regexp which matches just the above strings.
Re: bla[ab]|ra(?:aa|bb)
For your input. Eg find a regexp to match all the 1&2 letter words
in the dictionary...
../words-to-regexp.pl 2 < /usr/share/dict/words
Re: [aa]|a[cdghlmmnrssttuy]|[bb]|b[aeeikry]|[cc]|c[adfilmorssu]|[dd]|d[bor]|[ee]|e[dhmrssux]|[ff]|f[aemr]|[gg]|g[adeeos]|[hh]|h[aeefgiooz]|[ii]|i[dfnnorstt]|[jj]|j[or]|[kk]|k[crsw]|[ll]|l[aaeiorstu]|[mm]|m[abdeginorsstuy]|[nn]|n[abdeiopu]|[oo]|o[bfhknrswxz]|[pp]|p[aabdhimotu]|[qqrr]|r[abdeehnsux]|[ss]|s[bcehimnort]|[tt]|t[abchiilmosy]|[uu]|u[hprs]|[vv]|v[as]|[ww]|w[emu]|[xx]|xe|[yy]|y[abe]|[zz]|z[nr]
Yes its in perl, but its almost entirely regexps! I haven't got the
time to pythonise it at the moment - hope you enjoy a challenge ;-)
#!/usr/bin/perl -w
#
# The challenge - to write a function which given a list of words
# returns a regexp which will match those and only those words.
use strict;
my $LIMIT = shift || 3;
$|=1;
print "Extracting $LIMIT letter words\n";
my @list = ();
while (<>)
{
chomp;
next if $_ eq "";
push @list, lc($_) if length($_) <= $LIMIT;
}
print "Extracted ", scalar(@list), " words\n";
my ($re, $old_re) = ("", "1");
while ($re ne $old_re)
{
$old_re = $re;
print "-" x 60, "\n";
$re = word_list_to_regexp( @list );
check_word_list_to_regexp($re, \@list);
print "Length: ", length($re), "\n";
}
exit;
############################################################
# Converts a list of words into a regexp which will
# match those words and those numbers only.
#
# It does this by constructing a regexp and then progressively
# simplifying it - recursively if necessary. It uses regexp's to
# transform the regexp of course! This is almost a general purpose
# regexp optimiser.
#
# We assume that the caller will bound the regexp with ^( and )$ or
# \W(?: and )\W or whatever takes their fancy
#
# Set $DEBUG to 1 if you want to print lots of info and check the
# regexp works after each transformation.
#
# Warning: code contains heavy regexps - lift with care ;-)
# Caution: Code may use exponential time and space ;-(
############################################################
sub word_list_to_regexp
{
my (@list) = @_;
my $DEBUG = 1;
# The basic regexp with |'s on the start and end to make our life
# easier
# Should uniq here too...
$re = join("|", sort {
#length($a) <=> length($b) ||
$a cmp $b
} @list);
$re = "|$re|";
# Transform the regexp in stages, making sure at all time the
# regexp is correct if $DEBUG is set
check_word_list_to_regexp($re, \@list) if $DEBUG;
# 1) Concatenate all the single characters a|b|c into [abc]'s
$re =~ s{ \| ( \w (?: \| \w )+ ) (?= \| ) }
{
my ( $string ) = ( $1 );
print "string = '$string'\n" if $DEBUG;
"|[" . join("", split m{\|}, $string) . "]"
}gex;
check_word_list_to_regexp($re, \@list) if $DEBUG;
# 2) Find all the Xa|Xb|Xc and change to X(?:a|b|c)]
$re =~ s{ \| ( (\w+)(\w+) (?: \| \2\w+ )+ ) (?= \| ) }
{
my ( $string, $prefix ) = ( $1, $2 );
print "prefix = '$prefix', string = '$string'\n" if $DEBUG;
"|$prefix\(?:" . join("|", map { substr($_, length $prefix) } split m{\|}, $string) . ")"
}gex;
check_word_list_to_regexp($re, \@list) if $DEBUG;
# 3) Find all the aX|bX|cX and change to (a|b|c)X]
$re =~ s{ \| ( (\w+?)(.+) (?: \| \w+\3 )+ ) (?= \| ) }
{
my ( $string, $postfix ) = ( $1, $3 );
print "postfix = '$postfix', string = '$string'\n" if $DEBUG;
$string =~ s{ \Q$postfix\E (?= \| | $ ) }{}gx;
print "...string = '$string'\n" if $DEBUG;
"|(?:$string)$postfix"
}gex;
check_word_list_to_regexp($re, \@list) if $DEBUG;
# 4) Change (?:a|b|c) into [abc]
$re =~ s{ \(\?\: ( \w (?: \| \w )+ ) \) }
{
my ( $string ) = ( $1 );
print "string = '$string'\n" if $DEBUG;
"[" . join("", split m{\|}, $string) . "]"
}gex;
check_word_list_to_regexp($re, \@list) if $DEBUG;
# 5) Optimise [abc] into [a-c] or \d
# This doesn't optimise all the cases only the complete continuous
# range in the [ ... ]
# $re =~ s{ \[ ( \w{3,} ) \] }
# {
# my ( $string, $start, $end ) = ( $1, substr($1, 0, 1), substr($1, -1, 1) );
# print "match ['$string']...range [$start-$end]\n" if $DEBUG;
# if ($end - $start + 1 == length $string)
# {
# $start == 0 && $end == 9 ? '\d' : "[$start-$end]";
# }
# else
# {
# "[$string]";
# }
# }gex;
check_word_list_to_regexp($re, \@list) if $DEBUG;
my $re_length;
do
{
$re_length = length($re);
# 6) recurse on any sequences left (?:ab|cd|ef)
$re =~ s{ \(\?\: ( \w+ (?: \| \w+ )+ ) \) }
{
my ( $string ) = ( $1 );
if (length($string) < length($re) - 4)
{
print "**** Recursing on '$string'\n" if $DEBUG;
"(?:" . word_list_to_regexp(split m{\|}, $string) . ")";
}
else
{
"(?:$string)";
}
}gex;
# 6a) recurse on any sequences left |ab|cd|ef|
$re =~ s{ \| ( \w+ (?: \| \w+ )+ ) \| }
{
my ( $string ) = ( $1 );
if (length($string) < length($re) - 2)
{
print "**** Recursing on '$string'\n" if $DEBUG;
"|" . word_list_to_regexp(split m{\|}, $string) . "|";
}
else
{
"|$string|";
}
}gex;
}
until (length($re) == $re_length);
check_word_list_to_regexp($re, \@list) if $DEBUG;
# 7) fix the | on each end
$re =~ s{^\|}{};
$re =~ s{\|$}{};
print "**** Returning '$re'\n" if $DEBUG;
return $re;
}
############################################################
# Test subroutine to check the regexp performs as advertised
#
# Call with a regexp and a reference to a list of numbers
# it will check that the regexp matches all the list and
# doesn't match some others (obviously it can't check them
# all can it!) die-ing on any failures.
############################################################
sub check_word_list_to_regexp
{
my ($re, $list) = @_;
my %list = map { $_ => 1 } @$list;
print "Re: $re\n";
# Put some other test cases in
$list{$_} += 0 for (0..999);
$list{int(rand()*1000)} += 0 for (0..99);
$list{int(rand()*10000)} += 0 for (0..99);
$list{int(rand()*100000)} += 0 for (0..99);
# print join(", ", map {"$_ => $list{$_}"} keys %list), "\n";
$re =~ s{^\|}{}; # fix | on start and end
$re =~ s{\|$}{};
$re = "^(?:$re)\$"; # put in ^(?: ... )$
$re = qr{$re}; # compile the regexp for speed
# Check all the keys in list against the regexp - some should pass
# and some should fail
for my $item (keys %list)
{
if ($list{$item} xor ($item =~ /$re/))
{
die "*** FAILED '$re' for '$item' ShouldMatch: $list{$item}\n";
}
else
{
# print "OK '$re' for '$item'\n";
}
}
}