Robert said:
I once made a script that takes a list of strings and creates a
single regexp from them that is supposedly matched fast (i.e. doesn't need
backtracking). If you're interested I can check at work tomorrow - I'm
quite sure I still have it somewhere (please email as reminder).
Is this a trie optimization? I've written something similar for
Regexp::English but the optimization itself is quite slow and takes up
lots of resources. (Because it needs to build the trie.)
It can do this:
irb(main):006:0> re
=> /foobar|foobark|foobard|foobarqux|food|fool|foolish/
irb(main):007:0> re.optimize
=> /foo(?:l(?:ish)?|bar(?:qux|[kd])?|d)/
It was able to create an optimized Regexp that matches the first few
hundred English words from a words file -- I've attached it.
More regards,
Florian Gross
/a(?:ki(?:mbo|n)|l(?:veol(?:ar|i|us)|k(?:al(?
ids?|i(?:ne|s)?)|yl)|ways|l(?:ay(?:ed|s|ing)?|y(?:ing)?|o(?:w(?:a(?:bl(?:[ye])|nces?)|ed|s|ing)?|ys?|cat(?:able|ors?|e(?:[ds])?|i(?:ng|ons?))|phon(?:es?|ic)|t(?:ments?|s|t(?:e(?:[dr])|ing))?)|e(?:viat(?:e(?:[ds])?|i(?:ng|on))|les?|mande|y(?:ways?|s)?|rg(?:y|i(?:c|es))|g(?:ations?|or(?:y|i(?:c(?:al(?:ly)?)?|es))|e(?:d(?:ly)?|s)?|rettos?|i(?:ances?|ng)))|i(?:ances?|e(?:[ds])|gators?|terati(?:ve|ons?))|u(?:d(?:e(?:[ds])?|ing)|r(?:e(?:ment)?|ing)|si(?:ve(?:ness)?|ons?)))?|a(?:baster|crity|rm(?:ed|s|i(?:ng(?:ly)?|st))?|s)|m(?:a(?:nacs?)?|o(?:n(?:ds?|er)|st)|s(?:man)?|ighty)|b(?:a(?:core|tross)?|eit|um(?:s|in)?)|nico|c(?
(?:ves?|hol(?:s|i(?:cs?|sm))?)|hemy)|o(?:n(?:e(?:ness)?|g(?:side)?)|of(?:ness)?|es?|ft|ha|ud)|der(?:m(?:an|en))?|p(?:ha(?:bet(?:s|i(?:c(?:al(?:ly)?|s)?|z(?:e(?:[ds])?|ing)))?|numeric)?|ine)|e(?:e|rt(?:ly|ness|e(?:d(?:ly)?|rs?)|s|ing)?)?|f(?:alfa|resco)|ready|g(?:a(?:e(?:cide)?)?|orithm(?:s|ic(?:ally)?)?|ebra(?:s|ic(?:ally)?)?|inate)|so|t(?:ars?|ogether|er(?:a(?:ble|tions?)|nat(?
rs?|e(?:[ds]|ly)?|i(?:ve(?:ly|s)?|ng|ons?))|cations?|e(?:d|rs?)|s|ing)?|ruis(?:m|t(?:ic(?:ally)?)?)|hough|itudes?)|i(?:ve|ke|as(?:e(?:[ds])|ing)?|m(?
ny|ents?)|bis?|en(?:at(?:e(?:[ds])?|i(?:ng|on))|s)?|g(?:n(?:ments?|ed|s|ing)?|ht))|u(?:m(?:n(?:ae?|i|us)|inum)?|ndum))|m(?:a(?:lgam(?:at(?:e(?:[ds])?|i(?:ng|on))|s)?|nuensis|z(?:e(?:ment|d(?:ly)?|rs?|s)?|ing(?:ly)?)|retto|ss(?:e(?:[ds])|ing)?|t(?
ry|eur(?:s|is(?:m|h(?:ness)?))?)|in)|m(?
(?:ni(?:ac?|um))?|unition)|b(?:l(?:e(?:[drs])?|ing)|assadors?|er|rosial|i(?:valen(?:ce|t(?:ly)?)|ance|dextrous(?:ly)?|ent|gu(?
us(?:ly)?|it(?:y|ies))|tio(?:ns?|us(?:ly)?))|u(?:la(?:nces?|tory)|s(?:cade|h(?:e(?:[ds]))?)))|yl|nesty|o(?:k|ng(?:st)?|eba(?:[es])?|r(?:al(?:ity)?|ous|phous(?:ly)?|tiz(?:e(?:[ds])?|ing)|ist)|u(?:nt(?:e(?:d|rs?)|s|ing)?|r))|p(?:l(?:y|e|i(?:f(?:y(?:ing)?|i(?:cation|e(?:d|rs?|s)))|tudes?))|oules?|er(?:age|es?|sands?)|h(?:etamines?|i(?:b(?
logy|i(?:ans?|ous(?:ly)?))|theaters?))|utat(?:e(?:[ds])?|ing))|e(?:liorat(?:ed?|i(?:ng|on))|n(?:able|orrhea|d(?:ments?|ed|s|ing)?|it(?:y|ies))?|ricium)|i(?:able|no|cabl(?:[ye])|d(?:e|st)?|go|ss|ty)|u(?:lets?|s(?:e(?:ments?|d(?:ly)?|rs?|s)?|ing(?:ly)?)))?|b(?:l(?:a(?:ze|t(?:e(?:[ds])?|i(?:ve|ng|on)))|y|e(?:r|st)?)|a(?:ndon(?:ment|ed|s|ing)?|ck|ft|s(?:e(?:ments?|d|s)?|h(?:e(?:[ds])|ing)?|ing)|t(?:e(?:ments?|d|r|s)?|ing))|b(?
ts?|e(?:ys?)?|reviat(?:e(?:[ds])?|i(?:ng|ons?)))|ys(?:mal(?:ly)?|s(?:es)?)|normal(?:ly|it(?:y|ies))?|o(?:ve(?:mentioned|board|ground)?|li(?:sh(?:ments?|e(?:d|rs?|s)|ing)?|tion(?:ists?)?)|ard|mina(?:ble|te)|des?|r(?:t(?:ed|s|i(?:ve(?:ly)?|ng|ons?))?|igin(?:al|es?))|u(?:nd(?:ed|s|ing)?|t))|d(?
m(?:ens?|inal)|uct(?
rs?|ed|s|ions?)?)|e(?:yance|d|rra(?:nt|tions?)|t(?:s|t(?:e(?:[dr])|ing))?)|r(?:a(?:d(?:e(?:[ds])?|ing)|si(?:ve|ons?))|o(?:ad|gat(?:e(?:[ds])?|ing))|ea(?:ctions?|st)|idg(?:ment|e(?:[ds])?|ing)|upt(?:ly|ness)?)|s(?:c(?
nd(?:ed|s|ing)?|ess(?:e(?:[ds]))?|issas?)|o(?:l(?:v(?:e(?:[ds])?|ing)|ut(?:e(?:ly|ness|s)?|ion))|r(?:b(?:e(?:n(?:cy|t)|d|r)|s|ing)?|pti(?:ve|ons?)))|en(?:ces?|t(?:ly|minded|e(?:d|e(?:s|ism)?)|s|i(?:a|ng))?)|t(?:ain(?:e(?:[dr])|s|ing)?|entions?|r(?:act(?:ly|ness|ors?|ed|s|i(?:ng|on(?:s|is(?:[mt]))?))?|use(?:ness)?)|inence)|inthe|urd(?:ly|it(?:y|ies))?)|hor(?:r(?:e(?:[dr]|nt)|ing)|s)?|i(?:lit(?:y|ies)|d(?:e(?:[ds])?|ing))|u(?:ndan(?:ce|t(?:ly)?)|s(?:e(?:[ds])?|i(?:ve|ng))|t(?:ment|s|t(?:e(?:d|rs?)|ing))?)|j(?:ect(?:ly|ness|ions?)?|ur(?:e(?:[ds])?|ing)))|n(?:kles?|a(?:l(?:y(?:z(?:able|e(?:d|rs?|s)?|ing)|s(?:es|ts?|is)|tic(?:al(?:ly)?|it(?:y|ies))?)|og(?:y|ous(?:ly)?|i(?:cal|es)|ues?)?)?|c(?
ndas?|hronis(?:ms?|tically))|p(?:lasmosis|hor(?:a|ic(?:ally)?))|erobic|rch(?:y|i(?:c(?:al)?|s(?:m|ts?)))|grams?|stomo(?:s(?:es|is)|tic)|t(?
m(?:y|ic(?:al(?:ly)?)?)|hema))|n(?:als?|o(?:y(?:ances?|e(?:d|rs?)|s|ing(?:ly)?)?|tat(?:e(?:[ds])?|i(?:ng|ons?))|unc(?:e(?:ments?|d|rs?|s)?|ing))|ex(?:ation|e(?:[ds])|ing)?|i(?:versar(?:y|ies)|hilat(?:e(?:[ds])?|i(?:ng|on)))|ual(?:ly|s)?)|c(?:est(?
rs?|r(?:al|y))|ho(?:v(?:y|ies)|r(?:ages?|ed|s|i(?:ng|t(?:e|ism)))?)|i(?:llary|ent(?:ly|s)?))|d(?:ers|ing)?|e(?:w|m(?
(?:met(?:ers?|ry)|ne)|i(?:[ac]))|c(?:dot(?:al|es?)|hoic)|sthe(?:sia|ti(?:c(?:ally|s)?|z(?:e(?:[ds])?|ing))))|g(?:l(?:e(?:d|rs?)?|ing)|e(?:l(?:s|ic)?|r(?:ed|s|ing)?)|r(?:y|i(?:ly|e(?:r|st)))|st(?:rom)?|iography|u(?:lar(?:ly)?|ish(?:ed)?))|hydrous(?:ly)?|i(?:line|m(?:a(?:ls?|t(?
rs?|e(?:ly|ness|d(?:ly)?|s)?|i(?:ng|ons?)))|osity|i(?:zed|sm))|on(?:s|ic)?|s(?
trop(?:y|ic)|e(?:ikonic)?)))?|c(?:knowledg(?:ments?|e(?:able|ments?|d|rs?|s)?|ing)|a(?:cia|dem(?:y|i(?:a|c(?:ally|s)?|es)))|me|yclic(?:ally)?|ne|c(?:l(?:a(?:mation|im(?:ed|s|ing)?)|imat(?:e(?:[ds])?|i(?:ng|z(?:ation|ed))))|o(?:lades?|m(?:modat(?:e(?:[ds])?|i(?:ng|ons?))|p(?:li(?:ces?|sh(?:ments?|e(?:d|rs?|s)|ing)?)|an(?:y(?:ing)?|i(?:ments?|e(?:[ds])|sts?))))|rd(?:ance|e(?:d|rs?)|s|i(?:ng(?:ly)?|ons?))?|st(?:ed|s|ing)?|unt(?:a(?:b(?:l(?:[ye])|ility)|n(?:cy|ts?))|ed|s|ing)?)|e(?:ler(?:at(?
rs?|e(?:[ds])?|i(?:ng|ons?))|ometers?)|nt(?:ed|s|ing|ua(?:l|t(?:e(?:[ds])?|i(?:ng|on))))?|de(?:[ds])?|pt(?:a(?:b(?:l(?:[ye])|ility)|nces?)|ors?|e(?:d|rs?)|s|ing)?|ss(?
r(?:[ys]|ies)|e(?:[ds])|i(?:b(?:l(?:[ye])|ility)|ng|ons?))?)|r(?:e(?:dit(?:ations?|ed)?|tions?)|u(?:e(?:[ds])?|ing))|ident(?:ly|al(?:ly)?|s)?|u(?:lturat(?:e(?:[ds])?|i(?:ng|on))|mulat(?
rs?|e(?:[ds])?|i(?:ng|ons?))|r(?:a(?:c(?:y|ies)|te(?:ly|ness)?)|sed)|s(?:a(?:l|ti(?:ve|ons?))|e(?:[drs])?|tom(?:ed|s|ing)?|ing(?:ly)?)))|o(?:lytes?|rns?|ustic(?:al(?:ly)?|s|ian)?)|e(?:s|t(?:ate|ylene|one))?|qu(?:aint(?:ances?|ed|s|ing)?|i(?:esc(?:e(?:n(?:ce|t)|d|s)?|ing)|r(?:able|e(?:[ds])?|ing)|siti(?:ve(?:ness)?|ons?)|t(?:s|t(?:al|e(?:[dr])|ing))?))|r(?:ylic|o(?:bat(?:s|ics?)?|nyms?|polis|ss)|e(?:age|s)?|i(?:mon(?:y|ious)|d))|h(?:e(?:[ds])?|i(?:ng|ev(?:able|e(?:ments?|d|rs?|s)?|ing)))|t(?
rs?|ed|ress(?:es)?|i(?:v(?:at(?
rs?|e(?:[ds])?|i(?:ng|ons?))|e(?:ly)?|i(?:s(?:m|ts?)|t(?:y|ies)))|n(?
meters?|g|ium)|ons?)|ua(?:l(?:ly|s|i(?:zation|t(?:y|ies)))?|rial(?:ly)?|t(?
rs?|e(?:[ds])?|ing)))?|id(?:ly|s|i(?:c|t(?:y|ies))|ulous)?|u(?:men|te(?:ly|ness)?|ity))|d(?:v(?:an(?:c(?:e(?:ments?|d|s)?|ing)|tage(?
us(?:ly)?|d|s)?)|oca(?:cy|t(?:e(?:[ds])?|ing))|e(?:nt(?:i(?:sts?|tious)|ur(?
us|e(?:d|rs?|s)?|ing))?|r(?:b(?:s|ial)?|s(?:ar(?:y|ies)|e(?:ly)?|it(?:y|ies))|t(?:is(?:e(?:ments?|d|rs?|s)?|ing))?))|i(?:ce|s(?:ab(?:l(?:[ye])|ility)|or(?:[ys])?|e(?:ments?|d(?:ly)?|es?|rs?|s)?|ing)))|a(?:mant(?:ly)?|pt(?:a(?:b(?:le|ility)|tions?)|ors?|e(?:d|rs?)|s|i(?:ve(?:ly)?|ng))?|g(?:es?|ios?))|m(?
ni(?:sh(?:ments?|e(?:[ds])|ing)?|tions?)|i(?:x(?:e(?:[ds])|ture)?|nist(?:er(?:ed|s|ings?)?|ra(?:ble|t(?
rs?|e|i(?:ve(?:ly)?|ons?))))|r(?:a(?:l(?:s|ty)?|bl(?:[ye])|tions?)|e(?:d|rs?|s)?|ing(?:ly)?)|ssi(?:b(?:le|ility)|ons?)|t(?:s|t(?:ance|e(?:d(?:ly)?|rs?)|ing))?))|o(?:lescen(?:ce|ts?)|be|pt(?:e(?:d|rs?)|s|i(?:ve|ng|ons?))?|r(?:a(?:ble|tion)|n(?:ments?|ed|s)?|e(?:[ds])?))?|d(?:e(?:nd(?:a|um)?|d|rs?)|ress(?:ab(?:le|ility)|e(?:d|es?|rs?|s)|ing)?|s|i(?:ng|ct(?:ed|s|i(?:ng|ons?))?|ti(?:v(?:es?|ity)|on(?:al(?:ly)?|s)?))|uc(?:e(?:[ds])?|t(?
r|ed|s|i(?:ng|on))?|i(?:ble|ng)))?|e(?
t|qua(?:c(?:y|ies)|te(?:ly)?))|r(?
it(?:ness)?|enal(?:ine)?|ift)|s(?
r(?:b(?:ed|s|ing)?|ption))?|he(?:r(?:e(?:n(?:ce|ts?)|d|rs?|s)?|ing)|si(?:ves?|ons?))|i(?:abatic(?:ally)?|eu)|u(?:l(?:at(?:e|i(?:ng|on))|t(?:er(?:at(?:e(?:[ds])?|ing)|y|ous(?:ly)?|ers?)|s|hood)?)|mbrat(?:e(?:[ds])?|i(?:ng|on)))|j(?:acen(?:cy|t)|o(?:in(?:ed|s|ing)?|urn(?:ment|ed|s|ing)?)|ectives?|u(?:ncts?|d(?:g(?:e(?:[ds])?|ing)|icat(?:e(?:[ds])?|i(?:ng|ons?)))|r(?:e(?:[ds])?|ing)|st(?:abl(?:[ye])|ments?|ors?|e(?:d|rs?)|s|ing)?|tants?)))?|e(?:r(?:at(?
rs?|e(?:[ds])?|i(?:ng|on))|o(?:acoustic|bics?|nautic(?:al|s)?|dynamics?|s(?
l(?:s|ize)?|pace))|ials?)|gis|sthetic(?:ally|s)?)|f(?:l(?:ame|oat)|ar|o(?
t|re(?:mentioned|said|thought)?|ul)|f(?:l(?:ict(?:ed|s|i(?:ve|ng|ons?))?|uen(?:ce|t))|a(?:ble|irs?)|ord(?:able|ed|s|ing)?|e(?:ct(?:ations?|ed|s|i(?:ve|ng(?:ly)?|on(?:ate(?:ly)?|s)?))?|rent)|r(?
nt(?:ed|s|ing)?|i(?:cates?|ght))|i(?:liat(?:e(?:[ds])?|i(?:ng|ons?))|anced|x(?:e(?:[ds])|ing)?|nit(?:y|ies)|davits?|rm(?:ati(?:ve(?:ly)?|ons?)|ed|s|ing)?))|r(?:aid|esh)|t(?:er(?:wards?|life|m(?:ath|ost)|noons?|effect|glow|shocks?|thoughts?|image)?)?|i(?:cionado|eld|re))|g(?:l(?
w|eam)|a(?
e|r|tes?|in(?:st)?)|nostics?|o(?:n(?:y|i(?:z(?:e(?:[ds])?|ing(?:ly)?)|es))|g)?|e(?:less|n(?:c(?:y|ies)|das?|ts?)|d|rs?|s)?|r(?:arian|ee(?:abl(?:[ye])|ments?|d|rs?|s|ing)?|icultur(?:al(?:ly)?|e))|g(?:l(?
merat(?:e(?:[ds])?|ion)|utin(?:at(?:e(?:[ds])?|i(?:ng|on))|ins?))|r(?:a(?:vat(?:e(?:[ds])?|ion)|ndize)|e(?:gat(?:e(?:[ds]|ly)?|i(?:ng|ons?))|ss(?
rs?|i(?:ve(?:ly|ness)?|ons?)))|iev(?:e(?:[ds])?|ing)))|hast|i(?:l(?:e(?:ly)?|ity)|ng|tat(?
rs?|e(?:[ds])?|i(?:ng|ons?)))|ue)|h(?:ead)?|i(?:l(?:ments?|erons?|ing)?|m(?:less(?:ly)?|e(?:d|rs?)|s|ing)?|d(?:ed?|s|ing)?|r(?:ways?|l(?
cks?|ess|i(?:ne(?:[rs])?|fts?))|m(?:a(?:n|ils?)|en)|b(?:ags?|orne)|y|craft|drops?|p(?:lanes?|orts?)|e(?:d|rs?)|f(?:low|are|oils?|rames?|ields?)|s(?
(?:ace|eed)|hips?|trips?)?|tight|i(?:ly|ngs?))?|sle)|jar)|A(?:k(?:ers|ron)|LGOL|l(?:v(?:a(?:rez)?|in)|l(?:a(?:[nh])|yn|e(?:n(?:dale|town)?|g(?:ra|hen(?:y|ies)))|state|is(?
n)?)|a(?:m(?
s?|eda)|bam(?:a(?:ns)?|ian)|n|ddin|r|s(?:kan?|tair))|maden|b(?:an(?:y|ia(?:ns?)?)|er(?:t(?:[ao])?|ich)|r(?:echt|ight)|uquerque)|yssa|c(?:mena|o(?:a|tt)|estis|ibiades)|d(?:e(?:baran|n)|rich)|p(?:ert|s|h(?
nse|eratz))|e(?:x(?:and(?:er|r(?:a|e|i(?:a|ne)))|ei|is)?|ck?|ut(?:ian)?)|f(?:a|onso|redo?)|g(?
(?:l|nqui(?:an|n))|e(?:nib|r(?:ian?)?)|iers)|s(?:atians?|op)|hambra|t(?:air|o(?:[ns])|haea)|i(?:c(?:e|ia)|s(?
n|tair))?)?|ar(?
n|hus)|m(?:a(?:nda|zons?|deus|rillo)|m(?:an|erman)|y|o(?:ntillado|co|s)|dahl|pex|e(?:lia|r(?:ada|ica(?:n(?:a|s|i(?:z(?:ations?|e(?:rs?|s)?)|sm))?|s)?)|s)|sterdam|h(?:aric|erst)|trak|iga)|b(?:aba|b(?:[ay]|ott)|yssinia(?:ns?)?|ner|os?|e(?:l(?:son|ian)?|r(?:nathy|deen))?|ra(?:m(?:s(?
n)?)?|ham)|i(?:lene|djan|gail)|u)|n(?:kara|a(?:lects|b(?:aptists?|el)|creon|stasia|heim|tol(?:e|ian?))|n(?:a(?:list(?:ic)?|polis)?|e(?:tte)?|ie)?|d(?:alusia(?:ns?)?|y|o(?:ver|rra)|e(?:an|rs(?
n|en)|s)|r(?
m(?:ache|eda)|e(?:ws?|a|i)?))|g(?:l(?
(?
h(?
bia|ilia))?|es|i(?:a|can(?:s|i(?:zes?|sm))?))|o(?:la|ra)|el(?:a|o|e(?:nos?|s)|i(?:n(?:[ae])|ca))|ie|us)|heuser|ita)|c(?:k(?:ley|erman)|a(?:dia|pulco)|cra|h(?:aeans?|illes)|t(?:a(?:eon)?|on|s))|d(?:kins|ler(?:ian)?|a(?:m(?:s(?
n)?)?|ir)?|o(?:lph(?:us)?|nis)|d(?:ressograph|is(?
n)?)|e(?:l(?:aide|e|ia)|n)|ri(?:a(?:n|tic)|enne)|irondacks?)|e(?:ne(?:as|id)|olus|robacter|gean|s(?:chylus|op))|f(?:ri(?:ka(?:ans|ners?)|ca(?:n(?:s|iz(?:ations?|e(?:[ds])?|ing))?)?)|ghan(?:s|istan)?)|g(?:way|a(?:memnon|tha)|ne(?:[ws])|ee|ricola|gies?)|hm(?:adabad|edabad)|i(?:ken|leen|nus?|d(?:a|es)|r(?:bus|e(?:dale|s))|tken)|jax)/