Fast search for all positions in a string

C

czechboy

Hi,
what is the fastest way to search for all occurences of a string in a
very long string (from several KB to MB). What I want to get is an
array of numbers with positions of each occurence.
thanks
 
B

Bart Van der Donck

czechboy said:
what is the fastest way to search for all occurences of a string in a
very long string (from several KB to MB). What I want to get is an
array of numbers with positions of each occurence.

var str = 'fdda54kokDdggfA54lf8754a544ro9a54ypx9Wa54z'
var mat = 'a54'
var arr = str.split(mat)
var res = new Array()
var r = 0
for (var i=0; i<arr.length-1; ++i) {
res = 1 + r + arr.length
r += arr.length + mat.length
}
alert(res)

Takes ca. 1 second runtime on my Firefox 2.0.0.14. for a 1MB string.
MSIE is slower as expected. I don't see relevant possible performance
boosts unless finding a way to not loop the array (which I don't think
is possible here). Generally I would recommend to avoid such heavy
strings.

The search is case-sensitive.

Hope this helps,
 
J

Jorge

Also I would like to mention that I need case insensitive search.

<html><head><style>text { font-size: 16px; }</style>
<script>
window.onload= function () {
var txt= "", b= [], i, d= document, t,
y= function (p) { return d.createElement(p) },
getMS= function (p) { return (new Date()).getTime()-p };

txt+= " Lorem ipsum dolor sit amet, consectetuer adipiscing.";
txt+= " Nullam consequat lectus sit amet enim. Proin , ipsum";
txt+= " eu tincidunt iaculis, massa enim quam, at molestie ";
txt+= "enim tellus ac dolor. Aenean tempor tortor. ";
txt+= "Vestibulum sed sem. Curabitur sed erat a nibh tristique.";
for (i=0; i<6; i++) { txt+= txt+txt+txt+txt }

t= "txt.length= " + txt.length;
t+= " : \""+txt.substr(0,30)+"...\"<br><br>";
(d.body.appendChild(y('text'))).innerHTML= t;

//Replace this f() with others' versions to compare them.
function search (whatTxt, inTxt) {
var l, p, a= [], i= 0;
inTxt= inTxt.toLowerCase();
l= (whatTxt= whatTxt.toLowerCase()).length;
while ((p= inTxt.indexOf(whatTxt,i)) >= 0) {
a.push(p);
i= p+l;
}
return a;
}

b= ["lorem","IPSUM","veni","iMac","CoNsEqUaT","."," ","r","iN"];
b.push(" LoReM iPsUm DoLoR ");
setTimeout(function () {
var t, a, mS, c= b.shift();
mS= getMS(0);
a= search(c,txt);
mS= getMS(mS);
t= "\""+c+"\" : "+a.length+" : "+mS+" mS<br>";
(d.body.appendChild(y('text'))).innerHTML= t;
if (b.length) { setTimeout(arguments.callee, 111) } else {
(d.body.appendChild(y('text'))).innerHTML= "Done.";
}
}, 1);
};
</script></head><body></body></html>

I just turn it all to lowercase before matching,
so it's case-insensitive. But that's not very smart : there
has to be a better way. (RegExps /i ?).

Replace search () with other versions to compare.

Gives me this results in Safari/Mac :
The test string is ~4Mb long.

txt.length= 4031250 : " Lorem ipsum dolor sit amet, c..."

"lorem" : 15625 : 121 mS
"IPSUM" : 31250 : 144 mS
"veni" : 0 : 128 mS
"iMac" : 0 : 125 mS
"CoNsEqUaT" : 15625 : 130 mS
"." : 93750 : 173 mS
" " : 625000 : 648 mS
"r" : 187500 : 258 mS
"iN" : 46875 : 156 mS
" LoReM iPsUm DoLoR " : 15625 : 146 mS
Done.

And this in IE8b/WinXP :

txt.length= 4031250 : " Lorem ipsum dolor sit amet, c..."

"lorem" : 15625 : 190 mS
"IPSUM" : 31250 : 310 mS
"veni" : 0 : 140 mS
"iMac" : 0 : 150 mS
"CoNsEqUaT" : 15625 : 230 mS
"." : 93750 : 491 mS
" " : 625000 : 2804 mS
"r" : 187500 : 530 mS
"iN" : 46875 : 331 mS
" LoReM iPsUm DoLoR " : 15625 : 250 mS
Done.

HTH,
--Jorge.
 
J

Jorge

   var str = 'fdda54kokDdggfA54lf8754a544ro9a54ypx9Wa54z'
   var mat = 'a54'
   var arr = str.split(mat)
   var res = new Array()
   var r = 0
   for (var i=0; i<arr.length-1; ++i)  {
     res = 1 + r + arr.length
     r += arr.length + mat.length
   }
   alert(res)


Pasted in as :

//Replace this f() with others' versions to compare them.
function search (whatTxt, inTxt) {
var i, arr= inTxt.split(whatTxt),
res= [], r= 0;

for (i=0; i<arr.length-1; ++i) {
res= 1 + r + arr.length;
r+= arr.length + whatTxt.length;
}

return res;
}

Gives :

Safari/Mac :
txt.length= 4031250 : " Lorem ipsum dolor sit amet, c..."

"Lorem" : 15625 : 21 mS
"ipsum" : 31250 : 52 mS
"veni" : 0 : 10 mS
"iMac" : 0 : 19 mS
"consequat" : 15625 : 39 mS
"." : 93750 : 109 mS
" " : 625000 : 706 mS
"r" : 187500 : 278 mS
"in" : 46875 : 85 mS
" Lorem ipsum dolor " : 15625 : 49 mS
Done.

IE8b/WinXP :

txt.length= 4031250 : " Lorem ipsum dolor sit amet, c..."

"Lorem" : 15625 : 150 mS
"ipsum" : 31250 : 281 mS
"veni" : 0 : 90 mS
"iMac" : 0 : 100 mS
"consequat" : 15625 : 171 mS
"." : 93750 : 601 mS
" " : 625000 : 3695 mS
"r" : 187500 : 1112 mS
"in" : 46875 : 190 mS
" Lorem ipsum dolor " : 15625 : 180 mS
Done.

You win... :)

Although that's case-sensitive.

--Jorge.
 
H

Henry

On Jun 4, 11:58 am, czechboy wrote:
txt.length= 4031250 : " Lorem ipsum dolor sit amet, c..."

"lorem" : 15625 : 121 mS
"IPSUM" : 31250 : 144 mS
"veni" : 0 : 128 mS
"iMac" : 0 : 125 mS
"CoNsEqUaT" : 15625 : 130 mS
"." : 93750 : 173 mS
" " : 625000 : 648 mS
"r" : 187500 : 258 mS
"iN" : 46875 : 156 mS
" LoReM iPsUm DoLoR " : 15625 : 146 mS
Done.

And this in IE8b/WinXP :

txt.length= 4031250 : " Lorem ipsum dolor sit amet, c..."

"lorem" : 15625 : 190 mS
"IPSUM" : 31250 : 310 mS
"veni" : 0 : 140 mS
"iMac" : 0 : 150 mS
"CoNsEqUaT" : 15625 : 230 mS
"." : 93750 : 491 mS
" " : 625000 : 2804 mS
"r" : 187500 : 530 mS
"iN" : 46875 : 331 mS
" LoReM iPsUm DoLoR " : 15625 : 250 mS
Done.

It addition to running timing tests in different browsers and on
different OSs it is also necessary to know the type of processor being
used for the tests and to perform the tests on different types of
processors before any useful picture of how meaningful the are can be
established. This is because processors are optimised in different
ways and so any tests on a single processor may say more about
artefacts of that processors design than about javascript or its
implementations. For example, using the same browser and the same OS
it was observed that on a P4 a regular expression text was faster than
direct string comparison (which is unintuitive given the relative
complexity of the two operations), while on a P3 the reverse was true.

In any event, given the OPs desire for case insensitive testing the
overheads of converting a very large mixed case string into a upper or
lower case normalised string is important for performance testing case
sensitive searching by string comparison.

The otherwise often problematic characteristic of Regular expression
where when the global flag is set the - lastIndex - property of the
regular expression instance is not re-set following a single match is
likely to exist for precisely this sort of situation, so here is a
regular expression version (which is case insensitive):-

<html>
<head>
<title></title>
</head>
<body>
<pre>
<script type="text/javascript">

var data = "XXXXXXX ... XXXXXXXXX "; // Long string

var testSt = 'xxxxxx'; // The substring to match.

var testStringLength = testSt.length;// To avoid re-resolving the
// property accessor in the loop.

/* Note that the sting used to construct the regular expression
really should be passed through an escaping process that
turns all characters that are significant in regular expressions
(such as '[' or '.') into (one of) their escape sequence
equivalents.
*/
var RX = new RegExp(testSt, 'gi') // Without the global flag
// this will loop forever.


var out = []; // The array that will contain
// the start indexes of all
// matches found
while(RX.test(data)){
/* The - lastIndex - property of the regular expression instance
is the index of the first character _after_ the match, and so
needs to be reduced by the length of the match in order to
find the beginning of the march in the original string. As
this regular expression is matching a fixed sequence of
characters it is valid to assume that the length of that
sequence is also the length of any match found.
*/
out.push((RX.lastIndex - testStringLength));
}

for(var c = 0,len = out.length;c < len;++c){
document.write(
(c+':\t'+out[c]+
'\t'+data.substring(out[c], (out[c]+testStringLength))
)+'\n'
);

}
</script>
</pre>
</body>
<html>
 
D

Dr J R Stockton

In comp.lang.javascript message <ad6736ef-bbc7-4e37-a70e-2e65c3f286f0@r6
6g2000hsg.googlegroups.com>, Wed, 4 Jun 2008 02:53:29, czechboy
what is the fastest way to search for all occurences of a string in a
very long string (from several KB to MB). What I want to get is an
array of numbers with positions of each occurence.

This could perhaps be developed into a clean method :-

S = "0aaaa1bbbb2222ccccccc3dd"
A = []
J = 10 // safety-belt
R = new RegExp("\\d+", "gi")

while (J--) {
M = R.exec(S)
if (!M) break
A.push(RegExp.lastIndex-RegExp.lastMatch.length)
}
A // Result -> [0,5,10,21]

It's a good idea to read the newsgroup c.l.j and its FAQ. See below.
 
J

Jorge

In any event, given the OPs desire for case insensitive testing the
overheads of converting a very large mixed case string into a upper or
lower case normalised string is important for performance testing case
sensitive searching by string comparison.

The otherwise often problematic characteristic of Regular expression
where when the global flag is set the - lastIndex - property of the
regular expression instance is not re-set following a single match is
likely to exist for precisely this sort of situation, so here is a
regular expression version (which is case insensitive):-

Your version, Bart's and mine, compared :

<html><head><style>text { font-size: 16px; }</style>
<script>
window.onload= function () {
var txt, d= document, t,
y= function (p) { return d.createElement(p) },
getMS= function (p) { return (new Date()).getTime()-p };

txt= " Lorem ipsum dolor sit amet, consectetuer adipiscing.";
txt+= " Nullam consequat lectus sit amet enim. Proin , ipsum";
txt+= " eu tincidunt iaculis, massa enim quam, at molestie ";
txt+= "enim tellus ac dolor. Aenean tempor tortor. ";
txt+= "Vestibulum sed sem. Curabitur sed erat a nibh tristique.";

for (i=0; i<6; i++) { txt+= txt+txt+txt+txt }

t= "txt.length= " + txt.length;
t+= " : \""+txt.substr(0,30)+"...\"<br><br>";
(d.body.appendChild(y('text'))).innerHTML= t;

/*By Jorge*/
function search0 (whatTxt, inTxt) {
var l, p, a= [], i= 0;
inTxt= inTxt.toLowerCase();
l= (whatTxt= whatTxt.toLowerCase()).length;
while ((p= inTxt.indexOf(whatTxt,i)) >= 0) {
a.push(p);
i= p+l;
}
return a;
};

/*By Bart, modified by Jorge: added .toLowerCase()*/
function search1 (whatTxt, inTxt) {
var l, i, arr, res= [], r= 0;
inTxt= inTxt.toLowerCase();
l= (whatTxt= whatTxt.toLowerCase()).length;
arr= inTxt.split(whatTxt);
for (i=0; i<arr.length-1; ++i) {
res= 1 + r + arr.length;
r+= arr.length + l;
}
return res;
};

/*By Bart, modified by Jorge: .split() with RegExp*/
function search2 (whatTxt, inTxt) {
var i, res= [], r= 0,
l= whatTxt.length;
arr= inTxt.split(new RegExp(whatTxt,"i"));
for (i=0; i<arr.length-1; ++i) {
res= 1 + r + arr.length;
r+= arr.length + l;
}
return res;
};

/* By Henry*/
function search3 (whatTxt, inTxt) {
var RX, out= [];
testStringLength= whatTxt.length;
/*To avoid re-resolving the property accessor in the loop.*/

/* Note that the string used to construct the regular expression
really should be passed through an escaping process that
turns all characters that are significant in regular expressions
(such as '[' or '.') into (one of) their escape sequence
equivalents.
*/
RX = new RegExp(whatTxt, 'gi');
/* Without the global flag this will loop forever.*/

while (RX.test(inTxt)) {
/* The - lastIndex - property of the regular expression instance
is the index of the first character _after_ the match, and so
needs to be reduced by the length of the match in order to
find the beginning of the march in the original string. As
this regular expression is matching a fixed sequence of
characters it is valid to assume that the length of that
sequence is also the length of any match found.
*/
out.push((RX.lastIndex - testStringLength));
}
return out;
};

function wrap (proc, what, i) {
var t, a, mS;
mS= getMS(0);
a= proc(what, txt);
mS= getMS(mS);
t= mS+" mS (search"+i+") : \""+what+"\" : "+a.length;
(d.body.appendChild(y('text'))).innerHTML= t+"<br>";
};

var procsIdx= 0, testsIdx= 0;
var tests= ["lorem","IPSUM","CoNsEqUaT","a"," ",
"iMac"," LoReM iPsUm DoLoR "];
var procs= [search0, search1, search2, search3];

setTimeout(function () {
wrap(procs[procsIdx], tests[testsIdx], procsIdx);
if (++procsIdx >= procs.length) {
procsIdx= 0;
d.body.appendChild(y('br'));
if (++testsIdx >= tests.length) {
(d.body.appendChild(y('text'))).innerHTML= "Done.";
return;
}
}
setTimeout(arguments.callee, 99);
}, 1);
};
</script></head><body></body></html>

Run it in different browsers... and see... :)

--Jorge.
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only take a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

Forum statistics

Threads
473,755
Messages
2,569,535
Members
45,007
Latest member
obedient dusk

Latest Threads

Top