Dr said:
JRS: In article <[email protected]>, dated Fri, 21 Apr 2006 10:23:41 remote, seen in
If you can alter the RegExp by inserting extra parentheses so that
everything is matched, then you could sum the lengths of all lower
matches.
This is, in effect, what I have done, code provided below. However, it
is a non-trivial process that must account for nested parentheses
(...(...()...()...)...(...()...)...), back references (\#), and
non-capturing subexpressions (?:...).
Or you could then, with .replace, substitute all lower matches to "",
and see by how much the length has changed.
But I don't know whether that would always work with sufficiently
complex RegExps.
You could .replace the parameter in question with an Unreasonable String
(it is, after all, Unicode) and then do indexOf(that US).
I appreciate the brainstorming. Back references render the remaining
above ideas unworkable, as far as I can tell. Below is a function I
coded up which does the job. It works by introducing parens ending at
the start of the specified capturing parens [those are parens that
don't start with (?:] and stretching back to the start of the
containing capturing parens. Of course the containing paren's position
must be identified, too, so you get the idea this is recursive. The
complete listing of the function in all its gory glory follows (not
extensively tested).
Csaba Gabor from Vienna
function regExpPos (text, re, parenNum) {
  // Returns the starting position, within text, of the parenNum-th
  // capturing parens of the RegExp, re, for the first match of re in text.
  //
  //   text     - the string to search
  //   re       - the RegExp; its g and y flags are ignored (first match
  //              only), all other flags are kept
  //   parenNum - 1-based number of the capturing parens of interest
  //
  // Return value:
  //   -1                     if re does not match text
  //   start of the match     if parenNum is 0 (or omitted)
  //   end of the match       if parenNum exceeds the number of capturing
  //                          parens in re
  //   start of the parens    otherwise
  //
  // How it works: a group's start is the start of the group that contains
  // it plus the length of whatever is matched between the container's
  // start and the group's start.  That length is measured by wrapping this
  // "prefix" in an extra pair of capturing parens and matching again; the
  // container's start is then found the same way, recursively, until the
  // whole pattern is reached (whose start is simply the match index).
  // Containers may be capturing or not ((?:...), lookaheads), and the
  // prefix never reaches back past a "|" of the container, so wrapping it
  // does not change what the pattern matches.
  //
  // NOTE: if the parens (or one of their containers) did not take part in
  // the match, sit inside a lookbehind, or are repeated by a quantifier,
  // the value returned is only where this scheme would place them and may
  // not be a real capture position.
  var src = re.source;
  var flags = (typeof re.flags === "string")
    ? re.flags.replace(/[gy]/g, "")
    : (re.ignoreCase ? "i" : "") + (re.multiline ? "m" : "");
  // aGroup[k] describes the k-th "(" of src (capturing or not), k >= 1;
  // aGroup[0] stands for the whole pattern.
  //   open   - index of the "(" in src
  //   inner  - index of the first character after the group's header
  //   parent - index in aGroup of the directly containing group
  //   bars   - indices of the "|" that sit directly inside this group
  var aGroup = [{ open: 0, inner: 0, parent: -1, bars: [] }];
  var aCapture = [0];   // capture number -> index into aGroup
  var aOpen = [0];      // stack of the groups that are currently open
  var inClass = false;  // true while inside a character class [...]
  var i, chr, head, m;

  if (!parenNum) return startOf(0);   // terminating case

  // Determine position and nesting of every group in the pattern.
  for (i = 0; i < src.length; ++i) {
    chr = src.charAt(i);
    if (chr === "\\") { ++i; continue; }   // skip the escaped character
    if (inClass) { if (chr === "]") inClass = false; continue; }
    if (chr === "[") inClass = true;
    else if (chr === "(") {
      head = groupHead(i);
      aGroup.push({ open: i, inner: head.inner,
                    parent: aOpen[aOpen.length - 1], bars: [] });
      if (head.capturing) aCapture.push(aGroup.length - 1);
      aOpen.push(aGroup.length - 1);
    }
    // every ")" closes the innermost open group, capturing or not
    else if (chr === ")") aOpen.pop();
    else if (chr === "|") aGroup[aOpen[aOpen.length - 1]].bars.push(i);
  }

  if (parenNum >= aCapture.length) {   // no such parens: end of the match
    m = firstMatch(src);
    return m ? m.index + m[0].length : -1;
  }
  return startOf(aCapture[parenNum]);

  // First match of pattern (with re's flags) in text, or null.
  function firstMatch (pattern) {
    return new RegExp(pattern, flags).exec(text);
  }

  // Classifies the group whose "(" is at src index pos.
  function groupHead (pos) {
    var c2 = src.charAt(pos + 2), c3 = src.charAt(pos + 3);
    if (src.charAt(pos + 1) !== "?")
      return { capturing: true, inner: pos + 1 };                // (...)
    if (c2 === "<" && c3 !== "=" && c3 !== "!")
      return { capturing: true, inner: src.indexOf(">", pos) + 1 }; // (?<name>
    if (c2 === "<")
      return { capturing: false, inner: pos + 4 };               // (?<= (?<!
    if (c2 === ":" || c2 === "=" || c2 === "!")
      return { capturing: false, inner: pos + 3 };               // (?: (?= (?!
    return { capturing: false, inner: src.indexOf(":", pos) + 1 }; // (?flags:
  }

  // Copy of part (a piece of src that starts outside any character class)
  // in which every numeric back reference above nBefore is raised by one,
  // to make room for a capturing group inserted as number nBefore+1.
  function shiftBackRefs (part, nBefore) {
    var out = "", inSet = false, p, q, c, n;
    for (p = 0; p < part.length; ++p) {
      c = part.charAt(p);
      if (c !== "\\") {
        if (inSet) { if (c === "]") inSet = false; }
        else if (c === "[") inSet = true;
        out += c;
        continue;
      }
      q = p + 1;
      while (/\d/.test(part.charAt(q))) ++q;
      n = Number(part.slice(p + 1, q));
      // \0, escapes inside [...] and numbers beyond the last capturing
      // parens are not back references and are copied unchanged
      if (!inSet && q > p + 1 && part.charAt(p + 1) !== "0" &&
          n > nBefore && n < aCapture.length) {
        out += "\\" + (n + 1);
        p = q - 1;
      } else {
        out += part.slice(p, p + 2);
        ++p;
      }
    }
    return out;
  }

  // Position in text where group aGroup[k] starts, or -1 if no match.
  function startOf (k) {
    var group, outer, from, j, nBefore, pattern, found, base;
    if (!k) {
      found = firstMatch(src);
      return found ? found.index : -1;
    }
    group = aGroup[k];
    outer = aGroup[group.parent];
    // the prefix starts at the container's content, or just after the
    // last "|" of the container that precedes the group
    from = outer.inner;
    for (j = 0; j < outer.bars.length && outer.bars[j] < group.open; ++j)
      from = outer.bars[j] + 1;
    if (from === group.open) return startOf(group.parent);   // empty prefix
    // the inserted parens become capture number nBefore+1
    nBefore = 0;
    while (nBefore + 1 < aCapture.length &&
           aGroup[aCapture[nBefore + 1]].open < from) ++nBefore;
    pattern = shiftBackRefs(src.slice(0, from), nBefore) + "(" +
              shiftBackRefs(src.slice(from, group.open), nBefore) + ")" +
              shiftBackRefs(src.slice(group.open), nBefore);
    found = firstMatch(pattern);
    if (!found) return -1;
    base = startOf(group.parent);
    if (base < 0) return -1;
    // a prefix that did not take part in the match counts as empty
    return base + (found[nBefore + 1] == null ? 0 : found[nBefore + 1].length);
  }
}