R
Rahul
Hi Everybody
I have a problem with my script; please help me. The script file is below.
I have one *.inq file, and I want to run this script on XML files. But the
script reports an error: [Line No. 1(*)]. If you want, I can attach the script
file and the .inq file. I can't understand this error. Please advise.
You can contact me at my Yahoo ID (e-mail address removed). I am online now.
Please help.
from implib.filemanip import *
from implib.caseconv import convert_case, CASETAGS as _CASETAGS
import sys, os, getopt, string, random
# Turn off pre deprecation warnings (for >= Python 2.3).
try:
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
except:
pass
""""Amongst our weaponry are such diverse elements as
fear, surprise, ruthless efficiency, an almost fanatical devotion
to the Pope, and nice red uniforms -- Oh damn!"
For information on how to use The Spanish Inquisition, which allows
you to search and replace text in multiple files, simply run this
program without supplying any arguments or supply one of the
standard help options on the command line: -h, -?, --help.
"""
##### inquisition (search-and-replace) file search #####
# ------------------------------------------------------------
# function: get_inqfiles
# ------------------------------------------------------------
def get_inqfiles(filelist):
    """Locate inquisition and option files next to the supplied files.

    Usage: get_inqfiles(filelist) --> (inqfiles, optfiles)

    Every directory named by a path in filelist is searched, plus the
    current working directory.  Relative directories are made absolute
    first; entries in filelist that carry no directory part contribute
    nothing, so for bare filenames only the current working directory is
    searched.

    Inquisition files are identified by the ".inq" and ".data" extensions;
    option files are identified by the ".opt" extension (the comparison is
    case sensitive).  Both returned lists hold full paths and are ordered
    by filenamesort().
    """
    # collect each directory once (absolute paths would be provided on a
    # mac, for instance)
    search_dirs = set()
    for path in filelist:
        dirname = os.path.dirname(path)
        if dirname:
            if not os.path.isabs(dirname):
                dirname = os.path.abspath(dirname)
            search_dirs.add(dirname)

    # os.path.dirname and os.path.abspath both drop a trailing separator,
    # so drop it from the cwd too -- but never from a root directory, where
    # stripping it would leave an empty (or drive-relative) path
    cwd = os.getcwd()
    if cwd.endswith(os.sep) and os.path.dirname(cwd) != cwd:
        cwd = cwd[:-1]
    search_dirs.add(cwd)

    # search each directory, in sorted order, for inquisition/option files
    inqfiles = []
    optfiles = []
    for dirname in sorted(search_dirs):
        for name in os.listdir(dirname):
            ext = os.path.splitext(name)[1]
            if ext in (".inq", ".data"):
                inqfiles.append(os.path.join(dirname, name))
            elif ext == ".opt":
                optfiles.append(os.path.join(dirname, name))

    # order both lists with the project's filenamesort helper
    return (filenamesort(inqfiles), filenamesort(optfiles))
##### inquisition file search-and-replace compilation #####
# ------------------------------------------------------------
# function: compile_searches
# ------------------------------------------------------------
def _expand_case_flags(replace, flag_regex):
    """Swap case conversion flags in a replace pattern for <convert_case> tags.

    Returns (new_replace, unclosed), where unclosed is the number of start
    flags minus the number of '\\=' end flags.  When exactly one conversion
    is left open, the closing tag is appended to new_replace.
    """
    case_names = {
        r"\C": "allcaps",
        r"\D": "downstylecaps",
        r"\L": "lowercase",
        r"\S": "sentencecaps",
        r"\U": "upstylecaps",
        r"\X": "swapcase",
    }
    case_start = '<convert_case case="%s">'
    case_end = '</convert_case>'

    # first mark all escaped backslashes in the pattern, just in case someone
    # wishes to insert literal text matching one of the case conversion flags
    replace = replace.replace(r"\\", "&escaped_backslash;")

    unclosed = 0
    casematch = flag_regex.search(replace)
    while casematch:
        casetype = casematch.group("case")
        if casetype == r"\=":
            tag = case_end
            unclosed -= 1
        else:
            # flag_regex only matches the letters listed in case_names
            tag = case_start % case_names[casetype]
            unclosed += 1
        replace = replace[:casematch.start()] + tag + replace[casematch.end():]
        casematch = flag_regex.search(replace)

    if unclosed == 1:
        replace = replace + case_end

    # re-insert escaped backslashes
    return replace.replace("&escaped_backslash;", r"\\"), unclosed


def compile_searches(inqfiles, using_sre=0):
    """Compile the search-replace pairs found in a list of inquisition files.

    Usage: compile_searches(inqfiles[, using_sre]) --> [(SEARCH, replace), ...]

    SEARCH is a compiled regular expression object (from re.compile) and
    replace is a string in which case conversion flags have been turned
    into <convert_case> tags.  using_sre is passed straight through to
    handle_re_flags() when flag declaration lines are validated.

    Each file is read with readfile(inqfile, 1) and scanned line by line.
    Only lines of the form '<id>:<tab(s)><pattern>' with id one of
    s, e, r, f, F are interpreted; every other line is ignored.

    On any error (unreadable or blank file, unmatched search or replace,
    blank search, pattern syntax error, bad flag, unbalanced case
    conversion flags) a message is printed and an empty list is returned.
    On success a summary line is printed and the list of pairs is returned.
    """
    regexes = []
    searchpair_count = 0
    LINE = re.compile(r"^(?P<id>[Ffser]):\t+(?P<pattern>.*$)")
    CASE_CONVERT_FLAGS = re.compile(r"(?P<case>\\[CDLSUX=])")

    # error codes returned by handle_re_flags() --> message templates
    flag_errors = {
        "err_verbose_flag":
            "ERROR: The 'x' and 'X' verbose pattern flags are illegal. "
            "See line %s in the %s inquisition file.",
        "err_unicode_flag":
            "ERROR: The 'u' and 'U' Unicode search flags are not currently supported. "
            "See line %s in the %s inquisition file.",
        "err_illegal_flag":
            "ERROR: Line %s in the %s inquisition file contains an illegal flag.",
    }
    # wording that distinguishes "s:" searches from "e:" searches in messages
    search_kinds = {"s": "", "e": "character escape "}

    for inqfile in inqfiles:
        lines = readfile(inqfile, 1)
        if not lines:
            print("ERROR: %s inquisition file not found or is blank." % inqfile)
            return []

        # initialize inquisition file-sensitive variables
        linecount = 0
        SEARCH = None
        search = ""
        replace = ""
        need_replace = 0
        local_flags = ""
        global_flags = ""

        for line in lines:
            linecount += 1
            linematch = LINE.search(line)
            if not linematch:
                continue  # not a declaration line: treated as a comment
            lineid = linematch.group("id")
            pattern = linematch.group("pattern")

            if lineid in search_kinds:  # "s" or "e": a search pattern
                # NOTE(review): the original built the "s" pattern with
                # r"%s" % text and the "e" pattern without it, but that
                # formatting is a no-op, so both kinds have always been
                # compiled identically.  Confirm whether "e" lines were
                # meant to have escapes decoded before compiling.
                kind = search_kinds[lineid]
                if need_replace:
                    print(("ERROR: The %ssearch pattern above line %s in the %s "
                           "inquisition file is missing its corresponding "
                           "replace pattern.") % (kind, linecount, inqfile))
                    return []
                search = pattern
                if not search:
                    print(("ERROR: The search declaration at line %s in the %s "
                           "inquisition file is blank.") % (linecount, inqfile))
                    return []
                need_replace = 1
                # compile the search pattern right now (it may need to be
                # redone if flags are present, but if there's a syntax error
                # an accurate line number will be reported in the message)
                try:
                    SEARCH = re.compile(search)
                except re.error as err_value:
                    print(("Syntax error in the %ssearch pattern at line %s in the %s "
                           "inquisition file.\n") % (kind, linecount, inqfile))
                    print("ERROR: %s" % (err_value.args[0],))
                    print("\t%s:\t%s" % (lineid, search))
                    return []

            elif lineid == "r":  # replace pattern
                if not need_replace:
                    print(("ERROR: The replace pattern at line %s in the %s "
                           "inquisition file has no corresponding search "
                           "pattern.") % (linecount, inqfile))
                    return []
                need_replace = 0
                searchpair_count += 1
                replace, unclosed = _expand_case_flags(pattern, CASE_CONVERT_FLAGS)
                if unclosed > 1:
                    print(("ERROR: The replace pattern at line %s in the %s "
                           "inquisition file requires one or more '\\=' end case "
                           "conversion flags.") % (linecount, inqfile))
                    return []
                elif unclosed < 0:
                    print(("ERROR: The replace pattern at line %s in the %s "
                           "inquisition file has at least one extra '\\=' end case "
                           "conversion flag.") % (linecount, inqfile))
                    return []

            elif lineid == "f":  # set a local flag for this pattern
                if pattern:
                    local_flags = handle_re_flags(pattern, using_sre)
                    if local_flags in flag_errors:
                        print(flag_errors[local_flags] % (linecount, inqfile))
                        return []
                else:
                    # it's possible someone will leave a local flag line
                    # blank, which is OK and must be registered
                    local_flags = "NO_FLAGS"

            elif lineid == "F":  # set a global flag from this point onward
                if pattern:
                    global_flags = handle_re_flags(pattern, using_sre)
                    if global_flags in flag_errors:
                        print(flag_errors[global_flags] % (linecount, inqfile))
                        return []
                else:
                    # a blank global flag line switches global flags off
                    global_flags = ""

            # add the search and replace regular expression pair to the list
            if SEARCH and not need_replace:
                if not local_flags:
                    local_flags = global_flags
                if local_flags and local_flags != "NO_FLAGS":
                    # recompile the search pattern with its flags
                    SEARCH = re.compile("(?" + local_flags + ")" + search)
                regexes.append((SEARCH, replace))
                # reset these items so it's impossible to append the same
                # pair again, regardless of how many lines separate each
                # search and replace pair
                SEARCH = None
                search = ""
                replace = ""
                local_flags = ""

        # ensure that the inquisition file has evenly matched pairs
        if need_replace:
            print(("ERROR: The search pattern near line %s in the %s inquisition "
                   "file is missing its corresponding replace pattern.")
                  % (linecount, inqfile))
            return []

    # whew!
    inqfile_count = len(inqfiles)
    if inqfile_count > 1:
        print(" (%s search-replace pairs found in %s inquisition files)"
              % (searchpair_count, inqfile_count))
    else:
        print(" (%s search-replace pairs found in one inquisition file)"
              % searchpair_count)
    return regexes
# ------------------------------------------------------------
# function: handle_re_flags
# ------------------------------------------------------------
def handle_re_flags(flags, using_sre=0):
    """Validate the flags entered on a flag declaration line.

    Usage: handle_re_flags(flags[, using_sre]) --> valid_flags

    valid_flags is returned as a string in a form compatible with the
    '(?iLmsux)' regular expression syntax: every letter lowercased except
    the locale flag, which is always returned as 'L'.

    Instead of valid flags, one of three error codes may be returned:

    'err_verbose_flag'  the 'x' or 'X' verbose pattern flag is present.
    'err_unicode_flag'  the 'u' or 'U' unicode search flag is present and
                        using_sre is zero.
    'err_illegal_flag'  any other character besides 'i', 'I', 'l', 'L',
                        'm', 'M', 's', or 'S' (and 'u' or 'U' when
                        using_sre is non-zero) is present, or flags is
                        empty.
    """
    if using_sre:
        VALID_FLAGS = re.compile(r"^[ilmsuILMSU]+$")
    else:
        VALID_FLAGS = re.compile(r"^[ilmsILMS]+$")

    if VALID_FLAGS.search(flags):
        # enforce the 'iLms' form of flags because the flags are compiled
        # directly into each search pattern using '(?iLms)'
        return flags.lower().replace("l", "L")

    lowered = flags.lower()
    if "x" in lowered:
        return "err_verbose_flag"
    # 'u' is only an error in its own right when the sre engine is not in
    # use; otherwise some *other* character made the flags invalid
    if "u" in lowered and not using_sre:
        return "err_unicode_flag"
    return "err_illegal_flag"
##### search-and-replace functions #####
# ------------------------------------------------------------
# function: search_replace
# ------------------------------------------------------------
def search_replace(text, regexes):
    """Run every search-replace pair, in order, over the supplied text.

    Usage: search_replace(text, regexes) --> newtext

    text is any string.
    regexes is a list of tuples where each tuple is a search-replace pair:
    [(SEARCH, replace), ... ].  Each SEARCH is expected to be a compiled
    regular expression object (re.compile); replace is always a string.

    If an IndexError, RuntimeError, or re.error is raised by a
    substitution, a description of the failing pair is printed and the
    string 'ERROR' is returned.

    After each successful substitution the replace string is tested
    against _CASETAGS; when it matches, _CASETAGS.sub(_replace, ...) is
    applied to the text.

    Progress is reported on stderr: ' [Processing', one '.' per pair, and
    a closing ']' once all pairs have been applied.
    """
    err_index = ("\nERROR: Backreference found in a replace without corresponding "
                 "parenthetical group in the search!\ns:\t%s\nr:\t%s")
    err_runtime = ("\nERROR: The following regular expressions caused this RuntimeError:\n "
                   "'%s'\ns:\t%s\nr:\t%s")
    err_non_contrib_group = (
        "\nERROR: %s. "
        "\nTry, for example, changing '(b)?' to '(b?)' or to '((?:b?))'."
        "\nAlso, don't do things like 'a|(b)', because '(b)' doesn't contribute if 'a' matches."
        "\ns:\t%s\nr:\t%s")

    sys.stderr.write(' [Processing')
    for SEARCH, replace in regexes:
        sys.stderr.write('.')
        try:
            text = SEARCH.sub(replace, text)
        except IndexError:
            print(err_index % (SEARCH.pattern, replace))
            return "ERROR"
        except RuntimeError as err_value:
            print(err_runtime % (err_value, SEARCH.pattern, replace))
            return "ERROR"
        except re.error as err_value:
            print(err_non_contrib_group % (err_value, SEARCH.pattern, replace))
            return "ERROR"
        # the replace carried case conversion tags: convert the tagged text
        if _CASETAGS.search(replace):
            text = _CASETAGS.sub(_replace, text)
    sys.stderr.write(']\n')
    return text
# ------------------------------------------------------------
# function: _replace
# ------------------------------------------------------------
def _replace(match):
    """Private substitution callback used by search_replace().

    Receives a match object exposing "text" and "case" named groups and
    returns the result of passing both to convert_case().
    """
    tagged_text = match.group("text")
    case_name = match.group("case")
    return convert_case(tagged_text, case_name)
##### random silliness #####
# ------------------------------------------------------------
# function: span_inq_quote
# ------------------------------------------------------------
def span_inq_quote(*spam):
    """Return a randomly selected quote from the Monty Python
    'Spanish Inquisition' sketch.

    Usage: span_inq_quote([spam, spam, spam, ...]) --> "funny quote"

    Actually, this function will eat nearly anything, not just spam.
    Eggs, bacon, sausage, and baked beans are particularly delightful.

    Arguments that are false are skipped; list and tuple arguments are
    flattened by one level.  String items are split into words on
    punctuation and whitespace, "and"/"or" are dropped, and a bean type
    followed by "beans" is rejoined into one item.  A remark is printed
    for every resulting item, followed by a separator line when at least
    one string item was remarked upon.
    """
    # flatten list/tuple arguments by one level, skipping empty arguments
    templist = []
    for food in spam:
        if not food:
            continue
        if isinstance(food, (list, tuple)):
            templist.extend(food)
        else:
            templist.append(food)

    WORDBRK = re.compile(r'''[/|\\<>\(\)\[\]{}"'@#$%&*+\-=^_`~,;:\.\?!\s]+''')
    bean_words = ["baked", "green", "pinto", "garbonzo", "fava", "navy",
                  "black", "red", "white", "yellow", "lima", "refried",
                  "string", "wax"]
    # each bean type is recognized in lower, UPPER, and Capitalized form;
    # kept as a list so unhashable items can still be tested against it
    types_of_beans = [variant
                      for word in bean_words
                      for variant in (word, word.upper(), word.capitalize())]

    all_food = []
    for item in templist:
        if not isinstance(item, str):
            all_food.append(item)
            continue
        for food in WORDBRK.split(item):
            if food in ("and", "or", "AND", "OR", "And", "Or"):
                continue
            if (food in ("beans", "BEANS", "Beans") and all_food
                    and all_food[-1] in types_of_beans):
                # glue "<type> beans" back together as a single food
                all_food[-1] = all_food[-1] + " " + food
            else:
                all_food.append(food)

    at_least_one = 0
    for food in all_food:
        if not food:
            continue
        if isinstance(food, str):
            at_least_one = 1
            if food in ("spam", "SPAM", "Spam"):
                print("Glorious spam! Wonderful spam!")
            elif food in ("eggs", "EGGS", "Eggs",
                          "bacon", "BACON", "Bacon",
                          "sausage", "SAUSAGE", "Sausage",
                          "baked beans", "BAKED BEANS", "Baked Beans"):
                print("I'm having spam, spam, %s, and spam." % food)
            else:
                print("But I don't like %s!" % food)
        else:
            try:
                print("You can't eat a %s!" % repr(food))
            except Exception:
                # best effort only: an item that cannot be described is
                # silently skipped
                pass
    if at_least_one:
        print("\n==========")

    quotes = [
        "I didn't expect a kind of Spanish Inquisition.",
        "NOBODY expects the Spanish Inquisition!",
        "Our chief weapon is surprise. Surprise and fear ... fear and surprise....\n"
        "Our TWO weapons are fear and surprise.",
        "Amongst our weaponry ... are such elements as fear, surprise....\nI'll come in again.",
        "Amongst our weaponry are such diverse elements as:\n"
        "fear, surprise, ruthless efficiency, an almost fanatical devotion\n"
        "to the Pope, and nice red uniforms -- Oh damn!",
        "Okay, stop. Stop. Stop there -- stop there. Stop. Phew!\n"
        "Ah! ... our chief weapons are surprise ... blah blah blah.",
        "Now, Cardinal -- the rack!\n"
        "[Biggles produces a plastic-coated dish-drying rack.]",
        "Biggles! Fetch ... THE CUSHIONS!",
        "Cardinal! Poke her with the soft cushions!",
        "Cardinal Fang! Fetch ... THE COMFY CHAIR!",
        "Now -- you will stay in the Comfy Chair until lunch time,\n"
        "with only a cup of coffee at eleven.",
    ]
    return random.choice(quotes)
##### end of definitions #####
# ------------------------------------------------------------
# MAIN PROGRAM
# ------------------------------------------------------------
if __name__ == "__main__":
# initialize a few variables
valid_opts = r"""The available options for The Spanish Inquisition
are:
'-x' or '--extension='
Supply an alternate filename extension for output files.
'-o' or '--output_dir='
Supply an alternate folder to contain output files.
Absolute or relative paths are acceptable.
'-s' or '--sre'
Instructs The Spanish Inquisition to use Python's
Unicode-aware regular expression engine (sre).
(Be wary of using sre. As of this writing [7/24/01,
current Python version 2.1], the sre engine is broken.)
Use of pre (standard ASCII engine) is assumed.
'-p' or '--pre'
Instructs The Spanish Inquisition to use Python's
original pre module rather than the current re module.
[Python 2.3 addresses the final sre recursion errors but
at the cost of time -- lots and lots of time ]
'-r' or '--regex'
Prints out a description of Python's regular expression
syntax and quits.
'-?', '-h', or '--help'
Prints complete documentation for the program and quits.
'-v' or '--version'
Prints the current version of the program and quits.
'--food='
The Spanish Inquisition just loves spam!
"""
usage = r"""
THE SPANISH INQUISITION
by Damon Butler
========================================
Batch search and replace utility using the Python regular expression
syntax.
Inspired by David Niergarth's snr.py script and Greg Swann's Torquemada
the Inquisitor for the Macintosh.
----------------------------------------
Basic Description
----------------------------------------
The Spanish Inquisition takes a list of text files and 'inquisition'
files
(containing search and replace regular expressions) and, by iterating
through each search-replace pair in each inquisition file, creates
a new set of text files that it saves to disk in a new directory.
Each text file processed is submitted to the full battery of
search-and-
replace expression pairs contained in all inquisition files.
The order in which the searches and replaces are executed is determined
by alphanumerically sorting the inquisition files by filename, and the
searches and replaces are executed sequentially from top to bottom in
each inquisition file. A text file is processed completely by a search-
and-replace pair before the following search-and-replace pair is used.
----------------------------------------
Inquisition File Description
----------------------------------------
Inquisition files are text files containing pairs of search and replace
regular expressions. These files must be identified by the '.inq' or
'.data' filename extensions in order to be properly recognized by
The Spanish Inquisition.
An inquisition file is line-based and, in the simplest case, might look
something like the following:
s: search text
r: replace text
s: another search
r: another replace
Note how "s:<tab>" is used to declare a line containing a search
expression,
and "r:<tab>" is used to declare a line containing a replace
expression, and
how search lines must always precede replace lines. The other three
declarations allowed are:
e: search expression containing character escapes
f: "local" regular expression flag
F: "global" regular expression flag
Python allows you to access high-ASCII or even Unicode characters
directly
by declaring character escapes. A character escape takes the form of a
backslash followed the encoding position of the character in either
octal
or hexadecimal notation. For example, the copyright character is
encoded
in decimal position 169 in both Windows- and Macintosh-standard 8-bit
ASCII encodings. One could access that character by keying it directly,
or by using either of the following character escape codes:
hexadecimal: \xa9
octal: \251
When declaring an "e:<tab>" character escape search, be aware that you
must escape all backslashes that aren't a part of character escape
codes
as '\\'. Thus, '\s' (whitespace wildcard) would become '\\s', and
'\\' (literal backslash character) would become '\\\\'.
See the following section for more information about regular expression
flags.
Note that you must place at least one tab (though you may use more than
one tab) after the colon of the line declaration to delimit the start
of the
expression, even when the replace text is empty. The first character
that is NOT a tab signals the start of the expression. (To search or
replace
tab characters, always use the '\t' wildcard.)
Lines that do not begin with valid declarations (including blank lines
or
lines containing nothing but whitespace) are ignored, and thus provide
a mechanism by which you may insert comments into your inquisition
files. Any number of "comment" lines may fall before, after, and
inbetween legal line declarations. For example:
Searching for start and end 'spam' tags
s: <(/?)spam>
Replacing each 'spam' tag with an 'eggs' tag
r: <\1eggs>
----------------------------------------
Using Regular Expression Flags
----------------------------------------
Python's regular expression engine recognizes search expression flags
that affect case sensitivity, multiline search capability, and the
like.
The available flags are i, s, m, L, u, and x.
i (IGNORECASE)
Case insensitive matching.
s (DOTALL)
The '.' wildcard matches any character, including newlines.
('.' is not normally allowed to match linebreaks.)
m (MULTILINE)
The '^' and '$' wildcards match the beginning and end of each
line rather than the beginning and end of the entire string
matched by the search expression.
L (LOCALE)
Use current language locale.
(Useful mostly for non-English languages.)
u (UNICODE)
Makes the use of the '\w', '\W', '\b', and '\B' wildcards dependent
on the Python Unicode character properties database. By default,
this flag is not allowed by The Spanish Inquisition (i.e., its use
has been declared heretical) because Python's Unicode-aware
regular expression engine (sre) is broken as of this writing
(7/24/01, current Python version 2.1). At your own risk, you
can instruct The Spanish Inquisition to use the sre engine (and
thus use this flag) by declaring the '-s' or '--sre' options on the
command line. (See below for more information on the
available options.)
x (VERBOSE)
Ignores most white space and anything after '#' characters
in the search expression. This flag is disallowed because
inquisition files are line based, and thus no comments or
ignorable whitespace can be inserted in the middle of an
expression.
You may add these flags to regular expressions using the following
syntax
(in all of the following examples, note the varied format of inserted
comments):
%%% 'i' and 's' flags activated for this search %%%
s: (?is)<spam>.*?</spam>
^ the '(?ismL)' flag declaration must be the very first item in
the search expression
Or you may add them via local or global flag declaration lines:
- local flag example 1 -
'i' and 's' flags activated for next encountered search
f: is
s: <spam>(.*?)</spam>
r: <eggs>\1</eggs>
...
- local flag example 2 -
s: <spam>(.*?)</spam>
f: is
<-- 'i' and 's' flags activated for preceding search because
the associated replace hasn't been encountered yet -->
r: <eggs>\1</eggs>
...
- global flag example -
!!! 'i' and 's' flags activated for all searches after this point
in this inquisition file !!!
F: is
Local flags always supercede global flags. Global flags can be globally
superceded by declaring another global flag.
For instance, if most of your search expressions need to be case
insensitive and you want '.' to match newlines, you can save yourself
some typing by declaring these as global flags.
-- Case insensitivity and newline matching now assumed for all
searches
F: is
==== Delete all <bacon>s ====
s: <bacon>.*?</bacon>
r:
^ don't forget to type the tab followng ':'even when
the replace text is empty
==== Change 'spam and eggs' into 'eggs and spam' ====
c: temporarily disable global flags by declaring no local flags
f:
s: (spam) and (eggs)
r: \2 and \1
... (more searches and replaces)
-- Turn off all global flags
F:
... (more searches and replaces)
----------------------------------------
Case Conversion Flags
----------------------------------------
The Spanish Inquisition recognizes seven different case conversion
flags that are available only in replace expressions. These flags can
be used only on the replace side because they control the output
format of the text found; they are not themselves something that
can be searched for. Using these flags in search expressions may
produce unwanted results, especially since several of them already
have special meanings inside search expressions. The available flags
are:
\C (all Caps)
Converts the text into all caps.
\L (Lowercase)
Converts the text into all lowercase.
\U (Upstyle caps)
Converts each word into initial caps/lowercase.
(Word breaks are defined by any combination of
punctuation and/or whitespace.)
\D (Downstyle caps)
Same as upstyle caps except that words shorter than
four letters in length are left as all lowercase.
(Word breaks are defined by any combination of
punctuation and/or whitespace.)
\S (Sentence caps)
Converts the entire range into all lowercase,
but initial caps the first word of each sentence.
(Sentence breaks are defined by '.', '?', and '!'.)
\X (eXchange case, or swapcase)
Converts all capital letters into lowercase letters,
and all lowercase letters into capital letters
\= (end case conversion)
Terminates the currently active case conversion.
For all case conversions, the pronoun 'I' is handled intelligently.
Here is an example of how to use the flags:
find all 'spam' tagged text
s: <spam>(.*?)</spam>
convert the text into all caps
r: \C\1\=
You don't need to use the '\=' flag if it is the last thing in the
replace
string; The Spanish Inquisition will put one there if it doesn't find
one.
On the other hand, if you need to terminate case conversion within
the replace, then you must explicitly turn it off. You cannot activate
a second case conversion flag without first turning off the previous
case conversion flag. (Yes, the following can be done through a
literal search, but it's just an example!)
s: Spam, spam, bacon, eggs, and spam.
r: \CSpam, spam,\= \Ubacon, eggs,\= and \Cspam.
would result in the text 'SPAM, SPAM, Bacon, Eggs, and SPAM.'
"""
if sys.platform == "mac":
quit_msg = "\n\n<<[command] + [Q] to quit>>"
usage = usage + r"""----------------------------------------
The Spanish Inquisition on the Mac
----------------------------------------
The Spanish Inquisition is a drag-and-drop applet. Select one or more
text files you wish to process along with one or more inquisition files
you wish to use, and drag them all on top of the applet. Recall that
inquisition files are always used in alphanumerical order, regardless
of what folder or folders they reside in. (For information about
inquisition files, which must be text files terminating with either the
'.inq' or '.data' filename extensions, see above.) If you fail to
provide
at least one inquisition file along with the batch of text files, The
Spanish Inquisition will search for inquisition files (and option
files, see
below) in all the folders containing the text files plus the folder
where
the applet itself resides and utilize all that it finds.
By default, the output files created by The Spanish Inquisition will
all
have the '_inq.txt' filename extension appended to them. These output
files will reside in a 'free_from_heresy' folder inside the folder
containing the source text files. If text files are drawn from multiple
folders, a 'free_from_heresy' folder will be created inside each folder
containing at least one text file. You can alter this behavior by
specifying one or more "option" files.
----------------------------------------
Options and Option Files
----------------------------------------
Option files are text files containing option declarations and, when
necessary, argument values for those options. These files must be
identified by the '.opt' filename extension in order to be properly
recognized by The Spanish Inquisition. To use an option file, simply
drag it on top of the applet along with all other text and inquisition
files you are processing. You may use multiple option files at once.
An option file is line-based and, in the simplest case, might look
something like the following:
o: -s -x ".txt"
Note how "o:<tab>" is used to declare a line containing at least one
option. You may declare any number of option lines, and each line
may contain any number of options. Lines that do not begin with the
"o:<tab>" declaration are ignored.
You enter options exactly as though you were running The Spanish
Inquisition from the command line of a DOS or Unix shell session.
""" + valid_opts + r"""
Say you wished for all your output files to have the '.xml' filename
extension, and that you wanted them all to appear in a new folder
on your Mac's desktop. You could create an option file containing:
### Example 1 ###
o: -x ".xml" --output_dir="Macintosh HDesktop Folderutput"
or:
### Example 2 ###
o: -o "Macintosh HDesktop Folderutput"
o: --extension=".xml"
or some other similar combination.
----------------------------------------
Notes on Python's Regular Expression Syntax
----------------------------------------
By and large, Python's regular expression syntax is identical to the
standard Unix grep syntax employed by BBEdit. There are a few
important differences, however.
(1) When processing text, Python always converts all line-ending
characters into Unix-style newlines. That is, use the '\n'
wildcard instead of the '\r' wildcard to identify paragraph marks.
(2) In standard (and BBEdit) grep, the '*', '+', and '?' qualifiers are
all greedy; they match as much text as possible. Sometimes this
behavior isn't desired. If the expression '<.*>' is matched against
'<H1>title</H1>', it will match the entire string, and not just
'<H1>'.
Adding '?' after the qualifier makes it perform the match in
non-greedy
or minimal fashion; as few characters as possible will be matched.
Using '.*?' in the previous expression will match only '<H1>'.
(3) Python recognizes a large number of extensions to the grep syntax
not available in BBEdit. (The use of regular expression flags and
minimal matching are two examples of these extensions.) To find
out about them (highly recommended), enter 'r' + [return] below.
"""
else:
quit_msg = ""
usage = usage + r"""----------------------------------------
Running The Spanish Inquisition from the Command Line
----------------------------------------
In the current working directory, type:
span_inq.py [options] (glob | file) [(glob2 | file2) ...]
Items listed in square brackets are optional. For each parenthetical
group, choose to enter either a glob or a file.
glob is any standard file glob (e.g., '*.xml', '*.inq', or '*.*').
file is any single filename.
Both glob and file items may contain absolute or relative paths.
(e.g., '..\..\translate_set.inq' or 'D:\projects\temp\*.xml')
""" + valid_opts + r"""
Instead of declaring options on the command line (or even in
conjunction
with command line options), you can declare options via "option" files.
Option files are text files containing option declarations and, when
necessary, argument values for those options. These files must be
identified by the '.opt' filename extension in order to be properly
recognized by The Spanish Inquisition. To use an option file, simply
declare it along with any other files you are processing. You may use
multiple option files at once.
An option file is line-based and, in the simplest case, might look
something like the following:
o: -s -x ".txt"
Note how "o:<tab>" is used to declare a line containing at least one
option. You may declare any number of option lines, and each line
may contain any number of options. Lines that do not begin with the
"o:<tab>" declaration are ignored. You enter options exactly as though
you were running The Spanish Inquisition from the command line of a
DOS or Unix shell session, except that filenames and globs are ignored
in an option file.
----------------------------------------
Processing Details
----------------------------------------
The Spanish Inquisition will process all text files you declare against
all inquisition files (and option files) you declare. Recall that
inquisition
files are always used in alphanumerical order, regardless of what
directory or directories they reside in. (For information about
inquisition
files, which must be text files terminating with either the '.inq' or
'.data'
filename extensions, see above.) If you fail to declare at least one
inquisition file along with your batch of text files, The Spanish
Inquisition
will search for inquisition and option files in all the directories
containing
the text files plus the current working directory (if different) and
utilize
all that it finds.
By default, the output files created by The Spanish Inquisition will
reside in
a 'free_from_heresy' directory inside the directory containing the
source text
files. If text files are drawn from multiple directories, a
'free_from_heresy'
directory will be created inside each directory containing at least one
text
file. Use the available options to alter this behavior.
The Spanish Inquisition can utilize data files prepared for the snr.py
script without modification.
"""
re_syntax = r"""----------------------------------------
Python's Regular Expression Syntax
----------------------------------------
Regular expressions can contain both special and ordinary characters.
Most ordinary characters, like "A", "a", or "0", are the simplest
regular
expressions; they simply match themselves. You can concatenate ordinary
characters, so "last" matches the string 'last'.
Some characters, like "|" or "(", are special. Special characters
either stand
for classes of ordinary characters, or affect how the regular
expressions
around them are interpreted. The special characters are:
"." (Dot.)
In the default mode, this matches any character except a newline. If
the DOTALL
flag "s" has been specified, this matches any character including a
newline.
"^" (Caret.)
Matches the start of the string, and in MULTILINE mode ("m") also
matches
immediately after each newline.
"$"
Matches the end of the string, and in MULTILINE mode ("m") also matches
before a newline. "foo" matches both 'foo' and 'foobar', while the
regular
expression "foo$" matches only 'foo'.
"*"
Causes the resulting RE to match 0 or more repetitions of the preceding
RE,
as many repetitions as are possible. "ab*" will match 'a', 'ab', or 'a'
followed
by any number of 'b's.
"+"
Causes the resulting RE to match 1 or more repetitions of the preceding
RE.
"ab+" will match 'a' followed by any non-zero number of 'b's; it will
not match
just 'a'.
"?"
Causes the resulting RE to match 0 or 1 repetitions of the preceding
RE. "ab?"
will match either 'a' or 'ab'.
"*?", "+?", "??"
The "*", "+", and "?" qualifiers are all greedy; they match as much
text as
possible. Sometimes this behavior isn't desired; if the RE "<.*>" is
matched
against '<H1>title</H1>', it will match the entire string, and not just
'<H1>'.
Adding "?" after the qualifier makes it perform the match in non-greedy
or
minimal fashion; as few characters as possible will be matched. Using
".*?"
in the previous expression will match only '<H1>'.
{m,n}
Causes the resulting RE to match from m to n repetitions of the
preceding RE,
attempting to match as many repetitions as possible. For example,
"a{3,5}"
will match from 3 to 5 "a" characters. Omitting n specifies an infinite
upper
bound; you can't omit m.
{m,n}?
Causes the resulting RE to match from m to n repetitions of the
preceding RE,
attempting to match as few repetitions as possible. This is the
non-greedy
version of the previous qualifier. For example, on the 6-character
string
'aaaaaa', "a{3,5}" will match 5 "a" characters, while a{3,5}? will only
match 3 characters.
"\"
Either escapes special characters (permitting you to match characters
like "*",
"?", and so forth), or signals a special sequence; special sequences
are
discussed below.
[]
Used to indicate a set of characters. Characters can be listed
individually,
or a range of characters can be indicated by giving two characters and
separating them by a "-". Special characters are not active inside
sets. For
example, "[akm$]" will match any of the characters 'a', 'k', 'm', or
'$';
"[a-z]" will match any lowercase letter, and "[a-zA-Z0-9]" matches any
letter or digit. Character classes such as "\w" or "\S" (defined below)
are
also acceptable inside a range. If you want to include a "]" or a "-"
inside a
set, precede it with a backslash, or place it as the first character.
The
pattern "[]]" will match ']', for example.
You can match the characters not within a range by complementing the
set. This is indicated by including a "^" as the first character of the
set;
"^" elsewhere will simply match the "^" character. For example, "[^5]"
will match any character except "5".
"|"
A|B, where A and B can be arbitrary REs, creates a regular expression
that will
match either A or B. An arbitrary number of REs can be separated by the
"|" in
this way. This can be used inside groups (see below) as well. REs
separated
by "|" are tried from left to right, and the first one that allows the
complete
pattern to match is considered the accepted branch. This means that if
A
matches, B will never be tested, even if it would produce a longer
overall
match. In other words, the "|" operator is never greedy. To match a
literal
"|", use "\|", or enclose it inside a character class, as in "[|]".
(...)
Matches whatever regular expression is inside the parentheses, and
indicates
the start and end of a group; the contents of a group can be retrieved
after a
match has been performed, and can be matched later in the string with
the
\number special sequence, described below. To match the literals "(" or
")", use
"\(" or "\)", or enclose them inside a character class: "[(] [)]".
(?...)
This is an extension notation (a "?" following a "(" is not meaningful
otherwise). The first character after the "?" determines what the
meaning and
further syntax of the construct is. Extensions usually do not create a
new
group; "(?P<name>...)" is the only exception to this rule. Following
are the
currently supported extensions.
(?iLmsux)
(One or more letters from the set "i", "L", "m", "s", "u", "x".) The
group
matches the empty string; the letters set the corresponding flags
(IGNORECASE, LOCALE, MULTILINE, DOTALL, UNICODE, VERBOSE) for the
entire regular expression.
Note that the (?x) flag changes how the expression is parsed. It
should be
used first in the expression string, or after one or more whitespace
characters.
If there are non-whitespace characters before the flag, the results are
undefined.
(?:...)
A non-grouping version of regular parentheses. Matches whatever regular
expression is inside the parentheses, but the substring matched by the
group
cannot be retrieved after performing a match or referenced later in the
pattern.
(?P<name>...)
Similar to regular parentheses, but the substring matched by the group
is
accessible via the symbolic group name "name". Group names must be
valid
Python identifiers. A symbolic group is also a numbered group, just as
if the
group were not named. So the group named "id" in the example below can
also be referenced as the numbered group 1.
For example, if the pattern is "(?P<id>[a-zA-Z_]\w*)", the group can
be
referenced by its name in pattern text (e.g. "(?P=id)") and replacement
text
(e.g. "\g<id>").
(?P=name)
Matches whatever text was matched by the earlier group named "name".
(?#...)
A comment; the contents of the parentheses are simply ignored.
(?=...)
Matches if "..." matches next, but doesn't consume any of the string.
This is
called a lookahead assertion. For example, "Isaac (?=Asimov)" will
match
'Isaac' only if it's followed by 'Asimov'.
(?!...)
Matches if "..." doesn't match next. This is a negative lookahead
assertion.
For example, "Isaac (?!Asimov)" will match 'Isaac' only if it's NOT
followed
by 'Asimov'.
(?<=...)
Matches if the current position in the string is preceded by a match
for
"..." that ends at the current position. This is called a positive
lookbehind
assertion. "(?<=abc)def" will match 'abcdef', since the lookbehind will
back
up 3 characters and check if the contained pattern matches. The
contained
pattern must only match strings of some fixed length, meaning that
"abc"
or "a|b" are allowed, but "a*" isn't.
NOTE: To use the positive lookbehind assertion, you must specify use of
the sre regular expression engine via the '-s' or '--sre' options.
(?<!...)
Matches if the current position in the string is not preceded by a
match for
"...". This is called a negative lookbehind assertion. Similar to
positive
lookbehind assertions, the contained pattern must only match strings of
some fixed length.
NOTE: To use the negative lookbehind assertion, you must specify use of
the sre regular expression engine via the '-s' or '--sre' options.
The special sequences consist of "\" and a character from the list
below. If
the ordinary character is not on the list, then the resulting RE will
match the
second character. For example, "\$" matches the character "$".
\number
Matches the contents of the group of the same number. Groups are
numbered
starting from 1. For example, "(.+) \1" matches 'the the' or '55 55',
but not
'the end' (note the space after the group). This special sequence can
only be
used to match one of the first 99 groups. If the first digit of number
is 0, or
number is 3 octal digits long, it will not be interpreted as a group
match, but
as the character with octal value number. Inside the "[" and "]" of a
character
class, all numeric escapes are treated as characters.
\A
Matches only at the start of the string.
\b
Matches the empty string, but only at the beginning or end of a word. A
word
is defined as a sequence of alphanumeric characters, so the end of a
word is
indicated by whitespace or a non-alphanumeric character. Inside a
character
range, "\b" represents the backspace character, for compatibility with
Python's string literals.
\B
Matches the empty string, but only when it is not at the beginning or
end of
a word.
\d
Matches any decimal digit; this is equivalent to the set "[0-9]".
\D
Matches any non-digit character; this is equivalent to the set
"[^0-9]".
\s
Matches any whitespace character; this is equivalent to the set "[
\t\n\r\f\v]".
\S
Matches any non-whitespace character; this is equivalent to the set
"[^ \t\n\r\f\v]".
\w
When the LOCALE and UNICODE flags are not specified, matches any
alphanumeric
character; this is equivalent to the set "[a-zA-Z0-9_]". With LOCALE,
it will
match the set "[0-9_]" plus whatever characters are defined as letters
for the
current locale. If UNICODE is set, this will match the characters
"[0-9_]" plus
whatever is classified as alphanumeric in the Unicode character
properties
database.
\W
When the LOCALE and UNICODE flags are not specified, matches any non-
alphanumeric character; this is equivalent to the set "[^a-zA-Z0-9_]".
With LOCALE, it will match any character not in the set "[0-9_]", and
not
defined as a letter for the current locale. If UNICODE is set, this
will
match anything other than "[0-9_]" and characters marked at
alphanumeric
in the Unicode character properties database.
\Z
Matches only at the end of the string.
\\
Matches a literal backslash. """
version = "1.2.7: 08/18/03"
#heresy = ''
## Set the default for which re module is used by editing the next
uncommented line:
## using_sre = 0 -> don't use sre, use pre instead
## using_sre = 1 -> use sre, not pre
using_sre = 0
if using_sre:
import re
else:
import pre as re
short_opts = "x:rspvh?"
long_opts = ["extension=", "output_dir=", "version", "regex", "sre",
"pre", "help", "food="]
# separate command-line options from file list/file globs
try:
options, files = getopt.getopt(sys.argv[1:], short_opts, long_opts)
except getopt.error, err_value: # a bogus option or option argument
was entered
print "ERROR:", err_value, "\n\n", valid_opts
sys.exit(quit_msg)
# check for file list/file globs
# (obtain lists of text files and inquisition files)
if files:
inqfiles, datafiles, optfiles, files = sortfiles(files, [".inq",
".data", ".opt"])
# create absolute paths for each file in each group of files
# and sort inqfiles and optfiles by filename only
files = create_abspaths(files)
optfiles = filenamesort(create_abspaths(optfiles))
inqfiles.extend(datafiles)
inqfiles = filenamesort(create_abspaths(inqfiles))
if not files and not optfiles:
print "ERROR: No files matching the filename(s) you provided.\n\n"
sys.exit(quit_msg)
if not inqfiles and not optfiles:
if not optfiles:
inqfiles, optfiles = get_inqfiles(files)
else:
inqfiles, more_optfiles = get_inqfiles(files)
optfiles.extend(more_optfiles)
optfiles = filenamesort(optfiles)
if not inqfiles:
print "ERROR: No inquisition files found.\n\n"
sys.exit(quit_msg)
else:
optfiles = []
if not options:
print usage
if sys.platform == "mac":
input = raw_input("For a full description of Python's regular
expression syntax,\nenter 'r' <<or [command] + [Q] to quit>>: ")
if input == "r" or input == "R":
print "\n" + re_syntax
sys.exit(quit_msg)
# compile all options
all_options = compile_options(optfiles, short_opts, long_opts)
if all_options == "ERROR":
print "\n", valid_opts
sys.exit(quit_msg)
all_options.extend(options)
# set values for options
ext = ""
output_dir = "free_of_heresy" # dni: added this default (11-06-2002)
(see next dni comment)
food = []
for opt, arg in all_options:
if opt in ("-h", "-?", "--help"):
print usage
sys.exit(quit_msg)
elif opt in ("-v", "--version"):
print "Nobody expects The Spanish Inquisition!\nby Damon Butler\n(v"
+ version + ")"
sys.exit(quit_msg)
elif opt in ("-r", "--regex"):
print re_syntax
sys.exit(quit_msg)
elif opt in ("-x", "--extension"):
ext = arg
elif opt in ("-o", "--output_dir"):
output_dir = arg
elif opt in ("-s", "--sre"):
using_sre = 1
print ' [using sre module]'
elif opt in ("-p", "--pre"):
using_sre = 0
print ' [using pre module]'
elif opt == "--food":
food.append(arg)
else: # should normally be unnecessary, but ...
print "ERROR: option " + opt + " not recognized\n\n" + valid_opts
sys.exit(quit_msg)
if not ext and sys.platform == "mac": ext = "_inq.txt"
# dni: (11-06-2002) Let the output file overwrite input file if
--output_dir="" is supplied on the command line.
# if not output_dir: output_dir = "free_of_heresy"
# just in case valid, non sys.exit() options were provided without any
legal text or inquisition files
# (this is necessary because of the tortured logic surrounding
optfiles and file sorting above)
if not files:
print "ERROR: No files matching the filename(s) you provided
found.\n\n"
sys.exit(quit_msg)
if not inqfiles:
print "ERROR: No inquisition files found.\n\n"
sys.exit(quit_msg)
# process inquisition files to create master list of searches and
replaces
regexes = compile_searches(inqfiles, using_sre)
if not regexes:
print "\n"
sys.exit(quit_msg)
# process each file
at_least_one = 0
for file in files:
print ' [Reading: "%s"]' % file
text = readfile(file)
if text:
at_least_one = 1
text = search_replace(text, regexes)
if text == "ERROR":
print "\n"
sys.exit(quit_msg)
print ' [Writing: "%s"]' % (os.path.join(output_dir, file) + ext,)
writefile(text, file, ext, output_dir)
else:
print "%s not found or is blank. Skipping ..." % file
# done!
if at_least_one:
pass
else:
print "\nERROR: No text was processed.\n"
print quit_msg
Thanks and Regards.
I have a problem with my script; please help me. The script file is below.
I have one *.inq file and I want to run this script on XML files, but the
script reports the error [Line No. 1(*)]. If you want, I can attach the script
files and the .inq files. I can't understand this error. Please advise.
You can reach me at my Yahoo ID (e-mail address removed). I am online now.
Please help.
from implib.filemanip import *
from implib.caseconv import convert_case, CASETAGS as _CASETAGS
import sys, os, getopt, string, random
# Turn off pre deprecation warnings (for >= Python 2.3).
try:
    import warnings
    warnings.filterwarnings(action='ignore', category=DeprecationWarning)
except ImportError:
    # No warnings module on this interpreter, so there is nothing to
    # silence. Only the missing import is tolerated; any other failure
    # should surface instead of being swallowed.
    pass
""""Amongst our weaponry are such diverse elements as
fear, surprise, ruthless efficiency, an almost fanatical devotion
to the Pope, and nice red uniforms -- Oh damn!"
For information on how to use The Spanish Inquisition, which allows
you to search and replace text in multiple files, simply run this
program without supplying any arguments or supply one of the
standard help options on the command line: -h, -?, --help.
"""
##### inquisition (search-and-replace) file search #####
# ------------------------------------------------------------
# function: get_inqfiles
# ------------------------------------------------------------
def get_inqfiles(filelist):
    """Find inquisition and option files next to a list of files.

    Usage: get_inqfiles(filelist) --> (inqfiles, optfiles)

    Every directory named by a path in filelist is searched, plus the
    current working directory. If the entries in filelist carry no
    directory part, only the current working directory is searched.

    Inquisition files are identified by the ".inq" and ".data"
    extensions; option files are identified by the ".opt" extension.
    Both returned lists hold full paths and are ordered by
    filenamesort().
    """
    # Collect the directory of every file that was given with a path
    # (absolute paths would be provided on a mac, for instance).
    candidates = []
    for path in filelist:
        folder = os.path.split(path)[0]
        if folder:
            if not os.path.isabs(folder):
                folder = os.path.abspath(folder)
            candidates.append(folder)

    # abspath() drops a trailing separator just as the paths above have
    # none, but it leaves a root directory intact. Slicing the last
    # character off by hand turned a cwd of "/" into "", which
    # os.listdir() cannot open.
    candidates.append(os.path.abspath(os.getcwd()))

    # weed out repeated directories (sorted, so duplicates are adjacent)
    candidates.sort()
    directories = []
    for folder in candidates:
        if not directories or directories[-1] != folder:
            directories.append(folder)

    # search each directory for inquisition and option files
    inqfiles = []
    optfiles = []
    for folder in directories:
        for name in os.listdir(folder):
            ext = os.path.splitext(name)[1]
            if ext in (".inq", ".data"):
                inqfiles.append(os.path.join(folder, name))
            elif ext == ".opt":
                optfiles.append(os.path.join(folder, name))

    # sort the inqfiles and optfiles by filename alone
    return (filenamesort(inqfiles), filenamesort(optfiles))
##### inquisition file search-and-replace compilation #####
# ------------------------------------------------------------
# function: compile_searches
# ------------------------------------------------------------
def compile_searches(inqfiles, using_sre=0):
"""Takes the supplied list of inquisition (search and replace) files
and returns a list of tuples, each tuple containing an individual
search-replace pair.
Usage: compile_searches(inqfiles[, using_sre]) --> [(SEARCH, replace),
.... ]
SEARCH is a compiled regular expression object (from re.compile).
If a non-zero value is supplied for the using_sre argument (optional),
the 'u' ('UNICODE') regular expression flag is allowed.
(Use of the pre module is assumed.)
For information on how this function interprets data from
inquisition files, use the standard help options for the
span_inq.py script.
"""
#print "Compiling searches and replaces from inquisition files..."
# initialize global variables
regexes = []
searchpair_count = 0
LINE = re.compile(r"^(?P<id>[Ffser]):\t+(?P<pattern>.*$)")
CASE_CONVERT_FLAGS = re.compile(r"(?P<case>\\[CDLSUX=])")
case_start = '<convert_case case="%s">'
case_end = '</convert_case>'
err_verbose_flag = "ERROR: The 'x' and 'X' verbose pattern flags are
illegal. " \
"See line %s in the %s inquisition file."
err_unicode_flag = "ERROR: The 'u' and 'U' Unicode search flags are
not currently supported. " \
"See line %s in the %s inquisition file."
err_illegal_flag = "ERROR: Line %s in the %s inquisition file contains
an illegal flag."
for inqfile in inqfiles:
lines = readfile(inqfile, 1)
if not lines:
print "ERROR: %s inquisition file not found or is blank." % inqfile
return []
# initialize inquisition file-sensitive variables
linecount = 0
SEARCH = None
need_replace = 0
local_flags = ""
global_flags = ""
for line in lines:
linecount = linecount + 1
linematch = LINE.search(line)
if linematch:
lineid = linematch.group("id")
# treat the pattern as a raw string, so backslashes are handled
properly
pattern = r"%s" % linematch.group("pattern")
# identify what kind of line we have here
if lineid == "s": # search pattern
if need_replace:
print "ERROR: The search pattern above line %s in the %s
inquisition file " \
"is missing its corresponding replace pattern." % (linecount,
inqfile)
return []
search = pattern
if not search:
print "ERROR: The search declaration at line %s in the %s
inquisition file " \
"is blank." % (linecount, inqfile)
return []
need_replace = 1
# compile the search pattern right now
# (it may need to be redone if flags are present, but if there's a
syntax error
# an accurate line number will be reported in the error message)
try:
SEARCH = re.compile(search)
except re.error, err_value:
print "Syntax error in the search pattern at line %s in the %s "
\
"inquisition file.\n" % (linecount, inqfile)
print "ERROR:", err_value[0]
print "\ts:\t" + search
return []
elif lineid == "e": # search pattern with character escapes
if need_replace:
print "ERROR: The character escape search pattern above line %s
in the %s " \
"inquisition file is missing its corresponding replace pattern."
% (linecount, inqfile)
return []
# DO NOT treat the pattern as a raw string, so that character
escapes function
pattern = linematch.group("pattern")
search = pattern
if not search:
print "ERROR: The search declaration at line %s in the %s
inquisition file " \
"is blank." % (linecount, inqfile)
return []
need_replace = 1
# compile the search pattern right now
# (it may need to be redone if flags are present, but if there's a
syntax error
# an accurate line number will be reported in the error message)
try:
SEARCH = re.compile(search)
except re.error, err_value:
print "Syntax error in the character escape search pattern at
line %s in the %s " \
"inquisition file.\n" % (linecount, inqfile)
print "ERROR:", err_value[0]
print "\te:\t" + search
return []
elif lineid == "r": # replace pattern
if not need_replace:
print "ERROR: The replace pattern at line %s in the %s
inquisition file " \
"has no corresponding search pattern." % (linecount, inqfile)
return []
replace = pattern
need_replace = 0
searchpair_count = searchpair_count + 1
# check for case conversion flags
# first mark all escaped backslashes in the pattern, just in case
someone
# wishes to insert literal text matching one of the case
conversion flags
replace = string.join(string.split(replace, r"\\"),
"&escaped_backslash;")
case_convert_count = 0
casematch = CASE_CONVERT_FLAGS.search(replace)
while casematch:
casetype = casematch.group("case")
if casetype == r"\C":
replace = string.join(string.split(replace, r"\C", 1),
case_start % "allcaps")
case_convert_count = case_convert_count + 1
elif casetype == r"\D":
replace = string.join(string.split(replace, r"\D", 1),
case_start % "downstylecaps")
case_convert_count = case_convert_count + 1
elif casetype == r"\L":
replace = string.join(string.split(replace, r"\L", 1),
case_start % "lowercase")
case_convert_count = case_convert_count + 1
elif casetype == r"\S":
replace = string.join(string.split(replace, r"\S", 1),
case_start % "sentencecaps")
case_convert_count = case_convert_count + 1
elif casetype == r"\U":
replace = string.join(string.split(replace, r"\U", 1),
case_start % "upstylecaps")
case_convert_count = case_convert_count + 1
elif casetype == r"\X":
replace = string.join(string.split(replace, r"\X", 1),
case_start % "swapcase")
case_convert_count = case_convert_count + 1
elif casetype == r"\=":
replace = string.join(string.split(replace, r"\=", 1), case_end)
case_convert_count = case_convert_count - 1
else: # I can't imagine how we'd need this, but ...
print "ERROR: Malformed case conversion flag in the replace
pattern " \
"at line %s in the %s inquisition file." % (linecount, inqfile)
return []
casematch = CASE_CONVERT_FLAGS.search(replace)
if case_convert_count == 1:
replace = replace + case_end
elif case_convert_count > 1:
print "ERROR: The replace pattern at line %s in the %s
inquisition file " \
"requires one or more '\=' end case conversion flags." %
(linecount, inqfile)
return []
elif case_convert_count < 0:
print "ERROR: The replace pattern at line %s in the %s
inquisition file " \
"has at least one extra '\=' end case conversion flag." %
(linecount, inqfile)
return []
# re-insert escaped backslashes
replace = string.join(string.split(replace,
"&escaped_backslash;"), r"\\")
elif lineid == "f": # set a local flag for this pattern
if pattern:
local_flags = handle_re_flags(pattern, using_sre)
if local_flags == "err_verbose_flag":
print err_verbose_flag % (linecount, inqfile)
return []
elif local_flags == "err_unicode_flag":
print err_unicode_flag % (linecount, inqfile)
return []
elif local_flags == "err_illegal_flag":
print err_illegal_flag % (linecount, inqfile)
return []
else: # it's possible someone will leave a local flag line blank,
which is OK and must be registered
local_flags = "NO_FLAGS"
elif lineid == "F": # set a global flag starting at this point in
the file
if pattern:
global_flags = handle_re_flags(pattern, using_sre)
if global_flags == "err_verbose_flag":
print err_verbose_flag % (linecount, inqfile)
return []
elif global_flags == "err_unicode_flag":
print err_unicode_flag % (linecount, inqfile)
return []
elif global_flags == "err_illegal_flag":
print err_illegal_flag % (linecount, inqfile)
return []
else: # it's possible someone will leave the global flag line
blank, which is OK and must be registered
global_flags = ""
else: # I can't imagine how we'd need this, but ...
print "ERROR: Line %s is malformed." % linecount
return []
# add the search and replace regular expression pair to the list
if SEARCH and (not need_replace):
if not local_flags: local_flags = global_flags
if local_flags and local_flags != "NO_FLAGS": # recompile the
search pattern
SEARCH = re.compile("(?" + local_flags + ")" + search)
regexes.append((SEARCH, replace))
# reset these items so it's impossible to append the same pair
again,
# regardless of how many lines separate each search and replace
pair
SEARCH = None
search = ""
replace = ""
local_flags = ""
# ensure that the inquisition file has evenly matched search-replace
pairs
if need_replace:
print "ERROR: The search pattern near line %s in the %s inquisition
file " \
"is missing its corresponding replace pattern." % (linecount,
inqfile)
return []
# whew!
inqfile_count = len(inqfiles)
if inqfile_count > 1:
print " (%s search-replace pairs found in %s inquisition files)" %
(searchpair_count, inqfile_count)
else:
print " (%s search-replace pairs found in one inquisition file)" %
searchpair_count
return regexes
# ------------------------------------------------------------
# function: handle_re_flags
# ------------------------------------------------------------
def handle_re_flags(flags, using_sre=0):
    """Verify the flags entered on a flag declaration line of an
    inquisition file.

    Usage: handle_re_flags(flags[, using_sre]) --> valid_flags

    valid_flags is returned as a string in a form compatible with the
    '(?iLmsux)' regular expression syntax: every letter lowercased
    except the locale flag, which is spelled 'L'.

    When flags is not made up solely of allowed letters, one of these
    strings is returned instead:
      'err_verbose_flag'  if an 'x' or 'X' is present;
      'err_unicode_flag'  otherwise, if a 'u' or 'U' is present (these
                          are only allowed when using_sre is non-zero);
      'err_illegal_flag'  otherwise, which includes an empty string.

    The allowed letters are 'i', 'l', 'm' and 's' in either case, plus
    'u' when using_sre is non-zero (use of the pre module is assumed).
    """
    if using_sre:
        valid_flags = re.compile(r"^[ilmsuILMSU]+$")
    else:
        valid_flags = re.compile(r"^[ilmsILMS]+$")

    if valid_flags.search(flags):
        # Enforce the 'iLms' form because the flags are compiled directly
        # into each search pattern using '(?iLms)'. String methods replace
        # the deprecated string.lower/join/split helpers.
        return flags.lower().replace("l", "L")

    if ("x" in flags) or ("X" in flags):
        return "err_verbose_flag"
    if ("u" in flags) or ("U" in flags):
        return "err_unicode_flag"
    return "err_illegal_flag"
##### search-and-replace functions #####
# ------------------------------------------------------------
# function: search_replace
# ------------------------------------------------------------
def search_replace(text, regexes):
    """Performs each search and replace in a list of supplied
    search-replace pairs on the supplied text.

    Usage: search_replace(text, regexes) --> newtext

    text is any string.

    regexes is a list of tuples where each tuple is a search-replace pair
    of regular expressions: [(SEARCH, replace), ... ]
    Note that each SEARCH is expected to be a compiled regular
    expression object (re.compile); replace is always a string.

    The pairs are applied in list order, each one to the result of the
    previous one.  A progress line (' [Processing', one '.' per pair, then
    ']') is written to stderr.

    If an IndexError, RuntimeError, or re.error is raised during
    processing, a description of the error (including the offending search
    and replace expressions) is printed to the console and the string
    'ERROR' is returned; remaining pairs are not applied.

    This function tests for the presence of case conversion tags in each
    replace, and calls upon the convert_case() function (through
    _replace()) as necessary.
    """
    err_index = ("\nERROR: Backreference found in a replace without corresponding "
                 "parenthetical group in the search!\ns:\t%s\nr:\t%s")
    err_runtime = ("\nERROR: The following regular expressions caused this RuntimeError:\n "
                   "'%s'\ns:\t%s\nr:\t%s")
    err_non_contrib_group = (
        "\nERROR: %s. "
        "\nTry, for example, changing '(b)?' to '(b?)' or to '((?:b?))'."
        "\nAlso, don't do things like 'a|(b)', because '(b)' doesn't contribute if 'a' matches."
        "\ns:\t%s\nr:\t%s")

    sys.stderr.write(' [Processing')
    for SEARCH, replace in regexes:
        sys.stderr.write('.')
        # sys.exc_info() is used instead of binding the exception in the
        # except clause so the same source parses on old and new Pythons.
        try:
            text = SEARCH.sub(replace, text)
        except IndexError:
            print(err_index % (SEARCH.pattern, replace))
            return "ERROR"
        except RuntimeError:
            err_value = sys.exc_info()[1]
            print(err_runtime % (err_value, SEARCH.pattern, replace))
            return "ERROR"
        except re.error:
            err_value = sys.exc_info()[1]
            print(err_non_contrib_group % (err_value, SEARCH.pattern, replace))
            return "ERROR"
        # The substitution above copies any case conversion tags from the
        # replace expression into the text; resolve them now.
        if _CASETAGS.search(replace):
            text = _CASETAGS.sub(_replace, text)
    sys.stderr.write(']\n')
    return text
# ------------------------------------------------------------
# function: _replace
# ------------------------------------------------------------
def _replace(match):
    """Private replace function called by the search_replace() function
    when processing <convert_case> tags.

    Usage: _replace(match) --> converted_text

    match is a match object whose pattern defines the named groups "text"
    and "case"; both are handed to convert_case() and its result is
    returned as the replacement string.
    """
    return convert_case(match.group("text"), match.group("case"))
##### random silliness #####
# ------------------------------------------------------------
# function: span_inq_quote
# ------------------------------------------------------------
def span_inq_quote(*spam):
    """Returns a randomly selected quote from the Monty Python
    'Spanish Inquisition' sketch.

    Usage: span_inq_quote([spam, spam, spam, ...]) --> "funny quote"

    Actually, this function will eat nearly anything, not just spam.
    Eggs, bacon, sausage, and baked beans are particularly delightful.

    Each argument may be a single item or a list/tuple of items (lists and
    tuples are flattened one level).  String items are split into words on
    punctuation and whitespace, the words 'and'/'or' are dropped, and the
    word 'beans' is glued back onto a preceding bean type ('baked beans').
    A remark about every non-empty item is printed to the console, followed
    by a separator line if at least one string item was offered.
    """
    # Flatten one level of lists/tuples and drop empty offerings.
    offered = []
    for food in spam:
        if not food:
            continue
        if isinstance(food, (list, tuple)):
            offered.extend(food)
        else:
            offered.append(food)

    WORDBRK = re.compile(r'''[/|\\<>\(\)\[\]{}"'@#$%&*+\-=^_`~,;:\.\?!\s]+''')
    types_of_beans = [
        "baked", "BAKED", "Baked", "green", "GREEN", "Green",
        "pinto", "PINTO", "Pinto", "garbonzo", "GARBONZO", "Garbonzo",
        "fava", "FAVA", "Fava", "navy", "NAVY", "Navy",
        "black", "BLACK", "Black", "red", "RED", "Red",
        "white", "WHITE", "White", "yellow", "YELLOW", "Yellow",
        "lima", "LIMA", "Lima", "refried", "REFRIED", "Refried",
        "string", "STRING", "String", "wax", "WAX", "Wax",
    ]

    # Break string offerings into individual foods; pass anything else
    # through untouched so it can be rejected below.
    all_food = []
    for item in offered:
        if not isinstance(item, str):
            all_food.append(item)
            continue
        for food in WORDBRK.split(item):
            if food in ("and", "or", "AND", "OR", "And", "Or"):
                continue
            if food in ("beans", "BEANS", "Beans") and all_food \
               and all_food[-1] in types_of_beans:
                # "baked" followed by "beans" is one dish, not two
                all_food[-1] = all_food[-1] + " " + food
            else:
                all_food.append(food)

    at_least_one = 0
    for food in all_food:
        if not food:
            continue
        if isinstance(food, str):
            at_least_one = 1
            if food in ("spam", "SPAM", "Spam"):
                print("Glorious spam! Wonderful spam!")
            elif food in ("eggs", "EGGS", "Eggs",
                          "bacon", "BACON", "Bacon",
                          "sausage", "SAUSAGE", "Sausage",
                          "baked beans", "BAKED BEANS", "Baked Beans"):
                print("I'm having spam, spam, %s, and spam." % food)
            else:
                print("But I don't like %s!" % food)
        else:
            try:
                print("You can't eat a %s!" % repr(food))
            except Exception:
                # Best effort only: an object whose repr() fails is simply
                # not commented on.
                pass
    if at_least_one:
        print("\n==========")

    quotes = [
        "I didn't expect a kind of Spanish Inquisition.",
        "NOBODY expects the Spanish Inquisition!",
        "Our chief weapon is surprise. Surprise and fear ... fear and surprise....\n"
        "Our TWO weapons are fear and surprise.",
        "Amongst our weaponry ... are such elements as fear, surprise....\nI'll come in again.",
        "Amongst our weaponry are such diverse elements as:\n"
        "fear, surprise, ruthless efficiency, an almost fanatical devotion\n"
        "to the Pope, and nice red uniforms -- Oh damn!",
        "Okay, stop. Stop. Stop there -- stop there. Stop. Phew!\n"
        "Ah! ... our chief weapons are surprise ... blah blah blah.",
        "Now, Cardinal -- the rack!\n"
        "[Biggles produces a plastic-coated dish-drying rack.]",
        "Biggles! Fetch ... THE CUSHIONS!",
        "Cardinal! Poke her with the soft cushions!",
        "Cardinal Fang! Fetch ... THE COMFY CHAIR!",
        "Now -- you will stay in the Comfy Chair until lunch time,\n"
        "with only a cup of coffee at eleven.",
    ]
    return random.choice(quotes)
##### end of definitions #####
# ------------------------------------------------------------
# MAIN PROGRAM
# ------------------------------------------------------------
if __name__ == "__main__":
# initialize a few variables
valid_opts = r"""The available options for The Spanish Inquisition
are:
'-x' or '--extension='
Supply an alternate filename extension for output files.
'-o' or '--output_dir='
Supply an alternate folder to contain output files.
Absolute or relative paths are acceptable.
'-s' or '--sre'
Instructs The Spanish Inquisition to use Python's
Unicode-aware regular expression engine (sre).
(Be wary of using sre. As of this writing [7/24/01,
current Python version 2.1], the sre engine is broken.)
Use of pre (standard ASCII engine) is assumed.
'-p' or '--pre'
Instructs The Spanish Inquisition to use Python's
original pre module rather than the current re module.
[Python 2.3 addresses the final sre recursion errors but
at the cost of time -- lots and lots of time ]
'-r' or '--regex'
Prints out a description of Python's regular expression
syntax and quits.
'-?', '-h', or '--help'
Prints complete documentation for the program and quits.
'-v' or '--version'
Prints the current version of the program and quits.
'--food='
The Spanish Inquisition just loves spam!
"""
usage = r"""
THE SPANISH INQUISITION
by Damon Butler
========================================
Batch search and replace utility using the Python regular expression
syntax.
Inspired by David Niergarth's snr.py script and Greg Swann's Torquemada
the Inquisitor for the Macintosh.
----------------------------------------
Basic Description
----------------------------------------
The Spanish Inquisition takes a list of text files and 'inquisition'
files
(containing search and replace regular expressions) and, by iterating
through each search-replace pair in each inquisition file, creates
a new set of text files that it saves to disk in a new directory.
Each text file processed is submitted to the full battery of
search-and-
replace expression pairs contained in all inquisition files.
The order in which the searches and replaces are executed is determined
by alphanumerically sorting the inquisition files by filename, and the
searches and replaces are executed sequentially from top to bottom in
each inquisition file. A text file is processed completely by a search-
and-replace pair before the following search-and-replace pair is used.
----------------------------------------
Inquisition File Description
----------------------------------------
Inquisition files are text files containing pairs of search and replace
regular expressions. These files must be identified by the '.inq' or
'.data' filename extensions in order to be properly recognized by
The Spanish Inquisition.
An inquisition file is line-based and, in the simplest case, might look
something like the following:
s: search text
r: replace text
s: another search
r: another replace
Note how "s:<tab>" is used to declare a line containing a search
expression,
and "r:<tab>" is used to declare a line containing a replace
expression, and
how search lines must always precede replace lines. The other three
declarations allowed are:
e: search expression containing character escapes
f: "local" regular expression flag
F: "global" regular expression flag
Python allows you to access high-ASCII or even Unicode characters
directly
by declaring character escapes. A character escape takes the form of a
backslash followed the encoding position of the character in either
octal
or hexadecimal notation. For example, the copyright character is
encoded
in decimal position 169 in both Windows- and Macintosh-standard 8-bit
ASCII encodings. One could access that character by keying it directly,
or by using either of the following character escape codes:
hexadecimal: \xa9
octal: \251
When declaring an "e:<tab>" character escape search, be aware that you
must escape all backslashes that aren't a part of character escape
codes
as '\\'. Thus, '\s' (whitespace wildcard) would become '\\s', and
'\\' (literal backslash character) would become '\\\\'.
See the following section for more information about regular expression
flags.
Note that you must place at least one tab (though you may use more than
one tab) after the colon of the line declaration to delimit the start
of the
expression, even when the replace text is empty. The first character
that is NOT a tab signals the start of the expression. (To search or
replace
tab characters, always use the '\t' wildcard.)
Lines that do not begin with valid declarations (including blank lines
or
lines containing nothing but whitespace) are ignored, and thus provide
a mechanism by which you may insert comments into your inquisition
files. Any number of "comment" lines may fall before, after, and
inbetween legal line declarations. For example:
Searching for start and end 'spam' tags
s: <(/?)spam>
Replacing each 'spam' tag with an 'eggs' tag
r: <\1eggs>
----------------------------------------
Using Regular Expression Flags
----------------------------------------
Python's regular expression engine recognizes search expression flags
that affect case sensitivity, multiline search capability, and the
like.
The available flags are i, s, m, L, u, and x.
i (IGNORECASE)
Case insensitive matching.
s (DOTALL)
The '.' wildcard matches any character, including newlines.
('.' is not normally allowed to match linebreaks.)
m (MULTILINE)
The '^' and '$' wildcards match the beginning and end of each
line rather than the beginning and end of the entire string
matched by the search expression.
L (LOCALE)
Use current language locale.
(Useful mostly for non-English languages.)
u (UNICODE)
Makes the use of the '\w', '\W', '\b', and '\B' wildcards dependent
on the Python Unicode character properties database. By default,
this flag is not allowed by The Spanish Inquisition (i.e., its use
has been declared heretical) because Python's Unicode-aware
regular expression engine (sre) is broken as of this writing
(7/24/01, current Python version 2.1). At your own risk, you
can instruct The Spanish Inquisition to use the sre engine (and
thus use this flag) by declaring the '-s' or '--sre' options on the
command line. (See below for more information on the
available options.)
x (VERBOSE)
Ignores most white space and anything after '#' characters
in the search expression. This flag is disallowed because
inquisition files are line based, and thus no comments or
ignorable whitespace can be inserted in the middle of an
expression.
You may add these flags to regular expressions using the following
syntax
(in all of the following examples, note the varied format of inserted
comments):
%%% 'i' and 's' flags activated for this search %%%
s: (?is)<spam>.*?</spam>
^ the '(?ismL)' flag declaration must be the very first item in
the search expression
Or you may add them via local or global flag declaration lines:
- local flag example 1 -
'i' and 's' flags activated for next encountered search
f: is
s: <spam>(.*?)</spam>
r: <eggs>\1</eggs>
...
- local flag example 2 -
s: <spam>(.*?)</spam>
f: is
<-- 'i' and 's' flags activated for preceding search because
the associated replace hasn't been encountered yet -->
r: <eggs>\1</eggs>
...
- global flag example -
!!! 'i' and 's' flags activated for all searches after this point
in this inquisition file !!!
F: is
Local flags always supercede global flags. Global flags can be globally
superceded by declaring another global flag.
For instance, if most of your search expressions need to be case
insensitive and you want '.' to match newlines, you can save yourself
some typing by declaring these as global flags.
-- Case insensitivity and newline matching now assumed for all
searches
F: is
==== Delete all <bacon>s ====
s: <bacon>.*?</bacon>
r:
^ don't forget to type the tab followng ':'even when
the replace text is empty
==== Change 'spam and eggs' into 'eggs and spam' ====
c: temporarily disable global flags by declaring no local flags
f:
s: (spam) and (eggs)
r: \2 and \1
... (more searches and replaces)
-- Turn off all global flags
F:
... (more searches and replaces)
----------------------------------------
Case Conversion Flags
----------------------------------------
The Spanish Inquisition recognizes seven different case conversion
flags that are available only in replace expressions. These flags can
be used only on the replace side because they control the output
format of the text found; they are not themselves something that
can be searched for. Using these flags in search expressions may
produce unwanted results, especially since several of them already
have special meanings inside search expressions. The available flags
are:
\C (all Caps)
Converts the text into all caps.
\L (Lowercase)
Converts the text into all lowercase.
\U (Upstyle caps)
Converts each word into initial caps/lowercase.
(Word breaks are defined by any combination of
punctuation and/or whitespace.)
\D (Downstyle caps)
Same as upstyle caps except that words shorter than
four letters in length are left as all lowercase.
(Word breaks are defined by any combination of
punctuation and/or whitespace.)
\S (Sentence caps)
Converts the entire range into all lowercase,
but initial caps the first word of each sentence.
(Sentence breaks are defined by '.', '?', and '!'.)
\X (eXchange case, or swapcase)
Converts all capital letters into lowercase letters,
and all lowercase letters into capital letters
\= (end case conversion)
Terminates the currently active case conversion.
For all case conversions, the pronoun 'I' is handled intelligently.
Here is an example of how to use the flags:
find all 'spam' tagged text
s: <spam>(.*?)</spam>
convert the text into all caps
r: \C\1\=
You don't need to use the '\=' flag if it is the last thing in the
replace
string; The Spanish Inquisition will put one there if it doesn't find
one.
On the other hand, if you need to terminate case conversion within
the replace, then you must explicitly turn it off. You cannot activate
a second case conversion flag without first turning off the previous
case conversion flag. (Yes, the following can be done through a
literal search, but it's just an example!)
s: Spam, spam, bacon, eggs, and spam.
r: \CSpam, spam,\= \Ubacon, eggs,\= and \Cspam.
would result in the text 'SPAM, SPAM, Bacon, Eggs, and SPAM.'
"""
if sys.platform == "mac":
quit_msg = "\n\n<<[command] + [Q] to quit>>"
usage = usage + r"""----------------------------------------
The Spanish Inquisition on the Mac
----------------------------------------
The Spanish Inquisition is a drag-and-drop applet. Select one or more
text files you wish to process along with one or more inquisition files
you wish to use, and drag them all on top of the applet. Recall that
inquisition files are always used in alphanumerical order, regardless
of what folder or folders they reside in. (For information about
inquisition files, which must be text files terminating with either the
'.inq' or '.data' filename extensions, see above.) If you fail to
provide
at least one inquisition file along with the batch of text files, The
Spanish Inquisition will search for inquisition files (and option
files, see
below) in all the folders containing the text files plus the folder
where
the applet itself resides and utilize all that it finds.
By default, the output files created by The Spanish Inquisition will
all
have the '_inq.txt' filename extension appended to them. These output
files will reside in a 'free_from_heresy' folder inside the folder
containing the source text files. If text files are drawn from multiple
folders, a 'free_from_heresy' folder will be created inside each folder
containing at least one text file. You can alter this behavior by
specifying one or more "option" files.
----------------------------------------
Options and Option Files
----------------------------------------
Option files are text files containing option declarations and, when
necessary, argument values for those options. These files must be
identified by the '.opt' filename extension in order to be properly
recognized by The Spanish Inquisition. To use an option file, simply
drag it on top of the applet along with all other text and inquisition
files you are processing. You may use multiple option files at once.
An option file is line-based and, in the simplest case, might look
something like the following:
o: -s -x ".txt"
Note how "o:<tab>" is used to declare a line containing at least one
option. You may declare any number of option lines, and each line
may contain any number of options. Lines that do not begin with the
"o:<tab>" declaration are ignored.
You enter options exactly as though you were running The Spanish
Inquisition from the command line of a DOS or Unix shell session.
""" + valid_opts + r"""
Say you wished for all your output files to have the '.xml' filename
extension, and that you wanted them all to appear in a new folder
on your Mac's desktop. You could create an option file containing:
### Example 1 ###
o: -x ".xml" --output_dir="Macintosh HDesktop Folderutput"
or:
### Example 2 ###
o: -o "Macintosh HDesktop Folderutput"
o: --extension=".xml"
or some other similar combination.
----------------------------------------
Notes on Python's Regular Expression Syntax
----------------------------------------
By and large, Python's regular expression syntax is identical to the
standard Unix grep syntax employed by BBEdit. There are a few
important differences, however.
(1) When processing text, Python always converts all line-ending
characters into Unix-style newlines. That is, use the '\n'
wildcard instead of the '\r' wildcard to identify paragraph marks.
(2) In standard (and BBEdit) grep, the '*', '+', and '?' qualifiers are
all greedy; they match as much text as possible. Sometimes this
behavior isn't desired. If the expression '<.*>' is matched against
'<H1>title</H1>', it will match the entire string, and not just
'<H1>'.
Adding '?' after the qualifier makes it perform the match in
non-greedy
or minimal fashion; as few characters as possible will be matched.
Using '.*?' in the previous expression will match only '<H1>'.
(3) Python recognizes a large number of extensions to the grep syntax
not available in BBEdit. (The use of regular expression flags and
minimal matching are two examples of these extensions.) To find
out about them (highly recommended), enter 'r' + [return] below.
"""
else:
quit_msg = ""
usage = usage + r"""----------------------------------------
Running The Spanish Inquisition from the Command Line
----------------------------------------
In the current working directory, type:
span_inq.py [options] (glob | file) [(glob2 | file2) ...]
Items listed in square brackets are optional. For each parenthetical
group, choose to enter either a glob or a file.
glob is any standard file glob (e.g., '*.xml', '*.inq', or '*.*').
file is any single filename.
Both glob and file items may contain absolute or relative paths.
(e.g., '..\..\translate_set.inq' or 'D:\projects\temp\*.xml')
""" + valid_opts + r"""
Instead of declaring options on the command line (or even in
conjunction
with command line options), you can declare options via "option" files.
Option files are text files containing option declarations and, when
necessary, argument values for those options. These files must be
identified by the '.opt' filename extension in order to be properly
recognized by The Spanish Inquisition. To use an option file, simply
declare it along with any other files you are processing. You may use
multiple option files at once.
An option file is line-based and, in the simplest case, might look
something like the following:
o: -s -x ".txt"
Note how "o:<tab>" is used to declare a line containing at least one
option. You may declare any number of option lines, and each line
may contain any number of options. Lines that do not begin with the
"o:<tab>" declaration are ignored. You enter options exactly as though
you were running The Spanish Inquisition from the command line of a
DOS or Unix shell session, except that filenames and globs are ignored
in an option file.
----------------------------------------
Processing Details
----------------------------------------
The Spanish Inquisition will process all text files you declare against
all inquisition files (and option files) you declare. Recall that
inquisition
files are always used in alphanumerical order, regardless of what
directory or directories they reside in. (For information about
inquisition
files, which must be text files terminating with either the '.inq' or
'.data'
filename extensions, see above.) If you fail to declare at least one
inquisition file along with your batch of text files, The Spanish
Inquisition
will search for inquisition and option files in all the directories
containing
the text files plus the current working directory (if different) and
utilize
all that it finds.
By default, the output files created by The Spanish Inquisition will
reside in
a 'free_from_heresy' directory inside the directory containing the
source text
files. If text files are drawn from multiple directories, a
'free_from_heresy'
directory will be created inside each directory containing at least one
text
file. Use the available options to alter this behavior.
The Spanish Inquisition can utilize data files prepared for the snr.py
script without modification.
"""
re_syntax = r"""----------------------------------------
Python's Regular Expression Syntax
----------------------------------------
Regular expressions can contain both special and ordinary characters.
Most ordinary characters, like "A", "a", or "0", are the simplest
regular
expressions; they simply match themselves. You can concatenate ordinary
characters, so "last" matches the string 'last'.
Some characters, like "|" or "(", are special. Special characters
either stand
for classes of ordinary characters, or affect how the regular
expressions
around them are interpreted. The special characters are:
"." (Dot.)
In the default mode, this matches any character except a newline. If
the DOTALL
flag "s" has been specified, this matches any character including a
newline.
"^" (Caret.)
Matches the start of the string, and in MULTILINE mode ("m") also
matches
immediately after each newline.
"$"
Matches the end of the string, and in MULTILINE mode ("m") also matches
before a newline. "foo" matches both 'foo' and 'foobar', while the
regular
expression "foo$" matches only 'foo'.
"*"
Causes the resulting RE to match 0 or more repetitions of the preceding
RE,
as many repetitions as are possible. "ab*" will match 'a', 'ab', or 'a'
followed
by any number of 'b's.
"+"
Causes the resulting RE to match 1 or more repetitions of the preceding
RE.
"ab+" will match 'a' followed by any non-zero number of 'b's; it will
not match
just 'a'.
"?"
Causes the resulting RE to match 0 or 1 repetitions of the preceding
RE. "ab?"
will match either 'a' or 'ab'.
"*?", "+?", "??"
The "*", "+", and "?" qualifiers are all greedy; they match as much
text as
possible. Sometimes this behavior isn't desired; if the RE "<.*>" is
matched
against '<H1>title</H1>', it will match the entire string, and not just
'<H1>'.
Adding "?" after the qualifier makes it perform the match in non-greedy
or
minimal fashion; as few characters as possible will be matched. Using
".*?"
in the previous expression will match only '<H1>'.
{m,n}
Causes the resulting RE to match from m to n repetitions of the
preceding RE,
attempting to match as many repetitions as possible. For example,
"a{3,5}"
will match from 3 to 5 "a" characters. Omitting n specifies an infinite
upper
bound; you can't omit m.
{m,n}?
Causes the resulting RE to match from m to n repetitions of the
preceding RE,
attempting to match as few repetitions as possible. This is the
non-greedy
version of the previous qualifier. For example, on the 6-character
string
'aaaaaa', "a{3,5}" will match 5 "a" characters, while a{3,5}? will only
match 3 characters.
"\"
Either escapes special characters (permitting you to match characters
like "*",
"?", and so forth), or signals a special sequence; special sequences
are
discussed below.
[]
Used to indicate a set of characters. Characters can be listed
individually,
or a range of characters can be indicated by giving two characters and
separating them by a "-". Special characters are not active inside
sets. For
example, "[akm$]" will match any of the characters 'a', 'k', 'm', or
'$';
"[a-z]" will match any lowercase letter, and "[a-zA-Z0-9]" matches any
letter or digit. Character classes such as "\w" or "\S" (defined below)
are
also acceptable inside a range. If you want to include a "]" or a "-"
inside a
set, precede it with a backslash, or place it as the first character.
The
pattern "[]]" will match ']', for example.
You can match the characters not within a range by complementing the
set. This is indicated by including a "^" as the first character of the
set;
"^" elsewhere will simply match the "^" character. For example, "[^5]"
will match any character except "5".
"|"
A|B, where A and B can be arbitrary REs, creates a regular expression
that will
match either A or B. An arbitrary number of REs can be separated by the
"|" in
this way. This can be used inside groups (see below) as well. REs
separated
by "|" are tried from left to right, and the first one that allows the
complete
pattern to match is considered the accepted branch. This means that if
A
matches, B will never be tested, even if it would produce a longer
overall
match. In other words, the "|" operator is never greedy. To match a
literal
"|", use "\|", or enclose it inside a character class, as in "[|]".
(...)
Matches whatever regular expression is inside the parentheses, and
indicates
the start and end of a group; the contents of a group can be retrieved
after a
match has been performed, and can be matched later in the string with
the
\number special sequence, described below. To match the literals "(" or
")", use
"\(" or "\)", or enclose them inside a character class: "[(] [)]".
(?...)
This is an extension notation (a "?" following a "(" is not meaningful
otherwise). The first character after the "?" determines what the
meaning and
further syntax of the construct is. Extensions usually do not create a
new
group; "(?P<name>...)" is the only exception to this rule. Following
are the
currently supported extensions.
(?iLmsux)
(One or more letters from the set "i", "L", "m", "s", "u", "x".) The
group
matches the empty string; the letters set the corresponding flags
(IGNORECASE, LOCALE, MULTILINE, DOTALL, UNICODE, VERBOSE) for the
entire regular expression.
Note that the (?x) flag changes how the expression is parsed. It
should be
used first in the expression string, or after one or more whitespace
characters.
If there are non-whitespace characters before the flag, the results are
undefined.
(?:...)
A non-grouping version of regular parentheses. Matches whatever regular
expression is inside the parentheses, but the substring matched by the
group
cannot be retrieved after performing a match or referenced later in the
pattern.
(?P<name>...)
Similar to regular parentheses, but the substring matched by the group
is
accessible via the symbolic group name "name". Group names must be
valid
Python identifiers. A symbolic group is also a numbered group, just as
if the
group were not named. So the group named "id" in the example below can
also be referenced as the numbered group 1.
For example, if the pattern is "(?P<id>[a-zA-Z_]\w*)", the group can
be
referenced by its name in pattern text (e.g. "(?P=id)") and replacement
text
(e.g. "\g<id>").
(?P=name)
Matches whatever text was matched by the earlier group named "name".
(?#...)
A comment; the contents of the parentheses are simply ignored.
(?=...)
Matches if "..." matches next, but doesn't consume any of the string.
This is
called a lookahead assertion. For example, "Isaac (?=Asimov)" will
match
'Isaac' only if it's followed by 'Asimov'.
(?!...)
Matches if "..." doesn't match next. This is a negative lookahead
assertion.
For example, "Isaac (?!Asimov)" will match 'Isaac' only if it's NOT
followed
by 'Asimov'.
(?<=...)
Matches if the current position in the string is preceded by a match
for
"..." that ends at the current position. This is called a positive
lookbehind
assertion. "(?<=abc)def" will match 'abcdef', since the lookbehind will
back
up 3 characters and check if the contained pattern matches. The
contained
pattern must only match strings of some fixed length, meaning that
"abc"
or "a|b" are allowed, but "a*" isn't.
NOTE: To use the positive lookbehind assertion, you must specify use of
the sre regular expression engine via the '-s' or '--sre' options.
(?<!...)
Matches if the current position in the string is not preceded by a
match for
"...". This is called a negative lookbehind assertion. Similar to
positive
lookbehind assertions, the contained pattern must only match strings of
some fixed length.
NOTE: To use the negative lookbehind assertion, you must specify use of
the sre regular expression engine via the '-s' or '--sre' options.
The special sequences consist of "\" and a character from the list
below. If
the ordinary character is not on the list, then the resulting RE will
match the
second character. For example, "\$" matches the character "$".
\number
Matches the contents of the group of the same number. Groups are
numbered
starting from 1. For example, "(.+) \1" matches 'the the' or '55 55',
but not
'the end' (note the space after the group). This special sequence can
only be
used to match one of the first 99 groups. If the first digit of number
is 0, or
number is 3 octal digits long, it will not be interpreted as a group
match, but
as the character with octal value number. Inside the "[" and "]" of a
character
class, all numeric escapes are treated as characters.
\A
Matches only at the start of the string.
\b
Matches the empty string, but only at the beginning or end of a word. A
word
is defined as a sequence of alphanumeric characters, so the end of a
word is
indicated by whitespace or a non-alphanumeric character. Inside a
character
range, "\b" represents the backspace character, for compatibility with
Python's string literals.
\B
Matches the empty string, but only when it is not at the beginning or
end of
a word.
\d
Matches any decimal digit; this is equivalent to the set "[0-9]".
\D
Matches any non-digit character; this is equivalent to the set
"[^0-9]".
\s
Matches any whitespace character; this is equivalent to the set "[
\t\n\r\f\v]".
\S
Matches any non-whitespace character; this is equivalent to the set
"[^ \t\n\r\f\v]".
\w
When the LOCALE and UNICODE flags are not specified, matches any
alphanumeric
character; this is equivalent to the set "[a-zA-Z0-9_]". With LOCALE,
it will
match the set "[0-9_]" plus whatever characters are defined as letters
for the
current locale. If UNICODE is set, this will match the characters
"[0-9_]" plus
whatever is classified as alphanumeric in the Unicode character
properties
database.
\W
When the LOCALE and UNICODE flags are not specified, matches any non-
alphanumeric character; this is equivalent to the set "[^a-zA-Z0-9_]".
With LOCALE, it will match any character not in the set "[0-9_]", and
not
defined as a letter for the current locale. If UNICODE is set, this
will
match anything other than "[0-9_]" and characters marked at
alphanumeric
in the Unicode character properties database.
\Z
Matches only at the end of the string.
\\
Matches a literal backslash. """
version = "1.2.7: 08/18/03"
#heresy = ''

## Set the default for which re module is used by editing the next
## uncommented line:
##   using_sre = 0 -> don't use sre, use pre instead
##   using_sre = 1 -> use sre, not pre
using_sre = 0
if using_sre:
    import re
else:
    import pre as re

# Option specifications handed to getopt.getopt(): a trailing ":" (short form)
# or "=" (long form) marks an option that takes an argument.
# "o:" is listed so that the "-o" spelling of --output_dir, which the option
# loop further down checks for, is actually accepted by getopt instead of
# being rejected as an unrecognized option.
short_opts = "x:o:rspvh?"
long_opts = ["extension=", "output_dir=", "version", "regex", "sre",
             "pre", "help", "food="]
# separate command-line options from file list/file globs
try:
options, files = getopt.getopt(sys.argv[1:], short_opts, long_opts)
except getopt.error, err_value: # a bogus option or option argument
was entered
print "ERROR:", err_value, "\n\n", valid_opts
sys.exit(quit_msg)
# check for file list/file globs
# (obtain lists of text files and inquisition files)
if files:
inqfiles, datafiles, optfiles, files = sortfiles(files, [".inq",
".data", ".opt"])
# create absolute paths for each file in each group of files
# and sort inqfiles and optfiles by filename only
files = create_abspaths(files)
optfiles = filenamesort(create_abspaths(optfiles))
inqfiles.extend(datafiles)
inqfiles = filenamesort(create_abspaths(inqfiles))
if not files and not optfiles:
print "ERROR: No files matching the filename(s) you provided.\n\n"
sys.exit(quit_msg)
if not inqfiles and not optfiles:
if not optfiles:
inqfiles, optfiles = get_inqfiles(files)
else:
inqfiles, more_optfiles = get_inqfiles(files)
optfiles.extend(more_optfiles)
optfiles = filenamesort(optfiles)
if not inqfiles:
print "ERROR: No inquisition files found.\n\n"
sys.exit(quit_msg)
else:
optfiles = []
if not options:
print usage
if sys.platform == "mac":
input = raw_input("For a full description of Python's regular
expression syntax,\nenter 'r' <<or [command] + [Q] to quit>>: ")
if input == "r" or input == "R":
print "\n" + re_syntax
sys.exit(quit_msg)
# compile all options
all_options = compile_options(optfiles, short_opts, long_opts)
if all_options == "ERROR":
print "\n", valid_opts
sys.exit(quit_msg)
all_options.extend(options)
# set values for options
ext = ""
output_dir = "free_of_heresy" # dni: added this default (11-06-2002)
(see next dni comment)
food = []
for opt, arg in all_options:
if opt in ("-h", "-?", "--help"):
print usage
sys.exit(quit_msg)
elif opt in ("-v", "--version"):
print "Nobody expects The Spanish Inquisition!\nby Damon Butler\n(v"
+ version + ")"
sys.exit(quit_msg)
elif opt in ("-r", "--regex"):
print re_syntax
sys.exit(quit_msg)
elif opt in ("-x", "--extension"):
ext = arg
elif opt in ("-o", "--output_dir"):
output_dir = arg
elif opt in ("-s", "--sre"):
using_sre = 1
print ' [using sre module]'
elif opt in ("-p", "--pre"):
using_sre = 0
print ' [using pre module]'
elif opt == "--food":
food.append(arg)
else: # should normally be unnecessary, but ...
print "ERROR: option " + opt + " not recognized\n\n" + valid_opts
sys.exit(quit_msg)
if not ext and sys.platform == "mac": ext = "_inq.txt"
# dni: (11-06-2002) Let the output file overwrite input file if
--output_dir="" is supplied on the command line.
# if not output_dir: output_dir = "free_of_heresy"
# just in case valid, non sys.exit() options were provided without any
legal text or inquisition files
# (this is necessary because of the tortured logic surrounding
optfiles and file sorting above)
if not files:
print "ERROR: No files matching the filename(s) you provided
found.\n\n"
sys.exit(quit_msg)
if not inqfiles:
print "ERROR: No inquisition files found.\n\n"
sys.exit(quit_msg)
# process inquisition files to create master list of searches and
replaces
regexes = compile_searches(inqfiles, using_sre)
if not regexes:
print "\n"
sys.exit(quit_msg)
# process each file
at_least_one = 0
for file in files:
print ' [Reading: "%s"]' % file
text = readfile(file)
if text:
at_least_one = 1
text = search_replace(text, regexes)
if text == "ERROR":
print "\n"
sys.exit(quit_msg)
print ' [Writing: "%s"]' % (os.path.join(output_dir, file) + ext,)
writefile(text, file, ext, output_dir)
else:
print "%s not found or is blank. Skipping ..." % file
# done!
if at_least_one:
pass
else:
print "\nERROR: No text was processed.\n"
print quit_msg
Thanks and Regards.