Thanks Steven. Actually, I wanted to do text processing for my office,
where I can view all files in the system and use the first three to
give a summary of the document, instead of having somebody actually
enter the summary. It seems there is no single piece of code that can
act as a converter across formats, so I'll have to check out converters
for the individual formats.
I have some old code that does just that. It uses catdoc, pdftotext
and links to convert .doc, .pdf and .html files, respectively, to text.
##################################################################
import mimetypes
from subprocess import call, Popen, PIPE
import sys
class ConversionError(Exception):
    """Base class for every error raised while converting a document."""


class UnknownMimeType(ConversionError):
    """The document's MIME type was detected but has no converter."""


class NotAMimeType(ConversionError):
    """No MIME type could be guessed for the document."""


class ParseError(ConversionError):
    """The external converter exited with a non-zero status."""
def has_program(progname):
    """Return True if ``which`` exits with status 0 for *progname*.

    Runs ``which progname`` as a child process and compares its exit
    status with 0.  An OSError from launching ``which`` itself is not
    caught and propagates to the caller.
    """
    import os

    # call() only waits for the child and never reads from a pipe, so the
    # subprocess documentation warns against stdout=PIPE here.  Send the
    # output to the null device instead and make sure it gets closed.
    devnull = open(os.devnull, "w")
    try:
        return call(["which", progname], stdout=devnull) == 0
    finally:
        devnull.close()
def check_requirements():
    """Check that catdoc, pdftotext and links are all available.

    Each program is looked up with has_program().  If any are missing,
    their names are printed on one line and False is returned; otherwise
    True is returned and nothing is printed.
    """
    required = ("catdoc", "pdftotext", "links")
    missing = [prog for prog in required if not has_program(prog)]
    if missing:
        # Single-argument print() produces the same output under both
        # Python 2 and Python 3.
        print("You need to have the programs: " + " ".join(missing))
        return False
    return True
# Import-time guard: exit with status 1 as soon as the module is loaded
# if any of the external converter programs is missing.
if not check_requirements():
    print("Needed external programs not found, quitting")
    sys.exit(1)
def get_catdoc_args(infile):
    """Return the catdoc argument list for converting *infile*.

    The list is suitable for passing straight to subprocess.Popen.
    NOTE(review): ``-s 8859-1`` presumably selects the source charset;
    confirm against catdoc(1).
    """
    command = ["catdoc", "-s", "8859-1"]
    command.append(infile)
    return command
def get_pdftotext_args(infile):
    """Return the pdftotext argument list for converting *infile*.

    The list is suitable for passing straight to subprocess.Popen.
    NOTE(review): the trailing ``-`` presumably makes pdftotext write to
    stdout; confirm against pdftotext(1).
    """
    command = ["pdftotext"]
    command.extend([infile, "-"])
    return command
def get_links_args(infile):
    """Return the links argument list for converting *infile*.

    The list is suitable for passing straight to subprocess.Popen.
    NOTE(review): ``-dump`` presumably makes links print the rendered
    page to stdout; confirm against links(1).
    """
    command = ["links"]
    command.extend([infile, "-dump"])
    return command
def totext(document):
    """Convert *document* to text with an external program.

    The MIME type is guessed from the file name with
    mimetypes.guess_type() and used to pick the function that builds the
    converter command line (catdoc, pdftotext or links).

    Returns whatever the converter wrote to its standard output, exactly
    as read from the pipe (no decoding is applied).

    Raises:
        NotAMimeType: no MIME type could be guessed for *document*.
        UnknownMimeType: the guessed type has no registered converter.
        ParseError: the converter exited with a non-zero status.
        IOError/OSError: the converter failed and *document* cannot be
            opened (e.g. it does not exist).
    """
    filetype_to_args_map = {
        "application/msword": get_catdoc_args,
        "application/pdf": get_pdftotext_args,
        "text/html": get_links_args,
    }
    ftype = mimetypes.guess_type(document)[0]
    if not ftype:
        raise NotAMimeType("Couldn't detect mimetype for %s" % document)
    argfunc = filetype_to_args_map.get(ftype)
    if argfunc is None:
        raise UnknownMimeType("Don't know how to handle %s documents" % ftype)
    p = Popen(argfunc(document), stdout=PIPE, stderr=PIPE)
    # communicate() drains stdout and stderr together.  Reading only
    # stdout before waiting can deadlock once the child has filled the
    # stderr pipe.
    text, _errors = p.communicate()
    if p.returncode:
        # Force a better exception to be thrown if the file doesn't exist;
        # close the handle right away so it is not leaked when it does.
        open(document).close()
        raise ParseError("Failed to parse %s" % document)
    return text
if __name__ == "__main__":
    # Convert the file named on the command line, falling back to the
    # original hard-coded sample document when no argument is given.
    path = sys.argv[1] if len(sys.argv) > 1 else "testpdf.pdf"
    print(totext(path))