R
rabad
Hi,
I've created a custom filter based on HTMLParser, with the following
source:
class Filter(HTMLParser):
def __init__(self, keyfile):
HTMLParser.__init__(self)
mykwfile = open(keyfile, 'r')
self._keywords = []
for kw in mykwfile.read().split('\n'):
self._keywords.append(kw)
print kw
mykwfile.close()
self._toProcess = False
self.stack = []
def handle_starttag(self, tag, attrs):
if 'a' != tag:
self.stack.append(self.__html_start_tag(tag, attrs))
return
attrs = dict(attrs)
self._toProcess = True
for key in self._keywords:
if 'a' == tag:
p = re.compile(key, re.IGNORECASE)
if 'href' in attrs:
attrs['href'] = p.sub(r'XXXXX',attrs['href'])
self.stack.append(self.__html_start_tag(tag, attrs))
def handle_startendtag(self, tag, attrs):
if 'img' != tag and 'meta' != tag:
self.stack.append(self.__html_startend_tag(tag, attrs))
return
attrs = dict(attrs)
self._toProcess = True
for key in self._keywords:
p = re.compile(key, re.IGNORECASE)
if 'img' == tag:
if 'src' in attrs:
attrs['src'] = p.sub(r'XXXXX',attrs['src'])
if 'alt' in attrs:
attrs['alt'] = p.sub(r'XXXXX',attrs['alt'])
if 'meta' == tag:
if 'description' in attrs:
attrs['description'] =
p.sub(r'XXXXX',attrs['description'])
if 'content' in attrs:
attrs['content'] =
p.sub(r'XXXXX',attrs['content'])
if 'meta' == tag or 'img' == tag:
self._toProcess = False
self.stack.append(self.__html_startend_tag(tag, attrs))
def handle_endtag(self, tag):
self.stack.append(self.__html_end_tag(tag))
if self._toProcess:
self._toProcess = False
def handle_data(self, data):
if self._toProcess:
for key in self._keywords:
p = re.compile(key,re.IGNORECASE)
data = p.sub(r'XXXXX',data)
self.stack.append(data)
def __html_start_tag(self, tag, attrs):
return '<%s%s>' % (tag, self.__html_attrs(attrs))
def __html_startend_tag(self, tag, attrs):
return '<%s%s/>' % (tag, self.__html_attrs(attrs))
def __html_end_tag(self, tag):
return '</%s>' % (tag)
def __html_attrs(self, attrs):
_attrs = ''
if attrs:
_attrs = ' %s' % (' '.join([('%s="%s"' % (k,v)) for k,v in
attrs.iteritems()]))
return _attrs
But when I use it, it gives me the following error message:
ERROR Processor exception: AttributeError: 'list' object has no
attribute 'it
eritems'
Traceback (most recent call last):
File "d:\esp\lib\python2.3\processors\DocDumpF.py", line 87, in
Process
p.feed(document.GetValue("data"))
File "HTMLParser.py", line 108, in feed
File "HTMLParser.py", line 148, in goahead
File "HTMLParser.py", line 281, in parse_starttag
File "d:\esp\lib\python2.3\processors\DocDumpF.py", line 121, in
handle_startt
ag
self.stack.append(self.__html_start_tag(tag, attrs))
File "d:\esp\lib\python2.3\processors\DocDumpF.py", line 167, in
__html_start_
tag
return '<%s%s>' % (tag, self.__html_attrs(attrs))
File "d:\esp\lib\python2.3\processors\DocDumpF.py", line 178, in
__html_attrs
_attrs = ' %s' % (' '.join([('%s="%s"' % (k,v)) for k,v in
attrs.iteritems()
]))
Anybody knows why it says attrs is not a list element?
Thanks,
Rubén
I've created a custom filter based on HTMLParser, with the following
source:
class Filter(HTMLParser):
def __init__(self, keyfile):
HTMLParser.__init__(self)
mykwfile = open(keyfile, 'r')
self._keywords = []
for kw in mykwfile.read().split('\n'):
self._keywords.append(kw)
print kw
mykwfile.close()
self._toProcess = False
self.stack = []
def handle_starttag(self, tag, attrs):
if 'a' != tag:
self.stack.append(self.__html_start_tag(tag, attrs))
return
attrs = dict(attrs)
self._toProcess = True
for key in self._keywords:
if 'a' == tag:
p = re.compile(key, re.IGNORECASE)
if 'href' in attrs:
attrs['href'] = p.sub(r'XXXXX',attrs['href'])
self.stack.append(self.__html_start_tag(tag, attrs))
def handle_startendtag(self, tag, attrs):
if 'img' != tag and 'meta' != tag:
self.stack.append(self.__html_startend_tag(tag, attrs))
return
attrs = dict(attrs)
self._toProcess = True
for key in self._keywords:
p = re.compile(key, re.IGNORECASE)
if 'img' == tag:
if 'src' in attrs:
attrs['src'] = p.sub(r'XXXXX',attrs['src'])
if 'alt' in attrs:
attrs['alt'] = p.sub(r'XXXXX',attrs['alt'])
if 'meta' == tag:
if 'description' in attrs:
attrs['description'] =
p.sub(r'XXXXX',attrs['description'])
if 'content' in attrs:
attrs['content'] =
p.sub(r'XXXXX',attrs['content'])
if 'meta' == tag or 'img' == tag:
self._toProcess = False
self.stack.append(self.__html_startend_tag(tag, attrs))
def handle_endtag(self, tag):
self.stack.append(self.__html_end_tag(tag))
if self._toProcess:
self._toProcess = False
def handle_data(self, data):
if self._toProcess:
for key in self._keywords:
p = re.compile(key,re.IGNORECASE)
data = p.sub(r'XXXXX',data)
self.stack.append(data)
def __html_start_tag(self, tag, attrs):
return '<%s%s>' % (tag, self.__html_attrs(attrs))
def __html_startend_tag(self, tag, attrs):
return '<%s%s/>' % (tag, self.__html_attrs(attrs))
def __html_end_tag(self, tag):
return '</%s>' % (tag)
def __html_attrs(self, attrs):
_attrs = ''
if attrs:
_attrs = ' %s' % (' '.join([('%s="%s"' % (k,v)) for k,v in
attrs.iteritems()]))
return _attrs
But when I use it, it gives me the following error message:
ERROR Processor exception: AttributeError: 'list' object has no
attribute 'it
eritems'
Traceback (most recent call last):
File "d:\esp\lib\python2.3\processors\DocDumpF.py", line 87, in
Process
p.feed(document.GetValue("data"))
File "HTMLParser.py", line 108, in feed
File "HTMLParser.py", line 148, in goahead
File "HTMLParser.py", line 281, in parse_starttag
File "d:\esp\lib\python2.3\processors\DocDumpF.py", line 121, in
handle_startt
ag
self.stack.append(self.__html_start_tag(tag, attrs))
File "d:\esp\lib\python2.3\processors\DocDumpF.py", line 167, in
__html_start_
tag
return '<%s%s>' % (tag, self.__html_attrs(attrs))
File "d:\esp\lib\python2.3\processors\DocDumpF.py", line 178, in
__html_attrs
_attrs = ' %s' % (' '.join([('%s="%s"' % (k,v)) for k,v in
attrs.iteritems()
]))
Anybody knows why it says attrs is not a list element?
Thanks,
Rubén