Pulling stuff out of the Internet Archive (Wayback Machine)

D

David Fraser

I used this when trying to retrieve the McMillan site. Others might find
it useful...

David

#!/usr/bin/env python

import urlparse
import urllib2
import os
import HTMLParser
import sre

class HTMLLinkScanner(HTMLParser.HTMLParser):
    """Collect the URL-carrying attributes found in an HTML document.

    Feed markup with ``feed()``. Afterwards:

    * ``links`` is a dict whose keys are the link targets (fragment
      removed) of every matching tag except ``<base>``; values are ``True``.
    * ``replacements`` is a list of ``(start_tag_text, attr, value)``
      tuples, one per matching attribute, ``<base>`` included.
    """

    # Tag name -> attribute name (or list of attribute names) holding a URL.
    tags = {'a': 'href', 'img': 'src', 'frame': 'src', 'base': 'href'}

    def reset(self):
        """Clear the collected results, then reset the underlying parser."""
        # The base-class constructor calls reset(), so this also serves as
        # the initialiser for the two result containers.
        self.links = {}
        self.replacements = []
        HTMLParser.HTMLParser.reset(self)

    def handle_starttag(self, tag, attrs):
        """Record the URL attributes of a start tag listed in ``tags``."""
        if tag not in self.tags:
            return
        checkattrs = self.tags[tag]
        if isinstance(checkattrs, (str, unicode)):
            checkattrs = [checkattrs]
        for attr, value in attrs:
            if attr not in checkattrs:
                continue
            if value is None:
                # The parser reports a valueless attribute (``<a href>``) as
                # None; there is no URL to defragment or record.
                continue
            if tag != 'base':
                # <base href> is recorded below but is not added to links.
                link = urlparse.urldefrag(value)[0]
                self.links[link] = True
            self.replacements.append((self.get_starttag_text(), attr, value))

class MirrorRetriever:
# Create a retriever. archivedir is the directory that url2filename() uses as
# the root of every local file name it builds.
def __init__(self, archivedir):
self.archivedir = archivedir
# Looked up by URL in mirror() so that a URL already seen is not handled twice.
self.urlmap = {}

def url2filename(self, url):
    """Map *url* to a local file path below ``self.archivedir``.

    The result is ``archivedir/host/path``; a URL whose scheme is not
    ``http`` gets an extra ``scheme`` directory in front of the host.
    Directory-style URLs (empty path, trailing ``/``, or a final ``.`` or
    ``..`` segment) map to ``index.html``. The query string and fragment
    are ignored.
    """
    scheme, location, path = urlparse.urlsplit(url)[:3]
    raw_segments = path.split('/')
    segments = []
    for segment in raw_segments:
        if segment in ('', '.'):
            continue
        if segment == '..':
            # Resolve parent references here instead of passing them to the
            # file system: URLs come from crawled pages, and a raw '..'
            # would let the result climb out of the host directory.
            if segments:
                segments.pop()
            continue
        segments.append(segment)
    if raw_segments[-1] in ('', '.', '..'):
        # The URL names a directory, so store its default document.
        segments.append('index.html')
    if scheme.lower() != 'http':
        location = os.path.join(scheme, location)
    # ignore query for the meantime
    return os.path.join(self.archivedir, location, *segments)

def testinclude(self, url):
scheme, location, path, query, fragment = urlparse.urlsplit(url)
if scheme in ('mailto', 'javascript'): return False
# TODO: add ability to specify site
# return location.lower() == 'www.mcmillan-inc.com'
return True

def ensuredir(self, pathname):
    """Create directory *pathname*, including missing parents, if needed.

    An empty *pathname* (what ``os.path.dirname`` returns for a bare
    relative name) refers to the current directory and is a no-op.
    Raises OSError if the directory cannot be created.
    """
    if not pathname or os.path.isdir(pathname):
        return
    try:
        os.makedirs(pathname)
    except OSError:
        # Another process may have created it between the check and the
        # call; only propagate the error if the directory is still missing.
        if not os.path.isdir(pathname):
            raise

def retrieveurl(self, url):
    """Fetch *url* and return the complete response body.

    Errors raised by ``urllib2.urlopen`` propagate to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Close explicitly instead of relying on garbage collection, so a
        # long crawl does not accumulate open connections.
        response.close()

def mirror(self, url):
if url in self.urlmap:
return
else:
filename = self.url2filename(url)
if not self.testinclude(url):
return
print url,'->',filename
self.urlmap = filename # TODO: add an op...th('.'), sys.argv[2]) m.mirror(sys.argv[1])
 
P

Paul Rubin

David Fraser said:
I used this when trying to retrieve the McMillan site. Others might
find it useful...

Cool, thanks. I've done stuff like that a bunch of times, but I
usually just examine the HTML manually and identify a few fixed
strings to search for, to locate the links I want.
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only takes a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

Forum statistics

Threads
473,769
Messages
2,569,580
Members
45,054
Latest member
TrimKetoBoost

Latest Threads

Top