Lex Hider
Hi,
Apologies if this is against etiquette. I've just got my first python app up
and running. It is a podcast aggregator depending on feedparser. I've really
only learnt enough to get this up and running.
Any tips on the code quality and use of python would be appreciated. I've got
a feeling the overall structure is up the creek.
approx 220 LOC.
file: GodCast.py
Cheers,
Lex.
#!/usr/bin/python
# GodCast: podcast aggregator!
# depends on wget & lynx
# * one of the main features of GodCast is it's use of bandwidth.
# Many podcatchers
# http://www.faqts.com/knowledge_base/view.phtml/aid/422/fid/17
# TODO: not found log
# TODO:
# config file
# opml feed list?
# pygtk/pyqt/qtkde gui?
# possible flags: test, print but don't actual do anything
import re, feedparser, os, sys, shutil, time, getopt
import urllib2
import urllib
import md5
boz = ""  # accumulates "BOZO" warnings about malformed feeds; printed at exit
HOME = os.path.expanduser("~")
# user configurable
#maxChecksPerDay = 8
#maxChecksPerDay = 12
maxChecksPerDay = 24  # how often each feed's cached XML may be refetched per day
myTemp = '/tmp'  # scratch space for in-progress downloads
#podDir = os.path.join(HOME, 'Audio/Podcasts')
podDir = os.path.join(HOME, 'Podcasts')  # final destination for completed downloads
# end user configurable
downDir = os.path.join(myTemp, 'Podcasts')  # temporary download area
dotDir = os.path.join(HOME, '.aGodCast')  # per-user state directory
logFile = os.path.join(dotDir, 'log') #list of downloaded urls
cacheDir = os.path.join(dotDir, 'cache')  # md5-named local copies of feed XML
ignoreNotFound = False # if true, add files not found to log
# list of feeds, ignore lines not beginning ^http
feedList = os.path.join(dotDir, 'feeds.txt')
def exitFunc():
    """sys.exitfunc hook: report any accumulated feed-parse (bozo) warnings."""
    #f.close()
    #log.close()
    if boz:
        print boz
def makeDirs(*dirs):
    """Create each given directory (and any intermediates) if missing.

    The loop variable is named `path` rather than `dir` to avoid
    shadowing the builtin.
    """
    for path in dirs:
        if not os.path.exists(path):
            os.makedirs(path)
# render is used because feeds use a lot of html, not just plain text.
def render(html):
    """Dump `html` to the terminal as plain text via lynx.

    The markup is written to lynx's stdin instead of being interpolated
    into a shell command line: feed content is untrusted, and the old
    'echo "..."' form let backticks, $(...) and unescaped quotes in a
    feed execute as shell code.
    """
    if html:
        import subprocess  # local import: module-level deps unchanged
        #command = 'echo "' + html + '" | w3m -dump -T text/html'
        #command = 'echo "' + html + '" | html2text'
        lynx = subprocess.Popen(['lynx', '-dump', '-stdin', '-force_html'],
                                stdin=subprocess.PIPE)
        lynx.communicate(html.encode('utf8'))
def localMD5(url):
    """Return the cache-file path for `url`.

    The md5 hex digest of the url gives a unique, filesystem-safe name;
    `fname` rather than `hash` avoids shadowing the builtin.
    """
    fname = md5.new(url).hexdigest() + '.xml' #unique name from url
    return os.path.join(cacheDir, fname)
def cache(url):
max = 60 * 60 * 24 / maxChecksPerDay #seconds
myfile = localMD5(url)
if os.path.isfile(myfile):
elapsed = int(time.time()) - os.path.getmtime(myfile)
if elapsed <= max:
return
print "FETCHING:", url + ' ...'
urllib.urlretrieve(url, myfile)
# handle half finish?
def updateCache(feeds):
l = []
print "updating local xml cache..."
for feed in file(feeds, "r").read().split('\n'):
if not re.match('^http://', feed): # feedList ignores anything but
urls
continue
# TODO: handle whitespace, strip trailing
cache(feed)
l.append([localMD5(feed), feed])
print "cache up to date"
return l
def geturl(url):
    """Resolve `url`, following redirects.

    Returns the final URL (a string) on success, the HTTP status code
    (an int) on HTTPError, or 0 on any other URLError -- callers tell
    success from failure by checking the return TYPE.
    """
    try:
        redir = urllib2.urlopen(url).geturl()
    except urllib2.HTTPError, e:
        if e.code != 404:  # 404 is reported by the caller instead
            print url
            print "geturl HTTPError:", e.code
        return e.code
    except urllib2.URLError, e:
        # e.g. (110, 'Connection timed out')
        print e.reason
        #print "geturl URLError:", e.code
    else:
        return redir
    return 0
def htmlTitle(mainTitle, subTitle):
    """Build the HTML heading block shown before each feed item."""
    parts = ['<HR>', '<H2>', mainTitle, '</H2>', '<H3>', subTitle, '</H3>']
    return ''.join(parts)
def downloadPod(url, dest):
kb = 2
success = 0
command = 'wget --continue -O "' + dest + '" "' + url + '"'
status = os.system(command)
if status == success:
return True
else:
print "\nWGET:", status
if status == kb:
pass
#raise KeyboardInterrupt
return False
def downloadQueue(q, latest):
    """Download up to `latest` enclosures per feed, breadth-first.

    Iterating cast-index first and feed second spreads traffic across
    feeds instead of draining one feed at a time.  Items already on disk
    or already logged are skipped; a failed download aborts the run so
    wget --continue can resume it next time.
    """
    for x in range(latest):
        for [feedTitle, castList] in q:
            if not len(castList) > x:
                continue
            cast = castList[x]
            if cast is None:  # placeholder: already downloaded earlier
                continue
            url = cast.enclosures[0]['href']
            redirect = geturl(url) # TRAFFIC
            if type(redirect) != int: #success: redirect is the final URL
                render(htmlTitle(feedTitle + ": #" + str(x+1), cast.title))
                render(cast.description)
                # strip any ?query suffix to get a clean local file name
                podFile = os.path.basename(redirect).split('?')[0]
                permDir = os.path.join(podDir, feedTitle)
                permFile = os.path.join(permDir, podFile)
                tempDir = os.path.join(downDir, feedTitle)
                tempFile = os.path.join(tempDir, podFile)
                if not os.path.isfile(permFile):
                    makeDirs(tempDir, permDir)
                    # download to temp then move, so permDir never holds
                    # a half-finished file
                    if downloadPod(redirect, tempFile): # TRAFFIC
                        shutil.move(tempFile, permFile)
                        log(url)
                    else:
                        print "EXITING"
                        sys.exit(2)
                else:
                    render("<BR>*** ON HARD-DRIVE ***")
                    log(url)
            elif redirect == 404:
                print 'NOT FOUND:', url
                if ignoreNotFound:
                    print '\tWILL NO LONGER ATTEMPT TO DOWNLOAD\n'
                    log(url)
                else:
                    sys.exit(2)
def log(url):
    """Append `url` to the download log so it is never fetched again.

    The handle is closed explicitly; the old one-liner leaked it and
    relied on refcounting to flush the write.
    """
    f = file(logFile, 'a')
    try:
        f.write(url + "\n")
    finally:
        f.close()
def main(args):
sys.exitfunc = exitFunc
makeDirs(dotDir, podDir, downDir, cacheDir)
#make file if doesn't exist, may be better solution?
X = file(logFile, 'a')
latest = 13 #get the first x casts for each feed
try:
opts, args = getopt.getopt(sys.argv[1:], "l:",
["latest=", "notfound"])
except getopt.GetoptError:
sys.exit(2)
#usage()
for opt, arg in opts:
if opt in ("-l", "--latest"):
latest = int(arg)
elif opt in ("--notfound"):
ignoreNotFound = True #add notfound files to log
Q = []
for [xmlFile, url] in updateCache(feedList):
output = ""
xml = feedparser.parse(xmlFile)
if xml.channel.has_key('title'): #skip dodgy feeds
itemQ= []
for item in xml['items'][:latest]:
if item.has_key('enclosures'):
podURL = item.enclosures[0]['href']
#check if url in log
if file(logFile, 'r').read().find(podURL) < 0:
itemQ.append(item)
output += htmlTitle(xml.channel.title, item.title)
output += item.description
else:
itemQ.append(None)
Q.append([xml.channel.title, itemQ])
else:
print "DODGY FEED:", url
if xml.bozo:
boz += "BOZO: " + xml.bozo_exception.getMessage() + "\t" + url
sys.exit(2) #time.sleep(1) # allow ctrl+c #continue
render(output)
if Q is not None:
render('<HR><H1>DOWNLOADING QUEUE</H1><HR>')
downloadQueue(Q, latest)
######################################################
# script entry point
if __name__=="__main__":
    main(sys.argv)
Apologies if this is against etiquette. I've just got my first python app up
and running. It is a podcast aggregator depending on feedparser. I've really
only learnt enough to get this up and running.
Any tips on the code quality and use of python would be appreciated. I've got
a feeling the overall structure is up the creek.
approx 220 LOC.
file: GodCast.py
Cheers,
Lex.
#!/usr/bin/python
# GodCast: podcast aggregator!
# depends on wget & lynx
# * one of the main features of GodCast is it's use of bandwidth.
# Many podcatchers
# http://www.faqts.com/knowledge_base/view.phtml/aid/422/fid/17
# TODO: not found log
# TODO:
# config file
# opml feed list?
# pygtk/pyqt/qtkde gui?
# possible flags: test, print but don't actual do anything
import re, feedparser, os, sys, shutil, time, getopt
import urllib2
import urllib
import md5
boz = ""  # accumulates "BOZO" warnings about malformed feeds; printed at exit
HOME = os.path.expanduser("~")
# user configurable
#maxChecksPerDay = 8
#maxChecksPerDay = 12
maxChecksPerDay = 24  # how often each feed's cached XML may be refetched per day
myTemp = '/tmp'  # scratch space for in-progress downloads
#podDir = os.path.join(HOME, 'Audio/Podcasts')
podDir = os.path.join(HOME, 'Podcasts')  # final destination for completed downloads
# end user configurable
downDir = os.path.join(myTemp, 'Podcasts')  # temporary download area
dotDir = os.path.join(HOME, '.aGodCast')  # per-user state directory
logFile = os.path.join(dotDir, 'log') #list of downloaded urls
cacheDir = os.path.join(dotDir, 'cache')  # md5-named local copies of feed XML
ignoreNotFound = False # if true, add files not found to log
# list of feeds, ignore lines not beginning ^http
feedList = os.path.join(dotDir, 'feeds.txt')
def exitFunc():
    """sys.exitfunc hook: report any accumulated feed-parse (bozo) warnings."""
    #f.close()
    #log.close()
    if boz:
        print boz
def makeDirs(*dirs):
    """Create each given directory (and any intermediates) if missing.

    The loop variable is named `path` rather than `dir` to avoid
    shadowing the builtin.
    """
    for path in dirs:
        if not os.path.exists(path):
            os.makedirs(path)
# render is used because feeds use a lot of html, not just plain text.
def render(html):
    """Dump `html` to the terminal as plain text via lynx.

    The markup is written to lynx's stdin instead of being interpolated
    into a shell command line: feed content is untrusted, and the old
    'echo "..."' form let backticks, $(...) and unescaped quotes in a
    feed execute as shell code.
    """
    if html:
        import subprocess  # local import: module-level deps unchanged
        #command = 'echo "' + html + '" | w3m -dump -T text/html'
        #command = 'echo "' + html + '" | html2text'
        lynx = subprocess.Popen(['lynx', '-dump', '-stdin', '-force_html'],
                                stdin=subprocess.PIPE)
        lynx.communicate(html.encode('utf8'))
def localMD5(url):
    """Return the cache-file path for `url`.

    The md5 hex digest of the url gives a unique, filesystem-safe name;
    `fname` rather than `hash` avoids shadowing the builtin.
    """
    fname = md5.new(url).hexdigest() + '.xml' #unique name from url
    return os.path.join(cacheDir, fname)
def cache(url):
max = 60 * 60 * 24 / maxChecksPerDay #seconds
myfile = localMD5(url)
if os.path.isfile(myfile):
elapsed = int(time.time()) - os.path.getmtime(myfile)
if elapsed <= max:
return
print "FETCHING:", url + ' ...'
urllib.urlretrieve(url, myfile)
# handle half finish?
def updateCache(feeds):
l = []
print "updating local xml cache..."
for feed in file(feeds, "r").read().split('\n'):
if not re.match('^http://', feed): # feedList ignores anything but
urls
continue
# TODO: handle whitespace, strip trailing
cache(feed)
l.append([localMD5(feed), feed])
print "cache up to date"
return l
def geturl(url):
    """Resolve `url`, following redirects.

    Returns the final URL (a string) on success, the HTTP status code
    (an int) on HTTPError, or 0 on any other URLError -- callers tell
    success from failure by checking the return TYPE.
    """
    try:
        redir = urllib2.urlopen(url).geturl()
    except urllib2.HTTPError, e:
        if e.code != 404:  # 404 is reported by the caller instead
            print url
            print "geturl HTTPError:", e.code
        return e.code
    except urllib2.URLError, e:
        # e.g. (110, 'Connection timed out')
        print e.reason
        #print "geturl URLError:", e.code
    else:
        return redir
    return 0
def htmlTitle(mainTitle, subTitle):
    """Build the HTML heading block shown before each feed item."""
    parts = ['<HR>', '<H2>', mainTitle, '</H2>', '<H3>', subTitle, '</H3>']
    return ''.join(parts)
def downloadPod(url, dest):
kb = 2
success = 0
command = 'wget --continue -O "' + dest + '" "' + url + '"'
status = os.system(command)
if status == success:
return True
else:
print "\nWGET:", status
if status == kb:
pass
#raise KeyboardInterrupt
return False
def downloadQueue(q, latest):
    """Download up to `latest` enclosures per feed, breadth-first.

    Iterating cast-index first and feed second spreads traffic across
    feeds instead of draining one feed at a time.  Items already on disk
    or already logged are skipped; a failed download aborts the run so
    wget --continue can resume it next time.
    """
    for x in range(latest):
        for [feedTitle, castList] in q:
            if not len(castList) > x:
                continue
            cast = castList[x]
            if cast is None:  # placeholder: already downloaded earlier
                continue
            url = cast.enclosures[0]['href']
            redirect = geturl(url) # TRAFFIC
            if type(redirect) != int: #success: redirect is the final URL
                render(htmlTitle(feedTitle + ": #" + str(x+1), cast.title))
                render(cast.description)
                # strip any ?query suffix to get a clean local file name
                podFile = os.path.basename(redirect).split('?')[0]
                permDir = os.path.join(podDir, feedTitle)
                permFile = os.path.join(permDir, podFile)
                tempDir = os.path.join(downDir, feedTitle)
                tempFile = os.path.join(tempDir, podFile)
                if not os.path.isfile(permFile):
                    makeDirs(tempDir, permDir)
                    # download to temp then move, so permDir never holds
                    # a half-finished file
                    if downloadPod(redirect, tempFile): # TRAFFIC
                        shutil.move(tempFile, permFile)
                        log(url)
                    else:
                        print "EXITING"
                        sys.exit(2)
                else:
                    render("<BR>*** ON HARD-DRIVE ***")
                    log(url)
            elif redirect == 404:
                print 'NOT FOUND:', url
                if ignoreNotFound:
                    print '\tWILL NO LONGER ATTEMPT TO DOWNLOAD\n'
                    log(url)
                else:
                    sys.exit(2)
def log(url):
    """Append `url` to the download log so it is never fetched again.

    The handle is closed explicitly; the old one-liner leaked it and
    relied on refcounting to flush the write.
    """
    f = file(logFile, 'a')
    try:
        f.write(url + "\n")
    finally:
        f.close()
def main(args):
sys.exitfunc = exitFunc
makeDirs(dotDir, podDir, downDir, cacheDir)
#make file if doesn't exist, may be better solution?
X = file(logFile, 'a')
latest = 13 #get the first x casts for each feed
try:
opts, args = getopt.getopt(sys.argv[1:], "l:",
["latest=", "notfound"])
except getopt.GetoptError:
sys.exit(2)
#usage()
for opt, arg in opts:
if opt in ("-l", "--latest"):
latest = int(arg)
elif opt in ("--notfound"):
ignoreNotFound = True #add notfound files to log
Q = []
for [xmlFile, url] in updateCache(feedList):
output = ""
xml = feedparser.parse(xmlFile)
if xml.channel.has_key('title'): #skip dodgy feeds
itemQ= []
for item in xml['items'][:latest]:
if item.has_key('enclosures'):
podURL = item.enclosures[0]['href']
#check if url in log
if file(logFile, 'r').read().find(podURL) < 0:
itemQ.append(item)
output += htmlTitle(xml.channel.title, item.title)
output += item.description
else:
itemQ.append(None)
Q.append([xml.channel.title, itemQ])
else:
print "DODGY FEED:", url
if xml.bozo:
boz += "BOZO: " + xml.bozo_exception.getMessage() + "\t" + url
sys.exit(2) #time.sleep(1) # allow ctrl+c #continue
render(output)
if Q is not None:
render('<HR><H1>DOWNLOADING QUEUE</H1><HR>')
downloadQueue(Q, latest)
######################################################
# script entry point
if __name__=="__main__":
    main(sys.argv)