Request for tips on my first python script.

L

Lex Hider

Hi,
Apologies if this is against etiquette. I've just got my first python app up
and running. It is a podcast aggregator depending on feedparser. I've really
only learnt enough to get this up and running.

Any tips on the code quality and use of python would be appreciated. I've got
a feeling the overall structure is up the creek.
approx 220 LOC.
file: GodCast.py

Cheers,
Lex.

#!/usr/bin/python
# GodCast: podcast aggregator!
# depends on wget & lynx
# * one of the main features of GodCast is it's use of bandwidth.
# Many podcatchers

# http://www.faqts.com/knowledge_base/view.phtml/aid/422/fid/17
# TODO: not found log
# TODO:
# config file
# opml feed list?
# pygtk/pyqt/qtkde gui?

# possible flags: test, print but don't actual do anything

import re, feedparser, os, sys, shutil, time, getopt
import urllib2
import urllib
import md5

boz = ""
HOME = os.path.expanduser("~")
# user configurable
#maxChecksPerDay = 8
#maxChecksPerDay = 12
maxChecksPerDay = 24
myTemp = '/tmp'
#podDir = os.path.join(HOME, 'Audio/Podcasts')
podDir = os.path.join(HOME, 'Podcasts')
# end user configurable
downDir = os.path.join(myTemp, 'Podcasts')
dotDir = os.path.join(HOME, '.aGodCast')
logFile = os.path.join(dotDir, 'log') #list of downloaded urls
cacheDir = os.path.join(dotDir, 'cache')
ignoreNotFound = False # if true, add files not found to log
# list of feeds, ignore lines not beginning ^http
feedList = os.path.join(dotDir, 'feeds.txt')


def exitFunc():
#f.close()
#log.close()
if boz:
print boz


def makeDirs(*dirs):
for dir in dirs:
if not os.path.exists(dir):
os.makedirs(dir)


# render is used because feeds use a lot of html, not just plain text.
def render(html):
if html:
html = re.sub('"', '\\"', html.encode('utf8'))
#command = 'echo "' + html + '" | w3m -dump -T text/html'
#command = 'echo "' + html + '" | html2text'
command = 'echo "' + html + '" | lynx -dump -stdin -force_html'
os.system(command)


def localMD5(url):
hash = md5.new(url).hexdigest() + '.xml' #unique name from url
return os.path.join(cacheDir, hash)


def cache(url):
max = 60 * 60 * 24 / maxChecksPerDay #seconds
myfile = localMD5(url)
if os.path.isfile(myfile):
elapsed = int(time.time()) - os.path.getmtime(myfile)
if elapsed <= max:
return
print "FETCHING:", url + ' ...'
urllib.urlretrieve(url, myfile)
# handle half finish?


def updateCache(feeds):
l = []
print "updating local xml cache..."
for feed in file(feeds, "r").read().split('\n'):
if not re.match('^http://', feed): # feedList ignores anything but
urls
continue
# TODO: handle whitespace, strip trailing
cache(feed)
l.append([localMD5(feed), feed])
print "cache up to date"
return l


def geturl(url):
try:
redir = urllib2.urlopen(url).geturl()
except urllib2.HTTPError, e:
if e.code != 404:
print url
print "geturl HTTPError:", e.code
return e.code
except urllib2.URLError, e:
# (110, 'Connection timed out')
print e.reason
#print "geturl URLError:", e.code
else:
return redir
return 0


def htmlTitle(mainTitle, subTitle):
s = '<HR>'
s += '<H2>' + mainTitle + '</H2>'
s += '<H3>' + subTitle + '</H3>'
return s


def downloadPod(url, dest):
kb = 2
success = 0
command = 'wget --continue -O "' + dest + '" "' + url + '"'
status = os.system(command)
if status == success:
return True
else:
print "\nWGET:", status
if status == kb:
pass
#raise KeyboardInterrupt
return False


def downloadQueue(q, latest):
for x in range(latest):
for [feedTitle, castList] in q:
if not len(castList) > x:
continue
cast = castList[x]
if cast is None:
continue
url = cast.enclosures[0]['href']
redirect = geturl(url) # TRAFFIC
if type(redirect) != int: #success
render(htmlTitle(feedTitle + ": #" + str(x+1), cast.title))
render(cast.description)

podFile = os.path.basename(redirect).split('?')[0]
permDir = os.path.join(podDir, feedTitle)
permFile = os.path.join(permDir, podFile)
tempDir = os.path.join(downDir, feedTitle)
tempFile = os.path.join(tempDir, podFile)
if not os.path.isfile(permFile):
makeDirs(tempDir, permDir)
if downloadPod(redirect, tempFile): # TRAFFIC
shutil.move(tempFile, permFile)
log(url)
else:
print "EXITING"
sys.exit(2)
else:
render("<BR>*** ON HARD-DRIVE ***")
log(url)
elif redirect == 404:
print 'NOT FOUND:', url
if ignoreNotFound:
print '\tWILL NO LONGER ATTEMPT TO DOWNLOAD\n'
log(url)
else:
sys.exit(2)


def log(url):
file(logFile, 'a').write(url + "\n")


def main(args):
sys.exitfunc = exitFunc
makeDirs(dotDir, podDir, downDir, cacheDir)
#make file if doesn't exist, may be better solution?
X = file(logFile, 'a')
latest = 13 #get the first x casts for each feed

try:
opts, args = getopt.getopt(sys.argv[1:], "l:",
["latest=", "notfound"])
except getopt.GetoptError:
sys.exit(2)
#usage()

for opt, arg in opts:
if opt in ("-l", "--latest"):
latest = int(arg)
elif opt in ("--notfound"):
ignoreNotFound = True #add notfound files to log

Q = []
for [xmlFile, url] in updateCache(feedList):
output = ""
xml = feedparser.parse(xmlFile)
if xml.channel.has_key('title'): #skip dodgy feeds
itemQ= []
for item in xml['items'][:latest]:
if item.has_key('enclosures'):
podURL = item.enclosures[0]['href']
#check if url in log
if file(logFile, 'r').read().find(podURL) < 0:
itemQ.append(item)
output += htmlTitle(xml.channel.title, item.title)
output += item.description
else:
itemQ.append(None)
Q.append([xml.channel.title, itemQ])
else:
print "DODGY FEED:", url
if xml.bozo:
boz += "BOZO: " + xml.bozo_exception.getMessage() + "\t" + url
sys.exit(2) #time.sleep(1) # allow ctrl+c #continue
render(output)

if Q is not None:
render('<HR><H1>DOWNLOADING QUEUE</H1><HR>')
downloadQueue(Q, latest)

######################################################
if __name__=="__main__":
main(sys.argv)
 
B

Bruno Desthuilliers

Lex said:
Hi,
Apologies if this is against etiquette. I've just got my first python app up
and running. It is a podcast aggregator depending on feedparser. I've really
only learnt enough to get this up and running.

Any tips on the code quality and use of python would be appreciated. I've got
a feeling the overall structure is up the creek.
approx 220 LOC.
file: GodCast.py

#!/usr/bin/python (snip)
# TODO: not found log http://docs.python.org/lib/module-logging.html

# TODO:
# config file http://docs.python.org/lib/module-ConfigParser.html

# opml feed list?
# pygtk/pyqt/qtkde gui?

# possible flags: test, print but don't actual do anything
http://docs.python.org/lib/module-optparse.html

import re, feedparser, os, sys, shutil, time, getopt
import urllib2
import urllib
import md5

boz = ""
HOME = os.path.expanduser("~")
# user configurable
#maxChecksPerDay = 8
#maxChecksPerDay = 12
maxChecksPerDay = 24
myTemp = '/tmp'

http://docs.python.org/lib/module-tempfile.html

BTW, note that the most common naming convention in Python is
all_lower_with_underscore (except for ClasseName).
#podDir = os.path.join(HOME, 'Audio/Podcasts')
podDir = os.path.join(HOME, 'Podcasts')
# end user configurable
downDir = os.path.join(myTemp, 'Podcasts')
dotDir = os.path.join(HOME, '.aGodCast')
logFile = os.path.join(dotDir, 'log') #list of downloaded urls
cacheDir = os.path.join(dotDir, 'cache')
ignoreNotFound = False # if true, add files not found to log
# list of feeds, ignore lines not beginning ^http
feedList = os.path.join(dotDir, 'feeds.txt')


def exitFunc():
#f.close()
#log.close()
if boz:
print boz


def makeDirs(*dirs):
for dir in dirs:
if not os.path.exists(dir):
os.makedirs(dir)


# render is used because feeds use a lot of html, not just plain text.
def render(html):
if html:
html = re.sub('"', '\\"', html.encode('utf8'))
#command = 'echo "' + html + '" | w3m -dump -T text/html'
#command = 'echo "' + html + '" | html2text'
command = 'echo "' + html + '" | lynx -dump -stdin -force_html'

another way :
command = 'echo "%s" | lynx -dump -stdin -force_html' % html
os.system(command)


def localMD5(url):
hash = md5.new(url).hexdigest() + '.xml' #unique name from url
return os.path.join(cacheDir, hash)


def cache(url):
max = 60 * 60 * 24 / maxChecksPerDay #seconds
myfile = localMD5(url)
if os.path.isfile(myfile):
elapsed = int(time.time()) - os.path.getmtime(myfile)
if elapsed <= max:
return
print "FETCHING:", url + ' ...'

Note that stdout is usually meant for normal program outputs (so one can
pipe programs). Error reporting and verbosities should go to stderr
urllib.urlretrieve(url, myfile)
# handle half finish?


def updateCache(feeds):
l = []
print "updating local xml cache..."
for feed in file(feeds, "r").read().split('\n'):
if not re.match('^http://', feed): # feedList ignores anything but
urls
continue
# TODO: handle whitespace, strip trailing
cache(feed)
l.append([localMD5(feed), feed])
print "cache up to date"
return l


def geturl(url):
try:
redir = urllib2.urlopen(url).geturl()
except urllib2.HTTPError, e:
if e.code != 404:
print url
print "geturl HTTPError:", e.code
return e.code
except urllib2.URLError, e:
# (110, 'Connection timed out')
print e.reason
#print "geturl URLError:", e.code
else:
return redir
return 0

I'm afraid you didn't get the point of exception handling - it's meant
to free function results from error code. Your above code totally
defeats this - as is obvious when reading the calling code.
redirect = geturl(url) # TRAFFIC
if type(redirect) != int: #success
do_something_useful_here()
elif redirect == 404:
print 'NOT FOUND:', url
if ignoreNotFound:
print '\tWILL NO LONGER ATTEMPT TO DOWNLOAD\n'
log(url)
else:
sys.exit(2)

may I suggest a rewrite :

try:
redirect = urllib2.urlopen(url).geturl()
except urllib2.HTTPError, e:
if e.code == 404:
print 'NOT FOUND:', url
if ignoreNotFound:
print '\tWILL NO LONGER ATTEMPT TO DOWNLOAD\n'
log(url)
else:
print "geturl HTTPError %s on url %s" % (e.code, url)
raise
except urllib2.URLError, e:
print "geturl URLError %s on url %s" % (e.reason, url)
else:
do_something_useful_here()

def htmlTitle(mainTitle, subTitle):
s = '<HR>'
s += '<H2>' + mainTitle + '</H2>'
s += '<H3>' + subTitle + '</H3>'
return s

html_title_template = """
<hr>
<h2>%(title)s</h2>
<h3>%(subtitle)s</h3>
"""

def html_title(title, subtitle):
return html_title_template % dict(title=title, subtitle=subtitle)
def downloadPod(url, dest):
kb = 2
success = 0
command = 'wget --continue -O "' + dest + '" "' + url + '"'
command = 'wget --continue -0 "%s" "%s"' % (dest, url)
status = os.system(command)
if status == success:
return True
else:
print "\nWGET:", status
if status == kb:
pass
#raise KeyboardInterrupt
return False


def downloadQueue(q, latest):
for x in range(latest):
for [feedTitle, castList] in q:
for feedTitle, castList in q:
if not len(castList) > x:

double negations are harder to grasp:
if len(castList) <= x

I'm not sure to get what you're doing here....
cast = castList[x]
if cast is None:
continue
url = cast.enclosures[0]['href']
redirect = geturl(url) # TRAFFIC
if type(redirect) != int: #success
render(htmlTitle(feedTitle + ": #" + str(x+1), cast.title))
render(cast.description)

Mixing logic and UI is usually a very bad idea IMHO.
podFile = os.path.basename(redirect).split('?')[0]
permDir = os.path.join(podDir, feedTitle)
permFile = os.path.join(permDir, podFile)
tempDir = os.path.join(downDir, feedTitle)
tempFile = os.path.join(tempDir, podFile)
if not os.path.isfile(permFile):
makeDirs(tempDir, permDir)
if downloadPod(redirect, tempFile): # TRAFFIC
shutil.move(tempFile, permFile)
log(url)
else:
print "EXITING"
sys.exit(2)

This will make this function hard to use in a different context (like ie
a mod_apache handler, a GUI, etc... You'd better raise some specific
exception and let the caller handle it.

(snip)

def log(url):
file(logFile, 'a').write(url + "\n")
def main(args):
sys.exitfunc = exitFunc
makeDirs(dotDir, podDir, downDir, cacheDir)
#make file if doesn't exist, may be better solution?
yes : using the logging module
X = file(logFile, 'a')
latest = 13 #get the first x casts for each feed
try:
opts, args = getopt.getopt(sys.argv[1:], "l:",
["latest=", "notfound"])
except getopt.GetoptError:
sys.exit(2)
#usage()

for opt, arg in opts:
if opt in ("-l", "--latest"):
latest = int(arg)
elif opt in ("--notfound"):
ignoreNotFound = True #add notfound files to log

OMG. optparse is far better than this.

(snip)

My overall feeling is that in most function, you're mixing too much
different concerns. Each function should do *one* thing (and do it
well). The most obvious problem here is about mixing application logic
and UI. Application logic (what your app is effectively doing) should be
as independant as possible of UI concerns. If possible, you should be
able to reuse most of the application logic (except for the main() func,
which in your case is the command-line UI) as a module in other
applications. Now there are time when the application code needs to
notify the UI. The good news is that Python makes it easy to makes this
generic, using callback functions or objects provided by the UI and
called when appropriate by the application logic. The key here is to
come up with a clear, well-defined interface between logic code and UI code.

A last point : avoid globals whenever possible. Either pass values as
arguments to functions or use a kind of config object if there are too
much configuration stuff.

My 2 cents
 
S

Sybren Stuvel

Lex Hider enlightened us with:
Any tips on the code quality and use of python would be appreciated.
I've got a feeling the overall structure is up the creek.

I'll post some remarks about the code ;-)
HOME = os.path.expanduser("~")

I wouldn't use this. Just use os.environ['HOME']. In most cases it
turns out to be the same directory, but it adds more flexibility. If
someone wants your app to read/write to another directory, he/she can
simply change the HOME environment variable.
# user configurable
#maxChecksPerDay = 8
#maxChecksPerDay = 12
maxChecksPerDay = 24
myTemp = '/tmp'
#podDir = os.path.join(HOME, 'Audio/Podcasts')
podDir = os.path.join(HOME, 'Podcasts')
# end user configurable

A bit of nitpicking: place a blank line between the user configurable
part and the rest of the code.
def exitFunc():
#f.close()
#log.close()
if boz:
print boz

Write function comments! Use the docstring to describe your function -
what does it do, and what does it return? Do this for all functions
you write.
# render is used because feeds use a lot of html, not just plain text.
def render(html):
if html:
html = re.sub('"', '\\"', html.encode('utf8'))

Use a raw string for the second argument to make it more readable:

html = re.sub('"', r'\"', html.encode('utf8'))
command = 'echo "' + html + '" | lynx -dump -stdin -force_html'
os.system(command)

Use the subprocess module or the popen2 module to open lynx. That way,
you can feed the HTML to lynx directly, and you're not bound to the
maximum line length of the shell. It also removes the need to escape
quotes.
def updateCache(feeds):
l = []

Use longer names than 'l'.
print "updating local xml cache..."
for feed in file(feeds, "r").read().split('\n'):
if not re.match('^http://', feed): # feedList ignores
# anything but urls

Here you say you only match URLs starting with "http://", but at the
start you claimed to only use URLs starting with "http". Be sure to
keep your documentation and your code in sync.
def htmlTitle(mainTitle, subTitle):
s = '<HR>'
s += '<H2>' + mainTitle + '</H2>'
s += '<H3>' + subTitle + '</H3>'
return s

It might be easier on the eyes if you use:

s = '<hr><h2>%s</h2><h3>%s</h3>' % (mainTitle, subTitle)

It might also be wise to use lower-case HTML tags to remain compatible
with XHTML.
def downloadPod(url, dest):
kb = 2
success = 0
command = 'wget --continue -O "' + dest + '" "' + url + '"'
status = os.system(command)

Here you pass the arguments of the function to a system call. This
means that before the downloadPod function is called, the 'url' and
'dest' should have been escaped or cleared of unwanted characters.
This should really be documented.

Overall, your code needs to be commented and documented much better.
Think of other people reading the code, and explain _why_ you do
something, instead of explaining _what_ you're doing.

Sybren
 
M

Maric Michaud

Le vendredi 08 septembre 2006 13:41, Sybren Stuvel a écrit :
HOME = os.path.expanduser("~")

I wouldn't use this. Just use os.environ['HOME']. In most cases it
turns out to be the same directory, but it adds more flexibility. If
someone wants your app to read/write to another directory, he/she can
simply change the HOME environment variable.

and ?

maric@redflag2 jeu sep 07 09:17:51:~/test$ export HOME=/etc
maric@redflag2 ven sep 08 13:53:17:/home/maric/test$ cd ~
maric@redflag2 ven sep 08 13:53:22:~$ pwd
/etc
maric@redflag2 ven sep 08 13:55:46:~$ python -c 'import os
print os.path.expanduser("~")
'
/etc


--
_____________

Maric Michaud
_____________

Aristote - www.aristote.info
3 place des tapis
69004 Lyon
Tel: +33 426 880 097
 
M

Maric Michaud

Le vendredi 08 septembre 2006 13:56, Maric Michaud a écrit :
maric@redflag2 jeu sep 07 09:17:51:~/test$ export HOME=/etc
maric@redflag2 ven sep 08 13:53:17:/home/maric/test$ cd ~
maric@redflag2 ven sep 08 13:53:22:~$ pwd
/etc
maric@redflag2 ven sep 08 13:55:46:~$ python -c 'import os


/etc

Of course it's not the same as :

maric@redflag2 ven sep 08 13:58:25:~$ export HOME=/etc
maric@redflag2 ven sep 08 14:00:11:~$ python -c 'import os
print os.path.expanduser("~maric")
'
/home/maric




--
_____________

Maric Michaud
_____________

Aristote - www.aristote.info
3 place des tapis
69004 Lyon
Tel: +33 426 880 097
 
R

Roberto Bonvallet

Lex said:
Any tips on the code quality and use of python would be appreciated. I've
got a feeling the overall structure is up the creek. [...]
for opt, arg in opts:
if opt in ("-l", "--latest"):
latest = int(arg)
elif opt in ("--notfound"):
ignoreNotFound = True #add notfound files to log

Subtle bug here: ("--notfound") is not a tuple, is just a string, so what
you are actually testing is whether opt is a substring of "--not-found".
To actually build a 1-element tuple, you have to put a trailing comma:

elif opt in ("--notfound", ):

but it would be clearer if you just use:

elif opt == "--notfound":
 
S

Steven Bethard

Lex said:
try:
opts, args = getopt.getopt(sys.argv[1:], "l:",
["latest=", "notfound"])
except getopt.GetoptError:
sys.exit(2)
#usage()

for opt, arg in opts:
if opt in ("-l", "--latest"):
latest = int(arg)
elif opt in ("--notfound"):
ignoreNotFound = True #add notfound files to log

You should definitely consider using optparse or argparse_. Here's a
rewrite using argparse::

parser = argparse.ArgumentParser(description='podcast aggregator')
parser.add_argument('-l', '--latest', type=int)
parser.add_argument('--notfound', action='store_true')
values = parser.parse_args()

Then just use ``values.latest`` and ``values.notfound`` instead of
``latest`` and ``ignoreNotFound`` in the rest of your code. Using
argparse also allows you to easily add your other directories as command
line options, e.g.::

parser.add_argument('podDir', nargs='?',
default=os.path.join(HOME, 'Podcasts'))

If you then use ``values.podDir`` instead of ``podDir`` throughout your
code, your users can invoke your script in either of the following ways::

GodCast.py # no args, use "$HOME/Podcasts"
GodCast.py podcast_dir # podDir specified, use "podcast_dir"

If I were writing your script, I would add optional or positional
arguments for all of ``maxChecksPerDay``, ``myTemp``, ``downDir``,
``logFile``, ``cacheDir`` and ``feedList``.

(If you'd like your code to look more like the Python "standard", I'd
use `PEP 8`_ compliant names, e.g. ``pod_dir`` instead of ``podDir`` and
``make_dirs`` instead of ``makeDirs``.)

... _argparse: http://argparse.python-hosting.com/
... _PEP 8: http://www.python.org/dev/peps/pep-0008/

HTH,

STeVe
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only take a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

No members online now.

Forum statistics

Threads
473,983
Messages
2,570,187
Members
46,747
Latest member
jojoBizaroo

Latest Threads

Top