Problem with uudecode

Discussion in 'Python' started by Juho Saarikko, May 25, 2004.

  1. I made a Python script which takes Usenet message bodies from a database,
    decodes uuencoded contents and inserts them as Large Object into a
    PostGreSQL database. However, it appears that the to last few bytes
    of uudecoded data are always mangled. Take a look of this hexdump output:

    Originals (decoded with Pan, each line is from a different file):
    000c2c0 e1bf 00ff 2541 a9e4 a724 d9ff
    0011a10 ff54 00d9
    00093e0 fb4f a80d ffd9 c200 ffef 00d9

    Decoded by the script:
    000c2c0 e1bf 00ff 2541 a9e4 a724 d0ff
    0011a10 ff54 00d8
    00093e0 fb4f a80d ffd9 c200 ffef 00d8

    As you can see, one of the last two bytes gets altered in all cases.

    The script also outputs the decoded file to disk for debugging purposes,
    and the database large object and filesystem file match so it can't be a
    PostGreSQL problem.

    So, if anyone has any idea what is wrong, please tell me ? I can't found
    any reason why the bytes would get mangled...

    The script follows:

    #!/usr/local/bin/python2.3

    # Insert message contents into the database, for each message-id already there
    #
    # Copyright 2004 by Juho Saarikko
    # License: GNU General Public License (GPL) version 2
    # See www.gnu.org for details

    from pyPgSQL import libpq
    import nntplib
    import sys
    import string
    import regex
    import sha
    import imghdr
    import binascii
    import StringIO
    import os

    def strip_trailing_dots(n):
    tmp = []
    for i in range(len(n)):
    if n[-1] == "," or n[-1] == ".":
    tmp.append(n[:-1])
    else:
    tmp.append(n)
    return tmp

    def findmimetype(body, filename):
    tail4 = string.lower(filename[-5:])
    tail3 = string.lower(filename[-4:])
    if tail4 == ".jpeg":
    return "image/jpeg"
    if tail3 == ".jpg":
    return "image/jpeg"
    if tail3 == ".png":
    return "image/png"
    if tail3 == ".jpe":
    return "image/jpeg"
    if tail3 == ".gif":
    return "image/gif"
    return None

    def insert_picture(conn, image, filename):
    hash = sha.new(image)
    qhash = libpq.PgQuoteBytea(hash.digest())
    candidates = conn.query("SELECT id, picture FROM pictures WHERE hash = " + qhash )
    if candidates.ntuples > 0:
    print "Found possible mathces " + str(candidates.ntuples)
    for x in range(candidates.ntuples):
    old = candidates.getvalue(x, 1)
    old.open("r")
    oldpic = old.read()
    old.close()
    if oldpic == image:
    print "Found a match"
    ret = (candidates.getvalue(x,0), 1)
    return ret
    mime = findmimetype(image, filename)
    print "attempting to get mimetype"
    if mime == None:
    print "No mimetype found"
    ret = (0, 0)
    return ret
    mime = libpq.PgQuoteString(mime)
    mimeres = conn.query("SELECT id FROM mimetypes WHERE mimetype = " + mime)
    if mimeres.ntuples == 0:
    conn.query("INSERT INTO mimetypes (mimetype) VALUES (" + mime + ")")
    mimeres = conn.query("SELECT id FROM mimetypes WHERE mimetype = " + mime)
    mimetype = mimeres.getvalue(0,0)
    picture = conn.lo_creat("rw")
    picture.open("rw")
    picture.write(image)
    picture.close()
    tmp = conn.query("INSERT INTO pictures (hash, mimetype, picture) VALUES (" + qhash + ", " +str(mimetype) + ", " + picture.name + ")")
    temp = conn.query("SELECT id FROM pictures WHERE OID = " + str(tmp.oidValue))
    id = temp.getvalue(0,0)
    ret = (id, 0)
    return ret

    def try_decode_and_insert_uuencoded(conn, id):
    begin = regex.compile("begin [0-9]+ \(.*\)")
    conn.query("BEGIN")
    basedir = "kuvat"
    message = conn.query("SELECT data FROM fragments_bodies WHERE message = " + str(id) + " ORDER BY line")
    # print message.ntuples

    keywords = []
    picids = []
    newpicids = []
    n = 0
    s = ""
    picid = 0
    print 'Starting message id ' + str(id)
    while n < message.ntuples:
    # print "length of row " + str(n)
    # print str(message.getlength(n, 0))
    # print "Got length"
    abcddummy = message.getvalue(n, 0)
    # print "Got value"
    s = message.getvalue(n, 0)
    # print "Got s"
    if begin.match(s) > 0:
    # if match_beginning(s) > 0:
    # print "Begin matched"
    body = []
    file = begin.group(1)
    # file = get_file_name(s)
    # print "Starting to decode, at line " + str(n + 1)
    for k in range(n+1, message.ntuples):
    # print "Decodind row " + str(k)
    s = message.getvalue(k, 0)
    if s[:3] == "end":
    n = k + 1
    break
    try:
    body.append(binascii.a2b_uu(s))
    except:
    try:
    bytes = (((ord(s[0])-32) & 63) * 4 + 3) / 3
    body.append(binascii.a2b_uu(s[:bytes]))
    except:
    print "Broken attachment in message " + str(id)
    conn.query("ROLLBACK")
    return
    # print "Got to end, at line " + str(n)
    # print "Attempting to join body"
    body = string.join(body, "")
    # print "Attempting to hash body"
    # hash = sha.new(body)
    # qhash = libpq.PgQuoteBytea(hash.digest())
    # qbody = libpq.PgQuoteBytea(body)
    # print "Attempting to find whether the pic already exists"
    print "Mimetype returned " + str(findmimetype(body, file))
    # temporary = open("dummy", "wb")
    # temporary.write(body)
    # temporary.close()
    # dummy.write("dsfds")
    print "Calling insert function"
    picid, exists = insert_picture(conn, body, file)
    print "Returned from insert function with value " + str(picid)
    if picid > 0:
    # already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
    # if already.ntuples == 0:
    # print "Attempting to find mimetype"
    # mimetype = findmimetype(body, file)
    # print "Found mimetype"
    # if mimetype != None:
    # o = conn.query("INSERT INTO pictures (picture, hash, mimetype) VALUES (" + qbody + ", " + qhash + ", " + libpq.PgQuoteString(mimetype) + ")")
    # already = conn.query("SELECT id FROM pictures WHERE OID = " + str(o.oidValue()));
    # already = conn.query("SELECT id FROM pictures WHERE data = " + qbody)
    # already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
    # print "Attempting to insert hash and mimetype"
    # conn.query("INSERT INTO pictures (hash, mimetype) VALUES (" + qhash + ", " + libpq.PgQuoteString(mimetype) + ")")
    # print "Attempting to get id"
    # already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
    # print "Attempting to get value"
    # picid = already.getvalue(0, 0)
    print picid
    print "Attempting to OK dir"
    if os.access(basedir + "/tmp", os.F_OK) != 1:
    os.mkdir(basedir + "/tmp")
    fh = open(basedir + "/tmp/" + str(picid), "wb")
    fh.write(body)
    fh.close()
    print "File ok"
    picids.append(picid)
    if exists == 0:
    newpicids.append(picid)
    if file != "":
    keywords.append(file)
    # else:
    # picid = already.getvalue(0, 0)
    # if already.ntuples == 0:
    # conn.query("ROLLBACK")
    # return
    # picids.append(picid)
    # if already.ntuples == 0:
    # print "already.ntuples == 0, ROLLBACKing"
    # conn.query("ROLLBACK")
    # return
    # print "Appending picid"
    # picids.append(picid)
    # print "Picid appended"
    else:
    tmptmp = string.split(s)
    tmpkey = strip_trailing_dots(tmptmp)
    if len(tmpkey) > 0:
    for j in range(len(tmpkey)):
    keywords.append(tmpkey[j])
    # print "Adding 1 to n"
    n = n + 1
    if len(picids) > 0:
    print "Found " + str(len(picids)) + " pictures (" + str(len(newpicids)) + " new ones)"
    # print "Finding Subject"
    head = conn.query("SELECT contents FROM fragments_header_contents WHERE message = " + str(id) + " AND header = (SELECT id FROM fragments_header_names WHERE header ilike 'Subject')")
    if head.ntuples > 0:
    # print "Splitting Subject"
    blah = head.getvalue(0,0)
    # print str(blah)
    blahblah = string.split(str(blah))
    # print "Stripping"
    abctmpkey = strip_trailing_dots(blahblah)
    # print "Stripping done"
    # print "Really"
    tmpkey = abctmpkey
    # print "Subject split"
    if len(tmpkey) > 0:
    for j in range(len(tmpkey)):
    keywords.append(tmpkey[j])
    o = conn.query("INSERT INTO messages DEFAULT VALUES")
    mid = conn.query("SELECT id FROM messages WHERE OID = " + str(o.oidValue))
    messageid = mid.getvalue(0, 0)
    nresult = conn.query("SELECT contents FROM fragments_header_contents WHERE message = " + str(id) + " AND header = (SELECT id FROM fragments_header_names WHERE header ILIKE 'Newsgroups')")
    if nresult.ntuples > 0:
    for x in range(nresult.ntuples):
    newsgroups = string.split(nresult.getvalue(x, 0), ",")
    if len(newsgroups) > 0:
    for y in range (len(newsgroups)):
    newsgroup = libpq.PgQuoteString(newsgroups[y])
    ngroupres = conn.query("SELECT id FROM newsgroups WHERE name = " + newsgroup)
    if ngroupres.ntuples > 0:
    newsgid = ngroupres.getvalue(0, 0)
    else:
    conn.query("INSERT INTO newsgroups (name) VALUES (" + newsgroup + ")")
    ngrtmpres = conn.query("SELECT id FROM newsgroups WHERE name = " + newsgroup)
    newsgid = ngrtmpres.getvalue(0, 0)
    conn.query("INSERT INTO messages_ngroups_glue (message, newsgroup) VALUES (" + str(messageid) + ", " + str(newsgid) + ")")
    else:
    print "An empty Newsgroups: header at messag " + str(id)
    conn.query("ROLLBACK")
    return
    else:
    print "No Newsgroups: header at message " + str(id)
    conn.query("ROLLBACK")
    return
    for x in range(len(picids)):
    conn.query("INSERT INTO messages_pictures_glue (message, picture) VALUES (" + str(messageid) + ", " + str(picids[x]) + ")")
    if len(keywords) > 0:
    for x in range(len(tmpkey)):
    qword = libpq.PgQuoteString(str(keywords[x]))
    tmp = conn.query("SELECT id FROM keywords_words WHERE keyword = " + qword)
    if tmp.ntuples == 0:
    conn.query("INSERT INTO keywords_words (keyword) VALUES (" + qword + ")")
    tmp = conn.query("SELECT id FROM keywords_words WHERE keyword = " + qword)
    keyid = str(tmp.getvalue(0, 0))
    for y in range(len(picids)):
    conn.query("INSERT INTO keywords_glue(word, picture) VALUES (" + keyid + ", " + str(picids[y]) + ")")
    dummyone = "SELECT fragments_header_contents.line, fragments_header_names.header,"
    dummytwo = " fragments_header_contents.contents FROM fragments_header_names, fragments_header_contents"
    dummythree = " WHERE fragments_header_contents.message = " + str(id)
    dummyfour = " AND fragments_header_contents.header = fragments_header_names.id"
    head = conn.query(dummyone + dummytwo + dummythree + dummyfour)
    if head.ntuples > 0:
    for h in range(head.ntuples):
    qhead = libpq.PgQuoteString(str(head.getvalue(h, 1)))
    qcont = libpq.PgQuoteString(str(head.getvalue(h, 2)))
    tmp = conn.query("SELECT id FROM header_names WHERE header = " + qhead)
    if tmp.ntuples == 0:
    conn.query("INSERT INTO header_names (header) VALUES (" + qhead + ")")
    tmp = conn.query("SELECT id FROM header_names WHERE header = " + qhead)
    headid = str(tmp.getvalue(0, 0))
    line = str(head.getvalue(0, 0))
    conn.query("INSERT INTO header_contents (header, message, line, contents) VALUES (" + headid + ", " + str(messageid) + ", " + line + ", " + qcont + ")")
    conn.query("DELETE FROM fragments_header_contents WHERE message = " + str(id))
    conn.query("DELETE FROM fragments_bodies WHERE message = " + str(id))
    conn.query("COMMIT")
    if len(newpicids) > 0:
    tmpdir = basedir + "/tmp/"
    for i in range(len(newpicids)):
    picid = newpicids
    tmppicname = tmpdir + str(picid)
    permpicname = basedir + "/" + str(picid%1000) + "/" + str(picid)
    print tmppicname
    print permpicname
    if os.access(basedir + "/" + str(picid%1000), os.F_OK) != 1:
    os.mkdir(basedir + "/" + str(picid%1000))
    os.link(tmppicname, permpicname)
    os.unlink(tmpdir +str(picid))
    else:
    print "No pictures found"
    conn.query("ROLLBACK")
    return


    database = libpq.PQconnectdb('dbname = kuvat')
    items = database.query("SELECT message FROM whole_attachments")

    # try_decode_and_insert_uuencoded(database, 5407)

    for i in range(items.ntuples):
    try:
    print 'Starting call ' + str(i)
    try_decode_and_insert_uuencoded(database, items.getvalue(items.ntuples - 1 - i,0))
    print ' returned from call ' + str(i)
    except:
    print 'Some other error occurred at message " + str(i) + ", trying to continue...'
    Juho Saarikko, May 25, 2004
    #1
    1. Advertising

  2. Juho Saarikko

    Ville Vainio Guest

    >>>>> "Juho" == Juho Saarikko <> writes:

    Juho> I made a Python script which takes Usenet message bodies
    Juho> from a database, decodes uuencoded contents and inserts them
    Juho> as Large Object into a PostGreSQL database. However, it
    Juho> appears that the to last few bytes

    I skimmed through your program, and noticed that you use binascii
    module uuencode/decode. Have you given the "uu" module a try, to see
    if it works better?

    Also, get rid of "regex" module, it even gives a DeprecationWarning
    suggesting switching to "re".

    --
    Ville Vainio http://tinyurl.com/2prnb
    Ville Vainio, May 25, 2004
    #2
    1. Advertising

  3. On Tue, 25 May 2004 22:04:24 +0300, Ville Vainio wrote:

    >>>>>> "Juho" == Juho Saarikko <> writes:

    >
    > Juho> I made a Python script which takes Usenet message bodies
    > Juho> from a database, decodes uuencoded contents and inserts them
    > Juho> as Large Object into a PostGreSQL database. However, it
    > Juho> appears that the to last few bytes
    >
    > I skimmed through your program, and noticed that you use binascii
    > module uuencode/decode. Have you given the "uu" module a try, to see
    > if it works better?


    I did examine the uu module, but it would seem that I'd had to parse the
    message first anyway to get the file name and the non-binary parts of the
    message as keywords. Besides, as I understand it, the uu module uses the
    binascii module, so if there's something wrong with the binascii module,
    the uu module can't possibly work well.

    Oh well, I would had to write the parsing engine anyway (or learn to
    use the e-mail classes), to properly handle mime and yenc messages. And I
    suppose I'd better start using imagemagic to verify the mimetype of
    decoded files, instead of just believing the filename. And join together
    files that have been spread over multiple messages. Work, work, work...

    > Also, get rid of "regex" module, it even gives a DeprecationWarning
    > suggesting switching to "re".


    I would, if I knew how to make regular expressions; I found the uu-parsing
    snippet from the net and built my script around it, but the
    regular expression doesn't seem to work with the re module.
    Juho Saarikko, May 25, 2004
    #3
  4. Juho Saarikko

    Steve Holden Guest

    Juho Saarikko wrote:
    > I made a Python script which takes Usenet message bodies from a database,
    > decodes uuencoded contents and inserts them as Large Object into a
    > PostGreSQL database. However, it appears that the to last few bytes
    > of uudecoded data are always mangled. Take a look of this hexdump output:
    >
    > Originals (decoded with Pan, each line is from a different file):
    > 000c2c0 e1bf 00ff 2541 a9e4 a724 d9ff
    > 0011a10 ff54 00d9
    > 00093e0 fb4f a80d ffd9 c200 ffef 00d9
    >
    > Decoded by the script:
    > 000c2c0 e1bf 00ff 2541 a9e4 a724 d0ff
    > 0011a10 ff54 00d8
    > 00093e0 fb4f a80d ffd9 c200 ffef 00d8
    >
    > As you can see, one of the last two bytes gets altered in all cases.
    >
    > The script also outputs the decoded file to disk for debugging purposes,
    > and the database large object and filesystem file match so it can't be a
    > PostGreSQL problem.
    >
    > So, if anyone has any idea what is wrong, please tell me ? I can't found
    > any reason why the bytes would get mangled...
    >
    > The script follows:
    >

    [...]
    I note that you are dumping words rather than bytes. Is it possible that
    the last byte isn't actually a part of the file, that
    endianness makes the last byte look like the penultimate byte, and that
    what you are seeing is simply noise?

    If not then it should probably be looked into ...

    regards
    Steve
    Steve Holden, May 25, 2004
    #4
  5. On Tue, 25 May 2004 18:54:44 -0400, Steve Holden wrote:

    > I note that you are dumping words rather than bytes. Is it possible that
    > the last byte isn't actually a part of the file, that
    > endianness makes the last byte look like the penultimate byte, and that
    > what you are seeing is simply noise?


    Well, ImageMagick complains that the image contains errors (altought
    Eye of Gnome shows it with no artifacts), so it's likely to be part of the
    file itself.

    I get both

    "display: Premature end of JPEG file"

    and

    "display: Invalid JPEG file structure: two SOI markers"

    errors. The later error prevent ImageMagick's display-command from
    displaying the image (but not Eye of Gnome).

    > If not then it should probably be looked into ...


    Looked, looked, but where to start ? The bug could be anywhere from my
    script to binascii module to the nntp module to the string.join -function.
    Juho Saarikko, May 26, 2004
    #5
  6. Juho Saarikko

    Tim Roberts Guest

    Juho Saarikko <> wrote:

    >I made a Python script which takes Usenet message bodies from a database,
    >decodes uuencoded contents and inserts them as Large Object into a
    >PostGreSQL database. However, it appears that the to last few bytes
    >of uudecoded data are always mangled. Take a look of this hexdump output:
    >
    >Originals (decoded with Pan, each line is from a different file):
    >000c2c0 e1bf 00ff 2541 a9e4 a724 d9ff
    >0011a10 ff54 00d9
    >00093e0 fb4f a80d ffd9 c200 ffef 00d9
    >
    >Decoded by the script:
    >000c2c0 e1bf 00ff 2541 a9e4 a724 d0ff
    >0011a10 ff54 00d8
    >00093e0 fb4f a80d ffd9 c200 ffef 00d8
    >
    >As you can see, one of the last two bytes gets altered in all cases.


    As others have pointed out, it's really the last byte that is getting
    altered.

    > for k in range(n+1, message.ntuples):
    ># print "Decodind row " + str(k)
    > s = message.getvalue(k, 0)
    > if s[:3] == "end":
    > n = k + 1
    > break
    > try:
    > body.append(binascii.a2b_uu(s))
    > except:
    > try:
    > bytes = (((ord(s[0])-32) & 63) * 4 + 3) / 3
    > body.append(binascii.a2b_uu(s[:bytes]))
    > except:
    > print "Broken attachment in message " + str(id)
    > conn.query("ROLLBACK")
    > return


    Your computation of the number of bytes in the uuencoded string will come
    up one short: you're not accounting for the length byte. That will have
    exactly the effect you describe. You lose the last encoded character,
    which means you'll miss the last 6 bits of the file. Change it to this:

    bytes = (((ord(s[0])-32) & 63) * 4 + 3) / 3 + 1

    However, you should not need to wrap the first binascii.a2b_uu call with
    try/except at all. What is happening that causes the error in the first
    place? I suspect if you fix the root cause, you could eliminate the except
    clause altogether.
    --
    - Tim Roberts,
    Providenza & Boekelheide, Inc.
    Tim Roberts, May 26, 2004
    #6
    1. Advertising

Want to reply to this thread or ask your own question?

It takes just 2 minutes to sign up (and it's free!). Just click the sign up button to choose a username and then you can ask your own questions on the forum.
Similar Threads
  1. Andreas Suurkuusk
    Replies:
    0
    Views:
    3,968
    Andreas Suurkuusk
    Jul 27, 2003
  2. Ted Miller
    Replies:
    0
    Views:
    5,145
    Ted Miller
    Sep 13, 2003
  3. barry
    Replies:
    0
    Views:
    786
    barry
    Dec 18, 2003
  4. py

    uuDecode problem

    py, Dec 7, 2005, in forum: Python
    Replies:
    9
    Views:
    405
    Alex Martelli
    Dec 10, 2005
  5. Mike

    Problem problem problem :( Need Help

    Mike, May 7, 2004, in forum: ASP General
    Replies:
    2
    Views:
    543
    Bullschmidt
    May 11, 2004
Loading...

Share This Page