Flat file to associative array.

A

Alex

Hello,

What would be the best way to convert a flat file of products into
a multi-dimensional array? The flat file looks like this:

Beer|Molson|Dry|4.50
Beer|Molson|Export|4.50
Shot|Scotch|Macallan18|18.50
Shot|Regular|Jameson|3.00

I've managed to read the file properly and get the individual lines
into a list, but I've got to turn this list into something that can
be stored in memory like a dictionary / associative array so I can
properly display the data.

Here's the code I have so far:

Code:
#! /usr/bin/python

import string

file = open("definitions", "r")
    
while 1:
    line = file.readline()
    if not line:
        break
    else:
        data = string.split(line, '|')
        item = dict([('product', data[0])])
        print "You want a " + item['product'] +
         ". What kind?  " + data[1] + " for $" +
        data[3] + " ... right?"

file.close()

Thanks in advance,

Alex
 
A

Antti Kaihola

#!/usr/bin/env python

def line2dict(line, fieldnames, sep='|'):
"""
Convert a separated text line to a dictionary
with the given field names as keys
"""
d = {}
for fieldname, value in zip(fieldnames, line.split(sep)):
d[fieldname] = value
return d

drinkfields = 'drinktype', 'producer', 'brand', 'price'
def drinkline2dict(line):
"""
Convert a | -separated and linefeed-terminated text line
to a dictionary
"""
return line2dict(line.rstrip(), drinkfields, '|')

products = map(drinkline2dict, file('products.txt').readlines())
from pprint import pprint
pprint(products)
 
J

John Hunter

Alex> Hello, What would be the best way to convert a flat file of
Alex> products into a multi-dimensional array? The flat file looks
Alex> like this:

Alex> Beer|Molson|Dry|4.50 Beer|Molson|Export|4.50
Alex> Shot|Scotch|Macallan18|18.50 Shot|Regular|Jameson|3.00

I work with CSV files a lot, and do something similar. I have written
some helper functions and classes that allow you to slice through flat
files as a 2D array, where the indices of the slices can either be
normal integer indices, or string keys, as in

from csvhelpers import getrows, rows_to_array
#split the data on the | symbol
def splitpipe(x):
if len(x.replace('|','').strip())==0: return [] # empty row
return x.split('|')

rows = getrows( file('beer.flat'), splitpipe)
headers=('category', 'brand', 'type', 'price')
data = rows_to_array( rows, headers)

print data[:,'price'] # prints all the prices (as floats)
print data[0] # prints the first row
print data # prints everything

The getrows function does some extra work: for example it ignores
empty rows, tries to convert the individual data cells to int then
float, and ignores comments if you pass it an extra comment arg as in

rows = getrows( file('beer.flat'), splitpipe, comment='#')

With this call, your flat file can look like

Beer|Molson|Dry|4.50
Beer|Molson|Export|4.50

# now for some booze
Shot|Scotch|Macallan18|18.50
Shot|Regular|Jameson|3.00

For many of the CSV files I work with, there are headers in the first
non empty row of the file. If you pass 'headers=1', rows_to_array
will get the headers and allow you to index the array with the header
strings. Also, I often have a unique key for each row, like a medical
ID number. An optional arg rowKeyInd lets you index the data by row
key string as well. Here is an example CSV file containing stock
quote information. The row keys are in the first column

,,,
ticker,date,open,price
ADPT,2001-12-13,100.0,20.0
,,,
QQQ,2001-12-13,100.0,103.0
AAA,2001-12-13,100.0,10.2
PPP,2001-12-13,100.0,1.3

Note that the ,,, thing is fairly typical of CSV files that started
their life as an Excel spreadsheet. They represent the empty rows.
The first nonempty row contains the headers, and the first column the
row keys.

# use csv module in real life to handle legit commas in fields, etc..
def splitcomma(x):
if len(x.replace(',','').strip())==0: return [] # empty row
return x.split(',')

rows = getrows( file('test.csv'), splitcomma)
data = rows_to_array( rows, headers=1, rowKeyInd=0)

print data[:,'open']
print data[:,'price']
print data[:,'date']
print data['ADPT',:]

I am using Numeric arrays under the hood, and data[:,'open] and
data[:,'price'] are Numeric arrays of Floats. Thus you can do things
like

change = data[:,'price'] / data[:,'open']

You can also use any valid slice, as in

data[1:3,3]

Hope this helps,
John Hunter

# csvhelpers.py
# Requires Numeric: http://pfdubois.com/numpy/ and python2.2
from __future__ import generators
from Numeric import array, Int, Float

def iterable(obj):
try: iter(obj)
except: return 0
return 1

def enumerate(seq):
for i in range(len(seq)):
yield i, seq

def is_string_like(obj):
try: obj + ''
except (TypeError, ValueError): return 0
return 1

class rows_to_array:
def __init__(self, rows, headers, rowKeyInd=None):
"""
rows is a generator that produces the rows of the flatfile,
optionally including the header row. See getrows.

If headers is iterable, then it is a list containing the
headers. otherwise use the first row for the headers.
headers must be unique

rowKeyInd, if not None, is the column index of the row key so
that rows can be identified by key
"""

if not iterable(headers):
headers = [ h.strip() for h in rows.next()]

# make dict from header to col index
self.headerd = {}
for i,h in enumerate(headers):
key = h.strip()
if self.headerd.has_key(key):
raise RuntimeError, 'Headers must be unique. ' + \
'Found duplicate: ' + key
self.headerd[key] = i

data = [row for row in rows]

# use the first row to determine types; this should be improved
self.types = []
for val in data[0]:
if isinstance(val, int):
self.types.append(Int)
elif isinstance(val, float):
self.types.append(Float)
else:
self.types.append('O')


if rowKeyInd is not None:
self.rowKeys = {}
for i,row in enumerate(data):
self.rowKeys[row[rowKeyInd]] = i
self.data = array(data, typecode='O')

def __getitem__(self, key):

try:
rowKey, colKey = key
except TypeError:
# row index only
if is_string_like(key):
rowSlice = self.rowKeys[key]
else:
rowSlice = key
return self.data[rowSlice]


type = None
if is_string_like(rowKey):
rowSlice = self.rowKeys[rowKey]
else:
rowSlice = rowKey
if is_string_like(colKey):
colSlice = self.headerd[colKey]
type = self.types[colSlice]
else:
colSlice = colKey

ret = self.data[rowSlice, colSlice]

if type is not None: return ret.astype(type)
else: return ret

def __str__(self):
s = str(self.headerd.keys()) + '\n'
s += str(self.data)
return s


def try_to_num(val):
try: return int(val)
except ValueError:
try: return float(val)
except ValueError:
return val

def getrows(fh, parseline, comment=None):
"""
return the rows of the data in file object fh, converting to int
or float if possible

parseline returns the row as a list, ie, splits ths row.
An empty row should be a list of length 0

if comment is not None, ignore lines starting with comment symbol
"""
while 1:
line = fh.readline()
if comment is not None and line.startswith(comment):
continue
if len(line)==0: return
if len(line.strip())==0: continue
vals = parseline(line)
if len(vals):
maxLen = max([len(entry) for entry in vals])
if maxLen==0: continue
yield [ try_to_num(val) for val in vals]
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only take a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

No members online now.

Forum statistics

Threads
473,755
Messages
2,569,536
Members
45,009
Latest member
GidgetGamb

Latest Threads

Top