Alex> Hello, What would be the best way to convert a flat file of
Alex> products into a multi-dimensional array? The flat file looks
Alex> like this:
Alex> Beer|Molson|Dry|4.50 Beer|Molson|Export|4.50
Alex> Shot|Scotch|Macallan18|18.50 Shot|Regular|Jameson|3.00
I work with CSV files a lot, and do something similar. I have written
some helper functions and classes that allow you to slice through flat
files as a 2D array, where the indices of the slices can either be
normal integer indices, or string keys, as in
from csvhelpers import getrows, rows_to_array
#split the data on the | symbol
def splitpipe(x):
if len(x.replace('|','').strip())==0: return [] # empty row
return x.split('|')
rows = getrows( file('beer.flat'), splitpipe)
headers=('category', 'brand', 'type', 'price')
data = rows_to_array( rows, headers)
print data[:,'price'] # prints all the prices (as floats)
print data[0] # prints the first row
print data # prints everything
The getrows function does some extra work: for example it ignores
empty rows, tries to convert the individual data cells to int then
float, and ignores comments if you pass it an extra comment arg as in
rows = getrows( file('beer.flat'), splitpipe, comment='#')
With this call, your flat file can look like
Beer|Molson|Dry|4.50
Beer|Molson|Export|4.50
# now for some booze
Shot|Scotch|Macallan18|18.50
Shot|Regular|Jameson|3.00
For many of the CSV files I work with, there are headers in the first
non empty row of the file. If you pass 'headers=1', rows_to_array
will get the headers and allow you to index the array with the header
strings. Also, I often have a unique key for each row, like a medical
ID number. An optional arg rowKeyInd lets you index the data by row
key string as well. Here is an example CSV file containing stock
quote information. The row keys are in the first column
,,,
ticker,date,open,price
ADPT,2001-12-13,100.0,20.0
,,,
QQQ,2001-12-13,100.0,103.0
AAA,2001-12-13,100.0,10.2
PPP,2001-12-13,100.0,1.3
Note that the ,,, thing is fairly typical of CSV files that started
their life as an Excel spreadsheet. They represent the empty rows.
The first nonempty row contains the headers, and the first column the
row keys.
# use csv module in real life to handle legit commas in fields, etc..
def splitcomma(x):
if len(x.replace(',','').strip())==0: return [] # empty row
return x.split(',')
rows = getrows( file('test.csv'), splitcomma)
data = rows_to_array( rows, headers=1, rowKeyInd=0)
print data[:,'open']
print data[:,'price']
print data[:,'date']
print data['ADPT',:]
I am using Numeric arrays under the hood, and data[:,'open] and
data[:,'price'] are Numeric arrays of Floats. Thus you can do things
like
change = data[:,'price'] / data[:,'open']
You can also use any valid slice, as in
data[1:3,3]
Hope this helps,
John Hunter
# csvhelpers.py
# Requires Numeric:
http://pfdubois.com/numpy/ and python2.2
from __future__ import generators
from Numeric import array, Int, Float
def iterable(obj):
try: iter(obj)
except: return 0
return 1
def enumerate(seq):
for i in range(len(seq)):
yield i, seq
def is_string_like(obj):
try: obj + ''
except (TypeError, ValueError): return 0
return 1
class rows_to_array:
def __init__(self, rows, headers, rowKeyInd=None):
"""
rows is a generator that produces the rows of the flatfile,
optionally including the header row. See getrows.
If headers is iterable, then it is a list containing the
headers. otherwise use the first row for the headers.
headers must be unique
rowKeyInd, if not None, is the column index of the row key so
that rows can be identified by key
"""
if not iterable(headers):
headers = [ h.strip() for h in rows.next()]
# make dict from header to col index
self.headerd = {}
for i,h in enumerate(headers):
key = h.strip()
if self.headerd.has_key(key):
raise RuntimeError, 'Headers must be unique. ' + \
'Found duplicate: ' + key
self.headerd[key] = i
data = [row for row in rows]
# use the first row to determine types; this should be improved
self.types = []
for val in data[0]:
if isinstance(val, int):
self.types.append(Int)
elif isinstance(val, float):
self.types.append(Float)
else:
self.types.append('O')
if rowKeyInd is not None:
self.rowKeys = {}
for i,row in enumerate(data):
self.rowKeys[row[rowKeyInd]] = i
self.data = array(data, typecode='O')
def __getitem__(self, key):
try:
rowKey, colKey = key
except TypeError:
# row index only
if is_string_like(key):
rowSlice = self.rowKeys[key]
else:
rowSlice = key
return self.data[rowSlice]
type = None
if is_string_like(rowKey):
rowSlice = self.rowKeys[rowKey]
else:
rowSlice = rowKey
if is_string_like(colKey):
colSlice = self.headerd[colKey]
type = self.types[colSlice]
else:
colSlice = colKey
ret = self.data[rowSlice, colSlice]
if type is not None: return ret.astype(type)
else: return ret
def __str__(self):
s = str(self.headerd.keys()) + '\n'
s += str(self.data)
return s
def try_to_num(val):
try: return int(val)
except ValueError:
try: return float(val)
except ValueError:
return val
def getrows(fh, parseline, comment=None):
"""
return the rows of the data in file object fh, converting to int
or float if possible
parseline returns the row as a list, ie, splits ths row.
An empty row should be a list of length 0
if comment is not None, ignore lines starting with comment symbol
"""
while 1:
line = fh.readline()
if comment is not None and line.startswith(comment):
continue
if len(line)==0: return
if len(line.strip())==0: continue
vals = parseline(line)
if len(vals):
maxLen = max([len(entry) for entry in vals])
if maxLen==0: continue
yield [ try_to_num(val) for val in vals]