Flat file to associative array.

Alex · Jul 21, 2003

Code:

#! /usr/bin/python

import string

file = open("definitions", "r")
    
while 1:
    line = file.readline()
    if not line:
        break
    else:
        data = string.split(line, '|')
        item = dict([('product', data[0])])
        print "You want a " + item['product'] +
         ". What kind?  " + data[1] + " for $" +
        data[3] + " ... right?"

file.close()

Thanks in advance,

Alex

Antti Kaihola · Jul 21, 2003

#!/usr/bin/env python

def line2dict(line, fieldnames, sep='|'):
"""
Convert a separated text line to a dictionary
with the given field names as keys
"""
d = {}
for fieldname, value in zip(fieldnames, line.split(sep)):
d[fieldname] = value
return d

drinkfields = 'drinktype', 'producer', 'brand', 'price'
def drinkline2dict(line):
"""
Convert a | -separated and linefeed-terminated text line
to a dictionary
"""
return line2dict(line.rstrip(), drinkfields, '|')

products = map(drinkline2dict, file('products.txt').readlines())
from pprint import pprint
pprint(products)

John Hunter · Jul 22, 2003

Alex> Hello, What would be the best way to convert a flat file of
Alex> products into a multi-dimensional array? The flat file looks
Alex> like this:

Alex> Beer|Molson|Dry|4.50 Beer|Molson|Export|4.50
Alex> Shot|Scotch|Macallan18|18.50 Shot|Regular|Jameson|3.00

I work with CSV files a lot, and do something similar. I have written
some helper functions and classes that allow you to slice through flat
files as a 2D array, where the indices of the slices can either be
normal integer indices, or string keys, as in

from csvhelpers import getrows, rows_to_array
#split the data on the | symbol
def splitpipe(x):
if len(x.replace('|','').strip())==0: return [] # empty row
return x.split('|')

rows = getrows( file('beer.flat'), splitpipe)
headers=('category', 'brand', 'type', 'price')
data = rows_to_array( rows, headers)

print data[:,'price'] # prints all the prices (as floats)
print data[0] # prints the first row
print data # prints everything

The getrows function does some extra work: for example it ignores
empty rows, tries to convert the individual data cells to int then
float, and ignores comments if you pass it an extra comment arg as in

rows = getrows( file('beer.flat'), splitpipe, comment='#')

With this call, your flat file can look like

Beer|Molson|Dry|4.50
Beer|Molson|Export|4.50

# now for some booze
Shot|Scotch|Macallan18|18.50
Shot|Regular|Jameson|3.00

For many of the CSV files I work with, there are headers in the first
non empty row of the file. If you pass 'headers=1', rows_to_array
will get the headers and allow you to index the array with the header
strings. Also, I often have a unique key for each row, like a medical
ID number. An optional arg rowKeyInd lets you index the data by row
key string as well. Here is an example CSV file containing stock
quote information. The row keys are in the first column

,,,
ticker,date,open,price
ADPT,2001-12-13,100.0,20.0
,,,
QQQ,2001-12-13,100.0,103.0
AAA,2001-12-13,100.0,10.2
PPP,2001-12-13,100.0,1.3

Note that the ,,, thing is fairly typical of CSV files that started
their life as an Excel spreadsheet. They represent the empty rows.
The first nonempty row contains the headers, and the first column the
row keys.

# use csv module in real life to handle legit commas in fields, etc..
def splitcomma(x):
if len(x.replace(',','').strip())==0: return [] # empty row
return x.split(',')

rows = getrows( file('test.csv'), splitcomma)
data = rows_to_array( rows, headers=1, rowKeyInd=0)

print data[:,'open']
print data[:,'price']
print data[:,'date']
print data['ADPT',:]

I am using Numeric arrays under the hood, and data[:,'open] and
data[:,'price'] are Numeric arrays of Floats. Thus you can do things
like

change = data[:,'price'] / data[:,'open']

You can also use any valid slice, as in

data[1:3,3]

Hope this helps,
John Hunter

# csvhelpers.py
# Requires Numeric: http://pfdubois.com/numpy/ and python2.2
from __future__ import generators
from Numeric import array, Int, Float

def iterable(obj):
try: iter(obj)
except: return 0
return 1

def enumerate(seq):
for i in range(len(seq)):
yield i, seq

def is_string_like(obj):
try: obj + ''
except (TypeError, ValueError): return 0
return 1

class rows_to_array:
def __init__(self, rows, headers, rowKeyInd=None):
"""
rows is a generator that produces the rows of the flatfile,
optionally including the header row. See getrows.

If headers is iterable, then it is a list containing the
headers. otherwise use the first row for the headers.
headers must be unique

rowKeyInd, if not None, is the column index of the row key so
that rows can be identified by key
"""

if not iterable(headers):
headers = [ h.strip() for h in rows.next()]

# make dict from header to col index
self.headerd = {}
for i,h in enumerate(headers):
key = h.strip()
if self.headerd.has_key(key):
raise RuntimeError, 'Headers must be unique. ' + \
'Found duplicate: ' + key
self.headerd[key] = i

data = [row for row in rows]

# use the first row to determine types; this should be improved
self.types = []
for val in data[0]:
if isinstance(val, int):
self.types.append(Int)
elif isinstance(val, float):
self.types.append(Float)
else:
self.types.append('O')

if rowKeyInd is not None:
self.rowKeys = {}
for i,row in enumerate(data):
self.rowKeys[row[rowKeyInd]] = i
self.data = array(data, typecode='O')

def __getitem__(self, key):

try:
rowKey, colKey = key
except TypeError:
# row index only
if is_string_like(key):
rowSlice = self.rowKeys[key]
else:
rowSlice = key
return self.data[rowSlice]

type = None
if is_string_like(rowKey):
rowSlice = self.rowKeys[rowKey]
else:
rowSlice = rowKey
if is_string_like(colKey):
colSlice = self.headerd[colKey]
type = self.types[colSlice]
else:
colSlice = colKey

ret = self.data[rowSlice, colSlice]

if type is not None: return ret.astype(type)
else: return ret

def __str__(self):
s = str(self.headerd.keys()) + '\n'
s += str(self.data)
return s

def try_to_num(val):
try: return int(val)
except ValueError:
try: return float(val)
except ValueError:
return val

def getrows(fh, parseline, comment=None):
"""
return the rows of the data in file object fh, converting to int
or float if possible

parseline returns the row as a list, ie, splits ths row.
An empty row should be a list of length 0

if comment is not None, ignore lines starting with comment symbol
"""
while 1:
line = fh.readline()
if comment is not None and line.startswith(comment):
continue
if len(line)==0: return
if len(line.strip())==0: continue
vals = parseline(line)
if len(vals):
maxLen = max([len(entry) for entry in vals])
if maxLen==0: continue
yield [ try_to_num(val) for val in vals]

file data => array(s)	6	Dec 14, 2011
Writing and reading variables to/from flat file	6	Dec 14, 2006
Help with loading file into an array	3	May 5, 2013
parsing flat file to mssql (odbc)	1	Jul 8, 2003
Sharing: File Reader Generator with & w/o Policy	14	Mar 15, 2014
How do I encode and decode this data to write to a file?	11	Apr 29, 2013
writing results to array	4	Dec 3, 2007
[announce] 'Flat File to XML Data Conversion' Tops Stylus Studio Box Office	0	Jul 27, 2005

Flat file to associative array.

Alex

Antti Kaihola

John Hunter

Ask a Question

Similar Threads

Members online

Forum statistics

Latest Threads