CSV to array converter

T

tom t/LA

Here is a function to convert a CSV file to a Javascript array. Uses
idealized file reading functions based on the std C library, since
there is no Javascript standard. Not fully tested.


function csvToArray (f /* file handle */)
{
// convert csv file to Javascript 2d array (array of arrays)
// written in Javascript but file lib functions are idealized
var array2d = new Array(0);
var i = 0;
var maxlen = 0;
while (! feof (f)) { // return True if at EOF
array2d = csvRowToArray (f);
i++;
}
return array2d;
}

function csvRowToArray (f /* file handle */)
{


// convert one line in CSV format to array of strings
// return array

var array = new Array(0);

array [0] = "";
for (state = "cellData", si=0, ai=0; state != "end"; si++) {
// get one char
var c = getc(f);
// Assume EOL is just a character
// at end of file EOF is returned
LogMessage ("state "+state+", read char "+c+".");

switch (state) {
case "cellData":
if (c == '"') {
// **should also clear cell because chars before
// quote should be ignored.
state = "quotedCellData";
}
else if (c == ",") {
state = "cellBdy";
// ** for strict compliance, trim spaces from
// begin and end of string here
}
else if (c == EOL) { // end of line
state = "end";
}
else {
// regular char --add to cell data
array[ai] = array[ai] + c;
// stay in same state
}
break;

case "cellBdy":
// cell boundary- start new cell
ai++;
array [ai] = "";
if (c == '"') {
state = "quotedCellData";
}
else if (c == ",") {
// stay in same state
}
else if (c == EOL) { // end of line
state = "end";
}
else {
// regular char --
array[ai] = array[ai] + c;
state = "cellData";
}

break;
case "quotedCellData" :
if (c == '"') {
state = "quoteInQuote";
}else {
// normal char, add to cell
// Note EOLs are considered normal here
array[ai] = array[ai] + c;
// stay in same state
}

break;

case "quoteInQuote":
if (c == '"') {
// "" in quote string- add a " to string
array[ai] = array[ai] + '"';
state = "quotedCellData";
}else if (c == ",") {
state = "cellBdy";
}else if (c == EOL) { // end line;
state = "end";
}else {
// skip junk between end of quote and , or EOL
}

break;

case "end":
// never get here
break;

default:
LogError ("unknown state in switch: "+state);
} // end switch
LogMessage ("array ["+ai+"]="+array[ai]);

} // end while

// in case some elements are unassigned
for (var i = 0; i < array.length; i++) {
if (array == null) {
array == "";
}
LogMessage ("array ="+array);
}


return array;
}
 
T

tom t/LA

(I couldn't find an easy to understand CSV converter source code or
FSA on the web, so I'm posting this, even though it's not fully
tested. My application does not require full CSV).
 
B

Bart Van der Donck

tom said:
(I couldn't find an easy to understand CSV converter source
code or FSA on the web, so I'm posting this, even though it's
not fully tested. My application does not require full CSV).

Interesting exercise. There are 3 characteristics in CSV files:

1. Fields separated by: var SE below, any char(s), may not be empty,
mostly comma or semicolon
2. Fields enclosed by: var EN below, any char(s) or empty, mostly
single/double quotes or left empty
3. Lines terminated by (end-of-line): var EOL below, any char(s), may
not be empty, mostly \n \r or \r\n

var SE = ','
var EN = '"'
var EOL = '\n'
var CSV = '"A1","A2","A3","A4"\n"B1","B2","B3"\n"C1","C2"'
var RO = CSV.split(EOL)
var TIE = new Array()
for (var i = 0; i < RO.length; ++i) {
RO = RO.replace(new RegExp('^'+EN), '')
RO = RO.replace(new RegExp(EN+'$'), '')
TIE = RO.split(EN+SE+EN)
}

You then get the CSV tied to a matrix (2dim array) which is accessible
e.g. like this:

alert(TIE[0][0]) // says A1
alert(TIE[1][0]) // says B1
alert(TIE[0][3]) // says A4

I've not added escape characters (traditionally backslash), this might
be desirable too, but will be the hardest part :)

Hope this helps,
 
E

Evertjan.

Bart Van der Donck wrote on 19 mrt 2007 in comp.lang.javascript:
var SE = ','
var EN = '"'
var EOL = '\n'
var CSV = '"A1","A2","A3","A4"\n"B1","B2","B3"\n"C1","C2"'
var RO = CSV.split(EOL)
var TIE = new Array()
for (var i = 0; i < RO.length; ++i) {
RO = RO.replace(new RegExp('^'+EN), '')
RO = RO.replace(new RegExp(EN+'$'), '')
TIE = RO.split(EN+SE+EN)
}

You then get the CSV tied to a matrix (2dim array) which is accessible
e.g. like this:

alert(TIE[0][0]) // says A1
alert(TIE[1][0]) // says B1
alert(TIE[0][3]) // says A4


Trying to optimise your code, I found a strange IE error:

var a = '"z"';a = a.split(/"/);alert(a.length);

returns 1 in IE7, 3, as expected, in FF2.

while:

var a = '"z"';a = a.split('"');alert(a.length);

in both browsers returns 3.

=================================================================

So my roundabout result is:

<script type='text/javascript'>

var a = '"z"';
a = a.split(/"/);
var IE = a.length == 1;
var CSV = '"A1","A2","aa","A4"\n"B1","B2","B3"\n"C1","C2"'

var r = CSV.split('\n');
for (var i = 0; i < r.length; ++i){
r = r.split(/^"|"$|","/);
if (!IE){
r.shift();
r.pop();
}
}

for (var i = 0; i < r.length; ++i)
alert(r);

</script>
 
B

Bart Van der Donck

Evertjan. said:
var a = '"z"';a = a.split(/"/);alert(a.length);

returns 1 in IE7, 3, as expected, in FF2.

while:

var a = '"z"';a = a.split('"');alert(a.length);

in both browsers returns 3.

// is Perl syntax; I thought javascript only allowed ('')/("")
officially.

var a = '"z"';
a = a.split(/"/);
var IE = a.length == 1;
var CSV = '"A1","A2","aa","A4"\n"B1","B2","B3"\n"C1","C2"'

var r = CSV.split('\n');
for (var i = 0; i < r.length; ++i){
r = r.split(/^"|"$|","/);
if (!IE){
r.shift();
r.pop();
}
}

for (var i = 0; i < r.length; ++i)
alert(r);


I'm afraid it's not that simple. Your code does the job in an elegant
way (though a = a.split(/"/) wouldn't be my style). The problem is
that you can't take for granted that the input is using enclosing
characters -like double quotes- in which case your code would cease to
work. Also \n and comma are far from sure. The problem is that CSV is
not uniquely defined among applications.

The most strict definition is like the name says, comma-separated-
values:

1,John,Doe,California

Comma might be semicolon too, like e.g. in Excel.

Near all applications accept enclosing characters (mostly double
quote):

"1","John","Doe","California"

Escape sequences are sometimes supported, but often become a reason
for headaches:

"1";"John\";"Doe";"California"
"2";"Paul\\";"Doe";"California"
"3";"Alex\\\";"Doe";"Regards,\nJohn"

An additional problem here is the behaviour of backslashes in variable
assignments.

I'm also aware of at least one alternative CSV escape method:

"1";"John";"Doe";"He said ""Okay"" and left"

Some CSV conventions even do:

1;John;Doe;"He said ""Okay"" and left"

Operating systems may differ in end-of-line characters as well.
 
T

tom t/LA

I'm also aware of at least one alternative CSV escape method:

"1";"John";"Doe";"He said ""Okay"" and left"
this is the NORMAL CSV convention. See
http://en.wikipedia.org/wiki/Comma-separated_values.
This is what makes it hard. I believe CSV cannot be converted
by simple regexp search and replace, or I would have used
that method.

Some CSV conventions even do:

1;John;Doe;"He said ""Okay"" and left"
this is what MS Excel does, so it is good to be able to handle it.
 
B

Bart Van der Donck

tom said:

There is only one 'normal' CSV convention, and that is what the name
says; comma separated.

1,John,Doe
From www.whatis.com: http://searchsqlserver.techtarget.com/sDefinition/0,,sid87_gci213871,00.html

This is what makes it hard. I believe CSV cannot be converted
by simple regexp search and replace, or I would have used
that method.

I think it's possible, but it would be far from easy indeed. It also
depends on which options you want to support and which not.
this is what MS Excel does, so it is good to be able to handle it.

Even Excel itself can export 3 sorts of CSV's.

E.g. PhpMyAdmin exports 2 sorts of CSV's; one especially targetted
towards Excel and one with semicolon - double quote - backslash - \r\n
by default (changeable).

OpenOffice does: 1,"John","Doe" (literals use double quotes, numerics
not).

MySQL: "1","John","Doe"

etc...

Then you have things like the following (I'm not gonna make the
exercise to test where they're supported or not):

1,="002",3 (pass leading zeros)

"Microsoft, Inc.",Seattle,WA (only quotes when escaping comma)

Microsoft\, Inc.,Seattle,WA (backslash escape)

1,"Best regards,
Jonh",2,3,4,5 (fields containing EOLs)

1, 2, 3, 4 (leading field spaces to be stripped)

"1,057",2.00,3\,00,="04.49","5,452.687" (floating numbers)

CSV is very poorly standardized. You can at most name conventions that
are better supported than others.
 
E

Evertjan.

tom t/LA wrote on 20 mrt 2007 in comp.lang.javascript:
I believe CSV cannot be converted
by simple regexp search and replace, or I would have used
that method.

Wow, so your believes are founded on what you declined from doing?

Regex can do whatever you like, but is not "simple".

With a complete working definition of CSV,
using repeated regex replace'es,
and some final split()'s,
all is possible.
 
T

tom t/LA

There is only one 'normal' CSV convention, and that is what the name
says; comma separated.

1,John,Doe

Sorry, I did not mean to copy an example with semicolons (;) in it. My
point
was that nested double quote marks are normally converted to two
double quotes
" --> ""
So I should have used as an example:

1,John,"Doe",124 Main St,"He said ""Okay"" and left"
 
T

tom t/LA

Regex can do whatever you like, but is not "simple".
With a complete working definition ofCSV,
using repeated regex replace'es,
and some final split()'s,
all is possible.

well, yes.
I thought about it some more. It is not impossible using regexps. But
I think it is impossible using a small number of regexp search and
replace's, even complicated ones, as available in Javascript or Perl.
I think you have to have a loop in the program to handle the arbitrary
number of "" that may occur inside a "..." string.

// match series of strings of non-quote chars, followed by doubled
quote, followed by more non-quote chars, all enclosed by quotes

I think the following would convert doubled quotes to single quotes,
and find the end of the quoted string, return the result in 'r'.

m = str.match (/"(([^"]*)("")([^"]*))*"/);
for (i = 0; i<m.length; i++){
if (m == '""') {
r = r+'"';
}else{
r = r+m;
}
}

You cannot do that with a string.replace() call because in the
replacement string you would use $1, $2 $3 ... $n with no limit since
there can be any number of "" in the original string.
 
T

tom t/LA

Also, tab separated value is much easier to read in than CSV, because
it simply does not allow tabs inside its data values.
 
E

Evertjan.

tom t/LA wrote on 24 mrt 2007 in comp.lang.javascript:
Regex can do whatever you like, but is not "simple".

With a complete working definition ofCSV,
using repeated regex replace'es,
and some final split()'s,
all is possible.

well, yes.
I thought about it some more. It is not impossible using regexps. But
I think it is impossible using a small number of regexp search and
replace's, even complicated ones, as available in Javascript or Perl.
I think you have to have a loop in the program to handle the arbitrary
number of "" that may occur inside a "..." string.
[...]

You cannot do that with a string.replace() call because in the
replacement string you would use $1, $2 $3 ... $n with no limit since
there can be any number of "" in the original string.

As long as you can use "free" temporary characters the job is far easier,
again if a complete definition of the expected form of the CSV wuld exist.

s = s.replace(/""/g,String.fromCharCode(9991))
s = s.replace(/","/g,String.fromCharCode(9992))
s = s.replace(/^"/g,'')
s = s.replace(/"$/g,'')
re = new RegExp(String.fromCharCode(9991),"g")
s = s.replace(re,'"')
s = s.split(String.fromCharCode(9992))
 
E

Evertjan.

tom t/LA wrote on 24 mrt 2007 in comp.lang.javascript:
Also, tab separated value is much easier to read in than CSV, because
it simply does not allow tabs inside its data values.

[please always quote on usenet, this is not email]
 
T

tom t/LA

As long as you can use "free" temporary characters the job is far easier,
again if a complete definition of the expected form of theCSVwuld exist.

s = s.replace(/""/g,String.fromCharCode(9991))

can this handle a string beginning with a quote mark such as
abc,"""hi"""
 
E

Evertjan.

tom t/LA wrote on 25 mrt 2007 in comp.lang.javascript:
can this handle a string beginning with a quote mark such as
abc,"""hi"""

Wow,

I only showed you a way to use "unused characters".
Not a clear cut solution.

Yes,

You could first test for and correct

"""," -> String.fromCharCode(9991)+String.fromCharCode(9992)
",""" -> String.fromCharCode(9992)+String.fromCharCode(9991)
""",""" -> String.fromCharCode(9992)+String.fromCharCode(9991)
+String.fromCharCode(9992)

And perhaps start with something like?
s = ',' + s + ','


================================

However the old fashioned way to parse the file as a sting linearly from
left to right is also appealing, though in my view muuuuuuch slower:

pseudocode sort of first draft:

outside = true
stack = new array
member=''
do
take next char
if outside
if ,
push member onto stack:member=''
if "
outside=false
else // inside
if member="" and "
nop
if not "
add char to member
if " and [take] next char also "
add one " to member
else must be ",
end member push member on stack:member='':eek:utside=true
loop
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only take a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

Forum statistics

Threads
473,755
Messages
2,569,535
Members
45,007
Latest member
obedient dusk

Latest Threads

Top