I have a group of files in a format that is that is tab delimited with
about a million columns and a thousand rows.
Reading this file left-to-right top-to-bottom is not a problem but my
requirements are to read it top-to-bottom left-to-right (to read each
column in order as follows).
1,4,7
2,5,8
3,6,9
It's an O(n^2) problem if I read each line for each column (it could
take a week for a big file). The file is too big to hold the lines in
memory and I see no strategy where I can hold a subset of lines in
memory.
This seems like a simple problem but I have struggled with lots of
solutions that have come up short.
/*
No need to read the table more than once. I would do this:
*/
#include <string.h>
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
/* The default delimiters are chosen as some ordinary white space
characters: */
static const char default_delimiters[] = ",";
/*
* The tokenize() function is similar to a reentrant version of
strtok().
* It parses tokens from 'string', where tokens are substrings
separated by characters from 'delimiter_list'.
* To get the first token from 'string', tokenize() is called with
'string' as its first parameter.
* Remaining tokens from 'string' are obtained by calling tokenize()
with NULL for the first parameter.
* The string of delimiters, identified by 'delimiter_list', can
change from call to call.
* If the string of delimiters is NULL, then the standard list
'default_delimiters' (see above) is used.
* tokenize() modifies the memory pointed to by 'string', because it
writes null characters into the buffer.
*/
char *tokenize(char *string, const char *delimiter_list,
char **placeholder)
{
if (delimiter_list == NULL)
delimiter_list = default_delimiters;
if (delimiter_list[0] == 0)
delimiter_list = default_delimiters;
if (string == NULL)
string = *placeholder;
if (string == NULL)
return NULL;
/*
* The strspn() function computes the length of the initial segment of
the first string
* that consists entirely of characters contained in the second
string.
*/
string += strspn(string, delimiter_list);
if (!string[0]) {
*placeholder = string;
return NULL;
} else {
char *token;
token = string;
/*
* The strpbrk() function finds the first occurrence of any character
contained in the second string
* found in the first string.
*/
string = strpbrk(token, delimiter_list);
if (string == NULL)
*placeholder = token + strlen(token);
else {
*string++ = 0;
*placeholder = string;
}
return token;
}
}
#ifdef UNIT_TEST
char test_string0[] =
"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17";
char test_string1[] =
"21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37";
char test_string2[] =
"31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47";
#include <stdio.h>
void make_row_col(char *test_string, char *white, int row)
{
char *p = NULL;
char *token;
int col = 0;
token = tokenize(test_string, white, &p);
while (token) {
printf("%d,%d,%s\n",row, col++, token);
token = tokenize(NULL, white, &p);
}
}
int main(void)
{
int row = 0;
make_row_col(test_string0, NULL, row++);
make_row_col(test_string1, NULL, row++);
make_row_col(test_string2, NULL, row++);
return 0;
}
#endif
/*
-- Now load the file of transposed information into this database
table.
-- After that, you can do a SQL query to invert row and col or select
by row or select by col.
CREATE TABLE Transposer (
row INTEGER NOT NULL,
col INTEGER NOT NULL,
data VARCHAR(20)
);
CREATE UNIQUE INDEX XPKTransposer ON Transposer
(
row ASC,
col ASC
);
CREATE INDEX XieTransposerRow ON Transposer
(
row ASC
);
CREATE INDEX XieTransposerCol ON Transposer
(
col ASC
);
ALTER TABLE Transposer
ADD PRIMARY KEY (row, col);
*/