Reading Unicode Strings from File


Jamie

I have a file that was written using Java and the file has Unicode
strings. What is the best way to deal with these in C? The file
definition reads:

Data Field   Description
CHAR[32]     File identifier (64 bytes corresponding to a Unicode
             character string, padded with '0' Unicode characters).
CHAR[16]     File format version (32 bytes corresponding to a Unicode
             character string "x.y.z", where x, y, z are integers
             corresponding to the major, minor and revision number
             of the file format version), padded with '0' Unicode
             characters.
INTEGER      Main file header length [bytes].
....


The data field definitions are from Java primitives:

CHAR     Unicode character. 16-bit Unicode character.
INTEGER  Signed integer number. 32-bit two's complement signed integer.

There is absolutely no need for these strings to be in Unicode format,
and I am at a loss as to how to convert them to a standard C character
array. Moreover, in the example below, I seem to be out in the byte count,
as my integer is garbage. Any ideas would be greatly appreciated.

#include <stdlib.h>
#include <stdio.h>

#define ARGC_FAILURE  100
#define OPEN_FAILURE  101
#define CLOSE_FAILURE 102

int main(int argc, char *argv[])
{
    FILE *fp;
    long n;
    char d_id[64];
    char d_version[32];
    int d_hdrlen;

    if (argc != 2)
    {
        printf("Usage: read_adf filename\n");
        return ARGC_FAILURE;
    }

    // Open the file
    if ((fp = fopen(argv[1], "r")) == NULL)
    {
        printf("%s: Error opening %s", argv[0], argv[1]);
        return OPEN_FAILURE;
    }

    // Read the contents
    n = fread(d_id, sizeof(d_id), 1, fp);
    n = fread(d_version, sizeof(d_version), 1, fp);
    n = fread(&d_hdrlen, sizeof(d_hdrlen), 1, fp);

    // Display the contents
    printf(" ID: %s\n", d_id);
    printf(" VER: %s\n", d_version);
    printf(" HDR Length: %d\n", d_hdrlen);

    // Close the file
    if (fclose(fp) == EOF)
    {
        printf("%s: Error closing %s", argv[0], argv[1]);
        return CLOSE_FAILURE;
    }

    return 0;
}
 

Mark McIntyre

I have a file that was written using Java and the file has Unicode
strings. What is the best way to deal with these in C?

C knows nothing of Unicode. However, your platform probably does, since
it evidently uses them. Almost certainly your compiler has some
platform-specific functions to convert Unicode to C strings and
vice versa. You might also find that the wchar_t type matches Unicode on
your platform. You'd have to experiment to find out.
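
For example, a quick check of what your implementation uses (a minimal
sketch):

#include <stdio.h>
#include <wchar.h>

int main(void)
{
    /* 2 suggests UTF-16-sized wide characters; 4 suggests
       UCS-4/UTF-32, which is common on Linux. */
    printf("sizeof(wchar_t) = %u\n", (unsigned)sizeof(wchar_t));
    return 0;
}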
 

Simon Biber

Jamie said:
I have a file that was written using Java and the file has Unicode
strings. What is the best way to deal with these in C? The file
definition reads:

Data Field   Description
CHAR[32]     File identifier (64 bytes corresponding to a Unicode
             character string, padded with '0' Unicode characters).
CHAR[16]     File format version (32 bytes corresponding to a Unicode
             character string "x.y.z", where x, y, z are integers
             corresponding to the major, minor and revision number
             of the file format version), padded with '0' Unicode
             characters.
INTEGER      Main file header length [bytes].

This seems to work on my system. It assumes that the multibyte
and wide character system on your C implementation follows the
same standard as the Java Unicode file.

#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <assert.h>

struct header {
    char id[32];
    int major, minor, revision;
    int length;
};

void read_header(struct header *hd, FILE *fp)
{
    wchar_t wid[32];
    wchar_t wver[16];
    char ver[16];

    assert(sizeof(wchar_t) == 2);
    assert(sizeof(int) == 4);

    fread(wid, sizeof(wchar_t), 32, fp);
    wcstombs(hd->id, wid, 32);

    fread(wver, sizeof(wchar_t), 16, fp);
    wcstombs(ver, wver, 16);
    sscanf(ver, "%d.%d.%d", &hd->major, &hd->minor, &hd->revision);

    fread(&hd->length, 1, sizeof(int), fp);
}


I also wrote some companion functions for testing:

void write_header(const struct header *hd, FILE *fp)
{
    wchar_t wid[32] = {0};
    wchar_t wver[16] = {0};
    char ver[16];

    assert(sizeof(wchar_t) == 2);
    assert(sizeof(int) == 4);

    mbstowcs(wid, hd->id, 32);
    fwrite(wid, sizeof(wchar_t), 32, fp);

    sprintf(ver, "%d.%d.%d", hd->major, hd->minor, hd->revision);
    mbstowcs(wver, ver, 16);
    fwrite(wver, sizeof(wchar_t), 16, fp);

    fwrite(&hd->length, 1, sizeof(int), fp);
}

int main(int argc, char **argv)
{
    if (argc != 3)
    {
        fprintf(stderr, "Usage requires two arguments,\n"
                "First is either 'r' (read) or 'w' (write)\n"
                "Second is the file name\n");
    }
    else if (argv[1][0] == 'r')
    {
        FILE *fp = fopen(argv[2], "rb");
        if (fp == NULL)
        {
            fprintf(stderr, "Error opening file %s for binary read\n", argv[2]);
        }
        else
        {
            struct header hd;
            read_header(&hd, fp);
            printf("id = \"%s\"\n", hd.id);
            printf("ver = %d.%d.%d\n", hd.major, hd.minor, hd.revision);
            printf("length = %d\n", hd.length);
            fclose(fp);
        }
    }
    else if (argv[1][0] == 'w')
    {
        FILE *fp = fopen(argv[2], "wb");
        if (fp == NULL)
        {
            fprintf(stderr, "Error opening file %s for binary write\n", argv[2]);
        }
        else
        {
            struct header hd = {"ident", 1, 2, 3, 4};
            write_header(&hd, fp);
            fclose(fp);
        }
    }
    return 0;
}
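
To try it out (hypothetical file name; this assumes a platform where the
asserts hold, i.e. 16-bit wchar_t and 32-bit int), write a header and
read it back:

$ gcc -o rwhdr rwhdr.c
$ ./rwhdr w test.dat
$ ./rwhdr r test.dat
id = "ident"
ver = 1.2.3
length = 4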
 

Michael B Allen

I have a file that was written using Java and the file has Unicode
strings. What is the best way to deal with these in C? The file
definition reads:

Data Field   Description
CHAR[32]     File identifier (64 bytes corresponding to a Unicode
             character string, padded with '0' Unicode characters).
CHAR[16]     File format version (32 bytes corresponding to a Unicode
             character string "x.y.z", where x, y, z are integers
             corresponding to the major, minor and revision number
             of the file format version), padded with '0' Unicode
             characters.
INTEGER      Main file header length [bytes]. ...

The data field definitions are from Java primitives:

CHAR     Unicode character. 16-bit Unicode character.
INTEGER  Signed integer number. 32-bit two's complement signed integer.

There is absolutely no need for these strings to be in Unicode format,
and I am at a loss as to how to convert them to a standard C character
array. Moreover, in the example below, I seem to be out in the byte count,
as my integer is garbage. Any ideas would be greatly appreciated.

The easiest way to decode these strings is probably with encdec:

http://www.ioplex.com/~miallen/encdec/

See the dec_mbscpy function and use the identifier "JAVA" or maybe
"UTF-16BE".

Keep in mind that unless you use wide character strings throughout your
program you will be limited to the locale-dependent codepage. On Linux
and some Unixes you can run in a UTF-8 locale such as LANG=en_US.UTF-8 to
support Unicode, but otherwise a Unicode string with characters that fall
outside of the locale-dependent encoding range will generate an EILSEQ
error. So to properly support Unicode in your application you'll need
to use wchar_t (required if you use Windows, for example) or the UTF-8
locale on Unix (see setlocale and the encdec tests). Or you could just
claim the files must encode these strings with only characters of the
locale-dependent encoding (e.g. ISO-8859-1) and cross your fingers.
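
For example, a minimal sketch of picking up the locale from the
environment before converting (the locale names available depend on your
system):

#include <stdio.h>
#include <locale.h>

int main(void)
{
    /* Select the locale from LANG/LC_* (e.g. LANG=en_US.UTF-8);
       the behaviour of wcstombs/mbstowcs depends on this. */
    if (setlocale(LC_ALL, "") == NULL)
        fprintf(stderr, "setlocale failed\n");
    printf("current locale: %s\n", setlocale(LC_ALL, NULL));
    return 0;
}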

Also, if DataOutputStream was used to encode the strings there may be
a leading integer denoting the number of characters that follow. But it
doesn't sound like that is the case. It sounds like a custom encoding.

Mike
 

Jamie

And what is the best strategy for when sizeof(wchar_t) != 2? I'm running
Linux on x86 and PPC hardware and find sizeof(wchar_t) == 4. I am looking
for a clever way of defining a two-byte character type (i.e. the Java
Unicode representation) that isn't too reliant on the hardware :(

Thanks,
Jamie
 

Simon Biber

Jamie said:
And what is the best strategy for when sizeof(wchar_t) != 2? I'm running
Linux on x86 and PPC hardware and find sizeof(wchar_t) == 4. I am looking
for a clever way of defining a two-byte character type (i.e. the Java
Unicode representation) that isn't too reliant on the hardware :(

Well, the type 'unsigned short' is probably two bytes on your system.
However, given that the endianness probably differs between your x86 and
PPC hardware, the best way to read the values is to read unsigned chars
and combine them into wchar_t by shifting, like this:

/* define type twobyte as an array of 2 unsigned char */
typedef unsigned char twobyte[2];

wchar_t wid[32];
twobyte tbid[32];
int i;

fread(tbid, sizeof(twobyte), 32, fp);
for (i = 0; i < 32; i++)
{
    /* Java writes big-endian: high byte first. Note the parentheses;
       << binds more loosely than +. */
    wid[i] = (tbid[i][0] << 8) + tbid[i][1]; /* or the other way around */
}
wcstombs(hd->id, wid, 32);

Or, if you know that there can't be any characters outside the first 256
code points, so that the high byte of the Unicode representation is
always zero, then you could forget about all the wcstombs business and
just copy the low byte, either tbid[i][0] or tbid[i][1], into an array of
char, as in the sketch below.
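
A minimal sketch of that approach, assuming the file is big-endian
UTF-16 (high byte first, as Java writes it), so the low byte is
tbid[i][1]:

char id[33];
int i;

fread(tbid, sizeof(twobyte), 32, fp);
for (i = 0; i < 32; i++)
    id[i] = tbid[i][1];  /* low byte only; high byte assumed zero */
id[32] = '\0';           /* terminate even if all 32 slots are used */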
 
