Reading Unicode Strings from File

Discussion in 'C Programming' started by Jamie, Nov 24, 2003.

  1. Jamie

    Jamie Guest

    I have a file that was written using Java and the file has unicode
    strings. What is the best way to deal with these in C? The file
    definition reads:

    Data Field Description
    CHAR[32] File identifier (64 bytes corresponding to Unicode character
    string padded with '0' Unicode characters).
    CHAR[16] File format version (32 bytes corresponding to Unicode
    character string "x.y.z" where x, y, z are integers
    corresponding to major, minor and revision
    number of the File format version) padded with '0' Unicode
    characters.
    INTEGER Main file header length [bytes].
    ....


    The data field definitions are from Java primitives:

    CHAR Unicode character. 16-bit Unicode character.
    INTEGER Signed integer number. 32-bit two's complement signed integer.

    There is absolutely no need for these strings to be in unicode format and
    I am at a loss as to how to convert them to a standard C character array.
    Moreover, in the example below, I seem to be out in the byte count as my
    integer is garbage. Any ideas would be greatly appreciated.

    #include <stdlib.h>
    #include <stdio.h>

    #define ARGC_FAILURE 100
    #define OPEN_FAILURE 101
    #define CLOSE_FAILURE 102

    int main(int argc, char *argv[])
    {
    FILE *fp;
    long n;
    char d_id[64];
    char d_version[32];
    int d_hdrlen;

    if (argc !=2)
    {
    printf("Usage: read_adf filename\n");
    return ARGC_FAILURE;
    }

    // Open the file
    if ( (fp = fopen(argv[1], "r")) == NULL)
    {
    printf("%s: Error opening %s", argv[0], argv[1]);
    return OPEN_FAILURE;
    }

    // Read the contents
    n = fread(d_id, sizeof(d_id), 1, fp);
    n = fread(d_version, sizeof(d_version), 1, fp);
    n = fread(&d_hdrlen, sizeof(d_hdrlen), 1, fp);

    // Display the contents
    printf(" ID: %s\n", d_id);
    printf(" VER: %s\n", d_version);
    printf(" HDR Length: %d\n", d_hdrlen);

    // Close the file
    if (fclose(fp) == EOF)
    {
    printf("%s: Error closing %s", argv[0], argv[1]);
    return CLOSE_FAILURE;
    }

    return 0;
    }
     
    Jamie, Nov 24, 2003
    #1
    1. Advertising

  2. On Mon, 24 Nov 2003 16:50:02 +0000, in comp.lang.c , Jamie
    <> wrote:

    >I have a file that was written using Java and the file has unicode
    >strings. What is the best way to deal with these in C?


    C knows nothing of Unicode. However your platform probably does, since
    it seems it uses them. Almost certainly your compiler has some
    platform-specific functions to convert unicode to C strings and
    vice-versa. You might also find the wchar_t type may match unicode on
    your platform. You'd have to experiment to find out.

    --
    Mark McIntyre
    CLC FAQ <http://www.eskimo.com/~scs/C-faq/top.html>
    CLC readme: <http://www.angelfire.com/ms3/bchambless0/welcome_to_clc.html>
     
    Mark McIntyre, Nov 24, 2003
    #2
    1. Advertising

  3. Jamie

    Simon Biber Guest

    "Jamie" <> wrote:
    > I have a file that was written using Java and the file has unicode
    > strings. What is the best way to deal with these in C? The file
    > definition reads:
    >
    > Data Field Description
    > CHAR[32] File identifier (64 bytes corresponding to Unicode
    > character string padded with '0' Unicode characters.
    > CHAR[16] File format version (32 bytes corresponding to
    > Unicode character string "x.y.z" where x, y, z
    > are integers corresponding to major, minor and
    > revision number of the File format version)
    > padded with '0' Unicode characters.
    > INTEGER Main file header length [bytes].


    This seems to work on my system. It assumes that the multibyte
    and wide character system on your C implementation follows the
    same standard as the Java unicode file.

    #include <stdio.h>
    #include <stdlib.h>
    #include <wchar.h>
    #include <assert.h>

    /* In-memory (decoded) form of the Java-written file header. */
    struct header {
        char id[32];                /* file identifier, narrowed to 8-bit chars */
        int major, minor, revision; /* components of the "x.y.z" version string */
        int length;                 /* main file header length in bytes */
    };

    /*
     * read_header: decode one file header from fp into *hd.
     *
     * The file was written by Java, whose CHAR is a 16-bit big-endian
     * UTF-16 code unit and whose INTEGER is a 32-bit big-endian two's
     * complement value.  Reading raw bytes and assembling the values by
     * hand keeps this portable: it does not assume sizeof(wchar_t) == 2
     * (it is 4 on Linux x86/ppc, so the original assert would abort),
     * the host byte order, or the locale's multibyte encoding.
     *
     * On a short read, *hd is left all-zero.
     */
    void read_header(struct header *hd, FILE *fp)
    {
        unsigned char raw_id[64];   /* 32 UTF-16BE code units */
        unsigned char raw_ver[32];  /* 16 UTF-16BE code units */
        unsigned char raw_len[4];   /* 32-bit big-endian integer */
        char ver[17];               /* decoded version text + NUL */
        size_t i;

        hd->major = hd->minor = hd->revision = 0;
        hd->length = 0;
        for (i = 0; i < sizeof hd->id; i++)
            hd->id[i] = '\0';

        /* Check every read: the original ignored fread's return value. */
        if (fread(raw_id, 1, sizeof raw_id, fp) != sizeof raw_id
            || fread(raw_ver, 1, sizeof raw_ver, fp) != sizeof raw_ver
            || fread(raw_len, 1, sizeof raw_len, fp) != sizeof raw_len)
            return;

        /* ASCII characters sit in the low byte of each big-endian code
         * unit; the spec pads with zero characters, so the high bytes are
         * expected to be zero and are dropped. */
        for (i = 0; i < 32; i++)
            hd->id[i] = (char)raw_id[2 * i + 1];

        for (i = 0; i < 16; i++)
            ver[i] = (char)raw_ver[2 * i + 1];
        ver[16] = '\0';
        sscanf(ver, "%d.%d.%d", &hd->major, &hd->minor, &hd->revision);

        /* Assemble the big-endian 32-bit length portably. */
        hd->length = (int)(((unsigned long)raw_len[0] << 24)
                         | ((unsigned long)raw_len[1] << 16)
                         | ((unsigned long)raw_len[2] << 8)
                         |  (unsigned long)raw_len[3]);
    }


    I also wrote some companion functions for testing:

    /*
     * write_header: companion to read_header for testing; encode *hd in
     * the Java file format — UTF-16 big-endian characters padded with
     * zero code units, then the length as a 32-bit big-endian integer.
     *
     * Writing raw bytes avoids the original's sizeof(wchar_t) == 2
     * assert (false on Linux), its dependence on the host byte order,
     * and the locale behavior of mbstowcs.
     */
    void write_header(const struct header *hd, FILE *fp)
    {
        unsigned char raw_id[64] = {0};  /* zero bytes double as '0' padding */
        unsigned char raw_ver[32] = {0};
        unsigned char raw_len[4];
        char ver[40];                    /* big enough for three INT_MIN values */
        size_t i;

        /* Each ASCII char becomes the low byte of a big-endian code unit. */
        for (i = 0; i < 32 && hd->id[i] != '\0'; i++)
            raw_id[2 * i + 1] = (unsigned char)hd->id[i];
        fwrite(raw_id, 1, sizeof raw_id, fp);

        /* The original sprintf'd into char[16], which can overflow for
         * large version numbers; ver[] is sized for the worst case. */
        sprintf(ver, "%d.%d.%d", hd->major, hd->minor, hd->revision);
        for (i = 0; i < 16 && ver[i] != '\0'; i++)
            raw_ver[2 * i + 1] = (unsigned char)ver[i];
        fwrite(raw_ver, 1, sizeof raw_ver, fp);

        /* Emit the length big-endian, independent of host byte order. */
        raw_len[0] = (unsigned char)(((unsigned long)hd->length >> 24) & 0xFF);
        raw_len[1] = (unsigned char)(((unsigned long)hd->length >> 16) & 0xFF);
        raw_len[2] = (unsigned char)(((unsigned long)hd->length >> 8) & 0xFF);
        raw_len[3] = (unsigned char)((unsigned long)hd->length & 0xFF);
        fwrite(raw_len, 1, sizeof raw_len, fp);
    }

    /*
     * Driver: "w <file>" writes a sample header, "r <file>" reads one
     * back and prints it.  Returns EXIT_SUCCESS on success and
     * EXIT_FAILURE on a usage error or when the file cannot be opened
     * (the original returned 0 — success — on those error paths).
     */
    int main(int argc, char **argv)
    {
        if(argc != 3)
        {
            fprintf(stderr, "Usage requires two arguments,\n"
                "First is either 'r' (read) or 'w' (write)\n"
                "Second is the file name\n");
            return EXIT_FAILURE;
        }
        else if(argv[1][0] == 'r')
        {
            /* Binary mode: the header holds raw UTF-16 and integer bytes. */
            FILE *fp = fopen(argv[2], "rb");
            if(fp == NULL)
            {
                fprintf(stderr, "Error opening file %s for binary read\n", argv[2]);
                return EXIT_FAILURE;
            }
            else
            {
                struct header hd;
                read_header(&hd, fp);
                printf("id = \"%s\"\n", hd.id);
                printf("ver = %d.%d.%d\n", hd.major, hd.minor, hd.revision);
                printf("length = %d\n", hd.length);
                fclose(fp);
            }
        }
        else if(argv[1][0] == 'w')
        {
            FILE *fp = fopen(argv[2], "wb");
            if(fp == NULL)
            {
                fprintf(stderr, "Error opening file %s for binary write\n", argv[2]);
                return EXIT_FAILURE;
            }
            else
            {
                /* id "ident", version 1.2.3, header length 4. */
                struct header hd = {"ident", 1, 2, 3, 4};
                write_header(&hd, fp);
                fclose(fp);
            }
        }
        return 0;
    }


    --
    Simon.
     
    Simon Biber, Nov 25, 2003
    #3
  4. On Mon, 24 Nov 2003 11:50:02 -0500, Jamie wrote:

    > I have a file that was written using Java and the file has unicode
    > strings. What is the best way to deal with these in C? The file
    > definition reads:
    >
    > Data Field Description
    > CHAR[32] File identifier (64 bytes corresponding to Unicode
    > character
    > string padded with '0' Unicode characters.
    > CHAR[16] File format version (32 bytes corresponding to Unicode
    > character string "x.y.z" where x, y, z are integers
    > corresponding to major, minor and revision number of the
    > File format version) padded with '0' Unicode characters.
    > INTEGER Main file header length [bytes]. ...
    >
    >
    > The data field defitions are from Java primitives:
    >
    > CHAR Unicode character. 16-bit Unicode character. INTEGER Signed
    > integer number. 32-bit two's complement signed integer.
    >
    > There is absolutely no need for these strings to be in unicode format
    > and I am at a loss as to how to convert them to a standard C character
    > array. Moreover, in the example below, I seem to be out in the byte
    > count as my integer is garbage. Any ideas would be greatly appreciated.


    The easiest way to decode these strings is probably with encdec:

    http://www.ioplex.com/~miallen/encdec/

    See the dec_mbscpy function and use the identifier "JAVA" or maybe
    "UTF-16BE".

    Keep in mind that unless you use wide character strings throughout your
    program you will be limited to the locale dependent codepage. On Linux
    and some Unix you can run in a UTF-8 locale like LANG=en_US.UTF-8 to
    support unicode but otherwise a Unicode string with characters that fall
    outside of the locale dependent encoding range will generate an EILSEQ
    error. So to properly support Unicode in your application you'll need
    to use wchar_t (required if you use Windows for example) or the UTF-8
    locale on Unix (see setlocale and encdec tests). Or you could just claim
    the files must encode these strings with only characters of the locale
    dependent encoding (e.g. ISO-8859-1) and cross your fingers.

    Also, if DataOutputStream was used to encode the strings there may be
    a leading integer denoting the number of characters that follow. But it
    doesn't sound like that is the case. It sounds like a custom encoding.

    Mike
     
    Michael B Allen, Nov 25, 2003
    #4
  5. Jamie

    Jamie Guest

    And what is the best strategy for when wchar_t != 2? I'm running linux on
    x86 and ppc hardware and find wchar_t = 4. I am looking for a clever way
    of defining a two element char (i.e. Java unicode representation) that
    isn't too reliant on hardware :(

    Thanks,
    Jamie

    On Wed, 26 Nov 2003, Simon Biber wrote:

    > This seems to work on my system. It assumes that the multibyte
    > and wide character system on your C implementation follows the
    > same standard as the Java unicode file.
    >
    > #include <stdio.h>
    > #include <stdlib.h>
    > #include <wchar.h>
    > #include <assert.h>
    >
    > struct header {
    > char id[32];
    > int major, minor, revision;
    > int length;
    > };
    >
    > void read_header(struct header *hd, FILE *fp)
    > {
    > wchar_t wid[32];
    > wchar_t wver[16];
    > char ver[16];
    >
    > assert(sizeof(wchar_t) == 2);
    > assert(sizeof(int) == 4);
    >
    > fread(wid, sizeof(wchar_t), 32, fp);
    > wcstombs(hd->id, wid, 32);
    >
    > fread(wver, sizeof(wchar_t), 16, fp);
    > wcstombs(ver, wver, 16);
    > sscanf(ver, "%d.%d.%d", &hd->major, &hd->minor, &hd->revision);
    >
    > fread(&hd->length, 1, sizeof(int), fp);
    > }
    >
    >
    > I also wrote some companion functions for testing:
    >
    > void write_header(const struct header *hd, FILE *fp)
    > {
    > wchar_t wid[32] = {0};
    > wchar_t wver[16] = {0};
    > char ver[16];
    >
    > assert(sizeof(wchar_t) == 2);
    > assert(sizeof(int) == 4);
    >
    > mbstowcs(wid, hd->id, 32);
    > fwrite(wid, sizeof(wchar_t), 32, fp);
    >
    > sprintf(ver, "%d.%d.%d", hd->major, hd->minor, hd->revision);
    > mbstowcs(wver, ver, 16);
    > fwrite(wver, sizeof(wchar_t), 16, fp);
    >
    > fwrite(&hd->length, 1, sizeof(int), fp);
    > }
    >
    > int main(int argc, char **argv)
    > {
    > if(argc != 3)
    > {
    > fprintf(stderr, "Usage requires two arguments,\n"
    > "First is either 'r' (read) or 'w' (write)\n"
    > "Second is the file name\n");
    > }
    > else if(argv[1][0] == 'r')
    > {
    > FILE *fp = fopen(argv[2], "rb");
    > if(fp == NULL)
    > {
    > fprintf(stderr, "Error opening file %s for binary read\n", argv[2]);
    > }
    > else
    > {
    > struct header hd;
    > read_header(&hd, fp);
    > printf("id = \"%s\"\n", hd.id);
    > printf("ver = %d.%d.%d\n", hd.major, hd.minor, hd.revision);
    > printf("length = %d\n", hd.length);
    > fclose(fp);
    > }
    > }
    > else if(argv[1][0] == 'w')
    > {
    > FILE *fp = fopen(argv[2], "wb");
    > if(fp == NULL)
    > {
    > fprintf(stderr, "Error opening file %s for binary write\n", argv[2]);
    > }
    > else
    > {
    > struct header hd = {"ident", 1, 2, 3, 4};
    > write_header(&hd, fp);
    > fclose(fp);
    > }
    > }
    > return 0;
    > }
    >
    >
    > --
    > Simon.
    >
    >
    >
     
    Jamie, Nov 26, 2003
    #5
  6. Jamie

    Simon Biber Guest

    "Jamie" <> wrote:
    > And what is the best strategy for when wchar_t != 2? I'm running linux
    > on x86 and ppc hardware and find wchar_t = 4. I am looking for a clever
    > way of defining a two element char (i.e. Java unicode representation)
    > that isn't too reliant on hardware :(


    Well, the type 'unsigned short' is probably two bytes on your system.
    However the best way to read the values, given that the endianness is
    probably different between your x86 and ppc hardware, is probably to
    read unsigned chars and load them into wchar_t by shifting the value
    like this:

    /* define type twobyte as an array of 2 unsigned char */
    typedef unsigned char twobyte[2];

    wchar_t wid[32];
    twobyte tbid[32];
    size_t i;

    /* NOTE(review): check fread's return value in real code. */
    fread(tbid, sizeof(twobyte), 32, fp);
    for(i = 0; i < 32; i++)
    {
    /* The original `wid = tbid[0] << 8 + tbid[1]` had two bugs: it dropped
     * the loop index everywhere, and because `+` binds tighter than `<<`
     * it computed tbid[0] << (8 + tbid[1]).  Parenthesize the shift and
     * index each element/byte. */
    wid[i] = (wchar_t)((tbid[i][0] << 8) | tbid[i][1]); /* Or swap the bytes */
    }
    wcstombs(hd->id, wid, 32);

    Or, if you know that there can't be any characters outside the first 256
    code points, therefore the high byte of the unicode representation is
    always zero, then you could forget about all the wcstombs crap, and just
    copy the low byte, either tbid[0] or tbid[1], into an array of char.

    --
    Simon.
     
    Simon Biber, Nov 27, 2003
    #6
    1. Advertising

Want to reply to this thread or ask your own question?

It takes just 2 minutes to sign up (and it's free!). Just click the sign up button to choose a username and then you can ask your own questions on the forum.
Similar Threads
  1. Steve
    Replies:
    3
    Views:
    5,100
    Oliver Wong
    Sep 29, 2005
  2. Ben

    Strings, Strings and Damned Strings

    Ben, Jun 22, 2006, in forum: C Programming
    Replies:
    14
    Views:
    798
    Malcolm
    Jun 24, 2006
  3. Asterix
    Replies:
    5
    Views:
    738
    Matt Nordhoff
    Aug 31, 2008
  4. Jeremy
    Replies:
    1
    Views:
    832
    Alex Willmer
    Jan 11, 2011
  5. Jeremy
    Replies:
    0
    Views:
    607
    Jeremy
    Jan 11, 2011
Loading...

Share This Page