how to read a Unicode file

Discussion in 'C++' started by starffly@gmail.com, Nov 7, 2006.

  1. Guest

    I want to read an XML file in Unicode (UTF-16), UTF-8, or a native
    encoding into a wchar_t string, so I wrote the routine below. However,
    sometimes a Unicode file that includes Chinese characters cannot be read
    completely, and I cannot tell where the root cause lies, so I NEED your
    help. Please GIVE me a hand.
    THX.
    static Status LoadXMLFile2String(const char *filename, wchar_t *text){
    FILE *f;
    if(!(f = fopen(filename, "r"))){
    __printDebugA("Input file %s cannot be opened.", filename);
    return ERROR;
    }
    char *encoding;
    //transform routine: other --> unicode --> other
    const unsigned char UTF_8_HEAD[3] = {239, 187, 191};
    const unsigned char UNICODE_HEAD[2] = {255, 254};
    const unsigned char UNICODE_BIGENDIAN_HEAD[2] = {254, 255};
    unsigned char head[3];
    fread(head, 1, 3, f);
    if(!memcmp(head, UNICODE_HEAD, 2)){
    encoding = "UNICODE";
    }
    else if(!memcmp(head, UNICODE_BIGENDIAN_HEAD, 2)){
    encoding = "UNICODE_BIGENDIAN";
    }
    else if(!memcmp(head, UTF_8_HEAD, 3)){
    encoding = "UTF_8";
    }
    else{
    encoding = "ANSI";
    }
    char *str = (char *) malloc((MAXXMLFILESIZE + 1) * sizeof(char));
    int i = 0;
    if(!strcmp(encoding, "ANSI")){
    str[0] = head[0];
    str[1] = head[1];
    str[2] = head[2];
    i = 3;
    }
    else if(!strcmp(encoding, "UNICODE") || !strcmp(encoding,
    "UNICODE_BIGENDIAN")){
    str[0] = head[2];
    i = 1;
    }
    while(!feof(f)){
    if(i >= MAXXMLFILESIZE){
    db_error(L"The file is too large.");
    return ERROR;
    }
    str = fgetc(f);
    i++;
    }
    str = '\0';
    if(!strcmp(encoding, "UNICODE")){
    for(int j = 0; j < i - 1; j++){
    if(j % 2){
    text[j/2] += ((unsigned char) str[j]) << 8;
    }
    else{
    text[j/2] = (unsigned char) str[j];
    }
    }
    text[j/2] = 0;
    //db_debug(L"%d", wcslen(text));
    }
    else if(!strcmp(encoding, "UNICODE_BIGENDIAN")){
    for(int j = 0; j < i; j++){
    if(j % 2){
    text[j/2] = (text[j/2] << 8) + (unsigned char) str[j];
    }
    else{
    text[j/2] = (unsigned char) str[j];
    }
    }
    text[j/2] = 0;
    }
    else if(!strcmp(encoding, "UTF_8")){
    UTF2Unicode(str, text);
    }
    else if(!strcmp(encoding, "ANSI")){
    setlocale(LC_CTYPE, "");
    mbstowcs(text, str, MAXXMLFILESIZE + 1);
    }
    else{
    assert(FALSE);
    }
    free(str);
    fclose(f);
    return OK;
    }
    , Nov 7, 2006
    #1
    1. Advertising

  2. Guest

    help
    " 写道：
    "
    > I want to read a xml file in Unicode, UTF-8 or a native encoding
    > into a wchar_t type string, so i write a routine as follows, however,
    > sometimes a Unicode file including Chinese character cannot be read
    > completely. and I cannot tell where its root located, so NEED your
    > help, GIVE me a hand please.
    > THX.
    > static Status LoadXMLFile2String(const char *filename, wchar_t *text){
    > FILE *f;
    > if(!(f = fopen(filename, "r"))){
    > __printDebugA("Input file %s cannot be opened.", filename);
    > return ERROR;
    > }
    > char *encoding;
    > //transform routine: other --> unicode --> other
    > const unsigned char UTF_8_HEAD[3] = {239, 187, 191};
    > const unsigned char UNICODE_HEAD[2] = {255, 254};
    > const unsigned char UNICODE_BIGENDIAN_HEAD[2] = {254, 255};
    > unsigned char head[3];
    > fread(head, 1, 3, f);
    > if(!memcmp(head, UNICODE_HEAD, 2)){
    > encoding = "UNICODE";
    > }
    > else if(!memcmp(head, UNICODE_BIGENDIAN_HEAD, 2)){
    > encoding = "UNICODE_BIGENDIAN";
    > }
    > else if(!memcmp(head, UTF_8_HEAD, 3)){
    > encoding = "UTF_8";
    > }
    > else{
    > encoding = "ANSI";
    > }
    > char *str = (char *) malloc((MAXXMLFILESIZE + 1) * sizeof(char));
    > int i = 0;
    > if(!strcmp(encoding, "ANSI")){
    > str[0] = head[0];
    > str[1] = head[1];
    > str[2] = head[2];
    > i = 3;
    > }
    > else if(!strcmp(encoding, "UNICODE") || !strcmp(encoding,
    > "UNICODE_BIGENDIAN")){
    > str[0] = head[2];
    > i = 1;
    > }
    > while(!feof(f)){
    > if(i >= MAXXMLFILESIZE){
    > db_error(L"The file is too large.");
    > return ERROR;
    > }
    > str = fgetc(f);
    > i++;
    > }
    > str = '\0';
    > if(!strcmp(encoding, "UNICODE")){
    > for(int j = 0; j < i - 1; j++){
    > if(j % 2){
    > text[j/2] += ((unsigned char) str[j]) << 8;
    > }
    > else{
    > text[j/2] = (unsigned char) str[j];
    > }
    > }
    > text[j/2] = 0;
    > //db_debug(L"%d", wcslen(text));
    > }
    > else if(!strcmp(encoding, "UNICODE_BIGENDIAN")){
    > for(int j = 0; j < i; j++){
    > if(j % 2){
    > text[j/2] = (text[j/2] << 8) + (unsigned char) str[j];
    > }
    > else{
    > text[j/2] = (unsigned char) str[j];
    > }
    > }
    > text[j/2] = 0;
    > }
    > else if(!strcmp(encoding, "UTF_8")){
    > UTF2Unicode(str, text);
    > }
    > else if(!strcmp(encoding, "ANSI")){
    > setlocale(LC_CTYPE, "");
    > mbstowcs(text, str, MAXXMLFILESIZE + 1);
    > }
    > else{
    > assert(FALSE);
    > }
    > free(str);
    > fclose(f);
    > return OK;
    > }
    , Nov 10, 2006
    #2
    1. Advertising

  3. wrote:

    > I want to read a xml file in Unicode, UTF-8 or a native encoding
    > into a wchar_t type string, so i write a routine as follows, however,
    > sometimes a Unicode file including Chinese character cannot be read
    > completely. and I cannot tell where its root located, so NEED your
    > help, GIVE me a hand please.
    > THX.

    [code snipped]

    This code is horrible on so many levels. Mostly I suspect because it is
    in C rather than C++.

    You will have something much easier to work with if you reformulate
    this in C++ and apply some more useful abstractions to it.

    As for your error, you are only checking a few encodings and assuming
    that there is a BOM to tell you which to use. You need to check the XML
    prolog. It may be that the Chinese file is using a different encoding.


    K
    Kirit Sælensminde, Nov 10, 2006
    #3
    1. Advertising

Want to reply to this thread or ask your own question?

It takes just 2 minutes to sign up (and it's free!). Just click the sign up button to choose a username and then you can ask your own questions on the forum.
Similar Threads
  1. John C. Bollinger
    Replies:
    0
    Views:
    5,312
    John C. Bollinger
    Jun 1, 2004
  2. Sreejith K
    Replies:
    24
    Views:
    1,017
    Sreejith K
    Mar 24, 2009
  3. Jeremy
    Replies:
    1
    Views:
    804
    Alex Willmer
    Jan 11, 2011
  4. Jeremy
    Replies:
    0
    Views:
    577
    Jeremy
    Jan 11, 2011
  5. Alex Dowad
    Replies:
    4
    Views:
    271
    Michel Demazure
    May 1, 2010
Loading...

Share This Page