wcstombs() problem

Discussion in 'C Programming' started by arnuld, Feb 23, 2012.

  1. arnuld

    arnuld Guest

    AIM: To convert a wide-character string into a character string
    PROBLEM: (1) checking return value or errno.
    (2) conversion just does not happen.




    #include <stdio.h>
    #include <stdlib.h>
    #include <wchar.h>
    #include <errno.h>
    #include <string.h>
    #include <locale.h>
    #include <limits.h>

    #ifndef __STDC_ISO_10646__
    #define __STDC_ISO_10646__
    #endif

    enum {
    VAL_SUCC = 0,
    VAL_ERR = -1,
    ERR_ENC = -101,
    ERR_ERRNO_UNKNOWN = -102,
    SIZE_INPUT = 1000
    };


    void setLocale(const char* t);
    size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len);
    void getGermanLanguageFromFile(wchar_t arr[], const size_t len);
    char* myMalloc(const size_t len);


    int main(void)
    {
    int ret = 0;
    wchar_t contents[SIZE_INPUT+1] = {0};
    char* p = NULL;
    size_t plen = 0;

    setLocale("en_US.utf8");
    getGermanLanguageFromFile(contents, SIZE_INPUT);
    printf("Contents = {%ls}\n\n", contents);


    plen = ret + 1;
    printf("plen = %d\n", plen);
    p = myMalloc(plen);
    ret = WStr2CStr(&p, contents, plen);

    if( ret <= 0)
    {
    printf("IN: %s @%d ERROR converting to characters: ERRNO = %d\n",
    __FILE__, __LINE__, ret);
    exit(EXIT_FAILURE);
    }

    printf("p = [%s]\n\n", p);
    printf("W = %zu, Char = %zu\n", wcslen(contents), strlen(p));

    free(p);


    return 0;
    }

    char* myMalloc(const size_t len)
    {
    char* p = malloc(len * (sizeof *p));
    if(NULL == p)
    {
    printf("IN: %s @%d Out of Memory\n", __FILE__, __LINE__);
    exit(EXIT_FAILURE);
    }

    return p;
    }


    void setLocale(const char* t)
    {
    if(NULL == setlocale(LC_CTYPE, t))
    {
    printf("IN: %s @%d ERROR: can not set locale [%s]\n", __FILE__,
    __LINE__, t);
    exit(EXIT_FAILURE);
    }
    else
    {
    printf("IN: %s @%d Locale Set = [%s]\n", __FILE__, __LINE__, t);
    }
    }

    /* Contents of german.txt: Megaupload-Gründer Schmitz gegen Kaution auf
    freiem Fuß. News */
    void getGermanLanguageFromFile(wchar_t arr[], const size_t len)
    {
    const char* filename = "german.txt";
    FILE* fp;
    wchar_t* retp;

    fp = fopen(filename,"r");
    if(NULL == fp)
    {
    printf("IN:%s @%d: ERROR [%s]Can not open (%s)\n",__FILE__,
    __LINE__, strerror(errno), filename);
    return;
    }

    errno = 0;
    retp = fgetws(arr, len, fp);

    if(NULL == retp)
    {
    if(feof(fp))
    {
    printf("IN: %s @%d Reading Error: [%s]\n", __FILE__, __LINE__,
    strerror(errno));
    }
    else
    {
    printf("IN: %s @%d Reading Error: ERRNO = %d\n", __FILE__,
    __LINE__, errno);
    }
    exit(EXIT_FAILURE);
    }
    }




    size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len)
    {
    int ret = VAL_ERR;

    if(NULL == ws || NULL == s)
    {
    ret = VAL_ERR;
    }
    else
    {
    size_t r;
    errno = 0;
    r = wcstombs(*s, ws, len);
    if(0 >= r)
    {
    if(EILSEQ == errno)
    {
    ret = ERR_ENC;
    }
    else if(0 == r)
    {
    printf("IN: %s @%d ERROR: ZERO bytes converted = %zu\n",
    __FILE__, __LINE__, r);
    ret = VAL_ERR;
    }
    else if(errno)
    {
    ret = ERR_ERRNO_UNKNOWN;
    }
    }
    else
    {
    printf("IN: %s @%d bytes converted = %zu\n", __FILE__,
    __LINE__, r);
    ret = r;
    }
    }

    return ret;
    }

    ==================== OUTPUT ==============================
    [arnuld@dune C]$ gcc -std=c99 -pedantic -Wall -Wextra convert.c
    [arnuld@dune C]$ ./a.out
    IN: convert.c @82 Locale Set = [en_US.utf8]
    Contents = {Megaupload-Gründer Schmitz gegen Kaution auf freiem Fuß. News}

    plen = 1
    IN: convert.c @151 bytes converted = 1
    p = [M]

    W = 61, Char = 1
    [arnuld@dune C]$





    I searched archives and came across this piece of code where poster calls
    wcstombs() 2 times, first to calculate characters (using NULL argument)
    and then to really do the conversion. I wonder if that is the way wcstombs
    () was supposed to use (because it works while mine does not):

    size_t n = wcstombs(NULL, src, 0);
    char *dst = malloc(n + 1);
    if(dst == NULL)
    {
    fprintf(stderr, "memory allocation failed\n");
    return NULL;
    }
    if(wcstombs(dst, src, n + 1) != n)
    {
    fprintf(stderr, "conversion failed\n");
    free(dst);
    return NULL;
    }



    --
    arnuld
    http://LispMachine.Wordpress.com
    arnuld, Feb 23, 2012
    #1
    1. Advertising

  2. arnuld

    arnuld Guest

    > On Thu, 23 Feb 2012 05:35:41 +0000, arnuld wrote:

    > ... SNIP..
    > int main(void)
    > {
    > int ret = 0;
    > wchar_t contents[SIZE_INPUT+1] = {0}; char* p = NULL;
    > size_t plen = 0;
    >
    > setLocale("en_US.utf8");
    > getGermanLanguageFromFile(contents, SIZE_INPUT); printf("Contents =
    > {%ls}\n\n", contents);
    >
    >
    > plen = ret + 1;


    was stupid enough to do that, changing it to

    plen = wcslen(contents) + 1;

    does the conversion but still it misses some last characters, any idea
    why ?



    --
    arnuld
    http://LispMachine.Wordpress.com
    arnuld, Feb 23, 2012
    #2
    1. Advertising

  3. arnuld <> writes:
    [...]
    > #ifndef __STDC_ISO_10646__
    > #define __STDC_ISO_10646__
    > #endif


    __STDC_ISO_10646__ is conditionally defined by the implementation.
    Defining it yourself won't give you the desired semantics.

    --
    Keith Thompson (The_Other_Keith) <http://www.ghoti.net/~kst>
    Will write code for food.
    "We must do something. This is something. Therefore, we must do this."
    -- Antony Jay and Jonathan Lynn, "Yes Minister"
    Keith Thompson, Feb 23, 2012
    #3
  4. On 23 Feb 2012 05:35:41 GMT, arnuld <> wrote:

    >AIM: To convert a wide-character string into a character string


    Why do you think this is possible?

    >PROBLEM: (1) checking return value or errno.
    > (2) conversion just does not happen.
    >
    >
    >
    >
    >#include <stdio.h>
    >#include <stdlib.h>
    >#include <wchar.h>
    >#include <errno.h>
    >#include <string.h>
    >#include <locale.h>
    >#include <limits.h>
    >
    >#ifndef __STDC_ISO_10646__
    >#define __STDC_ISO_10646__
    >#endif
    >
    >enum {
    > VAL_SUCC = 0,
    > VAL_ERR = -1,
    > ERR_ENC = -101,
    > ERR_ERRNO_UNKNOWN = -102,
    > SIZE_INPUT = 1000
    >};
    >
    >
    >void setLocale(const char* t);
    >size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len);
    >void getGermanLanguageFromFile(wchar_t arr[], const size_t len);
    >char* myMalloc(const size_t len);
    >
    >
    >int main(void)
    >{
    > int ret = 0;
    > wchar_t contents[SIZE_INPUT+1] = {0};
    > char* p = NULL;
    > size_t plen = 0;
    >
    > setLocale("en_US.utf8");
    > getGermanLanguageFromFile(contents, SIZE_INPUT);
    > printf("Contents = {%ls}\n\n", contents);
    >
    >
    > plen = ret + 1;


    Fixed in follow-on message to
    plen = wcslen(contents)+1;

    > printf("plen = %d\n", plen);
    > p = myMalloc(plen);
    > ret = WStr2CStr(&p, contents, plen);


    WStr2CStr returns a size_t which is unsigned.

    >
    > if( ret <= 0)


    Therefore, ret can never be negative.

    > {
    > printf("IN: %s @%d ERROR converting to characters: ERRNO = %d\n",
    >__FILE__, __LINE__, ret);
    > exit(EXIT_FAILURE);
    > }
    >
    > printf("p = [%s]\n\n", p);
    > printf("W = %zu, Char = %zu\n", wcslen(contents), strlen(p));
    >
    > free(p);
    >
    >
    > return 0;
    >}
    >
    >char* myMalloc(const size_t len)
    >{
    > char* p = malloc(len * (sizeof *p));
    > if(NULL == p)
    > {
    > printf("IN: %s @%d Out of Memory\n", __FILE__, __LINE__);
    > exit(EXIT_FAILURE);
    > }
    >
    > return p;
    >}
    >
    >
    >void setLocale(const char* t)
    >{
    > if(NULL == setlocale(LC_CTYPE, t))
    > {
    > printf("IN: %s @%d ERROR: can not set locale [%s]\n", __FILE__,
    >__LINE__, t);
    > exit(EXIT_FAILURE);
    > }
    > else
    > {
    > printf("IN: %s @%d Locale Set = [%s]\n", __FILE__, __LINE__, t);
    > }
    >}
    >
    >/* Contents of german.txt: Megaupload-Gründer Schmitz gegen Kaution auf
    >freiem Fuß. News */
    >void getGermanLanguageFromFile(wchar_t arr[], const size_t len)
    >{
    > const char* filename = "german.txt";
    > FILE* fp;
    > wchar_t* retp;
    >
    > fp = fopen(filename,"r");
    > if(NULL == fp)
    > {
    > printf("IN:%s @%d: ERROR [%s]Can not open (%s)\n",__FILE__,
    >__LINE__, strerror(errno), filename);
    > return;
    > }
    >
    > errno = 0;
    > retp = fgetws(arr, len, fp);
    >
    > if(NULL == retp)
    > {
    > if(feof(fp))
    > {
    > printf("IN: %s @%d Reading Error: [%s]\n", __FILE__, __LINE__,
    >strerror(errno));
    > }
    > else
    > {
    > printf("IN: %s @%d Reading Error: ERRNO = %d\n", __FILE__,
    >__LINE__, errno);
    > }
    > exit(EXIT_FAILURE);
    > }
    >}
    >
    >
    >
    >
    >size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len)
    >{
    > int ret = VAL_ERR;
    >
    > if(NULL == ws || NULL == s)
    > {
    > ret = VAL_ERR;
    > }
    > else
    > {
    > size_t r;
    > errno = 0;
    > r = wcstombs(*s, ws, len);


    wcstombs does not convert wide to char. It converts wide to
    multi-byte. Multi-byte characters can occupy one or TWO bytes. At
    least two of the wide characters you read in from the file (ü and ß)
    appear to require two bytes. Consequently, at least two of the last
    characters from the original message will not fit in the first len
    characters pointed to by *s.

    > if(0 >= r)


    r is a size_t and therefore can never be negative.

    > {
    > if(EILSEQ == errno)
    > {
    > ret = ERR_ENC;
    > }
    > else if(0 == r)
    > {
    > printf("IN: %s @%d ERROR: ZERO bytes converted = %zu\n",
    >__FILE__, __LINE__, r);
    > ret = VAL_ERR;
    > }
    > else if(errno)
    > {
    > ret = ERR_ERRNO_UNKNOWN;
    > }
    > }
    > else
    > {
    > printf("IN: %s @%d bytes converted = %zu\n", __FILE__,
    >__LINE__, r);
    > ret = r;
    > }
    > }
    >
    > return ret;
    >}
    >
    >==================== OUTPUT ==============================
    >[arnuld@dune C]$ gcc -std=c99 -pedantic -Wall -Wextra convert.c
    >[arnuld@dune C]$ ./a.out
    >IN: convert.c @82 Locale Set = [en_US.utf8]
    >Contents = {Megaupload-Gründer Schmitz gegen Kaution auf freiem Fuß. News}
    >
    >plen = 1
    >IN: convert.c @151 bytes converted = 1
    >p = [M]
    >
    >W = 61, Char = 1
    >[arnuld@dune C]$
    >
    >
    >
    >
    >
    >I searched archives and came across this piece of code where poster calls
    >wcstombs() 2 times, first to calculate characters (using NULL argument)
    >and then to really do the conversion. I wonder if that is the way wcstombs
    >() was supposed to use (because it works while mine does not):
    >
    > size_t n = wcstombs(NULL, src, 0);
    > char *dst = malloc(n + 1);
    > if(dst == NULL)
    > {
    > fprintf(stderr, "memory allocation failed\n");
    > return NULL;
    > }
    > if(wcstombs(dst, src, n + 1) != n)
    > {
    > fprintf(stderr, "conversion failed\n");
    > free(dst);
    > return NULL;
    > }


    --
    Remove del for email
    Barry Schwarz, Feb 23, 2012
    #4
  5. arnuld <> writes:

    >> On Thu, 23 Feb 2012 05:35:41 +0000, arnuld wrote:

    >
    >> ... SNIP..
    >> int main(void)
    >> {
    >> int ret = 0;
    >> wchar_t contents[SIZE_INPUT+1] = {0}; char* p = NULL;
    >> size_t plen = 0;
    >>
    >> setLocale("en_US.utf8");
    >> getGermanLanguageFromFile(contents, SIZE_INPUT); printf("Contents =
    >> {%ls}\n\n", contents);
    >>
    >>
    >> plen = ret + 1;

    >
    > was stupid enough to do that, changing it to
    >
    > plen = wcslen(contents) + 1;
    >
    > does the conversion but still it misses some last characters, any idea
    > why ?


    Barry Schwarz has answered this, as have you! The "mbs" at the end of
    wcstombs stands for "multi-byte string". That means that some
    characters need more than once byte to be encoded so the buffer size
    needed is rarely wcslen(contents) + 1.

    In your original post, you said that you'd seen code that calls wcstombs
    twice -- once to get the length and again to do the conversion and you
    asked "I wonder if that is the way wcstombs () was supposed to use
    (because it works while mine does not)". The answer is "yes".

    --
    Ben.
    Ben Bacarisse, Feb 23, 2012
    #5
  6. arnuld

    arnuld Guest

    > On Thu, 23 Feb 2012 12:34:29 +0000, Ben Bacarisse wrote:

    > Barry Schwarz has answered this, as have you! The "mbs" at the end of
    > wcstombs stands for "multi-byte string". That means that some
    > characters need more than once byte to be encoded so the buffer size
    > needed is rarely wcslen(contents) + 1.


    I worked for 2 days to understand this. Wrote several examples, read
    several examples, read archives. Since H&S5 does not mention this, which
    meant it must be easily understandable. I feel like 'still a kid in C
    programming'.


    > In your original post, you said that you'd seen code that calls wcstombs
    > twice -- once to get the length and again to do the conversion and you
    > asked "I wonder if that is the way wcstombs () was supposed to use
    > (because it works while mine does not)". The answer is "yes".


    What about mblen() to calculate length as alternative ? I tried it with
    this code but it does not work:

    #include <stdio.h>
    #include <stdlib.h>
    #include <wchar.h>
    #include <errno.h>
    #include <string.h>
    #include <locale.h>

    enum {
    VAL_SUCC = 0,
    VAL_ERR = -1,
    ERR_ENC = -101,
    ERR_ERRNO_UNKNOWN = -102,
    SIZE_NAME = 10,
    SIZE_INPUT = 1000,
    SIZE_READ = 1};


    void mySetLocale(const char* t);
    size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len);
    void get_InternationlText_from_file(wchar_t arr[], const size_t len);
    char* myMalloc(const size_t len);

    int main(void)
    {
    wchar_t contents[SIZE_INPUT+1] = {0};
    char arr[SIZE_INPUT+1] = {0};
    size_t wlen = 0;
    int len = 0;

    mySetLocale("en_US.utf8");
    get_InternationlText_from_file(contents, SIZE_INPUT);
    printf("Contents = {%ls}\n\n", contents);

    wlen = wcslen(contents);
    len = mblen(arr, SIZE_INPUT+1);

    printf("IN: %s @%d: wlen = %zu, len = %d\n", __FILE__, __LINE__, wlen,
    len);

    return 0;
    }



    void mySetLocale(const char* t)
    {
    if(NULL == setlocale(LC_CTYPE, t))
    {
    printf("IN: %s @%d ERROR: can not set locale [%s]\n", __FILE__,
    __LINE__, t);
    exit(EXIT_FAILURE);
    }
    else
    {
    printf("IN: %s @%d Locale Set = [%s]\n", __FILE__, __LINE__, t);
    }
    }


    void get_InternationlText_from_file(wchar_t arr[], const size_t len)
    {
    const char* filename = "german.txt";
    FILE* fp;
    wchar_t* retp;

    fp = fopen(filename,"r");
    if(NULL == fp)
    {
    printf("IN:%s @%d: ERROR [%s]Can not open (%s)\n",__FILE__,
    __LINE__, strerror(errno), filename);
    return;
    }

    errno = 0;
    retp = fgetws(arr, len, fp);

    if(NULL == retp)
    {
    if(feof(fp))
    {
    printf("IN: %s @%d Reading Error: [%s]\n", __FILE__, __LINE__,
    strerror(errno));
    }
    else
    {
    printf("IN: %s @%d Reading Error: ERRNO = %d\n", __FILE__,
    __LINE__, errno);
    }
    exit(EXIT_FAILURE);
    }
    }




    size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len)
    {
    size_t ret = VAL_ERR;

    if(NULL == ws || NULL == s)
    {
    ret = VAL_ERR;
    }
    else
    {
    size_t r;
    errno = 0;
    r = wcstombs(*s, ws, len);

    if(0 >= r)
    {
    if(EILSEQ == errno)
    {
    ret = ERR_ENC;
    }
    else if(0 == r)
    {
    printf("IN: %s @%d ERROR bytes converted = %zu\n",
    __FILE__, __LINE__, r);
    ret = VAL_ERR;
    }
    else if(errno)
    {
    ret = ERR_ERRNO_UNKNOWN;
    }
    }
    else
    {
    printf("IN: %s @%d bytes converted = %zu\n", __FILE__,
    __LINE__, r);
    ret = r;
    }
    }

    return ret;
    }

    ==================== OUTPUT ============================
    [arnuld@dune C]$ gcc -std=c99 -pedantic -Wall -Wextra mblen.c
    [arnuld@dune C]$ ./a.out
    IN: mblen.c @53 Locale Set = [en_US.utf8]
    Contents = {Megaupload-Gründer Schmitz gegen Kaution auf freiem Fuß. News}

    IN: mblen.c @37: wlen = 61, len = 0
    [arnuld@dune C]$



    --
    arnuld
    http://LispMachine.Wordpress.com
    arnuld, Feb 24, 2012
    #6
  7. arnuld

    arnuld Guest

    >> arnuld wrote:
    > On Thu, 23 Feb 2012 00:50:26 -0800, Barry Schwarz wrote:


    > WStr2CStr returns a size_t which is unsigned.
    >
    >> if( ret <= 0)

    >
    > Therefore, ret can never be negative.


    I know that an dyou have explained it very well. Problem is how do I
    compete with statement from section 16.11.2 from H&S5:

    "The function returns the number of characters written to s, not
    counting the terminating null character(if any). If a conversion error
    occurs, the function returns -1 (cast to size_t)"


    or it returns the value equal to "the value of -1 catst-ed to size_t ?
    (which is 4294967295 on mu machine)





    --
    arnuld
    http://LispMachine.Wordpress.com
    arnuld, Feb 24, 2012
    #7
  8. arnuld

    Ike Naar Guest

    On 2012-02-24, arnuld <> wrote:
    >> In your original post, you said that you'd seen code that calls wcstombs
    >> twice -- once to get the length and again to do the conversion and you
    >> asked "I wonder if that is the way wcstombs () was supposed to use
    >> (because it works while mine does not)". The answer is "yes".

    >
    > What about mblen() to calculate length as alternative ? I tried it with
    > this code but it does not work:
    >
    > [snip]
    >
    > int main(void)
    > {
    > wchar_t contents[SIZE_INPUT+1] = {0};
    > char arr[SIZE_INPUT+1] = {0};
    > size_t wlen = 0;
    > int len = 0;
    >
    > mySetLocale("en_US.utf8");
    > get_InternationlText_from_file(contents, SIZE_INPUT);
    > printf("Contents = {%ls}\n\n", contents);
    >
    > wlen = wcslen(contents);
    > len = mblen(arr, SIZE_INPUT+1);


    It seems there is something missing from this code.
    As it is written, mblen is applied to arr which contains all zeroes.

    > printf("IN: %s @%d: wlen = %zu, len = %d\n", __FILE__, __LINE__, wlen,
    > len);
    >
    > return 0;
    > }
    Ike Naar, Feb 24, 2012
    #8
  9. arnuld

    Nobody Guest

    On Fri, 24 Feb 2012 05:32:10 +0000, arnuld wrote:

    > What about mblen() to calculate length as alternative ?


    mblen() requires that you have already converted the string to multi-byte
    representation.

    The "correct" answer is the one you noted in your original post: call
    wcstombs() with NULL as the first argument to calculate the length of the
    resulting multi-byte string. That feature was added for this specific
    purpose.
    Nobody, Feb 24, 2012
    #9
  10. arnuld <> writes:

    >>> arnuld wrote:

    >> On Thu, 23 Feb 2012 00:50:26 -0800, Barry Schwarz wrote:

    >
    >> WStr2CStr returns a size_t which is unsigned.
    >>
    >>> if( ret <= 0)

    >>
    >> Therefore, ret can never be negative.

    >
    > I know that an dyou have explained it very well. Problem is how do I
    > compete with statement from section 16.11.2 from H&S5:
    >
    > "The function returns the number of characters written to s, not
    > counting the terminating null character(if any). If a conversion error
    > occurs, the function returns -1 (cast to size_t)"


    You compare the return value with -1 cast to size_t:

    if (ret == (size_t)-1)

    <snip>
    --
    Ben.
    Ben Bacarisse, Feb 24, 2012
    #10
  11. arnuld

    arnuld Guest

    > On Fri, 24 Feb 2012 11:51:18 +0000, Ben Bacarisse wrote:

    > You compare the return value with -1 cast to size_t:
    >
    > if (ret == (size_t)-1)


    So putting a negative value in size_t type results in UINT_MAX as result.
    Is this from ANSI standard ?

    That's from my machine:

    #include <stdio.h>
    #include <limits.h>

    int main(void)
    {
    printf("(size_t) - 1 = %u\n", (size_t) - 1);
    printf("UINT_MAX = %u\n", UINT_MAX);

    return 0;
    }

    =================== OUTPUT ========================
    [arnuld@dune C]$ ./a.out
    (size_t) - 1 = 4294967295
    UINT_MAX = 4294967295
    [arnuld@dune C]$








    --
    arnuld
    http://LispMachine.Wordpress.com
    arnuld, Feb 24, 2012
    #11
  12. arnuld

    arnuld Guest

    > On Fri, 24 Feb 2012 08:28:03 +0000, Ike Naar wrote:

    > It seems there is something missing from this code. As it is written,
    > mblen is applied to arr which contains all zeroes.



    okay, I applied to an array which has converted using wcstombs() but
    still mblen returns 1. Does that mean all those spanish and german
    characters are using only one byte in memory ?

    If its true, can I say this behavior is not portable and one must use
    wchar_t instead of char by looking at the result ?



    #include <stdio.h>
    #include <stdlib.h>
    #include <wchar.h>
    #include <errno.h>
    #include <string.h>
    #include <locale.h>

    enum {
    VAL_SUCC = 0,
    VAL_ERR = -1,
    ERR_ENC = -101,
    ERR_ERRNO_UNKNOWN = -102,
    SIZE_NAME = 10,
    SIZE_INPUT = 1000,
    SIZE_READ = 1};


    void mySetLocale(const char* t);
    size_t WStr2CStr(char **s, const wchar_t* ws, size_t* result);
    void get_InternationlText_from_file(wchar_t arr[], const size_t len);
    char* myMalloc(const size_t len);

    int main(void)
    {
    wchar_t contents[SIZE_INPUT+1] = {0};
    char* p = NULL;
    size_t sz = 0;
    int len = 0;
    int ret;

    mySetLocale("en_US.utf8");
    get_InternationlText_from_file(contents, SIZE_INPUT);
    printf("Contents = {%ls}\n\n", contents);

    ret = WStr2CStr(&p, contents, &sz);
    if(0 > ret)
    {
    printf("IN: %s @%d: ERROR in Conversion\n", __FILE__, __LINE__);
    exit(EXIT_FAILURE);
    }

    len = mblen(p, sz);
    printf("p = %s\n", p);
    printf("mblen() = %d\n", len);

    return 0;
    }


    void get_InternationlText_from_file(wchar_t arr[], const size_t len)
    {
    const char* filename = "german.txt";
    FILE* fp;
    wchar_t* retp;

    fp = fopen(filename,"r");
    if(NULL == fp)
    {
    printf("IN:%s @%d: ERROR [%s]Can not open (%s)\n",__FILE__,
    __LINE__, strerror(errno), filename);
    return;
    }

    errno = 0;
    retp = fgetws(arr, len, fp);

    if(NULL == retp)
    {
    if(feof(fp))
    {
    printf("IN: %s @%d Reading Error: [%s]\n", __FILE__, __LINE__,
    strerror(errno));
    }
    else
    {
    printf("IN: %s @%d Reading Error: ERRNO = %d\n", __FILE__,
    __LINE__, errno);
    }
    exit(EXIT_FAILURE);
    }
    }




    size_t WStr2CStr(char **s, const wchar_t* ws, size_t* result)
    {
    size_t ret = VAL_ERR;

    if(NULL == ws || NULL == s)
    {
    ret = VAL_ERR;
    }
    else
    {
    size_t r;
    size_t len;
    size_t wlen;

    wlen = wcslen(ws);
    len = wcstombs(NULL, ws, 0);
    printf("IN: %s @%d: wlen = %zu, len = %zu\n", __FILE__, __LINE__,
    wlen, len);

    if(0 >= len)
    {
    printf("IN: %s @%d ERROR calculating len for MBstring\n",
    __FILE__, __LINE__);
    exit(EXIT_FAILURE);
    }

    ++len; // for null character
    *s = myMalloc(len);

    errno = 0;
    r = wcstombs(*s, ws, len);

    if( (size_t)-1 == r)
    {
    if(EILSEQ == errno)
    {
    ret = ERR_ENC;
    }
    else if(0 == r)
    {
    printf("IN: %s @%d ERROR --> bytes converted = %zu\n",
    __FILE__, __LINE__, r);
    ret = VAL_ERR;
    }
    else if(errno)
    {
    ret = ERR_ERRNO_UNKNOWN;
    }
    }
    else
    {
    printf("IN: %s @%d bytes converted = %zu\n", __FILE__,
    __LINE__, r);
    *result = r;
    ret = VAL_SUCC;
    }
    }

    return ret;
    }




    void mySetLocale(const char* t)
    {
    if(NULL == setlocale(LC_CTYPE, t))
    {
    printf("IN: %s @%d ERROR: can not set locale [%s]\n", __FILE__,
    __LINE__, t);
    exit(EXIT_FAILURE);
    }
    else
    {
    printf("IN: %s @%d Locale Set = [%s]\n", __FILE__, __LINE__, t);
    }
    }



    char* myMalloc(const size_t len)
    {
    char* p = (char*) malloc(len * (sizeof *p));
    if(NULL == p)
    {
    printf("IN: %s @%d Out of Memory\n", __FILE__, __LINE__);
    exit(EXIT_FAILURE);
    }

    return p;
    }

    ======================== OUTPUT ==========================
    [arnuld@dune C]$ gcc -std=c99 -pedantic -Wall -Wextra wchar.c
    [arnuld@dune C]$ ./a.out
    IN: wchar.c @153 Locale Set = [en_US.utf8]
    Contents = {Megaupload-Gründer Schmitz gegen Kaution auf freiem Fuß.
    News. "Qué tan" "Qué tanto" = How¿Qué :::}

    IN: wchar.c @100: wlen = 98, len = 104
    IN: wchar.c @132 bytes converted = 104
    p = Megaupload-Gründer Schmitz gegen Kaution auf freiem Fuß. News. "Qué
    tan" "Qué tanto" = How¿Qué :::
    mblen() = 1
    [arnuld@dune C]$





    --
    arnuld
    http://LispMachine.Wordpress.com
    arnuld, Feb 24, 2012
    #12
  13. arnuld

    James Kuyper Guest

    On 02/24/2012 08:21 AM, arnuld wrote:
    >> On Fri, 24 Feb 2012 11:51:18 +0000, Ben Bacarisse wrote:

    >
    >> You compare the return value with -1 cast to size_t:
    >>
    >> if (ret == (size_t)-1)

    >
    > So putting a negative value in size_t type results in UINT_MAX as result.


    Not necessarily. It results in SIZE_MAX, which might or might not be the
    same as UINT_MAX (it's more likely to be the same as ULONG_MAX, on
    machines where those two are different).

    > Is this from ANSI standard ?


    Yes. See 6.3.1.3p2:
    > Otherwise, if the new type is unsigned, the value is converted by repeatedly adding or
    > subtracting one more than the maximum value that can be represented in the new type
    > until the value is in the range of the new type.


    -1 is not representable in any unsigned type. Adding one more than the
    maximum value representable in that type to -1 gives the maximum value
    representable in that type, in this case, SIZE_MAX.
    --
    James Kuyper
    James Kuyper, Feb 24, 2012
    #13
  14. On 24 Feb 2012 13:46:10 GMT, arnuld <> wrote:

    >> On Fri, 24 Feb 2012 08:28:03 +0000, Ike Naar wrote:

    >
    >> It seems there is something missing from this code. As it is written,
    >> mblen is applied to arr which contains all zeroes.

    >
    >
    >okay, I applied to an array which has converted using wcstombs() but
    >still mblen returns 1. Does that mean all those spanish and german
    >characters are using only one byte in memory ?


    Go back and read the description of mblen again. It does not return
    the length of a multibyte string as you seem to expect. The way you
    have coded it, it can return only -1, 0, 1, or 2, none of which seem
    relevant to your comment.

    >
    >If its true, can I say this behavior is not portable and one must use
    >wchar_t instead of char by looking at the result ?
    >
    >
    >
    >#include <stdio.h>
    >#include <stdlib.h>
    >#include <wchar.h>
    >#include <errno.h>
    >#include <string.h>
    >#include <locale.h>
    >
    >enum {
    > VAL_SUCC = 0,
    > VAL_ERR = -1,
    > ERR_ENC = -101,
    > ERR_ERRNO_UNKNOWN = -102,
    > SIZE_NAME = 10,
    > SIZE_INPUT = 1000,
    > SIZE_READ = 1};
    >
    >
    >void mySetLocale(const char* t);
    >size_t WStr2CStr(char **s, const wchar_t* ws, size_t* result);
    >void get_InternationlText_from_file(wchar_t arr[], const size_t len);
    >char* myMalloc(const size_t len);
    >
    >int main(void)
    >{
    > wchar_t contents[SIZE_INPUT+1] = {0};


    You have made contents one element larger that you will ever use. Go
    back and read the description of fgetws again.

    > char* p = NULL;
    > size_t sz = 0;
    > int len = 0;
    > int ret;
    >
    > mySetLocale("en_US.utf8");
    > get_InternationlText_from_file(contents, SIZE_INPUT);
    > printf("Contents = {%ls}\n\n", contents);
    >
    > ret = WStr2CStr(&p, contents, &sz);
    > if(0 > ret)


    You have already acknowledged that ret cannot be negative and Ben
    showed you what the correct comparison should be. If you are not
    interested in fixing the problems, why are you posting?

    Note that unless you change the type of ret, any error return from
    WStr2CStr will invoke bad behavior (either undefined or implementation
    defined depending on which C standard your implementation tries to
    support) and the comparison will always fail just as it does now.

    > {
    > printf("IN: %s @%d: ERROR in Conversion\n", __FILE__, __LINE__);
    > exit(EXIT_FAILURE);
    > }
    >
    > len = mblen(p, sz);
    > printf("p = %s\n", p);


    According to 7.19.6.1-8, footnote 246, the %s conversion specifier
    makes no special provision for multibyte characters. Since your text
    output below looks OK, I assume setting the locale (something I've
    never had to do) addresses this issue.

    > printf("mblen() = %d\n", len);
    >
    > return 0;
    >}
    >
    >
    >void get_InternationlText_from_file(wchar_t arr[], const size_t len)
    >{
    > const char* filename = "german.txt";
    > FILE* fp;
    > wchar_t* retp;
    >
    > fp = fopen(filename,"r");
    > if(NULL == fp)
    > {
    > printf("IN:%s @%d: ERROR [%s]Can not open (%s)\n",__FILE__,
    >__LINE__, strerror(errno), filename);
    > return;
    > }
    >
    > errno = 0;
    > retp = fgetws(arr, len, fp);
    >
    > if(NULL == retp)
    > {
    > if(feof(fp))
    > {
    > printf("IN: %s @%d Reading Error: [%s]\n", __FILE__, __LINE__,
    >strerror(errno));


    Attempting to read past end of file is not normally considered an I/O
    error. Usually it is a programming error or the file is damaged. Does
    this condition really set errno on your system?

    > }
    > else
    > {
    > printf("IN: %s @%d Reading Error: ERRNO = %d\n", __FILE__,
    >__LINE__, errno);


    I wonder why you didn't call strerror here since this is the real
    error situation.

    > }
    > exit(EXIT_FAILURE);
    > }
    >}
    >
    >
    >
    >
    >size_t WStr2CStr(char **s, const wchar_t* ws, size_t* result)
    >{
    > size_t ret = VAL_ERR;
    >
    > if(NULL == ws || NULL == s)
    > {
    > ret = VAL_ERR;


    Didn't you just do this two statements earlier?

    > }
    > else
    > {
    > size_t r;
    > size_t len;
    > size_t wlen;
    >
    > wlen = wcslen(ws);
    > len = wcstombs(NULL, ws, 0);
    > printf("IN: %s @%d: wlen = %zu, len = %zu\n", __FILE__, __LINE__,
    >wlen, len);


    Since all your code is in one file, wouldn't __func__ be a better
    choice than __FILE__?

    >
    > if(0 >= len)


    While len could be 0, this has the same problem noted previously in
    main.
    > {
    > printf("IN: %s @%d ERROR calculating len for MBstring\n",
    >__FILE__, __LINE__);
    > exit(EXIT_FAILURE);
    > }
    >
    > ++len; // for null character
    > *s = myMalloc(len);
    >
    > errno = 0;
    > r = wcstombs(*s, ws, len);
    >
    > if( (size_t)-1 == r)


    Since you previously "tested" wcstombs against ws when computing len,
    how can this if ever evaluate to true?

    > {
    > if(EILSEQ == errno)
    > {
    > ret = ERR_ENC;
    > }
    > else if(0 == r)


    Since r is already equal to (size_t)-1, it cannot be equal to 0. Are
    you missing a } before the else so that it would attach to the second
    previous if that test r instead of the immediate preceding one that
    tested errno? If so, remember to fix the indenting also.

    > {
    > printf("IN: %s @%d ERROR --> bytes converted = %zu\n",
    >__FILE__, __LINE__, r);
    > ret = VAL_ERR;
    > }
    > else if(errno)
    > {
    > ret = ERR_ERRNO_UNKNOWN;


    Wouldn't it be a good idea to tell the user which error occurred here?

    > }
    > }
    > else
    > {
    > printf("IN: %s @%d bytes converted = %zu\n", __FILE__,
    >__LINE__, r);
    > *result = r;
    > ret = VAL_SUCC;
    > }
    > }
    >
    > return ret;
    >}
    >
    >
    >
    >
    >void mySetLocale(const char* t)
    >{
    > if(NULL == setlocale(LC_CTYPE, t))
    > {
    > printf("IN: %s @%d ERROR: can not set locale [%s]\n", __FILE__,
    >__LINE__, t);
    > exit(EXIT_FAILURE);
    > }
    > else
    > {
    > printf("IN: %s @%d Locale Set = [%s]\n", __FILE__, __LINE__, t);
    > }
    >}
    >
    >
    >
    >char* myMalloc(const size_t len)
    >{
    > char* p = (char*) malloc(len * (sizeof *p));


    Why the superfluous cast?

    > if(NULL == p)
    > {
    > printf("IN: %s @%d Out of Memory\n", __FILE__, __LINE__);
    > exit(EXIT_FAILURE);
    > }
    >
    > return p;
    >}
    >
    >======================== OUTPUT ==========================
    >[arnuld@dune C]$ gcc -std=c99 -pedantic -Wall -Wextra wchar.c
    >[arnuld@dune C]$ ./a.out
    >IN: wchar.c @153 Locale Set = [en_US.utf8]
    >Contents = {Megaupload-Gründer Schmitz gegen Kaution auf freiem Fuß.
    >News. "Qué tan" "Qué tanto" = How¿Qué :::}
    >
    >IN: wchar.c @100: wlen = 98, len = 104
    >IN: wchar.c @132 bytes converted = 104
    >p = Megaupload-Gründer Schmitz gegen Kaution auf freiem Fuß. News. "Qué
    >tan" "Qué tanto" = How¿Qué :::
    >mblen() = 1


    The first multibyte character in p ('M') requires only one byte.

    >[arnuld@dune C]$


    --
    Remove del for email
    Barry Schwarz, Feb 24, 2012
    #14
  15. arnuld

    Geoff Guest

    On 24 Feb 2012 13:21:54 GMT, arnuld <> wrote:

    >> On Fri, 24 Feb 2012 11:51:18 +0000, Ben Bacarisse wrote:

    >
    >> You compare the return value with -1 cast to size_t:
    >>
    >> if (ret == (size_t)-1)

    >
    >So putting a negative value in size_t type results in UINT_MAX as result.


    No. It results in SIZE_MAX.

    >Is this from ANSI standard ?


    Yes.

    >
    >That's from my machine:
    >

    #include <stdio.h>
    #include <limits.h>

    int main(void)
    {
    printf("(size_t) - 1 = %u\n", (size_t) - 1);
    printf("SIZE_MAX = %u\n", SIZE_MAX);

    return 0;
    }
    Geoff, Feb 24, 2012
    #15
  16. arnuld <> writes:
    >> On Fri, 24 Feb 2012 11:51:18 +0000, Ben Bacarisse wrote:
    >> You compare the return value with -1 cast to size_t:
    >>
    >> if (ret == (size_t)-1)

    >
    > So putting a negative value in size_t type results in UINT_MAX as result.
    > Is this from ANSI standard ?


    Not any negative value. Converting -1 to size_t results in SIZE_MAX,
    which may or may not be numerically equal to UINT_MAX.

    And the C standard has been published by ISO, not ANSI, since 1990.
    ANSI adopts each ISO standard, but it's clearer to refer to it as the
    ISO standard. It also avoids the common confusion of using the term
    "ANSI C" to refer to the 1989 version of the language rather than the
    current one.

    > That's from my machine:
    >
    > #include <stdio.h>
    > #include <limits.h>
    >
    > int main(void)
    > {
    > printf("(size_t) - 1 = %u\n", (size_t) - 1);


    Don't use "%u" to print a size_t value. If your implementation supports
    it, use "%zu", which is specifically for size_t. If not, you can use
    "%lu" with a cast:

    printf("(size_t)-1 = %lu\n", (unsigned long)((size_t)-1));

    And it's clearer without the space between "-" and "1". With the space,
    it looks like a subtraction.

    [snip]

    --
    Keith Thompson (The_Other_Keith) <http://www.ghoti.net/~kst>
    Will write code for food.
    "We must do something. This is something. Therefore, we must do this."
    -- Antony Jay and Jonathan Lynn, "Yes Minister"
    Keith Thompson, Feb 24, 2012
    #16
  17. On Fri, 24 Feb 2012 10:20:41 -0800, Geoff <>
    wrote:

    >On 24 Feb 2012 13:21:54 GMT, arnuld <> wrote:
    >
    >>> On Fri, 24 Feb 2012 11:51:18 +0000, Ben Bacarisse wrote:

    >>
    >>> You compare the return value with -1 cast to size_t:
    >>>
    >>> if (ret == (size_t)-1)

    >>
    >>So putting a negative value in size_t type results in UINT_MAX as result.

    >
    >No. It results in SIZE_MAX.
    >
    >>Is this from ANSI standard ?

    >
    >Yes.
    >
    >>
    >>That's from my machine:
    >>

    >#include <stdio.h>
    >#include <limits.h>
    >
    >int main(void)
    >{
    > printf("(size_t) - 1 = %u\n", (size_t) - 1);


    size_t need not be unsigned int. It could be unsigned long or larger.
    If your system doesn't support %zu, the it would be better to use %lu
    and cast both values to unsigned long.

    > printf("SIZE_MAX = %u\n", SIZE_MAX);
    >
    > return 0;
    >}


    --
    Remove del for email
    Barry Schwarz, Feb 24, 2012
    #17
    1. Advertising

Want to reply to this thread or ask your own question?

It takes just 2 minutes to sign up (and it's free!). Just click the sign up button to choose a username and then you can ask your own questions on the forum.
Similar Threads
  1. Andreas Suurkuusk
    Replies:
    0
    Views:
    3,950
    Andreas Suurkuusk
    Jul 27, 2003
  2. Ted Miller
    Replies:
    0
    Views:
    5,136
    Ted Miller
    Sep 13, 2003
  3. Merek
    Replies:
    0
    Views:
    1,943
    Merek
    Dec 3, 2003
  4. Scott Meddows
    Replies:
    1
    Views:
    379
    John Saunders
    Jun 8, 2004
  5. Mike

    Problem problem problem :( Need Help

    Mike, May 7, 2004, in forum: ASP General
    Replies:
    2
    Views:
    532
    Bullschmidt
    May 11, 2004
Loading...

Share This Page