wcstombs() problem

arnuld · Feb 23, 2012

AIM: To convert a wide-character string into a character string
PROBLEM: (1) checking return value or errno.
(2) conversion just does not happen.

#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <errno.h>
#include <string.h>
#include <locale.h>
#include <limits.h>

#ifndef __STDC_ISO_10646__
#define __STDC_ISO_10646__
#endif

enum {
VAL_SUCC = 0,
VAL_ERR = -1,
ERR_ENC = -101,
ERR_ERRNO_UNKNOWN = -102,
SIZE_INPUT = 1000
};

void setLocale(const char* t);
size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len);
void getGermanLanguageFromFile(wchar_t arr[], const size_t len);
char* myMalloc(const size_t len);

int main(void)
{
int ret = 0;
wchar_t contents[SIZE_INPUT+1] = {0};
char* p = NULL;
size_t plen = 0;

setLocale("en_US.utf8");
getGermanLanguageFromFile(contents, SIZE_INPUT);
printf("Contents = {%ls}\n\n", contents);

plen = ret + 1;
printf("plen = %d\n", plen);
p = myMalloc(plen);
ret = WStr2CStr(&p, contents, plen);

if( ret <= 0)
{
printf("IN: %s @%d ERROR converting to characters: ERRNO = %d\n",
__FILE__, __LINE__, ret);
exit(EXIT_FAILURE);
}

printf("p = [%s]\n\n", p);
printf("W = %zu, Char = %zu\n", wcslen(contents), strlen(p));

free(p);

return 0;
}

char* myMalloc(const size_t len)
{
char* p = malloc(len * (sizeof *p));
if(NULL == p)
{
printf("IN: %s @%d Out of Memory\n", __FILE__, __LINE__);
exit(EXIT_FAILURE);
}

return p;
}

void setLocale(const char* t)
{
if(NULL == setlocale(LC_CTYPE, t))
{
printf("IN: %s @%d ERROR: can not set locale [%s]\n", __FILE__,
__LINE__, t);
exit(EXIT_FAILURE);
}
else
{
printf("IN: %s @%d Locale Set = [%s]\n", __FILE__, __LINE__, t);
}
}

/* Contents of german.txt: Megaupload-GrÃ¼nder Schmitz gegen Kaution auf
freiem FuÃŸ. News */
void getGermanLanguageFromFile(wchar_t arr[], const size_t len)
{
const char* filename = "german.txt";
FILE* fp;
wchar_t* retp;

fp = fopen(filename,"r");
if(NULL == fp)
{
printf("IN:%s @%d: ERROR [%s]Can not open (%s)\n",__FILE__,
__LINE__, strerror(errno), filename);
return;
}

errno = 0;
retp = fgetws(arr, len, fp);

if(NULL == retp)
{
if(feof(fp))
{
printf("IN: %s @%d Reading Error: [%s]\n", __FILE__, __LINE__,
strerror(errno));
}
else
{
printf("IN: %s @%d Reading Error: ERRNO = %d\n", __FILE__,
__LINE__, errno);
}
exit(EXIT_FAILURE);
}
}

size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len)
{
int ret = VAL_ERR;

if(NULL == ws || NULL == s)
{
ret = VAL_ERR;
}
else
{
size_t r;
errno = 0;
r = wcstombs(*s, ws, len);
if(0 >= r)
{
if(EILSEQ == errno)
{
ret = ERR_ENC;
}
else if(0 == r)
{
printf("IN: %s @%d ERROR: ZERO bytes converted = %zu\n",
__FILE__, __LINE__, r);
ret = VAL_ERR;
}
else if(errno)
{
ret = ERR_ERRNO_UNKNOWN;
}
}
else
{
printf("IN: %s @%d bytes converted = %zu\n", __FILE__,
__LINE__, r);
ret = r;
}
}

return ret;
}

==================== OUTPUT ==============================
[arnuld@dune C]$ gcc -std=c99 -pedantic -Wall -Wextra convert.c
[arnuld@dune C]$ ./a.out
IN: convert.c @82 Locale Set = [en_US.utf8]
Contents = {Megaupload-GrÃ¼nder Schmitz gegen Kaution auf freiem FuÃŸ. News}

plen = 1
IN: convert.c @151 bytes converted = 1
p = [M]

W = 61, Char = 1
[arnuld@dune C]$

I searched archives and came across this piece of code where poster calls
wcstombs() 2 times, first to calculate characters (using NULL argument)
and then to really do the conversion. I wonder if that is the way wcstombs
() was supposed to use (because it works while mine does not):

size_t n = wcstombs(NULL, src, 0);
char *dst = malloc(n + 1);
if(dst == NULL)
{
fprintf(stderr, "memory allocation failed\n");
return NULL;
}
if(wcstombs(dst, src, n + 1) != n)
{
fprintf(stderr, "conversion failed\n");
free(dst);
return NULL;
}

arnuld · Feb 23, 2012

... SNIP..
int main(void)
{
int ret = 0;
wchar_t contents[SIZE_INPUT+1] = {0}; char* p = NULL;
size_t plen = 0;

setLocale("en_US.utf8");
getGermanLanguageFromFile(contents, SIZE_INPUT); printf("Contents =
{%ls}\n\n", contents);

plen = ret + 1;

was stupid enough to do that, changing it to

plen = wcslen(contents) + 1;

does the conversion but still it misses some last characters, any idea
why ?

Keith Thompson · Feb 23, 2012

arnuld said:
#ifndef __STDC_ISO_10646__
#define __STDC_ISO_10646__
#endif

__STDC_ISO_10646__ is conditionally defined by the implementation.
Defining it yourself won't give you the desired semantics.

Barry Schwarz · Feb 23, 2012

AIM: To convert a wide-character string into a character string

Why do you think this is possible?

PROBLEM: (1) checking return value or errno.
(2) conversion just does not happen.

#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <errno.h>
#include <string.h>
#include <locale.h>
#include <limits.h>

#ifndef __STDC_ISO_10646__
#define __STDC_ISO_10646__
#endif

enum {
VAL_SUCC = 0,
VAL_ERR = -1,
ERR_ENC = -101,
ERR_ERRNO_UNKNOWN = -102,
SIZE_INPUT = 1000
};

void setLocale(const char* t);
size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len);
void getGermanLanguageFromFile(wchar_t arr[], const size_t len);
char* myMalloc(const size_t len);

int main(void)
{
int ret = 0;
wchar_t contents[SIZE_INPUT+1] = {0};
char* p = NULL;
size_t plen = 0;

setLocale("en_US.utf8");
getGermanLanguageFromFile(contents, SIZE_INPUT);
printf("Contents = {%ls}\n\n", contents);

plen = ret + 1;

Fixed in follow-on message to
plen = wcslen(contents)+1;

printf("plen = %d\n", plen);
p = myMalloc(plen);
ret = WStr2CStr(&p, contents, plen);

WStr2CStr returns a size_t which is unsigned.

if( ret <= 0)

Therefore, ret can never be negative.

{
printf("IN: %s @%d ERROR converting to characters: ERRNO = %d\n",
__FILE__, __LINE__, ret);
exit(EXIT_FAILURE);
}

printf("p = [%s]\n\n", p);
printf("W = %zu, Char = %zu\n", wcslen(contents), strlen(p));

free(p);

return 0;
}

char* myMalloc(const size_t len)
{
char* p = malloc(len * (sizeof *p));
if(NULL == p)
{
printf("IN: %s @%d Out of Memory\n", __FILE__, __LINE__);
exit(EXIT_FAILURE);
}

return p;
}

void setLocale(const char* t)
{
if(NULL == setlocale(LC_CTYPE, t))
{
printf("IN: %s @%d ERROR: can not set locale [%s]\n", __FILE__,
__LINE__, t);
exit(EXIT_FAILURE);
}
else
{
printf("IN: %s @%d Locale Set = [%s]\n", __FILE__, __LINE__, t);
}
}

/* Contents of german.txt: Megaupload-Gründer Schmitz gegen Kaution auf
freiem Fuß. News */
void getGermanLanguageFromFile(wchar_t arr[], const size_t len)
{
const char* filename = "german.txt";
FILE* fp;
wchar_t* retp;

fp = fopen(filename,"r");
if(NULL == fp)
{
printf("IN:%s @%d: ERROR [%s]Can not open (%s)\n",__FILE__,
__LINE__, strerror(errno), filename);
return;
}

errno = 0;
retp = fgetws(arr, len, fp);

if(NULL == retp)
{
if(feof(fp))
{
printf("IN: %s @%d Reading Error: [%s]\n", __FILE__, __LINE__,
strerror(errno));
}
else
{
printf("IN: %s @%d Reading Error: ERRNO = %d\n", __FILE__,
__LINE__, errno);
}
exit(EXIT_FAILURE);
}
}

size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len)
{
int ret = VAL_ERR;

if(NULL == ws || NULL == s)
{
ret = VAL_ERR;
}
else
{
size_t r;
errno = 0;
r = wcstombs(*s, ws, len);

wcstombs does not convert wide to char. It converts wide to
multi-byte. Multi-byte characters can occupy one or TWO bytes. At
least two of the wide characters you read in from the file (ü and ß)
appear to require two bytes. Consequently, at least two of the last
characters from the original message will not fit in the first len
characters pointed to by *s.

if(0 >= r)

r is a size_t and therefore can never be negative.

{
if(EILSEQ == errno)
{
ret = ERR_ENC;
}
else if(0 == r)
{
printf("IN: %s @%d ERROR: ZERO bytes converted = %zu\n",
__FILE__, __LINE__, r);
ret = VAL_ERR;
}
else if(errno)
{
ret = ERR_ERRNO_UNKNOWN;
}
}
else
{
printf("IN: %s @%d bytes converted = %zu\n", __FILE__,
__LINE__, r);
ret = r;
}
}

return ret;
}

==================== OUTPUT ==============================
[arnuld@dune C]$ gcc -std=c99 -pedantic -Wall -Wextra convert.c
[arnuld@dune C]$ ./a.out
IN: convert.c @82 Locale Set = [en_US.utf8]
Contents = {Megaupload-Gründer Schmitz gegen Kaution auf freiem Fuß. News}

plen = 1
IN: convert.c @151 bytes converted = 1
p = [M]

W = 61, Char = 1
[arnuld@dune C]$

I searched archives and came across this piece of code where poster calls
wcstombs() 2 times, first to calculate characters (using NULL argument)
and then to really do the conversion. I wonder if that is the way wcstombs
() was supposed to use (because it works while mine does not):

size_t n = wcstombs(NULL, src, 0);
char *dst = malloc(n + 1);
if(dst == NULL)
{
fprintf(stderr, "memory allocation failed\n");
return NULL;
}
if(wcstombs(dst, src, n + 1) != n)
{
fprintf(stderr, "conversion failed\n");
free(dst);
return NULL;
}

Ben Bacarisse · Feb 23, 2012

arnuld said:
... SNIP..
int main(void)
{
int ret = 0;
wchar_t contents[SIZE_INPUT+1] = {0}; char* p = NULL;
size_t plen = 0;

setLocale("en_US.utf8");
getGermanLanguageFromFile(contents, SIZE_INPUT); printf("Contents =
{%ls}\n\n", contents);

plen = ret + 1;

Click to expand...

was stupid enough to do that, changing it to

plen = wcslen(contents) + 1;

does the conversion but still it misses some last characters, any idea
why ?

Barry Schwarz has answered this, as have you! The "mbs" at the end of
wcstombs stands for "multi-byte string". That means that some
characters need more than once byte to be encoded so the buffer size
needed is rarely wcslen(contents) + 1.

In your original post, you said that you'd seen code that calls wcstombs
twice -- once to get the length and again to do the conversion and you
asked "I wonder if that is the way wcstombs () was supposed to use
(because it works while mine does not)". The answer is "yes".

arnuld · Feb 24, 2012

Barry Schwarz has answered this, as have you! The "mbs" at the end of
wcstombs stands for "multi-byte string". That means that some
characters need more than once byte to be encoded so the buffer size
needed is rarely wcslen(contents) + 1.

I worked for 2 days to understand this. Wrote several examples, read
several examples, read archives. Since H&S5 does not mention this, which
meant it must be easily understandable. I feel like 'still a kid in C
programming'.

In your original post, you said that you'd seen code that calls wcstombs
twice -- once to get the length and again to do the conversion and you
asked "I wonder if that is the way wcstombs () was supposed to use
(because it works while mine does not)". The answer is "yes".

What about mblen() to calculate length as alternative ? I tried it with
this code but it does not work:

#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <errno.h>
#include <string.h>
#include <locale.h>

enum {
VAL_SUCC = 0,
VAL_ERR = -1,
ERR_ENC = -101,
ERR_ERRNO_UNKNOWN = -102,
SIZE_NAME = 10,
SIZE_INPUT = 1000,
SIZE_READ = 1};

void mySetLocale(const char* t);
size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len);
void get_InternationlText_from_file(wchar_t arr[], const size_t len);
char* myMalloc(const size_t len);

int main(void)
{
wchar_t contents[SIZE_INPUT+1] = {0};
char arr[SIZE_INPUT+1] = {0};
size_t wlen = 0;
int len = 0;

mySetLocale("en_US.utf8");
get_InternationlText_from_file(contents, SIZE_INPUT);
printf("Contents = {%ls}\n\n", contents);

wlen = wcslen(contents);
len = mblen(arr, SIZE_INPUT+1);

printf("IN: %s @%d: wlen = %zu, len = %d\n", __FILE__, __LINE__, wlen,
len);

return 0;
}

void mySetLocale(const char* t)
{
if(NULL == setlocale(LC_CTYPE, t))
{
printf("IN: %s @%d ERROR: can not set locale [%s]\n", __FILE__,
__LINE__, t);
exit(EXIT_FAILURE);
}
else
{
printf("IN: %s @%d Locale Set = [%s]\n", __FILE__, __LINE__, t);
}
}

void get_InternationlText_from_file(wchar_t arr[], const size_t len)
{
const char* filename = "german.txt";
FILE* fp;
wchar_t* retp;

fp = fopen(filename,"r");
if(NULL == fp)
{
printf("IN:%s @%d: ERROR [%s]Can not open (%s)\n",__FILE__,
__LINE__, strerror(errno), filename);
return;
}

errno = 0;
retp = fgetws(arr, len, fp);

if(NULL == retp)
{
if(feof(fp))
{
printf("IN: %s @%d Reading Error: [%s]\n", __FILE__, __LINE__,
strerror(errno));
}
else
{
printf("IN: %s @%d Reading Error: ERRNO = %d\n", __FILE__,
__LINE__, errno);
}
exit(EXIT_FAILURE);
}
}

size_t WStr2CStr(char **s, const wchar_t* ws, const size_t len)
{
size_t ret = VAL_ERR;

if(NULL == ws || NULL == s)
{
ret = VAL_ERR;
}
else
{
size_t r;
errno = 0;
r = wcstombs(*s, ws, len);

if(0 >= r)
{
if(EILSEQ == errno)
{
ret = ERR_ENC;
}
else if(0 == r)
{
printf("IN: %s @%d ERROR bytes converted = %zu\n",
__FILE__, __LINE__, r);
ret = VAL_ERR;
}
else if(errno)
{
ret = ERR_ERRNO_UNKNOWN;
}
}
else
{
printf("IN: %s @%d bytes converted = %zu\n", __FILE__,
__LINE__, r);
ret = r;
}
}

return ret;
}

==================== OUTPUT ============================
[arnuld@dune C]$ gcc -std=c99 -pedantic -Wall -Wextra mblen.c
[arnuld@dune C]$ ./a.out
IN: mblen.c @53 Locale Set = [en_US.utf8]
Contents = {Megaupload-GrÃ¼nder Schmitz gegen Kaution auf freiem FuÃŸ. News}

IN: mblen.c @37: wlen = 61, len = 0
[arnuld@dune C]$

arnuld · Feb 24, 2012

WStr2CStr returns a size_t which is unsigned.

Therefore, ret can never be negative.

I know that an dyou have explained it very well. Problem is how do I
compete with statement from section 16.11.2 from H&S5:

"The function returns the number of characters written to s, not
counting the terminating null character(if any). If a conversion error
occurs, the function returns -1 (cast to size_t)"

or it returns the value equal to "the value of -1 catst-ed to size_t ?
(which is 4294967295 on mu machine)

Ike Naar · Feb 24, 2012

In your original post, you said that you'd seen code that calls wcstombs
twice -- once to get the length and again to do the conversion and you
asked "I wonder if that is the way wcstombs () was supposed to use
(because it works while mine does not)". The answer is "yes".

Click to expand...

What about mblen() to calculate length as alternative ? I tried it with
this code but it does not work:

[snip]

int main(void)
{
wchar_t contents[SIZE_INPUT+1] = {0};
char arr[SIZE_INPUT+1] = {0};
size_t wlen = 0;
int len = 0;

mySetLocale("en_US.utf8");
get_InternationlText_from_file(contents, SIZE_INPUT);
printf("Contents = {%ls}\n\n", contents);

wlen = wcslen(contents);
len = mblen(arr, SIZE_INPUT+1);

It seems there is something missing from this code.
As it is written, mblen is applied to arr which contains all zeroes.

Nobody · Feb 24, 2012

What about mblen() to calculate length as alternative ?

mblen() requires that you have already converted the string to multi-byte
representation.

The "correct" answer is the one you noted in your original post: call
wcstombs() with NULL as the first argument to calculate the length of the
resulting multi-byte string. That feature was added for this specific
purpose.

Ben Bacarisse · Feb 24, 2012

arnuld said:
I know that an dyou have explained it very well. Problem is how do I
compete with statement from section 16.11.2 from H&S5:

"The function returns the number of characters written to s, not
counting the terminating null character(if any). If a conversion error
occurs, the function returns -1 (cast to size_t)"

You compare the return value with -1 cast to size_t:

if (ret == (size_t)-1)

<snip>

arnuld · Feb 24, 2012

You compare the return value with -1 cast to size_t:

if (ret == (size_t)-1)

So putting a negative value in size_t type results in UINT_MAX as result.
Is this from ANSI standard ?

That's from my machine:

#include <stdio.h>
#include <limits.h>

int main(void)
{
printf("(size_t) - 1 = %u\n", (size_t) - 1);
printf("UINT_MAX = %u\n", UINT_MAX);

return 0;
}

=================== OUTPUT ========================
[arnuld@dune C]$ ./a.out
(size_t) - 1 = 4294967295
UINT_MAX = 4294967295
[arnuld@dune C]$

arnuld · Feb 24, 2012

It seems there is something missing from this code. As it is written,
mblen is applied to arr which contains all zeroes.

okay, I applied to an array which has converted using wcstombs() but
still mblen returns 1. Does that mean all those spanish and german
characters are using only one byte in memory ?

If its true, can I say this behavior is not portable and one must use
wchar_t instead of char by looking at the result ?

#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <errno.h>
#include <string.h>
#include <locale.h>

enum {
VAL_SUCC = 0,
VAL_ERR = -1,
ERR_ENC = -101,
ERR_ERRNO_UNKNOWN = -102,
SIZE_NAME = 10,
SIZE_INPUT = 1000,
SIZE_READ = 1};

void mySetLocale(const char* t);
size_t WStr2CStr(char **s, const wchar_t* ws, size_t* result);
void get_InternationlText_from_file(wchar_t arr[], const size_t len);
char* myMalloc(const size_t len);

int main(void)
{
wchar_t contents[SIZE_INPUT+1] = {0};
char* p = NULL;
size_t sz = 0;
int len = 0;
int ret;

mySetLocale("en_US.utf8");
get_InternationlText_from_file(contents, SIZE_INPUT);
printf("Contents = {%ls}\n\n", contents);

ret = WStr2CStr(&p, contents, &sz);
if(0 > ret)
{
printf("IN: %s @%d: ERROR in Conversion\n", __FILE__, __LINE__);
exit(EXIT_FAILURE);
}

len = mblen(p, sz);
printf("p = %s\n", p);
printf("mblen() = %d\n", len);

return 0;
}

void get_InternationlText_from_file(wchar_t arr[], const size_t len)
{
const char* filename = "german.txt";
FILE* fp;
wchar_t* retp;

fp = fopen(filename,"r");
if(NULL == fp)
{
printf("IN:%s @%d: ERROR [%s]Can not open (%s)\n",__FILE__,
__LINE__, strerror(errno), filename);
return;
}

errno = 0;
retp = fgetws(arr, len, fp);

if(NULL == retp)
{
if(feof(fp))
{
printf("IN: %s @%d Reading Error: [%s]\n", __FILE__, __LINE__,
strerror(errno));
}
else
{
printf("IN: %s @%d Reading Error: ERRNO = %d\n", __FILE__,
__LINE__, errno);
}
exit(EXIT_FAILURE);
}
}

size_t WStr2CStr(char **s, const wchar_t* ws, size_t* result)
{
size_t ret = VAL_ERR;

if(NULL == ws || NULL == s)
{
ret = VAL_ERR;
}
else
{
size_t r;
size_t len;
size_t wlen;

wlen = wcslen(ws);
len = wcstombs(NULL, ws, 0);
printf("IN: %s @%d: wlen = %zu, len = %zu\n", __FILE__, __LINE__,
wlen, len);

if(0 >= len)
{
printf("IN: %s @%d ERROR calculating len for MBstring\n",
__FILE__, __LINE__);
exit(EXIT_FAILURE);
}

++len; // for null character
*s = myMalloc(len);

errno = 0;
r = wcstombs(*s, ws, len);

if( (size_t)-1 == r)
{
if(EILSEQ == errno)
{
ret = ERR_ENC;
}
else if(0 == r)
{
printf("IN: %s @%d ERROR --> bytes converted = %zu\n",
__FILE__, __LINE__, r);
ret = VAL_ERR;
}
else if(errno)
{
ret = ERR_ERRNO_UNKNOWN;
}
}
else
{
printf("IN: %s @%d bytes converted = %zu\n", __FILE__,
__LINE__, r);
*result = r;
ret = VAL_SUCC;
}
}

return ret;
}

void mySetLocale(const char* t)
{
if(NULL == setlocale(LC_CTYPE, t))
{
printf("IN: %s @%d ERROR: can not set locale [%s]\n", __FILE__,
__LINE__, t);
exit(EXIT_FAILURE);
}
else
{
printf("IN: %s @%d Locale Set = [%s]\n", __FILE__, __LINE__, t);
}
}

char* myMalloc(const size_t len)
{
char* p = (char*) malloc(len * (sizeof *p));
if(NULL == p)
{
printf("IN: %s @%d Out of Memory\n", __FILE__, __LINE__);
exit(EXIT_FAILURE);
}

return p;
}

======================== OUTPUT ==========================
[arnuld@dune C]$ gcc -std=c99 -pedantic -Wall -Wextra wchar.c
[arnuld@dune C]$ ./a.out
IN: wchar.c @153 Locale Set = [en_US.utf8]
Contents = {Megaupload-GrÃ¼nder Schmitz gegen Kaution auf freiem FuÃŸ.
News. "QuÃ© tan" "QuÃ© tanto" = HowÂ¿QuÃ© :::}

IN: wchar.c @100: wlen = 98, len = 104
IN: wchar.c @132 bytes converted = 104
p = Megaupload-GrÃ¼nder Schmitz gegen Kaution auf freiem FuÃŸ. News. "QuÃ©
tan" "QuÃ© tanto" = HowÂ¿QuÃ© :::
mblen() = 1
[arnuld@dune C]$

James Kuyper · Feb 24, 2012

So putting a negative value in size_t type results in UINT_MAX as result.

Not necessarily. It results in SIZE_MAX, which might or might not be the
same as UINT_MAX (it's more likely to be the same as ULONG_MAX, on
machines where those two are different).

Is this from ANSI standard ?

Yes. See 6.3.1.3p2:

Otherwise, if the new type is unsigned, the value is converted by repeatedly adding or
subtracting one more than the maximum value that can be represented in the new type
until the value is in the range of the new type.

-1 is not representable in any unsigned type. Adding one more than the
maximum value representable in that type to -1 gives the maximum value
representable in that type, in this case, SIZE_MAX.

Barry Schwarz · Feb 24, 2012

okay, I applied to an array which has converted using wcstombs() but
still mblen returns 1. Does that mean all those spanish and german
characters are using only one byte in memory ?

Go back and read the description of mblen again. It does not return
the length of a multibyte string as you seem to expect. The way you
have coded it, it can return only -1, 0, 1, or 2, none of which seem
relevant to your comment.

If its true, can I say this behavior is not portable and one must use
wchar_t instead of char by looking at the result ?

#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <errno.h>
#include <string.h>
#include <locale.h>

enum {
VAL_SUCC = 0,
VAL_ERR = -1,
ERR_ENC = -101,
ERR_ERRNO_UNKNOWN = -102,
SIZE_NAME = 10,
SIZE_INPUT = 1000,
SIZE_READ = 1};

void mySetLocale(const char* t);
size_t WStr2CStr(char **s, const wchar_t* ws, size_t* result);
void get_InternationlText_from_file(wchar_t arr[], const size_t len);
char* myMalloc(const size_t len);

int main(void)
{
wchar_t contents[SIZE_INPUT+1] = {0};

You have made contents one element larger that you will ever use. Go
back and read the description of fgetws again.

char* p = NULL;
size_t sz = 0;
int len = 0;
int ret;

mySetLocale("en_US.utf8");
get_InternationlText_from_file(contents, SIZE_INPUT);
printf("Contents = {%ls}\n\n", contents);

ret = WStr2CStr(&p, contents, &sz);
if(0 > ret)

You have already acknowledged that ret cannot be negative and Ben
showed you what the correct comparison should be. If you are not
interested in fixing the problems, why are you posting?

Note that unless you change the type of ret, any error return from
WStr2CStr will invoke bad behavior (either undefined or implementation
defined depending on which C standard your implementation tries to
support) and the comparison will always fail just as it does now.

{
printf("IN: %s @%d: ERROR in Conversion\n", __FILE__, __LINE__);
exit(EXIT_FAILURE);
}

len = mblen(p, sz);
printf("p = %s\n", p);

According to 7.19.6.1-8, footnote 246, the %s conversion specifier
makes no special provision for multibyte characters. Since your text
output below looks OK, I assume setting the locale (something I've
never had to do) addresses this issue.

printf("mblen() = %d\n", len);

return 0;
}

void get_InternationlText_from_file(wchar_t arr[], const size_t len)
{
const char* filename = "german.txt";
FILE* fp;
wchar_t* retp;

fp = fopen(filename,"r");
if(NULL == fp)
{
printf("IN:%s @%d: ERROR [%s]Can not open (%s)\n",__FILE__,
__LINE__, strerror(errno), filename);
return;
}

errno = 0;
retp = fgetws(arr, len, fp);

if(NULL == retp)
{
if(feof(fp))
{
printf("IN: %s @%d Reading Error: [%s]\n", __FILE__, __LINE__,
strerror(errno));

Attempting to read past end of file is not normally considered an I/O
error. Usually it is a programming error or the file is damaged. Does
this condition really set errno on your system?

}
else
{
printf("IN: %s @%d Reading Error: ERRNO = %d\n", __FILE__,
__LINE__, errno);

I wonder why you didn't call strerror here since this is the real
error situation.

}
exit(EXIT_FAILURE);
}
}

size_t WStr2CStr(char **s, const wchar_t* ws, size_t* result)
{
size_t ret = VAL_ERR;

if(NULL == ws || NULL == s)
{
ret = VAL_ERR;

Didn't you just do this two statements earlier?

}
else
{
size_t r;
size_t len;
size_t wlen;

wlen = wcslen(ws);
len = wcstombs(NULL, ws, 0);
printf("IN: %s @%d: wlen = %zu, len = %zu\n", __FILE__, __LINE__,
wlen, len);

Since all your code is in one file, wouldn't __func__ be a better
choice than __FILE__?

if(0 >= len)

While len could be 0, this has the same problem noted previously in
main.

{
printf("IN: %s @%d ERROR calculating len for MBstring\n",
__FILE__, __LINE__);
exit(EXIT_FAILURE);
}

++len; // for null character
*s = myMalloc(len);

errno = 0;
r = wcstombs(*s, ws, len);

if( (size_t)-1 == r)

Since you previously "tested" wcstombs against ws when computing len,
how can this if ever evaluate to true?

{
if(EILSEQ == errno)
{
ret = ERR_ENC;
}
else if(0 == r)

Since r is already equal to (size_t)-1, it cannot be equal to 0. Are
you missing a } before the else so that it would attach to the second
previous if that test r instead of the immediate preceding one that
tested errno? If so, remember to fix the indenting also.

{
printf("IN: %s @%d ERROR --> bytes converted = %zu\n",
__FILE__, __LINE__, r);
ret = VAL_ERR;
}
else if(errno)
{
ret = ERR_ERRNO_UNKNOWN;

Wouldn't it be a good idea to tell the user which error occurred here?

}
}
else
{
printf("IN: %s @%d bytes converted = %zu\n", __FILE__,
__LINE__, r);
*result = r;
ret = VAL_SUCC;
}
}

return ret;
}

void mySetLocale(const char* t)
{
if(NULL == setlocale(LC_CTYPE, t))
{
printf("IN: %s @%d ERROR: can not set locale [%s]\n", __FILE__,
__LINE__, t);
exit(EXIT_FAILURE);
}
else
{
printf("IN: %s @%d Locale Set = [%s]\n", __FILE__, __LINE__, t);
}
}

char* myMalloc(const size_t len)
{
char* p = (char*) malloc(len * (sizeof *p));

Why the superfluous cast?

if(NULL == p)
{
printf("IN: %s @%d Out of Memory\n", __FILE__, __LINE__);
exit(EXIT_FAILURE);
}

return p;
}

======================== OUTPUT ==========================
[arnuld@dune C]$ gcc -std=c99 -pedantic -Wall -Wextra wchar.c
[arnuld@dune C]$ ./a.out
IN: wchar.c @153 Locale Set = [en_US.utf8]
Contents = {Megaupload-Gründer Schmitz gegen Kaution auf freiem Fuß.
News. "Qué tan" "Qué tanto" = How¿Qué :::}

IN: wchar.c @100: wlen = 98, len = 104
IN: wchar.c @132 bytes converted = 104
p = Megaupload-Gründer Schmitz gegen Kaution auf freiem Fuß. News. "Qué
tan" "Qué tanto" = How¿Qué :::
mblen() = 1

The first multibyte character in p ('M') requires only one byte.

[arnuld@dune C]$

Geoff · Feb 24, 2012

So putting a negative value in size_t type results in UINT_MAX as result.

No. It results in SIZE_MAX.

Is this from ANSI standard ?
Yes.

That's from my machine:

#include <stdio.h>
#include <limits.h>

int main(void)
{
printf("(size_t) - 1 = %u\n", (size_t) - 1);
printf("SIZE_MAX = %u\n", SIZE_MAX);

return 0;
}

Keith Thompson · Feb 24, 2012

arnuld said:
So putting a negative value in size_t type results in UINT_MAX as result.
Is this from ANSI standard ?

Not any negative value. Converting -1 to size_t results in SIZE_MAX,
which may or may not be numerically equal to UINT_MAX.

And the C standard has been published by ISO, not ANSI, since 1990.
ANSI adopts each ISO standard, but it's clearer to refer to it as the
ISO standard. It also avoids the common confusion of using the term
"ANSI C" to refer to the 1989 version of the language rather than the
current one.

That's from my machine:

#include <stdio.h>
#include <limits.h>

int main(void)
{
printf("(size_t) - 1 = %u\n", (size_t) - 1);

Don't use "%u" to print a size_t value. If your implementation supports
it, use "%zu", which is specifically for size_t. If not, you can use
"%lu" with a cast:

printf("(size_t)-1 = %lu\n", (unsigned long)((size_t)-1));

And it's clearer without the space between "-" and "1". With the space,
it looks like a subtraction.

[snip]

Barry Schwarz · Feb 24, 2012

No. It results in SIZE_MAX.

#include <stdio.h>
#include <limits.h>

int main(void)
{
printf("(size_t) - 1 = %u\n", (size_t) - 1);

size_t need not be unsigned int. It could be unsigned long or larger.
If your system doesn't support %zu, the it would be better to use %lu
and cast both values to unsigned long.

compressing charatcers	35	Apr 2, 2014
Adding adressing of IPv6 to program	1	Feb 16, 2023
perror()4 says SUCCESS	10	Nov 22, 2011
Array implementation of Stack	80	Jun 16, 2011
Please help with C programming to save GPS reception data in Raspberry Pi.	0	Dec 8, 2022
Queue - can not delete last element	6	Jun 13, 2011
Fibonacci	0	May 13, 2023
string to int	5	Apr 11, 2012

wcstombs() problem

arnuld

arnuld

Keith Thompson

Barry Schwarz

Ben Bacarisse

arnuld

arnuld

Ike Naar

Nobody

Ben Bacarisse

arnuld

arnuld

James Kuyper

Barry Schwarz

Geoff

Keith Thompson

Barry Schwarz

Ask a Question

Similar Threads

Members online

Forum statistics

Latest Threads