Trim string

D

David RF

Hi, anybody knows a better way to (right-left) trim a string?

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/* Trim (right-left) and returns a new allocated string or NULL */
char *sstrip(const char *s);
char *sstrip(const char *s)
{
char *value = NULL;
size_t len;

if (s) {
while (*s && isspace(*s)) s++;
for (len = strlen(s); isspace(s[len - 1]); len--);
value = malloc(len + 1);
if (!value) {
fprintf(stderr, "%s\n", "Buy more RAM!!");
exit(0);
}
strncpy(value, s, len);
value[len] = '\0';
}
return value;
}

int main(void)
{
char *s;

s = sstrip("test");
printf("<%s>\n", s);
if (s) free(s);

s = sstrip(" test ");
printf("<%s>\n", s);
if (s) free(s);

s = sstrip("test ");
printf("<%s>\n", s);
if (s) free(s);

s = sstrip(" test");
printf("<%s>\n", s);
if (s) free(s);

s = sstrip("");
printf("<%s>\n", s);
if (s) free(s);

s = sstrip(NULL);
printf("<%s>\n", s);
if (s) free(s);

return 0;
}
 
D

David RF

Sorry, there was a bug when a string containing only spaces (" ")
was passed, a new version:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/* Trim left and right spaces returning a new allocated string or NULL
*/
char *sstrip(const char *s);
char *sstrip(const char *s)
{
char *value = NULL;
size_t len = 0;

if (s) {
while (*s && isspace(*s)) s++;
if (*s) for (len = strlen(s); isspace(s[len - 1]); len--);
value = malloc(len + 1);
if (value == NULL) {
fprintf(stderr, "%s\n", "Buy more RAM!!");
exit(0);
}
strncpy(value, s, len);
value[len] = '\0';
}
return value;
}

int main(void)
{
char *s;

s = sstrip("test");
printf("<%s>\n", s);
if (s) free(s);

s = sstrip(" test ");
printf("<%s>\n", s);
if (s) free(s);

s = sstrip("test ");
printf("<%s>\n", s);
if (s) free(s);

s = sstrip(" test");
printf("<%s>\n", s);
if (s) free(s);

s = sstrip("");
printf("<%s>\n", s);
if (s) free(s);

s = sstrip(" ");
printf("<%s>\n", s);
if (s) free(s);

s = sstrip(NULL);
printf("<%s>\n", s);
if (s) free(s);

return 0;
}
 
E

Eric Sosman

David said:
Sorry, there was a bug when a string containing only spaces (" ")
was passed, a new version:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/* Trim left and right spaces returning a new allocated string or NULL
*/
char *sstrip(const char *s);
char *sstrip(const char *s)
{
char *value = NULL;
size_t len = 0;

if (s) {
while (*s && isspace(*s)) s++;

You can simplify the test to just isspace(*s), since
isspace() produces false for the '\0' at the end of the string.

Actually, that's not quite right: The <ctype.h> functions
should not be applied to ordinary char values. On systems
where char is signed, a char value in a string could be negative
(this won't happen for characters in the "basic execution set"
like 'a' and '/' and '5', but can happen for "extended" glyphs
like 'ß', 'ç', 'ñ'). However, the <ctype.h> functions accept
only one negative argument: the value of the macro EOF. Feed
them a different negative value, and there's no telling what
will happen. So the test should actually be

isspace( (unsigned char)*s )

.... with similar modifications elsewhere.
if (*s) for (len = strlen(s); isspace(s[len - 1]); len--);
value = malloc(len + 1);
if (value == NULL) {
fprintf(stderr, "%s\n", "Buy more RAM!!");
exit(0);

Okay in a toy program. A production-quality function
would most likely return NULL to let the caller decide what
to do; a low-level function like this one is in a poor position
to make global decisions about the program's life or death.
}
strncpy(value, s, len);

Ugh. Well, there's nothing really *wrong* with this, but
I have a reflexive gag reaction to strncpy(), because it's so
often used incorrectly. In this instance you've made no mistake
(that I can see), but I'd still suggest using memcpy() as a
clearer statement of what you're doing: Copying a batch of
characters not terminated by a '\0'.
value[len] = '\0';
}
return value;
}
 
D

David RF

     You can simplify the test to just isspace(*s), since
isspace() produces false for the '\0' at the end of the string.

     Actually, that's not quite right: The <ctype.h> functions
should not be applied to ordinary char values.  On systems
where char is signed, a char value in a string could be negative
(this won't happen for characters in the "basic execution set"
like 'a' and '/' and '5', but can happen for "extended" glyphs
like 'ß', 'ç', 'ñ').  However, the <ctype.h> functions accept
only one negative argument: the value of the macro EOF.  Feed
them a different negative value, and there's no telling what
will happen.  So the test should actually be

        isspace( (unsigned char)*s )

Could you clarify this point with a little example?
... with similar modifications elsewhere.
           if (*s) for (len = strlen(s); isspace(s[len - 1]); len--);
           value = malloc(len + 1);
           if (value == NULL) {
                   fprintf(stderr, "%s\n", "Buy more RAM!!");
                   exit(0);

     Okay in a toy program.  A production-quality function
would most likely return NULL to let the caller decide what
to do; a low-level function like this one is in a poor position
to make global decisions about the program's life or death.

Yes, I use my own xmalloc in real life :)
           }
           strncpy(value, s, len);

     Ugh.  Well, there's nothing really *wrong* with this, but
I have a reflexive gag reaction to strncpy(), because it's so
often used incorrectly.  In this instance you've made no mistake
(that I can see), but I'd still suggest using memcpy() as a
clearer statement of what you're doing: Copying a batch of
characters not terminated by a '\0'.
           value[len] = '\0';
   }
   return value;
}

Thanks for the suggestions Pete
 
E

Eric Sosman

David said:
[...]
Actually, that's not quite right: The <ctype.h> functions
should not be applied to ordinary char values. On systems
where char is signed, a char value in a string could be negative
(this won't happen for characters in the "basic execution set"
like 'a' and '/' and '5', but can happen for "extended" glyphs
like 'ß', 'ç', 'ñ'). However, the <ctype.h> functions accept
only one negative argument: the value of the macro EOF. Feed
them a different negative value, and there's no telling what
will happen. So the test should actually be

isspace( (unsigned char)*s )

Could you clarify this point with a little example?

What's unclear? Here's a re-cap:

1) On some systems, the `char' type is signed. On these
systems, some character values are positive, some are negative.

1a) Negative character values sometimes go unnoticed because
the characters in the "basic execution set" -- roughly speaking,
those that the Standard requires -- are all non-negative. Merkuns
are particularly likely to forget about negative characters, since
their impoverished repertoire of characters is mostly covered by
the basic execution set, and hence non-negative.

1b) ... but many (most?) systems support a wider range of
characters than the C Standard insists upon. Even systems that
are only "mildly international" usually support accented letters
that look funny to Merkuns but are important in European languages.
And then there are things like the Pound Sterling symbol, the Euro
symbol, the upside-down question and exlamation marks, ... On
many systems, the codes for some of these characters are negative.

2) isspace() and the other <ctype.h> functions take `int'
arguments, but not every possible `int' value. Their behavior
is defined only for arguments that are

2a) ... in the range 0 through UCHAR_MAX, inclusive, or

2b) ... equal to the value of the macro EOF from <stdio.h>.
This value is a negative `int', usually (but not necessarily)
negative one.

2c) If you feed a <ctype.h> function a value other than
those mentioned in (2a) and (2b), the function's behavior is
undefined, and there's no telling what it might do.

2c') In particular, if you feed the function a negative
value other than EOF, there's no telling what will happen.

3) Taking (1*) and (2*) together, we see that it is unsafe
to just pluck a character out of a string and hand it to a
<ctype.h> function: The character might be negative, and then
you'd be out of luck.

3a) The way to avoid the harsh fate of (3) is to say "Aha!
I know this string-resident char is a real character, not an
indication of I/O failure, so I need to convert it to a nice,
non-negative value in the (2a) range." That's what the cast
does.

It's a sad and sorry state of affairs, the remnants of a
prehistoric <ctype.h> whose design was not well thought-out.
(The usual remark about hindsight being 20-20 applies here.)
If the original <ctype.h> designers had said "The argument
must be a char, period, and not an EOF" all would be well.
Or if they'd decided to allow EOF but insisted that EOF have
a value unequal to any possible char, that would have been
all right, too. But they didn't, and we're stuck with the
fallout.
 
D

David RF

     2c) If you feed a <ctype.h> function a value other than
those mentioned in (2a) and (2b), the function's behavior is
undefined, and there's no telling what it might do.

I must cast all params passed to ctype or is better to made my own
safe functions?
Thanks for the advise
 
D

David RF

     What's unclear?  Here's a re-cap:
[...]

Pardon my ignorance please, it's hard to me to speak english:

int cisspace(int c)
{
return ((c == ' ') || (c == '\n') || (c == '\t'));
}

this solve the problem?
 
E

Eric Sosman

David said:
I must cast all params passed to ctype or is better to made my own
safe functions?
Thanks for the advise

Either way will work: You can write wrappers for almost
any Standard library function to "condition" the arguments
if you like. My own preference is to write the clumsy cast
and call the <ctype.h> function directly, but my preferences
aren't binding on you. There may be a speed advantage in
my way, but in the context of a full-scale program (where the
strings most likely come from slow I/O operations), any speed
difference is likely to be unimportant.
 
B

Ben Bacarisse

Richard Heathfield said:
#include <stdlib.h>
#include <string.h>

char *lrtrim(const char *s, const char *wrappings)
{
char *new = NULL;
if(s != NULL)
{
size_t start = 0;
size_t len = 0;
const char *p = s;

const char *q = p;

if(wrappings != NULL)
{
start = strspn(s, wrappings);
p += start; /* p now points to first character to keep */
q = p + strlen(p); /* q points to terminator */
while(q > p && strchr(wrappings, *--q) != NULL)
{
continue;
}
}
else
{
q = p + strlen(p);
}
/* q now points to the last character to keep */

The comment is not true in the "else" case. It is entirely harmless
9as far as I can tell) since you just allocate and extra char and end
up with two null bytes at the end, but it's not "pretty".
/* there are 1 + q - p characters to keep, and we want
* one space for the null terminator */
len = 1 + q - p;
new = malloc(len + 1);
if(new != NULL)
{
memcpy(new, p, len);
new[len] = '\0';
}
}

return new;
}

<snip>
 
B

Barry Schwarz

     What's unclear?  Here's a re-cap:
[...]

Pardon my ignorance please, it's hard to me to speak english:

int cisspace(int c)
{
return ((c == ' ') || (c == '\n') || (c == '\t'));
}

this solve the problem?

If you look at 7.4.1.10 in the C99 standard, you will find that there
are other standard characters which are considered to be "spaces", \f,
\r, etc. There are also differences based on locale. If you never
have to deal with them, then you can limit you code to you do deal
with. If you want your function to be equivalent to the one in the
standard library, you have to do a bit more work.
 
B

Barry Schwarz

Sorry, there was a bug when a string containing only spaces (" ")
was passed, a new version:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/* Trim left and right spaces returning a new allocated string or NULL
*/
char *sstrip(const char *s);
char *sstrip(const char *s)
{
char *value = NULL;
size_t len = 0;

if (s) {
while (*s && isspace(*s)) s++;
if (*s) for (len = strlen(s); isspace(s[len - 1]); len--);
value = malloc(len + 1);
if (value == NULL) {
fprintf(stderr, "%s\n", "Buy more RAM!!");
exit(0);
}
strncpy(value, s, len);
value[len] = '\0';
}
return value;
}

int main(void)
{
char *s;

s = sstrip("test");
printf("<%s>\n", s);
if (s) free(s);

The if test belongs on the call to printf. If s is NULL, the call to
printf invokes undefined behavior. On the other had, if s is NULL,
passing it to free causes no problem.
 
B

Ben Bacarisse

Richard Heathfield said:
Ben Bacarisse said:


Good spot. Thanks, Ben. The (rather ugly) fix is:

else
{
q = p + strlen(p);
if(q > p)
{
--q;
}
}

This may be just part of the "ugly", but that still goes on to
allocate a two-byte null string if s is zero-length to start with.
You know and I know that the (inelegant) check is to ensure that we
don't run off the beginning of an empty string and invoking undefined
behaviour, but I thought I'd just mention it in case anyone is
reading who didn't know this.

Because of this, I prefer to mark the end of sub-strings with a "just
past" pointer rather than a "last needed" pointer. I think it tidies
up the code a little. For example, the comment about why we need +1
twice now looks redundant. I've duped your style for a fair
comparison:

char *lrtrim(const char *s, const char *wrappings)
{
char *new = NULL;
if(s != NULL)
{
size_t start = 0;
size_t len = 0;
const char *p = s;

const char *q = p;

if(wrappings != NULL)
{
start = strspn(s, wrappings);
p += start; /* p now points to first character to keep */
q = p + strlen(p); /* q points to terminator */
while(q > p && strchr(wrappings, q[-1]) != NULL)
{
--q;
}
}
else
{
q = p + strlen(p);
}
/* q now points just past the last character to keep */

/* there are q - p characters to keep, and we want
* one space for the null terminator */
len = q - p;
new = malloc(len + 1);
if(new != NULL)
{
memcpy(new, p, len);
new[len] = '\0';
}
}
return new;
}

[The laws of the Internet mean I'll introduced some bugs, but it is
worth a shot.]
 
D

David RF

David RF said:


Did I really say that? :)
You said in english ;)



Sorry about that. We lose a lot of OPs for some reason.

Anyway, I hope my version has at least given you food for thought.
I find it very useful!!
 
B

Bart

Hi, anybody knows a better way to (right-left) trim a string?
/* Trim (right-left) and returns a new allocated string or NULL */
char *sstrip(const char *s);
char *sstrip(const char *s)

I'd be tempted to have a destination parameter here, so that the
caller can avoid the nuisance of dealing with allocated strings (by
using a fixed length buffer).

Passing a NULL destination can be be used to request an allocated
string.

If the source and destination strings are in the same place, then this
can also be dealt with, with a bit of extra care, and is even more
convenient.

The actual trimming operation: I haven't tried your code. If it works,
and is fast enough, then fine. Although, when I did something like
this, I made it possible to trim only one end, but I can't at the
minute tell you why that was a good idea...
 
C

Chris M. Thomasson

David RF said:
Hi, anybody knows a better way to (right-left) trim a string?
[...]

You could try doing it in place; something like this:




typed directly in newsreader, try and forgive any typos ;^o
_______________________________________________________________
#include <string.h>
#include <ctype.h>


#define xisspace(c) isspace((int)(c))


char*
trim_string(char* const buffer)
{
char* cur = buffer;

while (*cur && xisspace(*cur))
{
++cur;
}

if (*cur)
{
char* end = NULL;
char* start = cur;

++cur;

while (*cur)
{
if (xisspace(*cur))
{
if (! end)
{
end = cur;
}
}

else
{
end = NULL;
}

++cur;
}


if (start != buffer)
{
memmove(buffer, start, cur - start);

if (! end)
{
start = buffer + (cur - start);
*start = '\0';
}
}

if (end)
{
end -= (cur - buffer) - (cur - start);
*end = '\0';
}
}

return buffer;
}
_______________________________________________________________




You could use it like:
_______________________________________________________________
#include <stdio.h>


int main(void)
{
char name1[] = " Hello World! ";
char name2[] = "123 - 456 - 768 ";
char name3[] = " a b c d";

printf("->%s<-\n", trim_string(name1));
printf("->%s<-\n", trim_string(name2));
printf("->%s<-\n", trim_string(name3));

return 0;
}

_______________________________________________________________
 
C

Chris M. Thomasson

Chris M. Thomasson said:
David RF said:
Hi, anybody knows a better way to (right-left) trim a string?
[...]

You could try doing it in place; something like this:




typed directly in newsreader, try and forgive any typos ;^o
_______________________________________________________________
#include <string.h>
#include <ctype.h>


#define xisspace(c) isspace((int)(c))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


That should be:

#define xisspace(c) isspace((unsigned char)(c))



ARGHG!%$#!@


;^o
 
E

Eric Sosman

Chris said:
[...]
typed directly in newsreader, try and forgive any typos ;^o
[...]
#define xisspace(c) isspace((int)(c))

This isn't a typo; it's a thinko.
char* cur = buffer;
while (*cur && xisspace(*cur))

.... and this is still wrong, for all the same old reasons.
Also redundant, as well in addition to boot also.

See my detailed explanations to DavidRF, elsethread.
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only take a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

Forum statistics

Threads
473,769
Messages
2,569,582
Members
45,059
Latest member
cryptoseoagencies

Latest Threads

Top