Convert native character string to ASCII array of integers

  • Thread starter Tomás Ó hÉilidhe
  • Start date
T

Tomás Ó hÉilidhe

Given a string in the computer's native character set such as:

"Hello"

, I want to convert it to an array of integers representing the ASCII
values of the characters. The reason I want to do this is that I'll be
passing the ASCII array to a cryptographic hash function. In order to
make my program fully portable so that it will run properly on
machines where the default character set isn't ASCII, I've created a
MakeASCII function.

Please critique my MakeASCII function! Rip it apart!

typedef unsigned char OctetStorage;

/*
 * MakeASCII: translate the NUL-terminated native-charset string at pc into
 * ASCII code values, storing one value per character at pos.
 *
 * Each of the 95 printable ASCII characters is mapped to its ASCII code
 * (0x20..0x7E) regardless of its value in the execution character set.
 * Any other non-zero character is copied through unchanged.  A terminating
 * 0 is always written, so pos must have room for strlen(pc) + 1 octets.
 */
void MakeASCII(OctetStorage *pos,char const *pc)
{
    /* Printable ASCII repertoire in code order: entry k has ASCII code 0x20 + k. */
    static char const printable[] =
        " !\"#$%&'()*+,-./0123456789:;<=>?@"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "[\\]^_`"
        "abcdefghijklmnopqrstuvwxyz"
        "{|}~";

    while (*pc != 0) {
        char const *hit = printable;

        /* Linear search; stops on the table's terminator when there is no match. */
        while (*hit != 0 && *hit != *pc) {
            ++hit;
        }

        if (*hit != 0) {
            *pos = (OctetStorage)(0x20u + (unsigned)(hit - printable));
        } else {
            *pos = *pc; /* not in the table: pass through as-is */
        }

        ++pos;
        ++pc;
    }

    *pos = 0;
}
 
R

Richard Heathfield

Tomás Ó hÉilidhe said:

Please my MakeASCII function! Rip it apart!

Well, I won't rip it apart, but I think I can let a little air out of it.

#include <string.h>

/*
 * Translate the NUL-terminated native-charset string pc into ASCII codes,
 * writing one code per character to pos, followed by a terminating 0.
 *
 * Characters found in the printable-ASCII table are replaced by their
 * ASCII code (32 + table index); any other character is copied unchanged.
 * pos must have room for strlen(pc) + 1 bytes.
 */
void MakeASCII(unsigned char *pos,char const *pc)
{
    /* The 95 printable ASCII characters in code order, starting at 32. */
    static const char printable[] =
        " !\"#$%&'()*+,-./0123456789:;<=>?@"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "[\\]^_`"
        "abcdefghijklmnopqrstuvwxyz"
        "{|}~";

    for (; *pc != '\0'; ++pc, ++pos) {
        const char *hit = strchr(printable, *pc);

        if (hit == NULL) {
            *pos = *pc; /* unknown character: pass through */
            continue;
        }
        *pos = (hit - printable) + 32;
    }

    *pos = '\0';
}

If you hit performance issues with that one, consider this alternative:

#include <string.h>
#include <limits.h>

/*
 * Translate the NUL-terminated native-charset string pc into ASCII codes,
 * writing one code per character to pos, followed by a terminating 0.
 *
 * A translation table indexed by character value is built on the first
 * call and reused afterwards.  Printable ASCII characters map to their
 * ASCII code (32..126); every other value maps to itself.
 * pos must have room for strlen(pc) + 1 bytes.
 *
 * The first-call table setup is not protected against concurrent callers.
 */
void MakeASCII(unsigned char *pos,char const *pc)
{
    /* The 95 printable ASCII characters in code order, starting at 32. */
    static const char bcs[] =
        " !\"#$%&'()*+,-./0123456789:;<=>?@"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "[\\]^_`"
        "abcdefghijklmnopqrstuvwxyz"
        "{|}~";
    /* unsigned char so that every byte value can be stored and read back exactly */
    static unsigned char att[UCHAR_MAX + 1] = {0};

    /* att[' '] is 0 until the table has been filled in, 32 afterwards */
    if(att[(unsigned char)' '] != 32)
    {
        const char *cur;
        int i;

        /* defaults: a value outside the known repertoire maps to itself */
        for(i = 0; i < UCHAR_MAX + 1; i++)
        {
            att[i] = (unsigned char)i;
        }

        /* known ASCII characters */
        i = 32;
        for(cur = bcs; *cur != '\0'; ++cur)
        {
            att[(unsigned char)*cur] = (unsigned char)i++;
        }
    }

    /*
     * Index through unsigned char: plain char may be signed, and a negative
     * index would read outside the table.  att[0] is 0, so the terminator
     * is copied and ends the loop.
     */
    while((*pos++ = att[(unsigned char)*pc++]) != 0)
    {
        continue;
    }
}
 
T

Tomás Ó hÉilidhe

Richard Heathfield:
#include <string.h>

/*
 * Translate the NUL-terminated native-charset string pc into ASCII codes,
 * writing one code per character to pos, followed by a terminating 0.
 *
 * Characters found in the printable-ASCII table are replaced by their
 * ASCII code (32 + table index); any other character is copied unchanged.
 * pos must have room for strlen(pc) + 1 bytes.
 */
void MakeASCII(unsigned char *pos,char const *pc)
{
  /* The 95 printable ASCII characters in code order, starting at 32. */
  static const char printable[] =
    " !\"#$%&'()*+,-./0123456789:;<=>?@"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "[\\]^_`"
    "abcdefghijklmnopqrstuvwxyz"
    "{|}~";

  for (; *pc != '\0'; ++pc, ++pos) {
    const char *hit = strchr(printable, *pc);

    if (hit == NULL) {
      *pos = *pc; /* unknown character: pass through */
      continue;
    }
    *pos = (hit - printable) + 32;
  }

  *pos = '\0';
}

If you hit performance issues with that one, consider this alternative:

#include <string.h>
#include <limits.h>

/*
 * Translate the NUL-terminated native-charset string pc into ASCII codes,
 * writing one code per character to pos, followed by a terminating 0.
 *
 * A translation table indexed by character value is built on the first
 * call and reused afterwards.  Printable ASCII characters map to their
 * ASCII code (32..126); every other value maps to itself.
 * pos must have room for strlen(pc) + 1 bytes.
 *
 * The first-call table setup is not protected against concurrent callers.
 */
void MakeASCII(unsigned char *pos,char const *pc)
{
  /* The 95 printable ASCII characters in code order, starting at 32. */
  static const char bcs[] =
    " !\"#$%&'()*+,-./0123456789:;<=>?@"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "[\\]^_`"
    "abcdefghijklmnopqrstuvwxyz"
    "{|}~";
  /* unsigned char so that every byte value can be stored and read back exactly */
  static unsigned char att[UCHAR_MAX + 1] = {0};

  /* att[' '] is 0 until the table has been filled in, 32 afterwards */
  if(att[(unsigned char)' '] != 32)
  {
    const char *cur;
    int i;

    /* defaults: a value outside the known repertoire maps to itself */
    for(i = 0; i < UCHAR_MAX + 1; i++)
    {
      att[i] = (unsigned char)i;
    }

    /* known ASCII characters */
    i = 32;
    for(cur = bcs; *cur != '\0'; ++cur)
    {
      att[(unsigned char)*cur] = (unsigned char)i++;
    }
  }

  /*
   * Index through unsigned char: plain char may be signed, and a negative
   * index would read outside the table.  att[0] is 0, so the terminator
   * is copied and ends the loop.
   */
  while((*pos++ = att[(unsigned char)*pc++]) != 0)
  {
    continue;
  }

}



Very nice, the look-up method hadn't crossed my mind.

If we can be sure that all characters will be valid ASCII characters
then we can do the following:

#include <string.h> /* strchr */
#include <stdio.h> /* puts */

typedef char OctetStorage;

/*
 * Translate the NUL-terminated native-charset string pc into ASCII codes,
 * writing one code per character to pos, followed by a terminating 0.
 *
 * pos may be the same buffer as pc (in-place conversion): each character
 * is read before its slot is overwritten.  Otherwise pos must have room
 * for strlen(pc) + 1 elements.
 *
 * Characters outside the printable-ASCII table are copied unchanged
 * rather than being run through pointer arithmetic on a null pointer.
 */
void MakeASCII(OctetStorage *pos,char const *pc)
{
static char const ascii[] =
" !\"#$%&'()*+,-./0123456789:;<=>?@"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"[\\]^_`"
"abcdefghijklmnopqrstuvwxyz"
"{|}~";

for ( ; *pc; ++pos, ++pc)
{
char const *const hit = strchr(ascii,*pc);

/* Add the ASCII code of space (0x20), not the native value of ' ',
   which differs on a non-ASCII execution character set. */
*pos = hit ? (OctetStorage)(hit - ascii + 0x20) : *pc;
}

*pos = 0;
}

/* Demo driver: converts a small string in place and prints the result. */
int main(void)
{
    char text[] = "hello"; /* writable array, since it is also the destination */

    /* Source and destination are deliberately the same buffer. */
    MakeASCII(text, text);
    puts(text);

    return 0;
}

I wasn't sure whether I was able to replace:

for ( ; *pc; ++pos, ++pc)
*pos = strchr(ascii,*pc) - ascii + ' ';

with:

while (*pc) *pos++ = strchr(ascii,*pc++) - ascii + ' ';

I thought there might be a sequence point violation if pos and pc
point to the same thing.. ?
 
T

Tomás Ó hÉilidhe

       *pos = strchr(ascii,*pc) - ascii + ' ';


That space should of course be the ASCII value for space:

*pos = strchr(ascii,*pc) - ascii + 0x20;
 
R

Richard Heathfield

Tomás Ó hÉilidhe said:

int main(void)
{
char hello[] = "hello";

MakeASCII(hello,hello);
I wasn't sure whether I was able to replace:

for ( ; *pc; ++pos, ++pc)
*pos = strchr(ascii,*pc) - ascii + ' ';

Don't add ' ' if you really want to add 32 and may be running on a
non-ASCII system! Add 32 instead (or ASCII_BASE, #defined to 32, or
something like that). Also, be absolutely sure that there is no
possibility of strchr(ascii, *pc) returning NULL!
with:

while (*pc) *pos++ = strchr(ascii,*pc++) - ascii + ' ';

I thought there might be a sequence point violation if pos and pc
point to the same thing.. ?

It's a valid point. If there is a risk of that, then make sure that the
increments occur separately:

while (*pc)
{
*pos = strchr(ascii,*pc) - ascii + ASCII_BASE;
++pos;
++pc;
}
 
W

Willem

Richard Heathfield wrote:
) Tomás Ó hÉilidhe said:
)> with:
)>
)> while (*pc) *pos++ = strchr(ascii,*pc++) - ascii + ' ';
)>
)> I thought there might be a sequence point violation if pos and pc
)> point to the same thing.. ?
)
) It's a valid point. If there is a risk of that, then make sure that the
) increments occur separately:

I disagree. While pos and pc may point to the same thing, it's not
the thing that is pointed to that gets incremented.


SaSW, Willem
--
Disclaimer: I am in no way responsible for any of the statements
made in the above text. For all I know I might be
drugged or something..
No I'm not paranoid. You all think I'm paranoid, don't you !
#EOT
 
P

Philip Potter

Richard said:
Tomás Ó hÉilidhe said:

It's a valid point. If there is a risk of that, then make sure that the
increments occur separately:

while (*pc)
{
*pos = strchr(ascii,*pc) - ascii + ASCII_BASE;
++pos;
++pc;
}

I disagree that there is a risk of UB. If we give the object which pos
and pc both point to a name "obj", then the statement in question has
three effects:
obj = strchr(ascii,obj) - ascii + ' ';
pos++;
pc++;
I don't see any problem in all these effects occurring in the same
expression. No object is written to and read from in the same expression
except obj, and it is only read to determine the new value of obj, which
is allowed.

The apparent similarity of the original statement

*pos++ = strchr(ascii,*pc++) - ascii + ' ';

to statements such as

a = i++;

is purely coincidental, because in the latter, the postincrement applies
to an object which is referenced elsewhere in the same expression,
whereas in the former, it is not. The pointer is incremented, not the
pointee.

In fact it is much more similar to

*to++ = *from++;

which is valid even if from == to.
 
R

Richard Heathfield

Willem said:
Richard Heathfield wrote:
) Tomás Ó hÉilidhe said:
)> with:
)>
)> while (*pc) *pos++ = strchr(ascii,*pc++) - ascii + ' ';
)>
)> I thought there might be a sequence point violation if pos and pc
)> point to the same thing.. ?
)
) It's a valid point. If there is a risk of that, then make sure that the
) increments occur separately:

I disagree. While pos and pc may point to the same thing, it's not
the thing that is pointed to that gets incremented.

Ha! Let me think about this for a moment, before I do a second about-turn
in the space of three articles.

WLOG we can reduce the expression to *pos++ = *pc++. pos and pc point to
the same thing but, as you say, are different objects. The object whose
value is being retrieved for the purpose of determining the value to be
stored is not itself being modified at all except via the assignment, and
in that respect is equivalent to x = x, which we all know is legal.

So yes, you're right, and the squeal of burning rubber is heard once more
in the land.
 
C

Chris Dollin

Willem said:
Richard Heathfield wrote:
) Tomás Ó hÉilidhe said:
)> with:
)>
)> while (*pc) *pos++ = strchr(ascii,*pc++) - ascii + ' ';
)>
)> I thought there might be a sequence point violation if pos and pc
)> point to the same thing.. ?
)
) It's a valid point. If there is a risk of that, then make sure that the
) increments occur separately:

I disagree. While pos and pc may point to the same thing, it's not
the thing that is pointed to that gets incremented.

(if pc == pos ...)

The same location is being written to and read from, and the read isn't
just to determine the value to be written (in a strict interpretation;
I'm sure we've had /that/ discussion before); but there is a sequence
point intervening, unless `strchr` might be a macro, but if it were there
could be a guarantee that it respected sequence points ...

My heads hurt.

This looks like a suitably horrible piece of avoidance:

while (*pc++) *pos++ = strchr( ascii, pc[-1] ) - ascii + ' ';

I propose that the expression `E*` be introduced as meaning `(E)[-1]`
to make this easier to type: postfix-* is the decreference operator,
and is nicely compaqt. Happy?

--
"I know it was late, but Mountjoy never bothers, /Archer's Goon/
so long as it's the full two thousand words."

Hewlett-Packard Limited Cain Road, Bracknell, registered no:
registered office: Berks RG12 1HN 690597 England
 
W

Willem

Richard wrote:
) Ha! Let me think about this for a moment, before I do a second about-turn
) in the space of three articles.
)
) WLOG we can reduce the expression to *pos++ = *pc++. pos and pc point to
) the same thing but, as you say, are different objects. The object whose
) value is being retrieved for the purpose of determining the value to be
) stored is not itself being modified at all except via the assignment, and
) in that respect is equivalent to x = x, which we all know is legal.

Ah, but wait!
What if *pos == pc ?
That is, pos points to the location of pc ?
Is that even possible ?


SaSW, Willem
--
Disclaimer: I am in no way responsible for any of the statements
made in the above text. For all I know I might be
drugged or something..
No I'm not paranoid. You all think I'm paranoid, don't you !
#EOT
 
V

vippstar

Richard wrote:

) Ha! Let me think about this for a moment, before I do a second about-turn
) in the space of three articles.
)
) WLOG we can reduce the expression to *pos++ = *pc++. pos and pc point to
) the same thing but, as you say, are different objects. The object whose
) value is being retrieved for the purpose of determining the value to be
) stored is not itself being modified at all except via the assignment, and
) in that respect is equivalent to x = x, which we all know is legal.

Ah, but wait!
What if *pos == pc ?
That is, pos points to the location of pc ?
Is that even possible ?
Not if pos and pc are the same type (unless if both are void *, but
then what they point to cannot be evaluated).
However, even assuming pos = &pc, the objects that are modified in
*pos++ = .. *pc++ are 'pos' and 'pc' and not '*pos' nor '*pc'.
So, even with your assumption, it is still valid.
 
R

Richard

Willem said:
Richard wrote:
) Ha! Let me think about this for a moment, before I do a second about-turn
) in the space of three articles.
)
) WLOG we can reduce the expression to *pos++ = *pc++. pos and pc point to
) the same thing but, as you say, are different objects. The object whose
) value is being retrieved for the purpose of determining the value to be
) stored is not itself being modified at all except via the assignment, and
) in that respect is equivalent to x = x, which we all know is legal.

Ah, but wait!
What if *pos == pc ?
That is, pos points to the location of pc ?

Except that is not it. *pos is the value of pc. pos points to the
location of a value equal to that of pc.
 
R

Richard Heathfield

Willem said:
Richard wrote:
) Ha! Let me think about this for a moment, before I do a second about-turn
) in the space of three articles.
)
) WLOG we can reduce the expression to *pos++ = *pc++. pos and pc point to
) the same thing but, as you say, are different objects. The object whose
) value is being retrieved for the purpose of determining the value to be
) stored is not itself being modified at all except via the assignment, and
) in that respect is equivalent to x = x, which we all know is legal.

Ah, but wait!
What if *pos == pc ?
That is, pos points to the location of pc ?
Is that even possible ?

Um, yes, it's possible, via a cast. But if it does, then changing *pos
(which is pointing to a pointer but is of type unsigned char *) is writing
to the pointer itself (pc), in which case the code is totally screwed
anyway. What price the pointer value after the update?

But I don't think it reasonable to impose on this function the burden of
avoiding utter stupidity in the caller. :)
 
T

Tomás Ó hÉilidhe

Richard Heathfield:
The object whose
value is being retrieved for the purpose of determining the
value to be stored is not itself being modified at all except via
the assignment, and in that respect is equivalent to x = x,
which we all know is legal.


With less competent programmers, you'll see that they avoid
certain programming techniques and constructs because they doubt their
own competency too much. You'll see them shy away from doing things
like using pointers to iterate thru array elements in a loop. If you
listen to comp.lang.c++ for twenty minutes, they'll constantly tell
you how "dangerous" it is to be using "raw pointers".

Now I've always been *against* this whole incompetency plea thing,
but I must admit that *this* is the individual single sole part of the
C programming language where I allow my own doubts over my own
competency to reshape the way I write code. That is to say, I'll
*always* have:

for ( ; *pc; ++pos, ++pc) *pos = strchr(ascii,*pc) - ascii + 0x20;

instead of:

while (*pc) *pos++ = strchr(ascii,*pc++) - ascii + 0x20;

because I don't want to risk the chance of getting it wrong. This is
the one place where I actually think I should just play it safe. And
why am I so frightful? Well I had a program one time that worked
PERFECTLY on numerous different systems until I decided to enable the
compiler optimiser. All of a sudden, the program gave different
output. Of course my first assumption was that the compiler had a
dodgy optimiser... but anyway I went thru the code -- code which I
thought had been bullet-proof -- to find the problem. Here was the
culprit:

/*
 * Intended to convert the NUL-terminated string at p to lower case in
 * place.  Shown here as the faulty original that misbehaved once the
 * optimiser was enabled.
 */
void StrToLower(char *p)
{
/*
 * Undefined behaviour: p is modified by p++ on the left-hand side and
 * read again by *p on the right-hand side of the same assignment, with
 * nothing ordering the two.  The character read may therefore be the
 * one before or after the increment.
 */
while ( *p++ = tolower( (char unsigned)*p ) );
}

I'll *never* make that mistake again.
 
C

CBFalconer

Tomás Ó hÉilidhe said:
.... snip ...

Now I've always been *against* this whole incompetency plea thing,
but I must admit that *this* is the individual single sole part of
the C programming language where I allow my own doubts over my own
competency to reshape the way I write code. That is to say, I'll
*always* have:

for ( ; *pc; ++pos, ++pc) *pos = strchr(ascii,*pc) - ascii + 0x20;

instead of:

while (*pc) *pos++ = strchr(ascii,*pc++) - ascii + 0x20;

IMO you are making a mistake. The second is simpler, and much
easier to detect inaccuracies in. The thing that screams at you is
that both are missing copying the terminal '\0'. Both are missing
the initialization of pc and pos. Both are missing handling the
fact that the char is not found in the ascii string.
 
M

Morris Dovey

Tomás Ó hÉilidhe wrote:

I think you'd want your character translation logic to look like:

/*
 * Sketch of a table-driven translation: each character of s, up to and
 * including the terminating 0, is replaced by its entry in x and stored
 * through d.
 */
void make_ascii(char *d,char *s)
/* x is a placeholder here: as written it has a single element, x[0] == 0.
   A real table needs one entry per character value. */
{ static char x[] = { 0 }; /* ? */
/* NOTE(review): (unsigned) turns a negative char into a very large index;
   (unsigned char) is presumably what is wanted -- confirm. */
do *d++ = x[(unsigned)*s];
/* test the character just translated, then advance; stops after the 0 */
while (*s++);
}

which reduces the problem to appropriately initializing x[]. That
initialization could be done at runtime, but would make more
sense to do at compile time. One runtime approach might be:

x[' '] = 0x20;
x['!'] = 0x21;
:
:
x['~'] = 0x7E;

but my own preference would be to provide the initialization at
compile time.
 
T

Tomás Ó hÉilidhe

CBFalconer:
IMO you are making a mistake.  The second is simpler, and much
easier to detect inaccuracies in.  The thing that screams at you is
that both are missing copying the terminal '\0'.  Both are missing
the initialization of pc and pos.  Both are missing handling the
fact that the char is not found in the ascii string.


They're missing none of those three things. The context of the code
is as follows:

static char const ascii[] =
" !\"#$%&'()*+,-./0123456789:;<=>?@"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"[\\]^_`"
"abcdefghijklmnopqrstuvwxyz"
"{|}~";

for ( ; *pc; ++pos, ++pc)
*pos = strchr(ascii,*pc) - ascii + ' ';

*pos = 0;

Also, it is assumed that every char is valid ASCII.
 
P

Peter Nilsson

Tomás Ó hÉilidhe said:
        case '$': *pos = 0x24u; break;
case '@': *pos = 0x40u; break;

$ and @ are not guaranteed members of implementation
source or execution character sets.
 
C

CBFalconer

Tomás Ó hÉilidhe said:
CBFalconer:
IMO you are making a mistake. The second is simpler, and much
easier to detect inaccuracies in. The thing that screams at you is
that both are missing copying the terminal '\0'. Both are missing
the initialization of pc and pos. Both are missing handling the
fact that the char is not found in the ascii string.

They're missing neither of those three things. The context of the code
is as follows:

static char const ascii[] =
" !\"#$%&'()*+,-./0123456789:;<=>?@"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"[\\]^_`"
"abcdefghijklmnopqrstuvwxyz"
"{|}~";

for ( ; *pc; ++pos, ++pc)
*pos = strchr(ascii,*pc) - ascii + ' ';

*pos = 0;

Also, it is assumed that every char is valid ASCII.

Which is a silly assumption. However, you are still failing to
initialize pc and pos.
 
S

santosh

CBFalconer said:
Tomás Ó hÉilidhe said:
CBFalconer:
for ( ; *pc; ++pos, ++pc) *pos = strchr(ascii,*pc) - ascii + 0x20;

instead of:

while (*pc) *pos++ = strchr(ascii,*pc++) - ascii + 0x20;

IMO you are making a mistake. The second is simpler, and much
easier to detect inaccuracies in. The thing that screams at you is
that both are missing copying the terminal '\0'. Both are missing
the initialization of pc and pos. Both are missing handling the
fact that the char is not found in the ascii string.

They're missing neither of those three things. The context of the
code is as follows:

static char const ascii[] =
" !\"#$%&'()*+,-./0123456789:;<=>?@"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"[\\]^_`"
"abcdefghijklmnopqrstuvwxyz"
"{|}~";

for ( ; *pc; ++pos, ++pc)
*pos = strchr(ascii,*pc) - ascii + ' ';

*pos = 0;

Also, it is assumed that every char is valid ASCII.

Which is a silly assumption. However, you are still failing to
initialize pc and pos.

They are initialised on entry to the function. Please read the previous
articles before coming to conclusions.
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only takes a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

No members online now.

Forum statistics

Threads
473,769
Messages
2,569,580
Members
45,054
Latest member
TrimKetoBoost

Latest Threads

Top