Base 90... better than base 64?

nick

I wanted to try making a javascript compressor (more specifically,
implement a compression engine in javascript, not necessarily for
compressing js) ... but then I found this:

http://marklomas.net/ch-egg/articles/lzwjs.htm

The code is very cleanly written, although it could use some
optimizations (stuff should be moved from constructors to prototypes
for starters).

It outputs an array of numbers (byte array) instead of text... I
wanted it to output and read base64 encoded text. After I got that
working, I tried to squeeze a few more bytes with this base 90 scheme
I came up with. Let me know what you guys think?

// BOF

this.base90 = new function () {

  var slash = /\\/g, tilde = /~/g;

  // encode a number as a base 90 string
  this.fromNumber = function (n) {
    for (var r=''; n;) {
      r += String.fromCharCode((n%90) + 35);
      n = Math.floor(n/90);
    }
    return r.replace(slash,'~');
  }

  // decode a base 90 string to a number
  this.toNumber = function (str) {
    var s=str.replace(tilde,'\\');
    for (var i=s.length, r=0; i--;) {
      r += Math.pow(90, i) * (s.charCodeAt(i) - 35);
    }
    return r;
  }

  // encode a byte array as a base 90 string
  this.fromArray = function (a) {
    var i, r='', q=0, e, l=a.length;
    for (i=0; i<l; i++) {
      q += a[i] * Math.pow(255, i%6);
      if ((i+1)%6) continue;
      e = this.fromNumber(q);
      r += e + (e.length < 8 ? ' ' : '');
      q = 0;
    }
    if (q) r += this.fromNumber(q);
    return r.trim();
  }

  // decode a base 90 string to a byte array
  this.toArray = function (str) {
    var i, r=[], l=str.length, c, q='';
    for (i=0; i<l; i++) {
      q += (c = str.charAt(i));
      if (c != ' ' && q.length < 8) continue;
      this.encodeChunk(q, r);
      q = '';
    }
    if (q) this.encodeChunk(q, r);
    return r;
  }

  this.fromString = function (str) {
    var i, a=[], l=str.length;
    for (i=0; i<l; i++) a.push(str.charCodeAt(i));
    return this.fromArray(a);
  }

  this.toString = function (str) {
    var i, a = this.toArray(str), l=a.length, r='';
    for (i=0; i<l; i++) r += String.fromCharCode(a[i]);
    return r;
  }

  // decode one encoded chunk, pushing its bytes onto arr
  this.encodeChunk = function (str, arr) {
    for (var n=this.toNumber(str.trim()), p=6; p--;) {
      arr.push(n%255);
      if (!n) break;
      n = Math.floor(n/255);
    }
  }
}

// Test

var src = "The quick brown fox jumps over the lazy red dog.";
console.log(src);

var dst = base90.fromString(src);
console.log(dst);

src = base90.toString(dst);
console.log(src);

// EOF

The intermediate array storage is there so it works with the
compression script linked above. I'm going to merge everything
together and optimize some stuff, then try to get the decompressor as
small as possible...

So what do you guys think? Is this any better than base 64? It sure as
hell is ugly :)
 
BGB / cr88192

usual answer is base85.
base85 can encode 32 bits in 5 bytes.

(note: base85 is a bit better than base64 in terms of what it does...).

base 90 will not save much of anything (vs base 85) unless one transforms
large blocks at a time.
in fact, one would need to encode 8 additional bytes to save 1 byte (so,
a unit of 13 bytes).

this would encode around 84.4 bits, 80 bits being a byte-multiple (80 bits
in 13 bytes).
base85 could encode 96 bits in 15 bytes.

so, base90 has a mean-effectiveness of 6.154 bits per encoded byte, base85 6.4.

in terms of the smallest basic unit, base85 is better.

the case is different at 168 bits in 26 bytes, which has an effectiveness
of 6.462.


in either case, 168 bits is an awkward unit...
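
for concreteness, these figures can be sanity-checked with a few lines of
js (the capacity() helper below is purely illustrative):

// bits that fit in k digits of base b: floor(k * log2(b))
function capacity(b, k) { return Math.floor(k * Math.log(b) / Math.LN2); }
console.log(capacity(85, 5));  // 32 -> 32/5  = 6.4 bits per encoded byte
console.log(capacity(90, 13)); // 84 -> use 80 -> 80/13 ~= 6.154
console.log(capacity(90, 26)); // 168 -> 168/26 ~= 6.462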

or such...


 
nick

usual answer is base85.
base85 can encode 32 bits in 5 bytes.

Aha, so it does. I was trying to use 6 bytes (which was less
efficient).
(note: base85 is a bit better than base64 in terms of what it does...).

Noted. I wonder why base64 is so much more widely used given this
fact.
base 90 will not save much of anything (vs base 85) unless one transforms
large blocks at a time.
in fact, one would need to encode 8 additional bytes to save 1 byte (so,
a unit of 13 bytes).

this would encode around 84.4 bits, 80 bits being a byte-multiple (80 bits
in 13 bytes).
base85 could encode 96 bits in 15 bytes.

so, base90 has a mean-effectiveness of 6.154 bits per encoded byte, base85 6.4.

in terms of the smallest basic unit, base85 is better.

the case is different at 168 bits in 26 bytes, which has an effectiveness
of 6.462.

in either case, 168 bits is an awkward unit...

or such...

Ah, crap, you just went way over my head. I suck at math... Sounds
like the trick here has to do with common denominators? What exactly
is the formula to calculate the efficiency of these things?

I ran some tests with different bases and 'chunk' sizes (chunk meaning
a group of bytes encoded together, not sure what the proper term is),
and I didn't see anything more efficient than base 85 with 4 byte
chunks (5 chars encoded), just as you said. Here's the modified code I
used to test if anyone else wants to play around with it.

<script>

// BaseConverter ctor
this.BaseConverter = function (base, bytes) {

  // numeric base
  this._base = base || 85;

  // number of bytes in a chunk
  this._bytes = bytes || 4;

  // max char count for an encoded chunk
  this._max = this.fromNumber(Math.pow(255, this._bytes)).length;

}

this.BaseConverter.prototype = new function () {

  var slash = /\\/g, tilde = /~/g;

  // encode a number as a string
  this.fromNumber = function (n) {
    for (var r=''; n;) {
      r += String.fromCharCode((n%this._base) + 35);
      n = Math.floor(n/this._base);
    }
    return r.replace(slash,'~');
  }

  // decode a string to a number
  this.toNumber = function (str) {
    var s=str.replace(tilde,'\\');
    for (var i=s.length, r=0; i--;) {
      r += Math.pow(this._base, i) * (s.charCodeAt(i) - 35);
    }
    return r;
  }

  // encode a byte array as a string
  this.fromArray = function (a) {
    var i, r='', q=0, e, l=a.length;
    for (i=0; i<l; i++) {
      q += a[i] * Math.pow(255, i % this._bytes);
      if ((i+1) % this._bytes) continue;
      e = this.fromNumber(q);
      r += e + (e.length < this._max ? ' ' : '');
      q = 0;
    }
    if (q) r += this.fromNumber(q);
    return r.trim();
  }

  // decode a string to a byte array
  this.toArray = function (str) {
    var i, r=[], l=str.length, c, q='';
    for (i=0; i<l; i++) {
      q += (c = str.charAt(i));
      if (c != ' ' && q.length < this._max) continue;
      this.encodeChunk(q, r);
      q = '';
    }
    if (q) this.encodeChunk(q, r);
    return r;
  }

  // encode a string as a string
  this.fromString = function (str) {
    var i, a=[], l=str.length;
    for (i=0; i<l; i++) a.push(str.charCodeAt(i));
    return this.fromArray(a);
  }

  // decode a string to a string
  this.toString = function (str) {
    var i, a = this.toArray(str), r='';
    for (i=-1; a[++i];) r += String.fromCharCode(a[i]); // stops at a zero byte
    return r;
  }

  // decode one encoded chunk, pushing its bytes onto arr
  this.encodeChunk = function (str, arr) {
    var n = this.toNumber(str.trim());
    for (var p=this._bytes; p--;) {
      arr.push(n%255);
      if (!n) break;
      n = Math.floor(n/255);
    }
  }
}

// Test

for (var i=64; i<=90; i++) for (var j=1; j<=6; j++) {
  var b = new BaseConverter(i, j);
  var src = "The quick red fox jumps over the lazy dog. "
    + "The dog lies there...";
  // console.log(src);
  var dst = b.fromString(src);
  // console.log(dst);
  src = b.toString(dst);
  // console.log(src);
  console.log(i + "/" + j + " (" + b._max + ")",
    src.length, '->', dst.length);
}

</script>
 
BGB / cr88192

usual answer is base85.
base85 can encode 32 bits in 5 bytes.

<--
Aha, so it does. I was trying to use 6 bytes (which was less
efficient).
-->

yep.

(note: base85 is a bit better than base64 in terms of what it does...).

<--
Noted. I wonder why base64 is so much more widely used given this
fact.
-->

base85 is not exactly rare...

but, between them, base64 is a little easier to get reliably through a wider
range of transports, whereas base85 can pose more problems (for example, in
its standard form, it can't be directly transported via XML, ...).

also, base64 directly aligns with the underlying bits, and so can be
mentally decoded (whereas base85 is a bit more of a problem).

base 90 will not save much of anything (vs base 85) unless one transforms
large blocks at a time.
in fact, one would need to encode 8 additional bytes to save 1 byte (so,
a unit of 13 bytes).

this would encode around 84.4 bits, 80 bits being a byte-multiple (80 bits
in 13 bytes).
base85 could encode 96 bits in 15 bytes.

so, base90 has a mean-effectiveness of 6.154 bits per encoded byte, base85 6.4.

in terms of the smallest basic unit, base85 is better.

the case is different at 168 bits in 26 bytes, which has an effectiveness
of 6.462.

in either case, 168 bits is an awkward unit...

or such...

<--
Ah, crap, you just went way over my head. I suck at math... Sounds
like the trick here has to do with common denominators? What exactly
is the formula to calculate the efficiency of these things?
-->

number_of_bits / number_of_bytes.

I was also looking for units that come fairly close to parity (since
this is where it is most efficient, and also because partial-bits would be a
pain to use in a stream).

another possibility of course would be to transform larger fixed-size units
into a given base, which could potentially waste fewer bits, but is a lot
harder to implement.

however, maximal bits-per-byte is not usually that important, as something
"decently good" is good enough.
similarly, a 32-bit unit is easy to work with and transform, but a much
larger bit-unit (say, for example, 1024 bits) would be a little harder (and
likely slower to encode/decode).
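
a throwaway scan like the one below makes the parity-hunting concrete
(the helper name is just for illustration):

// for each unit size, the base-b chars needed to hold n bytes, and the
// resulting bits-per-encoded-byte effectiveness
function scanUnits(b, maxBytes) {
  for (var n = 1; n <= maxBytes; n++) {
    var chars = Math.ceil(n * 8 * Math.LN2 / Math.log(b));
    console.log(b + ': ' + n + ' bytes -> ' + chars + ' chars, ' +
      (n * 8 / chars).toFixed(3) + ' bits/byte');
  }
}
scanUnits(85, 12); // 4 bytes -> 5 chars (6.4), 8 -> 10, 12 -> 15
scanUnits(90, 12); // 10 bytes -> 13 chars (~6.154)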


but, yeah, generally I suck at math as well...
I know enough to get through what I tend to do, but it is generally not all
that good by others' standards...


<--
I ran some tests with different bases and 'chunk' sizes (chunk meaning
a group of bytes encoded together, not sure what the proper term is),
and I didn't see anything more efficient than base 85 with 4 byte
chunks (5 chars encoded), just as you said. Here's the modified code I
used to test if anyone else wants to play around with it.
-->

fair enough...


 
Paul E. Schoen

nick said:
I wanted to try making a javascript compressor (more specifically,
implement a compression engine in javascript, not necessarily for
compressing js) ... but then I found this:

http://marklomas.net/ch-egg/articles/lzwjs.htm

So what do you guys think? Is this any better than base 64? It sure as
hell is ugly :)

I had always thought Base64 was used just to be able to send binary data in
text form, without control characters. Compression was a separate process.
There seem to be 94 ASCII characters (excluding Space) that could be used
for encoding. If all 8 bits of an unsigned character could be used, a base
128 would be possible, and it should be twice as efficient as base64.
http://code.google.com/apis/protocolbuffers/docs/encoding.html

I know that anyone can do a search and use the Wiki, but this seems to have
a good explanation:
http://en.wikipedia.org/wiki/Base64

I made a Base64 converter some time ago, perhaps using Delphi or C, but I
can't find it.

Paul
 
nick

"nick" <[email protected]> wrote in message
<--
Ah, crap, you just went way over my head. I suck at math... Sounds
like the trick here has to do with common denominators? What exactly
is the formula to calculate the efficiency of these things?
-->
number_of_bits / number_of_bytes.

Heh, sorry for being unclear... I got that part; I was wondering how
to calculate how many characters would be needed to represent a given
number of bytes when encoded in a given base with a given "chunk size"
without just encoding some data and checking the length. Looking for
something possibly like:

encoded_str_len = base (operator) chunk_size (operator) input_bytes
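
Maybe something like this comes close (just a sketch; it ignores the
separator spaces and the base-255 byte digits the code above uses, so
real output runs a little longer):

// characters needed to encode n input bytes, chunkSize bytes per chunk
function encodedLength(base, chunkSize, inputBytes) {
  // digits per full chunk: enough to cover 256^chunkSize distinct values
  var perChunk = Math.ceil(chunkSize * Math.log(256) / Math.log(base));
  return Math.ceil(inputBytes / chunkSize) * perChunk;
}
console.log(encodedLength(85, 4, 48)); // 12 chunks * 5 chars = 60
console.log(encodedLength(90, 4, 48)); // also 12 * 5 = 60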
 
nick

I had always thought Base64 was used just to be able to send binary data in
text form, without control characters. Compression was a separate process.

I think you may have misread the OP. I was a bit OT at the beginning
(trying to set context).
There seem to be 94 ASCII characters (excluding Space) that could be used
for encoding. If all 8 bits of an unsigned character could be used, a base
128 would be possible, and it should be twice as efficient as base64.
http://code.google.com/apis/protocolbuffers/docs/encoding.html

But characters like slash and quote can be troublesome. The way I
implemented it I could squeeze in one more character I think, but
base91 just sounded too silly.
 
Dr J R Stockton

In comp.lang.javascript message <[email protected]>,
I had always thought Base64 was used just to be able to send binary
data in text form, without control characters.
Yes.

Compression was a separate process. There seem to be 94 ASCII
characters (excluding Space) that could be used for encoding. If all 8
bits of an unsigned character could be used, a base 128 would be
possible, and it should be twice as efficient as base64.

No; a 7-bit character only contains about 15% more information than a
6-bit one.
 
Paul E. Schoen

Dr J R Stockton said:
In comp.lang.javascript message <[email protected]>,


No; a 7-bit character only contains about 15% more information than a
6-bit one.

I thought about that after I wrote and sent the post. The character contains
twice as many bits (128/64), but only an increase of 7/6 or 1.1667.

Paul
 
Guest

I thought about that after I wrote and sent the post. The character contains
twice as many bits (128/64), but only an increase of 7/6 or 1.1667.

Can represent twice as many values but only has one more bit.
 
Guest

I had always thought Base64 was used just to be able to send binary data in
text form, without control characters. Compression was a separate process.
There seem to be 94 ASCII characters (excluding Space) that could be used
for encoding.

It is easier (no math) to encode if the base is a power of two, since we
are working with binary. Take the least common multiple of 8 (bits per
byte) and the bits per digit of the base (6 for base 64): 24 bits, so 3
bytes map to 4 base64 digits just by taking the right bits. The diagram
below shows two such units, 6 bytes encoded as 8 base64 digits:

BITS FOR EIGHT BASE 64 DIGITS:
123456123456123456123456123456123456123456123456

48 BITS
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

BITS FOR SIX BASE 256 CHARACTERS:
123456781234567812345678123456781234567812345678

Which characters? 94? Be careful of characters which may not pass through
mail systems and ones which can be munged. UUENCODING was a base64-style
method for unix-to-unix transfers of data over plain text lines, but ...
using some IBM systems (which used EBCDIC character encoding) as relays
resulted in corruption. XXENCODING (I believe) was another base64 encoding,
changing the characters used as the base64 "digits" set. They had line
starters, terminators, checksums(?).

MIME base64 uses 64 digits guaranteed(?) to work (well, maybe not on
APPLE I's which only had upper case characters?).

yEnc encoding is now often used in newsgroups. It is pretty much raw 8 bit,
since "modern" news servers can handle it. There is some encoding, but
it expands the size very little. Of course, if you save such a file and
try to email it ... it may well fail (email systems may be much more
sensitive), and trying to use one of those back on the old GEnie system
(7 bit, even parity) would fail. FTPing such a file using Windows (is
the default still TEXT instead of IMAGE/BINARY?) may strip off the high
bits. Base64 encodings expand the file size by 33% (assuming you have to
use 8 bits to send one of the 64 base64 "digits") (compress the file first!).
You don't want the overhead of changing bases if it involves mathematical
operations (divide, etc.) or manipulation to avoid certain sensitive characters.
You just want a simple (fast, easy) selection (of bits).
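
In code, the "simple selection of bits" looks something like this (a
sketch for whole 3-byte groups only; padding and line breaks omitted):

// 3 bytes -> 4 base64 digits, purely by shifting and masking
var DIGITS =
  'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
function encode3(b0, b1, b2) {
  var n = (b0 << 16) | (b1 << 8) | b2;   // pack 24 bits
  return DIGITS.charAt((n >> 18) & 63) + // four 6-bit selections,
         DIGITS.charAt((n >> 12) & 63) + // no divides needed
         DIGITS.charAt((n >> 6) & 63) +
         DIGITS.charAt(n & 63);
}
console.log(encode3(77, 97, 110)); // "Man" -> "TWFu"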
 
Guest

Which characters? 94?

While using a base which is a power of two makes life very easy,
consider using a base where b^n is a little larger than a power
of two (rather than equal to a power of two). For example,
10^3 is about 2^10 so one can almost encode all 10 bit sequences
as three base 10 digits, but not quite as 10^3 is a bit smaller
than 2^10.

90^2=8100, 2^13=8192

That's a bit small. Better would be base 91.

91^2=8281, 2^13=8192

You can encode any sequence of thirteen bits as a pair of
base 91 digits. If you choose 91 "safe" characters to use
to represent those digits you can encode any sequence of
13 bits as a pair of those digits. If it takes 8 bits to
send each such character/digit, you can send any sequence
of 13 bits as those two characters (16 bits) (though you
will not use up all the possible pairs which can handle
numbers up through 8280 - you can use those other pairs
for control). This would be an increase of
(16-13)/13=23% (instead of the 33% of base 64 encoding).

But how do you convert a number (from 0 to 2^13-1) to
base 91 (it will take at most two "digits")? First extract
out the 13 bits (take 13 original bytes and split them into
8 sequences of 13 bits each). Convert each sequence of 13
bits into two base 91 digits and send those (16 bits to send
them). Convert to base 91? Divide by 91 (integer division)
to find the high digit and take it mod 91 to get the low
base 91 digit.
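
A sketch of that conversion (the SAFE alphabet below is only a
placeholder; picking 91 genuinely safe characters is a separate problem):

// one 13-bit value (0..8191) <-> two base 91 digits
// SAFE: the 94 printable ASCII characters minus " ' \
var SAFE = (function () {
  for (var s = '', c = 33; s.length < 91; c++)
    if (c != 34 && c != 39 && c != 92) s += String.fromCharCode(c);
  return s;
})();
function enc13(n) {                        // n in 0..8191
  return SAFE.charAt(Math.floor(n / 91)) + // high digit: integer divide
         SAFE.charAt(n % 91);              // low digit: mod 91
}
function dec13(pair) {
  return SAFE.indexOf(pair.charAt(0)) * 91 + SAFE.indexOf(pair.charAt(1));
}
console.log(dec13(enc13(8191)) == 8191);   // true; round-trips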

Which 91 characters are guaranteed to be safe on various
systems (including legacy systems using EBCDIC encoding
of which there are variants)?

Of course for any integer which is not a power of two,
ln(b)/ln(2) is irrational (even transcendental!) and so
there are some integers k,m so that k*ln(b) is as close
as you want but larger than m*ln(2) and you can use k
digits in base b to represent m digits in base 2.

How about base 83? A continued fraction expansion of
ln(83)/ln(2) shows that a good choice would be 8
base 83 digits to encode 51 bits:

83^8 = 2252292232139041
2^51 = 2251799813685248

and so one can encode 51 bits as 8 base 83 digits and
assuming that sending each digit takes 8 bits, you have
to send 64 bits (instead of 51) for an increase of
25.5% (if there aren't 91 "safe" characters to use as
digits but there are 83, one can encode with an overhead
of 25.5% instead of base 64's 33.3%). Of course, in
this case one has to convert a number from
0 through 2251799813685248-1
to base 83.
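
The arithmetic is easy to confirm (every intermediate product stays below
2^53, so plain doubles are exact here):

// 8 base-83 digits vs 51 bits
var p = 1;
for (var i = 0; i < 8; i++) p *= 83;     // 83^8
console.log(p);                          // 2252292232139041
console.log(Math.pow(2, 51));            // 2251799813685248
console.log(p >= Math.pow(2, 51));       // true: 51 bits fit
console.log((13 / 51 * 100).toFixed(1)); // 25.5 (% overhead)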
 
