N
Nils M Holm
I ran across some odd (as far as I can tell) behavior recently, when
performing some fun benchmarks. (Yeah, but please bear with me!)
The goal was to find out how bad exactly the code generated by my
toy compiler is, so I passed the following program to GNU C (GCC),
Portable C (PCC), and my toy compiler (let's call it SCC):
-----[ p.c ] ---------------------------------------------------
#include <stdio.h>
#define MAX 20000
main() {
int i, j, p[MAX], k;
for (k=0, i=3; k<MAX; i+=2) {
for (j=0; j<k; j++)
if (i % p[j] == 0)
break;
if (j < k) continue;
p[k++] = i;
}
printf("%d\n", p[k-1]);
}
----------------------------------------------------------------
Just a sieve algorithm to calculate the 20,000'th prime number,
nothing fancy, just some tight loops. Please note that I do NOT
intend to boast the code quality of SCC here. The code emitted by
it is in fact abysmally bad. So I was quite surprised to get the
following run times of the generated programs (median of three
runs, almost no deviation):
gcc -O2 p.c: 12.39s
pcc -O p.c: 12.42s
scc p.c: 12.27s
I can explain the difference between GCC/PCC and SCC by lower
startup overhead, but shouldn't the better code quality of PCC
and GCC compensate for this over a run time of 12 seconds?
I am rather puzzled now. Any ideas?
The output of my toy compiler compiler is attached below. You can
download the complete compiler source code here in case you want
to do some own experiments.
http://www.t3x.org/subc/
Any explanations or additional data (run times) would be greatly
appreciated.
-----[ compiler output (SubC) ]---------------------------------
.text
.globl Cmain
Cmain: pushl %ebp # main() {
movl %esp,%ebp
addl $-80012,%esp # int i, j, p[MAX], k;
movl $0,%eax # for (k=0, i=3;
movl %eax,-80012(%ebp)
movl $3,%eax
movl %eax,-4(%ebp)
L2: movl -80012(%ebp),%eax # k<MAX;
pushl %eax
movl $20000,%eax
xorl %edx,%edx
popl %ecx
cmpl %eax,%ecx
jge L6
incl %edx
L6: movl %edx,%eax
orl %eax,%eax
jnz L7
jmp L4
L7: jmp L3
L5: movl $2,%eax # i+=2) {
pushl %eax
movl -4(%ebp),%eax
popl %ecx
addl %ecx,%eax
movl %eax,-4(%ebp)
jmp L2
L3: movl $0,%eax # for (j=0;
movl %eax,-8(%ebp)
L8: movl -8(%ebp),%eax # j<k;
pushl %eax
movl -80012(%ebp),%eax
xorl %edx,%edx
popl %ecx
cmpl %eax,%ecx
jge L12
incl %edx
L12: movl %edx,%eax
orl %eax,%eax
jnz L13
jmp L10
L13: jmp L9
L11: movl -8(%ebp),%eax # j++) {
incl -8(%ebp)
jmp L8
L9: movl -4(%ebp),%eax # if (i % p[j] == 0)
pushl %eax
leal -80008(%ebp),%eax
pushl %eax
movl -8(%ebp),%eax
shll $2,%eax
popl %ecx
addl %ecx,%eax
movl (%eax),%eax
popl %ecx
xchgl %eax,%ecx
cdq
idivl %ecx
movl %edx,%eax
pushl %eax
movl $0,%eax
xorl %edx,%edx
popl %ecx
cmpl %eax,%ecx
jne L14
incl %edx
L14: movl %edx,%eax
orl %eax,%eax
jnz L16
jmp L15
L16: jmp L10 # break;
L15: jmp L11
L10: movl -8(%ebp),%eax # if (j < k)
pushl %eax
movl -80012(%ebp),%eax
xorl %edx,%edx
popl %ecx
cmpl %eax,%ecx
jge L17
incl %edx
L17: movl %edx,%eax
orl %eax,%eax
jnz L19
jmp L18
L19: jmp L5 # continue;
L18: leal -80008(%ebp),%eax # p[k++] = i;
pushl %eax
movl -80012(%ebp),%eax
incl -80012(%ebp)
shll $2,%eax
popl %ecx
addl %ecx,%eax
pushl %eax
movl -4(%ebp),%eax
popl %edx
movl %eax,(%edx)
jmp L5 # }
L4: .data
L20: .byte 37
.byte 'd'
.byte 10
.byte 0
.text # printf("%d\n", p[k-1]);
movl $L20,%eax
pushl %eax
leal -80008(%ebp),%eax
pushl %eax
movl -80012(%ebp),%eax
pushl %eax
movl $1,%eax
popl %ecx
xchgl %eax,%ecx
subl %ecx,%eax
shll $2,%eax
popl %ecx
addl %ecx,%eax
movl (%eax),%eax
pushl %eax
pushl $2
call Cprintf
addl $12,%esp
L1: addl $80012,%esp # }
popl %ebp
ret
performing some fun benchmarks. (Yeah, but please bear with me!)
The goal was to find out how bad exactly the code generated by my
toy compiler is, so I passed the following program to GNU C (GCC),
Portable C (PCC), and my toy compiler (let's call it SCC):
-----[ p.c ] ---------------------------------------------------
#include <stdio.h>
#define MAX 20000
main() {
int i, j, p[MAX], k;
for (k=0, i=3; k<MAX; i+=2) {
for (j=0; j<k; j++)
if (i % p[j] == 0)
break;
if (j < k) continue;
p[k++] = i;
}
printf("%d\n", p[k-1]);
}
----------------------------------------------------------------
Just a sieve algorithm to calculate the 20,000'th prime number,
nothing fancy, just some tight loops. Please note that I do NOT
intend to boast the code quality of SCC here. The code emitted by
it is in fact abysmally bad. So I was quite surprised to get the
following run times of the generated programs (median of three
runs, almost no deviation):
gcc -O2 p.c: 12.39s
pcc -O p.c: 12.42s
scc p.c: 12.27s
I can explain the difference between GCC/PCC and SCC by lower
startup overhead, but shouldn't the better code quality of PCC
and GCC compensate for this over a run time of 12 seconds?
I am rather puzzled now. Any ideas?
The output of my toy compiler compiler is attached below. You can
download the complete compiler source code here in case you want
to do some own experiments.
http://www.t3x.org/subc/
Any explanations or additional data (run times) would be greatly
appreciated.
-----[ compiler output (SubC) ]---------------------------------
.text
.globl Cmain
Cmain: pushl %ebp # main() {
movl %esp,%ebp
addl $-80012,%esp # int i, j, p[MAX], k;
movl $0,%eax # for (k=0, i=3;
movl %eax,-80012(%ebp)
movl $3,%eax
movl %eax,-4(%ebp)
L2: movl -80012(%ebp),%eax # k<MAX;
pushl %eax
movl $20000,%eax
xorl %edx,%edx
popl %ecx
cmpl %eax,%ecx
jge L6
incl %edx
L6: movl %edx,%eax
orl %eax,%eax
jnz L7
jmp L4
L7: jmp L3
L5: movl $2,%eax # i+=2) {
pushl %eax
movl -4(%ebp),%eax
popl %ecx
addl %ecx,%eax
movl %eax,-4(%ebp)
jmp L2
L3: movl $0,%eax # for (j=0;
movl %eax,-8(%ebp)
L8: movl -8(%ebp),%eax # j<k;
pushl %eax
movl -80012(%ebp),%eax
xorl %edx,%edx
popl %ecx
cmpl %eax,%ecx
jge L12
incl %edx
L12: movl %edx,%eax
orl %eax,%eax
jnz L13
jmp L10
L13: jmp L9
L11: movl -8(%ebp),%eax # j++) {
incl -8(%ebp)
jmp L8
L9: movl -4(%ebp),%eax # if (i % p[j] == 0)
pushl %eax
leal -80008(%ebp),%eax
pushl %eax
movl -8(%ebp),%eax
shll $2,%eax
popl %ecx
addl %ecx,%eax
movl (%eax),%eax
popl %ecx
xchgl %eax,%ecx
cdq
idivl %ecx
movl %edx,%eax
pushl %eax
movl $0,%eax
xorl %edx,%edx
popl %ecx
cmpl %eax,%ecx
jne L14
incl %edx
L14: movl %edx,%eax
orl %eax,%eax
jnz L16
jmp L15
L16: jmp L10 # break;
L15: jmp L11
L10: movl -8(%ebp),%eax # if (j < k)
pushl %eax
movl -80012(%ebp),%eax
xorl %edx,%edx
popl %ecx
cmpl %eax,%ecx
jge L17
incl %edx
L17: movl %edx,%eax
orl %eax,%eax
jnz L19
jmp L18
L19: jmp L5 # continue;
L18: leal -80008(%ebp),%eax # p[k++] = i;
pushl %eax
movl -80012(%ebp),%eax
incl -80012(%ebp)
shll $2,%eax
popl %ecx
addl %ecx,%eax
pushl %eax
movl -4(%ebp),%eax
popl %edx
movl %eax,(%edx)
jmp L5 # }
L4: .data
L20: .byte 37
.byte 'd'
.byte 10
.byte 0
.text # printf("%d\n", p[k-1]);
movl $L20,%eax
pushl %eax
leal -80008(%ebp),%eax
pushl %eax
movl -80012(%ebp),%eax
pushl %eax
movl $1,%eax
popl %ecx
xchgl %eax,%ecx
subl %ecx,%eax
shll $2,%eax
popl %ecx
addl %ecx,%eax
movl (%eax),%eax
pushl %eax
pushl $2
call Cprintf
addl $12,%esp
L1: addl $80012,%esp # }
popl %ebp
ret