J
jacob navia
Hi
I am doing a series of benchmarks against gcc, and I found out that the
results of my benchmark would change from version to version of gcc,
i.e. the 2.95 version would give different results than 3.4 version.
This is strange since under windows, MSVC, intel and lcc-win32
give exactly the same results...
This is not *per se* a bug, but I would like to know what
results you get. The program in question is a matrix
manipulation program that uses heavily floating point.
Just mail as an answer to this message the output you
get with a small note of machine, and gcc version (or
other compiler of course).
Thanks in advance. The program compiles under windows and linux
without modifications. I did not write this program, so please
ignore questions of style.
jacob
-----------------------------------------------cut here
/* SCCS ID @(#)ge.h 1.1 2/4/86 */
/***************************************************************
******************************************************************
**** Matrix data structure(s) for Gaussian Elimination ****
******************************************************************
****************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#ifdef __LCC__
#include <fenv.h>
#endif
double sasum(int n,double * sx,int incx );
int isamax( int n, double *sx, int incx );
/*
This file contains the definitions of the structures used in
various algorithms for doing Gaussian Elimination.
The following gives an array (of length 10) of pointers to doubles.
double *a[10];
Now assume that each a points to space for an array of doubles
(gotten
by a call to malloc, say).
Then the following is true:
a can be thought of as a pointer to the i-th array of
doubles,
*(a+j) is the j-th element of the i-th array.
The following shows how to reference things for the definition of
the FULL
structure given below.
a->cd is the value of (as apposed to a pointer to) the column
dimension.
a->rd is the value of (as apposed to a pointer to) the row dimension.
a->pd[j] is a pointer to the j-th column (an array of doubles).
*(a->pd[j]+i) is the i-th element of the j-th column, viz., a(i,j).
Here we think, as is natural in C, of all matrices and vectors indexed
from 0 insead of 1.
*/
#define MAXCOL 1000 /* Maximum number of Columns. */
struct FULL { /* Struct definition for the FULL matrix
structure. */
int cd; /* Column dimension of the matrix. */
int rd; /* Row Dimension of the matrix. */
double *pd[MAXCOL]; /* Array of pointers to the columns of a
matrix. */
};
void sgesl(struct FULL * a,int * ipvt,double b[],int job );
void saxpy(int n,double sa, double *sx,int incx,double * sy,int incy );
/* The following macro will get a(r,c) from a matrix in the FULL
structure. */
#define elem(a,r,c) (*(a.pd[(c)]+(r)))
/* The following macro will get a(r,c) from a pointer to a matrix
in the FULL structure. */
#define pelem(a,r,c) (*(a->pd[(c)]+(r)))
int sgeco( struct FULL *a,int * ipvt, double *rcond,double * z );
void matgen( int n,struct FULL * a,double * b );
int sgefa(struct FULL * a,int * ipvt );
double copysign(double,double);
double second(void);
double sdot(int n,double * sx,int incx,double * sy, int incy );
/***************************************************************
******************************************************************
**** Sample driver for SGEFA/SL and CO ****
******************************************************************
****************************************************************/
/*
* SYSTEM DEPENENT ROUTINES:
* double second(); Returns cpu time used in seconds
*/
#define MAXN 500 /* Maximum problem size. */
int main()
{
/* Storage for the linear system and associates. */
struct FULL a;
double x[MAXCOL], b[MAXCOL], z[MAXCOL], rcond;
int ipvt[MAXCOL];
#ifdef __LCC__
DoublePrecision(); // Set 53 bits preision
#endif
/* Local vars. */
double opsfa, opsco, opssl, xerrnrm, resnrm, xnrm;
double tfai, tfao, tsli, tslo, tcoi, tcoo;
int n, i, j, retval;
#ifdef sun
/* The following is a kludge for correct Sun FPA core dumps. */
unsigned newmode = 62464;
fpmode_(&newmode);
#endif
memset(&a,0,sizeof(a));
for( n=50; n<=MAXN; n+=50 ) {
/* Generate the linear system. */
matgen( n, &a, b );
opsfa = 1.0E-6*((2.0e0*(double)n*(double)n*(double)n)/3.0e0 +
2.0e0*(double)n*(double)n);
opsco = opsfa + 1.0E-6*(6.0e0*(double)n*(double)n);
opssl = 1.0e-6*(2.0e0*(double)n*(double)n);
/* Factor. */
tfai = second();
retval = sgefa( &a, ipvt );
tfao = second();
if( retval )
printf("Zero Column %d found.\n", retval );
else {
/* Solve the system. */
tsli = second();
sgesl( &a, ipvt, b, 0 );
tslo = second();
/* Compute a residual to verify results. */
for( i=0; i<n; i++ )
x = b;
matgen( n, &a, b );
for( j=0; j<n; j++ ) {
register double *col = a.pd[j];
register double xj = x[j];
for( i=0; i<n; i++ ) {
b -= col*xj;
}
}
xerrnrm = 0.0;
resnrm = 0.0;
for( i=0; i<n; i++ ) {
xerrnrm += ((double)x-1.0)*((double)x-1.0);
resnrm += (double)b*(double)b;
}
xnrm = sqrt( (double)n );
xerrnrm = sqrt( xerrnrm )/xnrm;
resnrm = sqrt( resnrm )/xnrm;
/* Now factor and est condition number. */
tcoi = second();
retval = sgeco( &a, ipvt, &rcond, z );
tcoo = second();
printf("\n\n\tLinpack Benchmark in C of size %d\n",n);
printf(" 1/COND(A) (Condition number of A) = %15e\n",rcond);
printf(" ||x-X||/||X|| (Solution Error) = %15e\n",xerrnrm);
printf(" ||b-Ax||/||X|| (Residual Error) = %15e\n",resnrm);
printf("\tWhere X = True solution and and x = computed solution\n");
}
}
return 0;
}
void matgen(int n,struct FULL * a,double * b )
{
/* This routine generates a struct FULL matrix */
int i, j, init = 1325;
/* Allocate/Deallocate space for the columns. */
for( i=0; i<n; i++ ) {
if( a->pd != NULL ) (void)free( a->pd );
if( (a->pd = (double *)malloc( (unsigned)n*sizeof(double) ))
== NULL ) {
fprintf(stderr, "MATGEN: Error allocating matrix for
n=%d\n",n);
exit( 1 );
}
}
/* Set the matrix elements and right hand side. */
a->cd = n;
a->rd = n;
for( j=0; j<n; j++ ) {
register double *col = a->pd[j];
for( i=0; i<n; i++ ) {
init = 3125*init % 65536;
col = ((double)init-32768.0)/16384.0;
}
}
for( j=0; j<n; j++ ) b[j] = 0.0;
for( j=0; j<n; j++ ) {
register double *col = a->pd[j];
for( i=0; i<n; i++ ) {
b += col;
}
}
}
/****************************************************************************
* Routines for getting elapsed CPU usage.
****************************************************************************/
#include <time.h>
static int epsilon=0;
double second(void)
/****************************************************************************
* Returns the total cpu time used in seconds.
* Emulates the Cray fortran library function of the same name.
****************************************************************************/
{
epsilon++;
return( (double)epsilon + ((double)clock())/1000.0 );
}
void sscal(int n,double sa,double * sx,int incx );
/***************************************************************
******************************************************************
**** Gaussian Elimination with partial pivoting ****
**** and condition number estimation. ****
**** This file contains the factorization driver and ****
**** contion number estimation routine SGECO ****
******************************************************************
****************************************************************/
int sgeco(struct FULL * a,int * ipvt,double * rcond,double * z )
/*
PURPOSE
SGECO factors a real matrix by gaussian elimination
and estimates the condition of the matrix.
REMARKS
If rcond is not needed, SGEFA is slightly faster.
to solve A*x = b , follow SGECO by SGESL.
To compute inverse(A)*c , follow SGECO by SGESL.
To compute determinant(A) , follow SGECO by SGEDI.
To compute inverse(A) , follow SGECO by SGEDI.
INPUT
a A pointer to the FULL matrix structure.
See the definition in ge.h.
OUTPUT
a A pointer to the FULL matrix structure containing
an upper triangular matrix and the multipliers
which were used to obtain it.
The factorization can be written a = l*u where
l is a product of permutation and unit lower
triangular matrices and u is upper triangular.
ipvt An integer vector (of length a->cd) of pivot indices.
rcond A double estimate of the reciprocal condition of A .
for the system A*x = b , relative perturbations
in A and b of size epsilon may cause
relative perturbations in x of size epsilon/rcond .
If rcond is so small that the logical expression
1.0 + rcond .eq. 1.0
is true, then a may be singular to working
precision. In particular, rcond is zero if
exact singularity is detected or the estimate
underflows.
z A double vector (of length a->cd) for a work vector
whose contents are usually unimportant. If A is
close to a singular matrix, then z is an approx-
imate null vector in the sense that:
norm(a*z) = rcond*norm(a)*norm(z) .
RETURNS
= -1 Matrix is not square.
= 0 Normal return value.
= k if u(k,k) .eq. 0.0 . This is not an error
condition for this subroutine, but it does
indicate that sgesl or sgedi will divide by zero
if called. Use rcond in sgeco for a reliable
indication of singularity.
ROUTINES
SGEFA(), blas sasum() and sdot(), copysign(), fabs();
WARNINGS
This routine uses the UN*X math library routines
copysign() and fabs().
*/
{
register int j;
int k, n, info ;
register double s;
double ek, anorm, ynorm;
n = a->cd;
/* Compute 1-norm of A. */
for( j=0, anorm=0.0; j<n; j++ ) {
register double sum;
sum = sasum( n, a->pd[j], 1 );
anorm = (double)sum > anorm ? (double)sum : anorm;
}
/* Factor A. */
info = sgefa( a, ipvt );
/*
* rcond = 1/(norm(a)*(estimate of norm(inverse(a)))) .
* estimate = norm(z)/norm(y) where a*z = y and trans(a)*y = e .
* trans(a) is the transpose of a . The components of e are
* chosen to cause maximum local growth in the elements of w where
* trans(u)*w = e . The vectors are frequently rescaled to avoid
* overflow.
*/
/* solve trans(u)*w = e */
ek = 1.0;
for( j=0; j<n; j++ ) {
z[j] = 0.0e0;
}
for( k=0; k<n; k++ ) {
register double zk = z[k];
double wk, wkm, sm;
int kp1 = k+1;
if( zk != 0.0 ) ek = (double)copysign( (double)ek, (double)-zk );
if( fabs( (double)(ek-zk) ) > fabs( (double)pelem(a,k,k) ) ) {
s = (double)fabs( (double)pelem(a,k,k) )/(double)fabs(
(double)(ek-zk) );
sscal( n, (double)s, z, 1 );
zk = z[k];
ek *= s;
}
wk = ek - zk;
wkm = -ek - zk;
s = (double)fabs( (double)wk );
sm = (double)fabs( (double)wkm );
if( pelem(a,k,k) == 0.0 ) {
wk = 1.0;
wkm = 1.0;
}
else {
wk /= pelem(a,k,k);
wkm /= pelem(a,k,k);
}
if( kp1<n ) {
for( j=kp1; j<n; j++ ) {
sm = sm + (double)fabs( (double)(z[j]+wkm*pelem(a,k,j)) );
z[j] += ((double)wk)*pelem(a,k,j);
s += (double)fabs( (double)z[j] );
}
if( s < sm ) {
register double t = wkm-wk;
wk = wkm;
for( j=kp1; j<n; j++ ) {
z[j] += t*pelem(a,k,j);
}
}
}
z[k] = wk;
}
s = 1.0/(double)sasum( n, z, 1 );
sscal( n, (double)s, z, 1 );
/* Solve trans(L)*y = w. */
for( k=n-1; k>=0; k-- ) {
register int l;
register double t;
if( k < n-1 ) z[k] += (double)sdot( n-k-1, a->pd[k]+k+1, 1,
(z+k+1), 1 );
if( fabs( (double)z[k] ) > 1.0 ) {
s = 1.0/(double)fabs( (double)z[k] );
sscal( n, (double)s, z, 1 );
}
l = ipvt[k];
t = z[l];
z[l] = z[k];
z[k] = t;
}
s = 1.0/(double)sasum( n, z, 1 );
sscal( n, (double)s, z, 1);
ynorm = 1.0;
/* solve L*v = y. */
for( k=0; k<n; k++ ) {
register int l;
register double t;
l = ipvt[k];
t = z[l];
z[l] = z[k];
z[k] = t;
if( k < n-1 ) saxpy( n-k-1, (double)t, (a->pd[k]+k+1), 1,
(z+k+1), 1 );
if( fabs( (double)z[k] ) > 1.0) {
s = 1.0/(double)fabs( (double)z[k] );
sscal( n, (double)s, z, 1 );
ynorm *= s;
}
}
s = 1.0/(double)sasum( n, z, 1 );
sscal( n, (double)s, z, 1 );
ynorm *= s;
/* Solve U*z = v. */
for( k=n-1; k>=0; k-- ) {
register double t;
if( fabs( (double)z[k] ) > fabs( (double)pelem(a,k,k) ) ) {
s = (double)fabs( (double)pelem(a,k,k) )/(double)fabs(
(double)z[k] );
sscal( n, (double)s, z, 1 );
ynorm *= s;
}
if( pelem(a,k,k) == 0.0 ) z[k] = 1.0;
else z[k] /= pelem(a,k,k);
t = -z[k];
saxpy( k, (double)t, a->pd[k], 1, z, 1 );
}
/* Make znorm = 1.0. */
s = 1.0/(double)sasum( n, z, 1 );
sscal( n, (double)s, z, 1 );
ynorm *= s;
if( anorm == 0.0e0) *rcond = 0.0;
else *rcond = ynorm/anorm;
return( info );
}
/***************************************************************
******************************************************************
**** Gaussian Elimination with partial pivoting. ****
**** This file contains the factorization routine SGEFA ****
******************************************************************
****************************************************************/
int sgefa(struct FULL *a,int * ipvt )
/*
PURPOSE
SGEFA factors a real matrix by gaussian elimination.
REMARKS
SGEFA is usually called by SGECO, but it can be called
directly with a saving in time if rcond is not needed.
(time for SGECO) = (1 + 9/n)*(time for SGEFA) .
INPUT
a A pointer to the FULL matrix structure.
See the definition in ge.h.
OUTPUT
a A pointer to the FULL matrix structure containing
an upper triangular matrix and the multipliers
which were used to obtain it.
The factorization can be written a = l*u where
l is a product of permutation and unit lower
triangular matrices and u is upper triangular.
ipvt An integer vector (of length a->cd) of pivot indices.
RETURNS
= -1 Matrix is not square.
= 0 Normal return value.
= k if u(k,k) .eq. 0.0 . This is not an error
condition for this subroutine, but it does
indicate that sgesl or sgedi will divide by zero
if called. Use rcond in sgeco for a reliable
indication of singularity.
ROUTINES
blas ISAMAX
*/
{
register int i, j;
int isamax(), k, l, nm1, info, n;
double *akk, *alk;
register double t, *mik;
/* Gaussian elimination with partial pivoting. */
if( a->cd != a->rd ) return( -1 );
n = a->cd;
nm1 = n - 1;
akk = a->pd[0];
info = 0; /* Assume nothing will go wrong! */
if( n < 2 ) goto CLEAN_UP;
/* Loop over Diagional */
for( k=0; k<nm1; k++, ipvt++ ) {
/* Find index of max elem in col below the diagional (l = pivot
index). */
akk = a->pd[k] + k;
l = isamax( n-k, akk, 1 ) + k;
*ipvt = l;
/* Zero pivot implies this column already triangularized. */
alk = a->pd[k] + l;
if( *alk == 0.0e0) {
info = k;
continue;
}
/* Interchange a(k,k) and a(l,k) if necessary. */
if( l != k ) {
t = *alk;
*alk = *akk;
*akk = t;
}
/* Compute multipliers for this column. */
t = -1.0e0 / (*akk);
for( i=k+1, mik=a->pd[k]; i<n; i++ )
mik *= t;
/* Column elimination with row indexing. */
if( l != k ) {
/* Interchange a(k,j) and a(l,j) if necessary. */
for( j=k+1; j<n; j++ ) {
t = pelem(a,k,j);
pelem(a,k,j) = pelem(a,l,j);
pelem(a,l,j) = t;
}
}
for( j=k+1; j<n; j++ ) {
register double *aij = a->pd[j];
t = pelem(a,k,j);
for( i=k+1, mik=a->pd[k]; i<n; i++ )
aij += t*mik;
}
} /* End of for k loop */
CLEAN_UP:
*ipvt = nm1;
if( *akk == 0.0e0 ) info = n;
return( info );
}
/***************************************************************
******************************************************************
**** Gaussian Elimination with partial pivoting. ****
**** This file contains the solution routine SGESL ****
******************************************************************
****************************************************************/
void sgesl(struct FULL * a,int * ipvt,double b[],int job )
/*
PURPOSE
SGESL solves the real system
a * x = b or trans(a) * x = b
using the factors computed by SGECO or SGEFA.
INPUT
a A pointer to the FULL matrix structure containing the
factored
matrix. See the definition of FULL in ge.h.
ipvt The pivot vector (of length a->cd) from SGECO or SGEFA.
b The right hand side vector (of length a->cd).
job = 0 to solve a*x = b ,
= nonzero to solve trans(a)*x = b where
trans(a) is the transpose.
OUTPUT
b The solution vector x.
REMARKS
Error condition:
A division by zero will occur if the input factor contains a
zero on the diagonal. Technically this indicates singularity
but it is often caused by improper arguments or improper
setting of lda . It will not occur if the subroutines are
called correctly and if sgeco has set rcond .gt. 0.0
or sgefa has set info .eq. 0 .
*/
{
register double t, *mik;
double *akk;
register int i, k;
int l, n, nm1;
n = a->cd;
nm1 = n - 1;
/* job = 0 , solve A * x = b. */
if( job == 0 ) {
/* Forward elimination. Solve L*y = b. */
for( k=0; k<nm1; k++ ) {
akk = a->pd[k] + k; /* akk points to a(k,k). */
/* Interchange b[k] and b[l] if necessary. */
l = ipvt[k];
t = b[l];
if( l != k ) {
b[l] = b[k];
b[k] = t;
}
for( i=k+1, mik=a->pd[k]; i<n; i++ )
b += t*mik;
}
/* Back substitution. Solve U*x = y. */
for( k=nm1; k>=0; k-- ) {
register double *uik = a->pd[k];
akk = uik + k;
b[k] /= (*akk);
for( i=0; i<k; i++ )
b -= uik*b[k];
}
return;
}
/* job = nonzero. Solve trans(A) * x = b. */
/* First solve trans(U)*y = b. */
for( k=0; k<n; k++ ) {
register double *uik = a->pd[k];
akk = uik + k;
t = 0.0;
for( i=0; i<k; i++ )
t += uik*b;
b[k] = (b[k] - t) / (*akk);
}
/* b now contains y. */
/* Solve trans(L)*x = y. */
for( k=n-2; k>=0; k-- ) {
mik = a->pd[k];
t = 0.0;
for( i=k+1; i<n; i++ )
t += mik*b;
b[k] += t;
/* Interchange b(k) and b(ipvt(k)) if necessary. */
l = ipvt[k];
if( l == k ) continue;
t = b[l];
b[l] = b[k];
b[k] = t;
}
return;
}
/***************************************************************
*****************************************************************
*******************************************************************
***** *****
***** BLAS *****
***** Basic Linear Algebra Subroutines *****
***** Written in the C Programming Language. *****
***** *****
***** Functions include: *****
***** isamax, sasum, saxpy, scopy, sdot, snrm2, *****
***** *****
***** In addition a few other routines are included: *****
***** vexopy, vfill *****
***** *****
***** If your 3M library does not have the copysign function *****
***** then compile this file with -DCOPYSIGN and one will be *****
***** be supplied. *****
*******************************************************************
*****************************************************************
***************************************************************/
#ifndef HUGEsp
#define HUGEsp 1.0e+38
#endif
#ifndef SMALLsp
#define SMALLsp 1.0e-45
#endif
int isamax(int n, double *sx,int incx )
/*
PURPOSE
Finds the index of element having max. absolute value.
INPUT
n Number of elements to check.
sx Vector to be checked.
incx Every incx-th element is checked.
*/
{
register double smax = 0.0e0;
register int i, istmp = 0;
#ifndef abs
#define abs(x) ((x)<0.0?-(x)
x))
#endif
#ifdef alliant
int isamax_();
istmp = isamax_( &n, sx, &incx )-1; /* Remember 0 is first. */
return( istmp );
#else
if( n <= 1 ) return( istmp );
if( incx != 1 ) {
/* Code for increment not equal to 1. */
if( incx < 0 ) sx = sx + ((-n+1)*incx + 1);
istmp = 0;
smax = abs( *sx );
sx += incx;
for( i=1; i<n; i++, sx+=incx )
if( abs( *sx ) > smax ) {
istmp = i;
smax = abs( *sx );
}
return( istmp );
}
/* Code for increment equal to 1. */
istmp = 0;
smax = abs(*sx);
sx++;
for( i=1; i<n; i++, sx++ )
if( abs( *sx ) > smax ) {
istmp = i;
smax = abs( *sx );
}
return( istmp );
#endif
}
double sasum(int n,double * sx,int incx )
/*
PURPOSE
Returns sum of magnitudes of single precision SX.
sasum = sum from 0 to n-1 of ABS(SX(1+I*INCX))
INPUT
n Number of elements to multiply.
sx Pointer to double vector to take abs sum of.
incx Storage incrament for sx.
RETURNS
sasum Double variable with the result.
WARNINGS
This routine uses the UN*X math library function fabs().
*/
{
#ifndef abs
#define abs(x) ((x)<0.0?-(x)
x))
#endif
register double sum = 0.0;
if( n<= 0 ) return( sum );
if( incx == 1 ) {
register int i, m;
/* Code for increments equal to 1. */
/* Clean-up loop so remaining vector length is a multiple of 6. */
m = n % 6;
if( m != 0 ) {
for( i=0; i<m; i++ )
sum += abs( sx );
if( n < 6 ) return( (double)sum );
}
for( i=m; i<n; i+=6 ) {
sum += abs( sx ) + abs( sx[i+1]) + abs( sx[i+2] ) +
abs( sx[i+3] ) + abs( sx[i+4] ) + abs( sx[i+5] );
}
return( (double)sum );
}
else {
register int i, ns;
/* Code for increments not equal to 1. */
ns = n*incx;
for( i=0; i<ns; i+=incx ) {
sum += abs( sx );
}
return( (double)sum );
}
}
void saxpy(int n,double sa, double *sx,int incx,double * sy,int incy )
/*
PURPOSE
Vector times a scalar plus a vector. sy = sy + sa*sx.
INPUT
n Number of elements to multiply.
sa Scalar to multiply by (note that this is a double).
sx Pointer to double vector to scale.
incx Storage incrament for sx.
sy Pointer to double vector to add.
incy Storage incrament for sy.
OUTPUT
sy sy = sy + sa*sx
*/
{
register int i;
register double ssa = (double)sa;
if( n<=0 || ssa==0.0 ) return;
if( incx == incy ) {
if( incx == 1 ) {
/* Both increments = 1 */
for( i=0; i<n; i++ )
sy += ssa*sx;
return;
}
if( incx>0 ) {
/* Equal, positive, non-unit increments. */
for( i=0; i<n; i++,sx+=incx,sy+=incx )
*sy += ssa*(*sx);
return;
}
}
/* Unequal or negative increments. */
if( incx < 0 ) sx += ((-n+1)*incx + 1);
if( incy < 0 ) sy += ((-n+1)*incy + 1);
for( i=0; i<n; i++,sx+=incx,sy+=incy )
*sy += ssa*(*sx);
}
void saxpyx( int n, double sa, double *sx,int incx,double * sy,int incy )
/*
PURPOSE
Vector times a scalar plus a vector. sx = sy + sa*sx.
INPUT
n Number of elements to multiply.
sa Scalar to multiply by (note that this is a double).
sx Pointer to double vector to scale.
incx Storage incrament for sx.
sy Pointer to double vector to add.
incy Storage incrament for sy.
OUTPUT
sx sx = sy + sa*sx
*/
{
register i;
register double ssa = (double)sa;
if( n<=0 || ssa==0.0 ) return;
if( incx == incy ) {
if( incx == 1 ) {
/* Both increments = 1 */
for( i=0; i<n; i++ )
sx = sy + ssa*sx;
return;
}
if( incx>0 ) {
/* Equal, positive, non-unit increments. */
for( i=0; i<n; i++, sx+=incx, sy+=incx)
*sx = *sy + ssa*(*sx);
return;
}
}
/* Unequal or negative increments. */
if( incx < 0 ) sx += ((-n+1)*incx + 1);
if( incy < 0 ) sy += ((-n+1)*incy + 1);
for( i=0; i<n; i++,sx+=incx,sy+=incy )
*sx = *sy + ssa*(*sx);
}
void scopy(int n,double * sx,int incx, double *sy,int incy )
/*
PURPOSE
Copies vector sx into vector sy.
INPUT
n Number of components to copy.
sx Source vector
incx Index increment for sx.
incy Index increment for sy.
OUTPUT
sy Destination vector.
*/
{
register int i;
if( n<1 ) return;
if( incx == incy ) {
if( incx == 1 ) {
/* Both increments = 1 */
for( i=0; i<n; i++ )
sy = sx;
return;
}
if( incx > 0 ) {
/* Equal, positive, non-unit increments. */
for( i=0; i<n; i++, sx+=incx, sy+=incx)
*sy = *sx;
return;
}
}
/* Non-equal or negative increments. */
if( incx < 0 ) sx += ((-n+1)*incx + 1);
if( incy < 0 ) sy += ((-n+1)*incy + 1);
for( i=0; i<n; i++,sx+=incx,sy+=incy )
(*sx) = (*sy);
return;
}
double sdot(int n,double * sx,int incx,double * sy, int incy )
/*
PURPOSE
Forms the dot product of a vector.
INPUT
n Number of elements to sum.
sx Address of first element of x vector.
incx Incrament for the x vector.
sy Address of first element of y vector.
incy incrament for the y vector.
OUPUT
sdot Dot product x and y. Double returned
due to `C' language features.
*/
{
register int i;
register double stemp = 0.0e0;
if( n<1 ) return( (double)stemp );
if( incx == incy ) {
if( incx == 1 ) {
/* Both increments = 1 */
for( i=0; i<n; i++ )
stemp += sx*sy;
return( (double)stemp );
}
if( incx>0 ) {
/* Equal, positive, non-unit increments. */
for( i=0; i<n; i++, sx+=incx, sy+=incx)
stemp += (*sx)*(*sy);
return( (double)stemp );
}
}
/* Unequal or negative increments. */
if( incx < 0 ) sx += ((-n+1)*incx + 1);
if( incy < 0 ) sy += ((-n+1)*incy + 1);
for( i=0; i<n; i++,sx+=incx,sy+=incy )
stemp += (*sx)*(*sy);
return( (double)stemp );
} /* End of ---SDOT--- */
double snrm2( int n,double * sx,int incx )
/*
PURPOSE
Computes the Euclidean norm of sx while being
very careful of distructive underflow and overflow.
INPUT
n Number of elements to use.
sx Address of first element of x vector.
incx Incrament for the x vector (>0).
OUPUT
snrm2 Euclidean norm of sx. Returns double
due to `C' language features.
REMARKS
This algorithm proceeds in four steps.
1) scan zero components.
2) do phase 2 when component is near underflow.
*/
{
register int i;
static double cutlo, cuthi;
double sum = 0.0e0, hitst, r1mach();
double xmax;
if( n<1 || incx<1 ) return( sum );
/* Calculate near underflow */
if( cutlo == 0.0 ) cutlo = sqrt( SMALLsp/r1mach() );
/* Calculate near overflow */
if( cuthi == 0.0 ) cuthi = sqrt( HUGEsp );
hitst = cuthi/(double) n;
i = 0;
/* Zero Sum. */
while( *sx == 0.0 && i<n ) {
i++;
sx += incx;
}
if( i>=n ) return( sum );
START:
if( abs( *sx ) > cutlo ) {
for( ; i<n; i++, sx+=incx ) { /* Loop over elements. */
if( abs( *sx ) > hitst ) goto GOT_LARGE;
sum += (*sx) * (*sx);
}
sum = sqrt( sum );
return( sum ); /* Sum completed
normaly. */
}
else { /* Small sum prepare
for phase 2. */
xmax = abs( *sx );
sx += incx;
i++;
sum += 1.0;
for( ; i<n; i++, sx+=incx ) {
if( abs( *sx ) > cutlo ) { /* Got normal elem.
Rescale and process. */
sum = (sum*xmax)*xmax;
goto START;
}
if( abs( *sx ) > xmax ) {
sum = 1.0 + sum*(xmax /(*sx))*(xmax /(*sx));
xmax = abs( *sx );
continue;
}
sum += ((*sx)/xmax)*((*sx)/xmax);
}
return( (double)xmax*sqrt( sum ) );
} /* End of small sum. */
GOT_LARGE:
sum = 1.0 + (sum/(*sx))/(*sx); /* Rescale and process. */
xmax = abs( *sx );
sx += incx;
i++;
for( ; i<n; i++, sx+=incx ) {
if( abs( *sx ) > xmax ) {
sum = 1.0 + sum*(xmax /(*sx))*(xmax /(*sx));
xmax = abs( *sx );
continue;
}
sum += ((*sx)/xmax)*((*sx)/xmax);
}
return( (double)xmax*sqrt( sum ) ); /* End of small sum. */
} /* End of ---SDOT--- */
double r1mach()
/***********************************************************************
**** This routine computes the unit roundoff for single precision
**** of the machine. This is defined as the smallest positive
**** machine number u such that 1.0 + u .ne. 1.0
**** Returns a double due to `C' language features.
**********************************************************************/
{
double u = 1.0e0, comp;
do {
u *= 0.5e0;
comp = 1.0e0 + u;
}
while( comp != 1.0e0 );
return( (double)u*2.0e0 );
} /*-------------------- end of function r1mach ------------------------*/
int min0(int n,int a,int b,int c,int d,int e,int f,int g,int h,int i,int
j,int k,int l,int m,int o,int p )
/*
PURPOSE
Determine the minimum of the arguments a-p.
INPUT
n Number of arguments to check 1 <= n <= 15.
a-p Integer arguments of which the minumum is desired.
RETURNS
min0 Minimum of a thru p.
*/
{
int mt;
if( n < 1 || n > 15 ) return( -1 );
mt = a;
if( n == 1 ) return( mt );
if( mt > b ) mt = b;
if( n == 2 ) return( mt );
if( mt > c ) mt = c;
if( n == 3 ) return( mt );
if( mt > d ) mt = d;
if( n == 4 ) return( mt );
if( mt > e ) mt = e;
if( n == 5 ) return( mt );
if( mt > f ) mt = f;
if( n == 6 ) return( mt );
if( mt > g ) mt = g;
if( n == 7 ) return( mt );
if( mt > h ) mt = h;
if( n == 8 ) return( mt );
if( mt > i ) mt = i;
if( n == 9 ) return( mt );
if( mt > j ) mt = j;
if( n == 10 ) return( mt );
if( mt > k ) mt = k;
if( n == 11 ) return( mt );
if( mt > l ) mt = l;
if( n == 12 ) return( mt );
if( mt > m ) mt = m;
if( n == 13 ) return( mt );
if( mt > o ) mt = o;
if( n == 14 ) return( mt );
if( mt > p ) mt = p;
return( mt );
}
void sscal(int n,double sa,double * sx,int incx )
/*
PURPOSE
Scales a vector by a constant.
INPUT
n Number of components to scale.
sa Scale value (note that this is a double).
sx Vector to scale.
incx Every incx-th element of sx will be scaled.
OUTPUT
sx Scaled vector.
*/
{
register double ssa = (double)sa;
int i;
#ifdef alliant
void sscal_();
double ssaa = (double)sa;
sscal_( &n, &ssaa, sx, &incx );
return;
#else
if( n < 1 ) return;
/* Code for increment not equal to 1.*/
if( incx != 1 ) {
if( incx < 0 ) sx += (-n+1)*incx;
for( i=0; i<n; i++, sx+=incx )
*sx *= ssa;
return;
}
/* Code for unit increment. */
for( i=0; i<n; i++ )
sx *= ssa;
return;
#endif
}
void vexopy(int n,double * v,double * x,double * y,int itype )
/*
Purpose:
To operate on the vectors x and y.
Input:
n Number of elements to scale.
x First operand vector.
y Second operand vector.
itype Type of operation to perform:
itype = 1 => '+'
itype = 2 => '-'
Output:
v Result vector of x op y.
*/
{
register int i;
if( n<1 ) return;
if( itype == 1 ) /* ADDITION. */
for( i=0; i<n; i++ )
v = x + y;
else /* SUBTRACTION. */
for( i=0; i<n; i++ )
v = x - y;
}
void vfill(int n,double * v,double val )
/*
Purpose
To fill the FLOAT vector v with the value val.
Make sure that if you pass a value for val that you cast
it to double, viz.,
vfill( 100, vector, (double)0.0 );
*/
{
register int i;
register double vval = (double)val;
if( n<1 ) return;
for( i=0; i<n; i++ )
v = vval;
}
#if 1
double copysign(double x,double y )
/*
PURPOSE
This routine is recomended by the IEEE standard 754. Most C
3M libraries contain this function. If yours does not then
compile this file with the -DCOPYSIGN option and the following
code will be generated. This routine copies the sign from y
onto x.
INPUT
x Thing to have it's sign changed.
y Variable to supply the sign.
RETURNS
copysign Returns x with the sign of y (the fortran intrinsic
sign(x,y)).
*/
{
/* Registers are used for reduced memory traffic. */
register double xx = x;
register double yy = y;
if( xx*yy < 0.0 ) return( -xx );
else return( xx );
}
#endif
I am doing a series of benchmarks against gcc, and I found out that the
results of my benchmark would change from version to version of gcc,
i.e. the 2.95 version would give different results than 3.4 version.
This is strange since under windows, MSVC, intel and lcc-win32
give exactly the same results...
This is not *per se* a bug, but I would like to know what
results you get. The program in question is a matrix
manipulation program that uses heavily floating point.
Just mail as an answer to this message the output you
get with a small note of machine, and gcc version (or
other compiler of course).
Thanks in advance. The program compiles under windows and linux
without modifications. I did not write this program, so please
ignore questions of style.
jacob
-----------------------------------------------cut here
/* SCCS ID @(#)ge.h 1.1 2/4/86 */
/***************************************************************
******************************************************************
**** Matrix data structure(s) for Gaussian Elimination ****
******************************************************************
****************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#ifdef __LCC__
#include <fenv.h>
#endif
double sasum(int n,double * sx,int incx );
int isamax( int n, double *sx, int incx );
/*
This file contains the definitions of the structures used in
various algorithms for doing Gaussian Elimination.
The following gives an array (of length 10) of pointers to doubles.
double *a[10];
Now assume that each a points to space for an array of doubles
(gotten
by a call to malloc, say).
Then the following is true:
a can be thought of as a pointer to the i-th array of
doubles,
*(a+j) is the j-th element of the i-th array.
The following shows how to reference things for the definition of
the FULL
structure given below.
a->cd is the value of (as apposed to a pointer to) the column
dimension.
a->rd is the value of (as apposed to a pointer to) the row dimension.
a->pd[j] is a pointer to the j-th column (an array of doubles).
*(a->pd[j]+i) is the i-th element of the j-th column, viz., a(i,j).
Here we think, as is natural in C, of all matrices and vectors indexed
from 0 insead of 1.
*/
#define MAXCOL 1000 /* Maximum number of Columns. */
struct FULL { /* Struct definition for the FULL matrix
structure. */
int cd; /* Column dimension of the matrix. */
int rd; /* Row Dimension of the matrix. */
double *pd[MAXCOL]; /* Array of pointers to the columns of a
matrix. */
};
void sgesl(struct FULL * a,int * ipvt,double b[],int job );
void saxpy(int n,double sa, double *sx,int incx,double * sy,int incy );
/* The following macro will get a(r,c) from a matrix in the FULL
structure. */
#define elem(a,r,c) (*(a.pd[(c)]+(r)))
/* The following macro will get a(r,c) from a pointer to a matrix
in the FULL structure. */
#define pelem(a,r,c) (*(a->pd[(c)]+(r)))
int sgeco( struct FULL *a,int * ipvt, double *rcond,double * z );
void matgen( int n,struct FULL * a,double * b );
int sgefa(struct FULL * a,int * ipvt );
double copysign(double,double);
double second(void);
double sdot(int n,double * sx,int incx,double * sy, int incy );
/***************************************************************
******************************************************************
**** Sample driver for SGEFA/SL and CO ****
******************************************************************
****************************************************************/
/*
* SYSTEM DEPENENT ROUTINES:
* double second(); Returns cpu time used in seconds
*/
#define MAXN 500 /* Maximum problem size. */
int main()
{
/* Storage for the linear system and associates. */
struct FULL a;
double x[MAXCOL], b[MAXCOL], z[MAXCOL], rcond;
int ipvt[MAXCOL];
#ifdef __LCC__
DoublePrecision(); // Set 53 bits preision
#endif
/* Local vars. */
double opsfa, opsco, opssl, xerrnrm, resnrm, xnrm;
double tfai, tfao, tsli, tslo, tcoi, tcoo;
int n, i, j, retval;
#ifdef sun
/* The following is a kludge for correct Sun FPA core dumps. */
unsigned newmode = 62464;
fpmode_(&newmode);
#endif
memset(&a,0,sizeof(a));
for( n=50; n<=MAXN; n+=50 ) {
/* Generate the linear system. */
matgen( n, &a, b );
opsfa = 1.0E-6*((2.0e0*(double)n*(double)n*(double)n)/3.0e0 +
2.0e0*(double)n*(double)n);
opsco = opsfa + 1.0E-6*(6.0e0*(double)n*(double)n);
opssl = 1.0e-6*(2.0e0*(double)n*(double)n);
/* Factor. */
tfai = second();
retval = sgefa( &a, ipvt );
tfao = second();
if( retval )
printf("Zero Column %d found.\n", retval );
else {
/* Solve the system. */
tsli = second();
sgesl( &a, ipvt, b, 0 );
tslo = second();
/* Compute a residual to verify results. */
for( i=0; i<n; i++ )
x = b;
matgen( n, &a, b );
for( j=0; j<n; j++ ) {
register double *col = a.pd[j];
register double xj = x[j];
for( i=0; i<n; i++ ) {
b -= col*xj;
}
}
xerrnrm = 0.0;
resnrm = 0.0;
for( i=0; i<n; i++ ) {
xerrnrm += ((double)x-1.0)*((double)x-1.0);
resnrm += (double)b*(double)b;
}
xnrm = sqrt( (double)n );
xerrnrm = sqrt( xerrnrm )/xnrm;
resnrm = sqrt( resnrm )/xnrm;
/* Now factor and est condition number. */
tcoi = second();
retval = sgeco( &a, ipvt, &rcond, z );
tcoo = second();
printf("\n\n\tLinpack Benchmark in C of size %d\n",n);
printf(" 1/COND(A) (Condition number of A) = %15e\n",rcond);
printf(" ||x-X||/||X|| (Solution Error) = %15e\n",xerrnrm);
printf(" ||b-Ax||/||X|| (Residual Error) = %15e\n",resnrm);
printf("\tWhere X = True solution and and x = computed solution\n");
}
}
return 0;
}
void matgen(int n,struct FULL * a,double * b )
{
/* This routine generates a struct FULL matrix */
int i, j, init = 1325;
/* Allocate/Deallocate space for the columns. */
for( i=0; i<n; i++ ) {
if( a->pd != NULL ) (void)free( a->pd );
if( (a->pd = (double *)malloc( (unsigned)n*sizeof(double) ))
== NULL ) {
fprintf(stderr, "MATGEN: Error allocating matrix for
n=%d\n",n);
exit( 1 );
}
}
/* Set the matrix elements and right hand side. */
a->cd = n;
a->rd = n;
for( j=0; j<n; j++ ) {
register double *col = a->pd[j];
for( i=0; i<n; i++ ) {
init = 3125*init % 65536;
col = ((double)init-32768.0)/16384.0;
}
}
for( j=0; j<n; j++ ) b[j] = 0.0;
for( j=0; j<n; j++ ) {
register double *col = a->pd[j];
for( i=0; i<n; i++ ) {
b += col;
}
}
}
/****************************************************************************
* Routines for getting elapsed CPU usage.
****************************************************************************/
#include <time.h>
static int epsilon=0;
double second(void)
/****************************************************************************
* Returns the total cpu time used in seconds.
* Emulates the Cray fortran library function of the same name.
****************************************************************************/
{
epsilon++;
return( (double)epsilon + ((double)clock())/1000.0 );
}
void sscal(int n,double sa,double * sx,int incx );
/***************************************************************
******************************************************************
**** Gaussian Elimination with partial pivoting ****
**** and condition number estimation. ****
**** This file contains the factorization driver and ****
**** contion number estimation routine SGECO ****
******************************************************************
****************************************************************/
int sgeco(struct FULL * a,int * ipvt,double * rcond,double * z )
/*
PURPOSE
SGECO factors a real matrix by gaussian elimination
and estimates the condition of the matrix.
REMARKS
If rcond is not needed, SGEFA is slightly faster.
to solve A*x = b , follow SGECO by SGESL.
To compute inverse(A)*c , follow SGECO by SGESL.
To compute determinant(A) , follow SGECO by SGEDI.
To compute inverse(A) , follow SGECO by SGEDI.
INPUT
a A pointer to the FULL matrix structure.
See the definition in ge.h.
OUTPUT
a A pointer to the FULL matrix structure containing
an upper triangular matrix and the multipliers
which were used to obtain it.
The factorization can be written a = l*u where
l is a product of permutation and unit lower
triangular matrices and u is upper triangular.
ipvt An integer vector (of length a->cd) of pivot indices.
rcond A double estimate of the reciprocal condition of A .
for the system A*x = b , relative perturbations
in A and b of size epsilon may cause
relative perturbations in x of size epsilon/rcond .
If rcond is so small that the logical expression
1.0 + rcond .eq. 1.0
is true, then a may be singular to working
precision. In particular, rcond is zero if
exact singularity is detected or the estimate
underflows.
z A double vector (of length a->cd) for a work vector
whose contents are usually unimportant. If A is
close to a singular matrix, then z is an approx-
imate null vector in the sense that:
norm(a*z) = rcond*norm(a)*norm(z) .
RETURNS
= -1 Matrix is not square.
= 0 Normal return value.
= k if u(k,k) .eq. 0.0 . This is not an error
condition for this subroutine, but it does
indicate that sgesl or sgedi will divide by zero
if called. Use rcond in sgeco for a reliable
indication of singularity.
ROUTINES
SGEFA(), blas sasum() and sdot(), copysign(), fabs();
WARNINGS
This routine uses the UN*X math library routines
copysign() and fabs().
*/
{
register int j;
int k, n, info ;
register double s;
double ek, anorm, ynorm;
n = a->cd;
/* Compute 1-norm of A. */
for( j=0, anorm=0.0; j<n; j++ ) {
register double sum;
sum = sasum( n, a->pd[j], 1 );
anorm = (double)sum > anorm ? (double)sum : anorm;
}
/* Factor A. */
info = sgefa( a, ipvt );
/*
* rcond = 1/(norm(a)*(estimate of norm(inverse(a)))) .
* estimate = norm(z)/norm(y) where a*z = y and trans(a)*y = e .
* trans(a) is the transpose of a . The components of e are
* chosen to cause maximum local growth in the elements of w where
* trans(u)*w = e . The vectors are frequently rescaled to avoid
* overflow.
*/
/* solve trans(u)*w = e */
ek = 1.0;
for( j=0; j<n; j++ ) {
z[j] = 0.0e0;
}
for( k=0; k<n; k++ ) {
register double zk = z[k];
double wk, wkm, sm;
int kp1 = k+1;
if( zk != 0.0 ) ek = (double)copysign( (double)ek, (double)-zk );
if( fabs( (double)(ek-zk) ) > fabs( (double)pelem(a,k,k) ) ) {
s = (double)fabs( (double)pelem(a,k,k) )/(double)fabs(
(double)(ek-zk) );
sscal( n, (double)s, z, 1 );
zk = z[k];
ek *= s;
}
wk = ek - zk;
wkm = -ek - zk;
s = (double)fabs( (double)wk );
sm = (double)fabs( (double)wkm );
if( pelem(a,k,k) == 0.0 ) {
wk = 1.0;
wkm = 1.0;
}
else {
wk /= pelem(a,k,k);
wkm /= pelem(a,k,k);
}
if( kp1<n ) {
for( j=kp1; j<n; j++ ) {
sm = sm + (double)fabs( (double)(z[j]+wkm*pelem(a,k,j)) );
z[j] += ((double)wk)*pelem(a,k,j);
s += (double)fabs( (double)z[j] );
}
if( s < sm ) {
register double t = wkm-wk;
wk = wkm;
for( j=kp1; j<n; j++ ) {
z[j] += t*pelem(a,k,j);
}
}
}
z[k] = wk;
}
s = 1.0/(double)sasum( n, z, 1 );
sscal( n, (double)s, z, 1 );
/* Solve trans(L)*y = w. */
for( k=n-1; k>=0; k-- ) {
register int l;
register double t;
if( k < n-1 ) z[k] += (double)sdot( n-k-1, a->pd[k]+k+1, 1,
(z+k+1), 1 );
if( fabs( (double)z[k] ) > 1.0 ) {
s = 1.0/(double)fabs( (double)z[k] );
sscal( n, (double)s, z, 1 );
}
l = ipvt[k];
t = z[l];
z[l] = z[k];
z[k] = t;
}
s = 1.0/(double)sasum( n, z, 1 );
sscal( n, (double)s, z, 1);
ynorm = 1.0;
/* solve L*v = y. */
for( k=0; k<n; k++ ) {
register int l;
register double t;
l = ipvt[k];
t = z[l];
z[l] = z[k];
z[k] = t;
if( k < n-1 ) saxpy( n-k-1, (double)t, (a->pd[k]+k+1), 1,
(z+k+1), 1 );
if( fabs( (double)z[k] ) > 1.0) {
s = 1.0/(double)fabs( (double)z[k] );
sscal( n, (double)s, z, 1 );
ynorm *= s;
}
}
s = 1.0/(double)sasum( n, z, 1 );
sscal( n, (double)s, z, 1 );
ynorm *= s;
/* Solve U*z = v. */
for( k=n-1; k>=0; k-- ) {
register double t;
if( fabs( (double)z[k] ) > fabs( (double)pelem(a,k,k) ) ) {
s = (double)fabs( (double)pelem(a,k,k) )/(double)fabs(
(double)z[k] );
sscal( n, (double)s, z, 1 );
ynorm *= s;
}
if( pelem(a,k,k) == 0.0 ) z[k] = 1.0;
else z[k] /= pelem(a,k,k);
t = -z[k];
saxpy( k, (double)t, a->pd[k], 1, z, 1 );
}
/* Make znorm = 1.0. */
s = 1.0/(double)sasum( n, z, 1 );
sscal( n, (double)s, z, 1 );
ynorm *= s;
if( anorm == 0.0e0) *rcond = 0.0;
else *rcond = ynorm/anorm;
return( info );
}
/***************************************************************
******************************************************************
**** Gaussian Elimination with partial pivoting. ****
**** This file contains the factorization routine SGEFA ****
******************************************************************
****************************************************************/
int sgefa(struct FULL *a,int * ipvt )
/*
PURPOSE
SGEFA factors a real matrix by gaussian elimination.
REMARKS
SGEFA is usually called by SGECO, but it can be called
directly with a saving in time if rcond is not needed.
(time for SGECO) = (1 + 9/n)*(time for SGEFA) .
INPUT
a A pointer to the FULL matrix structure.
See the definition in ge.h.
OUTPUT
a A pointer to the FULL matrix structure containing
an upper triangular matrix and the multipliers
which were used to obtain it.
The factorization can be written a = l*u where
l is a product of permutation and unit lower
triangular matrices and u is upper triangular.
ipvt An integer vector (of length a->cd) of pivot indices.
RETURNS
= -1 Matrix is not square.
= 0 Normal return value.
= k if u(k,k) .eq. 0.0 . This is not an error
condition for this subroutine, but it does
indicate that sgesl or sgedi will divide by zero
if called. Use rcond in sgeco for a reliable
indication of singularity.
ROUTINES
blas ISAMAX
*/
{
register int i, j;
int isamax(), k, l, nm1, info, n;
double *akk, *alk;
register double t, *mik;
/* Gaussian elimination with partial pivoting. */
if( a->cd != a->rd ) return( -1 );
n = a->cd;
nm1 = n - 1;
akk = a->pd[0];
info = 0; /* Assume nothing will go wrong! */
if( n < 2 ) goto CLEAN_UP;
/* Loop over Diagional */
for( k=0; k<nm1; k++, ipvt++ ) {
/* Find index of max elem in col below the diagional (l = pivot
index). */
akk = a->pd[k] + k;
l = isamax( n-k, akk, 1 ) + k;
*ipvt = l;
/* Zero pivot implies this column already triangularized. */
alk = a->pd[k] + l;
if( *alk == 0.0e0) {
info = k;
continue;
}
/* Interchange a(k,k) and a(l,k) if necessary. */
if( l != k ) {
t = *alk;
*alk = *akk;
*akk = t;
}
/* Compute multipliers for this column. */
t = -1.0e0 / (*akk);
for( i=k+1, mik=a->pd[k]; i<n; i++ )
mik *= t;
/* Column elimination with row indexing. */
if( l != k ) {
/* Interchange a(k,j) and a(l,j) if necessary. */
for( j=k+1; j<n; j++ ) {
t = pelem(a,k,j);
pelem(a,k,j) = pelem(a,l,j);
pelem(a,l,j) = t;
}
}
for( j=k+1; j<n; j++ ) {
register double *aij = a->pd[j];
t = pelem(a,k,j);
for( i=k+1, mik=a->pd[k]; i<n; i++ )
aij += t*mik;
}
} /* End of for k loop */
CLEAN_UP:
*ipvt = nm1;
if( *akk == 0.0e0 ) info = n;
return( info );
}
/***************************************************************
******************************************************************
**** Gaussian Elimination with partial pivoting. ****
**** This file contains the solution routine SGESL ****
******************************************************************
****************************************************************/
void sgesl(struct FULL * a,int * ipvt,double b[],int job )
/*
PURPOSE
SGESL solves the real system
a * x = b or trans(a) * x = b
using the factors computed by SGECO or SGEFA.
INPUT
a A pointer to the FULL matrix structure containing the
factored
matrix. See the definition of FULL in ge.h.
ipvt The pivot vector (of length a->cd) from SGECO or SGEFA.
b The right hand side vector (of length a->cd).
job = 0 to solve a*x = b ,
= nonzero to solve trans(a)*x = b where
trans(a) is the transpose.
OUTPUT
b The solution vector x.
REMARKS
Error condition:
A division by zero will occur if the input factor contains a
zero on the diagonal. Technically this indicates singularity
but it is often caused by improper arguments or improper
setting of lda . It will not occur if the subroutines are
called correctly and if sgeco has set rcond .gt. 0.0
or sgefa has set info .eq. 0 .
*/
{
register double t, *mik;
double *akk;
register int i, k;
int l, n, nm1;
n = a->cd;
nm1 = n - 1;
/* job = 0 , solve A * x = b. */
if( job == 0 ) {
/* Forward elimination. Solve L*y = b. */
for( k=0; k<nm1; k++ ) {
akk = a->pd[k] + k; /* akk points to a(k,k). */
/* Interchange b[k] and b[l] if necessary. */
l = ipvt[k];
t = b[l];
if( l != k ) {
b[l] = b[k];
b[k] = t;
}
for( i=k+1, mik=a->pd[k]; i<n; i++ )
b += t*mik;
}
/* Back substitution. Solve U*x = y. */
for( k=nm1; k>=0; k-- ) {
register double *uik = a->pd[k];
akk = uik + k;
b[k] /= (*akk);
for( i=0; i<k; i++ )
b -= uik*b[k];
}
return;
}
/* job = nonzero. Solve trans(A) * x = b. */
/* First solve trans(U)*y = b. */
for( k=0; k<n; k++ ) {
register double *uik = a->pd[k];
akk = uik + k;
t = 0.0;
for( i=0; i<k; i++ )
t += uik*b;
b[k] = (b[k] - t) / (*akk);
}
/* b now contains y. */
/* Solve trans(L)*x = y. */
for( k=n-2; k>=0; k-- ) {
mik = a->pd[k];
t = 0.0;
for( i=k+1; i<n; i++ )
t += mik*b;
b[k] += t;
/* Interchange b(k) and b(ipvt(k)) if necessary. */
l = ipvt[k];
if( l == k ) continue;
t = b[l];
b[l] = b[k];
b[k] = t;
}
return;
}
/***************************************************************
*****************************************************************
*******************************************************************
***** *****
***** BLAS *****
***** Basic Linear Algebra Subroutines *****
***** Written in the C Programming Language. *****
***** *****
***** Functions include: *****
***** isamax, sasum, saxpy, scopy, sdot, snrm2, *****
***** *****
***** In addition a few other routines are included: *****
***** vexopy, vfill *****
***** *****
***** If your 3M library does not have the copysign function *****
***** then compile this file with -DCOPYSIGN and one will be *****
***** be supplied. *****
*******************************************************************
*****************************************************************
***************************************************************/
#ifndef HUGEsp
#define HUGEsp 1.0e+38
#endif
#ifndef SMALLsp
#define SMALLsp 1.0e-45
#endif
int isamax(int n, double *sx,int incx )
/*
PURPOSE
Finds the index of element having max. absolute value.
INPUT
n Number of elements to check.
sx Vector to be checked.
incx Every incx-th element is checked.
*/
{
register double smax = 0.0e0;
register int i, istmp = 0;
#ifndef abs
#define abs(x) ((x)<0.0?-(x)
#endif
#ifdef alliant
int isamax_();
istmp = isamax_( &n, sx, &incx )-1; /* Remember 0 is first. */
return( istmp );
#else
if( n <= 1 ) return( istmp );
if( incx != 1 ) {
/* Code for increment not equal to 1. */
if( incx < 0 ) sx = sx + ((-n+1)*incx + 1);
istmp = 0;
smax = abs( *sx );
sx += incx;
for( i=1; i<n; i++, sx+=incx )
if( abs( *sx ) > smax ) {
istmp = i;
smax = abs( *sx );
}
return( istmp );
}
/* Code for increment equal to 1. */
istmp = 0;
smax = abs(*sx);
sx++;
for( i=1; i<n; i++, sx++ )
if( abs( *sx ) > smax ) {
istmp = i;
smax = abs( *sx );
}
return( istmp );
#endif
}
double sasum(int n,double * sx,int incx )
/*
PURPOSE
Returns sum of magnitudes of single precision SX.
sasum = sum from 0 to n-1 of ABS(SX(1+I*INCX))
INPUT
n Number of elements to multiply.
sx Pointer to double vector to take abs sum of.
incx Storage incrament for sx.
RETURNS
sasum Double variable with the result.
WARNINGS
This routine uses the UN*X math library function fabs().
*/
{
#ifndef abs
#define abs(x) ((x)<0.0?-(x)
#endif
register double sum = 0.0;
if( n<= 0 ) return( sum );
if( incx == 1 ) {
register int i, m;
/* Code for increments equal to 1. */
/* Clean-up loop so remaining vector length is a multiple of 6. */
m = n % 6;
if( m != 0 ) {
for( i=0; i<m; i++ )
sum += abs( sx );
if( n < 6 ) return( (double)sum );
}
for( i=m; i<n; i+=6 ) {
sum += abs( sx ) + abs( sx[i+1]) + abs( sx[i+2] ) +
abs( sx[i+3] ) + abs( sx[i+4] ) + abs( sx[i+5] );
}
return( (double)sum );
}
else {
register int i, ns;
/* Code for increments not equal to 1. */
ns = n*incx;
for( i=0; i<ns; i+=incx ) {
sum += abs( sx );
}
return( (double)sum );
}
}
void saxpy(int n,double sa, double *sx,int incx,double * sy,int incy )
/*
PURPOSE
Vector times a scalar plus a vector. sy = sy + sa*sx.
INPUT
n Number of elements to multiply.
sa Scalar to multiply by (note that this is a double).
sx Pointer to double vector to scale.
incx Storage incrament for sx.
sy Pointer to double vector to add.
incy Storage incrament for sy.
OUTPUT
sy sy = sy + sa*sx
*/
{
register int i;
register double ssa = (double)sa;
if( n<=0 || ssa==0.0 ) return;
if( incx == incy ) {
if( incx == 1 ) {
/* Both increments = 1 */
for( i=0; i<n; i++ )
sy += ssa*sx;
return;
}
if( incx>0 ) {
/* Equal, positive, non-unit increments. */
for( i=0; i<n; i++,sx+=incx,sy+=incx )
*sy += ssa*(*sx);
return;
}
}
/* Unequal or negative increments. */
if( incx < 0 ) sx += ((-n+1)*incx + 1);
if( incy < 0 ) sy += ((-n+1)*incy + 1);
for( i=0; i<n; i++,sx+=incx,sy+=incy )
*sy += ssa*(*sx);
}
void saxpyx( int n, double sa, double *sx,int incx,double * sy,int incy )
/*
PURPOSE
Vector times a scalar plus a vector. sx = sy + sa*sx.
INPUT
n Number of elements to multiply.
sa Scalar to multiply by (note that this is a double).
sx Pointer to double vector to scale.
incx Storage incrament for sx.
sy Pointer to double vector to add.
incy Storage incrament for sy.
OUTPUT
sx sx = sy + sa*sx
*/
{
register i;
register double ssa = (double)sa;
if( n<=0 || ssa==0.0 ) return;
if( incx == incy ) {
if( incx == 1 ) {
/* Both increments = 1 */
for( i=0; i<n; i++ )
sx = sy + ssa*sx;
return;
}
if( incx>0 ) {
/* Equal, positive, non-unit increments. */
for( i=0; i<n; i++, sx+=incx, sy+=incx)
*sx = *sy + ssa*(*sx);
return;
}
}
/* Unequal or negative increments. */
if( incx < 0 ) sx += ((-n+1)*incx + 1);
if( incy < 0 ) sy += ((-n+1)*incy + 1);
for( i=0; i<n; i++,sx+=incx,sy+=incy )
*sx = *sy + ssa*(*sx);
}
void scopy(int n,double * sx,int incx, double *sy,int incy )
/*
PURPOSE
Copies vector sx into vector sy.
INPUT
n Number of components to copy.
sx Source vector
incx Index increment for sx.
incy Index increment for sy.
OUTPUT
sy Destination vector.
*/
{
register int i;
if( n<1 ) return;
if( incx == incy ) {
if( incx == 1 ) {
/* Both increments = 1 */
for( i=0; i<n; i++ )
sy = sx;
return;
}
if( incx > 0 ) {
/* Equal, positive, non-unit increments. */
for( i=0; i<n; i++, sx+=incx, sy+=incx)
*sy = *sx;
return;
}
}
/* Non-equal or negative increments. */
if( incx < 0 ) sx += ((-n+1)*incx + 1);
if( incy < 0 ) sy += ((-n+1)*incy + 1);
for( i=0; i<n; i++,sx+=incx,sy+=incy )
(*sx) = (*sy);
return;
}
double sdot(int n,double * sx,int incx,double * sy, int incy )
/*
PURPOSE
Forms the dot product of a vector.
INPUT
n Number of elements to sum.
sx Address of first element of x vector.
incx Incrament for the x vector.
sy Address of first element of y vector.
incy incrament for the y vector.
OUPUT
sdot Dot product x and y. Double returned
due to `C' language features.
*/
{
register int i;
register double stemp = 0.0e0;
if( n<1 ) return( (double)stemp );
if( incx == incy ) {
if( incx == 1 ) {
/* Both increments = 1 */
for( i=0; i<n; i++ )
stemp += sx*sy;
return( (double)stemp );
}
if( incx>0 ) {
/* Equal, positive, non-unit increments. */
for( i=0; i<n; i++, sx+=incx, sy+=incx)
stemp += (*sx)*(*sy);
return( (double)stemp );
}
}
/* Unequal or negative increments. */
if( incx < 0 ) sx += ((-n+1)*incx + 1);
if( incy < 0 ) sy += ((-n+1)*incy + 1);
for( i=0; i<n; i++,sx+=incx,sy+=incy )
stemp += (*sx)*(*sy);
return( (double)stemp );
} /* End of ---SDOT--- */
double snrm2( int n,double * sx,int incx )
/*
PURPOSE
Computes the Euclidean norm of sx while being
very careful of distructive underflow and overflow.
INPUT
n Number of elements to use.
sx Address of first element of x vector.
incx Incrament for the x vector (>0).
OUPUT
snrm2 Euclidean norm of sx. Returns double
due to `C' language features.
REMARKS
This algorithm proceeds in four steps.
1) scan zero components.
2) do phase 2 when component is near underflow.
*/
{
register int i;
static double cutlo, cuthi;
double sum = 0.0e0, hitst, r1mach();
double xmax;
if( n<1 || incx<1 ) return( sum );
/* Calculate near underflow */
if( cutlo == 0.0 ) cutlo = sqrt( SMALLsp/r1mach() );
/* Calculate near overflow */
if( cuthi == 0.0 ) cuthi = sqrt( HUGEsp );
hitst = cuthi/(double) n;
i = 0;
/* Zero Sum. */
while( *sx == 0.0 && i<n ) {
i++;
sx += incx;
}
if( i>=n ) return( sum );
START:
if( abs( *sx ) > cutlo ) {
for( ; i<n; i++, sx+=incx ) { /* Loop over elements. */
if( abs( *sx ) > hitst ) goto GOT_LARGE;
sum += (*sx) * (*sx);
}
sum = sqrt( sum );
return( sum ); /* Sum completed
normaly. */
}
else { /* Small sum prepare
for phase 2. */
xmax = abs( *sx );
sx += incx;
i++;
sum += 1.0;
for( ; i<n; i++, sx+=incx ) {
if( abs( *sx ) > cutlo ) { /* Got normal elem.
Rescale and process. */
sum = (sum*xmax)*xmax;
goto START;
}
if( abs( *sx ) > xmax ) {
sum = 1.0 + sum*(xmax /(*sx))*(xmax /(*sx));
xmax = abs( *sx );
continue;
}
sum += ((*sx)/xmax)*((*sx)/xmax);
}
return( (double)xmax*sqrt( sum ) );
} /* End of small sum. */
GOT_LARGE:
sum = 1.0 + (sum/(*sx))/(*sx); /* Rescale and process. */
xmax = abs( *sx );
sx += incx;
i++;
for( ; i<n; i++, sx+=incx ) {
if( abs( *sx ) > xmax ) {
sum = 1.0 + sum*(xmax /(*sx))*(xmax /(*sx));
xmax = abs( *sx );
continue;
}
sum += ((*sx)/xmax)*((*sx)/xmax);
}
return( (double)xmax*sqrt( sum ) ); /* End of small sum. */
} /* End of ---SDOT--- */
double r1mach()
/***********************************************************************
**** This routine computes the unit roundoff for single precision
**** of the machine. This is defined as the smallest positive
**** machine number u such that 1.0 + u .ne. 1.0
**** Returns a double due to `C' language features.
**********************************************************************/
{
double u = 1.0e0, comp;
do {
u *= 0.5e0;
comp = 1.0e0 + u;
}
while( comp != 1.0e0 );
return( (double)u*2.0e0 );
} /*-------------------- end of function r1mach ------------------------*/
int min0(int n,int a,int b,int c,int d,int e,int f,int g,int h,int i,int
j,int k,int l,int m,int o,int p )
/*
PURPOSE
Determine the minimum of the arguments a-p.
INPUT
n Number of arguments to check 1 <= n <= 15.
a-p Integer arguments of which the minumum is desired.
RETURNS
min0 Minimum of a thru p.
*/
{
int mt;
if( n < 1 || n > 15 ) return( -1 );
mt = a;
if( n == 1 ) return( mt );
if( mt > b ) mt = b;
if( n == 2 ) return( mt );
if( mt > c ) mt = c;
if( n == 3 ) return( mt );
if( mt > d ) mt = d;
if( n == 4 ) return( mt );
if( mt > e ) mt = e;
if( n == 5 ) return( mt );
if( mt > f ) mt = f;
if( n == 6 ) return( mt );
if( mt > g ) mt = g;
if( n == 7 ) return( mt );
if( mt > h ) mt = h;
if( n == 8 ) return( mt );
if( mt > i ) mt = i;
if( n == 9 ) return( mt );
if( mt > j ) mt = j;
if( n == 10 ) return( mt );
if( mt > k ) mt = k;
if( n == 11 ) return( mt );
if( mt > l ) mt = l;
if( n == 12 ) return( mt );
if( mt > m ) mt = m;
if( n == 13 ) return( mt );
if( mt > o ) mt = o;
if( n == 14 ) return( mt );
if( mt > p ) mt = p;
return( mt );
}
void sscal(int n,double sa,double * sx,int incx )
/*
PURPOSE
Scales a vector by a constant.
INPUT
n Number of components to scale.
sa Scale value (note that this is a double).
sx Vector to scale.
incx Every incx-th element of sx will be scaled.
OUTPUT
sx Scaled vector.
*/
{
register double ssa = (double)sa;
int i;
#ifdef alliant
void sscal_();
double ssaa = (double)sa;
sscal_( &n, &ssaa, sx, &incx );
return;
#else
if( n < 1 ) return;
/* Code for increment not equal to 1.*/
if( incx != 1 ) {
if( incx < 0 ) sx += (-n+1)*incx;
for( i=0; i<n; i++, sx+=incx )
*sx *= ssa;
return;
}
/* Code for unit increment. */
for( i=0; i<n; i++ )
sx *= ssa;
return;
#endif
}
void vexopy(int n,double * v,double * x,double * y,int itype )
/*
Purpose:
To operate on the vectors x and y.
Input:
n Number of elements to scale.
x First operand vector.
y Second operand vector.
itype Type of operation to perform:
itype = 1 => '+'
itype = 2 => '-'
Output:
v Result vector of x op y.
*/
{
register int i;
if( n<1 ) return;
if( itype == 1 ) /* ADDITION. */
for( i=0; i<n; i++ )
v = x + y;
else /* SUBTRACTION. */
for( i=0; i<n; i++ )
v = x - y;
}
void vfill(int n,double * v,double val )
/*
Purpose
To fill the FLOAT vector v with the value val.
Make sure that if you pass a value for val that you cast
it to double, viz.,
vfill( 100, vector, (double)0.0 );
*/
{
register int i;
register double vval = (double)val;
if( n<1 ) return;
for( i=0; i<n; i++ )
v = vval;
}
#if 1
double copysign(double x,double y )
/*
PURPOSE
This routine is recomended by the IEEE standard 754. Most C
3M libraries contain this function. If yours does not then
compile this file with the -DCOPYSIGN option and the following
code will be generated. This routine copies the sign from y
onto x.
INPUT
x Thing to have it's sign changed.
y Variable to supply the sign.
RETURNS
copysign Returns x with the sign of y (the fortran intrinsic
sign(x,y)).
*/
{
/* Registers are used for reduced memory traffic. */
register double xx = x;
register double yy = y;
if( xx*yy < 0.0 ) return( -xx );
else return( xx );
}
#endif