/* PROGRAM LSADT Fortran->C conversion by Mike Maciukenas, CPGA, Microbiology at University of Illinois. C----------------------------------------------------------------------- C C LEAST SQUARES ALGORITHM FOR FITTING ADDITIVE TREES TO C PROXIMITY DATA C C GEERT DE SOETE -- VERSION 1.01 - FEB. 1983 C VERSION 1.02 - JUNE 1983 C VERSION 1.03 - JULY 1983 C C REFERENCE: DE SOETE, G. A LEAST SQUARES ALGORITHM FOR FITTING C ADDITIVE TREES TO PROXIMITY DATA. PSYCHOMETRIKA, 1983, 48, C 621-626. C DE SOETE, G. ADDITIVE TREE REPRESENTATIONS OF INCOMPLETE C DISSIMILARITY DATA. QUALITY AND QUANTITY, 1984, 18, C 387-393. C REMARKS C ------- C 2) UNIFORMLY DISTRIBUTED RANDOM NUMBERS ARE GENERATED BY A C PROCEDURE DUE TO SCHRAGE. CF. C SCHRAGE, L. A MORE PORTABLE FORTRAN RANDOM NUMBER GENERATOR. C ACM TRANS. ON MATH. SOFTW., 1979, 5, 132-138. C 3) SUBROUTINES VA14AD AND VA14AC (translated into minfungra) ARE C ADAPTED FROM THE HARWELL SUBROUTINE LIBRARY (1979 EDITION). C 4) ALTHOUGH THIS PROGRAM HAS BEEN CAREFULLY TESTED, THE C AUTHOR DISCLAIMS ANY RESPONSABILITY FOR POSSIBLE C ERRORS. C C----------------------------------------------------------------------- */ #include #include #include #include #include #include #define BUFLEN 1024 #define MAXLEAVES 256 static int m, n, dissim, pr, start, save, seed, nempty; static double ps1, ps2, f, empty, tol, c; static char fname[1000]; static char *names[MAXLEAVES]; static double *delta[MAXLEAVES]; static double **d; static double **g; static double **dold; static FILE *reportf; static int report; extern int errno; double nfac; extern double strtod(); double dabs(a) double a; { return((a<0.0) ? -a : a); } double sqr(a) double a; { return(a*a); } double max(a, b) double a; double b; { return((a>b)?a:b); } int imin(a, b) int a; int b; { return((ai) return(d[j][i]); else show_error("gd: i=j -- programmer screwed up!"); } char *repeatch(string, ch, num) char *string; int ch; int num; { for(string[num--] = '\0'; num >= 0; string[num--] = ch); return(string); } int getachar() /* skips comments! */ { static int oldchar = '\0'; int ch; int more=1; while(more) { ch = getchar(); if(oldchar == '\n' && ch == '#') { while(ch!='\n'&&ch!=EOF) ch=getchar(); oldchar = ch; } else if(oldchar == '\n' && isspace(ch)) ; else more=0; } oldchar = ch; return(ch); } int skip_space() { int ch; while(isspace(ch=getachar())); return(ch); } int getaword(string, len) /* 0 if failed, 1 if data was read, -1 if data read to end of file */ char *string; int len; { int i; int ch; ch = skip_space(); if(ch == EOF) return(0); for(i=0; !isspace(ch) && ch != EOF; i++) { if(i=0 && isspace(string[i]); i--); string[i+1] = '\0'; if(ch==EOF) return(-1); else return(1); } int readtobarorcolon(string, len) /* 0 if failed, 1 if data was read */ char *string; int len; { int i; int ch; ch = skip_space(); if(ch==EOF) return(0); for(i=0; ch!=EOF && ch!='|' && ch!=':'; i++) { if(ch=='\n'||ch=='\r'||ch=='\t') i--; else { if(i=0 && isspace(string[i]); i--); string[i+1] = '\0'; if(ch==EOF) return(-1); else return(1); } char *getmem(nelem, elsize) unsigned nelem, elsize; { char *temp; temp = (char *)calloc(nelem+1, elsize); if(temp == NULL) show_error("Couldn't allocate memory."); else return(temp); } int get_parms(argc, argv) int argc; char **argv; { int i; int cur_arg; /* codes for current argument: ** 0 = no current argument ** 1 = pr ** 2 = start ** 3 = seed ** 4 = ps1 ** 5 = ps2 ** 6 = empty ** 7 = filename */ dissim = 0; pr = 0; start = 2; save = 0; seed = 12345; ps1 = 0.0001; ps2 = 0.0001; empty = 0.0; n = 0; cur_arg = 0; for(i=1; i 3) start = 3; if(save != 1) save = 0; if(seed < 0) seed = 12345; /*printf("dissim=%d\n", dissim);*/ /*printf("pr=%d\n", pr);*/ /*printf("start=%d\n", start);*/ /*printf("save=%d\n", save);*/ /*printf("seed=%d\n", seed);*/ /*printf("ps1=%f\n", ps1);*/ /*printf("ps2=%f\n", ps2);*/ /*printf("empty=%f\n", empty);*/ } int get_data() { int i, j, more; char buf[BUFLEN]; char *ptr; char ch; int result; double temp, nfactor, datmin, datmax; nempty = n = 0; more = 1; ptr = &ch; while(more) { result=readtobarorcolon(buf, BUFLEN); if(result == 0 || result == -1) more = 0; else { n++; names[n] = getmem(BUFLEN, 1); result=readtobar(buf, BUFLEN); if(result != 1) show_error("get_data: bad name syntax, or missing '|'"); strcpy(names[n], buf); if(n>1) delta[n]=(double *)getmem(n, sizeof(double)); else delta[n]=NULL; for(j=1; j datmax && delta[i][j] != empty) datmax = delta[i][j]; datmax += 1.0; for(i=2; i<=n; i++) for(j=1; j=d[l][i]+d[k][j])&& (d[k][i]+d[l][j]>=d[l][i]+d[k][j])) { wijkl=d[j][i]+d[l][k] -d[k][i]-d[l][j]; fitp+=sqr(wijkl); g[j][i]+=fac*wijkl; g[l][k]+=fac*wijkl; g[k][i]-=fac*wijkl; g[l][j]-=fac*wijkl; } else if((d[j][i]+d[l][k]>=d[k][i]+d[l][j])&& (d[l][i]+d[k][j]>=d[k][i]+d[l][j])) { wijkl=d[j][i]+d[l][k] -d[l][i]-d[k][j]; fitp+=sqr(wijkl); g[j][i]+=fac*wijkl; g[l][k]+=fac*wijkl; g[k][j]-=fac*wijkl; g[l][i]-=fac*wijkl; } else if((d[k][i]+d[l][j]>=d[j][i]+d[l][k])&& (d[l][i]+d[k][j]>=d[j][i]+d[l][k])) { wijkl=d[k][i]+d[l][j] -d[l][i]-d[k][j]; fitp+=sqr(wijkl); g[k][i]+=fac*wijkl; g[l][j]+=fac*wijkl; g[l][i]-=fac*wijkl; g[k][j]-=fac*wijkl; } f = fitl+r*fitp; } static double **dr, **dgr, **d1, **gs, **xx, **gg; static int iterc, prc; print_iter(maxfnc, f) int maxfnc; double f; { int i, j; if(pr == 0) { iterc++; } else if(prc < abs(pr)) { prc++; iterc++; } else { printf("Iteration %6d", iterc); printf(": function values %6d", maxfnc); printf(" f = %24.16e\n", f); if(pr < 0) { printf(" d[] looks like this:\n"); for(i=2;i<=n;i++) { printf(" "); for(j=1;j 0) { prc = 0; print_iter(maxfnc, f); return; } if(itcrs>m) { for(i=2;i<=n;i++) for(j=1;j= 0.0) { retry = -retry; if(imin(retry, maxfnk)<2) { printf("minfungra: \ gradient wrong or acc too small\n"); flag = 2; } else itcrs = m+1; goto L30; } xmin = 0.0; fmin = f; finit = f; gsinit = gsumsq; gmin = dginit; gm = dginit; xbound = -1.0; xnew = xnew * min(1.0, dgstep/dginit); dgstep = dginit; L170: c = xnew-xmin; dtest = 0.0; for(i=1;i<=n;i++) for(j=1;j1) { for(gnew = 0.0,i=1;i<=n;i++) for(j=1;j1 && f1 && f==fmin && dabs(gnew) <= dabs(gmin))) { maxfnk = maxfnc; gsumsq = 0.0; for(i=1;i<=n;i++) for(j=1;jclt*dabs(dginit) || (dabs(gm)<=clt*dabs(dginit) && dabs(gm*beta) >= clt*gsumsq)) { L310: clt += 0.3; if(clt>0.8) { retry = -retry; if(imin(retry, maxfnk)<2) { printf("minfungra: \ gradient wrong or acc too small\n"); flag = 2; } else itcrs = m+1; goto L30; } xold = xnew; xnew = .5*(xmin+xold); if(maxfnk >= maxfnc && gmin*gnew > 0.0) { xnew = 10.0*xold; if(xbound>=0.0) { xnew = 0.5*(xold+xbound); } } c = gnew-(3.0*gnew + gmin-4.0*(f-fmin)/(xold-xmin))* (xold-xnew)/(xold-xmin); if(maxfnk>=maxfnc) { if(gmin*gnew<=0.0) { xbound = xmin; } xmin = xold; fmin = f; gmin = gnew; } else xbound = xold; if(c*gmin < 0.0) xnew = (xmin*c-xnew*gmin)/(c-gmin); goto L170; } if(min(f, fmin)=finit && gsumsq < gsinit)) { if(itcrsps2) { iter++; r*=10.0; } } while (dif>ps2); fungra(); for(i=2; i<=n; i++) { free(dr[i]); free(dgr[i]); free(d1[i]); free(gs[i]); free(xx[i]); free(gg[i]); } free(dr); free(dgr); free(d1); free(gs); free(xx); free(gg); } double gttol() { double result; int i, j, k, l; result = 0.0; nm0 = n; nm1 = n-1; nm2 = n-2; nm3 = n-3; for(i=1;i<=nm3;i++) for(j=i+1;j<=nm2;j++) for(k=j+1;k<=nm1;k++) for(l=k+1;l<=nm0;l++) if((d[j][i]+d[l][k]>=d[l][i]+d[k][j])&& (d[k][i]+d[l][j]>=d[l][i]+d[k][j])) result=max(result, dabs(d[j][i]+d[l][k]-d[k][i]-d[l][j])); else if((d[j][i]+d[l][k]>=d[k][i]+d[l][j])&& (d[l][i]+d[k][j]>=d[k][i]+d[l][j])) result=max(result, dabs(d[j][i]+d[l][k]-d[l][i]-d[k][j])); else if((d[k][i]+d[l][j]>=d[j][i]+d[l][k])&& (d[l][i]+d[k][j]>=d[j][i]+d[l][k])) result=max(result, dabs(d[k][i]+d[l][j]-d[l][i]-d[k][j])); return(result); } gtcord() { double sumx, sumy, ssqx, ssqy, scp, fn; int i, j; sumx = sumy = ssqx = ssqy = scp = 0.0; for(i=1;i<=n;i++) for(j=1;j4) { maxcnt=0; for(i=1;i<=nnode;i++) if(!act[i]) for(j=1;jmaxcnt) { maxcnt = count; arcim=max(0.0, arci/count); arcjm=max(0.0, arcj/count); im=i; jm=j; } } nnode++; if(nnode+2>maxnode) show_error("get_tree: number of nodes exceeds 2N-2"); ninv++; mergei[ninv]=im; mergej[ninv]=jm; act[im]=true; act[jm]=true; d[nnode]=(double *)getmem(nnode-1, sizeof(double)); d[nnode][im]=arcim; d[nnode][jm]=arcjm; for(i=1;i<=nnode-1;i++) if(!act[i]) d[nnode][i] = max(0.0, gd(im,i)-arcim); nact--; } for(i=1;act[i];i++) if(i>nnode) show_error("get_tree: can't find last two invisible nodes"); im=i; for(i=im+1;act[i];i++) if(i>nnode) show_error("get_tree: can't find last two invisible nodes"); jm=i; for(i=jm+1;act[i];i++) if(i>nnode) show_error("get_tree: can't find last two invisible nodes"); km=i; for(i=km+1;act[i];i++) if(i>nnode) show_error("get_tree: can't find last two invisible nodes"); lm=i; if(gd(im,jm)+gd(km,lm)<=gd(im,km)+gd(jm,lm)+tol && gd(im,jm)+gd(km,lm)<=gd(im,lm)+gd(jm,km)+tol) { i=im; j=jm; k=km; l=lm; } else if(gd(im,lm)+gd(jm,km)<=gd(im,km)+gd(jm,lm)+tol && gd(im,lm)+gd(jm,km)<=gd(im,jm)+gd(km,lm)+tol) { i=im; j=lm; k=km; l=jm; } else if(gd(im,km)+gd(jm,lm)<=gd(im,jm)+gd(km,lm)+tol && gd(im,km)+gd(jm,lm)<=gd(im,lm)+gd(jm,km)+tol) { i=im; j=km; k=lm; l=jm; } nnode++; ninv++; mergei[ninv]=i; mergej[ninv]=j; d[nnode]=(double *)getmem(nnode-1, sizeof(double)); d[nnode][i] = max(0.0, (gd(i,j)+gd(i,k)-gd(j,k))/2.0); d[nnode][j] = max(0.0, (gd(i,j)+gd(j,l)-gd(i,l))/2.0); nnode++; ninv++; mergei[ninv]=k; mergej[ninv]=l; d[nnode]=(double *)getmem(nnode-1, sizeof(double)); d[nnode][k] = max(0.0, (gd(k,l)+gd(i,k)-gd(i,l))/2.0); d[nnode][l] = max(0.0, (gd(k,l)+gd(j,l)-gd(j,k))/2.0); d[nnode][nnode-1] = max(0.0, (gd(i,k)+gd(j,l)-gd(i,j)-gd(k,l))/2.0); } print_node(node, dist, indent) int node; double dist; int indent; { static char buf[BUFLEN]; if(node<=n) printf("%s%s:%6.4f", repeatch(buf, '\t', indent), names[node], dist/nfac); else { printf("%s(\n", repeatch(buf, '\t', indent)); print_node(mergei[node-n], gd(node, mergei[node-n]), indent+1); printf(",\n"); print_node(mergej[node-n], gd(node, mergej[node-n]), indent+1); printf("\n%s):%6.4f", repeatch(buf, '\t', indent), dist/nfac); } } show_tree() { int i, j, current; int ij[2]; current=0; for(i=1;current<2;i++) { for(j=1;(mergei[j]!=i && mergej[j] != i) && j<=ninv;j++); if(j>ninv) ij[current++]=i; } printf("(\n"); print_node(ij[0], gd(ij[0],ij[1])/2.0, 1); printf(",\n"); print_node(ij[1], gd(ij[0],ij[1])/2.0, 1); printf("\n);\n"); } show_help() { printf("\nlsadt--options:\n"); printf(" -f file - write report to 'file'\n"); printf(" -d - treat data as dissimilarities (default)\n"); printf(" -s - treat data as similarities\n"); printf(" -print 0 - don't print out iteration history (default)\n"); printf(" -print n>0 - print out iteration history every n iterations\n"); printf(" -print n<0 - print out iteration history every n iterations\n"); printf(" (including current distance estimates & gradients)\n"); printf(" -init n - initial parameter estimates (default = 3)\n"); printf(" n=1 - uniformly distributed random numbers\n"); printf(" n=2 - error-perturbed data\n"); printf(" n=3 - original distance data from input matrix\n"); printf(" -save - save final parameter estimates (default is don't save\n"); printf(" -seed n - seed for random number generator (default = 12345)\n"); printf(" -ps1 n - convergence criterion for inner iterations (default = 0.0001)\n"); printf(" -ps2 n - convergence criterion for major iterations (default = 0.0001)\n"); printf(" -empty n - missing data indicator (default = 0.0)\n"); printf(" -help - show this help text\n"); exit(0); } show_error(message) char *message; { printf("\n>>>>ERROR:\n>>>>%s\n", message); exit(0); } main(argc, argv) int argc; char **argv; { int i; strcpy(fname, ""); get_parms(argc, argv); if(strcmp(fname, "")) { report=1; reportf = fopen(fname, "w"); if(reportf==NULL) { perror("lsadt"); exit(0); } } else report=0; get_data(); d = (double **)getmem(n, sizeof(delta[1])); g = (double **)getmem(n, sizeof(delta[1])); dold = (double **)getmem(n, sizeof(delta[1])); d[1]=NULL; g[1]=NULL; dold[1]=NULL; for(i=2; i<=n; i++) { d[i]=(double *)getmem(i-1, sizeof(double)); g[i]=(double *)getmem(i-1, sizeof(double)); dold[i]=(double *)getmem(i-1, sizeof(double)); } initd(); get_dist(); goodfit(); additive_const(); get_tree(); show_tree(); if(report) close(reportf); }