gde_linux/SUPPORT/count.c
2023-04-12 03:39:54 +08:00

417 lines
6.5 KiB
C

/*
* Copyright 1991 Steven Smith at the Harvard Genome Lab.
* All rights reserved.
*/
#include "Flatio.c"
#include <math.h>
#define FALSE 0
#define TRUE 1
#define JUKES 0
#define OLSEN 1
#define NONE 2
#define Min(a,b) (a)<(b)?(a):(b)
int width,start,jump,usecase,sim,correction;
int tbl,numseq,num,denom,special;
char argtyp[255],argval[255];
float acwt=1.0, agwt=1.0, auwt=1.0, ucwt=1.0, ugwt=1.0, gcwt=1.0;
float dist[200][200];
struct data_format data[10000];
float parta[200], partc[200], partg[200], partu[200], setdist();
main(ac,av)
int ac;
char **av;
{
int i,j,k;
extern int ReadFlat();
FILE *file;
width = 1;
jump = 1;
if(ac==1)
{
fprintf(stderr,
"usage: %s [-sim] [-case] [-c=<none,olsen,jukes>] ",av[0]);
fprintf(stderr,"[-t] alignment_flat_file\n");
exit(1);
}
for(j=1;j<ac-1;j++)
{
getarg(av,j,argtyp,argval);
if(strcmp(argtyp,"-s=") == 0)
{
j++;
sscanf(argval,"%d",&start);
start --;
}
else if(strcmp(argtyp,"-m=") == 0)
{
j++;
sscanf(argval,"%d",&width);
}
else if(strcmp(argtyp,"-j=") == 0)
{
j++;
sscanf(argval,"%d",&jump);
}
else if(strcmp(argtyp,"-case") == 0)
usecase = TRUE;
else if(strcmp(argtyp,"-sim") == 0)
sim = TRUE;
else if(strcmp(argtyp,"-c=") == 0)
{
if(strcmp(argval,"olsen") == 0)
correction = OLSEN;
else if(strcmp(argval,"none") == 0)
correction = NONE;
else if(strcmp(argval,"jukes") == 0)
correction = JUKES;
else
fprintf(stderr,"Correction type %s %s\n",
argval,"unknown, using JUKES");
}
else if(strcmp("-t",argtyp) == 0)
tbl = TRUE;
else if(strcmp("-ac=",argtyp) == 0 || strcmp("-ca=",argtyp)== 0)
{
j++;
sscanf(argval,"%f",&acwt);
special = TRUE;
}
else if(strcmp("-au=",argtyp) == 0 || strcmp("-ua=",argtyp)== 0)
{
j++;
sscanf(argval,"%f",&auwt);
special = TRUE;
}
else if(strcmp("-ag=",argtyp) == 0 || strcmp("-ga=",argtyp)== 0)
{
j++;
sscanf(argval,"%f",&agwt);
special = TRUE;
}
else if(strcmp("-uc=",argtyp) == 0 || strcmp("-cu=",argtyp)== 0)
{
j++;
sscanf(argval,"%f",&ucwt);
special = TRUE;
}
else if(strcmp("-ug=",argtyp) == 0 || strcmp("-gu=",argtyp)== 0)
{
j++;
sscanf(argval,"%f",&ugwt);
special = TRUE;
}
else if(strcmp("-gc=",argtyp) == 0 || strcmp("-cg=",argtyp)== 0)
{
j++;
sscanf(argval,"%f",&gcwt);
special = TRUE;
}
else if(strcmp("-transition=",argtyp) == 0)
{
j++;
sscanf(argval,"%f",&ucwt);
agwt = ucwt;
special = TRUE;
}
else if(strcmp("-transversion=",argtyp) == 0)
{
j++;
sscanf(argval,"%f",&gcwt);
ugwt = gcwt;
acwt = gcwt;
auwt = gcwt;
special = TRUE;
}
}
file = fopen(av[ac-1],"r");
if ((file == NULL) || (ac == 1))
{
fprintf(stderr,"Error opening input file %s\n",av[ac-1]);
exit(1);
}
numseq = ReadFlat(file,data,10000);
fclose(file);
SetPart();
for(j=0;j<numseq-1;j++)
for(k=j+1;k<numseq;k++)
{
Compare(j,k,&num,&denom);
dist[j][k] = setdist(num,denom,j,k);
}
Report();
exit(0);
}
Compare(a,b,num,denom)
int a,b,*num,*denom;
{
int mn,i,j,casefix,match,blank;
float fnum = 0.0;
struct data_format *da,*db;
char ac,bc;
casefix = (usecase)? 0:32;
*num = 0;
*denom = 0;
da = &data[a];
db = &data[b];
mn = Min(da->length,db->length);
for(j=0;j<mn;j+=jump)
{
match = TRUE;
blank = TRUE;
for(i=0;i<width;i++)
{
ac = da->nuc[j+i] | casefix;
bc = db->nuc[j+i] | casefix;
if(ac == 't')
ac = 'u';
if(ac == 'T')
ac = 'U';
if(bc == 't')
bc = 'u';
if(bc == 'T')
bc = 'U';
if((ac=='-') || (ac|32)=='n' || (ac==' ') ||
(bc== '-') || (bc|32)=='n' || (bc==' '));
else
{
blank = FALSE;
if(ac != bc)
{
match = FALSE;
switch(ac)
{
case 'a':
if (bc == 'c') fnum+=acwt;
else if(bc == 'g') fnum+=agwt;
else if(bc == 'u') fnum+=auwt;
break;
case 'c':
if (bc == 'a') fnum+=acwt;
else if(bc == 'g') fnum+=gcwt;
else if(bc == 'u') fnum+=ucwt;
break;
case 'g':
if (bc == 'a') fnum+=agwt;
else if(bc == 'c') fnum+=gcwt;
else if(bc == 'u') fnum+=ugwt;
break;
case 'u':
if (bc == 'a') fnum+=auwt;
else if(bc == 'c') fnum+=ucwt;
else if(bc == 'g') fnum+=ugwt;
break;
case 't':
if (bc == 'a') fnum+=auwt;
else if(bc == 'c') fnum+=ucwt;
else if(bc == 'g') fnum+=ugwt;
break;
default:
break;
};
}
}
if((blank == FALSE) && match)
{
(*num) ++;
(*denom) ++;
}
else if(!blank)
(*denom) ++;
}
}
if(special)
(*num) = *denom - (int)fnum;
return;
}
float setdist(num,denom,a,b)
int num,denom,a,b;
{
float cor;
switch (correction)
{
case OLSEN:
cor = parta[a]*parta[b]+
partc[a]*partc[b]+
partg[a]*partg[b]+
partu[a]*partu[b];
break;
case JUKES:
cor = 0.25;
break;
case NONE:
cor = 0.0;
break;
default:
cor = 0.0;
break;
};
if(correction == NONE)
return(1.0 - (float)num/(float)denom);
else
return( -(1.0-cor)*log(1.0 / (1.0-cor)*((float)num/(float)denom-cor)));
}
getarg(av,ndx,atype,aval)
char **av,atype[],aval[];
int ndx;
{
int i,j;
char c;
for(j=0;(c=av[ndx][j])!=' ' && c!= '=' && c!= '\0';j++)
atype[j]=c;
if (c=='=')
{
atype[j++] = c;
atype[j] = '\0';
}
else
{
atype[j] = '\0';
j++;
}
if(c=='=')
{
for(i=0;(c=av[ndx][j]) != '\0' && c!= ' ';i++,j++)
aval[i] = c;
aval[i] = '\0';
}
return;
}
SetPart()
{
int a,c,g,u,tot,i,j;
char nuc;
for(j=0;j<numseq;j++)
{
a=0;
c=0;
g=0;
u=0;
tot=0;
for(i=0;i<data[j].length;i++)
{
nuc=data[j].nuc[i] | 32;
switch (nuc)
{
case 'a':
a++;
tot++;
break;
case 'c':
c++;
tot++;
break;
case 'g':
g++;
tot++;
break;
case 'u':
u++;
tot++;
break;
case 't':
u++;
tot++;
break;
};
}
parta[j] = (float)a / (float)tot;
partc[j] = (float)c / (float)tot;
partg[j] = (float)g / (float)tot;
partu[j] = (float)u / (float)tot;
}
return;
}
Report()
{
int i,ii,jj,j,k;
if(tbl)
printf("#\n#-\n#-\n#-\n#-\n");
for(jj=0,j=0;j<numseq;j++)
{
if(tbl)
printf("%2d: %-.15s|",jj+1,data[j].name);
for (i=0;i<j;i++)
{
if(sim)
printf("%6.1f",100 - dist[i][j]*100.0);
else
printf("%6.1f",dist[i][j]*100.0);
}
printf("\n");
jj++;
}
return;
}
int find(b,a)
char *a,*b;
{
int flag,lenb,lena;
register i,j;
flag=0;
lenb=strlen(b);
lena=strlen(a);
for (i=0;((i<lena) && flag==0);i++)
{
for(j=0;(j<lenb) && (a[i+j]==b[j]);j++);
flag=((j==lenb)? 1:0);
}
return flag;
}