gde_linux/SUPPORT/count.c

394 lines
7.1 KiB
C
Raw Normal View History

2023-04-12 03:41:11 +08:00
/*
* Copyright 1991 Steven Smith at the Harvard Genome Lab.
* All rights reserved.
*/
2023-04-12 03:39:54 +08:00
#include <math.h>
2023-04-12 03:41:11 +08:00
#include "Flatio.c"
2023-04-12 03:39:54 +08:00
/* Boolean values used for the option flags. */
#define FALSE 0
#define TRUE 1
/* Values of the global `correction`, selected with -c=<jukes,olsen,none>.
 * JUKES is 0, which is also the value a zero-initialised global starts with. */
#define JUKES 0
#define OLSEN 1
#define NONE 2
2023-04-12 03:41:11 +08:00
/* Smaller of a and b.  The whole expansion is parenthesized so the macro can
 * be used inside a larger expression (e.g. 2 * Min(x, y)).  Each argument may
 * be evaluated twice, so do not pass expressions with side effects. */
#define Min(a, b) ((a) < (b) ? (a) : (b))
2023-04-12 03:39:54 +08:00
2023-04-12 03:41:11 +08:00
/* Command-line settings, filled in by main:
 *   width      - positions examined per window in Compare (-m=, default 1)
 *   start      - value of -s= minus one; nothing in the code visible here
 *                reads it afterwards
 *   jump       - step between window starts in Compare (-j=, default 1)
 *   usecase    - set by -case; Compare then skips its case folding
 *   sim        - set by -sim; Report prints 100 - distance
 *   correction - JUKES, OLSEN or NONE (-c=)
 */
int width, start, jump, usecase, sim, correction;
/* tbl: set by -t, makes Report label each row.  numseq: value returned by
 * ReadFlat.  num/denom: outputs of the most recent Compare call.
 * special: set when any substitution-weight option was given. */
int tbl, numseq, num, denom, special;
/* Scratch buffers filled by getarg: option name and option value. */
char argtyp[255], argval[255];
2023-04-12 03:39:54 +08:00
2023-04-12 03:41:11 +08:00
/* Weights for the six substitution types (a-c, a-g, a-u, u-c, u-g, g-c).
 * Compare adds the matching weight to its mismatch total; each defaults to
 * 1.0 and can be changed from the command line. */
float acwt = 1.0, agwt = 1.0, auwt = 1.0, ucwt = 1.0, ugwt = 1.0, gcwt = 1.0;
2023-04-12 03:39:54 +08:00
/* dist[j][k] with j < k holds the value main computed for sequences j and k;
 * Report reads the same cells. */
float dist[200][200];
/* Sequence records filled in by ReadFlat. */
struct data_format data[10000];
/* Per-sequence fractions of a, c, g and u/t computed by SetPart and used by
 * the OLSEN correction in setdist.  setdist() is forward-declared here
 * because it returns float and is called before its definition. */
float parta[200], partc[200], partg[200], partu[200], setdist();
2023-04-12 03:41:11 +08:00
main(ac, av) int ac;
2023-04-12 03:39:54 +08:00
char **av;
{
2023-04-12 03:41:11 +08:00
int i, j, k;
2023-04-12 03:39:54 +08:00
extern int ReadFlat();
FILE *file;
width = 1;
jump = 1;
2023-04-12 03:41:11 +08:00
if (ac == 1) {
2023-04-12 03:39:54 +08:00
fprintf(stderr,
2023-04-12 03:41:11 +08:00
"usage: %s [-sim] [-case] [-c=<none,olsen,jukes>] ",
av[0]);
fprintf(stderr, "[-t] alignment_flat_file\n");
2023-04-12 03:39:54 +08:00
exit(1);
}
2023-04-12 03:41:11 +08:00
for (j = 1; j < ac - 1; j++) {
getarg(av, j, argtyp, argval);
if (strcmp(argtyp, "-s=") == 0) {
2023-04-12 03:39:54 +08:00
j++;
2023-04-12 03:41:11 +08:00
sscanf(argval, "%d", &start);
start--;
2023-04-12 03:39:54 +08:00
}
2023-04-12 03:41:11 +08:00
else if (strcmp(argtyp, "-m=") == 0) {
2023-04-12 03:39:54 +08:00
j++;
2023-04-12 03:41:11 +08:00
sscanf(argval, "%d", &width);
2023-04-12 03:39:54 +08:00
}
2023-04-12 03:41:11 +08:00
else if (strcmp(argtyp, "-j=") == 0) {
2023-04-12 03:39:54 +08:00
j++;
2023-04-12 03:41:11 +08:00
sscanf(argval, "%d", &jump);
2023-04-12 03:39:54 +08:00
}
2023-04-12 03:41:11 +08:00
else if (strcmp(argtyp, "-case") == 0)
2023-04-12 03:39:54 +08:00
usecase = TRUE;
2023-04-12 03:41:11 +08:00
else if (strcmp(argtyp, "-sim") == 0)
2023-04-12 03:39:54 +08:00
sim = TRUE;
2023-04-12 03:41:11 +08:00
else if (strcmp(argtyp, "-c=") == 0) {
if (strcmp(argval, "olsen") == 0)
2023-04-12 03:39:54 +08:00
correction = OLSEN;
2023-04-12 03:41:11 +08:00
else if (strcmp(argval, "none") == 0)
2023-04-12 03:39:54 +08:00
correction = NONE;
2023-04-12 03:41:11 +08:00
else if (strcmp(argval, "jukes") == 0)
2023-04-12 03:39:54 +08:00
correction = JUKES;
else
2023-04-12 03:41:11 +08:00
fprintf(stderr, "Correction type %s %s\n",
argval, "unknown, using JUKES");
2023-04-12 03:39:54 +08:00
}
2023-04-12 03:41:11 +08:00
else if (strcmp("-t", argtyp) == 0)
2023-04-12 03:39:54 +08:00
tbl = TRUE;
2023-04-12 03:41:11 +08:00
else if (strcmp("-ac=", argtyp) == 0 ||
strcmp("-ca=", argtyp) == 0) {
2023-04-12 03:39:54 +08:00
j++;
2023-04-12 03:41:11 +08:00
sscanf(argval, "%f", &acwt);
2023-04-12 03:39:54 +08:00
special = TRUE;
}
2023-04-12 03:41:11 +08:00
else if (strcmp("-au=", argtyp) == 0 ||
strcmp("-ua=", argtyp) == 0) {
2023-04-12 03:39:54 +08:00
j++;
2023-04-12 03:41:11 +08:00
sscanf(argval, "%f", &auwt);
2023-04-12 03:39:54 +08:00
special = TRUE;
}
2023-04-12 03:41:11 +08:00
else if (strcmp("-ag=", argtyp) == 0 ||
strcmp("-ga=", argtyp) == 0) {
2023-04-12 03:39:54 +08:00
j++;
2023-04-12 03:41:11 +08:00
sscanf(argval, "%f", &agwt);
2023-04-12 03:39:54 +08:00
special = TRUE;
}
2023-04-12 03:41:11 +08:00
else if (strcmp("-uc=", argtyp) == 0 ||
strcmp("-cu=", argtyp) == 0) {
2023-04-12 03:39:54 +08:00
j++;
2023-04-12 03:41:11 +08:00
sscanf(argval, "%f", &ucwt);
2023-04-12 03:39:54 +08:00
special = TRUE;
}
2023-04-12 03:41:11 +08:00
else if (strcmp("-ug=", argtyp) == 0 ||
strcmp("-gu=", argtyp) == 0) {
2023-04-12 03:39:54 +08:00
j++;
2023-04-12 03:41:11 +08:00
sscanf(argval, "%f", &ugwt);
2023-04-12 03:39:54 +08:00
special = TRUE;
}
2023-04-12 03:41:11 +08:00
else if (strcmp("-gc=", argtyp) == 0 ||
strcmp("-cg=", argtyp) == 0) {
2023-04-12 03:39:54 +08:00
j++;
2023-04-12 03:41:11 +08:00
sscanf(argval, "%f", &gcwt);
2023-04-12 03:39:54 +08:00
special = TRUE;
}
2023-04-12 03:41:11 +08:00
else if (strcmp("-transition=", argtyp) == 0) {
2023-04-12 03:39:54 +08:00
j++;
2023-04-12 03:41:11 +08:00
sscanf(argval, "%f", &ucwt);
2023-04-12 03:39:54 +08:00
agwt = ucwt;
special = TRUE;
}
2023-04-12 03:41:11 +08:00
else if (strcmp("-transversion=", argtyp) == 0) {
2023-04-12 03:39:54 +08:00
j++;
2023-04-12 03:41:11 +08:00
sscanf(argval, "%f", &gcwt);
2023-04-12 03:39:54 +08:00
ugwt = gcwt;
acwt = gcwt;
auwt = gcwt;
special = TRUE;
}
}
2023-04-12 03:41:11 +08:00
file = fopen(av[ac - 1], "r");
if ((file == NULL) || (ac == 1)) {
fprintf(stderr, "Error opening input file %s\n", av[ac - 1]);
2023-04-12 03:39:54 +08:00
exit(1);
}
2023-04-12 03:41:11 +08:00
numseq = ReadFlat(file, data, 10000);
2023-04-12 03:39:54 +08:00
fclose(file);
SetPart();
2023-04-12 03:41:11 +08:00
for (j = 0; j < numseq - 1; j++)
for (k = j + 1; k < numseq; k++) {
Compare(j, k, &num, &denom);
dist[j][k] = setdist(num, denom, j, k);
2023-04-12 03:39:54 +08:00
}
Report();
exit(0);
}
2023-04-12 03:41:11 +08:00
Compare(a, b, num, denom) int a, b, *num, *denom;
2023-04-12 03:39:54 +08:00
{
2023-04-12 03:41:11 +08:00
int mn, i, j, casefix, match, blank;
2023-04-12 03:39:54 +08:00
float fnum = 0.0;
2023-04-12 03:41:11 +08:00
struct data_format *da, *db;
char ac, bc;
2023-04-12 03:39:54 +08:00
2023-04-12 03:41:11 +08:00
casefix = (usecase) ? 0 : 32;
2023-04-12 03:39:54 +08:00
*num = 0;
*denom = 0;
da = &data[a];
db = &data[b];
2023-04-12 03:41:11 +08:00
mn = Min(da->length, db->length);
2023-04-12 03:39:54 +08:00
2023-04-12 03:41:11 +08:00
for (j = 0; j < mn; j += jump) {
2023-04-12 03:39:54 +08:00
match = TRUE;
blank = TRUE;
2023-04-12 03:41:11 +08:00
for (i = 0; i < width; i++) {
ac = da->nuc[j + i] | casefix;
bc = db->nuc[j + i] | casefix;
if (ac == 't') ac = 'u';
if (ac == 'T') ac = 'U';
if (bc == 't') bc = 'u';
if (bc == 'T') bc = 'U';
if ((ac == '-') || (ac | 32) == 'n' || (ac == ' ') ||
(bc == '-') || (bc | 32) == 'n' || (bc == ' '))
;
else {
2023-04-12 03:39:54 +08:00
blank = FALSE;
2023-04-12 03:41:11 +08:00
if (ac != bc) {
2023-04-12 03:39:54 +08:00
match = FALSE;
2023-04-12 03:41:11 +08:00
switch (ac) {
case 'a':
if (bc == 'c')
fnum += acwt;
else if (bc == 'g')
fnum += agwt;
else if (bc == 'u')
fnum += auwt;
break;
case 'c':
if (bc == 'a')
fnum += acwt;
else if (bc == 'g')
fnum += gcwt;
else if (bc == 'u')
fnum += ucwt;
break;
case 'g':
if (bc == 'a')
fnum += agwt;
else if (bc == 'c')
fnum += gcwt;
else if (bc == 'u')
fnum += ugwt;
break;
case 'u':
if (bc == 'a')
fnum += auwt;
else if (bc == 'c')
fnum += ucwt;
else if (bc == 'g')
fnum += ugwt;
break;
case 't':
if (bc == 'a')
fnum += auwt;
else if (bc == 'c')
fnum += ucwt;
else if (bc == 'g')
fnum += ugwt;
break;
default:
break;
2023-04-12 03:39:54 +08:00
};
}
}
2023-04-12 03:41:11 +08:00
if ((blank == FALSE) && match) {
(*num)++;
(*denom)++;
2023-04-12 03:39:54 +08:00
}
2023-04-12 03:41:11 +08:00
else if (!blank)
(*denom)++;
2023-04-12 03:39:54 +08:00
}
}
2023-04-12 03:41:11 +08:00
if (special) (*num) = *denom - (int)fnum;
return 0;
2023-04-12 03:39:54 +08:00
}
2023-04-12 03:41:11 +08:00
float setdist(num, denom, a, b)
int num, denom, a, b;
2023-04-12 03:39:54 +08:00
{
float cor;
2023-04-12 03:41:11 +08:00
switch (correction) {
case OLSEN:
cor = parta[a] * parta[b] + partc[a] * partc[b] +
partg[a] * partg[b] + partu[a] * partu[b];
break;
case JUKES:
cor = 0.25;
break;
case NONE:
cor = 0.0;
break;
default:
cor = 0.0;
break;
2023-04-12 03:39:54 +08:00
};
2023-04-12 03:41:11 +08:00
if (correction == NONE)
return (1.0 - (float)num / (float)denom);
2023-04-12 03:39:54 +08:00
else
2023-04-12 03:41:11 +08:00
return (-(1.0 - cor) * log(1.0 / (1.0 - cor) *
((float)num / (float)denom - cor)));
2023-04-12 03:39:54 +08:00
}
2023-04-12 03:41:11 +08:00
/*
 * Split command-line argument av[ndx] into an option name and a value.
 *
 *   atype  out: the text up to the first '=', ' ' or end of string; when the
 *          delimiter is '=', the '=' is included (e.g. "-c=")
 *   aval   out: for "name=value" arguments, the text after '=' up to the
 *          next ' ' or end of string; otherwise the empty string
 *
 * Both outputs must have room for 255 bytes (the size of the argtyp/argval
 * buffers the caller passes); longer text is truncated rather than written
 * past the end.  Always returns 0.
 */
int getarg(char **av, int ndx, char atype[], char aval[])
{
    enum { MAXLEN = 254 };          /* 255-byte buffer minus the NUL */
    const char *arg = av[ndx];
    int in = 0, out = 0;

    while (arg[in] != ' ' && arg[in] != '=' && arg[in] != '\0') {
        if (out < MAXLEN - 1)       /* keep one slot for a trailing '=' */
            atype[out++] = arg[in];
        in++;
    }

    /* Never leave the previous argument's value behind. */
    aval[0] = '\0';

    if (arg[in] != '=') {
        atype[out] = '\0';
        return 0;
    }

    atype[out++] = '=';
    atype[out] = '\0';
    in++;

    out = 0;
    while (arg[in] != '\0' && arg[in] != ' ') {
        if (out < MAXLEN)
            aval[out++] = arg[in];
        in++;
    }
    aval[out] = '\0';
    return 0;
}
SetPart()
{
2023-04-12 03:41:11 +08:00
int a, c, g, u, tot, i, j;
2023-04-12 03:39:54 +08:00
char nuc;
2023-04-12 03:41:11 +08:00
for (j = 0; j < numseq; j++) {
a = 0;
c = 0;
g = 0;
u = 0;
tot = 0;
for (i = 0; i < data[j].length; i++) {
nuc = data[j].nuc[i] | 32;
switch (nuc) {
case 'a':
a++;
tot++;
break;
case 'c':
c++;
tot++;
break;
case 'g':
g++;
tot++;
break;
case 'u':
u++;
tot++;
break;
case 't':
u++;
tot++;
break;
2023-04-12 03:39:54 +08:00
};
}
parta[j] = (float)a / (float)tot;
partc[j] = (float)c / (float)tot;
partg[j] = (float)g / (float)tot;
partu[j] = (float)u / (float)tot;
}
2023-04-12 03:41:11 +08:00
return 0;
2023-04-12 03:39:54 +08:00
}
/*
 * Print the lower-triangular matrix to stdout, one line per sequence.
 *
 * Line `row` lists, for every earlier sequence `col`, dist[col][row] times
 * 100 (or 100 minus that when `sim` is set) formatted as %6.1f.  With `tbl`
 * set, a five-line '#' header is printed first and each line starts with
 * the 1-based index and up to 15 characters of the sequence name.
 * Always returns 0.
 */
int Report(void)
{
    int row, col;

    if (tbl)
        printf("#\n#-\n#-\n#-\n#-\n");

    for (row = 0; row < numseq; row++) {
        if (tbl)
            printf("%2d: %-.15s|", row + 1, data[row].name);
        for (col = 0; col < row; col++) {
            double pct = dist[col][row] * 100.0;

            printf("%6.1f", sim ? 100 - pct : pct);
        }
        printf("\n");
    }
    return 0;
}
2023-04-12 03:41:11 +08:00
/*
 * Report whether string b occurs anywhere inside string a.
 *
 *   b  the text to look for (first argument)
 *   a  the text to search in (second argument)
 *
 * Returns 1 if b is found, 0 otherwise.  An empty b is found in any
 * non-empty a; when a is empty the result is always 0.
 */
int find(char *b, char *a)
{
    int needle_len = strlen(b);
    int hay_len = strlen(a);
    int pos, k;

    for (pos = 0; pos < hay_len; pos++) {
        /* a[pos + k] can at worst reach a's terminating NUL, which never
         * equals a character of b, so the scan stays inside a. */
        k = 0;
        while (k < needle_len && a[pos + k] == b[k])
            k++;
        if (k == needle_len)
            return 1;
    }
    return 0;
}