gde_linux/HGL_SRC/Translate.c

388 lines
9.9 KiB
C
Executable File

#include <stdio.h>
#include <malloc.h>
#include <strings.h>
#include "global_defs.h"
char vert_mito[512][4] =
{
"AAA","Lys", "AAC","Asn", "AAG","Lys", "AAT","Asn", "ACA","Thr",
"ACC","Thr", "ACG","Thr", "ACT","Thr", "AGA","Ter", "AGC","Ser",
"AGG","Ter", "AGT","Ser", "ATA","Met", "ATC","Ile", "ATG","Met",
"ATT","Ile", "CAA","Gln", "CAC","His", "CAG","Gln", "CAT","His",
"CCA","Pro", "CCC","Pro", "CCG","Pro", "CCT","Pro", "CGA","Arg",
"CGC","Arg", "CGG","Arg", "CGT","Arg", "CTA","Leu", "CTC","Leu",
"CTG","Leu", "CTT","Leu", "GAA","Glu", "GAC","Asp", "GAG","Glu",
"GAT","Asp", "GCA","Ala", "GCC","Ala", "GCG","Ala", "GCT","Ala",
"GGA","Gly", "GGC","Gly", "GGG","Gly", "GGT","Gly", "GTA","Val",
"GTC","Val", "GTG","Val", "GTT","Val", "TAA","Ter", "TAC","Tyr",
"TAG","Ter", "TAT","Tyr", "TCA","Ser", "TCC","Ser", "TCG","Ser",
"TCT","Ser", "TGA","Trp", "TGC","Cys", "TGG","Trp", "TGT","Cys",
"TTA","Leu", "TTC","Phe", "TTG","Leu", "TTT","Phe"
},
mycoplasma[512][4] =
{
"AAA","Lys", "AAC","Asn", "AAG","Lys", "AAT","Asn", "ACA","Thr",
"ACC","Thr", "ACG","Thr", "ACT","Thr", "AGA","Arg", "AGC","Ser",
"AGG","Arg", "AGT","Ser", "ATA","Ile", "ATC","Ile", "ATG","Met",
"ATT","Ile", "CAA","Gln", "CAC","His", "CAG","Gln", "CAT","His",
"CCA","Pro", "CCC","Pro", "CCG","Pro", "CCT","Pro", "CGA","Arg",
"CGC","Arg", "CGG","Arg", "CGT","Arg", "CTA","Leu", "CTC","Leu",
"CTG","Leu", "CTT","Leu", "GAA","Glu", "GAC","Asp", "GAG","Glu",
"GAT","Asp", "GCA","Ala", "GCC","Ala", "GCG","Ala", "GCT","Ala",
"GGA","Gly", "GGC","Gly", "GGG","Gly", "GGT","Gly", "GTA","Val",
"GTC","Val", "GTG","Val", "GTT","Val", "TAA","Ter", "TAC","Tyr",
"TAG","Ter", "TAT","Tyr", "TCA","Ser", "TCC","Ser", "TCG","Ser",
"TCT","Ser", "TGA","Trp", "TGC","Cys", "TGG","Trp", "TGT","Cys",
"TTA","Leu", "TTC","Phe", "TTG","Leu", "TTT","Phe" },
universal[512][4] =
{
"AAA","Lys", "AAC","Asn", "AAG","Lys", "AAT","Asn", "ACA","Thr",
"ACC","Thr", "ACG","Thr", "ACT","Thr", "AGA","Arg", "AGC","Ser",
"AGG","Arg", "AGT","Ser", "ATA","Ile", "ATC","Ile", "ATG","Met",
"ATT","Ile", "CAA","Gln", "CAC","His", "CAG","Gln", "CAT","His",
"CCA","Pro", "CCC","Pro", "CCG","Pro", "CCT","Pro", "CGA","Arg",
"CGC","Arg", "CGG","Arg", "CGT","Arg", "CTA","Leu", "CTC","Leu",
"CTG","Leu", "CTT","Leu", "GAA","Glu", "GAC","Asp", "GAG","Glu",
"GAT","Asp", "GCA","Ala", "GCC","Ala", "GCG","Ala", "GCT","Ala",
"GGA","Gly", "GGC","Gly", "GGG","Gly", "GGT","Gly", "GTA","Val",
"GTC","Val", "GTG","Val", "GTT","Val", "TAA","Ter", "TAC","Tyr",
"TAG","Ter", "TAT","Tyr", "TCA","Ser", "TCC","Ser", "TCG","Ser",
"TCT","Ser", "TGA","Ter", "TGC","Cys", "TGG","Trp", "TGT","Cys",
"TTA","Leu", "TTC","Phe", "TTG","Leu", "TTT","Phe" },
yeast[512][4] =
{
"AAA","Lys", "AAC","Asn", "AAG","Lys", "AAT","Asn", "ACA","Thr",
"ACC","Thr", "ACG","Thr", "ACT","Thr", "AGA","Arg", "AGC","Ser",
"AGG","Arg", "AGT","Ser", "ATA","Met", "ATC","Ile", "ATG","Met",
"ATT","Ile", "CAA","Gln", "CAC","His", "CAG","Gln", "CAT","His",
"CCA","Pro", "CCC","Pro", "CCG","Pro", "CCT","Pro", "CGA","Arg",
"CGC","Arg", "CGG","Arg", "CGT","Arg", "CTA","Thr", "CTC","Thr",
"CTG","Thr", "CTT","Thr", "GAA","Glu", "GAC","Asp", "GAG","Glu",
"GAT","Asp", "GCA","Ala", "GCC","Ala", "GCG","Ala", "GCT","Ala",
"GGA","Gly", "GGC","Gly", "GGG","Gly", "GGT","Gly", "GTA","Val",
"GTC","Val", "GTG","Val", "GTT","Val", "TAA","Ter", "TAC","Tyr",
"TAG","Ter", "TAT","Tyr", "TCA","Ser", "TCC","Ser", "TCG","Ser",
"TCT","Ser", "TGA","Trp", "TGC","Cys", "TGG","Trp", "TGT","Cys",
"TTA","Leu", "TTC","Phe", "TTG","Leu", "TTT","Phe"
};
char three_to_one[23][5] = {
"AlaA", "ArgR", "AsnN", "AspD",
"AsxB", "CysC", "GlnQ", "GluE",
"GlxZ", "GlyG", "HisH", "IleI",
"LeuL", "LysK", "MetM", "PheF",
"ProP", "SerS", "ThrT", "TrpW",
"TyrY", "ValV", "Ter*"
};
main(ac,av)
int ac;
char **av;
{
int Success = TRUE,cursize,tbl,i,j,k,maxsize = 10,frame=1,
min_frame = 0,ltrs=0, sep=0, tmp_num_frame, print_comp=FALSE;
Sequence temp,*seqs;
FILE *file;
char number[5],tr_tbl[512][4];
extern char *Realloc();
extern Translate_NA_AA();
if(ac == 1)
{
fprintf(stderr,
"Options:[-tbl codon_table] [-frame #] [-min_frame #] [-3] [-sep] GDEfile\n");
fprintf(stderr," 1=universal 1=first frame Shortest AA Three letter don't seperate\n");
fprintf(stderr," 2=mycoplasma 2=second frame sequence codes groups\n");
fprintf(stderr," 3=yeast 3=third frame to translate\n");
fprintf(stderr," 4=Vert. mito. 6=All six\n");
exit(0);
}
for(j=1;j<ac-1;j++)
{
if(strcmp(av[j],"-tbl")==0)
{
sscanf(av[++j],"%d",&tbl);
}
if(strcmp(av[j],"-frame")==0)
{
sscanf(av[++j],"%d",&frame);
}
if(strcmp(av[j],"-min_frame")==0)
{
sscanf(av[++j],"%d",&min_frame);
}
if(strcmp(av[j],"-3")==0)
{
ltrs = 1;
}
if(strcmp(av[j],"-sep")==0)
{
sep = 1;
}
}
file = fopen(av[ac-1],"r");
if(file == NULL)
exit(-1);
seqs = (Sequence*)Calloc(maxsize,sizeof(Sequence));
for(cursize = 0;Success == TRUE;cursize++)
{
Success = ReadRecord(file,&(seqs[cursize]));
if(cursize == maxsize-1)
{
maxsize *= 2;
seqs = (Sequence*)Realloc(seqs, maxsize*sizeof(Sequence));
}
if(Success==TRUE)
{
seqs[cursize].group_number = 99999;
}
}
cursize--;
tmp_num_frame = frame; /*added by jckelley, 7/2/92 */
for(j=0;j<cursize;j++)
{
frame = tmp_num_frame; /*added by jckelley, 7/2/92 */
if(frame == 6)
for(i=0;i<2;i++)
{
for(k=0;k<3;k++)
{
SeqCopy(&(seqs[j]),&temp);
if(i==0)
sprintf(number,"_%d",k+1);
else
sprintf(number,"_comp%d",k+1);
strncat(temp.name,number,32);
switch (tbl)
{
case 3:
Translate_NA_AA(&temp,yeast,k,min_frame,ltrs,sep);
break;
case 4:
Translate_NA_AA(&temp,vert_mito,k,min_frame,ltrs,sep);
break;
case 2:
Translate_NA_AA(&temp,mycoplasma,k,min_frame,ltrs,sep);
break;
case 1:
default:
Translate_NA_AA(&temp,universal,k,min_frame,ltrs,sep);
break;
}
/* WriteRecord(stdout,&temp,NULL,0); */
}
SeqRev(&(seqs[j]),seqs[j].offset,seqs[j].seqlen+
seqs[j].offset-1);
SeqComp(&(seqs[j]));
}
else
{
frame = (frame-1)%3;
SeqCopy(&(seqs[j]),&temp);
sprintf(number,"_%d",frame+1);
strncat(temp.name,number,32);
switch(tbl)
{
case 3:
Translate_NA_AA(&temp,yeast,frame,min_frame,ltrs,sep);
break;
case 4:
Translate_NA_AA(&temp,vert_mito,frame,min_frame,ltrs,sep);
break;
case 2:
Translate_NA_AA(&temp,mycoplasma,frame,min_frame,ltrs,sep);
break;
case 1:
default:
Translate_NA_AA(&temp,universal,frame,min_frame,ltrs,sep);
break;
}
}
}
exit(0);
}
Translate_NA_AA(seq,table,r_frame,min_frame,letter_code,sep)
Sequence *seq;
char table[][4];
int r_frame,min_frame,letter_code, sep;
{
extern char *Realloc(), ThreeToOne();
int i,true_start,k,pos,fptr = 0,start = 0;
int last_start = 0,codon_count = 0;
char c,codon[4],*temp, *save_c_elem, *save_name, strtmp[80];
extern int Default_PROColor_LKUP[];
int grp = 0;
codon[3] = '\0';
fptr = 0;
save_name = malloc(80);
strncpy(save_name, seq->name, 80);
temp=(char*)Calloc(seq->seqlen+1,sizeof(char));
for(i=0;i<seq->seqlen;i++)
temp[i] = '-';
if(letter_code == 1)
{
/*
* Triple letter codes
*/
strcpy(seq->type,"TEXT");
}
else
{
/*
* Single Letter Codes
*/
strcpy(seq->type , "PROT");
}
/*
* Skip over r_frame valid characters (skip ' ','-','~')
*/
for(true_start=0,i=0; i<r_frame&&true_start<
seq->seqlen;true_start++)
{
c=seq->c_elem[true_start];
if(index(" -~",c) == NULL)
i++;
}
for(pos=true_start;pos<seq->seqlen;pos++)
{
c=seq->c_elem[pos];
if(index(" -~",c) == NULL)
{
c &= (255-32); /*upper case*/
if(c == 'U') c = 'T';
/*
* We have a valid character...
*/
if(fptr == 0)
start = pos;
codon[fptr++] = c;
/*
* Translate the codon...
*/
if(fptr == 3)
{
/*
* Place default code 'X' in case translation fails
*/
temp[start] = 'X';
for(i=0;i<512;i+=2)
if(strcmp(codon,table[i]) == 0)
{
if(letter_code == 1)
{
temp[start] = table[i+1][0];
temp[start+1] = table[i+1][1];
temp[start+2] = table[i+1][2];
}
else
temp[start] = ThreeToOne
(table[i+1]);
i = 512;
}
fptr = 0;
/*
* Check to see if it is a valid ORF
*/
if((strncmp("Ter",&(temp[start]),3) == 0) ||
(temp[start] == '*'))
{
/*
* If the ORF is too small, clear it out...
*/
if(codon_count < min_frame)
{
/*
* Should we seperate the groups out, or not?!?!
*
*/
if (!sep)
{
/*
* If reading from stop to stop, leave the elading stop codon
*/
if(temp[last_start]=='*' ||
strncmp("Ter",&(temp[last_start]),3) == 0)
for(i=last_start+3;i<start+3;i++)
temp[i] = '-';
/*
* Otherwise, we are at the first coding region, no stop codon...
*/
else
for(i=last_start;i<start+3;i++)
temp[i] = '-';
}
else
{
for(i=0;i<seq->seqlen;i++)
temp[i] = '-';
}
}
else
{
if (sep)
{
sprintf(strtmp, "_%d", grp++);
strncat(seq->name, strtmp, 20);
save_c_elem = seq->c_elem;
seq->c_elem = temp;
WriteRecord(stdout,seq,NULL,0);
strncpy(seq->name, save_name, 80);
seq->c_elem = save_c_elem;
for(i=0;i<seq->seqlen;i++)
temp[i] = '-';
}
}
codon_count = 0;
last_start = start;
/*
* Don't bother continuing if too close to end to meat min_frame requirements
*
*/
if ((pos + (3 * min_frame)) > seq->seqlen)
break;
}
else
codon_count++;
}
}
}
Cfree(seq->c_elem);
if (!sep)
{
seq->c_elem = temp;
WriteRecord(stdout,seq,NULL,0);
}
Cfree(temp);
return;
}
char ThreeToOne(s)
char s[];
{
extern char three_to_one[][5];
int j;
for(j=0;j<23;j++)
if(strncmp(s,three_to_one[j],3) == 0)
return(three_to_one[j][3]);
Warning("ThreeToOne, code not found");
return('*');
}