gb2tnt/gb2tnt.c
2023-02-28 16:07:31 +08:00

736 lines
26 KiB
C

/*
Copyright (C) 2008 Pablo A. Goloboff
Copyright (C) 2023 Guoyi Zhang;
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc., 59
Temple Place - Suite 330, Boston, MA 02111-1307, USA.
Email: pablogolo@csnat.edu.ar
Mail: Pablo A. Goloboff, INSUE, Instituto Miguel Lillo, Miguel Lillo 205, 4000 S.M. de Tucuman, Argentina.
Email: GuoyiZhang@malacology.net
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
// Verbosity flag; no use of it is visible in this part of the file.
int verb = 0 ;
// Set to 1 by the "protein" option: the /translation= text is output instead of nucleotides.
int use_translation = 0 ;
// dargc (with dargv below): copies of main()'s argc/argv.
// laquiero: per-accession decision flag used by process()
// (0 = undecided, 1 = matched a requested gene/product, -1 = rejected).
int dargc , laquiero ;
// Becomes 1 once the "accession " keyword has been written to notsfile.
int showedskipped = 0 ;
char ** dargv ;
int fileargs ; // args 1 to fileargs-1 are file names!
// inpf: input file (argv[1]); opsf: options file (argv[2]); curinput: the stream
// rd() currently reads from; notsfile: list of skipped accessions ("gb2tnt.not").
FILE * inpf , * opsf , * curinput , * opsf , * notsfile ;
// Last character returned by rd().
int laschar ;
// Genome-type codes stored in genometype and assigned per accession in process().
#define NUCLEAR 1
#define MITOCH 2
#define PLASTID 3
#define CHLORO 4
#define TRUE 1
#define FALSE 0
// Capacity of the useraccname table.
#define MAXUSERACC 50000
// Genome filter selected with the "genome" option; 0 means no filtering.
int genometype = 0 ;
// Non-zero when stringis() compares names by similarity score.
int use_string_matching = 0 ;
// Minimum similarity accepted by stringis() when use_string_matching is set.
double string_similarity ;
int stringis ( char * , char * ) ; // are both strings the same, case aside ???
int rd ( void ) ; // return next char. from input file; save it in laschar
void outer ( int , char * txt ) ; // if first arg. is TRUE, then output txt and exit
void gonln ( void ) ; // read all the way to the ENTER
void gotostring ( char * , int ) ; // find string of first arg, with margin of sec arg characters
void rdto ( int , char * ) ; // saves to string every byte it finds before first arg
void stornexline ( void ) ; // puts in stringsp everything to the ENTER
void * mymalloc ( unsigned long int ) ; // allocs mem or exits
void process (void);
void save_to_not ( void );
void setopts ( void );
void makelower ( char * txt );
void parsit ( void );
void parse_translation ( void );
void spew_name ( void );
void output_translation ( void );
void effect_complementation ( void );
// Location ranges parsed by storchunk(); positions are counted from 1 and are inclusive
// (parsit() increments its position counter before comparing against from/to).
#define MAXNUMCHUNKS 200
typedef struct { int from ; int to ; } Chunktyp ;
Chunktyp chunk[MAXNUMCHUNKS] ;
int numchunks = 0 ;
// Capacity of the sequence buffers below.
#define MAXSEQLENGTH 25000
// bytestring: sequence (or translation) about to be printed.
// complement_string: scratch copy used by effect_complementation().
char bytestring[MAXSEQLENGTH] ;
char complement_string[MAXSEQLENGTH] ;
// Counters reported by rd() when the input file is exhausted.
unsigned long int accepted = 0 , rejected = 0 ;
// stringsp: scratch buffer for the current token/line; headerline: feature header saved by process().
char stringsp[160] , headerline[160];
// Number of entries filled in prodname[] and genename[].
int prodnumber = 0 , genenumber = 0 ;
#define MAX_USER_DEFINITIONS 60
// prodname/genename: user-requested product and gene names (stored lower-cased, quotes included);
// accnumber: accession of the record being processed.
char * prodname[MAX_USER_DEFINITIONS] , * genename[MAX_USER_DEFINITIONS ] ,
accnumber [ 100 ] ;
// Organism name and taxonomy text of the current record, filled by rdto().
char taxname [ 100 ] ;
char taxonomy [ 1000 ] ;
// Accessions listed with the "accession" option, and how many were stored.
char ** useraccname ;
int useraccnumber = 0 ;
// dafsize: size of the input file in bytes (set in openit()); bytesread: progress counter kept by dildit().
unsigned long int dafsize , bytesread = 0 ;
#ifdef LINUX
/* Compiled only when LINUX is defined: hands back the next character
   waiting on standard input (or EOF). */
int getch ( void )
{
    int key ;

    key = getc ( stdin ) ;
    return key ;
}
#endif
/* Allocate `size` bytes with malloc(); when the allocation fails the
   program is terminated through outer() with an out-of-memory message.
   Returns the freshly allocated block. */
void * mymalloc ( unsigned long int size )
{
    void * block ;

    block = malloc ( size ) ;
    if ( block == NULL )
        outer ( 1 , "Not enough memory!" ) ;
    return block ;
}
/* Progress indicator: counts one more call in `bytesread` and, whenever the
   integer percentage of `dafsize` changes, rewrites the "Parsing ... N%"
   line on stderr.
   An input of size 0 is skipped explicitly: dividing by it would yield
   inf/NaN, and converting that to an integer is undefined behavior. */
void dildit ( void )
{
    static unsigned long int prv = 0 ;   /* last percentage shown */
    unsigned long int ifract ;

    ++ bytesread ;
    if ( dafsize == 0 ) return ;
    ifract = ( unsigned long int ) ( ( ( double ) bytesread / dafsize ) * 100 ) ;
    if ( ifract == prv ) return ;
    prv = ifract ;
    fprintf ( stderr , "\rParsing ... %lu%%" , ifract ) ;
}
/* Wipes the progress text from the current stderr line: carriage return,
   a blank, carriage return. */
void undild ( void )
{
    fputs ( "\r \r" , stderr ) ;
}
/* Read the next character from `curinput`, store it in `laschar` and
   return it.

   - When the input file (inpf) is already at end-of-file, the summary is
     printed, a key press is awaited and the program exits with status 1.
   - When the options file is at end-of-file, a notice is printed and EOF is
     returned. The stream is left open because callers keep testing
     feof() on it; closing it here and reading again would be undefined.
   - Line endings are normalised: both CR LF and a lone CR are returned as a
     single '\n' (10). Previously a lone CR caused the following character
     to be returned AND pushed back, so it was delivered twice. */
int rd ( void )
{
    int c ;

    if ( feof ( curinput ) ) {
        if ( curinput == inpf ) {
            fclose ( curinput ) ;
            undild () ;
            if ( !useraccnumber )
                fprintf ( stderr , "\nAccepted %lu accessions, rejected %lu" , accepted , rejected ) ;
            fprintf ( stderr , "\nFinished parsing %s (input)\n" , dargv[1] ) ;
            getc ( stdin ) ;
            exit ( 1 ) ;
        }
        fprintf ( stderr , "\nFinished parsing %s (options)\n" , dargv[2] ) ;
        return ( laschar = EOF ) ;
    }
    if ( curinput == inpf ) dildit () ;
    c = getc ( curinput ) ;
    if ( c == 13 ) {
        int next = getc ( curinput ) ;
        /* keep whatever follows the CR for the next call, unless it is the LF of a CR LF pair */
        if ( next != 10 && next != EOF ) ungetc ( next , curinput ) ;
        c = 10 ;
    }
    return ( laschar = c ) ;
}
/* Opens the input file named by dargv[1] in binary mode, makes it the
   current input stream, and records its size in `dafsize`.
   Terminates through outer() when the file cannot be opened.
   NOTE(review): the result of fstat() is not checked. */
void openit ( void )
{
    struct stat info ;

    inpf = fopen ( dargv [ 1 ] , "rb" ) ;
    curinput = inpf ;
    if ( inpf == NULL ) {
        fprintf ( stderr , "Error trying to open input file %s: " , dargv[1] ) ;
        outer ( 1 , "cannot open file" ) ;
    }
    fstat ( fileno ( inpf ) , &info ) ;
    dafsize = info.st_size ;
}
/* Conditional fatal error.
   doit : when zero the call does nothing.
   txt  : message written to stderr (followed by a newline) otherwise.
   After printing, one character is read from stdin and the program ends.
   NOTE(review): the exit status is 0 even though this is an error path. */
void outer ( int doit , char * txt )
{
    if ( doit ) {
        fprintf ( stderr , "%s\n" , txt ) ;
        getc ( stdin ) ;
        exit ( 0 ) ;
    }
}
/* Prints the usage text to stderr, waits for one character on stdin and
   exits with status 0. */
void dohelp ( void )
{
    static const char usage[] =
        "\n\n"
        "Usage is: \n\n"
        " Give input file name (1st arg) and options file (2nd arg)\n"
        " Output can be redirected with \">\"\n\n"
        " Inside options file: \n\n"
        " gene \"name(s)\"\n"
        " product \"name(s)\"\n"
        " protein\n"
        " genome \"type\"\n"
        " stringmatch S (S=similarity=1-(E/L)\n"
        " where E=edit cost, and L=length)\n\n"
        " List of skipped sequences goes to file gb2tnt.not\n\n"
        " To see details of skipped sequences, use gb2tnt.not as\n"
        " options file:\n\n"
        " \"gb2tnt input gb2tnt.not\"\n\n"
        " Alternatively, string similarity can be given as 3d and\n"
        " 4th argument (which overrides values in options file)\n\n" ;

    /* the text holds no '%' conversions, so it can be written verbatim */
    fputs ( usage , stderr ) ;
    getc ( stdin ) ;
    exit ( 0 ) ;
}
/* Program entry point: publishes the command line through the dargc/dargv
   globals, reads the options (setopts) and then converts the input
   (process). */
int main ( int argc , char ** argv )
{
    dargv = argv ;
    dargc = argc ;

    setopts () ;
    process () ;

    return 0 ;
}
/* Consumes characters with rd() until the last character read is a line
   feed (10). Returns at once when `laschar` already is one. */
void gonln ( void )
{
    for ( ; ; ) {
        if ( laschar == 10 ) return ;
        rd () ;
    }
}
/* Advance through the input, line by line, until a line is found whose first
   token equals `string`; that token is left in stringsp.
   string : keyword to look for.
   shift  : when > 0, the keyword must be preceded by `shift` space characters
            (a line with anything else in those columns is skipped); otherwise
            any amount of leading whitespace is skipped.
   If the input file reaches end-of-file, "Done!" is printed and the program
   exits with status 1.
   Note that end-of-file and ungetc() are applied to inpf, not curinput. */
void gotostring ( char * string , int shift )
{
char * cp ;
int i ;
int somenonwhite ;
// Seed stringsp with something that cannot equal the keyword.
strcpy ( stringsp , " " ) ;
while ( strcmp ( stringsp , string ) && !feof ( inpf ) ) {
// Finish the current line before examining the next one.
gonln () ;
somenonwhite = 0 ;
if ( shift > 0 ) {
// Fixed margin: every one of the first `shift` characters must be a blank (32).
for ( i = 0 ; i < shift && !somenonwhite ; ++ i )
if ( rd () != 32 ) somenonwhite = 1 ;
if ( somenonwhite ) continue ; }
else {
// Free margin: skip whitespace and push the first non-blank character back.
while ( isspace ( rd () ) ) ;
ungetc ( laschar , inpf ) ; }
// Cheap rejection on the first character before reading the whole token.
* ( cp = stringsp ) = rd () ;
if ( * cp ++ != * string ) continue ;
// Read the rest of the token up to the next whitespace; the length is capped
// (cp - stringsp > 98) so the token stays inside stringsp.
while ( !isspace( * cp = rd () ) ) {
cp ++ ;
if ( cp - stringsp > 98 ) break ; }
* cp = '\0' ; }
if ( feof ( inpf ) ) {
fprintf ( stderr , "\nDone!" ) ;
exit ( 1 ) ; }
}
/* Reads the next whitespace-delimited token from the input into
   `accnumber`. Leading whitespace is skipped.
   The token is truncated to fit the buffer; excess characters are still
   consumed so the stream position is the same as for a short token.
   (Previously an over-long token overflowed `accnumber`.) */
void rdaccnumber ( void )
{
    char * cp = accnumber ;
    char * const last = accnumber + sizeof accnumber - 1 ;   /* slot kept for the terminator */

    while ( isspace ( rd () ) ) ;
    while ( !isspace ( laschar ) ) {
        if ( cp < last ) * cp ++ = laschar ;
        rd () ;
    }
    * cp = '\0' ;
}
/* Copy input characters into `stor` until the character `donde` is reached.
   donde : terminating character (not stored).
   stor  : destination buffer; receives a NUL-terminated string.
   Leading whitespace is skipped. Blanks (32) are turned into '_', and so are
   line feeds unless the terminator itself is a line feed; consecutive '_'
   are collapsed into one.
   NOTE(review): there is no bound on how much is written to `stor`; the
   function is not told the buffer size. */
void rdto ( int donde , char * stor )
{
char * cp ;
// Previously stored character, used to collapse runs of '_'.
int a = 0 ;
cp = stor ;
while ( isspace( laschar ) ) rd () ;
while ( laschar != donde ) {
// Store unless both the previous and the current character are '_'.
if ( a != '_' || laschar != '_' )
* cp ++ = laschar ;
a = laschar ;
rd () ;
// laschar is rewritten in place so that the loop test and the next store see the substituted '_'.
if ( laschar == 32 ) laschar = '_' ;
if ( laschar == 10 && donde != 10 ) laschar = '_' ; }
* cp = '\0' ;
}
/* Reads the next non-blank line into `stringsp`.
   One character is consumed first, then any whitespace (including empty
   lines) is skipped, and everything up to the end of line (10 or 13) is
   stored, NUL-terminated.
   A line longer than the buffer is truncated; the remainder is still
   consumed so the stream ends up at the end of that line.
   (Previously a long line overflowed `stringsp`.) */
void stornexline ( void )
{
    char * cp = stringsp ;
    char * const last = stringsp + sizeof stringsp - 1 ;   /* slot kept for the terminator */

    rd () ;
    while ( isspace ( laschar ) ) rd () ;
    while ( laschar != 10 && laschar != 13 ) {
        if ( cp < last ) * cp ++ = laschar ;
        rd () ;
    }
    * cp = '\0' ;
}
// One cell of the alignment matrix filled by doneedwunsch(): cost of arriving
// from above (up), diagonally (diag) or from the left (lef), and the smallest of the three (min).
typedef struct
{ int up , diag , lef ;
int min ; } Stringcomptyp ;
// Matrix of cells, allocated once (100 x 100) on the first call to doneedwunsch().
Stringcomptyp ** cellcost ;
// Costs for opening and for extending a gap.
int gapcost = 1 , gapextcost = 1 ;
// Cost of a mismatch between two characters.
int suscost = 1 ;
// Set to 1 once cellcost has been allocated.
int mademem = 0 ;
/* Allocates a table of `wid` pointers, each pointing to its own block of
   hei * size bytes. Any allocation failure ends the program through
   outer(). Returns the table. */
void ** loray ( int wid , int hei , int size )
{
    void ** rows ;
    int r = 0 ;

    rows = malloc ( wid * sizeof ( void * ) ) ;
    outer ( ( rows == NULL ) , "Not enough RAM") ;
    while ( r < wid ) {
        rows [ r ] = malloc ( hei * size ) ;
        if ( rows [ r ] == NULL )
            outer ( 1 , "Not enough RAM") ;
        ++ r ;
    }
    return rows ;
}
double doneedwunsch ( char * ap , char * bp )
{
int wid , hei , i , j , dacos ;
char * app , * bpp ;
char * abecs , * bbecs , * anp , * bnp ;
double val ;
int HIGH = 10000000 ;
wid = strlen ( ap ) ;
hei = strlen ( bp ) ;
if ( ! mademem ) {
cellcost = ( Stringcomptyp ** ) loray ( 100 , 100 , sizeof ( Stringcomptyp ) ) ;
mademem = 1 ; }
outer ( ( hei >= 99 || wid >= 99 ) , "String is too long to use string-matching" ) ;
cellcost[0][0].min = cellcost[0][0].diag = 0 ;
cellcost[0][0].up = cellcost[0][0].lef = HIGH ;
bpp = bp ;
for ( j = 0 ; j < hei ; ++ j ) {
app = ap ;
for ( i = 0 ; i < wid ; ++ i ) {
if ( !i && !j ) {
continue ; }
dacos = 0 ;
if ( * app != * bpp ) dacos = suscost ;
if ( j ) {
if ( cellcost[i][j-1].min == cellcost[i][j-1].up )
cellcost[i][j].up = cellcost[i][j-1].min + gapextcost ;
else
cellcost[i][j].up = cellcost[i][j-1].min + gapcost ; }
else cellcost[i][j].up = cellcost[i][j].diag = HIGH ;
if ( i ) {
if ( cellcost[i-1][j].min == cellcost[i-1][j].lef )
cellcost[i][j].lef = cellcost[i-1][j].min + gapextcost ;
else
cellcost[i][j].lef = cellcost[i-1][j].min + gapcost ; }
else cellcost[i][j].lef = cellcost[i][j].diag = HIGH ;
if ( i && j ) cellcost[i][j].diag = cellcost[i-1][j-1].min + dacos ;
dacos = cellcost[i][j].diag ;
if ( dacos > cellcost[i][j].up ) dacos = cellcost[i][j].up ;
if ( dacos > cellcost[i][j].lef ) dacos = cellcost[i][j].lef ;
cellcost[i][j].min = dacos ;
++ app ; }
++ bpp ; }
dacos = cellcost[wid-1][hei-1].min ;
val = ( double ) dacos / ( double ) hei ;
val = 1 - val ;
return val ;
}
/* Decide whether text `a` matches the user-supplied name `b`.
   - b equal to the literal "?" (quotes included) matches anything.
   - With use_string_matching set, the match is accepted when
     doneedwunsch(a, b) reaches string_similarity.
   - Otherwise the strings are compared case-insensitively up to the end of
     the shorter one, so one string being a prefix of the other counts as a
     match.
   Characters are converted to unsigned char before tolower(): passing a
   negative plain char to a <ctype.h> function is undefined.
   Returns 1 on match, 0 otherwise. */
int stringis ( char * a , char * b )
{
    if ( !strcmp ( b , "\"?\"" ) ) return 1 ;
    if ( use_string_matching )
        return ( doneedwunsch ( a , b ) >= string_similarity ) ? 1 : 0 ;
    for ( ; * a && * b ; ++ a , ++ b )
        if ( tolower ( ( unsigned char ) * a ) != tolower ( ( unsigned char ) * b ) ) return 0 ;
    return 1 ;
}
/* Reads the next token from `curinput` into `stringsp`.
   Leading whitespace is skipped. If the token starts with a double quote,
   everything up to the closing quote is stored (both quotes included) and
   1 is returned. Otherwise characters up to the next whitespace are stored
   and 0 is returned.
   Tokens longer than the buffer are truncated but fully consumed; a quoted
   token keeps its closing quote. (Previously a long token overflowed
   `stringsp`.) */
int rdliteral ( void )
{
    char * cp = stringsp ;
    char * const last = stringsp + sizeof stringsp - 1 ;   /* slot kept for the terminator */

    rd () ;
    while ( isspace ( laschar ) && !feof ( curinput ) ) rd () ;
    if ( laschar == '\"' ) {
        * cp ++ = laschar ;
        /* leave room for the closing quote as well as the terminator */
        while ( rd () != '\"' && !feof ( curinput ) )
            if ( cp < last - 1 ) * cp ++ = laschar ;
        * cp ++ = laschar ;
        * cp = '\0' ;
        return 1 ;
    }
    * cp = laschar ;
    while ( !isspace ( laschar ) && !feof ( curinput ) ) {
        int c = rd () ;
        if ( cp < last ) * ++ cp = c ;
    }
    /* overwrites the delimiter (or end-of-file marker) that ended the token */
    * cp = '\0' ;
    return 0 ;
}
/* Returns 1 when `stringsp` begins with the text `a` (an empty `a` always
   matches), 0 otherwise. The comparison is case-sensitive. */
int istrunc ( char * a )
{
    char * b ;

    for ( b = stringsp ; * a != '\0' ; ++ a , ++ b )
        if ( * a != * b ) return 0 ;
    return 1 ;
}
/* Returns 1 when string `b` begins with string `a` (an empty `a` always
   matches), 0 otherwise. The comparison is case-sensitive. */
int isamatch ( char * a , char * b )
{
    size_t want = strlen ( a ) ;

    return strncmp ( a , b , want ) == 0 ;
}
/* Main conversion loop: handles one record of the input file per iteration.
   For each record it reads the accession, organism name and taxonomy, scans
   the feature table for requested /product= or /gene= names (and, with the
   "protein" option, the /translation= text), then either prints the
   sequence (parsit / output_translation) or logs the accession as skipped
   (save_to_not).
   When the options file listed accessions (useraccnumber != 0), nothing is
   converted: details of the listed accessions are printed instead.
   The loop normally ends inside gotostring() or rd(), which exit the program
   when the input is exhausted. */
void process (void)
{
char * cp ;
int i , mygenometype , showed_headerline ;
int showacc_only ;
int found_translation ;
while ( !feof ( inpf ) ) {
gotostring ( "ACCESSION" , 0 ) ;
rdaccnumber () ;
// showacc_only becomes 1 when this accession is one of those listed by the user.
showacc_only = 0 ;
if ( useraccnumber ) {
for ( i = 0 ; i < useraccnumber && !showacc_only ; ++ i )
if ( !strcmp ( accnumber , useraccname[i] ) ) showacc_only = 1 ; }
laquiero = found_translation = 0 ;
gotostring ( "ORGANISM" , -1 ) ;
// Organism name runs to the end of the line; taxonomy runs to the first '.'.
rdto ( 10 , taxname ) ;
rdto ( '.' , taxonomy ) ;
gotostring ( "FEATURES" , 0 ) ;
mygenometype = NUCLEAR ;
/**** Accession starts here... ****/
// Initial scan: read lines until the first tRNA/rRNA/CDS feature (or ORIGIN,
// meaning no such feature exists), noting any /organelle= qualifier on the way.
while ( 1 ) {
stornexline ( ) ;
if ( istrunc ( "ORIGIN" ) ) { laquiero = -1 ; break ; }
if ( istrunc ( "/organelle=" ) ) {
if ( istrunc ( "/organelle=\"mitoc" ) )
mygenometype = MITOCH ;
else if ( istrunc ( "/organelle=\"chlorop" ) )
mygenometype = CHLORO ;
else if ( istrunc ( "/organelle=\"plastid" ) )
mygenometype = PLASTID ; }
if ( istrunc ( "tRNA" ) ) break ;
if ( istrunc ( "rRNA" ) ) break ;
if ( istrunc ( "CDS" ) ) {
/*** line added for Marcos's case... ****/
if ( use_translation ) laquiero = FALSE ;
break ; }} /***** End initial parsing... ******/
// Reject the record when a genome filter is active and does not match.
if ( genometype && mygenometype != genometype )
laquiero = -1 ;
// Feature scan: continues while the record is undecided, or (protein mode)
// until a translation has been read. stringsp holds the current line.
while ( laquiero == FALSE || ( use_translation && !found_translation ) ) {
if ( !istrunc ( "gene" ) ) {
if ( istrunc ( "ORIGIN" ) ) break ;
if ( istrunc ( "tRNA" ) ||
istrunc ( "rRNA" ) ||
istrunc ( "CDS" ) ) {
// Remember the feature header (it carries the location) for parsit().
showed_headerline = 0 ;
strcpy ( headerline , stringsp ) ;
cp = stringsp + 10 ;
while ( isspace ( * cp ) ) ++ cp ;
if ( isamatch ( "join(" , cp ) ) {
// A join(...) not closed on this line continues on the next one, which is appended.
// NOTE(review): strcat() is not bounded by the size of headerline.
while ( * cp != ')' && * cp ) ++ cp ;
if ( !*cp ) {
stornexline () ;
cp = stringsp ;
while ( isspace ( * cp ) ) ++ cp ; }
strcat ( headerline , cp ) ; }}
// Move on to the next line; the tests below apply to that new line.
stornexline () ;
if ( showacc_only ) {
// Listing mode: print accession, organism and genome type once per record...
if ( showacc_only ++ == 1 ) {
fprintf ( stdout , "\n%s , %s " , accnumber , taxname ) ;
if ( mygenometype == NUCLEAR ) fprintf ( stdout , ", nuclear " ) ;
if ( mygenometype == CHLORO ) fprintf ( stdout , ", chloro " ) ;
if ( mygenometype == MITOCH ) fprintf ( stdout , ", mitoch " ) ;
if ( mygenometype == PLASTID ) fprintf ( stdout , ", plastid " ) ; }
// ...each feature header once...
if ( !showed_headerline )
if ( isamatch ( "tRNA" , headerline ) ||
isamatch ( "rRNA" , headerline ) ||
isamatch ( "CDS" , headerline ) ) {
fprintf ( stdout , "\n %s , " , headerline ) ;
++ showed_headerline ; }
// ...and every product/gene name found under it.
if ( ( istrunc ( "/product=" ) || istrunc ( "/gene=" ) ) ) {
if ( istrunc ( "/product=" ) ) cp = stringsp + 9 ;
else cp = stringsp + 6 ;
fprintf ( stdout , ", %s" , cp ) ; }}
// The translation is read once laquiero is non-zero (which includes -1, rejected).
if (
/*** line added for Marcos's problem.... ****/
laquiero &&
istrunc ( "/translation=" ) && use_translation ) {
parse_translation () ;
found_translation = 1 ; }
// Compare the qualifier value (lower-cased in place) with the requested names.
if ( istrunc ( "/product=" ) && prodnumber ) {
cp = stringsp + 9 ;
makelower ( cp ) ;
for ( i = 0 ; i < prodnumber && !laquiero ; ++ i )
if ( stringis ( cp , prodname[i] ) ) laquiero = 1 ; }
if ( istrunc ( "/gene=" ) && genenumber ) {
cp = stringsp + 6 ;
makelower ( cp ) ;
for ( i = 0 ; i < genenumber && !laquiero ; ++ i )
if ( stringis ( cp , genename[i] ) ) laquiero = 1 ; }}
else stornexline () ; }
// Listing mode never converts or logs anything.
if ( useraccnumber ) {
// if ( showacc_only == 2 ) fprintf ( stdout , "\n" ) ;
continue ; }
// Output the record when it matched (and, in protein mode, a translation was found);
// otherwise add it to the list of skipped accessions.
if ( laquiero == TRUE && ( !use_translation || found_translation ) ) {
if ( use_translation ) output_translation () ;
else parsit () ; }
else
save_to_not () ;
} // --- while ( !feof ( inpf ) )
}
/* Logs the current accession as skipped: bumps `rejected` and appends the
   quoted accession number to `notsfile`. The very first entry is preceded
   by the keyword "accession ". */
void save_to_not ( void )
{
    ++ rejected ;
    if ( showedskipped == 0 ) {
        fprintf ( notsfile , "accession " ) ;
    }
    showedskipped = 1 ;
    fprintf ( notsfile , "\"%s\" " , accnumber ) ;
}
/* Parses the command line and the options file, then opens the input file.

   Command line: argv[1] = input file, argv[2] = options file; an optional
   "stringmatch S" in argv[3]/argv[4] enables similarity matching and takes
   precedence over the options file. "--help" as argv[1] prints the usage.

   Options file keywords: protein, stringmatch S, gene "..." ...,
   product "..." ..., accession "..." ..., genome "type".
   Gene and product names are stored lower-cased with their quotes;
   accessions are stored without quotes.

   Unless accessions were listed, "gb2tnt.not" is created for the skipped
   accessions. Any error ends the program through outer().

   Fixes relative to the previous version: product names are allocated with
   mymalloc() like gene names (the malloc() result was not checked), and the
   accession table is allocated only once, so a second "accession" keyword
   no longer discards the entries already stored. */
void setopts ( void )
{
    int i ;

    if ( dargc > 1 )
        if ( !strcmp ( dargv[1] , "--help" ) ) dohelp () ;
    outer ( dargc < 3 , "Specify input file and option file (\"--help\" to get help)" ) ;
    outer ( ( curinput = opsf = fopen ( dargv[ 2 ] , "rb" ) ) == NULL , "Cannot open file with options" ) ;
    if ( dargc > 4 )
        if ( !strcmp ( dargv[3], "stringmatch" ) ) {
            use_string_matching = 1 ;
            string_similarity = atof ( dargv[4] ) ;
        }
    while ( !feof ( opsf ) ) {
        /* every statement starts with a bare keyword, never a quoted literal */
        outer ( rdliteral () , "Unexpected literal string in options file" ) ;
        if ( !strcmp ( stringsp , "protein" ) ) use_translation = 1 ;
        else if ( !strcmp ( stringsp , "stringmatch" ) ) {
            rdliteral () ;
            /* a value given on the command line wins */
            if ( !use_string_matching ) {
                use_string_matching = 1 ;
                string_similarity = atof ( stringsp ) ;
            }
        }
        else if ( !strcmp ( stringsp , "gene" ) ) {
            while ( !feof ( curinput ) ) {
                /* peek at the next non-blank character: the list goes on while it is a quote */
                while ( isspace ( rd () ) && !feof ( curinput ) ) ;
                ungetc ( laschar , opsf ) ;
                if ( laschar != '\"' ) break ;
                outer ( genenumber == MAX_USER_DEFINITIONS , "Cannot define so many gene names" ) ;
                rdliteral () ;
                makelower ( stringsp ) ;
                genename [ genenumber ] = mymalloc ( ( strlen ( stringsp ) + 1 ) * sizeof ( char ) ) ;
                strcpy ( genename [ genenumber ++ ] , stringsp ) ;
            }
        }
        else if ( !strcmp ( stringsp , "accession" ) ) {
            if ( useraccname == NULL )
                useraccname = mymalloc ( MAXUSERACC * sizeof ( char * ) ) ;
            while ( !feof ( curinput ) ) {
                while ( isspace ( rd () ) && !feof ( curinput ) ) ;
                ungetc ( laschar , opsf ) ;
                if ( laschar != '\"' ) break ;
                rdliteral () ;
                if ( useraccnumber == MAXUSERACC )
                    fprintf ( stderr , "Cannot define so many accessions, will skip from %s on" , stringsp ) ;
                else {
                    /* store the accession without its surrounding quotes */
                    useraccname [ useraccnumber ] = mymalloc ( ( strlen ( stringsp ) + 1 ) * sizeof ( char ) ) ;
                    strcpy ( useraccname [ useraccnumber ] , stringsp + 1 ) ;
                    i = strlen ( useraccname [ useraccnumber ] ) ;
                    useraccname [ useraccnumber ] [ i - 1 ] = '\0' ;
                    ++ useraccnumber ;
                }
            }
        }
        else if ( !strcmp ( stringsp , "product" ) ) {
            while ( !feof ( curinput ) ) {
                while ( isspace ( rd () ) && !feof ( curinput ) ) ;
                ungetc ( laschar , opsf ) ;
                if ( laschar != '\"' ) break ;
                outer ( prodnumber == MAX_USER_DEFINITIONS , "Cannot define so many product names" ) ;
                rdliteral () ;
                makelower ( stringsp ) ;
                prodname [ prodnumber ] = mymalloc ( ( strlen ( stringsp ) + 1 ) * sizeof ( char ) ) ;
                strcpy ( prodname [ prodnumber ++ ] , stringsp ) ;
            }
        }
        else if ( !strcmp ( stringsp , "genome" ) ) {
            outer ( !rdliteral () , "Syntax error after \"genome\"" ) ;
            if ( !strcmp ( stringsp , "\"mitochondrial\"" ) ) genometype = MITOCH ;
            else if ( !strcmp ( stringsp , "\"nuclear\"" ) ) genometype = NUCLEAR ;
            else if ( !strcmp ( stringsp , "\"plastid\"" ) ) genometype = PLASTID ;
            else if ( !strcmp ( stringsp , "\"chloroplast\"" ) ) genometype = CHLORO ;
            else outer ( 1 , "Unrecognized genome option" ) ;
        }
    }
    if ( use_string_matching )
        fprintf ( stderr , "\nUsing string similarity of %.3f\n" , string_similarity ) ;
    if ( !useraccnumber )
        outer ( ( notsfile = fopen ( "gb2tnt.not" , "wb" ) ) == NULL , "Cannot open file for skipped accessions" ) ;
    openit () ;
}
/* Converts the NUL-terminated string `txt` to lower case in place.
   Each character is passed to tolower() as an unsigned char: handing a
   negative plain char to a <ctype.h> function is undefined behavior. */
void makelower ( char * txt )
{
    char * cp ;

    for ( cp = txt ; * cp ; ++ cp )
        * cp = ( char ) tolower ( ( unsigned char ) * cp ) ;
}
int wrong_location ;
char * storchunk ( char * cp )
{
if ( * cp == '<' ) ++ cp ;
if ( !isdigit ( * cp ) ) { wrong_location = 1 ; * cp = '0' ; return cp ; }
chunk[numchunks].from = atoi ( cp ) ;
while ( * cp ++ != '.' && * cp ) ;
if ( * cp != '.' ) { wrong_location = 1 ; * cp = '0' ; return cp ; }
++ cp ;
if ( * cp == '>' ) ++ cp ;
if ( !isdigit ( * cp ) ) { wrong_location = 1 ; * cp = '0' ; return cp ; }
chunk[numchunks].to = atoi ( cp ) ;
while ( isdigit ( * cp ) ) ++ cp ;
++ numchunks ;
return cp ;
}
// Number of records for which parsit() started reading sequence data.
int species_read = 0 ;
/* Extract the nucleotide sequence of the feature described by headerline
   and print it.
   The location in headerline may be a single range, join(...) or
   complement(...); its ranges are stored with storchunk(). The function then
   advances to ORIGIN, copies the bases that fall inside the ranges into
   bytestring, prints the name line (spew_name) followed by the sequence,
   reverse-complementing it first when the location was complement(...).
   An unparsable location sends the accession to the skipped list. */
void parsit ( void )
{
// The location is looked for from column 15 of the header line onward.
char * cp = headerline + 15 ;
int i , atchunk , atpos ;
char * bytept = bytestring , now ;
int complement_it = 0 , didntmatch ;
numchunks = 0 ;
wrong_location = 0 ;
while ( isspace ( * cp ) ) ++ cp ;
// Plain range: FROM..TO, optionally starting with '<'.
if ( * cp == '<' || isdigit ( * cp ) ) storchunk ( cp ) ;
else {
// Compare the first 4 characters with "join" by temporarily terminating the string there.
i = cp [ 4 ] ;
cp [ 4 ] = '\0' ;
didntmatch = 0 ;
if ( strcmp ( "join" , cp ) ) didntmatch = 1 ;
cp [ 4 ] = i ;
// Same trick with the first 10 characters and "complement".
i = cp [ 10 ] ;
cp [ 10 ] = '\0' ;
if ( didntmatch )
if ( strcmp ( "complement" , cp ) ) {
fprintf ( stderr , "OOPS!!\nFound unrecognized location specifier: %s\nFor accession %s\n" , cp , accnumber ) ;
save_to_not () ; return ; }
if ( !strcmp ( "complement" , cp ) ) {
complement_it = 1 ;
// Skip "complement(" (11 characters).
cp += 11 ; }
// Otherwise restore the saved character and skip "join(" (5 characters).
else { cp [ 10 ] = i ; cp += 5 ; }
// Read ranges separated by commas/whitespace up to the closing parenthesis.
while ( * cp != ')' && !wrong_location ) {
cp = storchunk ( cp ) ;
while ( * cp == ',' || isspace ( * cp ) ) ++ cp ; }}
if ( wrong_location ) { save_to_not () ; return ; }
gotostring ( "ORIGIN" , 0 ) ;
// Consume the first token that follows ORIGIN.
rdliteral () ;
atchunk = atpos = 0 ;
++ species_read ;
while ( 1 ) {
now = laschar ;
// Skip line ends, blanks and the digits of the position labels.
while ( laschar == 10 || laschar == 13 || laschar == 32 || ( laschar >= '0' && laschar <= '9' ) ) {
now = laschar ;
rd () ; }
// A '/' means the end of the sequence was reached before all ranges were
// filled; the function returns without printing anything.
if ( now == '/' || laschar == '/' ) return ;
// laschar is now a base; atpos is its position, counted from 1.
++ atpos ;
if ( chunk[atchunk].from <= atpos && chunk[atchunk].to >= atpos ) {
* bytept ++ = laschar ;
outer ( ( bytept - bytestring >= MAXSEQLENGTH ) , "OOPS -- sequence is too long!\nChange MAXSEQLENGTH and re-compile" ) ; }
// Mark the base as consumed so the skip loop above reads the next character.
laschar = 32 ;
// Current range finished: go to the next one, or stop after the last.
if ( atpos == chunk[atchunk].to )
if ( ++ atchunk == numchunks ) break ; }
* bytept = '\0' ;
++ accepted ;
spew_name () ;
if ( complement_it ) effect_complementation () ;
fprintf ( stdout , "%s" , bytestring ) ;
fprintf ( stdout , "\n" ) ;
fflush ( stdout ) ;
}
void parse_translation ( void )
{
char * cp , * bytept ;
cp = stringsp + 14 ;
bytept = bytestring ;
while ( * cp != '\"' && * cp ) {
if ( !isspace ( * cp ) ) * bytept ++ = * cp ++ ;
if ( * cp == 10 || * cp == 13 || * cp == '\0' ) {
stornexline () ;
cp = stringsp ;
while ( isspace ( * cp ) ) ++ cp ; }}
* bytept = '\0' ;
}
/* Prints the name line of a sequence:
       >TAXNAME____ACCESSION_@LINEAGE
   LINEAGE is built in place inside `taxonomy`: the text up to and including
   the first ';' plus one more character is dropped, then characters are
   copied until two further ';' have been met (the ';' themselves are
   omitted).
   The scan no longer steps past the end of `taxonomy` when it contains no
   ';' or ends right after one; LINEAGE is then simply shorter or empty. */
void spew_name ( void )
{
    char * cp = taxonomy , * here , * begg ;
    int numsemicols = 0 ;

    while ( * cp != ';' && * cp ) ++ cp ;
    /* skip the ';' and the separator after it, but never the terminator */
    if ( * cp ) ++ cp ;
    if ( * cp ) ++ cp ;
    begg = here = cp ;
    while ( numsemicols < 2 && * cp ) {
        if ( * cp == ';' ) {
            ++ cp ;
            ++ numsemicols ;
            if ( ! * cp ) break ;
        }
        * here ++ = * cp ++ ;
    }
    * here = '\0' ;
    fprintf ( stdout , ">%s____%s_@%s\n" , taxname , accnumber , begg ) ;
}
/* Emits the record in protein mode: counts it as accepted, prints the name
   line, then the translation held in `bytestring` followed by a newline,
   and flushes stdout. */
void output_translation ( void )
{
    ++ accepted ;
    spew_name () ;
    fputs ( bytestring , stdout ) ;
    fputc ( '\n' , stdout ) ;
    fflush ( stdout ) ;
}
/* tmpmask : nucleotide symbol -> bit set (A=1, G=2, C=4, T=8; ambiguity
             codes are the union of their bases); 0 for any other character.
   makeit  : single-base bit -> bit of the complementary base.
   antimask: bit set -> symbol (the lower-case letter, being the last one
             written while the table is filled in ascending character order). */
char tmpmask[256] ;
char makeit[9] ;
char antimask[16] ;

/* Replaces `bytestring` by its reverse complement (`complement_string` is
   used as scratch space). Known symbols come out in lower case.
   Changes relative to the previous version:
   - characters are used as table indices through unsigned char, so bytes
     above 127 can no longer index before the start of tmpmask;
   - a character with no entry in tmpmask is copied unchanged to its mirrored
     position; it used to become a NUL byte, silently truncating the
     sequence. */
void effect_complementation ( void )
{
    int i , k , l , bit , x , y , len ;

    for ( i = 0 ; i < 256 ; ++ i ) tmpmask[i] = 0 ;
    tmpmask [ 'a' ] = tmpmask [ 'A' ] = 1 ;
    tmpmask [ 'g' ] = tmpmask [ 'G' ] = 2 ;
    tmpmask [ 'c' ] = tmpmask [ 'C' ] = 4 ;
    tmpmask [ 't' ] = tmpmask [ 'T' ] = 8 ;
    tmpmask [ 'R' ] = tmpmask [ 'r' ] = tmpmask [ 'a' ] | tmpmask [ 'g' ] ;
    tmpmask [ 'Y' ] = tmpmask [ 'y' ] = tmpmask [ 't' ] | tmpmask [ 'c' ] ;
    tmpmask [ 'W' ] = tmpmask [ 'w' ] = tmpmask [ 'a' ] | tmpmask [ 't' ] ;
    tmpmask [ 'S' ] = tmpmask [ 's' ] = tmpmask [ 'c' ] | tmpmask [ 'g' ] ;
    tmpmask [ 'M' ] = tmpmask [ 'm' ] = tmpmask [ 'a' ] | tmpmask [ 'c' ] ;
    tmpmask [ 'K' ] = tmpmask [ 'k' ] = tmpmask [ 'g' ] | tmpmask [ 't' ] ;
    tmpmask [ 'B' ] = tmpmask [ 'b' ] = tmpmask [ 'c' ] | tmpmask [ 'g' ] | tmpmask [ 't' ] ;
    tmpmask [ 'D' ] = tmpmask [ 'd' ] = tmpmask [ 'a' ] | tmpmask [ 'g' ] | tmpmask [ 't' ] ;
    tmpmask [ 'H' ] = tmpmask [ 'h' ] = tmpmask [ 'a' ] | tmpmask [ 'c' ] | tmpmask [ 't' ] ;
    tmpmask [ 'V' ] = tmpmask [ 'v' ] = tmpmask [ 'a' ] | tmpmask [ 'c' ] | tmpmask [ 'g' ] ;
    tmpmask [ 'N' ] = tmpmask [ 'n' ] = 1 | 2 | 4 | 8 ;
    makeit[tmpmask['a']] = tmpmask['t'] ;
    makeit[tmpmask['c']] = tmpmask['g'] ;
    makeit[tmpmask['g']] = tmpmask['c'] ;
    makeit[tmpmask['t']] = tmpmask['a'] ;
    for ( i = 0 ; i < 256 ; ++ i )
        if ( tmpmask[i] )
            antimask[ ( int ) tmpmask[i] ] = ( char ) i ;

    len = ( int ) strlen ( bytestring ) ;
    memcpy ( complement_string , bytestring , ( size_t ) len ) ;
    /* read forwards from the copy, write backwards into bytestring */
    for ( i = 0 , k = len - 1 ; i < len ; ++ i , -- k ) {
        unsigned char symbol = ( unsigned char ) complement_string [ i ] ;
        x = tmpmask [ symbol ] ;
        if ( x == 0 ) {
            bytestring [ k ] = ( char ) symbol ;
            continue ;
        }
        /* complement every base contained in the (possibly ambiguous) symbol */
        y = 0 ;
        for ( l = 0 , bit = 1 ; l < 4 ; ++ l , bit <<= 1 )
            if ( ( bit & x ) )
                y |= makeit[ bit ] ;
        bytestring [ k ] = antimask [ y ] ;
    }
}