/* File: ureadseq.c * * Reads and writes nucleic/protein sequence in various * formats. Data files may have multiple sequences. * * Copyright 1990 by d.g.gilbert * biology dept., indiana university, bloomington, in 47405 * e-mail: gilbertd@bio.indiana.edu * * This program may be freely copied and used by anyone. * Developers are encourged to incorporate parts in their * programs, rather than devise their own private sequence * format. * * This should compile and run with any ANSI C compiler. * */ #include #include #include #include #define UREADSEQ_G #include "ureadseq.h" #pragma segment ureadseq int Strcasecmp(const char *a, const char *b) /* from Nlm_StrICmp */ { int diff, done; if (a == b) return 0; done = 0; while (!done) { diff = to_upper(*a) - to_upper(*b); if (diff) return diff; if (*a == '\0') done = 1; else { a++; b++; } } return 0; } int Strncasecmp(const char *a, const char *b, long maxn) /* from Nlm_StrNICmp */ { int diff, done; if (a == b) return 0; done = 0; while (!done) { diff = to_upper(*a) - to_upper(*b); if (diff) return diff; if (*a == '\0') done = 1; else { a++; b++; maxn--; if (!maxn) done = 1; } } return 0; } #ifndef Local #define Local static /* local functions */ #endif #define kStartLength 500 const char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*"; const char *primenuc = "ACGTU"; const char *protonly = "EFIPQZ"; const char kNocountsymbols[5] = "_.-?"; const char stdsymbols[6] = "_.-*?"; const char allsymbols[32] = "_.-*?<>{}[]()!@#$%^&=+;:'/|`~\"\\"; static const char *seqsymbols = allsymbols; const char nummask[11] = "0123456789"; const char nonummask[11] = "~!@#$%^&*("; /* use general form of isseqchar -- all chars + symbols. no formats except nbrf (?) use symbols in data area as anything other than sequence chars. */ /* Local variables for readSeq: */ struct ReadSeqVars { short choice, err, nseq; long seqlen, maxseq, seqlencount; short topnseq; long topseqlen; const char *fname; char *seq, *seqid, matchchar; boolean allDone, done, filestart, addit; FILE *f; long linestart; char s[256], *sp; int (*isseqchar)(); /* int (*isseqchar)(int c); << sgi cc hates (int c) */ }; int isSeqChar(int c) { return (isalpha(c) || strchr(seqsymbols, c)); } int isSeqNumChar(int c) { return (isalnum(c) || strchr(seqsymbols, c)); } int isAnyChar(int c) { return isascii(c); /* wrap in case isascii is macro */ } Local void readline(FILE *f, char *s, long *linestart) { char *cp; *linestart = ftell(f); if (NULL == fgets(s, 256, f)) *s = 0; else { cp = strchr(s, '\n'); if (cp != NULL) *cp = 0; } } Local void cgetline(struct ReadSeqVars *V) { readline(V->f, V->s, &V->linestart); } Local void ungetline(struct ReadSeqVars *V) { fseek(V->f, V->linestart, 0); } Local void addseq(char *s, struct ReadSeqVars *V) { char *ptr; if (V->addit) while (*s != 0) { if ((V->isseqchar)(*s)) { if (V->seqlen >= V->maxseq) { V->maxseq += kStartLength; ptr = (char *)realloc(V->seq, V->maxseq + 1); if (ptr == NULL) { V->err = eMemFull; return; } else V->seq = ptr; } V->seq[(V->seqlen)++] = *s; } s++; } } Local void countseq(char *s, struct ReadSeqVars *V) /* this must count all valid seq chars, for some formats (paup-sequential) even if we are skipping seq... */ { while (*s != 0) { if ((V->isseqchar)(*s)) { (V->seqlencount)++; } s++; } } Local void addinfo(char *s, struct ReadSeqVars *V) { char s2[256], *si; boolean saveadd; si = s2; while (*s == ' ') s++; sprintf(si, " %d) %s\n", V->nseq, s); saveadd = V->addit; V->addit = true; V->isseqchar = isAnyChar; addseq(si, V); V->addit = saveadd; V->isseqchar = isSeqChar; } Local void readLoop(short margin, boolean addfirst, boolean (*endTest)(boolean *addend, boolean *ungetend, struct ReadSeqVars *V), struct ReadSeqVars *V) { boolean addend = false; boolean ungetend = false; V->nseq++; if (V->choice == kListSequences) V->addit = false; else V->addit = (V->nseq == V->choice); if (V->addit) V->seqlen = 0; if (addfirst) addseq(V->s, V); do { cgetline(V); V->done = feof(V->f); V->done |= (*endTest)(&addend, &ungetend, V); if (V->addit && (addend || !V->done) && (strlen(V->s) > margin)) { addseq((V->s) + margin, V); } } while (!V->done); if (V->choice == kListSequences) addinfo(V->seqid, V); else { V->allDone = (V->nseq >= V->choice); if (V->allDone && ungetend) ungetline(V); } } Local boolean endIG(boolean *addend, boolean *ungetend, struct ReadSeqVars *V) { *addend = true; /* 1 or 2 occur in line w/ bases */ *ungetend = false; return ((strchr(V->s, '1') != NULL) || (strchr(V->s, '2') != NULL)); } Local void readIG(struct ReadSeqVars *V) { /* 18Aug92: new IG format -- ^L between sequences in place of ";" */ char *si; while (!V->allDone) { do { cgetline(V); for (si = V->s; *si != 0 && *si < ' '; si++) *si = ' '; /* drop controls */ if (*si == 0) *V->s = 0; /* chop line to empty */ } while (!(feof(V->f) || ((*V->s != 0) && (*V->s != ';')))); if (feof(V->f)) V->allDone = true; else { strcpy(V->seqid, V->s); readLoop(0, false, endIG, V); } } } Local boolean endStrider(boolean *addend, boolean *ungetend, struct ReadSeqVars *V) { *addend = false; *ungetend = false; return (strstr(V->s, "//") != NULL); } Local void readStrider(struct ReadSeqVars *V) { /* ? only 1 seq/file ? */ while (!V->allDone) { cgetline(V); if (strstr(V->s, "; DNA sequence ") == V->s) strcpy(V->seqid, (V->s) + 16); else strcpy(V->seqid, (V->s) + 1); while ((!feof(V->f)) && (*V->s == ';')) { cgetline(V); } if (feof(V->f)) V->allDone = true; else readLoop(0, true, endStrider, V); } } Local boolean endPIR(boolean *addend, boolean *ungetend, struct ReadSeqVars *V) { *addend = false; *ungetend = (strstr(V->s, "ENTRY") == V->s); return ((strstr(V->s, "///") != NULL) || *ungetend); } Local void readPIR(struct ReadSeqVars *V) { /*PIR -- many seqs/file */ while (!V->allDone) { while (!(feof(V->f) || strstr(V->s, "ENTRY") || strstr(V->s, "SEQUENCE"))) cgetline(V); strcpy(V->seqid, (V->s) + 16); while (!(feof(V->f) || strstr(V->s, "SEQUENCE") == V->s)) cgetline(V); readLoop(0, false, endPIR, V); if (!V->allDone) { while (!( feof(V->f) || ((*V->s != 0) && (strstr(V->s, "ENTRY") == V->s)))) cgetline(V); } if (feof(V->f)) V->allDone = true; } } Local boolean endGB(boolean *addend, boolean *ungetend, struct ReadSeqVars *V) { *addend = false; *ungetend = (strstr(V->s, "LOCUS") == V->s); return ((strstr(V->s, "//") != NULL) || *ungetend); } Local void readGenBank(struct ReadSeqVars *V) { /*GenBank -- many seqs/file */ while (!V->allDone) { strcpy(V->seqid, (V->s) + 12); while (!(feof(V->f) || strstr(V->s, "ORIGIN") == V->s)) cgetline(V); readLoop(0, false, endGB, V); if (!V->allDone) { while (!( feof(V->f) || ((*V->s != 0) && (strstr(V->s, "LOCUS") == V->s)))) cgetline(V); } if (feof(V->f)) V->allDone = true; } } Local boolean endNBRF(boolean *addend, boolean *ungetend, struct ReadSeqVars *V) { char *a; if ((a = strchr(V->s, '*')) != NULL) { /* end of 1st seq */ /* "*" can be valid base symbol, drop it here */ *a = 0; *addend = true; *ungetend = false; return (true); } else if (*V->s == '>') { /* start of next seq */ *addend = false; *ungetend = true; return (true); } else return (false); } Local void readNBRF(struct ReadSeqVars *V) { while (!V->allDone) { strcpy(V->seqid, (V->s) + 4); cgetline(V); /*skip title-junk line*/ readLoop(0, false, endNBRF, V); if (!V->allDone) { while (!(feof(V->f) || (*V->s != 0 && *V->s == '>'))) cgetline(V); } if (feof(V->f)) V->allDone = true; } } Local boolean endPearson(boolean *addend, boolean *ungetend, struct ReadSeqVars *V) { *addend = false; *ungetend = true; return (*V->s == '>'); } Local void readPearson(struct ReadSeqVars *V) { while (!V->allDone) { strcpy(V->seqid, (V->s) + 1); readLoop(0, false, endPearson, V); if (!V->allDone) { while ( !(feof(V->f) || ((*V->s != 0) && (*V->s == '>')))) cgetline(V); } if (feof(V->f)) V->allDone = true; } } Local boolean endEMBL(boolean *addend, boolean *ungetend, struct ReadSeqVars *V) { *addend = false; *ungetend = (strstr(V->s, "ID ") == V->s); return ((strstr(V->s, "//") != NULL) || *ungetend); } Local void readEMBL(struct ReadSeqVars *V) { while (!V->allDone) { strcpy(V->seqid, (V->s) + 5); do { cgetline(V); } while (!(feof(V->f) | (strstr(V->s, "SQ ") == V->s))); readLoop(0, false, endEMBL, V); if (!V->allDone) { while ( !(feof(V->f) | ((*V->s != '\0') & (strstr(V->s, "ID ") == V->s)))) cgetline(V); } if (feof(V->f)) V->allDone = true; } } Local boolean endZuker(boolean *addend, boolean *ungetend, struct ReadSeqVars *V) { *addend = false; *ungetend = true; return (*V->s == '('); } Local void readZuker(struct ReadSeqVars *V) { /*! 1st string is Zuker's Fortran format */ while (!V->allDone) { cgetline(V); /*s == "seqLen seqid string..."*/ strcpy(V->seqid, (V->s) + 6); readLoop(0, false, endZuker, V); if (!V->allDone) { while ( !(feof(V->f) | ((*V->s != '\0') & (*V->s == '(')))) cgetline(V); } if (feof(V->f)) V->allDone = true; } } Local boolean endFitch(boolean *addend, boolean *ungetend, struct ReadSeqVars *V) { /* this is a somewhat shaky end, 1st char of line is non-blank for seq. title */ *addend = false; *ungetend = true; return (*V->s != ' '); } Local void readFitch(struct ReadSeqVars *V) { boolean first; first = true; while (!V->allDone) { if (!first) strcpy(V->seqid, V->s); readLoop(0, first, endFitch, V); if (feof(V->f)) V->allDone = true; first = false; } } Local void readPlain(struct ReadSeqVars *V) { V->nseq++; V->addit = (V->choice > 0); if (V->addit) V->seqlen = 0; addseq(V->seqid, V); /*from above..*/ if (V->fname != NULL) sprintf(V->seqid, "%s [Unknown form]", V->fname); else sprintf(V->seqid, " [Unknown form]"); do { addseq(V->s, V); V->done = feof(V->f); cgetline(V); } while (!V->done); if (V->choice == kListSequences) addinfo(V->seqid, V); V->allDone = true; } Local void readUWGCG(struct ReadSeqVars *V) { /* 10nov91: Reading GCG files casued duplication of last line when EOF followed that line !!! fix: cgetline now sets *V->s = 0 */ char *si; V->nseq++; V->addit = (V->choice > 0); if (V->addit) V->seqlen = 0; strcpy(V->seqid, V->s); /*writeseq: " %s Length: %d (today) Check: %d ..\n" */ /*drop above or ".." from id*/ if (si = strstr(V->seqid, " Length: ")) *si = 0; else if (si = strstr(V->seqid, "..")) *si = 0; do { V->done = feof(V->f); cgetline(V); if (!V->done) addseq((V->s), V); } while (!V->done); if (V->choice == kListSequences) addinfo(V->seqid, V); V->allDone = true; } Local void readOlsen(struct ReadSeqVars *V) { /* G. Olsen /print output from multiple sequence editor */ char *si, *sj, *sk, *sm, sid[40], snum[20]; boolean indata = false; int snumlen; V->addit = (V->choice > 0); if (V->addit) V->seqlen = 0; rewind(V->f); V->nseq = 0; do { cgetline(V); V->done = feof(V->f); if (V->done && !(*V->s)) break; else if (indata) { if ((si = strstr(V->s, sid)) /* && (strstr(V->s, snum) == si - snumlen - 1) ) { */ && (sm = strstr(V->s, snum)) && (sm < si - snumlen)) { /* Spaces are valid alignment data !! */ /* 17Oct91: Error, the left margin is 21 not 22! */ /* dropped some nucs up to now -- my example * file was right shifted ! */ /* variable right id margin, drop id-2 spaces at * end */ /* VMS CC COMPILER (VAXC031) mess up: -- Index of 21 is chopping 1st nuc on VMS systems Only! Byte-for-byte same ame rnasep.olsen sequence file ! */ /* si = (V->s)+21; < was this before VMS CC * wasted my time */ si += 10; /* use strstr index plus offset to outfox VMS CC bug */ if (sk = strstr(si, sid)) *(sk - 2) = 0; for (sk = si; *sk != 0; sk++) { if (*sk == ' ') *sk = '.'; /* 18aug92: !! some olsen masks are * NUMBERS !! which addseq eats */ else if (isdigit(*sk)) *sk = nonummask[*sk - '0']; } addseq(si, V); } } else if (sk = strstr(V->s, "): ")) { /* seq info header line */ /* 18aug92: correct for diff seqs w/ same name -- use * number, e.g. */ /* 3 (Agr.tume): agrobacterium.prna 18-JUN-1987 * 16:12 */ /* 328 (Agr.tume): agrobacterium.prna XYZ 19-DEC-1992 */ (V->nseq)++; si = 1 + strchr(V->s, '('); *sk = ' '; if (V->choice == kListSequences) addinfo(si, V); else if (V->nseq == V->choice) { strcpy(V->seqid, si); sj = strchr(V->seqid, ':'); while (*(--sj) == ' ') ; while (--sj != V->seqid) { if (*sj == ' ') *sj = '_'; } *sk = 0; while (*(--sk) == ' ') *sk = 0; strcpy(sid, si); si = V->s; while ((*si <= ' ') && (*si != 0)) si++; snumlen = 0; while (si[snumlen] > ' ' && snumlen < 20) { snum[snumlen] = si[snumlen]; snumlen++; } snum[snumlen] = 0; } } else if (strstr(V->s, "identity: Data:")) { indata = true; if (V->choice == kListSequences) V->done = true; } } while (!V->done); V->allDone = true; } /*readOlsen*/ Local void readMSF(struct ReadSeqVars *V) { /* gcg's MSF, mult. sequence format, interleaved ! */ char *si, *sj, sid[128]; boolean indata = false; int atseq = 0, iline = 0; V->addit = (V->choice > 0); if (V->addit) V->seqlen = 0; rewind(V->f); V->nseq = 0; do { cgetline(V); V->done = feof(V->f); if (V->done && !(*V->s)) break; else if (indata) { /*somename ...gpvedai .......t.. aaigr..vad tvgtgptnse * aipaltaaet */ /* E gvenae.kgv tentna.tad fvaqpvylpe .nqt...... * kv.affynrs */ si = V->s; skipwhitespace(si); /* for (sj= si; isalnum(*sj); sj++) ; bug -- cdelwiche * uses "-", "_" and others in names*/ for (sj = si; *sj > ' '; sj++) ; *sj = 0; if (*si) { if ((0 == strcmp(si, sid))) { addseq(sj + 1, V); } iline++; } } else if (NULL != (si = strstr(V->s, "Name: "))) { /* seq info header line */ /* Name: somename Len: 100 Check: 7009 * Weight: 1.00 */ (V->nseq)++; si += 6; if (V->choice == kListSequences) addinfo(si, V); else if (V->nseq == V->choice) { strcpy(V->seqid, si); si = V->seqid; skipwhitespace(si); /* for (sj= si; isalnum(*sj); sj++) ; -- bug */ for (sj = si; *sj > ' '; sj++) ; *sj = 0; strcpy(sid, si); } } else if (strstr(V->s, "//") /*== V->s*/) { indata = true; iline = 0; if (V->choice == kListSequences) V->done = true; } } while (!V->done); V->allDone = true; } /*readMSF*/ Local void readPAUPinterleaved(struct ReadSeqVars *V) { /* PAUP mult. sequence format, interleaved or sequential! */ char *si, *sj, *send, sid[40], sid1[40], saveseq[255]; boolean first = true, indata = false, domatch; int atseq = 0, iline = 0, ifmc, saveseqlen = 0; #define fixmatchchar(s) \ { \ for (ifmc = 0; ifmc < saveseqlen; ifmc++) \ if (s[ifmc] == V->matchchar) s[ifmc] = saveseq[ifmc]; \ } V->addit = (V->choice > 0); V->seqlencount = 0; if (V->addit) V->seqlen = 0; /* rewind(V->f); V->nseq= 0; << do in caller !*/ indata = true; /* call here after we find "matrix" */ domatch = (V->matchchar > 0); do { cgetline(V); V->done = feof(V->f); if (V->done && !(*V->s)) break; else if (indata) { /* [ 1 1 1 * ]*/ /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR * cttacatcct*/ /* chimp ................a.t. .c.................a * ..........*/ /* !! need to correct for V->matchchar */ si = V->s; skipwhitespace(si); if (strchr(si, ';')) indata = false; if (isalnum(*si)) { /* valid data line starts w/ a left-justified * seq name in columns [0..8] */ if (first) { (V->nseq)++; if (V->nseq >= V->topnseq) first = false; for (sj = si; isalnum(*sj); sj++) ; send = sj; skipwhitespace(sj); if (V->choice == kListSequences) { *send = 0; addinfo(si, V); } else if (V->nseq == V->choice) { if (domatch) { if (V->nseq == 1) { strcpy(saveseq, sj); saveseqlen = strlen( saveseq); } else fixmatchchar( sj); } addseq(sj, V); *send = 0; strcpy(V->seqid, si); strcpy(sid, si); if (V->nseq == 1) strcpy(sid1, sid); } } else if ((strstr(si, sid) == si)) { while (isalnum(*si)) si++; skipwhitespace(si); if (domatch) { if (V->nseq == 1) { strcpy(saveseq, si); saveseqlen = strlen(saveseq); } else fixmatchchar(si); } addseq(si, V); } else if (domatch && (strstr(si, sid1) == si)) { strcpy(saveseq, si); saveseqlen = strlen(saveseq); } iline++; } } else if (strstr(V->s, "matrix")) { indata = true; iline = 0; if (V->choice == kListSequences) V->done = true; } } while (!V->done); V->allDone = true; } /*readPAUPinterleaved*/ Local void readPAUPsequential(struct ReadSeqVars *V) { /* PAUP mult. sequence format, interleaved or sequential! */ char *si, *sj; boolean atname = true, indata = false; V->addit = (V->choice > 0); if (V->addit) V->seqlen = 0; V->seqlencount = 0; /* rewind(V->f); V->nseq= 0; << do in caller !*/ indata = true; /* call here after we find "matrix" */ do { cgetline(V); V->done = feof(V->f); if (V->done && !(*V->s)) break; else if (indata) { /* [ 1 1 1 * ]*/ /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR * cttacatcct*/ /* aagcttcaccggcgcagtca ttctcataatcgcccacggR * cttacatcct*/ /* chimp ................a.t. .c.................a * ..........*/ /* ................a.t. .c.................a * ..........*/ si = V->s; skipwhitespace(si); if (strchr(si, ';')) indata = false; if (isalnum(*si)) { /* valid data line starts w/ a left-justified * seq name in columns [0..8] */ if (atname) { (V->nseq)++; V->seqlencount = 0; atname = false; sj = si + 1; while (isalnum(*sj)) sj++; if (V->choice == kListSequences) { /* !! we must count bases to * know when topseqlen is * reached ! */ countseq(sj, V); if (V->seqlencount >= V->topseqlen) atname = true; *sj = 0; addinfo(si, V); } else if (V->nseq == V->choice) { addseq(sj, V); V->seqlencount = V->seqlen; if (V->seqlencount >= V->topseqlen) atname = true; *sj = 0; strcpy(V->seqid, si); } else { countseq(sj, V); if (V->seqlencount >= V->topseqlen) atname = true; } } else if (V->nseq == V->choice) { addseq(V->s, V); V->seqlencount = V->seqlen; if (V->seqlencount >= V->topseqlen) atname = true; } else { countseq(V->s, V); if (V->seqlencount >= V->topseqlen) atname = true; } } } else if (strstr(V->s, "matrix")) { indata = true; atname = true; if (V->choice == kListSequences) V->done = true; } } while (!V->done); V->allDone = true; } /*readPAUPsequential*/ Local void readPhylipInterleaved(struct ReadSeqVars *V) { char *si, *sj; boolean first = true; int iline = 0; V->addit = (V->choice > 0); if (V->addit) V->seqlen = 0; V->seqlencount = 0; /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); << topnseq == 0 * !!! bad scan !! */ si = V->s; skipwhitespace(si); V->topnseq = atoi(si); while (isdigit(*si)) si++; skipwhitespace(si); V->topseqlen = atol(si); /* fprintf(stderr,"Phylip-ileaf: topnseq=%d topseqlen=%d\n",V->topnseq, * V->topseqlen); */ do { cgetline(V); V->done = feof(V->f); if (V->done && !(*V->s)) break; si = V->s; skipwhitespace(si); if (*si != 0) { if (first) { /* collect seq names + seq, as fprintf(outf,"%-10s ",seqname); */ (V->nseq)++; if (V->nseq >= V->topnseq) first = false; sj = V->s + 10; /* past name, start of data */ if (V->choice == kListSequences) { *sj = 0; addinfo(si, V); } else if (V->nseq == V->choice) { addseq(sj, V); *sj = 0; strcpy(V->seqid, si); } } else if (iline % V->nseq == V->choice - 1) { addseq(si, V); } iline++; } } while (!V->done); V->allDone = true; } /*readPhylipInterleaved*/ Local boolean endPhylipSequential(boolean *addend, boolean *ungetend, struct ReadSeqVars *V) { *addend = false; *ungetend = false; countseq(V->s, V); return V->seqlencount >= V->topseqlen; } Local void readPhylipSequential(struct ReadSeqVars *V) { short i; char *si; /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); < ? bad sscan ? */ si = V->s; skipwhitespace(si); V->topnseq = atoi(si); while (isdigit(*si)) si++; skipwhitespace(si); V->topseqlen = atol(si); cgetline(V); while (!V->allDone) { V->seqlencount = 0; strncpy(V->seqid, (V->s), 10); V->seqid[10] = 0; for (i = 0; i < 10 && V->s[i]; i++) V->s[i] = ' '; readLoop(0, true, endPhylipSequential, V); if (feof(V->f)) V->allDone = true; } } Local void readSeqMain(struct ReadSeqVars *V, const long skiplines_, const short format_) { #define tolowerstr(s) \ { \ long Itlwr, Ntlwr = strlen(s); \ for (Itlwr = 0; Itlwr < Ntlwr; Itlwr++) \ s[Itlwr] = to_lower(s[Itlwr]); \ } boolean gotuw; long l; V->linestart = 0; V->matchchar = 0; if (V->f == NULL) V->err = eFileNotFound; else { for (l = skiplines_; l > 0; l--) cgetline(V); do { cgetline(V); for (l = strlen(V->s); (l > 0) && (V->s[l] == ' '); l--) ; } while ((l == 0) && !feof(V->f)); if (feof(V->f)) V->err = eNoData; else switch (format_) { case kPlain: readPlain(V); break; case kIG: readIG(V); break; case kStrider: readStrider(V); break; case kGenBank: readGenBank(V); break; case kPIR: readPIR(V); break; case kNBRF: readNBRF(V); break; case kPearson: readPearson(V); break; case kEMBL: readEMBL(V); break; case kZuker: readZuker(V); break; case kOlsen: readOlsen(V); break; case kMSF: readMSF(V); break; case kPAUP: { boolean done = false; boolean interleaved = false; char *cp; /* rewind(V->f); V->nseq= 0; ?? assume * it is at top ?? skiplines ... */ while (!done) { cgetline(V); tolowerstr(V->s); if (strstr(V->s, "matrix")) done = true; if (strstr(V->s, "interleav")) interleaved = true; if (NULL != (cp = strstr(V->s, "ntax="))) V->topnseq = atoi(cp + 5); if (NULL != (cp = strstr(V->s, "nchar="))) V->topseqlen = atoi(cp + 6); if (NULL != (cp = strstr( V->s, "matchchar="))) { cp += 10; if (*cp == '\'') cp++; else if (*cp == '"') cp++; V->matchchar = *cp; } } if (interleaved) readPAUPinterleaved(V); else readPAUPsequential(V); } break; /* kPhylip: ! can't determine in middle of file * which type it is...*/ /* test for interleave or sequential and use * Phylip4(ileave) or Phylip2 */ case kPhylip2: readPhylipSequential(V); break; case kPhylip4: /* == kPhylip3 */ readPhylipInterleaved(V); break; default: V->err = eUnknownFormat; break; case kFitch: strcpy(V->seqid, V->s); cgetline(V); readFitch(V); break; case kGCG: do { gotuw = (strstr(V->s, "..") != NULL); if (gotuw) readUWGCG(V); cgetline(V); } while (!(feof(V->f) || V->allDone)); break; } } V->filestart = false; V->seq[V->seqlen] = 0; /* stick a string terminator on it */ } char *readSeqFp(const short whichEntry_, /* index to sequence in file */ FILE *fp_, /* pointer to open seq file */ const long skiplines_, const short format_, /* sequence file format */ long *seqlen_, /* return seq size */ short *nseq_, /* number of seqs in file, for listSeqs() */ short *error_, /* return error */ char *seqid_) /* return seq name/info */ { struct ReadSeqVars V; if (format_ < kMinFormat || format_ > kMaxFormat) { *error_ = eUnknownFormat; *seqlen_ = 0; return NULL; } V.choice = whichEntry_; V.fname = NULL; /* don't know */ V.seq = (char *)calloc(1, kStartLength + 1); V.maxseq = kStartLength; V.seqlen = 0; V.seqid = seqid_; V.f = fp_; V.filestart = (ftell(fp_) == 0); /* !! in sequential read, must remove current seq position from * choice/whichEntry_ counter !! ... */ if (V.filestart) V.nseq = 0; else V.nseq = *nseq_; /* track where we are in file...*/ *V.seqid = '\0'; V.err = 0; V.nseq = 0; V.isseqchar = isSeqChar; if (V.choice == kListSequences) ; /* leave as is */ else if (V.choice <= 0) V.choice = 1; /* default ?? */ V.addit = (V.choice > 0); V.allDone = false; readSeqMain(&V, skiplines_, format_); *error_ = V.err; *seqlen_ = V.seqlen; *nseq_ = V.nseq; return V.seq; } char *readSeq(const short whichEntry_, /* index to sequence in file */ const char *filename_, /* file name */ const long skiplines_, const short format_, /* sequence file format */ long *seqlen_, /* return seq size */ short *nseq_, /* number of seqs in file, for listSeqs() */ short *error_, /* return error */ char *seqid_) /* return seq name/info */ { struct ReadSeqVars V; if (format_ < kMinFormat || format_ > kMaxFormat) { *error_ = eUnknownFormat; *seqlen_ = 0; return NULL; } V.choice = whichEntry_; V.fname = filename_; /* don't need to copy string, just ptr to it */ V.seq = (char *)calloc(1, kStartLength + 1); V.maxseq = kStartLength; V.seqlen = 0; V.seqid = seqid_; V.f = NULL; *V.seqid = '\0'; V.err = 0; V.nseq = 0; V.isseqchar = isSeqChar; if (V.choice == kListSequences) ; /* leave as is */ else if (V.choice <= 0) V.choice = 1; /* default ?? */ V.addit = (V.choice > 0); V.allDone = false; V.f = fopen(V.fname, "r"); V.filestart = true; readSeqMain(&V, skiplines_, format_); if (V.f != NULL) fclose(V.f); *error_ = V.err; *seqlen_ = V.seqlen; *nseq_ = V.nseq; return V.seq; } char *listSeqs(const char *filename_, /* file name */ const long skiplines_, const short format_, /* sequence file format */ short *nseq_, /* number of seqs in file, for listSeqs() */ short *error_) /* return error */ { char seqid[256]; long seqlen; return readSeq(kListSequences, filename_, skiplines_, format_, &seqlen, nseq_, error_, seqid); } short seqFileFormat(/* return sequence format number, see ureadseq.h */ const char *filename, long *skiplines, /* return #lines to skip any junk like mail header */ short *error) /* return any error value or 0 */ { FILE *fseq; short format; fseq = fopen(filename, "r"); format = seqFileFormatFp(fseq, skiplines, error); if (fseq != NULL) fclose(fseq); return format; } short seqFileFormatFp( FILE *fseq, long *skiplines, /* return #lines to skip any junk like mail header */ short *error) /* return any error value or 0 */ { boolean foundDNA = false, foundIG = false, foundStrider = false, foundGB = false, foundPIR = false, foundEMBL = false, foundNBRF = false, foundPearson = false, foundFitch = false, foundPhylip = false, foundZuker = false, gotolsen = false, gotpaup = false, gotasn1 = false, gotuw = false, gotMSF = false, isfitch = false, isphylip = false, done = false; short format = kUnknown; int nlines = 0, k, splen = 0, otherlines = 0, aminolines = 0, dnalines = 0; char sp[256]; long linestart = 0; int maxlines2check = 500; #define ReadOneLine(sp) \ { \ done |= (feof(fseq)); \ readline(fseq, sp, &linestart); \ if (!done) { \ splen = strlen(sp); \ ++nlines; \ } \ } *skiplines = 0; *error = 0; if (fseq == NULL) { *error = eFileNotFound; return kNoformat; } while (!done) { ReadOneLine(sp); /* check for mailer head & skip past if found */ if (nlines < 4 && !done) { if ((strstr(sp, "From ") == sp) || (strstr(sp, "Received:") == sp)) { do { /* skip all lines until find one blank * line */ ReadOneLine(sp); if (!done) for (k = 0; (k < splen) && (sp[k] == ' '); k++) ; } while ((!done) && (k < splen)); *skiplines = nlines; /* !? do we want #lines or #bytes ?? */ } } if (sp == NULL || *sp == 0) ; /* nada */ /* high probability identities: */ else if (strstr(sp, "MSF:") && strstr(sp, "Type:") && strstr(sp, "Check:")) gotMSF = true; else if ((strstr(sp, "..") != NULL) && (strstr(sp, "Check:") != NULL)) gotuw = true; else if (strstr(sp, "identity: Data:") != NULL) gotolsen = true; else if (strstr(sp, "::=") && (strstr(sp, "Bioseq") || /* Bioseq or Bioseq-set */ strstr(sp, "Seq-entry") || strstr( sp, "Seq-submit"))) /* can we read submit format? */ gotasn1 = true; else if (strstr(sp, "#NEXUS") == sp) gotpaup = true; /* uncertain identities: */ else if (*sp == ';') { if (strstr(sp, "Strider") != NULL) foundStrider = true; else foundIG = true; } else if (strstr(sp, "LOCUS") == sp) foundGB = true; else if (strstr(sp, "ORIGIN") == sp) foundGB = true; else if (strstr(sp, "ENTRY ") == sp) /* ? also (strcmp(sp,"\\\\\\")==0) */ foundPIR = true; else if (strstr(sp, "SEQUENCE") == sp) foundPIR = true; else if (*sp == '>') { if (sp[3] == ';') foundNBRF = true; else foundPearson = true; } else if (strstr(sp, "ID ") == sp) foundEMBL = true; else if (strstr(sp, "SQ ") == sp) foundEMBL = true; else if (*sp == '(') foundZuker = true; else { if (nlines - *skiplines == 1) { int ispp = 0, ilen = 0; sscanf(sp, "%d%d", &ispp, &ilen); if (ispp > 0 && ilen > 0) isphylip = true; } else if (isphylip && nlines - *skiplines == 2) { int tseq; tseq = getseqtype(sp + 10, strlen(sp + 10)); if (isalpha(*sp) /* 1st letter in 2nd line must be of a name */ && (tseq != kOtherSeq)) /* sequence section must be okay */ foundPhylip = true; } for (k = 0, isfitch = true; isfitch & (k < splen); k++) { if (k % 4 == 0) isfitch &= (sp[k] == ' '); else isfitch &= (sp[k] != ' '); } if (isfitch & (splen > 20)) foundFitch = true; /* kRNA && kDNA are fairly certain...*/ switch (getseqtype(sp, splen)) { case kOtherSeq: otherlines++; break; case kAmino: if (splen > 20) aminolines++; break; case kDNA: case kRNA: if (splen > 20) dnalines++; break; case kNucleic: break; /* not much info ? */ } } /* pretty certain */ if (gotolsen) { format = kOlsen; done = true; } else if (gotMSF) { format = kMSF; done = true; } else if (gotasn1) { /* !! we need to look further and return kASNseqentry | * kASNseqset */ /* seqentry key is Seq-entry ::= seqset key is Bioseq-set ::= ?? can't read these yet w/ ncbi tools ?? Seq-submit ::= Bioseq ::= << fails both bioseq-seq and seq-entry parsers ! */ if (strstr(sp, "Bioseq-set")) format = kASNseqset; else if (strstr(sp, "Seq-entry")) format = kASNseqentry; else format = kASN1; /* other form, we can't yet read... */ done = true; } else if (gotpaup) { format = kPAUP; done = true; } else if (gotuw) { if (foundIG) format = kIG; /* a TOIG file from GCG for certain */ else format = kGCG; done = true; } else if ((dnalines > 1) || done || (nlines > maxlines2check)) { /* decide on most likely format */ /* multichar idents: */ if (foundStrider) format = kStrider; else if (foundGB) format = kGenBank; else if (foundPIR) format = kPIR; else if (foundEMBL) format = kEMBL; else if (foundNBRF) format = kNBRF; /* single char idents: */ else if (foundIG) format = kIG; else if (foundPearson) format = kPearson; else if (foundZuker) format = kZuker; /* digit ident: */ else if (foundPhylip) format = kPhylip; /* spacing ident: */ else if (foundFitch) format = kFitch; /* no format chars: */ else if (otherlines > 0) format = kUnknown; else if (dnalines > 1) format = kPlain; else if (aminolines > 1) format = kPlain; else format = kUnknown; done = true; } /* need this for possible long header in olsen format */ else if (strstr(sp, "): ") != NULL) maxlines2check++; } if (format == kPhylip) { /* check for interleaved or sequential -- really messy */ int tname, tseq; long i, j, nspp = 0, nlen = 0, ilen, leaf = 0, seq = 0; char *ps; rewind(fseq); for (i = 0; i < *skiplines; i++) ReadOneLine(sp); nlines = 0; ReadOneLine(sp); sscanf(sp, "%d%d", &nspp, &nlen); ReadOneLine(sp); /* 1st seq line */ for (ps = sp + 10, ilen = 0; *ps != 0; ps++) if (isprint(*ps)) ilen++; for (i = 1; i < nspp; i++) { ReadOneLine(sp); tseq = getseqtype(sp + 10, strlen(sp + 10)); tname = getseqtype(sp, 10); for (j = 0, ps = sp; isspace(*ps) && j < 10; ps++, j++) ; for (ps = sp; *ps != 0; ps++) if (isprint(*ps)) ilen++; /* find probable interleaf or sequential ... */ if (j >= 9) seq += 10; /* pretty certain not ileaf */ else { if (tseq != tname) leaf++; else seq++; if (tname == kDNA || tname == kRNA) seq++; else leaf++; } if (ilen <= nlen && j < 9) { if (tname == kOtherSeq) leaf += 10; else if (tname == kAmino || tname == kDNA || tname == kRNA) seq++; else leaf++; } else if (ilen > nlen) { ilen = 0; } } for (nspp *= 2; i < nspp; i++) { /* this should be only bases if interleaf */ ReadOneLine(sp); tseq = getseqtype(sp + 10, strlen(sp + 10)); tname = getseqtype(sp, 10); for (ps = sp; *ps != 0; ps++) if (isprint(*ps)) ilen++; for (j = 0, ps = sp; isspace(*ps) && j < 10; ps++, j++) ; if (j < 9) { if (tname == kOtherSeq) seq += 10; if (tseq != tname) seq++; else leaf++; if (tname == kDNA || tname == kRNA) leaf++; else seq++; } if (ilen > nlen) { if (j > 9) leaf += 10; /* must be a name here for sequent */ else if (tname == kOtherSeq) seq += 10; ilen = 0; } } if (leaf > seq) format = kPhylip4; else format = kPhylip2; } return (format); #undef ReadOneLine } /* SeqFileFormat */ unsigned long GCGchecksum(const char *seq, const long seqlen, unsigned long *checktotal) /* GCGchecksum */ { register long i, check = 0, count = 0; for (i = 0; i < seqlen; i++) { count++; check += count * to_upper(seq[i]); if (count == 57) count = 0; } check %= 10000; *checktotal += check; *checktotal %= 10000; return check; } /* Table of CRC-32's of all single byte values (made by makecrc.c of ZIP source) */ const unsigned long crctab[] = { 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, 0x2d02ef8dL}; unsigned long CRC32checksum(const char *seq, const long seqlen, unsigned long *checktotal) /*CRC32checksum: modified from CRC-32 algorithm found in ZIP compression source */ { register unsigned long c = 0xffffffffL; register long n = seqlen; while (n--) { c = crctab[((int)c ^ (to_upper(*seq))) & 0xff] ^ (c >> 8); seq++; /* fixed aug'98 finally */ } c = c ^ 0xffffffffL; *checktotal += c; return c; } short getseqtype(const char *seq, const long seqlen) { /* return sequence kind: kDNA, kRNA, kProtein, kOtherSeq, ??? */ char c; short i, maxtest; short na = 0, aa = 0, po = 0, nt = 0, nu = 0, ns = 0, no = 0; maxtest = min(300, seqlen); for (i = 0; i < maxtest; i++) { c = to_upper(seq[i]); if (strchr(protonly, c)) po++; else if (strchr(primenuc, c)) { na++; if (c == 'T') nt++; else if (c == 'U') nu++; } else if (strchr(aminos, c)) aa++; else if (strchr(seqsymbols, c)) ns++; else if (isalpha(c)) no++; } if ((no > 0) || (po + aa + na == 0)) return kOtherSeq; /* ?? test for probability of kOtherSeq ?, e.g., else if (po+aa+na / maxtest < 0.70) return kOtherSeq; */ else if (po > 0) return kAmino; else if (aa == 0) { if (nu > nt) return kRNA; else return kDNA; } else if (na > aa) return kNucleic; else return kAmino; } /* getseqtype */ char *compressSeq(const char gapc, const char *seq, const long seqlen, long *newlen) { register char *a, *b; register long i; char *newseq; *newlen = 0; if (!seq) return NULL; newseq = (char *)malloc(seqlen + 1); if (!newseq) return NULL; for (a = (char *)seq, b = newseq, i = 0; *a != 0; a++) if (*a != gapc) { *b++ = *a; i++; } *b = '\0'; newseq = (char *)realloc(newseq, i + 1); *newlen = i; return newseq; } /*** char *rtfhead = "{\\rtf1\\defformat\\mac\\deff2 \ {\\fonttbl\ {\\f1\\fmodern Courier;}{\\f2\\fmodern Monaco;}\ {\\f3\\fswiss Helvetica;}{\\f4\\fswiss Geneva;}\ {\\f5\\froman Times;}{\\f6\\froman Palatino;}\ {\\f7\\froman New Century Schlbk;}{\\f8\\ftech Symbol;}}\ {\\stylesheet\ {\\s1 \\f5\\fs20 \\sbasedon0\\snext1 name;}\ {\\s2 \\f3\\fs20 \\sbasedon0\\snext2 num;}\ {\\s3 \\f1\\f21 \\sbasedon0\\snext3 seq;}}"; char *rtftail = "}"; ****/ short writeSeq(FILE *outf, const char *seq, const long seqlen, const short outform, const char *seqid) /* dump sequence to standard output */ { const short kSpaceAll = -9; #define kMaxseqwidth 250 boolean baseonlynum = false; /* nocountsymbols -- only count true bases, not "-" */ short numline = 0; /* only true if we are writing seq number line (for interleave) */ boolean numright = false, numleft = false; boolean nameright = false, nameleft = false; short namewidth = 8, numwidth = 8; short spacer = 0, width = 50, tab = 0; /* new parameters: width, spacer, those above... */ short linesout = 0, seqtype = kNucleic; long i, j, l, l1, ibase; char idword[31], endstr[10]; char seqnamestore[128], *seqname = seqnamestore; char s[kMaxseqwidth], *cp; char nameform[10], numform[10], nocountsymbols[10]; unsigned long checksum = 0, checktotal = 0; gPretty.atseq++; skipwhitespace(seqid); l = min(128, strlen(seqid)); strncpy(seqnamestore, seqid, l); seqname[l] = 0; sscanf(seqname, "%30s", idword); sprintf(numform, "%d", seqlen); numwidth = strlen(numform) + 1; nameform[0] = '\0'; if (strstr(seqname, "checksum") != NULL) { cp = strstr(seqname, "bases"); if (cp != NULL) { for (; (cp != seqname) && (*cp != ','); cp--) ; if (cp != seqname) *cp = 0; } } strcpy(endstr, ""); l1 = 0; if (outform == kGCG || outform == kMSF) checksum = GCGchecksum(seq, seqlen, &checktotal); else checksum = seqchecksum(seq, seqlen, &checktotal); switch (outform) { case kPlain: case kUnknown: /* no header, just sequence */ strcpy(endstr, "\n"); /* end w/ extra blank line */ break; case kOlsen: /* Olsen seq. editor takes plain nucs OR Genbank */ case kGenBank: fprintf(outf, "LOCUS %s %d bp\n", idword, seqlen); fprintf(outf, "DEFINITION %s, %d bases, %X checksum.\n", seqname, seqlen, checksum); /* fprintf(outf,"ACCESSION %s\n", accnum); */ fprintf(outf, "ORIGIN \n"); spacer = 11; numleft = true; numwidth = 8; /* dgg. 1Feb93, patch for GDE fail to read short numwidth */ strcpy(endstr, "\n//"); linesout += 4; break; case kPIR: /* somewhat like genbank... \\\*/ /* fprintf(outf,"\\\\\\\n"); << only at top of file, not * each entry... */ fprintf(outf, "ENTRY %s \n", idword); fprintf(outf, "TITLE %s, %d bases, %X checksum.\n", seqname, seqlen, checksum); /* fprintf(outf,"ACCESSION %s\n", accnum); */ fprintf(outf, "SEQUENCE \n"); numwidth = 7; width = 30; spacer = kSpaceAll; numleft = true; strcpy(endstr, "\n///"); /* run a top number line for PIR */ for (j = 0; j < numwidth; j++) fputc(' ', outf); for (j = 5; j <= width; j += 5) fprintf(outf, "%10d", j); fputc('\n', outf); linesout += 5; break; case kNBRF: if (getseqtype(seq, seqlen) == kAmino) fprintf(outf, ">P1;%s\n", idword); else fprintf(outf, ">DL;%s\n", idword); fprintf(outf, "%s, %d bases, %X checksum.\n", seqname, seqlen, checksum); spacer = 11; strcpy(endstr, "*\n"); linesout += 3; break; case kEMBL: fprintf(outf, "ID %s\n", idword); /* fprintf(outf,"AC %s\n", accnum); */ fprintf(outf, "DE %s, %d bases, %X checksum.\n", seqname, seqlen, checksum); fprintf(outf, "SQ %d BP\n", seqlen); strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/ tab = 4; /** added 31jan91 */ spacer = 11; /** added 31jan91 */ width = 60; linesout += 4; break; case kGCG: fprintf(outf, "%s\n", seqname); /* fprintf(outf,"ACCESSION %s\n", accnum); */ fprintf(outf, " %s Length: %d (today) Check: %d ..\n", idword, seqlen, checksum); spacer = 11; numleft = true; strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */ linesout += 3; break; case kStrider: /* ?? map ?*/ fprintf(outf, "; ### from DNA Strider ;-)\n"); fprintf( outf, "; DNA sequence %s, %d bases, %X checksum.\n;\n", seqname, seqlen, checksum); strcpy(endstr, "\n//"); linesout += 3; break; case kFitch: fprintf(outf, "%s, %d bases, %X checksum.\n", seqname, seqlen, checksum); spacer = 4; width = 60; linesout += 1; break; case kPhylip2: case kPhylip4: /* this is version 3.2/3.4 -- simplest way to write version 3.3 is to write as version 3.2, then re-read file and interleave the species lines */ if (strlen(idword) > 10) idword[10] = 0; fprintf(outf, "%-10s ", idword); l1 = -1; tab = 12; spacer = 11; break; case kASN1: seqtype = getseqtype(seq, seqlen); switch (seqtype) { case kDNA: cp = "dna"; break; case kRNA: cp = "rna"; break; case kNucleic: cp = "na"; break; case kAmino: cp = "aa"; break; case kOtherSeq: cp = "not-set"; break; } fprintf(outf, " seq {\n"); fprintf(outf, " id { local id %d },\n", gPretty.atseq); fprintf(outf, " descr { title \"%s\" },\n", seqid); fprintf(outf, " inst {\n"); fprintf(outf, " repr raw, mol %s, length %d, topology " "linear,\n", cp, seqlen); fprintf(outf, " seq-data\n"); if (seqtype == kAmino) fprintf(outf, " iupacaa \""); else fprintf(outf, " iupacna \""); l1 = 17; spacer = 0; width = 78; tab = 0; strcpy(endstr, "\"\n } } ,"); linesout += 7; break; case kPAUP: nameleft = true; namewidth = 9; spacer = 21; width = 100; tab = 0; /* 1; */ /* strcpy(endstr,";\nend;"); << this is end of all * seqs.. */ /* do a header comment line for paup */ fprintf(outf, "[Name: %-16s Len:%6d Check: %8X]\n", idword, seqlen, checksum); linesout += 1; break; case kPretty: numline = gPretty.numline; baseonlynum = gPretty.baseonlynum; namewidth = gPretty.namewidth; numright = gPretty.numright; numleft = gPretty.numleft; nameright = gPretty.nameright; nameleft = gPretty.nameleft; spacer = gPretty.spacer + 1; width = gPretty.seqwidth; tab = gPretty.tab; /* also add rtf formatting w/ font, size, style */ if (gPretty.nametop) { fprintf(outf, "Name: %-16s Len:%6d Check: %8X\n", idword, seqlen, checksum); linesout++; } break; case kMSF: fprintf( outf, " Name: %-16s Len:%6d Check: %5d Weight: 1.00\n", idword, seqlen, checksum); linesout++; nameleft = true; namewidth = 15; /* need MAX namewidth here... */ sprintf(nameform, "%%+%ds ", namewidth); spacer = 11; width = 50; tab = 0; /* 1; */ break; case kIG: fprintf(outf, ";%s, %d bases, %X checksum.\n", seqname, seqlen, checksum); fprintf(outf, "%s\n", idword); strcpy(endstr, "1"); /* == linear dna */ linesout += 2; break; default: case kZuker: /* don't attempt Zuker's ftn format */ case kPearson: fprintf(outf, ">%s, %d bases, %X checksum.\n", seqname, seqlen, checksum); linesout += 1; break; } if (*nameform == 0) sprintf(nameform, "%%%d.%ds ", namewidth, namewidth); if (numline) sprintf(numform, "%%%ds ", numwidth); else sprintf(numform, "%%%dd ", numwidth); strcpy(nocountsymbols, kNocountsymbols); if (baseonlynum) { if (strchr(nocountsymbols, gPretty.gapchar) == NULL) { strcat(nocountsymbols, " "); nocountsymbols[strlen(nocountsymbols) - 1] = gPretty.gapchar; } if (gPretty.domatch && (cp = strchr(nocountsymbols, gPretty.matchchar)) != NULL) { *cp = ' '; } } if (numline) { *idword = 0; } width = min(width, kMaxseqwidth); for (i = 0, l = 0, ibase = 1; i < seqlen;) { if (l1 < 0) l1 = 0; else if (l1 == 0) { if (nameleft) fprintf(outf, nameform, idword); if (numleft) { if (numline) fprintf(outf, numform, ""); else fprintf(outf, numform, ibase); } for (j = 0; j < tab; j++) fputc(' ', outf); } l1++; /* don't count spaces for width*/ if (numline) { if (spacer == kSpaceAll || (spacer != 0 && (l + 1) % spacer == 1)) { if (numline == 1) fputc(' ', outf); s[l++] = ' '; } if (l1 % 10 == 1 || l1 == width) { if (numline == 1) fprintf(outf, "%-9d ", i + 1); s[l++] = '|'; /* == put a number here */ } else s[l++] = ' '; i++; } else { if (spacer == kSpaceAll || (spacer != 0 && (l + 1) % spacer == 1)) s[l++] = ' '; if (!baseonlynum) ibase++; else if (0 == strchr(nocountsymbols, seq[i])) ibase++; s[l++] = seq[i++]; } if (l1 == width || i == seqlen) { if (outform == kPretty) for (; l1 < width; l1++) { if (spacer == kSpaceAll || (spacer != 0 && (l + 1) % spacer == 1)) s[l++] = ' '; s[l++] = ' '; /* pad w/ blanks */ } s[l] = '\0'; l = 0; l1 = 0; if (numline) { if (numline == 2) fprintf( outf, "%s", s); /* finish numberline ! and | */ } else { if (i == seqlen) fprintf(outf, "%s%s", s, endstr); else fprintf(outf, "%s", s); if (numright || nameright) fputc(' ', outf); if (numright) fprintf(outf, numform, ibase - 1); if (nameright) fprintf(outf, nameform, idword); } fputc('\n', outf); linesout++; } } return linesout; } /*writeSeq*/ /* End file: ureadseq.c */