/* File: readseq.c * main() program for ureadseq.c, ureadseq.h * * Reads and writes nucleic/protein sequence in various * formats. Data files may have multiple sequences. * * Copyright 1990 by d.g.gilbert * biology dept., indiana university, bloomington, in 47405 * e-mail: gilbertd@bio.indiana.edu * * This program may be freely copied and used by anyone. * Developers are encourged to incorporate parts in their * programs, rather than devise their own private sequence * format. * * This should compile and run with any ANSI C compiler. * Please advise me of any bugs, additions or corrections. * */ const char *title = "readSeq (1Feb93), multi-format molbio sequence reader.\n"; /* History 27 Feb 90. 1st release to public. 4 Mar 90. + Gary Olsen format + case change * minor corrections to NBRF,EMBL,others * output 1 file per sequence for gcg, unknown * define -DNOSTR for c-libraries w/o strstr - readseq.p, pascal version, becomes out-of-date 24 May 90. + Phylip 3.2 output format (no input) 20 Jul 90. + Phylip 3.3 output (no input yet) + interactive output re-direction + verbose progress info * interactive help output * dropped line no.s on NBRF output * patched in HyperGCG XCMD corrections, - except for seq. documentation handling * dropped the IG special nuc codes, as IG has adopted the standard IUB codes (now if only everyone would adopt a standard format !) 11 Oct 90. * corrected bug in reading/writing of EMBL format 17 Oct 91. * corrected bug in reading Olsen format (serious-deletion) 10 Nov 91. * corrected bug in reading some GCG format files (serious-last line duplicated) + add format name parsing (-fgb, -ffasta, ...) + Phylip v3.4 output format (== v3.2, sequential) + add checksum output to all forms that have document + skip mail headers in seq file + add pipe for standard input == seq file (with -p) * fold in parts of MacApp Seq object * strengthen format detection * clarify program structure * remove fixed sequence size limit (now dynamic, sizeof memory) * check and fold in accumulated bug reports: * Now ANSI-C fopen(..,"w") & check open failure * Define -DFIXTOUPPER for nonANSI C libraries that mess up toupper/tolower = No command-line changes; callers of readseq main() should be okay - ureadseq.h functions have changed; client programs need to note. + added Unix and VMS Make scripts, including validation tests 4 May 92. + added 32 bit CRC checksum as alternative to GCG 6.5bit checksum (-DBIGCHECKSUM) Aug 92 = fixed Olsen format input to handle files w/ more sequences, not to mess up when more than one seq has same identifier, and to convert number masks to symbols. = IG format fix to understand ^L 25-30 Dec 92 * revised command-line & interactive interface. Suggested form is now readseq infile -format=genbank -output=outfile -item=1,3,4 ... but remains compatible with prior commandlines: readseq infile -f2 -ooutfile -i3 ... + added GCG MSF multi sequence file format + added PIR/CODATA format + added NCBI ASN.1 sequence file format + added Pretty, multi sequence pretty output (only) + added PAUP multi seq format + added degap option + added Gary Williams (GWW, G.Williams@CRC.AC.UK) reverse-complement option. + added support for reading Phylip formats (interleave & sequential) * string fixes, dropped need for compiler flags NOSTR, FIXTOUPPER, NEEDSTRCASECMP * changed 32bit checksum to default, -DSMALLCHECKSUM for GCG version 1Feb93 = revert GenBank output to a fixed left number width which other software depends on. = fix for MSF input to handle symbols in names = fix bug for possible memory overrun when truncating seqs for Phylip or Paup formats (thanks Anthony Persechini) */ /* Readseq has been tested with: Macintosh MPW C GNU gcc SGI cc VAX-VMS cc Any ANSI C compiler should be able to handle this. Old-style C compilers barf all over the source. How do I build the readseq program if I have an Ansi C compiler? #-------------------- # Unix ANSI C # Use the supplied Makefile this way: % make CC=name-of-c-compiler # OR do this... % gcc readseq.c ureadseq.c -o readseq #-------------------- $!VAX-VMS cc $! Use the supplied Make.Com this way: $ @make $! OR, do this: $ cc readseq, ureadseq $ link readseq, ureadseq, sys$library:vaxcrtl/lib $ readseq :== $ MyDisk:[myacct]readseq #-------------------- # Macintosh Simple Input/Output Window application # requires MPW-C and SIOW library (from APDA) # also uses files macinit.c, macinit.r, readseqSIOW.make # Buildprogram readseqSIOW #-------------------- #MPW-C v3 tool C ureadseq.c C readseq.c link -w -o readseq -t MPST -c 'MPS ' � readseq.c.o Ureadseq.c.o � "{Libraries}"Interface.o � "{Libraries}"ToolLibs.o � "{Libraries}"Runtime.o � "{CLibraries}"StdClib.o readseq -i1 ig.seq # MPW-C with NCBI tools set NCBI "{Boot}@molbio:ncbi:"; EXPORT NCBI set NCBILIB1 "{NCBI}"lib:libncbi.o; export NCBILIB1 set NCBILIB2 "{NCBI}"lib:libncbiobj.o; export NCBILIB2 set NCBILIB3 "{NCBI}"lib:libncbicdr.o; export NCBILIB3 set NCBILIB4 "{NCBI}"lib:libvibrant.o; export NCBILIB4 C ureadseq.c C -d NCBI -i "{NCBI}"include: ureadasn.c C -d NCBI -i "{NCBI}"include: readseq.c link -w -o readseq -t MPST -c 'MPS ' � ureadseq.c.o ureadasn.c.o readseq.c.o � {NCBILIB4} {NCBILIB2} {NCBILIB1} � "{Libraries}"Interface.o � "{Libraries}"ToolLibs.o � "{Libraries}"Runtime.o � "{CLibraries}"CSANELib.o � "{CLibraries}"Math.o � "{CLibraries}"StdClib.o ===========================================================*/ #include #include #include #include #include "ureadseq.h" #pragma segment readseq static char inputfilestore[256], *inputfile = inputfilestore; const char *formats[kMaxFormat + 1] = {" 1. IG/Stanford", " 2. GenBank/GB", " 3. NBRF", " 4. EMBL", " 5. GCG", " 6. DNAStrider", " 7. Fitch", " 8. Pearson/Fasta", " 9. Zuker (in-only)", "10. Olsen (in-only)", "11. Phylip3.2", "12. Phylip", "13. Plain/Raw", "14. PIR/CODATA", "15. MSF", "16. ASN.1", "17. PAUP/NEXUS", "18. Pretty (out-only)", ""}; #define kFormCount 30 #define kMaxFormName 15 const struct formatTable { char *name; short num; } formname[] = { {"ig", kIG}, {"stanford", kIG}, {"genbank", kGenBank}, {"gb", kGenBank}, {"nbrf", kNBRF}, {"embl", kEMBL}, {"gcg", kGCG}, {"uwgcg", kGCG}, {"dnastrider", kStrider}, {"strider", kStrider}, {"fitch", kFitch}, {"pearson", kPearson}, {"fasta", kPearson}, {"zuker", kZuker}, {"olsen", kOlsen}, {"phylip", kPhylip}, {"phylip3.2", kPhylip2}, {"phylip3.3", kPhylip3}, {"phylip3.4", kPhylip4}, {"phylip-interleaved", kPhylip4}, {"phylip-sequential", kPhylip2}, {"plain", kPlain}, {"raw", kPlain}, {"pir", kPIR}, {"codata", kPIR}, {"asn.1", kASN1}, {"msf", kMSF}, {"paup", kPAUP}, {"nexus", kPAUP}, {"pretty", kPretty}, }; const char *kASN1headline = "Bioseq-set ::= {\nseq-set {\n"; /* GWW table for getting the complement of a nucleotide (IUB codes) */ /* ! * "#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[ * \]^_`abcdefghijklmnopqrstuvwxyz{|}~ */ const char compl [] = " !\"#$%&'()*+,-./" "0123456789:;<=>?@TVGHNNCDNNMNKNNYRYSAABWNRN[\\]^_`" "tvghnncdnnmnknnyrysaabwnrn{|}~"; char *formatstr(short format) { if (format < 1 || format > kMaxFormat) { switch (format) { case kASNseqentry: case kASNseqset: return formats[kASN1 - 1]; case kPhylipInterleave: case kPhylipSequential: return formats[kPhylip - 1]; default: return "(unknown)"; } } else return formats[format - 1]; } int parseformat(char *name) { #define kDupmatch -2 int namelen, maxlen, i, match, matchat; char lname[kMaxFormName + 1]; skipwhitespace(name); namelen = strlen(name); if (namelen == 0) return kNoformat; else if (isdigit(*name)) { i = atol(name); if (i kMaxFormat) return kNoformat; else return i; } /* else match character name */ maxlen = min(kMaxFormName, namelen); for (i = 0; i < maxlen; i++) lname[i] = to_lower(name[i]); lname[maxlen] = 0; matchat = kNoformat; for (i = 0; i < kFormCount; i++) { match = strncmp(lname, formname[i].name, maxlen); if (match == 0) { if (strlen(formname[i].name) == namelen) return (formname[i].num); else if (matchat == kNoformat) matchat = i; else matchat = kDupmatch; /* 2 or more partial matches */ } } if (matchat == kNoformat || matchat == kDupmatch) return kNoformat; else return formname[matchat].num; } static void dumpSeqList(char *list, short format) { long i, l, listlen; char s[256]; listlen = strlen(list); printf("Sequences in %s (format is %s)\n", inputfile, formatstr(format)); for (i = 0, l = 0; i < listlen; i++) { if (list[i] == (char)NEWLINE) { s[l] = '\0'; l = 0; puts(s); } else if (l < 255) s[l++] = list[i]; } putchar('\n'); } void usage() { short i, midi; fprintf(stderr, title); fprintf(stderr, "usage: readseq [-options] in.seq > out.seq\n"); fprintf(stderr, " options\n"); /* ? add -d[igits] to allow digits in sequence data, &/or option to * specify seq charset !? */ fprintf(stderr, " -a[ll] select All sequences\n"); fprintf(stderr, " -c[aselower] change to lower case\n"); fprintf(stderr, " -C[ASEUPPER] change to UPPER CASE\n"); fprintf(stderr, " -degap[=-] remove gap symbols\n"); fprintf(stderr, " -i[tem=2,3,4] select Item number(s) from several\n"); fprintf(stderr, " -l[ist] List sequences only\n"); fprintf(stderr, " -o[utput=]out.seq redirect Output\n"); fprintf(stderr, " -p[ipe] Pipe (command line, stdout)\n"); fprintf(stderr, " -r[everse] change to Reverse-complement\n"); fprintf(stderr, " -v[erbose] Verbose progress\n"); fprintf(stderr, " -f[ormat=]# Format number for output, or\n"); fprintf(stderr, " -f[ormat=]Name Format name for output:\n"); midi = (kMaxFormat + 1) / 2; for (i = kMinFormat - 1; i < midi; i++) fprintf(stderr, " %-20s %-20s\n", formats[i], formats[midi + i]); /* new output format options, esp. for pretty format: */ fprintf(stderr, " \n"); fprintf(stderr, " Pretty format options: \n"); fprintf(stderr, " -wid[th]=# sequence line width\n"); fprintf(stderr, " -tab=# left indent\n"); fprintf(stderr, " -col[space]=# column space within sequence line " "on output\n"); fprintf( stderr, " -gap[count] count gap chars in sequence numbers\n"); fprintf(stderr, " -nameleft, -nameright[=#] name on left/right side [=max " "width]\n"); fprintf(stderr, " -nametop name at top/bottom\n"); fprintf(stderr, " -numleft, -numright seq index on left/right side\n"); fprintf(stderr, " -numtop, -numbot index on top/bottom\n"); fprintf(stderr, " -match[=.] use match base for 2..n species\n"); fprintf(stderr, " -inter[line=#] blank line(s) between sequence " "blocks\n"); /****** not ready yet fprintf(stderr, " -code=none,rtf,postscript,ps code syntax\n"); fprintf(stderr, " -namefont=, -numfont=, -seqfont=font font choice\n"); fprintf(stderr, " font suggestions include times,courier,helvetica\n"); fprintf(stderr, " -namefontsize=, -numfontsize=, -seqfontsize=#\n"); fprintf(stderr, " fontsize suggestions include 9,10,12,14\n"); fprintf(stderr, " -namefontstyle=, -numfontstyle=, -seqfontstyle= style fontstyle for names\n"); fprintf(stderr, " fontstyle options are plain,italic,bold,bold-italic\n"); ******/ } void erralert(short err) { switch (err) { case 0: break; case eFileNotFound: fprintf(stderr, "File not found: %s\n", inputfile); break; case eFileCreate: fprintf(stderr, "Can't open output file.\n"); break; case eASNerr: fprintf(stderr, "Error in ASN.1 sequence routines.\n"); break; case eNoData: fprintf(stderr, "No data in file.\n"); break; case eItemNotFound: fprintf(stderr, "Specified item not in file.\n"); break; case eUnequalSize: fprintf(stderr, "This format requires equal length " "sequences.\nSequence truncated or padded to " "fit.\n"); break; case eUnknownFormat: fprintf(stderr, "Error: this format is unknown to me.\n"); break; case eOneFormat: fprintf(stderr, "Warning: This format permits only 1 sequence " "per file.\n"); break; case eMemFull: fprintf(stderr, "Out of storage memory. Sequence truncated.\n"); break; default: fprintf(stderr, "readSeq error = %d\n", err); break; } } /* erralert */ int chooseFormat(boolean quietly) { char sform[128]; int midi, i, outform; if (quietly) return kPearson; /* default */ else { midi = (kMaxFormat + 1) / 2; for (i = kMinFormat - 1; i < midi; i++) fprintf(stderr, " %-20s %-20s\n", formats[i], formats[midi + i]); fprintf(stderr, "\nChoose an output format (name or #): \n"); gets(sform); outform = parseformat(sform); if (outform == kNoformat) outform = kPearson; return outform; } } /* read paramater(s) */ boolean checkopt(boolean casesense, char *sopt, const char *smatch, short minword) { long lenopt, lenmatch; boolean result; short minmaxw; lenopt = strlen(sopt); lenmatch = strlen(smatch); minmaxw = max(minword, min(lenopt, lenmatch)); if (casesense) result = (!strncmp(sopt, smatch, minmaxw)); else result = (!Strncasecmp(sopt, smatch, minmaxw)); /* if (result) { */ /* fprintf(stderr,"true checkopt(opt=%s,match=%s,param=%s)\n", sopt, * smatch, *sparam); */ /* } */ return result; } #define kMaxwhichlist 50 /* global for readopt(), main() */ boolean chooseall = false, quietly = false, gotinputfile = false, listonly = false, closeout = false, verbose = false, manyout = false, dolower = false, doupper = false, doreverse = false, askout = true, dopipe = false, interleaved = false; short nfile = 0, iwhichlist = 0, nwhichlist = 0; short whichlist[kMaxwhichlist + 1]; long whichSeq = 0, outform = kNoformat; char onamestore[128], *oname = onamestore; FILE *foo = NULL; void resetGlobals() /* need this when used from SIOW, as these globals are not reinited automatically between calls to local main() */ { chooseall = false; quietly = false; gotinputfile = false; listonly = false; closeout = false; verbose = false; manyout = false; dolower = false; doupper = false; doreverse = false; askout = true; dopipe = false; interleaved = false; nfile = 0; iwhichlist = 0; nwhichlist = 0; whichSeq = 0; outform = kNoformat; oname = onamestore; foo = NULL; gPrettyInit(gPretty); } #define kOptOkay 1 #define kOptNone 0 int readopt(char *sopt) { char sparamstore[256], *sparam = sparamstore; short n, slen = strlen(sopt); /* fprintf(stderr,"readopt( %s) == ", sopt); */ if (*sopt == '?') { usage(); return kOptNone; /*? eOptionBad or kOptNone */ } else if (*sopt == '-') { char *cp = strchr(sopt, '='); *sparam = '\0'; if (cp) { strcpy(sparam, cp + 1); *cp = 0; } if (checkopt(false, sopt, "-help", 2)) { usage(); return kOptNone; } if (checkopt(false, sopt, "-all", 2)) { whichSeq = 1; chooseall = true; return kOptOkay; } if (checkopt(false, sopt, "-colspace", 4)) { /* test before -c[ase] */ n = atoi(sparam); gPretty.spacer = n; return kOptOkay; } if (checkopt(true, sopt, "-caselower", 2)) { dolower = true; return kOptOkay; } if (checkopt(true, sopt, "-CASEUPPER", 2)) { doupper = true; return kOptOkay; } if (checkopt(false, sopt, "-pipe", 2)) { dopipe = true; askout = false; return kOptOkay; } if (checkopt(false, sopt, "-list", 2)) { listonly = true; askout = false; return kOptOkay; } if (checkopt(false, sopt, "-reverse", 2)) { doreverse = true; return kOptOkay; } if (checkopt(false, sopt, "-verbose", 2)) { verbose = true; return kOptOkay; } if (checkopt(false, sopt, "-match", 5)) { gPretty.domatch = true; if (*sparam >= ' ') gPretty.matchchar = *sparam; return kOptOkay; } if (checkopt(false, sopt, "-degap", 4)) { gPretty.degap = true; if (*sparam >= ' ') gPretty.gapchar = *sparam; return kOptOkay; } if (checkopt(false, sopt, "-interline", 4)) { gPretty.interline = atoi(sparam); return kOptOkay; } if (checkopt(false, sopt, "-item", 2)) { char *cp = sparam; nwhichlist = 0; whichlist[0] = 0; if (*cp == 0) cp = sopt + 2; /* compatible w/ old way */ do { while (*cp != 0 && !isdigit(*cp)) cp++; if (*cp != 0) { n = atoi(cp); whichlist[nwhichlist++] = n; while (*cp != 0 && isdigit(*cp)) cp++; } } while (*cp != 0 && n > 0 && nwhichlist < kMaxwhichlist); whichlist[nwhichlist++] = 0; /* 0 == stopsign for loop */ whichSeq = max(1, whichlist[0]); iwhichlist = 1; return kOptOkay; } if (checkopt(false, sopt, "-format", 5)) { /* -format=phylip, -f2, -form=phylip */ if (*sparam == 0) { for (sparam = sopt + 2; isalpha(*sparam); sparam++) ; } outform = parseformat(sparam); return kOptOkay; } if (checkopt(false, sopt, "-f", 2)) { /* compatible w/ -fphylip prior version */ if (*sparam == 0) sparam = sopt + 2; outform = parseformat(sparam); return kOptOkay; } if (checkopt(false, sopt, "-output", 3)) { /* -output=myseq */ if (*sparam == 0) { for (sparam = sopt + 3; isalpha(*sparam); sparam++) ; } strcpy(oname, sparam); foo = fopen(oname, "w"); if (!foo) { erralert(eFileCreate); return eFileCreate; } closeout = true; askout = false; return kOptOkay; } if (checkopt(false, sopt, "-o", 2)) { /* compatible w/ -omyseq prior version */ if (*sparam == 0) sparam = sopt + 2; strcpy(oname, sparam); foo = fopen(oname, "w"); if (!foo) { erralert(eFileCreate); return eFileCreate; } closeout = true; askout = false; return kOptOkay; } if (checkopt(false, sopt, "-width", 2)) { if (*sparam == 0) { for (sparam = sopt + 2; !isdigit(*sparam) && *sparam != 0; sparam++) ; } n = atoi(sparam); if (n > 0) gPretty.seqwidth = n; return kOptOkay; } if (checkopt(false, sopt, "-tab", 4)) { if (*sparam == 0) { for (sparam = sopt + 2; !isdigit(*sparam) && *sparam != 0; sparam++) ; } n = atoi(sparam); gPretty.tab = n; return kOptOkay; } if (checkopt(false, sopt, "-gapcount", 4)) { gPretty.baseonlynum = false; /* if (*sparam >= ' ') gPretty.gapchar= *sparam; */ return kOptOkay; } if (checkopt(false, sopt, "-nointerleave", 8)) { gPretty.noleaves = true; return kOptOkay; } if (checkopt(false, sopt, "-nameleft", 7)) { if (*sparam == 0) { for (sparam = sopt + 2; !isdigit(*sparam) && *sparam != 0; sparam++) ; } n = atoi(sparam); if (n > 0 && n < 50) gPretty.namewidth = n; gPretty.nameleft = true; return kOptOkay; } if (checkopt(false, sopt, "-nameright", 7)) { if (*sparam == 0) { for (sparam = sopt + 2; !isdigit(*sparam) && *sparam != 0; sparam++) ; } n = atoi(sparam); if (n > 0 && n < 50) gPretty.namewidth = n; gPretty.nameright = true; return kOptOkay; } if (checkopt(false, sopt, "-nametop", 6)) { gPretty.nametop = true; return kOptOkay; } if (checkopt(false, sopt, "-numleft", 6)) { if (*sparam == 0) { for (sparam = sopt + 2; !isdigit(*sparam) && *sparam != 0; sparam++) ; } n = atoi(sparam); if (n > 0 && n < 50) gPretty.numwidth = n; gPretty.numleft = true; return kOptOkay; } if (checkopt(false, sopt, "-numright", 6)) { if (*sparam == 0) { for (sparam = sopt + 2; !isdigit(*sparam) && *sparam != 0; sparam++) ; } n = atoi(sparam); if (n > 0 && n < 50) gPretty.numwidth = n; gPretty.numright = true; return kOptOkay; } if (checkopt(false, sopt, "-numtop", 6)) { gPretty.numtop = true; return kOptOkay; } if (checkopt(false, sopt, "-numbottom", 6)) { gPretty.numbot = true; return kOptOkay; } else { usage(); return eOptionBad; } } else { strcpy(inputfile, sopt); gotinputfile = (*inputfile != 0); nfile++; return kOptOkay; } /* return kOptNone; -- never here */ } /* this program suffers some as it tries to be a quiet translator pipe _and_ a noisy user interactor */ /* return is best for SIOW, okay for others */ #ifdef SIOW #define Exit(a) return (a) siow_main(int argc, char *argv[]) #else #define Exit(a) exit(a) main(int argc, char *argv[]) #endif { boolean closein = false; short ifile, nseq, atseq, format, err = 0, seqtype = kDNA, nlines, seqout = 0, phylvers = 2; long i, skiplines, seqlen, seqlen0; unsigned long checksum = 0, checkall = 0; char *seq, *cp, *firstseq = NULL, *seqlist, *progname, tempname[256]; char seqid[256], *seqidptr = seqid; char stempstore[256], *stemp = stempstore; FILE *ftmp, *fin, *fout; long outindexmax = 0, noutindex = 0, *outindex = NULL; #define exit_main(err) \ { \ if (closeout) fclose(fout); \ if (closein) fclose(fin); \ if (*tempname != 0) remove(tempname); \ Exit(err); \ } #define indexout() \ if (interleaved) { \ if (noutindex >= outindexmax) { \ outindexmax = noutindex + 20; \ outindex = (long *)realloc( \ outindex, sizeof(long) * outindexmax); \ if (outindex == NULL) { \ err = eMemFull; \ erralert(err); \ exit_main(err); \ } \ } \ outindex[noutindex++] = ftell(fout); \ } resetGlobals(); foo = stdout; progname = argv[0]; *oname = 0; *tempname = 0; /* initialize gPretty ?? -- done in header */ for (i = 1; i < argc; i++) { err = readopt(argv[i]); if (err <= 0) exit_main(err); } /* pipe input from stdin !? */ if (dopipe && !gotinputfile) { int c; tmpnam(tempname); inputfile = tempname; ftmp = fopen(inputfile, "w"); if (!ftmp) { erralert(eFileCreate); exit_main(eFileCreate); } while ((c = getc(stdin)) != EOF) fputc(c, ftmp); fclose(ftmp); gotinputfile = true; } quietly = (dopipe || (gotinputfile && (listonly || whichSeq != 0))); if (verbose || (!quietly && !gotinputfile)) fprintf(stderr, title); ifile = 1; /* UI: Choose output */ if (askout && !closeout && !quietly) { askout = false; fprintf( stderr, "\nName of output file (?=help, defaults to display): \n"); gets(oname = onamestore); skipwhitespace(oname); if (*oname == '?') { usage(); exit_main(0); } else if (*oname != 0) { closeout = true; foo = fopen(oname, "w"); if (!foo) { erralert(eFileCreate); exit_main(eFileCreate); } } } fout = foo; if (outform == kNoformat) outform = chooseFormat(quietly); /* set up formats ... */ switch (outform) { case kPhylip2: interleaved = false; phylvers = 2; outform = kPhylip; break; case kPhylip4: interleaved = true; phylvers = 4; outform = kPhylip; break; case kMSF: case kPAUP: interleaved = true; break; case kPretty: gPretty.isactive = true; interleaved = true; break; } if (gPretty.isactive && gPretty.noleaves) interleaved = false; if (interleaved) { fout = ftmp = tmpfile(); outindexmax = 30; noutindex = 0; outindex = (long *)malloc(outindexmax * sizeof(long)); if (outindex == NULL) { err = eMemFull; erralert(err); exit_main(err); } } /* big loop over all input files */ do { /* select next input file */ gotinputfile = (*tempname != 0); while ((ifile < argc) && (!gotinputfile)) { if (*argv[ifile] != '-') { strcpy(inputfile, argv[ifile]); gotinputfile = (*inputfile != 0); --nfile; } ifile++; } while (!gotinputfile) { fprintf(stderr, "\nName an input sequence or -option: \n"); inputfile = inputfilestore; gets(stemp = stempstore); if (*stemp == 0) goto fini; /* !! need this to finish work during interactive use */ stemp = strtok(stempstore, " \n\r\t"); while (stemp) { err = readopt(stemp); /* will read inputfile if it exists */ if (err < 0) exit_main(err); stemp = strtok(NULL, " \n\r\t"); } } /* thanks to AJB@UK.AC.DARESBURY.DLVH for this PHYLIP3 fix: */ /* head for end (interleave if needed) */ if (*inputfile == 0) break; format = seqFileFormat(inputfile, &skiplines, &err); if (err == 0) { #ifdef NCBI if (format == kASNseqentry || format == kASNseqset) seqlist = listASNSeqs(inputfile, skiplines, format, &nseq, &err); else #endif seqlist = listSeqs(inputfile, skiplines, format, &nseq, &err); } if (err != 0) erralert(err); else if (listonly) { dumpSeqList(seqlist, format); free(seqlist); } else { /* choose whichSeq if needed */ if (nseq == 1 || chooseall || (quietly && whichSeq == 0)) { chooseall = true; whichSeq = 1; quietly = true; /* no loop */ } else if (whichSeq > nseq && quietly) { erralert(eItemNotFound); err = eItemNotFound; } else if (whichSeq > nseq || !quietly) { dumpSeqList(seqlist, format); fprintf(stderr, "\nChoose a sequence (# or All): \n"); gets(stemp = stempstore); skipwhitespace(stemp); if (to_lower(*stemp) == 'a') { chooseall = true; whichSeq = 1; quietly = true; /* !? this means we don't ask for another file as well as no more whichSeqs... */ } else if (isdigit(*stemp)) whichSeq = atol(stemp); else whichSeq = 1; /* default */ } free(seqlist); if (false /*chooseall*/) { /* this isn't debugged yet...*/ fin = fopen(inputfile, "r"); closein = true; } while (whichSeq > 0 && whichSeq <= nseq) { /* need to open multiple output files ? */ manyout = ((chooseall || nwhichlist > 1) && nseq > 1 && (outform == kPlain || outform == kGCG)); if (manyout) { if (whichSeq == 1) erralert(eOneFormat); else if (closeout) { sprintf(stemp, "%s_%d", oname, whichSeq); freopen(stemp, "w", fout); fprintf(stderr, "Writing sequence %d " "to file %s\n", whichSeq, stemp); } } if (closein) { /* !! this fails... skips most seqs... */ /* !! in sequential read, must count * seqs already read from whichSeq ... */ /* need major revision of ureadseq * before we can do this */ atseq = whichSeq - 1; seqidptr = seqid; seq = readSeqFp( whichSeq, fin, skiplines, format, &seqlen, &atseq, &err, seqidptr); skiplines = 0; } else { atseq = 0; seqidptr = seqid; #ifdef NCBI if (format == kASNseqentry || format == kASNseqset) { seqidptr = NULL; seq = readASNSeq( whichSeq, inputfile, skiplines, format, &seqlen, &atseq, &err, &seqidptr); } else #endif seq = readSeq( whichSeq, inputfile, skiplines, format, &seqlen, &atseq, &err, seqidptr); } if (gPretty.degap) { char *newseq; long newlen; newseq = compressSeq(gPretty.gapchar, seq, seqlen, &newlen); if (newseq) { free(seq); seq = newseq; seqlen = newlen; } } if (outform == kMSF) checksum = GCGchecksum(seq, seqlen, &checkall); else if (verbose) checksum = seqchecksum(seq, seqlen, &checkall); if (verbose) fprintf(stderr, "Sequence %d, length= %d, " "checksum= %X, format= %s, id= " "%s\n", whichSeq, seqlen, checksum, formatstr(format), seqidptr); if (err != 0) erralert(err); else { /* format fixes that writeseq doesn't do */ switch (outform) { case kPIR: if (seqout == 0) fprintf( foo, "\\\\\\\n"); break; case kASN1: if (seqout == 0) fprintf( foo, kASN1headline); break; case kPhylip: if (seqout == 0) { if (!interleaved) { /* bug, nseq is for 1st infile only */ if (chooseall) i = nseq; else i = 1; if (phylvers >= 4) fprintf( foo, " %d %d\n", i, seqlen); else fprintf( foo, " %d %d YF\n", i, seqlen); } seqlen0 = seqlen; } else if (seqlen != seqlen0) { erralert( eUnequalSize); if (seqlen < seqlen0) seq = (char *) realloc( seq, seqlen0); for (i = seqlen; i < seqlen0; i++) seq[i] = gPretty .gapchar; seqlen = seqlen0; seq[seqlen] = 0; } break; case kPAUP: if (seqout == 0) { seqtype = getseqtype( seq, seqlen); seqlen0 = seqlen; } else if (seqlen != seqlen0) { erralert( eUnequalSize); if (seqlen < seqlen0) seq = (char *) realloc( seq, seqlen0); for (i = seqlen; i < seqlen0; i++) seq[i] = gPretty .gapchar; seqlen = seqlen0; seq[seqlen] = 0; } break; } if (doupper) for (i = 0; i < seqlen; i++) seq[i] = to_upper(seq[i]); else if (dolower) for (i = 0; i < seqlen; i++) seq[i] = to_lower(seq[i]); if (doreverse) { long j, k; char ctemp; for (j = 0, k = seqlen - 1; j <= k; j++, k--) { ctemp = compl [seq[j] - ' ']; seq[j] = compl [seq[k] - ' ']; seq[k] = ctemp; } } if ((gPretty.isactive || outform == kPAUP) && gPretty.domatch && firstseq != NULL) { for (i = 0; i < seqlen; i++) if (seq[i] == firstseq[i]) seq[i] = gPretty .matchchar; } if (gPretty.isactive && gPretty.numtop && seqout == 0) { gPretty.numline = 1; indexout(); (void)writeSeq(fout, seq, seqlen, outform, seqidptr); gPretty.numline = 2; indexout(); (void)writeSeq(fout, seq, seqlen, outform, seqidptr); gPretty.numline = 0; } indexout(); nlines = writeSeq(fout, seq, seqlen, outform, seqidptr); seqout++; } if ((gPretty.isactive || outform == kPAUP) && gPretty.domatch && firstseq == NULL) { firstseq = seq; seq = NULL; } else if (seq != NULL) { free(seq); seq = NULL; } #ifdef NCBI if ((format == kASNseqentry || format == kASNseqset) && seqidptr && seqidptr != seqid) free(seqidptr); #endif if (chooseall) whichSeq++; else if (iwhichlist < nwhichlist) whichSeq = whichlist[iwhichlist++]; else whichSeq = 0; } if (closein) { fclose(fin); closein = false; } } whichSeq = 0; } while (nfile > 0 || !quietly); fini: if (firstseq) { free(firstseq); firstseq = NULL; } if (err || listonly) exit_main(err); if (gPretty.isactive && gPretty.numbot) { gPretty.numline = 2; indexout(); (void)writeSeq(fout, seq, seqlen, outform, seqidptr); gPretty.numline = 1; indexout(); (void)writeSeq(fout, seq, seqlen, outform, seqidptr); gPretty.numline = 0; } if (outform == kMSF) { if (*oname) cp = oname; else cp = inputfile; fprintf(foo, "\n %s MSF: %d Type: N January 01, 1776 12:00 " "Check: %d ..\n\n", cp, seqlen, checkall); } if (outform == kPAUP) { fprintf(foo, "#NEXUS\n"); if (*oname) cp = oname; else cp = inputfile; fprintf(foo, "[%s -- data title]\n\n", cp); /* ! now have header lines for each sequence... put them before * "begin data;... */ } if (outform == kPhylip && interleaved) { if (phylvers >= 4) fprintf(foo, " %d %d\n", seqout, seqlen); else fprintf(foo, " %d %d YF\n", seqout, seqlen); } if (interleaved) { /* interleave species lines in true output */ /* nlines is # lines / sequence */ short iline, j, leaf, iseq; char *s = stempstore; indexout(); noutindex--; /* mark eof */ for (leaf = 0; leaf < nlines; leaf++) { if (outform == kMSF && leaf == 1) { fputs("//\n\n", foo); } if (outform == kPAUP && leaf == 1) { switch (seqtype) { case kDNA: cp = "dna"; break; case kRNA: cp = "rna"; break; case kNucleic: cp = "dna"; break; case kAmino: cp = "protein"; break; case kOtherSeq: cp = "dna"; break; } fprintf(foo, "\nbegin data;\n"); fprintf(foo, " dimensions ntax=%d nchar=%d;\n", seqout, seqlen); fprintf( foo, " format datatype=%s interleave missing=%c", cp, gPretty.gapchar); if (gPretty.domatch) fprintf(foo, " matchchar=%c", gPretty.matchchar); fprintf(foo, ";\n matrix\n"); } for (iseq = 0; iseq < noutindex; iseq++) { fseek(ftmp, outindex[iseq], 0); for (iline = 0; iline <= leaf; iline++) if (!fgets(s, 256, ftmp)) *s = 0; if (ftell(ftmp) <= outindex[iseq + 1]) fputs(s, foo); } for (j = 0; j < gPretty.interline; j++) fputs("\n", foo); /* some want spacer line */ } fclose(ftmp); /* tmp disappears */ fout = foo; } if (outform == kASN1) fprintf(foo, "} }\n"); if (outform == kPAUP) fprintf(foo, ";\n end;\n"); if (outindex != NULL) free(outindex); exit_main(0); }