From c9b94ce735ea41a048be4cf7c22e0af3b253dda7 Mon Sep 17 00:00:00 2001 From: Kuoi Date: Sun, 16 Apr 2023 07:33:28 +0800 Subject: [PATCH] init --- Makefile | 111 +++ macinit.c | 292 ++++++++ readseq.c | 1411 ++++++++++++++++++++++++++++++++++ ureadasn.c | 324 ++++++++ ureadseq.c | 2121 ++++++++++++++++++++++++++++++++++++++++++++++++++++ ureadseq.h | 172 +++++ 6 files changed, 4431 insertions(+) create mode 100644 Makefile create mode 100644 macinit.c create mode 100644 readseq.c create mode 100644 ureadasn.c create mode 100644 ureadseq.c create mode 100644 ureadseq.h diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..40a5a92 --- /dev/null +++ b/Makefile @@ -0,0 +1,111 @@ +# +# Unix Makefile for readseq +# to use, command me: +# % make -- or -- +# % make CC=your-c-compiler-name +# + +# pick an ANSI C compiler (the default Sun CC is not ANSI) +CC=gcc # Gnu C Compiler +#CC=cc # SGI Irix +#CC=vcc # some DEC Ultrix + +CFLAGS= +#CFLAGS= -DSMALLCHECKSUM # if you prefer to use a GCG-standard 13 bit checksum +# instead of a full 32 bit checksum. This may enhance compatibility w/ GCG software + +SOURCES= readseq.c ureadseq.c ureadseq.h ureadasn.c +DOCS= Readme Readseq.help Formats Stdfiles Makefile Make.com add.gdemenu *.std + + +# NCBI toolkit support for ASN.1 reader + +# this is path to NCBI toolkit, you must set for your system: +NCBI= +#NCBI=/bio/mb/ncbi +# +OTHERLIBS=-lm +LIB1=-lncbi +LIB2=-lncbiobj +LIB3=-lncbicdr +LIB4=-lvibrant +INCPATH=$(NCBI)/include +LIBPATH=$(NCBI)/lib +NCFLAGS=$(CFLAGS) -DNCBI -I$(INCPATH) +NLDFLAGS=-I$(INCPATH) -L$(LIBPATH) +NLIBS=$(LIB1) $(LIB2) $(OTHERLIBS) + + +all: build test + +build: $(SOURCES) + @echo "Compiling readseq..." 
+ $(CC) $(CFLAGS) -o readseq readseq.c ureadseq.c + +# if using NCBI, uncomment these lines in place of build: above +#build: $(SOURCES) +# @echo "Compiling readseq with NCBI toolkit support..."; +# $(CC) -o readseq $(NLDFLAGS) $(NCFLAGS) readseq.c ureadseq.c ureadasn.c $(NLIBS) + +test: $(SOURCES) readseq + @echo "" + @echo "Test for general read/write of all chars:" + ./readseq -p alphabet.std -otest.alpha + -diff test.alpha alphabet.std + + @echo "" + @echo "Test for valid format conversions:" + ./readseq -v -p -f=ig nucleic.std -otest.ig + ./readseq -v -p -f=gb test.ig -otest.gb + ./readseq -v -p -f=nbrf test.gb -otest.nbrf + ./readseq -v -p -f=embl test.nbrf -otest.embl + ./readseq -v -p -f=gcg test.embl -otest.gcg + ./readseq -v -p -f=strider test.gcg -otest.strider + ./readseq -v -p -f=fitch test.strider -otest.fitch + ./readseq -v -p -f=fasta test.fitch -otest.fasta + ./readseq -v -p -f=pir test.fasta -otest.pir + ./readseq -v -p -f=ig test.pir -otest.ig-b + -diff test.ig test.ig-b + + @echo "" + @echo "Test for multiple-sequence format conversions:" + ./readseq -p -f=ig multi.std -otest.m-ig + ./readseq -p -f=gb test.m-ig -otest.m-gb + ./readseq -p -f=nbrf test.m-gb -otest.m-nbrf + ./readseq -p -f=embl test.m-nbrf -otest.m-embl + ./readseq -p -f=fasta test.m-embl -otest.m-fasta + ./readseq -p -f=pir test.m-fasta -otest.m-pir + ./readseq -p -f=msf test.m-pir -otest.m-msf + ./readseq -p -f=paup test.m-msf -otest.m-paup + ./readseq -p -f=ig test.m-paup -otest.m-ig-b + -diff test.m-ig test.m-ig-b +# +# if using NCBI, uncomment these lines +# @echo "" +# @echo "Test of NCBI ASN.1 conversions:" +# ./readseq -p -f=asn test.m-ig -otest.m-asn +# ./readseq -p -f=ig test.m-asn -otest.m-ig-c +# -diff test.m-ig test.m-ig-c +# + @echo "" + @echo "Expect differences in the header lines due to" + @echo "different format headers. If any sequence lines" + @echo "differ, or if the checksums differ, there is a problem." 
+ @echo "----------------------" + @echo "" + @echo "To clean up test files, command me:" + @echo " make clean" + + +clean: + rm -f *.o core test.* + +shar: + @echo "shell archiving files..." + -rm -f readseq*.shar + mkdir readseqd + cp $(SOURCES) readseqd + cp $(DOCS) readseqd + shar -v readseqd > readseq.shar + rm -rf readseqd + diff --git a/macinit.c b/macinit.c new file mode 100644 index 0000000..aeeabd3 --- /dev/null +++ b/macinit.c @@ -0,0 +1,292 @@ +/* + macinit.c + -- Macintosh initializations, then call real main + +Note: compile this segment as Main for generic 68000 processor, so it won't + fail on generic mac + +*/ + +#pragma segment Main + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +// #include +// #include + +Boolean StopKey() +{ + EventRecord ev; + + if (EventAvail(keyDownMask + autoKeyMask, &ev)) { + if ((ev.modifiers & cmdKey) && + ((char)(ev.message & charCodeMask) == '.')) { + SysBeep(1); + (void)GetNextEvent(keyDownMask + autoKeyMask, &ev); + return true; + } + } + return false; +} + +Boolean cmdKeyIsDown() +{ + KeyMap kmap; + GetKeys(&kmap); + return BitTst(kmap, (sizeof(KeyMap) * 8) - 55); +} + +Boolean shiftKeyIsDown() +{ + KeyMap kmap; + GetKeys(&kmap); + return BitTst(kmap, (sizeof(KeyMap) * 8) - 56); +} + +Boolean capsLockIsDown() +{ + KeyMap kmap; + GetKeys(&kmap); + return BitTst(kmap, (sizeof(KeyMap) * 8) - 57); +} + +Boolean optionKeyIsDown() +{ + KeyMap kmap; + GetKeys(&kmap); + return BitTst(kmap, (sizeof(KeyMap) * 8) - 58); +} + +Boolean MouseButton() { return Button(); } + +Boolean Keypress() +{ + EventRecord ev; + return EventAvail(keyDownMask + keyUpMask + autoKeyMask, &ev); +} + +char *StdGetFile(char *prompt, OSType fileTypes[], int nFileTypes) +{ + Point wher; /*where to display dialog*/ + SFReply reply; /*reply record*/ + short len; + static char filename[80] = "\0"; + + wher.h = 80; + wher.v = 90; + if (optionKeyIsDown()) 
nFileTypes = 0;
+
+    SFGetFile(wher, prompt, nil, nFileTypes, fileTypes, nil, &reply);
+
+    if (reply.good) {
+        len = SetVol(nil, reply.vRefNum);
+        len = reply.fName[0];
+        strncpy(filename, (char *)(&reply.fName[1]), len);
+        filename[len] = '\0';
+        return filename;
+    }
+    else
+        return NULL;
+}
+
+/*
+ * Command line reader for Mac/MPW -- dgg.
+ *
+ * Builds an argv-style vector: progname (or "program" when progname is
+ * NULL) followed by the space/newline separated words of one line read
+ * from cl.  Each entry is a separately malloc'ed copy and the vector is
+ * NULL-terminated.  The vector is stored in *argv and the number of
+ * entries is returned; the caller owns the storage.  *argv is NULL only
+ * when no memory at all could be obtained.
+ */
+int readCmdOptions(FILE *cl, char *progname, char ***argv)
+{
+#define MAXS 255
+/* Append a copy of sptr to targv (empty strings are ignored).  The vector
+ * is grown first and replaced only when realloc() succeeds, so a failed
+ * allocation drops this one argument but keeps the vector valid. */
+#define addarg(sptr)                                                      \
+    do {                                                                  \
+        const char *a_src = (sptr);                                       \
+        if (*a_src != '\0') {                                             \
+            char **a_vec = (char **)realloc(                              \
+                targv, (argc + 2) * sizeof(char *));                      \
+            if (a_vec != NULL) {                                          \
+                targv = a_vec;                                            \
+                targv[argc] = (char *)malloc(strlen(a_src) + 1);          \
+                if (targv[argc] != NULL) {                                \
+                    strcpy(targv[argc], a_src);                           \
+                    argc++;                                               \
+                }                                                         \
+                targv[argc] = NULL;                                       \
+            }                                                             \
+        }                                                                 \
+    } while (0)
+
+    char *pword, st[MAXS];
+    int argc = 0;
+    char **targv = NULL; /* realloc(NULL, n) behaves like malloc(n) */
+
+    if (progname == NULL) progname = "program";
+    addarg(progname);
+
+    /* Trust the buffer only when fgets() really delivered a line.  The
+     * old feof() test threw away a last line that had no newline and
+     * looked at an unset buffer after a read error. */
+    if (fgets(st, MAXS, cl) != NULL) {
+        pword = strtok(st, " \n");
+        while (pword != NULL) {
+            addarg(pword);
+            pword = strtok(NULL, " \n");
+        }
+    }
+
+    /* hand back an empty, terminated vector rather than NULL if we can */
+    if (targv == NULL) {
+        targv = (char **)malloc(sizeof(char *));
+        if (targv != NULL) targv[0] = NULL;
+    }
+
+    *argv = targv;
+    return argc;
+}
+
+/*
+ * Stand-in for the console ccommand(): reads one command line from stdin
+ * and replaces *argv with the parsed vector, reusing the incoming
+ * (*argv)[0] as the program name.  Returns the new argument count.
+ */
+int ccommand(char ***argv)
+{
+    int argc;
+    char **targv;
+
+    argc = readCmdOptions(stdin, (*argv)[0], &targv);
+    *argv = targv;
+    return argc;
+}
+
+extern _DataInit();
+
+// #define VERSION curSysEnvVers
+#define nocolorID 130
+#define no68020 133
+#define no68881 132
+#define no256 134
+#define nosys6 135
+
+void MacInit()
+{
+    SysEnvRec theWorld;
+    OSErr OSys;
+    DialogPtr crashDia;
+    long tick;
+
+    UnloadSeg(_DataInit);
+
+    InitGraf((Ptr)&qd.thePort);
+    // InitFonts();
+    InitWindows();
+    // InitMenus();
+    // TEInit();
+    InitDialogs(nil);
+    InitCursor();
+
+    /*______________________________________________________*/
+    /* If not right Machine then stop */
+    /*______________________________________________________*/
+    OSys = SysEnvirons(curSysEnvVers, &theWorld);
+
+    /*if(!theWorld.hasColorQD) {
+            crashDia = GetNewDialog (nocolorID, nil, (WindowPtr) -1);
+            DrawDialog (crashDia);
+            Delay (300, &tick);
+            ExitToShell();
+            }*/
+    /*if(theWorld.processor <
env68020) { + crashDia = GetNewDialog (no68020, nil, (WindowPtr) -1); + DrawDialog (crashDia); + Delay (300, &tick); + ExitToShell(); + }*/ + /*if(!theWorld.hasFPU) { + crashDia = GetNewDialog (no68881, nil, (WindowPtr) -1); + DrawDialog (crashDia); + Delay (300, &tick); + ExitToShell(); + } + if(theWorld.systemVersion < 0x0600) { + crashDia = GetNewDialog (nosys6, nil, (WindowPtr) -1); + DrawDialog (crashDia); + Delay (300, &tick); + ExitToShell(); + }*/ + +#ifdef UnDeFineD + /*______________________________________________________*/ + /* Set Rects */ + /*______________________________________________________*/ + screenRect = qd.screenBits.bounds; + offLeft = 0; + offTop = 0; + offRight = screenRect.right; + offBottom = screenRect.bottom; + SetRect(&BaseRect, 40, 60, 472, 282); + tempRgn = GetGrayRgn(); + HLock((Handle)tempRgn); + TotalRect = (**tempRgn).rgnBBox; + SetRect(&minRect, 80, 80, (**tempRgn).rgnBBox.right - 40, + (**tempRgn).rgnBBox.bottom - 40); + HUnlock((Handle)tempRgn); + + /*______________________________________________________*/ + /* Open Window & set Palette & Picture */ + /*______________________________________________________*/ + theGDevice = GetMainDevice(); + HLock((Handle)theGDevice); + mycolors = (**(**theGDevice).gdPMap).pmTable; + numcolor = (**(**theGDevice).gdPMap).pixelSize; + HUnlock((Handle)theGDevice); + switch (numcolor) { + case 1: + numcolor = 2; + break; + case 2: + numcolor = 4; + break; + case 4: + numcolor = 16; + break; + case 8: + numcolor = 256; + break; + } + + myWindow = NewCWindow(nil, &BaseRect, "", true, zoomDocProc, + (WindowPtr)-1, true, 150); + SetPort((WindowPtr)myWindow); + DrawGrowIcon(myWindow); + + srcPalette = NewPalette(numcolor, mycolors, pmCourteous, 0); + SetPalette((WindowPtr)myWindow, srcPalette, true); + + /*______________________________________________________*/ + /* Set menus */ + /*______________________________________________________*/ + mymenu0 = GetMenu(appleID); + AddResMenu(mymenu0, 
'DRVR'); + InsertMenu(mymenu0, 0); + mymenu1 = newmenu(129, "File"); + appendmenu(mymenu1, "Start;Quit"); + InsertMenu(mymenu1, 0); + mymenu2 = newmenu(130, "Edit"); + InsertMenu(mymenu2, 0); + DrawMenuBar(); + + /*______________________________________________________*/ + /* Init variables */ + /*______________________________________________________*/ + DoneFlag = false; + yieldTime = 0; + return; +#endif +} + +main(int argc, char *argv[]) +{ + Boolean loop = true; + char **myargv; + int myargc; + + /* MacInit(); -- SIOW library handles this */ + do { + fprintf(stderr, "\nEnter command line for %s [cmd-Q to quit]\n", + argv[0]); + fprintf(stderr, "-> %s ", argv[0]); + myargv = argv; + myargc = ccommand(&myargv); + + siow_main(myargc, myargv); + fflush(stdout); + + } while (true); + exit(0); +} + diff --git a/readseq.c b/readseq.c new file mode 100644 index 0000000..44e2472 --- /dev/null +++ b/readseq.c @@ -0,0 +1,1411 @@ +/* File: readseq.c + * main() program for ureadseq.c, ureadseq.h + * + * Reads and writes nucleic/protein sequence in various + * formats. Data files may have multiple sequences. + * + * Copyright 1990 by d.g.gilbert + * biology dept., indiana university, bloomington, in 47405 + * e-mail: gilbertd@bio.indiana.edu + * + * This program may be freely copied and used by anyone. + * Developers are encourged to incorporate parts in their + * programs, rather than devise their own private sequence + * format. + * + * This should compile and run with any ANSI C compiler. + * Please advise me of any bugs, additions or corrections. + * + */ + +const char *title = "readSeq (1Feb93), multi-format molbio sequence reader.\n"; + +/* History + 27 Feb 90. 1st release to public. + 4 Mar 90. + Gary Olsen format + + case change + * minor corrections to NBRF,EMBL,others + * output 1 file per sequence for gcg, unknown + * define -DNOSTR for c-libraries w/o strstr + - readseq.p, pascal version, becomes out-of-date + 24 May 90. 
+ Phylip 3.2 output format (no input) + 20 Jul 90. + Phylip 3.3 output (no input yet) + + interactive output re-direction + + verbose progress info + * interactive help output + * dropped line no.s on NBRF output + * patched in HyperGCG XCMD corrections, + - except for seq. documentation handling + * dropped the IG special nuc codes, as IG has + adopted the standard IUB codes (now if only + everyone would adopt a standard format !) + 11 Oct 90. * corrected bug in reading/writing of EMBL format + + 17 Oct 91. * corrected bug in reading Olsen format + (serious-deletion) + 10 Nov 91. * corrected bug in reading some GCG format files + (serious-last line duplicated) + + add format name parsing (-fgb, -ffasta, ...) + + Phylip v3.4 output format (== v3.2, sequential) + + add checksum output to all forms that have document + + skip mail headers in seq file + + add pipe for standard input == seq file (with -p) + * fold in parts of MacApp Seq object + * strengthen format detection + * clarify program structure + * remove fixed sequence size limit (now dynamic, sizeof memory) + * check and fold in accumulated bug reports: + * Now ANSI-C fopen(..,"w") & check open failure + * Define -DFIXTOUPPER for nonANSI C libraries that mess + up toupper/tolower + = No command-line changes; callers of readseq main() should be okay + - ureadseq.h functions have changed; client programs need to note. + + added Unix and VMS Make scripts, including validation tests + + 4 May 92. + added 32 bit CRC checksum as alternative to GCG 6.5bit checksum + (-DBIGCHECKSUM) + Aug 92 = fixed Olsen format input to handle files w/ more sequences, + not to mess up when more than one seq has same identifier, + and to convert number masks to symbols. + = IG format fix to understand ^L + + 25-30 Dec 92 + * revised command-line & interactive interface. Suggested form is + now readseq infile -format=genbank -output=outfile -item=1,3,4 ... 
but remains + compatible with prior commandlines: readseq infile -f2 -ooutfile -i3 ... + + added GCG MSF multi sequence file format + + added PIR/CODATA format + + added NCBI ASN.1 sequence file format + + added Pretty, multi sequence pretty output (only) + + added PAUP multi seq format + + added degap option + + added Gary Williams (GWW, G.Williams@CRC.AC.UK) + reverse-complement option. + + added support for reading Phylip formats (interleave & + sequential) + * string fixes, dropped need for compiler flags NOSTR, FIXTOUPPER, + NEEDSTRCASECMP + * changed 32bit checksum to default, -DSMALLCHECKSUM for GCG + version + + 1Feb93 + = revert GenBank output to a fixed left number width which + other software depends on. + = fix for MSF input to handle symbols in names + = fix bug for possible memory overrun when truncating seqs for + Phylip or Paup formats (thanks Anthony Persechini) + +*/ + +/* + Readseq has been tested with: + Macintosh MPW C + GNU gcc + SGI cc + VAX-VMS cc + Any ANSI C compiler should be able to handle this. + Old-style C compilers barf all over the source. + + +How do I build the readseq program if I have an Ansi C compiler? +#-------------------- +# Unix ANSI C +# Use the supplied Makefile this way: +% make CC=name-of-c-compiler +# OR do this... +% gcc readseq.c ureadseq.c -o readseq + +#-------------------- +$!VAX-VMS cc +$! Use the supplied Make.Com this way: +$ @make +$! 
OR, do this: +$ cc readseq, ureadseq +$ link readseq, ureadseq, sys$library:vaxcrtl/lib +$ readseq :== $ MyDisk:[myacct]readseq + +#-------------------- +# Macintosh Simple Input/Output Window application +# requires MPW-C and SIOW library (from APDA) +# also uses files macinit.c, macinit.r, readseqSIOW.make +# +Buildprogram readseqSIOW + +#-------------------- +#MPW-C v3 tool +C ureadseq.c +C readseq.c +link -w -o readseq -t MPST -c 'MPS ' � + readseq.c.o Ureadseq.c.o � + "{Libraries}"Interface.o � + "{Libraries}"ToolLibs.o � + "{Libraries}"Runtime.o � + "{CLibraries}"StdClib.o +readseq -i1 ig.seq + +# MPW-C with NCBI tools + +set NCBI "{Boot}@molbio:ncbi:"; EXPORT NCBI +set NCBILIB1 "{NCBI}"lib:libncbi.o; export NCBILIB1 +set NCBILIB2 "{NCBI}"lib:libncbiobj.o; export NCBILIB2 +set NCBILIB3 "{NCBI}"lib:libncbicdr.o; export NCBILIB3 +set NCBILIB4 "{NCBI}"lib:libvibrant.o; export NCBILIB4 + +C ureadseq.c +C -d NCBI -i "{NCBI}"include: ureadasn.c +C -d NCBI -i "{NCBI}"include: readseq.c +link -w -o readseq -t MPST -c 'MPS ' � + ureadseq.c.o ureadasn.c.o readseq.c.o � + {NCBILIB4} {NCBILIB2} {NCBILIB1} � + "{Libraries}"Interface.o � + "{Libraries}"ToolLibs.o � + "{Libraries}"Runtime.o � + "{CLibraries}"CSANELib.o � + "{CLibraries}"Math.o � + "{CLibraries}"StdClib.o + +===========================================================*/ + +#include +#include +#include + +#include "ureadseq.h" + +#pragma segment readseq + +static char inputfilestore[256], *inputfile = inputfilestore; + +const char *formats[kMaxFormat + 1] = {" 1. IG/Stanford", + " 2. GenBank/GB", + " 3. NBRF", + " 4. EMBL", + " 5. GCG", + " 6. DNAStrider", + " 7. Fitch", + " 8. Pearson/Fasta", + " 9. Zuker (in-only)", + "10. Olsen (in-only)", + "11. Phylip3.2", + "12. Phylip", + "13. Plain/Raw", + "14. PIR/CODATA", + "15. MSF", + "16. ASN.1", + "17. PAUP/NEXUS", + "18. 
Pretty (out-only)",
+                                       ""};
+
+/* number of rows in formname[], derived from the table so the two cannot
+ * drift apart */
+#define kFormCount ((int)(sizeof(formname) / sizeof(formname[0])))
+#define kMaxFormName 15
+
+/* format names accepted on the command line and the format each selects */
+const struct formatTable {
+    char *name;
+    short num;
+} formname[] = {
+    {"ig", kIG},
+    {"stanford", kIG},
+    {"genbank", kGenBank},
+    {"gb", kGenBank},
+    {"nbrf", kNBRF},
+    {"embl", kEMBL},
+    {"gcg", kGCG},
+    {"uwgcg", kGCG},
+    {"dnastrider", kStrider},
+    {"strider", kStrider},
+    {"fitch", kFitch},
+    {"pearson", kPearson},
+    {"fasta", kPearson},
+    {"zuker", kZuker},
+    {"olsen", kOlsen},
+    {"phylip", kPhylip},
+    {"phylip3.2", kPhylip2},
+    {"phylip3.3", kPhylip3},
+    {"phylip3.4", kPhylip4},
+    {"phylip-interleaved", kPhylip4},
+    {"phylip-sequential", kPhylip2},
+    {"plain", kPlain},
+    {"raw", kPlain},
+    {"pir", kPIR},
+    {"codata", kPIR},
+    {"asn.1", kASN1},
+    {"msf", kMSF},
+    {"paup", kPAUP},
+    {"nexus", kPAUP},
+    {"pretty", kPretty},
+};
+
+const char *kASN1headline = "Bioseq-set ::= {\nseq-set {\n";
+
+/* GWW table for getting the complement of a nucleotide (IUB codes) */
+/*  !
+ * "#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[
+ * \]^_`abcdefghijklmnopqrstuvwxyz{|}~ */
+const char compl [] =
+    " !\"#$%&'()*+,-./"
+    "0123456789:;<=>?@TVGHNNCDNNMNKNNYRYSAABWNRN[\\]^_`"
+    "tvghnncdnnmnknnyrysaabwnrn{|}~";
+
+/*
+ * Return the menu label for a format code.  Codes 1..kMaxFormat index
+ * formats[] directly; the ASN.1 and Phylip sub-format codes outside that
+ * range map onto their parent entry, anything else gives "(unknown)".
+ * The result points at constant storage and must not be modified; the
+ * casts only keep the historical non-const return type.
+ */
+char *formatstr(short format)
+{
+    if (format < 1 || format > kMaxFormat) {
+        switch (format) {
+        case kASNseqentry:
+        case kASNseqset:
+            return (char *)formats[kASN1 - 1];
+        case kPhylipInterleave:
+        case kPhylipSequential:
+            return (char *)formats[kPhylip - 1];
+        default:
+            return (char *)"(unknown)";
+        }
+    }
+    else
+        return (char *)formats[format - 1];
+}
+
+/*
+ * Translate a user supplied format (number or name) into a format code.
+ *
+ * Leading white space is skipped.  A name that starts with a digit is read
+ * as a number and accepted when it lies in kMinFormat..kMaxFormat.  Any
+ * other name is compared without regard to case against formname[]: an
+ * exact match wins at once, otherwise a prefix is accepted only when it
+ * matches exactly one table row.  Returns kNoformat for an empty,
+ * out-of-range, unknown or ambiguous name.
+ */
+int parseformat(char *name)
+{
+#define kDupmatch (-2)
+#define kNomatch (-1) /* private "no row yet"; never a valid table index */
+    int namelen, maxlen, i, matchat;
+    char lname[kMaxFormName + 1];
+
+    skipwhitespace(name);
+    namelen = (int)strlen(name);
+    if (namelen == 0)
+        return kNoformat;
+    if (isdigit((unsigned char)*name)) {
+        i = (int)atol(name);
+        if (i < kMinFormat || i > kMaxFormat)
+            return kNoformat;
+        return i;
+    }
+
+    /* else match character name; only the first kMaxFormName characters
+     * take part in the comparison */
+    maxlen = min(kMaxFormName, namelen);
+    for (i = 0; i < maxlen; i++) lname[i] = to_lower(name[i]);
+    lname[maxlen] = 0;
+    matchat = kNomatch;
+
+    for (i = 0; i < kFormCount; i++) {
+        if (strncmp(lname, formname[i].name, maxlen) == 0) {
+            if ((int)strlen(formname[i].name) == namelen)
+                return formname[i].num;
+            else if (matchat == kNomatch)
+                matchat = i;
+            else
+                matchat = kDupmatch; /* 2 or more partial matches */
+        }
+    }
+    if (matchat < 0) /* kNomatch or kDupmatch */
+        return kNoformat;
+    return formname[matchat].num;
+}
+
+/*
+ * Print the NEWLINE separated entries of list on stdout, one per line,
+ * under a heading naming the input file and its format.  Entries longer
+ * than 255 characters are cut to fit the line buffer; text after the last
+ * NEWLINE is not printed.
+ */
+static void dumpSeqList(char *list, short format)
+{
+    long i, l, listlen;
+    char s[256];
+
+    listlen = strlen(list);
+    printf("Sequences in %s (format is %s)\n", inputfile,
+           formatstr(format));
+    for (i = 0, l = 0; i < listlen; i++) {
+        if (list[i] == (char)NEWLINE) {
+            s[l] = '\0';
+            l = 0;
+            puts(s);
+        }
+        else if (l < 255)
+            s[l++] = list[i];
+    }
+    putchar('\n');
+}
+
+void usage()
+{
+    short i, midi;
+
+    /* title is data, not a format string */
+    fputs(title, stderr);
+    fprintf(stderr, "usage: readseq [-options] in.seq > out.seq\n");
+    fprintf(stderr, " options\n");
+    /* ? add -d[igits] to allow digits in sequence data, &/or option to
+     * specify seq charset !?
*/ + fprintf(stderr, " -a[ll] select All sequences\n"); + fprintf(stderr, " -c[aselower] change to lower case\n"); + fprintf(stderr, " -C[ASEUPPER] change to UPPER CASE\n"); + fprintf(stderr, " -degap[=-] remove gap symbols\n"); + fprintf(stderr, + " -i[tem=2,3,4] select Item number(s) from several\n"); + fprintf(stderr, " -l[ist] List sequences only\n"); + fprintf(stderr, " -o[utput=]out.seq redirect Output\n"); + fprintf(stderr, + " -p[ipe] Pipe (command line, stdout)\n"); + fprintf(stderr, " -r[everse] change to Reverse-complement\n"); + fprintf(stderr, " -v[erbose] Verbose progress\n"); + fprintf(stderr, " -f[ormat=]# Format number for output, or\n"); + fprintf(stderr, " -f[ormat=]Name Format name for output:\n"); + midi = (kMaxFormat + 1) / 2; + for (i = kMinFormat - 1; i < midi; i++) + fprintf(stderr, " %-20s %-20s\n", formats[i], + formats[midi + i]); + + /* new output format options, esp. for pretty format: */ + fprintf(stderr, " \n"); + fprintf(stderr, " Pretty format options: \n"); + fprintf(stderr, " -wid[th]=# sequence line width\n"); + fprintf(stderr, " -tab=# left indent\n"); + fprintf(stderr, + " -col[space]=# column space within sequence line " + "on output\n"); + fprintf( + stderr, + " -gap[count] count gap chars in sequence numbers\n"); + fprintf(stderr, + " -nameleft, -nameright[=#] name on left/right side [=max " + "width]\n"); + fprintf(stderr, " -nametop name at top/bottom\n"); + fprintf(stderr, + " -numleft, -numright seq index on left/right side\n"); + fprintf(stderr, " -numtop, -numbot index on top/bottom\n"); + fprintf(stderr, + " -match[=.] 
use match base for 2..n species\n"); + fprintf(stderr, + " -inter[line=#] blank line(s) between sequence " + "blocks\n"); + + /****** not ready yet + fprintf(stderr, " -code=none,rtf,postscript,ps code syntax\n"); + fprintf(stderr, " -namefont=, -numfont=, -seqfont=font font + choice\n"); fprintf(stderr, " font suggestions include + times,courier,helvetica\n"); fprintf(stderr, " -namefontsize=, + -numfontsize=, -seqfontsize=#\n"); fprintf(stderr, " fontsize + suggestions include 9,10,12,14\n"); fprintf(stderr, " -namefontstyle=, + -numfontstyle=, -seqfontstyle= style fontstyle for names\n"); + fprintf(stderr, " fontstyle options are + plain,italic,bold,bold-italic\n"); + ******/ +} + +void erralert(short err) +{ + switch (err) { + case 0: + break; + case eFileNotFound: + fprintf(stderr, "File not found: %s\n", inputfile); + break; + case eFileCreate: + fprintf(stderr, "Can't open output file.\n"); + break; + case eASNerr: + fprintf(stderr, "Error in ASN.1 sequence routines.\n"); + break; + case eNoData: + fprintf(stderr, "No data in file.\n"); + break; + case eItemNotFound: + fprintf(stderr, "Specified item not in file.\n"); + break; + case eUnequalSize: + fprintf(stderr, + "This format requires equal length " + "sequences.\nSequence truncated or padded to " + "fit.\n"); + break; + case eUnknownFormat: + fprintf(stderr, + "Error: this format is unknown to me.\n"); + break; + case eOneFormat: + fprintf(stderr, + "Warning: This format permits only 1 sequence " + "per file.\n"); + break; + case eMemFull: + fprintf(stderr, + "Out of storage memory. 
Sequence truncated.\n");
+        break;
+    default:
+        fprintf(stderr, "readSeq error = %d\n", err);
+        break;
+    }
+} /* erralert */
+
+/*
+ * Pick the output format.  When quietly is set nothing is asked and the
+ * default, kPearson, is returned.  Otherwise the format menu is written to
+ * stderr, one line is read from stdin and handed to parseformat(); an
+ * unreadable, empty or unrecognised answer also yields kPearson.
+ */
+int chooseFormat(boolean quietly)
+{
+    char sform[128];
+    int midi, i, choice;
+
+    if (quietly) return kPearson; /* default */
+
+    midi = (kMaxFormat + 1) / 2;
+    for (i = kMinFormat - 1; i < midi; i++)
+        fprintf(stderr, " %-20s %-20s\n", formats[i],
+                formats[midi + i]);
+    fprintf(stderr, "\nChoose an output format (name or #): \n");
+
+    /* fgets() is bounded by the buffer size, unlike gets() */
+    if (fgets(sform, sizeof(sform), stdin) == NULL)
+        return kPearson; /* end of input: take the default */
+    /* drop the line end fgets() keeps, so a typed name can match exactly */
+    sform[strcspn(sform, "\r\n")] = '\0';
+
+    choice = parseformat(sform);
+    if (choice == kNoformat) choice = kPearson;
+    return choice;
+}
+
+/* read parameter(s) */
+
+/*
+ * Test whether the option word sopt selects the option named smatch.
+ * The first max(minword, min(strlen(sopt), strlen(smatch))) characters are
+ * compared, so sopt may abbreviate smatch down to minword characters and
+ * may carry extra characters after the full name.  casesense chooses
+ * between an exact and a case-insensitive comparison.
+ */
+boolean checkopt(boolean casesense, char *sopt, const char *smatch,
+                 short minword)
+{
+    long lenopt, lenmatch, ncompare;
+    boolean result;
+
+    lenopt = strlen(sopt);
+    lenmatch = strlen(smatch);
+    ncompare = max(minword, min(lenopt, lenmatch));
+
+    if (casesense)
+        result = (!strncmp(sopt, smatch, ncompare));
+    else
+        result = (!Strncasecmp(sopt, smatch, ncompare));
+    return result;
+}
+
+#define kMaxwhichlist 50
+
+/* global for readopt(), main() */
+boolean chooseall = false, quietly = false, gotinputfile = false,
+        listonly = false, closeout = false, verbose = false, manyout = false,
+        dolower = false, doupper = false, doreverse = false, askout = true,
+        dopipe = false, interleaved = false;
+short nfile = 0, iwhichlist = 0, nwhichlist = 0;
+short whichlist[kMaxwhichlist + 1];
+long whichSeq = 0, outform = kNoformat;
+char onamestore[128], *oname = onamestore;
+FILE *foo = NULL;
+
+void resetGlobals()
+/* need this when used from SIOW, as these globals are not reinited
+automatically between calls to local main() */
+{
+    chooseall = false;
+    quietly = false;
+    gotinputfile = false;
+    listonly = false;
+    closeout = false;
+    verbose = false;
+    manyout = false;
+    dolower = false;
+    doupper = false;
+    doreverse = false;
+    
askout = true;
+    dopipe = false;
+    interleaved = false;
+    nfile = 0;
+    iwhichlist = 0;
+    nwhichlist = 0;
+    whichSeq = 0;
+    outform = kNoformat;
+    oname = onamestore;
+    foo = NULL;
+    /* main() may have left inputfile aimed at its own local tempname */
+    inputfile = inputfilestore;
+
+    gPrettyInit(gPretty);
+}
+
+#define kOptOkay 1
+#define kOptNone 0
+
+/* Numeric argument of an option: the text after '=' when one was given
+ * (sparam), otherwise the first run of digits found from the third
+ * character of sopt on (old "-w60" style).  Gives 0 when there is no
+ * number, as atoi() does. */
+static int optnumber(char *sopt, char *sparam)
+{
+    if (*sparam == 0) {
+        for (sparam = sopt + 2;
+             *sparam != 0 && !isdigit((unsigned char)*sparam); sparam++)
+            ;
+    }
+    return atoi(sparam);
+}
+
+/* Open name as the output file for -o/-output and update the foo, oname,
+ * closeout and askout globals.  Returns kOptOkay, or eFileCreate (after
+ * reporting it) when the name does not fit in onamestore or the file
+ * cannot be created. */
+static int openoutput(const char *name)
+{
+    /* a repeated -output option must not leak the stream opened earlier */
+    if (closeout && foo != NULL) {
+        fclose(foo);
+        foo = NULL;
+        closeout = false;
+    }
+    /* refuse, rather than truncate, a name longer than the buffer */
+    if (strlen(name) >= sizeof(onamestore)) {
+        erralert(eFileCreate);
+        return eFileCreate;
+    }
+    oname = onamestore;
+    strcpy(oname, name);
+    foo = fopen(oname, "w");
+    if (!foo) {
+        erralert(eFileCreate);
+        return eFileCreate;
+    }
+    closeout = true;
+    askout = false;
+    return kOptOkay;
+}
+
+/*
+ * Handle one command line word.
+ *
+ * "?" prints the usage text.  A word starting with '-' is an option,
+ * optionally followed by "=value"; the word is cut at the '=' in place, so
+ * sopt is modified.  Options may be abbreviated as far as each checkopt()
+ * call below allows, and the older attached-value forms (-f2, -ooutfile,
+ * -i3) are still accepted.  Any other word is taken as the input file
+ * name.
+ *
+ * Returns kOptOkay when the word was consumed, kOptNone after printing
+ * help, eOptionBad (after printing usage) for an unknown option and
+ * eFileCreate when the output file could not be opened.  main() stops on
+ * any result <= 0.
+ */
+int readopt(char *sopt)
+{
+    char sparamstore[256], *sparam = sparamstore;
+    short n;
+
+    /* fprintf(stderr,"readopt( %s) == ", sopt); */
+
+    if (*sopt == '?') {
+        usage();
+        return kOptNone; /*? eOptionBad or kOptNone */
+    }
+
+    else if (*sopt == '-') {
+        char *cp = strchr(sopt, '=');
+        *sparam = '\0';
+        if (cp) {
+            /* bounded copy; strncat() always terminates the result */
+            strncat(sparam, cp + 1, sizeof(sparamstore) - 1);
+            *cp = 0;
+        }
+
+        if (checkopt(false, sopt, "-help", 2)) {
+            usage();
+            return kOptNone;
+        }
+
+        if (checkopt(false, sopt, "-all", 2)) {
+            whichSeq = 1;
+            chooseall = true;
+            return kOptOkay;
+        }
+
+        if (checkopt(false, sopt, "-colspace",
+                     4)) { /* test before -c[ase] */
+            n = atoi(sparam);
+            gPretty.spacer = n;
+            return kOptOkay;
+        }
+
+        if (checkopt(true, sopt, "-caselower", 2)) {
+            dolower = true;
+            return kOptOkay;
+        }
+        if (checkopt(true, sopt, "-CASEUPPER", 2)) {
+            doupper = true;
+            return kOptOkay;
+        }
+
+        if (checkopt(false, sopt, "-pipe", 2)) {
+            dopipe = true;
+            askout = false;
+            return kOptOkay;
+        }
+
+        if (checkopt(false, sopt, "-list", 2)) {
+            listonly = true;
+            askout = false;
+            return kOptOkay;
+        }
+
+        if (checkopt(false, sopt, "-reverse", 2)) {
+            doreverse = true;
+            return kOptOkay;
+        }
+
+        if (checkopt(false, sopt, "-verbose", 2)) {
+            verbose = true;
+            return kOptOkay;
+        }
+
+        if (checkopt(false, sopt, "-match", 5)) {
+            gPretty.domatch = true;
+            if (*sparam >= ' ') gPretty.matchchar = *sparam;
+            return kOptOkay;
+        }
+        if (checkopt(false, sopt, "-degap", 4)) {
+            gPretty.degap = true;
+            if (*sparam >= ' ') gPretty.gapchar = *sparam;
+            return kOptOkay;
+        }
+
+        if (checkopt(false, sopt, "-interline", 4)) {
+            gPretty.interline = atoi(sparam);
+            return kOptOkay;
+        }
+
+        /* "-i3" can never match "-item" (the '3' is compared with the
+         * 't'), so the old attached-number form needs its own "-i"
+         * test, just as -f and -o have below */
+        if (checkopt(false, sopt, "-item", 2) ||
+            checkopt(false, sopt, "-i", 2)) {
+            char *ip = sparam;
+            nwhichlist = 0;
+            whichlist[0] = 0;
+            if (*ip == 0) ip = sopt + 2; /* compatible w/ old way */
+            do {
+                while (*ip != 0 && !isdigit((unsigned char)*ip)) ip++;
+                if (*ip != 0) {
+                    n = atoi(ip);
+                    whichlist[nwhichlist++] = n;
+                    while (*ip != 0 && isdigit((unsigned char)*ip))
+                        ip++;
+                }
+                /* n is only read here when *ip != 0, i.e. after the
+                 * branch above has just set it */
+            } while (*ip != 0 && n > 0 &&
+                     nwhichlist < kMaxwhichlist);
+            whichlist[nwhichlist++] = 0; /* 0 == stopsign for loop */
+            whichSeq = max(1, whichlist[0]);
+            iwhichlist = 1;
+            return kOptOkay;
+        }
+
+        if (checkopt(false, sopt, "-format",
+                     5)) { /* -format=phylip, -f2, -form=phylip */
+            if (*sparam == 0) {
+                for (sparam = sopt + 2;
+                     isalpha((unsigned char)*sparam); sparam++)
+                    ;
+            }
+            outform = parseformat(sparam);
+            return kOptOkay;
+        }
+        if (checkopt(false, sopt, "-f",
+                     2)) { /* compatible w/ -fphylip prior version */
+            if (*sparam == 0) sparam = sopt + 2;
+            outform = parseformat(sparam);
+            return kOptOkay;
+        }
+
+        if (checkopt(false, sopt, "-output", 3)) { /* -output=myseq */
+            if (*sparam == 0) {
+                for (sparam = sopt + 3;
+                     isalpha((unsigned char)*sparam); sparam++)
+                    ;
+            }
+            return openoutput(sparam);
+        }
+        if (checkopt(false, sopt, "-o",
+                     2)) { /* compatible w/ -omyseq prior version */
+            if (*sparam == 0) sparam = sopt + 2;
+            return openoutput(sparam);
+        }
+
+        if (checkopt(false, sopt, "-width", 2)) {
+            n = optnumber(sopt, sparam);
+            if (n > 0) gPretty.seqwidth = n;
+            return kOptOkay;
+        }
+
+        if (checkopt(false, sopt, "-tab", 4)) {
+            n = optnumber(sopt, sparam);
+            gPretty.tab = n;
+            return kOptOkay;
+        }
+
+        if (checkopt(false, sopt, "-gapcount", 4)) {
+            gPretty.baseonlynum = false;
+            /* if (*sparam >= ' ') gPretty.gapchar= *sparam; */
+            return kOptOkay;
+        }
+        if (checkopt(false, sopt, "-nointerleave", 8)) {
+            gPretty.noleaves = true;
+            return kOptOkay;
+        }
+
+        if (checkopt(false, sopt, "-nameleft", 7)) {
+            n = optnumber(sopt, sparam);
+            if (n > 0 && n < 50) gPretty.namewidth = n;
+            gPretty.nameleft = true;
+            return kOptOkay;
+        }
+        if (checkopt(false, sopt, "-nameright", 7)) {
+            n = optnumber(sopt, sparam);
+            if (n > 0 && n < 50) gPretty.namewidth = n;
+            gPretty.nameright = true;
+            return kOptOkay;
+        }
+        if (checkopt(false, sopt, "-nametop", 6)) {
+            gPretty.nametop = true;
+            return kOptOkay;
+        }
+
+        if (checkopt(false, sopt, "-numleft", 6)) {
+            n = optnumber(sopt, sparam);
+            if (n > 0 && n < 50) gPretty.numwidth = n;
+            gPretty.numleft = true;
+            return kOptOkay;
+        }
+        if (checkopt(false, sopt, "-numright", 6)) {
+            n = optnumber(sopt, sparam);
+            if (n > 0 && n < 50) gPretty.numwidth = n;
+            gPretty.numright = true;
+            return kOptOkay;
+        }
+
+        if (checkopt(false, sopt, "-numtop", 6)) {
+            gPretty.numtop = true;
+            return kOptOkay;
+        }
+        if (checkopt(false, sopt, "-numbottom", 6)) {
+            gPretty.numbot = true;
+            return kOptOkay;
+        }
+
+        /* no option matched */
+        usage();
+        return eOptionBad;
+    }
+
+    else {
+        /* an input file name: always store it in our own buffer, and
+         * never write past its end */
+        inputfile = inputfilestore;
+        *inputfile = '\0';
+        strncat(inputfile, sopt, sizeof(inputfilestore) - 1);
+        gotinputfile = (*inputfile != 0);
+        nfile++;
+        return kOptOkay;
+    }
+
+    /* return kOptNone; -- never here */
+}
+
+/* this program suffers some as it tries to be a quiet translator pipe
+   _and_ a noisy user interactor
+*/
+
+/* return is best for SIOW, okay for others */
+#ifdef SIOW
+#define Exit(a) return (a)
+siow_main(int argc, char *argv[])
+
+#else
+#define Exit(a) exit(a) + +main(int argc, char *argv[]) +#endif +{ + boolean closein = false; + short ifile, nseq, atseq, format, err = 0, seqtype = kDNA, nlines, + seqout = 0, phylvers = 2; + long i, skiplines, seqlen, seqlen0; + unsigned long checksum = 0, checkall = 0; + char *seq, *cp, *firstseq = NULL, *seqlist, *progname, tempname[256]; + char seqid[256], *seqidptr = seqid; + char stempstore[256], *stemp = stempstore; + FILE *ftmp, *fin, *fout; + long outindexmax = 0, noutindex = 0, *outindex = NULL; + +#define exit_main(err) \ + { \ + if (closeout) fclose(fout); \ + if (closein) fclose(fin); \ + if (*tempname != 0) remove(tempname); \ + Exit(err); \ + } + +#define indexout() \ + if (interleaved) { \ + if (noutindex >= outindexmax) { \ + outindexmax = noutindex + 20; \ + outindex = (long *)realloc( \ + outindex, sizeof(long) * outindexmax); \ + if (outindex == NULL) { \ + err = eMemFull; \ + erralert(err); \ + exit_main(err); \ + } \ + } \ + outindex[noutindex++] = ftell(fout); \ + } + + resetGlobals(); + foo = stdout; + progname = argv[0]; + *oname = 0; + *tempname = 0; + /* initialize gPretty ?? -- done in header */ + + for (i = 1; i < argc; i++) { + err = readopt(argv[i]); + if (err <= 0) exit_main(err); + } + + /* pipe input from stdin !? 
*/ + if (dopipe && !gotinputfile) { + int c; + tmpnam(tempname); + inputfile = tempname; + ftmp = fopen(inputfile, "w"); + if (!ftmp) { + erralert(eFileCreate); + exit_main(eFileCreate); + } + while ((c = getc(stdin)) != EOF) fputc(c, ftmp); + fclose(ftmp); + gotinputfile = true; + } + + quietly = (dopipe || (gotinputfile && (listonly || whichSeq != 0))); + + if (verbose || (!quietly && !gotinputfile)) fprintf(stderr, title); + ifile = 1; + + /* UI: Choose output */ + if (askout && !closeout && !quietly) { + askout = false; + fprintf( + stderr, + "\nName of output file (?=help, defaults to display): \n"); + gets(oname = onamestore); + skipwhitespace(oname); + if (*oname == '?') { + usage(); + exit_main(0); + } + else if (*oname != 0) { + closeout = true; + foo = fopen(oname, "w"); + if (!foo) { + erralert(eFileCreate); + exit_main(eFileCreate); + } + } + } + + fout = foo; + if (outform == kNoformat) outform = chooseFormat(quietly); + + /* set up formats ... */ + switch (outform) { + case kPhylip2: + interleaved = false; + phylvers = 2; + outform = kPhylip; + break; + + case kPhylip4: + interleaved = true; + phylvers = 4; + outform = kPhylip; + break; + + case kMSF: + case kPAUP: + interleaved = true; + break; + + case kPretty: + gPretty.isactive = true; + interleaved = true; + break; + } + + if (gPretty.isactive && gPretty.noleaves) interleaved = false; + if (interleaved) { + fout = ftmp = tmpfile(); + outindexmax = 30; + noutindex = 0; + outindex = (long *)malloc(outindexmax * sizeof(long)); + if (outindex == NULL) { + err = eMemFull; + erralert(err); + exit_main(err); + } + } + + /* big loop over all input files */ + do { + /* select next input file */ + gotinputfile = (*tempname != 0); + while ((ifile < argc) && (!gotinputfile)) { + if (*argv[ifile] != '-') { + strcpy(inputfile, argv[ifile]); + gotinputfile = (*inputfile != 0); + --nfile; + } + ifile++; + } + + while (!gotinputfile) { + fprintf(stderr, + "\nName an input sequence or -option: \n"); + inputfile = 
inputfilestore; + + gets(stemp = stempstore); + if (*stemp == 0) + goto fini; /* !! need this to finish work during + interactive use */ + stemp = strtok(stempstore, " \n\r\t"); + while (stemp) { + err = readopt(stemp); /* will read inputfile if + it exists */ + if (err < 0) exit_main(err); + stemp = strtok(NULL, " \n\r\t"); + } + } + /* thanks to AJB@UK.AC.DARESBURY.DLVH for this PHYLIP3 fix: */ + /* head for end (interleave if needed) */ + if (*inputfile == 0) break; + + format = seqFileFormat(inputfile, &skiplines, &err); + + if (err == 0) { +#ifdef NCBI + if (format == kASNseqentry || format == kASNseqset) + seqlist = listASNSeqs(inputfile, skiplines, + format, &nseq, &err); + else +#endif + seqlist = listSeqs(inputfile, skiplines, format, + &nseq, &err); + } + + if (err != 0) + erralert(err); + + else if (listonly) { + dumpSeqList(seqlist, format); + free(seqlist); + } + + else { + /* choose whichSeq if needed */ + if (nseq == 1 || chooseall || + (quietly && whichSeq == 0)) { + chooseall = true; + whichSeq = 1; + quietly = true; /* no loop */ + } + else if (whichSeq > nseq && quietly) { + erralert(eItemNotFound); + err = eItemNotFound; + } + else if (whichSeq > nseq || !quietly) { + dumpSeqList(seqlist, format); + fprintf(stderr, + "\nChoose a sequence (# or All): \n"); + gets(stemp = stempstore); + skipwhitespace(stemp); + if (to_lower(*stemp) == 'a') { + chooseall = true; + whichSeq = 1; + quietly = + true; /* !? this means we don't ask + for another file as well as + no more whichSeqs... */ + } + else if (isdigit(*stemp)) + whichSeq = atol(stemp); + else + whichSeq = 1; /* default */ + } + free(seqlist); + + if (false /*chooseall*/) { /* this isn't debugged + yet...*/ + fin = fopen(inputfile, "r"); + closein = true; + } + + while (whichSeq > 0 && whichSeq <= nseq) { + /* need to open multiple output files ? 
*/ + manyout = + ((chooseall || nwhichlist > 1) && + nseq > 1 && + (outform == kPlain || outform == kGCG)); + if (manyout) { + if (whichSeq == 1) + erralert(eOneFormat); + else if (closeout) { + sprintf(stemp, "%s_%d", oname, + whichSeq); + freopen(stemp, "w", fout); + fprintf(stderr, + "Writing sequence %d " + "to file %s\n", + whichSeq, stemp); + } + } + + if (closein) { + /* !! this fails... skips most seqs... + */ + /* !! in sequential read, must count + * seqs already read from whichSeq ... + */ + /* need major revision of ureadseq + * before we can do this */ + atseq = whichSeq - 1; + seqidptr = seqid; + seq = readSeqFp( + whichSeq, fin, skiplines, format, + &seqlen, &atseq, &err, seqidptr); + skiplines = 0; + } + else { + atseq = 0; + seqidptr = seqid; +#ifdef NCBI + if (format == kASNseqentry || + format == kASNseqset) { + seqidptr = NULL; + seq = readASNSeq( + whichSeq, inputfile, + skiplines, format, &seqlen, + &atseq, &err, &seqidptr); + } + else +#endif + seq = readSeq( + whichSeq, inputfile, + skiplines, format, &seqlen, + &atseq, &err, seqidptr); + } + + if (gPretty.degap) { + char *newseq; + long newlen; + newseq = + compressSeq(gPretty.gapchar, seq, + seqlen, &newlen); + if (newseq) { + free(seq); + seq = newseq; + seqlen = newlen; + } + } + + if (outform == kMSF) + checksum = + GCGchecksum(seq, seqlen, &checkall); + else if (verbose) + checksum = + seqchecksum(seq, seqlen, &checkall); + if (verbose) + fprintf(stderr, + "Sequence %d, length= %d, " + "checksum= %X, format= %s, id= " + "%s\n", + whichSeq, seqlen, checksum, + formatstr(format), seqidptr); + + if (err != 0) + erralert(err); + else { + /* format fixes that writeseq doesn't do + */ + switch (outform) { + case kPIR: + if (seqout == 0) + fprintf( + foo, + "\\\\\\\n"); + break; + case kASN1: + if (seqout == 0) + fprintf( + foo, + kASN1headline); + break; + + case kPhylip: + if (seqout == 0) { + if (!interleaved) { /* bug, nseq is for 1st infile only */ + if (chooseall) + i = nseq; + else + i 
= 1; + if (phylvers >= + 4) + fprintf( + foo, + " %d %d\n", + i, + seqlen); + else + fprintf( + foo, + " %d %d YF\n", + i, + seqlen); + } + seqlen0 = + seqlen; + } + else if (seqlen != + seqlen0) { + erralert( + eUnequalSize); + if (seqlen < + seqlen0) + seq = (char + *) + realloc( + seq, + seqlen0); + for (i = seqlen; + i < + seqlen0; + i++) + seq[i] = + gPretty + .gapchar; + seqlen = + seqlen0; + seq[seqlen] = 0; + } + break; + + case kPAUP: + if (seqout == 0) { + seqtype = + getseqtype( + seq, + seqlen); + seqlen0 = + seqlen; + } + else if (seqlen != + seqlen0) { + erralert( + eUnequalSize); + if (seqlen < + seqlen0) + seq = (char + *) + realloc( + seq, + seqlen0); + for (i = seqlen; + i < + seqlen0; + i++) + seq[i] = + gPretty + .gapchar; + seqlen = + seqlen0; + seq[seqlen] = 0; + } + break; + } + + if (doupper) + for (i = 0; i < seqlen; i++) + seq[i] = + to_upper(seq[i]); + else if (dolower) + for (i = 0; i < seqlen; i++) + seq[i] = + to_lower(seq[i]); + + if (doreverse) { + long j, k; + char ctemp; + for (j = 0, k = seqlen - 1; + j <= k; j++, k--) { + ctemp = compl [seq[j] - + ' ']; + seq[j] = compl [seq[k] - + ' ']; + seq[k] = ctemp; + } + } + + if ((gPretty.isactive || + outform == kPAUP) && + gPretty.domatch && + firstseq != NULL) { + for (i = 0; i < seqlen; i++) + if (seq[i] == + firstseq[i]) + seq[i] = + gPretty + .matchchar; + } + + if (gPretty.isactive && + gPretty.numtop && seqout == 0) { + gPretty.numline = 1; + indexout(); + (void)writeSeq(fout, seq, + seqlen, outform, + seqidptr); + gPretty.numline = 2; + indexout(); + (void)writeSeq(fout, seq, + seqlen, outform, + seqidptr); + gPretty.numline = 0; + } + + indexout(); + nlines = writeSeq(fout, seq, seqlen, + outform, seqidptr); + seqout++; + } + + if ((gPretty.isactive || outform == kPAUP) && + gPretty.domatch && firstseq == NULL) { + firstseq = seq; + seq = NULL; + } + else if (seq != NULL) { + free(seq); + seq = NULL; + } + +#ifdef NCBI + if ((format == kASNseqentry || + format == kASNseqset) && 
+ seqidptr && seqidptr != seqid) + free(seqidptr); +#endif + if (chooseall) + whichSeq++; + else if (iwhichlist < nwhichlist) + whichSeq = whichlist[iwhichlist++]; + else + whichSeq = 0; + } + if (closein) { + fclose(fin); + closein = false; + } + } + whichSeq = 0; + } while (nfile > 0 || !quietly); + +fini: + if (firstseq) { + free(firstseq); + firstseq = NULL; + } + if (err || listonly) exit_main(err); + + if (gPretty.isactive && gPretty.numbot) { + gPretty.numline = 2; + indexout(); + (void)writeSeq(fout, seq, seqlen, outform, seqidptr); + gPretty.numline = 1; + indexout(); + (void)writeSeq(fout, seq, seqlen, outform, seqidptr); + gPretty.numline = 0; + } + + if (outform == kMSF) { + if (*oname) + cp = oname; + else + cp = inputfile; + fprintf(foo, + "\n %s MSF: %d Type: N January 01, 1776 12:00 " + "Check: %d ..\n\n", + cp, seqlen, checkall); + } + + if (outform == kPAUP) { + fprintf(foo, "#NEXUS\n"); + if (*oname) + cp = oname; + else + cp = inputfile; + fprintf(foo, "[%s -- data title]\n\n", cp); + /* ! now have header lines for each sequence... put them before + * "begin data;... 
*/ + } + + if (outform == kPhylip && interleaved) { + if (phylvers >= 4) + fprintf(foo, " %d %d\n", seqout, seqlen); + else + fprintf(foo, " %d %d YF\n", seqout, seqlen); + } + + if (interleaved) { + /* interleave species lines in true output */ + /* nlines is # lines / sequence */ + short iline, j, leaf, iseq; + char *s = stempstore; + + indexout(); + noutindex--; /* mark eof */ + + for (leaf = 0; leaf < nlines; leaf++) { + if (outform == kMSF && leaf == 1) { + fputs("//\n\n", foo); + } + if (outform == kPAUP && leaf == 1) { + switch (seqtype) { + case kDNA: + cp = "dna"; + break; + case kRNA: + cp = "rna"; + break; + case kNucleic: + cp = "dna"; + break; + case kAmino: + cp = "protein"; + break; + case kOtherSeq: + cp = "dna"; + break; + } + fprintf(foo, "\nbegin data;\n"); + fprintf(foo, " dimensions ntax=%d nchar=%d;\n", + seqout, seqlen); + fprintf( + foo, + " format datatype=%s interleave missing=%c", + cp, gPretty.gapchar); + if (gPretty.domatch) + fprintf(foo, " matchchar=%c", + gPretty.matchchar); + fprintf(foo, ";\n matrix\n"); + } + + for (iseq = 0; iseq < noutindex; iseq++) { + fseek(ftmp, outindex[iseq], 0); + for (iline = 0; iline <= leaf; iline++) + if (!fgets(s, 256, ftmp)) *s = 0; + if (ftell(ftmp) <= outindex[iseq + 1]) + fputs(s, foo); + } + + for (j = 0; j < gPretty.interline; j++) + fputs("\n", foo); /* some want spacer line */ + } + fclose(ftmp); /* tmp disappears */ + fout = foo; + } + + if (outform == kASN1) fprintf(foo, "} }\n"); + if (outform == kPAUP) fprintf(foo, ";\n end;\n"); + + if (outindex != NULL) free(outindex); + exit_main(0); +} + diff --git a/ureadasn.c b/ureadasn.c new file mode 100644 index 0000000..1548594 --- /dev/null +++ b/ureadasn.c @@ -0,0 +1,324 @@ +/* ureadasn.c + -- parse, mangle and otherwise rewrite ASN1 file/entries for readseq reading + -- from NCBI toolkit (ncbi.nlm.nih.gov:/toolkit) +*/ + +#ifdef NCBI + +#include +#include +#include + +/* NCBI toolkit :include: must be on lib path */ +#include +#include + 
+#define UREADASN +#include "ureadseq.h" + +#pragma segment ureadasn + +/* this stuff is hacked up from tofasta.c of ncbitools */ +#define kBaseAny 0 +#define kBaseNucleic 1 +#define kBaseAmino 2 + +typedef struct tofasta { + Boolean idonly; + short *seqnum; + short whichSeq; + char **seq, **seqid; + long *seqlen; +} FastaDat, PNTR FastaPtr; + +void BioseqRawToRaw(BioseqPtr bsp, Boolean idonly, short whichSeq, + short *seqnum, char **seq, char **seqid, long *seqlen) +{ + SeqPortPtr spp; + SeqIdPtr bestid; + Uint1 repr, code, residue; + CharPtr tmp, title; + long outlen, outmax; + char localid[256], *sp; + + /* !!! this may be called several times for a single sequence + because SeqEntryExplore looks for parts and joins them... + assume seq, seqid, seqlen may contain data (or NULL) + */ + if (bsp == NULL) return; + repr = Bioseq_repr(bsp); + if (!(repr == Seq_repr_raw || repr == Seq_repr_const)) return; + + (*seqnum)++; + if (!(whichSeq == *seqnum || whichSeq == 0)) return; + + bestid = SeqIdFindBest(bsp->id, (Uint1)0); + title = BioseqGetTitle(bsp); + if (idonly) { + sprintf(localid, " %d) ", *seqnum); + tmp = localid + strlen(localid) - 1; + } + else { + strcpy(localid, " "); + tmp = localid; + } + tmp = SeqIdPrint(bestid, tmp, PRINTID_FASTA_SHORT); + tmp = StringMove(tmp, " "); + StringNCpy(tmp, title, 200); + /* fprintf(stderr,"BioseqRawToRaw: localid='%s'\n",localid); */ + + /* < seqid is fixed storage */ + /* strcpy( *seqid, localid); */ + /* < seqid is variable sized */ + outmax = strlen(localid) + 3; + if (*seqid == NULL) { + *seqid = (char *)malloc(outmax); + if (*seqid == NULL) return; + strcpy(*seqid, localid); + } + else { + outmax += strlen(*seqid) + 2; + *seqid = (char *)realloc(*seqid, outmax); + if (*seqid == NULL) return; + if (!idonly) strcat(*seqid, "; "); + strcat(*seqid, localid); + } + + if (idonly) { + strcat(*seqid, "\n"); + return; + } + + if (ISA_na(bsp->mol)) + code = Seq_code_iupacna; + else + code = Seq_code_iupacaa; + spp = 
SeqPortNew(bsp, 0, -1, 0, code); + SeqPortSeek(spp, 0, SEEK_SET); + + sp = *seq; + if (sp == NULL) { /* this is always true now !? */ + outlen = 0; + outmax = 500; + sp = (char *)malloc(outmax); + } + else { + outlen = strlen(sp); + outmax = outlen + 500; + sp = (char *)realloc(sp, outmax); + } + if (sp == NULL) return; + + while ((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF) { + if (outlen >= outmax) { + outmax = outlen + 500; + sp = (char *)realloc(sp, outmax); + if (sp == NULL) return; + } + sp[outlen++] = residue; + } + sp = (char *)realloc(sp, outlen + 1); + if (sp != NULL) sp[outlen] = '\0'; + *seq = sp; + *seqlen = outlen; + SeqPortFree(spp); + return; +} + +static void SeqEntryRawseq(SeqEntryPtr sep, Pointer data, Int4 index, + Int2 indent) +{ + FastaPtr tfa; + BioseqPtr bsp; + + if (!IS_Bioseq(sep)) return; + bsp = (BioseqPtr)sep->data.ptrvalue; + tfa = (FastaPtr)data; + BioseqRawToRaw(bsp, tfa->idonly, tfa->whichSeq, tfa->seqnum, tfa->seq, + tfa->seqid, tfa->seqlen); +} + +void SeqEntryToRaw(SeqEntryPtr sep, Boolean idonly, short whichSeq, + short *seqnum, char **seq, char **seqid, long *seqlen) +{ + FastaDat tfa; + + if (sep == NULL) return; + tfa.idonly = idonly; + tfa.seqnum = seqnum; + tfa.whichSeq = whichSeq; + tfa.seq = seq; + tfa.seqid = seqid; + tfa.seqlen = seqlen; + SeqEntryExplore(sep, (Pointer)&tfa, SeqEntryRawseq); +} + +char *listASNSeqs( + const char *filename, const long skiplines, + const short format, /* note: this is kASNseqentry or kASNseqset */ + short *nseq, short *error) +{ + AsnIoPtr aip = NULL; + SeqEntryPtr the_set; + AsnTypePtr atp, atp2; + AsnModulePtr amp; + Boolean inIsBinary = + FALSE; /* damn, why can't asn routines test this? 
*/ + char *seq = NULL; + char *seqid = NULL, stemp[256]; + long seqlen; + int i, count; + + *nseq = 0; + *error = 0; + + /* asn dictionary setups */ + /*fprintf(stderr,"listASNSeqs: SeqEntryLoad\n");*/ + if (!SeqEntryLoad()) + goto errxit; /* sequence alphabets (and sequence parse trees) + */ + amp = AsnAllModPtr(); /* get pointer to all loaded ASN.1 modules */ + if (amp == NULL) goto errxit; + atp = AsnFind("Bioseq-set"); /* get the initial type pointers */ + if (atp == NULL) goto errxit; + atp2 = AsnFind("Bioseq-set.seq-set.E"); + if (atp2 == NULL) goto errxit; + + /*fprintf(stderr,"listASNSeqs: AsnIoOpen\n");*/ + /* open the ASN.1 input file in the right mode */ + /* !!!! THIS FAILS when filename has MAC PATH (& other paths?) + * (:folder:filename) */ + if ((aip = AsnIoOpen(filename, inIsBinary ? "rb" : "r")) == NULL) + goto errxit; + for (i = 0; i < skiplines; i++) + fgets(stemp, 255, + aip->fp); /* this may mess up asn routines... */ + + if (!ErrSetLog("stderr")) + goto errxit; + else + ErrSetOpts(ERR_CONTINUE, + ERR_LOG_ON); /*?? 
log errors instead of die */ + + if (format == kASNseqentry) { /* read one Seq-entry */ + /*fprintf(stderr,"listASNSeqs: SeqEntryAsnRead\n");*/ + the_set = SeqEntryAsnRead(aip, NULL); + SeqEntryToRaw(the_set, true, 0, nseq, &seq, &seqid, &seqlen); + if (seq) free(seq); + seq = NULL; + SeqEntryFree(the_set); + } + else { /* read Seq-entry's from a Bioseq-set */ + count = 0; + /*fprintf(stderr,"listASNSeqs: AsnReadId\n");*/ + while ((atp = AsnReadId(aip, amp, atp)) != NULL) { + if (atp == atp2) { /* top level Seq-entry */ + the_set = SeqEntryAsnRead(aip, atp); + SeqEntryToRaw(the_set, true, 0, nseq, &seq, + &seqid, &seqlen); + SeqEntryFree(the_set); + if (seq) free(seq); + seq = NULL; + } + else + AsnReadVal(aip, atp, NULL); + count++; + } + } + + AsnIoClose(aip); + *error = 0; + return seqid; + +errxit: + AsnIoClose(aip); + if (seqid) free(seqid); + *error = eASNerr; + return NULL; +} + +char *readASNSeq( + const short whichEntry, const char *filename, const long skiplines, + const short format, /* note: this is kASNseqentry or kASNseqset */ + long *seqlen, short *nseq, short *error, char **seqid) +{ + AsnIoPtr aip = NULL; + SeqEntryPtr the_set; + AsnTypePtr atp, atp2; + AsnModulePtr amp; + Boolean inIsBinary = + FALSE; /* damn, why can't asn routines test this? */ + char *seq, stemp[200]; + int i, count; + + *seqlen = 0; + *nseq = 0; + *error = 0; + seq = NULL; + + /*fprintf(stderr,"readASNseq: SeqEntryLoad\n");*/ + /* asn dictionary setups */ + if (!SeqEntryLoad()) + goto errxit; /* sequence alphabets (and sequence parse trees) + */ + amp = AsnAllModPtr(); /* get pointer to all loaded ASN.1 modules */ + if (amp == NULL) goto errxit; + atp = AsnFind("Bioseq-set"); /* get the initial type pointers */ + if (atp == NULL) goto errxit; + atp2 = AsnFind("Bioseq-set.seq-set.E"); + if (atp2 == NULL) goto errxit; + + /* open the ASN.1 input file in the right mode */ + /*fprintf(stderr,"readASNseq: AsnIoOpen(%s)\n", filename);*/ + if ((aip = AsnIoOpen(filename, inIsBinary ? 
"rb" : "r")) == NULL) + goto errxit; + for (i = 0; i < skiplines; i++) + fgets(stemp, 255, + aip->fp); /* this may mess up asn routines... */ + + if (!ErrSetLog("stderr")) + goto errxit; + else + ErrSetOpts(ERR_CONTINUE, + ERR_LOG_ON); /*?? log errors instead of die */ + + seq = NULL; + if (format == kASNseqentry) { /* read one Seq-entry */ + /*fprintf(stderr,"readASNseq: SeqEntryAsnRead\n");*/ + the_set = SeqEntryAsnRead(aip, NULL); + SeqEntryToRaw(the_set, false, whichEntry, nseq, &seq, seqid, + seqlen); + SeqEntryFree(the_set); + goto goodexit; + } + + else { /* read Seq-entry's from a Bioseq-set */ + count = 0; + /*fprintf(stderr,"readASNseq: AsnReadId\n");*/ + while ((atp = AsnReadId(aip, amp, atp)) != NULL) { + if (atp == atp2) { /* top level Seq-entry */ + the_set = SeqEntryAsnRead(aip, atp); + SeqEntryToRaw(the_set, false, whichEntry, nseq, + &seq, seqid, seqlen); + SeqEntryFree(the_set); + if (*nseq >= whichEntry) goto goodexit; + } + else + AsnReadVal(aip, atp, NULL); + count++; + } + } + +goodexit: + AsnIoClose(aip); + *error = 0; + return seq; + +errxit: + AsnIoClose(aip); + *error = eASNerr; + if (seq) free(seq); + return NULL; +} + +#endif /*NCBI*/ diff --git a/ureadseq.c b/ureadseq.c new file mode 100644 index 0000000..dbd93b8 --- /dev/null +++ b/ureadseq.c @@ -0,0 +1,2121 @@ +/* File: ureadseq.c + * + * Reads and writes nucleic/protein sequence in various + * formats. Data files may have multiple sequences. + * + * Copyright 1990 by d.g.gilbert + * biology dept., indiana university, bloomington, in 47405 + * e-mail: gilbertd@bio.indiana.edu + * + * This program may be freely copied and used by anyone. + * Developers are encourged to incorporate parts in their + * programs, rather than devise their own private sequence + * format. + * + * This should compile and run with any ANSI C compiler. 
+ * + */ + +#include +#include +#include + +#define UREADSEQ_G +#include "ureadseq.h" + +#pragma segment ureadseq + +int Strcasecmp(const char *a, const char *b) /* from Nlm_StrICmp */ +{ + int diff, done; + if (a == b) return 0; + done = 0; + while (!done) { + diff = to_upper(*a) - to_upper(*b); + if (diff) return diff; + if (*a == '\0') + done = 1; + else { + a++; + b++; + } + } + return 0; +} + +int Strncasecmp(const char *a, const char *b, long maxn) /* from Nlm_StrNICmp */ +{ + int diff, done; + if (a == b) return 0; + done = 0; + while (!done) { + diff = to_upper(*a) - to_upper(*b); + if (diff) return diff; + if (*a == '\0') + done = 1; + else { + a++; + b++; + maxn--; + if (!maxn) done = 1; + } + } + return 0; +} + +#ifndef Local +#define Local static /* local functions */ +#endif + +#define kStartLength 500 + +const char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*"; +const char *primenuc = "ACGTU"; +const char *protonly = "EFIPQZ"; + +const char kNocountsymbols[5] = "_.-?"; +const char stdsymbols[6] = "_.-*?"; +const char allsymbols[32] = "_.-*?<>{}[]()!@#$%^&=+;:'/|`~\"\\"; +static const char *seqsymbols = allsymbols; + +const char nummask[11] = "0123456789"; +const char nonummask[11] = "~!@#$%^&*("; + +/* + use general form of isseqchar -- all chars + symbols. + no formats except nbrf (?) use symbols in data area as + anything other than sequence chars. 
+*/ + +/* Local variables for readSeq: */ +struct ReadSeqVars { + short choice, err, nseq; + long seqlen, maxseq, seqlencount; + short topnseq; + long topseqlen; + const char *fname; + char *seq, *seqid, matchchar; + boolean allDone, done, filestart, addit; + FILE *f; + long linestart; + char s[256], *sp; + + int (*isseqchar)(); + /* int (*isseqchar)(int c); << sgi cc hates (int c) */ +}; + +int isSeqChar(int c) { return (isalpha(c) || strchr(seqsymbols, c)); } + +int isSeqNumChar(int c) { return (isalnum(c) || strchr(seqsymbols, c)); } + +int isAnyChar(int c) { return isascii(c); /* wrap in case isascii is macro */ } + +Local void readline(FILE *f, char *s, long *linestart) +{ + char *cp; + + *linestart = ftell(f); + if (NULL == fgets(s, 256, f)) + *s = 0; + else { + cp = strchr(s, '\n'); + if (cp != NULL) *cp = 0; + } +} + +Local void getline(struct ReadSeqVars *V) +{ + readline(V->f, V->s, &V->linestart); +} + +Local void ungetline(struct ReadSeqVars *V) { fseek(V->f, V->linestart, 0); } + +Local void addseq(char *s, struct ReadSeqVars *V) +{ + char *ptr; + + if (V->addit) + while (*s != 0) { + if ((V->isseqchar)(*s)) { + if (V->seqlen >= V->maxseq) { + V->maxseq += kStartLength; + ptr = (char *)realloc(V->seq, + V->maxseq + 1); + if (ptr == NULL) { + V->err = eMemFull; + return; + } + else + V->seq = ptr; + } + V->seq[(V->seqlen)++] = *s; + } + s++; + } +} + +Local void countseq(char *s, struct ReadSeqVars *V) +/* this must count all valid seq chars, for some formats (paup-sequential) even + if we are skipping seq... 
*/ +{ + while (*s != 0) { + if ((V->isseqchar)(*s)) { + (V->seqlencount)++; + } + s++; + } +} + +Local void addinfo(char *s, struct ReadSeqVars *V) +{ + char s2[256], *si; + boolean saveadd; + + si = s2; + while (*s == ' ') s++; + sprintf(si, " %d) %s\n", V->nseq, s); + + saveadd = V->addit; + V->addit = true; + V->isseqchar = isAnyChar; + addseq(si, V); + V->addit = saveadd; + V->isseqchar = isSeqChar; +} + +Local void readLoop(short margin, boolean addfirst, + boolean (*endTest)(boolean *addend, boolean *ungetend, + struct ReadSeqVars *V), + struct ReadSeqVars *V) +{ + boolean addend = false; + boolean ungetend = false; + + V->nseq++; + if (V->choice == kListSequences) + V->addit = false; + else + V->addit = (V->nseq == V->choice); + if (V->addit) V->seqlen = 0; + + if (addfirst) addseq(V->s, V); + do { + getline(V); + V->done = feof(V->f); + V->done |= (*endTest)(&addend, &ungetend, V); + if (V->addit && (addend || !V->done) && + (strlen(V->s) > margin)) { + addseq((V->s) + margin, V); + } + } while (!V->done); + + if (V->choice == kListSequences) + addinfo(V->seqid, V); + else { + V->allDone = (V->nseq >= V->choice); + if (V->allDone && ungetend) ungetline(V); + } +} + +Local boolean endIG(boolean *addend, boolean *ungetend, struct ReadSeqVars *V) +{ + *addend = true; /* 1 or 2 occur in line w/ bases */ + *ungetend = false; + return ((strchr(V->s, '1') != NULL) || (strchr(V->s, '2') != NULL)); +} + +Local void readIG(struct ReadSeqVars *V) +{ + /* 18Aug92: new IG format -- ^L between sequences in place of ";" */ + char *si; + + while (!V->allDone) { + do { + getline(V); + for (si = V->s; *si != 0 && *si < ' '; si++) + *si = ' '; /* drop controls */ + if (*si == 0) *V->s = 0; /* chop line to empty */ + } while (!(feof(V->f) || ((*V->s != 0) && (*V->s != ';')))); + if (feof(V->f)) + V->allDone = true; + else { + strcpy(V->seqid, V->s); + readLoop(0, false, endIG, V); + } + } +} + +Local boolean endStrider(boolean *addend, boolean *ungetend, + struct ReadSeqVars 
*V) +{ + *addend = false; + *ungetend = false; + return (strstr(V->s, "//") != NULL); +} + +Local void readStrider(struct ReadSeqVars *V) +{ /* ? only 1 seq/file ? */ + + while (!V->allDone) { + getline(V); + if (strstr(V->s, "; DNA sequence ") == V->s) + strcpy(V->seqid, (V->s) + 16); + else + strcpy(V->seqid, (V->s) + 1); + while ((!feof(V->f)) && (*V->s == ';')) { + getline(V); + } + if (feof(V->f)) + V->allDone = true; + else + readLoop(0, true, endStrider, V); + } +} + +Local boolean endPIR(boolean *addend, boolean *ungetend, struct ReadSeqVars *V) +{ + *addend = false; + *ungetend = (strstr(V->s, "ENTRY") == V->s); + return ((strstr(V->s, "///") != NULL) || *ungetend); +} + +Local void readPIR(struct ReadSeqVars *V) +{ /*PIR -- many seqs/file */ + + while (!V->allDone) { + while (!(feof(V->f) || strstr(V->s, "ENTRY") || + strstr(V->s, "SEQUENCE"))) + getline(V); + strcpy(V->seqid, (V->s) + 16); + while (!(feof(V->f) || strstr(V->s, "SEQUENCE") == V->s)) + getline(V); + readLoop(0, false, endPIR, V); + + if (!V->allDone) { + while (!( + feof(V->f) || + ((*V->s != 0) && (strstr(V->s, "ENTRY") == V->s)))) + getline(V); + } + if (feof(V->f)) V->allDone = true; + } +} + +Local boolean endGB(boolean *addend, boolean *ungetend, struct ReadSeqVars *V) +{ + *addend = false; + *ungetend = (strstr(V->s, "LOCUS") == V->s); + return ((strstr(V->s, "//") != NULL) || *ungetend); +} + +Local void readGenBank(struct ReadSeqVars *V) +{ /*GenBank -- many seqs/file */ + + while (!V->allDone) { + strcpy(V->seqid, (V->s) + 12); + while (!(feof(V->f) || strstr(V->s, "ORIGIN") == V->s)) + getline(V); + readLoop(0, false, endGB, V); + + if (!V->allDone) { + while (!( + feof(V->f) || + ((*V->s != 0) && (strstr(V->s, "LOCUS") == V->s)))) + getline(V); + } + if (feof(V->f)) V->allDone = true; + } +} + +Local boolean endNBRF(boolean *addend, boolean *ungetend, struct ReadSeqVars *V) +{ + char *a; + + if ((a = strchr(V->s, '*')) != NULL) { /* end of 1st seq */ + /* "*" can be valid base 
symbol, drop it here */ + *a = 0; + *addend = true; + *ungetend = false; + return (true); + } + else if (*V->s == '>') { /* start of next seq */ + *addend = false; + *ungetend = true; + return (true); + } + else + return (false); +} + +Local void readNBRF(struct ReadSeqVars *V) +{ + while (!V->allDone) { + strcpy(V->seqid, (V->s) + 4); + getline(V); /*skip title-junk line*/ + readLoop(0, false, endNBRF, V); + if (!V->allDone) { + while (!(feof(V->f) || (*V->s != 0 && *V->s == '>'))) + getline(V); + } + if (feof(V->f)) V->allDone = true; + } +} + +Local boolean endPearson(boolean *addend, boolean *ungetend, + struct ReadSeqVars *V) +{ + *addend = false; + *ungetend = true; + return (*V->s == '>'); +} + +Local void readPearson(struct ReadSeqVars *V) +{ + while (!V->allDone) { + strcpy(V->seqid, (V->s) + 1); + readLoop(0, false, endPearson, V); + if (!V->allDone) { + while ( + !(feof(V->f) || ((*V->s != 0) && (*V->s == '>')))) + getline(V); + } + if (feof(V->f)) V->allDone = true; + } +} + +Local boolean endEMBL(boolean *addend, boolean *ungetend, struct ReadSeqVars *V) +{ + *addend = false; + *ungetend = (strstr(V->s, "ID ") == V->s); + return ((strstr(V->s, "//") != NULL) || *ungetend); +} + +Local void readEMBL(struct ReadSeqVars *V) +{ + while (!V->allDone) { + strcpy(V->seqid, (V->s) + 5); + do { + getline(V); + } while (!(feof(V->f) | (strstr(V->s, "SQ ") == V->s))); + + readLoop(0, false, endEMBL, V); + if (!V->allDone) { + while ( + !(feof(V->f) | ((*V->s != '\0') & + (strstr(V->s, "ID ") == V->s)))) + getline(V); + } + if (feof(V->f)) V->allDone = true; + } +} + +Local boolean endZuker(boolean *addend, boolean *ungetend, + struct ReadSeqVars *V) +{ + *addend = false; + *ungetend = true; + return (*V->s == '('); +} + +Local void readZuker(struct ReadSeqVars *V) +{ + /*! 
1st string is Zuker's Fortran format */ + + while (!V->allDone) { + getline(V); /*s == "seqLen seqid string..."*/ + strcpy(V->seqid, (V->s) + 6); + readLoop(0, false, endZuker, V); + if (!V->allDone) { + while ( + !(feof(V->f) | ((*V->s != '\0') & (*V->s == '(')))) + getline(V); + } + if (feof(V->f)) V->allDone = true; + } +} + +Local boolean endFitch(boolean *addend, boolean *ungetend, + struct ReadSeqVars *V) +{ + /* this is a somewhat shaky end, + 1st char of line is non-blank for seq. title + */ + *addend = false; + *ungetend = true; + return (*V->s != ' '); +} + +Local void readFitch(struct ReadSeqVars *V) +{ + boolean first; + + first = true; + while (!V->allDone) { + if (!first) strcpy(V->seqid, V->s); + readLoop(0, first, endFitch, V); + if (feof(V->f)) V->allDone = true; + first = false; + } +} + +Local void readPlain(struct ReadSeqVars *V) +{ + V->nseq++; + V->addit = (V->choice > 0); + if (V->addit) V->seqlen = 0; + addseq(V->seqid, V); /*from above..*/ + if (V->fname != NULL) + sprintf(V->seqid, "%s [Unknown form]", V->fname); + else + sprintf(V->seqid, " [Unknown form]"); + do { + addseq(V->s, V); + V->done = feof(V->f); + getline(V); + } while (!V->done); + if (V->choice == kListSequences) addinfo(V->seqid, V); + V->allDone = true; +} + +Local void readUWGCG(struct ReadSeqVars *V) +{ + /* + 10nov91: Reading GCG files casued duplication of last line when + EOF followed that line !!! + fix: getline now sets *V->s = 0 + */ + char *si; + + V->nseq++; + V->addit = (V->choice > 0); + if (V->addit) V->seqlen = 0; + strcpy(V->seqid, V->s); + /*writeseq: " %s Length: %d (today) Check: %d ..\n" */ + /*drop above or ".." 
from id*/ + if (si = strstr(V->seqid, " Length: ")) + *si = 0; + else if (si = strstr(V->seqid, "..")) + *si = 0; + do { + V->done = feof(V->f); + getline(V); + if (!V->done) addseq((V->s), V); + } while (!V->done); + if (V->choice == kListSequences) addinfo(V->seqid, V); + V->allDone = true; +} + +Local void readOlsen(struct ReadSeqVars *V) +{ /* G. Olsen /print output from multiple sequence editor */ + + char *si, *sj, *sk, *sm, sid[40], snum[20]; + boolean indata = false; + int snumlen; + + V->addit = (V->choice > 0); + if (V->addit) V->seqlen = 0; + rewind(V->f); + V->nseq = 0; + do { + getline(V); + V->done = feof(V->f); + + if (V->done && !(*V->s)) + break; + else if (indata) { + if ((si = strstr(V->s, sid)) + /* && (strstr(V->s, snum) == si - snumlen - 1) ) { + */ + && (sm = strstr(V->s, snum)) && + (sm < si - snumlen)) { + /* Spaces are valid alignment data !! */ + /* 17Oct91: Error, the left margin is 21 not 22! + */ + /* dropped some nucs up to now -- my example + * file was right shifted ! */ + /* variable right id margin, drop id-2 spaces at + * end */ + /* + VMS CC COMPILER (VAXC031) mess up: + -- Index of 21 is chopping 1st nuc on VMS + systems Only! Byte-for-byte same ame + rnasep.olsen sequence file ! + */ + + /* si = (V->s)+21; < was this before VMS CC + * wasted my time */ + si += 10; /* use strstr index plus offset to + outfox VMS CC bug */ + + if (sk = strstr(si, sid)) *(sk - 2) = 0; + for (sk = si; *sk != 0; sk++) { + if (*sk == ' ') *sk = '.'; + /* 18aug92: !! some olsen masks are + * NUMBERS !! which addseq eats */ + else if (isdigit(*sk)) + *sk = nonummask[*sk - '0']; + } + + addseq(si, V); + } + } + + else if (sk = strstr(V->s, "): ")) { /* seq info header line */ + /* 18aug92: correct for diff seqs w/ same name -- use + * number, e.g. 
*/ + /* 3 (Agr.tume): agrobacterium.prna 18-JUN-1987 + * 16:12 */ + /* 328 (Agr.tume): agrobacterium.prna XYZ 19-DEC-1992 + */ + (V->nseq)++; + si = 1 + strchr(V->s, '('); + *sk = ' '; + if (V->choice == kListSequences) + addinfo(si, V); + else if (V->nseq == V->choice) { + strcpy(V->seqid, si); + sj = strchr(V->seqid, ':'); + while (*(--sj) == ' ') + ; + while (--sj != V->seqid) { + if (*sj == ' ') *sj = '_'; + } + + *sk = 0; + while (*(--sk) == ' ') *sk = 0; + strcpy(sid, si); + + si = V->s; + while ((*si <= ' ') && (*si != 0)) si++; + snumlen = 0; + while (si[snumlen] > ' ' && snumlen < 20) { + snum[snumlen] = si[snumlen]; + snumlen++; + } + snum[snumlen] = 0; + } + } + + else if (strstr(V->s, "identity: Data:")) { + indata = true; + if (V->choice == kListSequences) V->done = true; + } + + } while (!V->done); + + V->allDone = true; +} /*readOlsen*/ + +Local void readMSF(struct ReadSeqVars *V) +{ /* gcg's MSF, mult. sequence format, interleaved ! */ + + char *si, *sj, sid[128]; + boolean indata = false; + int atseq = 0, iline = 0; + + V->addit = (V->choice > 0); + if (V->addit) V->seqlen = 0; + rewind(V->f); + V->nseq = 0; + do { + getline(V); + V->done = feof(V->f); + + if (V->done && !(*V->s)) + break; + else if (indata) { + /*somename ...gpvedai .......t.. aaigr..vad tvgtgptnse + * aipaltaaet */ + /* E gvenae.kgv tentna.tad fvaqpvylpe .nqt...... 
+ * kv.affynrs */ + + si = V->s; + skipwhitespace(si); + /* for (sj= si; isalnum(*sj); sj++) ; bug -- cdelwiche + * uses "-", "_" and others in names*/ + for (sj = si; *sj > ' '; sj++) + ; + *sj = 0; + if (*si) { + if ((0 == strcmp(si, sid))) { + addseq(sj + 1, V); + } + iline++; + } + } + + else if (NULL != + (si = strstr(V->s, + "Name: "))) { /* seq info header line */ + /* Name: somename Len: 100 Check: 7009 + * Weight: 1.00 */ + + (V->nseq)++; + si += 6; + if (V->choice == kListSequences) + addinfo(si, V); + else if (V->nseq == V->choice) { + strcpy(V->seqid, si); + si = V->seqid; + skipwhitespace(si); + /* for (sj= si; isalnum(*sj); sj++) ; -- bug */ + for (sj = si; *sj > ' '; sj++) + ; + *sj = 0; + strcpy(sid, si); + } + } + + else if (strstr(V->s, "//") /*== V->s*/) { + indata = true; + iline = 0; + if (V->choice == kListSequences) V->done = true; + } + + } while (!V->done); + + V->allDone = true; +} /*readMSF*/ + +Local void readPAUPinterleaved(struct ReadSeqVars *V) +{ /* PAUP mult. sequence format, interleaved or sequential! */ + + char *si, *sj, *send, sid[40], sid1[40], saveseq[255]; + boolean first = true, indata = false, domatch; + int atseq = 0, iline = 0, ifmc, saveseqlen = 0; + +#define fixmatchchar(s) \ + { \ + for (ifmc = 0; ifmc < saveseqlen; ifmc++) \ + if (s[ifmc] == V->matchchar) s[ifmc] = saveseq[ifmc]; \ + } + + V->addit = (V->choice > 0); + V->seqlencount = 0; + if (V->addit) V->seqlen = 0; + /* rewind(V->f); V->nseq= 0; << do in caller !*/ + indata = true; /* call here after we find "matrix" */ + domatch = (V->matchchar > 0); + + do { + getline(V); + V->done = feof(V->f); + + if (V->done && !(*V->s)) + break; + else if (indata) { + /* [ 1 1 1 + * ]*/ + /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR + * cttacatcct*/ + /* chimp ................a.t. .c.................a + * ..........*/ + /* !! 
need to correct for V->matchchar */ + si = V->s; + skipwhitespace(si); + if (strchr(si, ';')) indata = false; + + if (isalnum(*si)) { + /* valid data line starts w/ a left-justified + * seq name in columns [0..8] */ + if (first) { + (V->nseq)++; + if (V->nseq >= V->topnseq) + first = false; + for (sj = si; isalnum(*sj); sj++) + ; + send = sj; + skipwhitespace(sj); + if (V->choice == kListSequences) { + *send = 0; + addinfo(si, V); + } + else if (V->nseq == V->choice) { + if (domatch) { + if (V->nseq == 1) { + strcpy(saveseq, + sj); + saveseqlen = + strlen( + saveseq); + } + else + fixmatchchar( + sj); + } + addseq(sj, V); + *send = 0; + strcpy(V->seqid, si); + strcpy(sid, si); + if (V->nseq == 1) + strcpy(sid1, sid); + } + } + + else if ((strstr(si, sid) == si)) { + while (isalnum(*si)) si++; + skipwhitespace(si); + if (domatch) { + if (V->nseq == 1) { + strcpy(saveseq, si); + saveseqlen = + strlen(saveseq); + } + else + fixmatchchar(si); + } + addseq(si, V); + } + + else if (domatch && (strstr(si, sid1) == si)) { + strcpy(saveseq, si); + saveseqlen = strlen(saveseq); + } + + iline++; + } + } + + else if (strstr(V->s, "matrix")) { + indata = true; + iline = 0; + if (V->choice == kListSequences) V->done = true; + } + + } while (!V->done); + + V->allDone = true; +} /*readPAUPinterleaved*/ + +Local void readPAUPsequential(struct ReadSeqVars *V) +{ /* PAUP mult. sequence format, interleaved or sequential! */ + char *si, *sj; + boolean atname = true, indata = false; + + V->addit = (V->choice > 0); + if (V->addit) V->seqlen = 0; + V->seqlencount = 0; + /* rewind(V->f); V->nseq= 0; << do in caller !*/ + indata = true; /* call here after we find "matrix" */ + do { + getline(V); + V->done = feof(V->f); + + if (V->done && !(*V->s)) + break; + else if (indata) { + /* [ 1 1 1 + * ]*/ + /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR + * cttacatcct*/ + /* aagcttcaccggcgcagtca ttctcataatcgcccacggR + * cttacatcct*/ + /* chimp ................a.t. 
.c.................a + * ..........*/ + /* ................a.t. .c.................a + * ..........*/ + + si = V->s; + skipwhitespace(si); + if (strchr(si, ';')) indata = false; + if (isalnum(*si)) { + /* valid data line starts w/ a left-justified + * seq name in columns [0..8] */ + if (atname) { + (V->nseq)++; + V->seqlencount = 0; + atname = false; + sj = si + 1; + while (isalnum(*sj)) sj++; + if (V->choice == kListSequences) { + /* !! we must count bases to + * know when topseqlen is + * reached ! */ + countseq(sj, V); + if (V->seqlencount >= + V->topseqlen) + atname = true; + *sj = 0; + addinfo(si, V); + } + else if (V->nseq == V->choice) { + addseq(sj, V); + V->seqlencount = V->seqlen; + if (V->seqlencount >= + V->topseqlen) + atname = true; + *sj = 0; + strcpy(V->seqid, si); + } + else { + countseq(sj, V); + if (V->seqlencount >= + V->topseqlen) + atname = true; + } + } + + else if (V->nseq == V->choice) { + addseq(V->s, V); + V->seqlencount = V->seqlen; + if (V->seqlencount >= V->topseqlen) + atname = true; + } + else { + countseq(V->s, V); + if (V->seqlencount >= V->topseqlen) + atname = true; + } + } + } + + else if (strstr(V->s, "matrix")) { + indata = true; + atname = true; + if (V->choice == kListSequences) V->done = true; + } + + } while (!V->done); + + V->allDone = true; +} /*readPAUPsequential*/ + +Local void readPhylipInterleaved(struct ReadSeqVars *V) +{ + char *si, *sj; + boolean first = true; + int iline = 0; + + V->addit = (V->choice > 0); + if (V->addit) V->seqlen = 0; + V->seqlencount = 0; + /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); << topnseq == 0 + * !!! bad scan !! 
*/ + si = V->s; + skipwhitespace(si); + V->topnseq = atoi(si); + while (isdigit(*si)) si++; + skipwhitespace(si); + V->topseqlen = atol(si); + /* fprintf(stderr,"Phylip-ileaf: topnseq=%d topseqlen=%d\n",V->topnseq, + * V->topseqlen); */ + + do { + getline(V); + V->done = feof(V->f); + + if (V->done && !(*V->s)) break; + si = V->s; + skipwhitespace(si); + if (*si != 0) { + if (first) { /* collect seq names + seq, as + fprintf(outf,"%-10s ",seqname); */ + (V->nseq)++; + if (V->nseq >= V->topnseq) first = false; + sj = V->s + 10; /* past name, start of data */ + if (V->choice == kListSequences) { + *sj = 0; + addinfo(si, V); + } + else if (V->nseq == V->choice) { + addseq(sj, V); + *sj = 0; + strcpy(V->seqid, si); + } + } + else if (iline % V->nseq == V->choice - 1) { + addseq(si, V); + } + iline++; + } + } while (!V->done); + + V->allDone = true; +} /*readPhylipInterleaved*/ + +Local boolean endPhylipSequential(boolean *addend, boolean *ungetend, + struct ReadSeqVars *V) +{ + *addend = false; + *ungetend = false; + countseq(V->s, V); + return V->seqlencount >= V->topseqlen; +} + +Local void readPhylipSequential(struct ReadSeqVars *V) +{ + short i; + char *si; + /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); < ? bad sscan ? 
*/ + si = V->s; + skipwhitespace(si); + V->topnseq = atoi(si); + while (isdigit(*si)) si++; + skipwhitespace(si); + V->topseqlen = atol(si); + getline(V); + while (!V->allDone) { + V->seqlencount = 0; + strncpy(V->seqid, (V->s), 10); + V->seqid[10] = 0; + for (i = 0; i < 10 && V->s[i]; i++) V->s[i] = ' '; + readLoop(0, true, endPhylipSequential, V); + if (feof(V->f)) V->allDone = true; + } +} + +Local void readSeqMain(struct ReadSeqVars *V, const long skiplines_, + const short format_) +{ +#define tolowerstr(s) \ + { \ + long Itlwr, Ntlwr = strlen(s); \ + for (Itlwr = 0; Itlwr < Ntlwr; Itlwr++) \ + s[Itlwr] = to_lower(s[Itlwr]); \ + } + + boolean gotuw; + long l; + + V->linestart = 0; + V->matchchar = 0; + if (V->f == NULL) + V->err = eFileNotFound; + else { + for (l = skiplines_; l > 0; l--) getline(V); + + do { + getline(V); + for (l = strlen(V->s); (l > 0) && (V->s[l] == ' '); l--) + ; + } while ((l == 0) && !feof(V->f)); + + if (feof(V->f)) + V->err = eNoData; + + else + switch (format_) { + case kPlain: + readPlain(V); + break; + case kIG: + readIG(V); + break; + case kStrider: + readStrider(V); + break; + case kGenBank: + readGenBank(V); + break; + case kPIR: + readPIR(V); + break; + case kNBRF: + readNBRF(V); + break; + case kPearson: + readPearson(V); + break; + case kEMBL: + readEMBL(V); + break; + case kZuker: + readZuker(V); + break; + case kOlsen: + readOlsen(V); + break; + case kMSF: + readMSF(V); + break; + + case kPAUP: { + boolean done = false; + boolean interleaved = false; + char *cp; + /* rewind(V->f); V->nseq= 0; ?? assume + * it is at top ?? skiplines ... 
*/ + while (!done) { + getline(V); + tolowerstr(V->s); + if (strstr(V->s, "matrix")) + done = true; + if (strstr(V->s, "interleav")) + interleaved = true; + if (NULL != + (cp = + strstr(V->s, "ntax="))) + V->topnseq = + atoi(cp + 5); + if (NULL != + (cp = strstr(V->s, + "nchar="))) + V->topseqlen = + atoi(cp + 6); + if (NULL != + (cp = strstr( + V->s, "matchchar="))) { + cp += 10; + if (*cp == '\'') + cp++; + else if (*cp == '"') + cp++; + V->matchchar = *cp; + } + } + if (interleaved) + readPAUPinterleaved(V); + else + readPAUPsequential(V); + } break; + + /* kPhylip: ! can't determine in middle of file + * which type it is...*/ + /* test for interleave or sequential and use + * Phylip4(ileave) or Phylip2 */ + case kPhylip2: + readPhylipSequential(V); + break; + case kPhylip4: /* == kPhylip3 */ + readPhylipInterleaved(V); + break; + + default: + V->err = eUnknownFormat; + break; + + case kFitch: + strcpy(V->seqid, V->s); + getline(V); + readFitch(V); + break; + + case kGCG: + do { + gotuw = (strstr(V->s, "..") != + NULL); + if (gotuw) readUWGCG(V); + getline(V); + } while (!(feof(V->f) || V->allDone)); + break; + } + } + + V->filestart = false; + V->seq[V->seqlen] = 0; /* stick a string terminator on it */ +} + +char *readSeqFp(const short whichEntry_, /* index to sequence in file */ + FILE *fp_, /* pointer to open seq file */ + const long skiplines_, + const short format_, /* sequence file format */ + long *seqlen_, /* return seq size */ + short *nseq_, /* number of seqs in file, for listSeqs() */ + short *error_, /* return error */ + char *seqid_) /* return seq name/info */ +{ + struct ReadSeqVars V; + + if (format_ < kMinFormat || format_ > kMaxFormat) { + *error_ = eUnknownFormat; + *seqlen_ = 0; + return NULL; + } + + V.choice = whichEntry_; + V.fname = NULL; /* don't know */ + V.seq = (char *)calloc(1, kStartLength + 1); + V.maxseq = kStartLength; + V.seqlen = 0; + V.seqid = seqid_; + + V.f = fp_; + V.filestart = (ftell(fp_) == 0); + /* !! 
in sequential read, must remove current seq position from + * choice/whichEntry_ counter !! ... */ + if (V.filestart) + V.nseq = 0; + else + V.nseq = *nseq_; /* track where we are in file...*/ + + *V.seqid = '\0'; + V.err = 0; + V.nseq = 0; + V.isseqchar = isSeqChar; + if (V.choice == kListSequences) + ; /* leave as is */ + else if (V.choice <= 0) + V.choice = 1; /* default ?? */ + V.addit = (V.choice > 0); + V.allDone = false; + + readSeqMain(&V, skiplines_, format_); + + *error_ = V.err; + *seqlen_ = V.seqlen; + *nseq_ = V.nseq; + return V.seq; +} + +char *readSeq(const short whichEntry_, /* index to sequence in file */ + const char *filename_, /* file name */ + const long skiplines_, + const short format_, /* sequence file format */ + long *seqlen_, /* return seq size */ + short *nseq_, /* number of seqs in file, for listSeqs() */ + short *error_, /* return error */ + char *seqid_) /* return seq name/info */ +{ + struct ReadSeqVars V; + + if (format_ < kMinFormat || format_ > kMaxFormat) { + *error_ = eUnknownFormat; + *seqlen_ = 0; + return NULL; + } + + V.choice = whichEntry_; + V.fname = filename_; /* don't need to copy string, just ptr to it */ + V.seq = (char *)calloc(1, kStartLength + 1); + V.maxseq = kStartLength; + V.seqlen = 0; + V.seqid = seqid_; + + V.f = NULL; + *V.seqid = '\0'; + V.err = 0; + V.nseq = 0; + V.isseqchar = isSeqChar; + if (V.choice == kListSequences) + ; /* leave as is */ + else if (V.choice <= 0) + V.choice = 1; /* default ?? 
*/ + V.addit = (V.choice > 0); + V.allDone = false; + + V.f = fopen(V.fname, "r"); + V.filestart = true; + + readSeqMain(&V, skiplines_, format_); + + if (V.f != NULL) fclose(V.f); + *error_ = V.err; + *seqlen_ = V.seqlen; + *nseq_ = V.nseq; + return V.seq; +} + +char *listSeqs(const char *filename_, /* file name */ + const long skiplines_, + const short format_, /* sequence file format */ + short *nseq_, /* number of seqs in file, for listSeqs() */ + short *error_) /* return error */ +{ + char seqid[256]; + long seqlen; + + return readSeq(kListSequences, filename_, skiplines_, format_, &seqlen, + nseq_, error_, seqid); +} + +short seqFileFormat(/* return sequence format number, see ureadseq.h */ + const char *filename, + long *skiplines, /* return #lines to skip any junk like mail + header */ + short *error) /* return any error value or 0 */ +{ + FILE *fseq; + short format; + + fseq = fopen(filename, "r"); + format = seqFileFormatFp(fseq, skiplines, error); + if (fseq != NULL) fclose(fseq); + return format; +} + +short seqFileFormatFp( + FILE *fseq, + long *skiplines, /* return #lines to skip any junk like mail header */ + short *error) /* return any error value or 0 */ +{ + boolean foundDNA = false, foundIG = false, foundStrider = false, + foundGB = false, foundPIR = false, foundEMBL = false, + foundNBRF = false, foundPearson = false, foundFitch = false, + foundPhylip = false, foundZuker = false, gotolsen = false, + gotpaup = false, gotasn1 = false, gotuw = false, gotMSF = false, + isfitch = false, isphylip = false, done = false; + short format = kUnknown; + int nlines = 0, k, splen = 0, otherlines = 0, aminolines = 0, + dnalines = 0; + char sp[256]; + long linestart = 0; + int maxlines2check = 500; + +#define ReadOneLine(sp) \ + { \ + done |= (feof(fseq)); \ + readline(fseq, sp, &linestart); \ + if (!done) { \ + splen = strlen(sp); \ + ++nlines; \ + } \ + } + + *skiplines = 0; + *error = 0; + if (fseq == NULL) { + *error = eFileNotFound; + return kNoformat; + } 
+ + while (!done) { + ReadOneLine(sp); + + /* check for mailer head & skip past if found */ + if (nlines < 4 && !done) { + if ((strstr(sp, "From ") == sp) || + (strstr(sp, "Received:") == sp)) { + do { + /* skip all lines until find one blank + * line */ + ReadOneLine(sp); + if (!done) + for (k = 0; (k < splen) && + (sp[k] == ' '); + k++) + ; + } while ((!done) && (k < splen)); + *skiplines = nlines; /* !? do we want #lines or + #bytes ?? */ + } + } + + if (sp == NULL || *sp == 0) + ; /* nada */ + + /* high probability identities: */ + + else if (strstr(sp, "MSF:") && strstr(sp, "Type:") && + strstr(sp, "Check:")) + gotMSF = true; + + else if ((strstr(sp, "..") != NULL) && + (strstr(sp, "Check:") != NULL)) + gotuw = true; + + else if (strstr(sp, "identity: Data:") != NULL) + gotolsen = true; + + else if (strstr(sp, "::=") && + (strstr(sp, "Bioseq") || /* Bioseq or Bioseq-set */ + strstr(sp, "Seq-entry") || + strstr( + sp, + "Seq-submit"))) /* can we read submit format? */ + gotasn1 = true; + + else if (strstr(sp, "#NEXUS") == sp) + gotpaup = true; + + /* uncertain identities: */ + + else if (*sp == ';') { + if (strstr(sp, "Strider") != NULL) + foundStrider = true; + else + foundIG = true; + } + + else if (strstr(sp, "LOCUS") == sp) + foundGB = true; + else if (strstr(sp, "ORIGIN") == sp) + foundGB = true; + + else if (strstr(sp, "ENTRY ") == + sp) /* ? 
also (strcmp(sp,"\\\\\\")==0) */ + foundPIR = true; + else if (strstr(sp, "SEQUENCE") == sp) + foundPIR = true; + + else if (*sp == '>') { + if (sp[3] == ';') + foundNBRF = true; + else + foundPearson = true; + } + + else if (strstr(sp, "ID ") == sp) + foundEMBL = true; + else if (strstr(sp, "SQ ") == sp) + foundEMBL = true; + + else if (*sp == '(') + foundZuker = true; + + else { + if (nlines - *skiplines == 1) { + int ispp = 0, ilen = 0; + sscanf(sp, "%d%d", &ispp, &ilen); + if (ispp > 0 && ilen > 0) isphylip = true; + } + else if (isphylip && nlines - *skiplines == 2) { + int tseq; + tseq = getseqtype(sp + 10, strlen(sp + 10)); + if (isalpha(*sp) /* 1st letter in 2nd line must + be of a name */ + && (tseq != kOtherSeq)) /* sequence section + must be okay */ + foundPhylip = true; + } + + for (k = 0, isfitch = true; isfitch & (k < splen); + k++) { + if (k % 4 == 0) + isfitch &= (sp[k] == ' '); + else + isfitch &= (sp[k] != ' '); + } + if (isfitch & (splen > 20)) foundFitch = true; + + /* kRNA && kDNA are fairly certain...*/ + switch (getseqtype(sp, splen)) { + case kOtherSeq: + otherlines++; + break; + case kAmino: + if (splen > 20) aminolines++; + break; + case kDNA: + case kRNA: + if (splen > 20) dnalines++; + break; + case kNucleic: + break; /* not much info ? */ + } + } + + /* pretty certain */ + if (gotolsen) { + format = kOlsen; + done = true; + } + else if (gotMSF) { + format = kMSF; + done = true; + } + else if (gotasn1) { + /* !! we need to look further and return kASNseqentry | + * kASNseqset */ + /* + seqentry key is Seq-entry ::= + seqset key is Bioseq-set ::= + ?? can't read these yet w/ ncbi tools ?? + Seq-submit ::= + Bioseq ::= << fails both bioseq-seq and seq-entry + parsers ! + */ + if (strstr(sp, "Bioseq-set")) + format = kASNseqset; + else if (strstr(sp, "Seq-entry")) + format = kASNseqentry; + else + format = kASN1; /* other form, we can't yet + read... 
*/ + done = true; + } + else if (gotpaup) { + format = kPAUP; + done = true; + } + + else if (gotuw) { + if (foundIG) + format = + kIG; /* a TOIG file from GCG for certain */ + else + format = kGCG; + done = true; + } + + else if ((dnalines > 1) || done || (nlines > maxlines2check)) { + /* decide on most likely format */ + /* multichar idents: */ + if (foundStrider) + format = kStrider; + else if (foundGB) + format = kGenBank; + else if (foundPIR) + format = kPIR; + else if (foundEMBL) + format = kEMBL; + else if (foundNBRF) + format = kNBRF; + /* single char idents: */ + else if (foundIG) + format = kIG; + else if (foundPearson) + format = kPearson; + else if (foundZuker) + format = kZuker; + /* digit ident: */ + else if (foundPhylip) + format = kPhylip; + /* spacing ident: */ + else if (foundFitch) + format = kFitch; + /* no format chars: */ + else if (otherlines > 0) + format = kUnknown; + else if (dnalines > 1) + format = kPlain; + else if (aminolines > 1) + format = kPlain; + else + format = kUnknown; + + done = true; + } + + /* need this for possible long header in olsen format */ + else if (strstr(sp, "): ") != NULL) + maxlines2check++; + } + + if (format == kPhylip) { + /* check for interleaved or sequential -- really messy */ + int tname, tseq; + long i, j, nspp = 0, nlen = 0, ilen, leaf = 0, seq = 0; + char *ps; + + rewind(fseq); + for (i = 0; i < *skiplines; i++) ReadOneLine(sp); + nlines = 0; + ReadOneLine(sp); + sscanf(sp, "%d%d", &nspp, &nlen); + ReadOneLine(sp); /* 1st seq line */ + for (ps = sp + 10, ilen = 0; *ps != 0; ps++) + if (isprint(*ps)) ilen++; + + for (i = 1; i < nspp; i++) { + ReadOneLine(sp); + + tseq = getseqtype(sp + 10, strlen(sp + 10)); + tname = getseqtype(sp, 10); + for (j = 0, ps = sp; isspace(*ps) && j < 10; ps++, j++) + ; + for (ps = sp; *ps != 0; ps++) + if (isprint(*ps)) ilen++; + + /* find probable interleaf or sequential ... 
*/ + if (j >= 9) + seq += 10; /* pretty certain not ileaf */ + else { + if (tseq != tname) + leaf++; + else + seq++; + if (tname == kDNA || tname == kRNA) + seq++; + else + leaf++; + } + + if (ilen <= nlen && j < 9) { + if (tname == kOtherSeq) + leaf += 10; + else if (tname == kAmino || tname == kDNA || + tname == kRNA) + seq++; + else + leaf++; + } + else if (ilen > nlen) { + ilen = 0; + } + } + for (nspp *= 2; i < nspp; + i++) { /* this should be only bases if interleaf */ + ReadOneLine(sp); + + tseq = getseqtype(sp + 10, strlen(sp + 10)); + tname = getseqtype(sp, 10); + for (ps = sp; *ps != 0; ps++) + if (isprint(*ps)) ilen++; + for (j = 0, ps = sp; isspace(*ps) && j < 10; ps++, j++) + ; + if (j < 9) { + if (tname == kOtherSeq) seq += 10; + if (tseq != tname) + seq++; + else + leaf++; + if (tname == kDNA || tname == kRNA) + leaf++; + else + seq++; + } + if (ilen > nlen) { + if (j > 9) + leaf += 10; /* must be a name here for + sequent */ + else if (tname == kOtherSeq) + seq += 10; + ilen = 0; + } + } + + if (leaf > seq) + format = kPhylip4; + else + format = kPhylip2; + } + + return (format); +#undef ReadOneLine +} /* SeqFileFormat */ + +unsigned long GCGchecksum(const char *seq, const long seqlen, + unsigned long *checktotal) +/* GCGchecksum */ +{ + register long i, check = 0, count = 0; + + for (i = 0; i < seqlen; i++) { + count++; + check += count * to_upper(seq[i]); + if (count == 57) count = 0; + } + check %= 10000; + *checktotal += check; + *checktotal %= 10000; + return check; +} + +/* Table of CRC-32's of all single byte values (made by makecrc.c of ZIP source) + */ +const unsigned long crctab[] = { + 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, + 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, + 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, + 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, + 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, + 0x646ba8c0L, 
0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, + 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, + 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, + 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, + 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, + 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, + 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, + 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, + 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, + 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, + 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, + 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, + 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, + 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, + 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, + 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, + 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, + 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, + 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, + 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, + 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, + 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, + 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, + 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, + 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, + 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, + 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, + 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, + 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, + 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, + 
0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, + 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, + 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, + 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, + 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, + 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, + 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, + 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, + 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, + 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, + 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, + 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, + 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, + 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, + 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, + 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, + 0x2d02ef8dL}; + +unsigned long CRC32checksum(const char *seq, const long seqlen, + unsigned long *checktotal) +/*CRC32checksum: modified from CRC-32 algorithm found in ZIP compression source + */ +{ + register unsigned long c = 0xffffffffL; + register long n = seqlen; + + while (n--) { + c = crctab[((int)c ^ (to_upper(*seq))) & 0xff] ^ (c >> 8); + seq++; /* fixed aug'98 finally */ + } + c = c ^ 0xffffffffL; + *checktotal += c; + return c; +} + +short getseqtype(const char *seq, const long seqlen) +{ /* return sequence kind: kDNA, kRNA, kProtein, kOtherSeq, ??? 
*/ + char c; + short i, maxtest; + short na = 0, aa = 0, po = 0, nt = 0, nu = 0, ns = 0, no = 0; + + maxtest = min(300, seqlen); + for (i = 0; i < maxtest; i++) { + c = to_upper(seq[i]); + if (strchr(protonly, c)) + po++; + else if (strchr(primenuc, c)) { + na++; + if (c == 'T') + nt++; + else if (c == 'U') + nu++; + } + else if (strchr(aminos, c)) + aa++; + else if (strchr(seqsymbols, c)) + ns++; + else if (isalpha(c)) + no++; + } + + if ((no > 0) || (po + aa + na == 0)) return kOtherSeq; + /* ?? test for probability of kOtherSeq ?, e.g., + else if (po+aa+na / maxtest < 0.70) return kOtherSeq; + */ + else if (po > 0) + return kAmino; + else if (aa == 0) { + if (nu > nt) + return kRNA; + else + return kDNA; + } + else if (na > aa) + return kNucleic; + else + return kAmino; +} /* getseqtype */ + +char *compressSeq(const char gapc, const char *seq, const long seqlen, + long *newlen) +{ + register char *a, *b; + register long i; + char *newseq; + + *newlen = 0; + if (!seq) return NULL; + newseq = (char *)malloc(seqlen + 1); + if (!newseq) return NULL; + for (a = (char *)seq, b = newseq, i = 0; *a != 0; a++) + if (*a != gapc) { + *b++ = *a; + i++; + } + *b = '\0'; + newseq = (char *)realloc(newseq, i + 1); + *newlen = i; + return newseq; +} + +/*** +char *rtfhead = "{\\rtf1\\defformat\\mac\\deff2 \ +{\\fonttbl\ + {\\f1\\fmodern Courier;}{\\f2\\fmodern Monaco;}\ + {\\f3\\fswiss Helvetica;}{\\f4\\fswiss Geneva;}\ + {\\f5\\froman Times;}{\\f6\\froman Palatino;}\ + {\\f7\\froman New Century Schlbk;}{\\f8\\ftech Symbol;}}\ +{\\stylesheet\ + {\\s1 \\f5\\fs20 \\sbasedon0\\snext1 name;}\ + {\\s2 \\f3\\fs20 \\sbasedon0\\snext2 num;}\ + {\\s3 \\f1\\f21 \\sbasedon0\\snext3 seq;}}"; + +char *rtftail = "}"; +****/ + +short writeSeq(FILE *outf, const char *seq, const long seqlen, + const short outform, const char *seqid) +/* dump sequence to standard output */ +{ + const short kSpaceAll = -9; +#define kMaxseqwidth 250 + + boolean baseonlynum = + false; /* nocountsymbols -- only 
count true bases, not "-" */ + short numline = 0; /* only true if we are writing seq number line (for + interleave) */ + boolean numright = false, numleft = false; + boolean nameright = false, nameleft = false; + short namewidth = 8, numwidth = 8; + short spacer = 0, width = 50, tab = 0; + /* new parameters: width, spacer, those above... */ + + short linesout = 0, seqtype = kNucleic; + long i, j, l, l1, ibase; + char idword[31], endstr[10]; + char seqnamestore[128], *seqname = seqnamestore; + char s[kMaxseqwidth], *cp; + char nameform[10], numform[10], nocountsymbols[10]; + unsigned long checksum = 0, checktotal = 0; + + gPretty.atseq++; + skipwhitespace(seqid); + l = min(128, strlen(seqid)); + strncpy(seqnamestore, seqid, l); + seqname[l] = 0; + + sscanf(seqname, "%30s", idword); + sprintf(numform, "%d", seqlen); + numwidth = strlen(numform) + 1; + nameform[0] = '\0'; + + if (strstr(seqname, "checksum") != NULL) { + cp = strstr(seqname, "bases"); + if (cp != NULL) { + for (; (cp != seqname) && (*cp != ','); cp--) + ; + if (cp != seqname) *cp = 0; + } + } + + strcpy(endstr, ""); + l1 = 0; + + if (outform == kGCG || outform == kMSF) + checksum = GCGchecksum(seq, seqlen, &checktotal); + else + checksum = seqchecksum(seq, seqlen, &checktotal); + + switch (outform) { + case kPlain: + case kUnknown: /* no header, just sequence */ + strcpy(endstr, "\n"); /* end w/ extra blank line */ + break; + + case kOlsen: /* Olsen seq. editor takes plain nucs OR Genbank */ + case kGenBank: + fprintf(outf, "LOCUS %s %d bp\n", idword, + seqlen); + fprintf(outf, + "DEFINITION %s, %d bases, %X checksum.\n", + seqname, seqlen, checksum); + /* fprintf(outf,"ACCESSION %s\n", accnum); */ + fprintf(outf, "ORIGIN \n"); + spacer = 11; + numleft = true; + numwidth = 8; /* dgg. 1Feb93, patch for GDE fail to read + short numwidth */ + strcpy(endstr, "\n//"); + linesout += 4; + break; + + case kPIR: + /* somewhat like genbank... 
\\\*/ + /* fprintf(outf,"\\\\\\\n"); << only at top of file, not + * each entry... */ + fprintf(outf, "ENTRY %s \n", idword); + fprintf(outf, + "TITLE %s, %d bases, %X checksum.\n", + seqname, seqlen, checksum); + /* fprintf(outf,"ACCESSION %s\n", accnum); */ + fprintf(outf, "SEQUENCE \n"); + numwidth = 7; + width = 30; + spacer = kSpaceAll; + numleft = true; + strcpy(endstr, "\n///"); + /* run a top number line for PIR */ + for (j = 0; j < numwidth; j++) fputc(' ', outf); + for (j = 5; j <= width; j += 5) + fprintf(outf, "%10d", j); + fputc('\n', outf); + linesout += 5; + break; + + case kNBRF: + if (getseqtype(seq, seqlen) == kAmino) + fprintf(outf, ">P1;%s\n", idword); + else + fprintf(outf, ">DL;%s\n", idword); + fprintf(outf, "%s, %d bases, %X checksum.\n", seqname, + seqlen, checksum); + spacer = 11; + strcpy(endstr, "*\n"); + linesout += 3; + break; + + case kEMBL: + fprintf(outf, "ID %s\n", idword); + /* fprintf(outf,"AC %s\n", accnum); */ + fprintf(outf, "DE %s, %d bases, %X checksum.\n", + seqname, seqlen, checksum); + fprintf(outf, "SQ %d BP\n", seqlen); + strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/ + tab = 4; /** added 31jan91 */ + spacer = 11; /** added 31jan91 */ + width = 60; + linesout += 4; + break; + + case kGCG: + fprintf(outf, "%s\n", seqname); + /* fprintf(outf,"ACCESSION %s\n", accnum); */ + fprintf(outf, + " %s Length: %d (today) Check: %d ..\n", + idword, seqlen, checksum); + spacer = 11; + numleft = true; + strcpy(endstr, "\n"); /* this is insurance to help + prevent misreads at eof */ + linesout += 3; + break; + + case kStrider: /* ?? 
map ?*/ + fprintf(outf, "; ### from DNA Strider ;-)\n"); + fprintf( + outf, + "; DNA sequence %s, %d bases, %X checksum.\n;\n", + seqname, seqlen, checksum); + strcpy(endstr, "\n//"); + linesout += 3; + break; + + case kFitch: + fprintf(outf, "%s, %d bases, %X checksum.\n", seqname, + seqlen, checksum); + spacer = 4; + width = 60; + linesout += 1; + break; + + case kPhylip2: + case kPhylip4: + /* this is version 3.2/3.4 -- simplest way to write + version 3.3 is to write as version 3.2, then + re-read file and interleave the species lines */ + if (strlen(idword) > 10) idword[10] = 0; + fprintf(outf, "%-10s ", idword); + l1 = -1; + tab = 12; + spacer = 11; + break; + + case kASN1: + seqtype = getseqtype(seq, seqlen); + switch (seqtype) { + case kDNA: + cp = "dna"; + break; + case kRNA: + cp = "rna"; + break; + case kNucleic: + cp = "na"; + break; + case kAmino: + cp = "aa"; + break; + case kOtherSeq: + cp = "not-set"; + break; + } + fprintf(outf, " seq {\n"); + fprintf(outf, " id { local id %d },\n", + gPretty.atseq); + fprintf(outf, " descr { title \"%s\" },\n", seqid); + fprintf(outf, " inst {\n"); + fprintf(outf, + " repr raw, mol %s, length %d, topology " + "linear,\n", + cp, seqlen); + fprintf(outf, " seq-data\n"); + if (seqtype == kAmino) + fprintf(outf, " iupacaa \""); + else + fprintf(outf, " iupacna \""); + l1 = 17; + spacer = 0; + width = 78; + tab = 0; + strcpy(endstr, "\"\n } } ,"); + linesout += 7; + break; + + case kPAUP: + nameleft = true; + namewidth = 9; + spacer = 21; + width = 100; + tab = 0; /* 1; */ + /* strcpy(endstr,";\nend;"); << this is end of all + * seqs.. 
*/ + /* do a header comment line for paup */ + fprintf(outf, "[Name: %-16s Len:%6d Check: %8X]\n", + idword, seqlen, checksum); + linesout += 1; + break; + + case kPretty: + numline = gPretty.numline; + baseonlynum = gPretty.baseonlynum; + namewidth = gPretty.namewidth; + numright = gPretty.numright; + numleft = gPretty.numleft; + nameright = gPretty.nameright; + nameleft = gPretty.nameleft; + spacer = gPretty.spacer + 1; + width = gPretty.seqwidth; + tab = gPretty.tab; + /* also add rtf formatting w/ font, size, style */ + if (gPretty.nametop) { + fprintf(outf, + "Name: %-16s Len:%6d Check: %8X\n", + idword, seqlen, checksum); + linesout++; + } + break; + + case kMSF: + fprintf( + outf, + " Name: %-16s Len:%6d Check: %5d Weight: 1.00\n", + idword, seqlen, checksum); + linesout++; + nameleft = true; + namewidth = 15; /* need MAX namewidth here... */ + sprintf(nameform, "%%+%ds ", namewidth); + spacer = 11; + width = 50; + tab = 0; /* 1; */ + break; + + case kIG: + fprintf(outf, ";%s, %d bases, %X checksum.\n", seqname, + seqlen, checksum); + fprintf(outf, "%s\n", idword); + strcpy(endstr, "1"); /* == linear dna */ + linesout += 2; + break; + + default: + case kZuker: /* don't attempt Zuker's ftn format */ + case kPearson: + fprintf(outf, ">%s, %d bases, %X checksum.\n", seqname, + seqlen, checksum); + linesout += 1; + break; + } + + if (*nameform == 0) + sprintf(nameform, "%%%d.%ds ", namewidth, namewidth); + if (numline) + sprintf(numform, "%%%ds ", numwidth); + else + sprintf(numform, "%%%dd ", numwidth); + strcpy(nocountsymbols, kNocountsymbols); + if (baseonlynum) { + if (strchr(nocountsymbols, gPretty.gapchar) == NULL) { + strcat(nocountsymbols, " "); + nocountsymbols[strlen(nocountsymbols) - 1] = + gPretty.gapchar; + } + if (gPretty.domatch && + (cp = strchr(nocountsymbols, gPretty.matchchar)) != NULL) { + *cp = ' '; + } + } + + if (numline) { + *idword = 0; + } + + width = min(width, kMaxseqwidth); + for (i = 0, l = 0, ibase = 1; i < seqlen;) { + if (l1 < 0) 
+ l1 = 0; + else if (l1 == 0) { + if (nameleft) fprintf(outf, nameform, idword); + if (numleft) { + if (numline) + fprintf(outf, numform, ""); + else + fprintf(outf, numform, ibase); + } + for (j = 0; j < tab; j++) fputc(' ', outf); + } + + l1++; /* don't count spaces for width*/ + if (numline) { + if (spacer == kSpaceAll || + (spacer != 0 && (l + 1) % spacer == 1)) { + if (numline == 1) fputc(' ', outf); + s[l++] = ' '; + } + if (l1 % 10 == 1 || l1 == width) { + if (numline == 1) fprintf(outf, "%-9d ", i + 1); + s[l++] = '|'; /* == put a number here */ + } + else + s[l++] = ' '; + i++; + } + + else { + if (spacer == kSpaceAll || + (spacer != 0 && (l + 1) % spacer == 1)) + s[l++] = ' '; + if (!baseonlynum) + ibase++; + else if (0 == strchr(nocountsymbols, seq[i])) + ibase++; + s[l++] = seq[i++]; + } + + if (l1 == width || i == seqlen) { + if (outform == kPretty) + for (; l1 < width; l1++) { + if (spacer == kSpaceAll || + (spacer != 0 && + (l + 1) % spacer == 1)) + s[l++] = ' '; + s[l++] = ' '; /* pad w/ blanks */ + } + s[l] = '\0'; + l = 0; + l1 = 0; + + if (numline) { + if (numline == 2) + fprintf( + outf, "%s", + s); /* finish numberline ! and | */ + } + else { + if (i == seqlen) + fprintf(outf, "%s%s", s, endstr); + else + fprintf(outf, "%s", s); + if (numright || nameright) fputc(' ', outf); + if (numright) fprintf(outf, numform, ibase - 1); + if (nameright) fprintf(outf, nameform, idword); + } + fputc('\n', outf); + linesout++; + } + } + return linesout; +} /*writeSeq*/ + +/* End file: ureadseq.c */ diff --git a/ureadseq.h b/ureadseq.h new file mode 100644 index 0000000..29d71b5 --- /dev/null +++ b/ureadseq.h @@ -0,0 +1,172 @@ +/* File: ureadseq.h + * + * Header for module UReadSeq: shared constants, the prettyopts settings record and the read/write prototypes. This header includes nothing itself; the prototypes below use FILE, so include <stdio.h> before it. + */ + +#ifndef UREADSEQ_H +#define UREADSEQ_H +/* Basic types and helper macros. min/max are fully parenthesized so they can sit inside larger expressions, but each argument may still be evaluated twice: pass side-effect-free expressions. skipwhitespace advances its pointer argument in place past leading characters <= ' ' and stops at the terminating NUL; it expands to a brace block, not an expression. NOTE(review): where char is signed, bytes >= 0x80 also compare <= ' ' and are skipped; confirm that is intended. */ +typedef char boolean; +#define NEWLINE '\n' +#define false 0 +#define true 1 +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define max(a, b) ((a) > (b) ? (a) : (b))
+#define skipwhitespace(string) \ + { \ + while (*(string) <= ' ' && *(string) != 0) (string)++; \ + } + +/* NLM strings: character helpers that assume A-Z and a-z are contiguous and differ by ' ' (as in ASCII); (c) may be evaluated more than once */ +#define is_upper(c) ('A' <= (c) && (c) <= 'Z') +#define is_lower(c) ('a' <= (c) && (c) <= 'z') +#define to_lower(c) ((char)(is_upper(c) ? (c) + ' ' : (c))) +#define to_upper(c) ((char)(is_lower(c) ? (c) - ' ' : (c))) + +/* readSeq errors (all negative); presumably reported through the short *error out-parameters declared below -- confirm in ureadseq.c */ +#define eFileNotFound -1 +#define eNoData -2 +#define eMemFull -3 +#define eItemNotFound -4 +#define eOneFormat -5 +#define eUnequalSize -6 +#define eFileCreate -7 +#define eUnknownFormat -8 +#define eOptionBad -9 +#define eASNerr -10 + +/* magic number for readSeq(whichEntry) to give seq list */ +#define kListSequences -1 + +/* sequence types parsed by getseqtype (writeSeq compares its result against these) */ +#define kOtherSeq 0 +#define kDNA 1 +#define kRNA 2 +#define kNucleic 3 +#define kAmino 4 + +/* formats known to readSeq; these are also the outform values writeSeq switches on */ +#define kIG 1 +#define kGenBank 2 +#define kNBRF 3 +#define kEMBL 4 +#define kGCG 5 +#define kStrider 6 +#define kFitch 7 +#define kPearson 8 +#define kZuker 9 +#define kOlsen 10 +#define kPhylip2 11 +#define kPhylip4 12 +#define kPhylip3 kPhylip4 +#define kPhylip kPhylip4 +#define kPlain 13 /* keep this at #13 */ +#define kPIR 14 +#define kMSF 15 +#define kASN1 16 +#define kPAUP 17 +#define kPretty 18 + +#define kMaxFormat 18 /* equals kPretty, the highest format number above */ +#define kMinFormat 1 +#define kNoformat -1 /* format not tested */ +#define kUnknown 0 /* format not determinable */ + +/* subsidiary types */ +#define kASNseqentry 51 +#define kASNseqset 52 + +#define kPhylipInterleave 61 +#define kPhylipSequential 62 +/* Layout options for kPretty output; writeSeq copies most of them from the global gPretty when outform is kPretty */ +typedef struct { + boolean isactive, baseonlynum; /* baseonlynum: number only residues outside the no-count symbol set */ + boolean numright, numleft, numtop, numbot; /* numleft/numright: print the residue number before/after each line */ + boolean nameright, nameleft, nametop; /* nameleft/nameright: print the name before/after each line; nametop: emit a Name/Len/Check line first */ + boolean noleaves, domatch, degap; + char matchchar, gapchar; + short numline, atseq; /* numline: nonzero while a number line is being written; atseq: incremented by writeSeq for every sequence */ + short namewidth, numwidth; + short interline, spacer, seqwidth, tab; /* writeSeq uses spacer + 1 as its spacing interval, seqwidth as residues per line, tab as leading blanks */ +} prettyopts;
+/* Reset a prettyopts variable to its defaults; expands to a brace block, so use it as a statement */ +#define gPrettyInit(p) \ + { \ + (p).isactive = false; \ + (p).baseonlynum = true; \ + (p).numline = (p).atseq = 0; \ + (p).numright = (p).numleft = (p).numtop = (p).numbot = false; \ + (p).nameright = (p).nameleft = (p).nametop = false; \ + (p).noleaves = (p).domatch = (p).degap = false; \ + (p).matchchar = '.'; \ + (p).gapchar = '-'; \ + (p).namewidth = 8; \ + (p).numwidth = 5; \ + (p).interline = 1; \ + (p).spacer = 10; \ + (p).seqwidth = 50; \ + (p).tab = 0; \ + } +/* gPretty is defined only where UREADSEQ_G is defined before this header is included (do that in exactly one source file); everywhere else it is an extern declaration */ +#ifdef UREADSEQ_G +prettyopts gPretty; +#else +extern prettyopts gPretty; +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +extern short seqFileFormat(const char *filename, long *skiplines, short *error); +extern short seqFileFormatFp(FILE *fseq, long *skiplines, short *error); + +extern char *listSeqs(const char *filename, const long skiplines, + const short format, short *nseq, short *error); + +extern char *readSeq(const short whichEntry, const char *filename, + const long skiplines, const short format, long *seqlen, + short *nseq, short *error, char *seqid); + +extern char *readSeqFp(const short whichEntry_, FILE *fp_, + const long skiplines_, const short format_, + long *seqlen_, short *nseq_, short *error_, + char *seqid_); +/* writeSeq writes one sequence to outf in format outform and returns its count of output lines */ +extern short writeSeq(FILE *outf, const char *seq, const long seqlen, + const short outform, const char *seqid); + +extern unsigned long CRC32checksum(const char *seq, const long seqlen, + unsigned long *checktotal); +extern unsigned long GCGchecksum(const char *seq, const long seqlen, + unsigned long *checktotal); +#ifdef SMALLCHECKSUM /* build option: seqchecksum becomes the GCG checksum instead of CRC32 */ +#define seqchecksum GCGchecksum +#else +#define seqchecksum CRC32checksum +#endif + +extern short getseqtype(const char *seq, const long seqlen); +extern char *compressSeq(const char gapc, const char *seq, const long seqlen, + long *newlen); + +#ifdef NCBI + +extern char *listASNSeqs(const char *filename, const long skiplines, + const short format, short *nseq, short *error); + +extern char *readASNSeq(const short whichEntry, const char *filename, + const long skiplines, const short format, long *seqlen, + short *nseq, short *error, char **seqid); +#endif + +/* patches for some missing string.h stuff */
+extern int Strcasecmp(const char *a, const char *b); +extern int Strncasecmp(const char *a, const char *b, long maxn); + +#ifdef __cplusplus +} +#endif + +#endif /*UREADSEQ_H*/ +