readseq/readseq.c

1413 lines
34 KiB
C
Raw Permalink Blame History

/* File: readseq.c
* main() program for ureadseq.c, ureadseq.h
*
* Reads and writes nucleic/protein sequence in various
* formats. Data files may have multiple sequences.
*
* Copyright 1990 by d.g.gilbert
* biology dept., indiana university, bloomington, in 47405
* e-mail: gilbertd@bio.indiana.edu
*
* This program may be freely copied and used by anyone.
* Developers are encourged to incorporate parts in their
* programs, rather than devise their own private sequence
* format.
*
* This should compile and run with any ANSI C compiler.
* Please advise me of any bugs, additions or corrections.
*
*/
const char *title = "readSeq (1Feb93), multi-format molbio sequence reader.\n";
/* History
27 Feb 90. 1st release to public.
4 Mar 90. + Gary Olsen format
+ case change
* minor corrections to NBRF,EMBL,others
* output 1 file per sequence for gcg, unknown
* define -DNOSTR for c-libraries w/o strstr
- readseq.p, pascal version, becomes out-of-date
24 May 90. + Phylip 3.2 output format (no input)
20 Jul 90. + Phylip 3.3 output (no input yet)
+ interactive output re-direction
+ verbose progress info
* interactive help output
* dropped line no.s on NBRF output
* patched in HyperGCG XCMD corrections,
- except for seq. documentation handling
* dropped the IG special nuc codes, as IG has
adopted the standard IUB codes (now if only
everyone would adopt a standard format !)
11 Oct 90. * corrected bug in reading/writing of EMBL format
17 Oct 91. * corrected bug in reading Olsen format
(serious-deletion)
10 Nov 91. * corrected bug in reading some GCG format files
(serious-last line duplicated)
+ add format name parsing (-fgb, -ffasta, ...)
+ Phylip v3.4 output format (== v3.2, sequential)
+ add checksum output to all forms that have document
+ skip mail headers in seq file
+ add pipe for standard input == seq file (with -p)
* fold in parts of MacApp Seq object
* strengthen format detection
* clarify program structure
* remove fixed sequence size limit (now dynamic, sizeof memory)
* check and fold in accumulated bug reports:
* Now ANSI-C fopen(..,"w") & check open failure
* Define -DFIXTOUPPER for nonANSI C libraries that mess
up toupper/tolower
= No command-line changes; callers of readseq main() should be okay
- ureadseq.h functions have changed; client programs need to note.
+ added Unix and VMS Make scripts, including validation tests
4 May 92. + added 32 bit CRC checksum as alternative to GCG 6.5bit checksum
(-DBIGCHECKSUM)
Aug 92 = fixed Olsen format input to handle files w/ more sequences,
not to mess up when more than one seq has same identifier,
and to convert number masks to symbols.
= IG format fix to understand ^L
25-30 Dec 92
* revised command-line & interactive interface. Suggested form is
now readseq infile -format=genbank -output=outfile -item=1,3,4 ... but remains
compatible with prior commandlines: readseq infile -f2 -ooutfile -i3 ...
+ added GCG MSF multi sequence file format
+ added PIR/CODATA format
+ added NCBI ASN.1 sequence file format
+ added Pretty, multi sequence pretty output (only)
+ added PAUP multi seq format
+ added degap option
+ added Gary Williams (GWW, G.Williams@CRC.AC.UK)
reverse-complement option.
+ added support for reading Phylip formats (interleave &
sequential)
* string fixes, dropped need for compiler flags NOSTR, FIXTOUPPER,
NEEDSTRCASECMP
* changed 32bit checksum to default, -DSMALLCHECKSUM for GCG
version
1Feb93
= revert GenBank output to a fixed left number width which
other software depends on.
= fix for MSF input to handle symbols in names
= fix bug for possible memory overrun when truncating seqs for
Phylip or Paup formats (thanks Anthony Persechini)
*/
/*
Readseq has been tested with:
Macintosh MPW C
GNU gcc
SGI cc
VAX-VMS cc
Any ANSI C compiler should be able to handle this.
Old-style C compilers barf all over the source.
How do I build the readseq program if I have an Ansi C compiler?
#--------------------
# Unix ANSI C
# Use the supplied Makefile this way:
% make CC=name-of-c-compiler
# OR do this...
% gcc readseq.c ureadseq.c -o readseq
#--------------------
$!VAX-VMS cc
$! Use the supplied Make.Com this way:
$ @make
$! OR, do this:
$ cc readseq, ureadseq
$ link readseq, ureadseq, sys$library:vaxcrtl/lib
$ readseq :== $ MyDisk:[myacct]readseq
#--------------------
# Macintosh Simple Input/Output Window application
# requires MPW-C and SIOW library (from APDA)
# also uses files macinit.c, macinit.r, readseqSIOW.make
#
Buildprogram readseqSIOW
#--------------------
#MPW-C v3 tool
C ureadseq.c
C readseq.c
link -w -o readseq -t MPST -c 'MPS ' <20>
readseq.c.o Ureadseq.c.o <20>
"{Libraries}"Interface.o <20>
"{Libraries}"ToolLibs.o <20>
"{Libraries}"Runtime.o <20>
"{CLibraries}"StdClib.o
readseq -i1 ig.seq
# MPW-C with NCBI tools
set NCBI "{Boot}@molbio:ncbi:"; EXPORT NCBI
set NCBILIB1 "{NCBI}"lib:libncbi.o; export NCBILIB1
set NCBILIB2 "{NCBI}"lib:libncbiobj.o; export NCBILIB2
set NCBILIB3 "{NCBI}"lib:libncbicdr.o; export NCBILIB3
set NCBILIB4 "{NCBI}"lib:libvibrant.o; export NCBILIB4
C ureadseq.c
C -d NCBI -i "{NCBI}"include: ureadasn.c
C -d NCBI -i "{NCBI}"include: readseq.c
link -w -o readseq -t MPST -c 'MPS ' <20>
ureadseq.c.o ureadasn.c.o readseq.c.o <20>
{NCBILIB4} {NCBILIB2} {NCBILIB1} <20>
"{Libraries}"Interface.o <20>
"{Libraries}"ToolLibs.o <20>
"{Libraries}"Runtime.o <20>
"{CLibraries}"CSANELib.o <20>
"{CLibraries}"Math.o <20>
"{CLibraries}"StdClib.o
===========================================================*/
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "ureadseq.h"
#pragma segment readseq
static char inputfilestore[256], *inputfile = inputfilestore;
const char *formats[kMaxFormat + 1] = {" 1. IG/Stanford",
" 2. GenBank/GB",
" 3. NBRF",
" 4. EMBL",
" 5. GCG",
" 6. DNAStrider",
" 7. Fitch",
" 8. Pearson/Fasta",
" 9. Zuker (in-only)",
"10. Olsen (in-only)",
"11. Phylip3.2",
"12. Phylip",
"13. Plain/Raw",
"14. PIR/CODATA",
"15. MSF",
"16. ASN.1",
"17. PAUP/NEXUS",
"18. Pretty (out-only)",
""};
#define kFormCount 30
#define kMaxFormName 15
const struct formatTable {
char *name;
short num;
} formname[] = {
{"ig", kIG},
{"stanford", kIG},
{"genbank", kGenBank},
{"gb", kGenBank},
{"nbrf", kNBRF},
{"embl", kEMBL},
{"gcg", kGCG},
{"uwgcg", kGCG},
{"dnastrider", kStrider},
{"strider", kStrider},
{"fitch", kFitch},
{"pearson", kPearson},
{"fasta", kPearson},
{"zuker", kZuker},
{"olsen", kOlsen},
{"phylip", kPhylip},
{"phylip3.2", kPhylip2},
{"phylip3.3", kPhylip3},
{"phylip3.4", kPhylip4},
{"phylip-interleaved", kPhylip4},
{"phylip-sequential", kPhylip2},
{"plain", kPlain},
{"raw", kPlain},
{"pir", kPIR},
{"codata", kPIR},
{"asn.1", kASN1},
{"msf", kMSF},
{"paup", kPAUP},
{"nexus", kPAUP},
{"pretty", kPretty},
};
const char *kASN1headline = "Bioseq-set ::= {\nseq-set {\n";
/* GWW table for getting the complement of a nucleotide (IUB codes) */
/* !
* "#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[
* \]^_`abcdefghijklmnopqrstuvwxyz{|}~ */
const char compl [] =
" !\"#$%&'()*+,-./"
"0123456789:;<=>?@TVGHNNCDNNMNKNNYRYSAABWNRN[\\]^_`"
"tvghnncdnnmnknnyrysaabwnrn{|}~";
char *formatstr(short format)
{
if (format < 1 || format > kMaxFormat) {
switch (format) {
case kASNseqentry:
case kASNseqset:
return formats[kASN1 - 1];
case kPhylipInterleave:
case kPhylipSequential:
return formats[kPhylip - 1];
default:
return "(unknown)";
}
}
else
return formats[format - 1];
}
int parseformat(char *name)
{
#define kDupmatch -2
int namelen, maxlen, i, match, matchat;
char lname[kMaxFormName + 1];
skipwhitespace(name);
namelen = strlen(name);
if (namelen == 0)
return kNoformat;
else if (isdigit(*name)) {
i = atol(name);
if (i<kMinFormat | i> kMaxFormat)
return kNoformat;
else
return i;
}
/* else match character name */
maxlen = min(kMaxFormName, namelen);
for (i = 0; i < maxlen; i++) lname[i] = to_lower(name[i]);
lname[maxlen] = 0;
matchat = kNoformat;
for (i = 0; i < kFormCount; i++) {
match = strncmp(lname, formname[i].name, maxlen);
if (match == 0) {
if (strlen(formname[i].name) == namelen)
return (formname[i].num);
else if (matchat == kNoformat)
matchat = i;
else
matchat =
kDupmatch; /* 2 or more partial matches */
}
}
if (matchat == kNoformat || matchat == kDupmatch)
return kNoformat;
else
return formname[matchat].num;
}
static void dumpSeqList(char *list, short format)
{
long i, l, listlen;
char s[256];
listlen = strlen(list);
printf("Sequences in %s (format is %s)\n", inputfile,
formatstr(format));
for (i = 0, l = 0; i < listlen; i++) {
if (list[i] == (char)NEWLINE) {
s[l] = '\0';
l = 0;
puts(s);
}
else if (l < 255)
s[l++] = list[i];
}
putchar('\n');
}
void usage()
{
short i, midi;
fprintf(stderr, title);
fprintf(stderr, "usage: readseq [-options] in.seq > out.seq\n");
fprintf(stderr, " options\n");
/* ? add -d[igits] to allow digits in sequence data, &/or option to
* specify seq charset !? */
fprintf(stderr, " -a[ll] select All sequences\n");
fprintf(stderr, " -c[aselower] change to lower case\n");
fprintf(stderr, " -C[ASEUPPER] change to UPPER CASE\n");
fprintf(stderr, " -degap[=-] remove gap symbols\n");
fprintf(stderr,
" -i[tem=2,3,4] select Item number(s) from several\n");
fprintf(stderr, " -l[ist] List sequences only\n");
fprintf(stderr, " -o[utput=]out.seq redirect Output\n");
fprintf(stderr,
" -p[ipe] Pipe (command line, <stdin, >stdout)\n");
fprintf(stderr, " -r[everse] change to Reverse-complement\n");
fprintf(stderr, " -v[erbose] Verbose progress\n");
fprintf(stderr, " -f[ormat=]# Format number for output, or\n");
fprintf(stderr, " -f[ormat=]Name Format name for output:\n");
midi = (kMaxFormat + 1) / 2;
for (i = kMinFormat - 1; i < midi; i++)
fprintf(stderr, " %-20s %-20s\n", formats[i],
formats[midi + i]);
/* new output format options, esp. for pretty format: */
fprintf(stderr, " \n");
fprintf(stderr, " Pretty format options: \n");
fprintf(stderr, " -wid[th]=# sequence line width\n");
fprintf(stderr, " -tab=# left indent\n");
fprintf(stderr,
" -col[space]=# column space within sequence line "
"on output\n");
fprintf(
stderr,
" -gap[count] count gap chars in sequence numbers\n");
fprintf(stderr,
" -nameleft, -nameright[=#] name on left/right side [=max "
"width]\n");
fprintf(stderr, " -nametop name at top/bottom\n");
fprintf(stderr,
" -numleft, -numright seq index on left/right side\n");
fprintf(stderr, " -numtop, -numbot index on top/bottom\n");
fprintf(stderr,
" -match[=.] use match base for 2..n species\n");
fprintf(stderr,
" -inter[line=#] blank line(s) between sequence "
"blocks\n");
/****** not ready yet
fprintf(stderr, " -code=none,rtf,postscript,ps code syntax\n");
fprintf(stderr, " -namefont=, -numfont=, -seqfont=font font
choice\n"); fprintf(stderr, " font suggestions include
times,courier,helvetica\n"); fprintf(stderr, " -namefontsize=,
-numfontsize=, -seqfontsize=#\n"); fprintf(stderr, " fontsize
suggestions include 9,10,12,14\n"); fprintf(stderr, " -namefontstyle=,
-numfontstyle=, -seqfontstyle= style fontstyle for names\n");
fprintf(stderr, " fontstyle options are
plain,italic,bold,bold-italic\n");
******/
}
void erralert(short err)
{
switch (err) {
case 0:
break;
case eFileNotFound:
fprintf(stderr, "File not found: %s\n", inputfile);
break;
case eFileCreate:
fprintf(stderr, "Can't open output file.\n");
break;
case eASNerr:
fprintf(stderr, "Error in ASN.1 sequence routines.\n");
break;
case eNoData:
fprintf(stderr, "No data in file.\n");
break;
case eItemNotFound:
fprintf(stderr, "Specified item not in file.\n");
break;
case eUnequalSize:
fprintf(stderr,
"This format requires equal length "
"sequences.\nSequence truncated or padded to "
"fit.\n");
break;
case eUnknownFormat:
fprintf(stderr,
"Error: this format is unknown to me.\n");
break;
case eOneFormat:
fprintf(stderr,
"Warning: This format permits only 1 sequence "
"per file.\n");
break;
case eMemFull:
fprintf(stderr,
"Out of storage memory. Sequence truncated.\n");
break;
default:
fprintf(stderr, "readSeq error = %d\n", err);
break;
}
} /* erralert */
int chooseFormat(boolean quietly)
{
char sform[128];
int midi, i, outform;
if (quietly)
return kPearson; /* default */
else {
midi = (kMaxFormat + 1) / 2;
for (i = kMinFormat - 1; i < midi; i++)
fprintf(stderr, " %-20s %-20s\n",
formats[i], formats[midi + i]);
fprintf(stderr, "\nChoose an output format (name or #): \n");
gets(sform);
outform = parseformat(sform);
if (outform == kNoformat) outform = kPearson;
return outform;
}
}
/* read paramater(s) */
boolean checkopt(boolean casesense, char *sopt, const char *smatch,
short minword)
{
long lenopt, lenmatch;
boolean result;
short minmaxw;
lenopt = strlen(sopt);
lenmatch = strlen(smatch);
minmaxw = max(minword, min(lenopt, lenmatch));
if (casesense)
result = (!strncmp(sopt, smatch, minmaxw));
else
result = (!Strncasecmp(sopt, smatch, minmaxw));
/* if (result) { */
/* fprintf(stderr,"true checkopt(opt=%s,match=%s,param=%s)\n", sopt,
* smatch, *sparam); */
/* } */
return result;
}
#define kMaxwhichlist 50
/* global for readopt(), main() */
boolean chooseall = false, quietly = false, gotinputfile = false,
listonly = false, closeout = false, verbose = false, manyout = false,
dolower = false, doupper = false, doreverse = false, askout = true,
dopipe = false, interleaved = false;
short nfile = 0, iwhichlist = 0, nwhichlist = 0;
short whichlist[kMaxwhichlist + 1];
long whichSeq = 0, outform = kNoformat;
char onamestore[128], *oname = onamestore;
FILE *foo = NULL;
void resetGlobals()
/* need this when used from SIOW, as these globals are not reinited
automatically between calls to local main() */
{
chooseall = false;
quietly = false;
gotinputfile = false;
listonly = false;
closeout = false;
verbose = false;
manyout = false;
dolower = false;
doupper = false;
doreverse = false;
askout = true;
dopipe = false;
interleaved = false;
nfile = 0;
iwhichlist = 0;
nwhichlist = 0;
whichSeq = 0;
outform = kNoformat;
oname = onamestore;
foo = NULL;
gPrettyInit(gPretty);
}
#define kOptOkay 1
#define kOptNone 0
int readopt(char *sopt)
{
char sparamstore[256], *sparam = sparamstore;
short n, slen = strlen(sopt);
/* fprintf(stderr,"readopt( %s) == ", sopt); */
if (*sopt == '?') {
usage();
return kOptNone; /*? eOptionBad or kOptNone */
}
else if (*sopt == '-') {
char *cp = strchr(sopt, '=');
*sparam = '\0';
if (cp) {
strcpy(sparam, cp + 1);
*cp = 0;
}
if (checkopt(false, sopt, "-help", 2)) {
usage();
return kOptNone;
}
if (checkopt(false, sopt, "-all", 2)) {
whichSeq = 1;
chooseall = true;
return kOptOkay;
}
if (checkopt(false, sopt, "-colspace",
4)) { /* test before -c[ase] */
n = atoi(sparam);
gPretty.spacer = n;
return kOptOkay;
}
if (checkopt(true, sopt, "-caselower", 2)) {
dolower = true;
return kOptOkay;
}
if (checkopt(true, sopt, "-CASEUPPER", 2)) {
doupper = true;
return kOptOkay;
}
if (checkopt(false, sopt, "-pipe", 2)) {
dopipe = true;
askout = false;
return kOptOkay;
}
if (checkopt(false, sopt, "-list", 2)) {
listonly = true;
askout = false;
return kOptOkay;
}
if (checkopt(false, sopt, "-reverse", 2)) {
doreverse = true;
return kOptOkay;
}
if (checkopt(false, sopt, "-verbose", 2)) {
verbose = true;
return kOptOkay;
}
if (checkopt(false, sopt, "-match", 5)) {
gPretty.domatch = true;
if (*sparam >= ' ') gPretty.matchchar = *sparam;
return kOptOkay;
}
if (checkopt(false, sopt, "-degap", 4)) {
gPretty.degap = true;
if (*sparam >= ' ') gPretty.gapchar = *sparam;
return kOptOkay;
}
if (checkopt(false, sopt, "-interline", 4)) {
gPretty.interline = atoi(sparam);
return kOptOkay;
}
if (checkopt(false, sopt, "-item", 2)) {
char *cp = sparam;
nwhichlist = 0;
whichlist[0] = 0;
if (*cp == 0) cp = sopt + 2; /* compatible w/ old way */
do {
while (*cp != 0 && !isdigit(*cp)) cp++;
if (*cp != 0) {
n = atoi(cp);
whichlist[nwhichlist++] = n;
while (*cp != 0 && isdigit(*cp)) cp++;
}
} while (*cp != 0 && n > 0 &&
nwhichlist < kMaxwhichlist);
whichlist[nwhichlist++] =
0; /* 0 == stopsign for loop */
whichSeq = max(1, whichlist[0]);
iwhichlist = 1;
return kOptOkay;
}
if (checkopt(false, sopt, "-format",
5)) { /* -format=phylip, -f2, -form=phylip */
if (*sparam == 0) {
for (sparam = sopt + 2; isalpha(*sparam);
sparam++)
;
}
outform = parseformat(sparam);
return kOptOkay;
}
if (checkopt(false, sopt, "-f",
2)) { /* compatible w/ -fphylip prior version */
if (*sparam == 0) sparam = sopt + 2;
outform = parseformat(sparam);
return kOptOkay;
}
if (checkopt(false, sopt, "-output", 3)) { /* -output=myseq */
if (*sparam == 0) {
for (sparam = sopt + 3; isalpha(*sparam);
sparam++)
;
}
strcpy(oname, sparam);
foo = fopen(oname, "w");
if (!foo) {
erralert(eFileCreate);
return eFileCreate;
}
closeout = true;
askout = false;
return kOptOkay;
}
if (checkopt(false, sopt, "-o",
2)) { /* compatible w/ -omyseq prior version */
if (*sparam == 0) sparam = sopt + 2;
strcpy(oname, sparam);
foo = fopen(oname, "w");
if (!foo) {
erralert(eFileCreate);
return eFileCreate;
}
closeout = true;
askout = false;
return kOptOkay;
}
if (checkopt(false, sopt, "-width", 2)) {
if (*sparam == 0) {
for (sparam = sopt + 2;
!isdigit(*sparam) && *sparam != 0;
sparam++)
;
}
n = atoi(sparam);
if (n > 0) gPretty.seqwidth = n;
return kOptOkay;
}
if (checkopt(false, sopt, "-tab", 4)) {
if (*sparam == 0) {
for (sparam = sopt + 2;
!isdigit(*sparam) && *sparam != 0;
sparam++)
;
}
n = atoi(sparam);
gPretty.tab = n;
return kOptOkay;
}
if (checkopt(false, sopt, "-gapcount", 4)) {
gPretty.baseonlynum = false;
/* if (*sparam >= ' ') gPretty.gapchar= *sparam; */
return kOptOkay;
}
if (checkopt(false, sopt, "-nointerleave", 8)) {
gPretty.noleaves = true;
return kOptOkay;
}
if (checkopt(false, sopt, "-nameleft", 7)) {
if (*sparam == 0) {
for (sparam = sopt + 2;
!isdigit(*sparam) && *sparam != 0;
sparam++)
;
}
n = atoi(sparam);
if (n > 0 && n < 50) gPretty.namewidth = n;
gPretty.nameleft = true;
return kOptOkay;
}
if (checkopt(false, sopt, "-nameright", 7)) {
if (*sparam == 0) {
for (sparam = sopt + 2;
!isdigit(*sparam) && *sparam != 0;
sparam++)
;
}
n = atoi(sparam);
if (n > 0 && n < 50) gPretty.namewidth = n;
gPretty.nameright = true;
return kOptOkay;
}
if (checkopt(false, sopt, "-nametop", 6)) {
gPretty.nametop = true;
return kOptOkay;
}
if (checkopt(false, sopt, "-numleft", 6)) {
if (*sparam == 0) {
for (sparam = sopt + 2;
!isdigit(*sparam) && *sparam != 0;
sparam++)
;
}
n = atoi(sparam);
if (n > 0 && n < 50) gPretty.numwidth = n;
gPretty.numleft = true;
return kOptOkay;
}
if (checkopt(false, sopt, "-numright", 6)) {
if (*sparam == 0) {
for (sparam = sopt + 2;
!isdigit(*sparam) && *sparam != 0;
sparam++)
;
}
n = atoi(sparam);
if (n > 0 && n < 50) gPretty.numwidth = n;
gPretty.numright = true;
return kOptOkay;
}
if (checkopt(false, sopt, "-numtop", 6)) {
gPretty.numtop = true;
return kOptOkay;
}
if (checkopt(false, sopt, "-numbottom", 6)) {
gPretty.numbot = true;
return kOptOkay;
}
else {
usage();
return eOptionBad;
}
}
else {
strcpy(inputfile, sopt);
gotinputfile = (*inputfile != 0);
nfile++;
return kOptOkay;
}
/* return kOptNone; -- never here */
}
/* this program suffers some as it tries to be a quiet translator pipe
_and_ a noisy user interactor
*/
/* return is best for SIOW, okay for others */
#ifdef SIOW
#define Exit(a) return (a)
siow_main(int argc, char *argv[])
#else
#define Exit(a) exit(a)
main(int argc, char *argv[])
#endif
{
boolean closein = false;
short ifile, nseq, atseq, format, err = 0, seqtype = kDNA, nlines,
seqout = 0, phylvers = 2;
long i, skiplines, seqlen, seqlen0;
unsigned long checksum = 0, checkall = 0;
char *seq, *cp, *firstseq = NULL, *seqlist, *progname, tempname[256];
char seqid[256], *seqidptr = seqid;
char stempstore[256], *stemp = stempstore;
FILE *ftmp, *fin, *fout;
long outindexmax = 0, noutindex = 0, *outindex = NULL;
#define exit_main(err) \
{ \
if (closeout) fclose(fout); \
if (closein) fclose(fin); \
if (*tempname != 0) remove(tempname); \
Exit(err); \
}
#define indexout() \
if (interleaved) { \
if (noutindex >= outindexmax) { \
outindexmax = noutindex + 20; \
outindex = (long *)realloc( \
outindex, sizeof(long) * outindexmax); \
if (outindex == NULL) { \
err = eMemFull; \
erralert(err); \
exit_main(err); \
} \
} \
outindex[noutindex++] = ftell(fout); \
}
resetGlobals();
foo = stdout;
progname = argv[0];
*oname = 0;
*tempname = 0;
/* initialize gPretty ?? -- done in header */
for (i = 1; i < argc; i++) {
err = readopt(argv[i]);
if (err <= 0) exit_main(err);
}
/* pipe input from stdin !? */
if (dopipe && !gotinputfile) {
int c;
tmpnam(tempname);
inputfile = tempname;
ftmp = fopen(inputfile, "w");
if (!ftmp) {
erralert(eFileCreate);
exit_main(eFileCreate);
}
while ((c = getc(stdin)) != EOF) fputc(c, ftmp);
fclose(ftmp);
gotinputfile = true;
}
quietly = (dopipe || (gotinputfile && (listonly || whichSeq != 0)));
if (verbose || (!quietly && !gotinputfile)) fprintf(stderr, title);
ifile = 1;
/* UI: Choose output */
if (askout && !closeout && !quietly) {
askout = false;
fprintf(
stderr,
"\nName of output file (?=help, defaults to display): \n");
gets(oname = onamestore);
skipwhitespace(oname);
if (*oname == '?') {
usage();
exit_main(0);
}
else if (*oname != 0) {
closeout = true;
foo = fopen(oname, "w");
if (!foo) {
erralert(eFileCreate);
exit_main(eFileCreate);
}
}
}
fout = foo;
if (outform == kNoformat) outform = chooseFormat(quietly);
/* set up formats ... */
switch (outform) {
case kPhylip2:
interleaved = false;
phylvers = 2;
outform = kPhylip;
break;
case kPhylip4:
interleaved = true;
phylvers = 4;
outform = kPhylip;
break;
case kMSF:
case kPAUP:
interleaved = true;
break;
case kPretty:
gPretty.isactive = true;
interleaved = true;
break;
}
if (gPretty.isactive && gPretty.noleaves) interleaved = false;
if (interleaved) {
fout = ftmp = tmpfile();
outindexmax = 30;
noutindex = 0;
outindex = (long *)malloc(outindexmax * sizeof(long));
if (outindex == NULL) {
err = eMemFull;
erralert(err);
exit_main(err);
}
}
/* big loop over all input files */
do {
/* select next input file */
gotinputfile = (*tempname != 0);
while ((ifile < argc) && (!gotinputfile)) {
if (*argv[ifile] != '-') {
strcpy(inputfile, argv[ifile]);
gotinputfile = (*inputfile != 0);
--nfile;
}
ifile++;
}
while (!gotinputfile) {
fprintf(stderr,
"\nName an input sequence or -option: \n");
inputfile = inputfilestore;
gets(stemp = stempstore);
if (*stemp == 0)
goto fini; /* !! need this to finish work during
interactive use */
stemp = strtok(stempstore, " \n\r\t");
while (stemp) {
err = readopt(stemp); /* will read inputfile if
it exists */
if (err < 0) exit_main(err);
stemp = strtok(NULL, " \n\r\t");
}
}
/* thanks to AJB@UK.AC.DARESBURY.DLVH for this PHYLIP3 fix: */
/* head for end (interleave if needed) */
if (*inputfile == 0) break;
format = seqFileFormat(inputfile, &skiplines, &err);
if (err == 0) {
#ifdef NCBI
if (format == kASNseqentry || format == kASNseqset)
seqlist = listASNSeqs(inputfile, skiplines,
format, &nseq, &err);
else
#endif
seqlist = listSeqs(inputfile, skiplines, format,
&nseq, &err);
}
if (err != 0)
erralert(err);
else if (listonly) {
dumpSeqList(seqlist, format);
free(seqlist);
}
else {
/* choose whichSeq if needed */
if (nseq == 1 || chooseall ||
(quietly && whichSeq == 0)) {
chooseall = true;
whichSeq = 1;
quietly = true; /* no loop */
}
else if (whichSeq > nseq && quietly) {
erralert(eItemNotFound);
err = eItemNotFound;
}
else if (whichSeq > nseq || !quietly) {
dumpSeqList(seqlist, format);
fprintf(stderr,
"\nChoose a sequence (# or All): \n");
gets(stemp = stempstore);
skipwhitespace(stemp);
if (to_lower(*stemp) == 'a') {
chooseall = true;
whichSeq = 1;
quietly =
true; /* !? this means we don't ask
for another file as well as
no more whichSeqs... */
}
else if (isdigit(*stemp))
whichSeq = atol(stemp);
else
whichSeq = 1; /* default */
}
free(seqlist);
if (false /*chooseall*/) { /* this isn't debugged
yet...*/
fin = fopen(inputfile, "r");
closein = true;
}
while (whichSeq > 0 && whichSeq <= nseq) {
/* need to open multiple output files ? */
manyout =
((chooseall || nwhichlist > 1) &&
nseq > 1 &&
(outform == kPlain || outform == kGCG));
if (manyout) {
if (whichSeq == 1)
erralert(eOneFormat);
else if (closeout) {
sprintf(stemp, "%s_%d", oname,
whichSeq);
freopen(stemp, "w", fout);
fprintf(stderr,
"Writing sequence %d "
"to file %s\n",
whichSeq, stemp);
}
}
if (closein) {
/* !! this fails... skips most seqs...
*/
/* !! in sequential read, must count
* seqs already read from whichSeq ...
*/
/* need major revision of ureadseq
* before we can do this */
atseq = whichSeq - 1;
seqidptr = seqid;
seq = readSeqFp(
whichSeq, fin, skiplines, format,
&seqlen, &atseq, &err, seqidptr);
skiplines = 0;
}
else {
atseq = 0;
seqidptr = seqid;
#ifdef NCBI
if (format == kASNseqentry ||
format == kASNseqset) {
seqidptr = NULL;
seq = readASNSeq(
whichSeq, inputfile,
skiplines, format, &seqlen,
&atseq, &err, &seqidptr);
}
else
#endif
seq = readSeq(
whichSeq, inputfile,
skiplines, format, &seqlen,
&atseq, &err, seqidptr);
}
if (gPretty.degap) {
char *newseq;
long newlen;
newseq =
compressSeq(gPretty.gapchar, seq,
seqlen, &newlen);
if (newseq) {
free(seq);
seq = newseq;
seqlen = newlen;
}
}
if (outform == kMSF)
checksum =
GCGchecksum(seq, seqlen, &checkall);
else if (verbose)
checksum =
seqchecksum(seq, seqlen, &checkall);
if (verbose)
fprintf(stderr,
"Sequence %d, length= %d, "
"checksum= %X, format= %s, id= "
"%s\n",
whichSeq, seqlen, checksum,
formatstr(format), seqidptr);
if (err != 0)
erralert(err);
else {
/* format fixes that writeseq doesn't do
*/
switch (outform) {
case kPIR:
if (seqout == 0)
fprintf(
foo,
"\\\\\\\n");
break;
case kASN1:
if (seqout == 0)
fprintf(
foo,
kASN1headline);
break;
case kPhylip:
if (seqout == 0) {
if (!interleaved) { /* bug, nseq is for 1st infile only */
if (chooseall)
i = nseq;
else
i = 1;
if (phylvers >=
4)
fprintf(
foo,
" %d %d\n",
i,
seqlen);
else
fprintf(
foo,
" %d %d YF\n",
i,
seqlen);
}
seqlen0 =
seqlen;
}
else if (seqlen !=
seqlen0) {
erralert(
eUnequalSize);
if (seqlen <
seqlen0)
seq = (char
*)
realloc(
seq,
seqlen0);
for (i = seqlen;
i <
seqlen0;
i++)
seq[i] =
gPretty
.gapchar;
seqlen =
seqlen0;
seq[seqlen] = 0;
}
break;
case kPAUP:
if (seqout == 0) {
seqtype =
getseqtype(
seq,
seqlen);
seqlen0 =
seqlen;
}
else if (seqlen !=
seqlen0) {
erralert(
eUnequalSize);
if (seqlen <
seqlen0)
seq = (char
*)
realloc(
seq,
seqlen0);
for (i = seqlen;
i <
seqlen0;
i++)
seq[i] =
gPretty
.gapchar;
seqlen =
seqlen0;
seq[seqlen] = 0;
}
break;
}
if (doupper)
for (i = 0; i < seqlen; i++)
seq[i] =
to_upper(seq[i]);
else if (dolower)
for (i = 0; i < seqlen; i++)
seq[i] =
to_lower(seq[i]);
if (doreverse) {
long j, k;
char ctemp;
for (j = 0, k = seqlen - 1;
j <= k; j++, k--) {
ctemp = compl [seq[j] -
' '];
seq[j] = compl [seq[k] -
' '];
seq[k] = ctemp;
}
}
if ((gPretty.isactive ||
outform == kPAUP) &&
gPretty.domatch &&
firstseq != NULL) {
for (i = 0; i < seqlen; i++)
if (seq[i] ==
firstseq[i])
seq[i] =
gPretty
.matchchar;
}
if (gPretty.isactive &&
gPretty.numtop && seqout == 0) {
gPretty.numline = 1;
indexout();
(void)writeSeq(fout, seq,
seqlen, outform,
seqidptr);
gPretty.numline = 2;
indexout();
(void)writeSeq(fout, seq,
seqlen, outform,
seqidptr);
gPretty.numline = 0;
}
indexout();
nlines = writeSeq(fout, seq, seqlen,
outform, seqidptr);
seqout++;
}
if ((gPretty.isactive || outform == kPAUP) &&
gPretty.domatch && firstseq == NULL) {
firstseq = seq;
seq = NULL;
}
else if (seq != NULL) {
free(seq);
seq = NULL;
}
#ifdef NCBI
if ((format == kASNseqentry ||
format == kASNseqset) &&
seqidptr && seqidptr != seqid)
free(seqidptr);
#endif
if (chooseall)
whichSeq++;
else if (iwhichlist < nwhichlist)
whichSeq = whichlist[iwhichlist++];
else
whichSeq = 0;
}
if (closein) {
fclose(fin);
closein = false;
}
}
whichSeq = 0;
} while (nfile > 0 || !quietly);
fini:
if (firstseq) {
free(firstseq);
firstseq = NULL;
}
if (err || listonly) exit_main(err);
if (gPretty.isactive && gPretty.numbot) {
gPretty.numline = 2;
indexout();
(void)writeSeq(fout, seq, seqlen, outform, seqidptr);
gPretty.numline = 1;
indexout();
(void)writeSeq(fout, seq, seqlen, outform, seqidptr);
gPretty.numline = 0;
}
if (outform == kMSF) {
if (*oname)
cp = oname;
else
cp = inputfile;
fprintf(foo,
"\n %s MSF: %d Type: N January 01, 1776 12:00 "
"Check: %d ..\n\n",
cp, seqlen, checkall);
}
if (outform == kPAUP) {
fprintf(foo, "#NEXUS\n");
if (*oname)
cp = oname;
else
cp = inputfile;
fprintf(foo, "[%s -- data title]\n\n", cp);
/* ! now have header lines for each sequence... put them before
* "begin data;... */
}
if (outform == kPhylip && interleaved) {
if (phylvers >= 4)
fprintf(foo, " %d %d\n", seqout, seqlen);
else
fprintf(foo, " %d %d YF\n", seqout, seqlen);
}
if (interleaved) {
/* interleave species lines in true output */
/* nlines is # lines / sequence */
short iline, j, leaf, iseq;
char *s = stempstore;
indexout();
noutindex--; /* mark eof */
for (leaf = 0; leaf < nlines; leaf++) {
if (outform == kMSF && leaf == 1) {
fputs("//\n\n", foo);
}
if (outform == kPAUP && leaf == 1) {
switch (seqtype) {
case kDNA:
cp = "dna";
break;
case kRNA:
cp = "rna";
break;
case kNucleic:
cp = "dna";
break;
case kAmino:
cp = "protein";
break;
case kOtherSeq:
cp = "dna";
break;
}
fprintf(foo, "\nbegin data;\n");
fprintf(foo, " dimensions ntax=%d nchar=%d;\n",
seqout, seqlen);
fprintf(
foo,
" format datatype=%s interleave missing=%c",
cp, gPretty.gapchar);
if (gPretty.domatch)
fprintf(foo, " matchchar=%c",
gPretty.matchchar);
fprintf(foo, ";\n matrix\n");
}
for (iseq = 0; iseq < noutindex; iseq++) {
fseek(ftmp, outindex[iseq], 0);
for (iline = 0; iline <= leaf; iline++)
if (!fgets(s, 256, ftmp)) *s = 0;
if (ftell(ftmp) <= outindex[iseq + 1])
fputs(s, foo);
}
for (j = 0; j < gPretty.interline; j++)
fputs("\n", foo); /* some want spacer line */
}
fclose(ftmp); /* tmp disappears */
fout = foo;
}
if (outform == kASN1) fprintf(foo, "} }\n");
if (outform == kPAUP) fprintf(foo, ";\n end;\n");
if (outindex != NULL) free(outindex);
exit_main(0);
}