diff --git a/Formats b/Formats new file mode 100644 index 0000000..cb39ce8 --- /dev/null +++ b/Formats @@ -0,0 +1,980 @@ +||||||||||| ReadSeq supported formats (revised 30Dec92) +-------------------------------------------------------- + + -f[ormat=]Name Format name for output: + 1. IG/Stanford 10. Olsen (in-only) + 2. GenBank/GB 11. Phylip3.2 + 3. NBRF 12. Phylip + 4. EMBL 13. Plain/Raw + 5. GCG 14. PIR/CODATA + 6. DNAStrider 15. MSF + 7. Fitch 16. ASN.1 + 8. Pearson/Fasta 17. PAUP + 9. Zuker (in-only) 18. Pretty (out-only) + +In general, output supports only minimal subsets of each format +needed for sequence data exchanges. Features, descriptions +and other format-unique information is discarded. + +Users of Olsen multi sequence editor (VMS). The Olsen format +here is produced with the print command: + print/out=some.file +Use Genbank output from readseq to produce a format that this +editor can read, and use the command + load/genbank some.file +Dan Davison has a VMS program that will convert to/from the +Olsen native binary data format. E-mail davison@uh.edu + +Warning: Phylip format input is now supported (30Dec92), however the +auto-detection of Phylip format is very probabilistic and messy, +especially distinguishing sequential from interleaved versions. It +is not recommended that one use readseq to convert files from Phylip +format to others unless essential. + + + +||||||||||| ReadSeq usage (revised 11Nov91) +-------------------------------------------------------- + +A. determine file format: + + short skiplines; /* result: number of header lines to skip (or 0) */ + short error; /* error result or 0 */ + short format; /* resulting format code, see ureadseq.h */ + char *filename = "Mysequence.file" + + format = seqFileFormat( filename, &skiplines, &error); + if (error!=0) fail; + +B. 
read number and list of sequences (optional) + short numseqs; /* resulting number of sequences found in file */ + char *seqlist; /* list of sequence names, newline separated, 0 terminated */ + + seqlist = listSeqs( filename, skiplines, format, &numseqs, &error); + if (error==0) display (seqlist); + free( seqlist); + +C. read individual sequences as desired + short seqIndex; /* sequence index #, or == kListSeqs for listSeqs equivalent */ + long seqlen; /* length of seq */ + char seqid[256]; /* sequence name */ + char *seq; /* sequence, 0 terminated, free when done */ + + seq = readSeq( seqIndex, filename, skiplines, format, + &seqlen, &numseqs, &error, seqid); + if (error==0) manipulate(seq); + free(seq); + +D. write sequences as desired + int nlines; /* number of lines of sequence written */ + FILE* fout; /* open file pointer (stdout or other) */ + short outform; /* output format, see ureadseq.h */ + + nlines = writeSeq( fout, seq, seqlen, format, outform, seqid); + + +Note (30Dec92): There is various processing done by the main program (in readseq.c), + rather than just in the subroutines (in ureadseq.c). Especially for interleaved + output formats, the writeSeq subroutine does not handle interleaving, nor some of + the formatting at the top and end of output files. While seqFileFormat, listSeqs, + and readSeq subroutines are fairly self-contained, the writeSeq depends a lot on + auxiliary processing. At some point, this may be revised so writeSeq is self- + contained. + +Note 2: The NCBI toolkit (ftp from ncbi.nlm.nih.gov) is needed for the ASN.1 format + reading (see ureadasn.c). A bastard (but workable I hope) ASN.1 format is written + by writeSeq alone. + + + +||||||||||| sequence formats.... +--------------------------------------------------- + +stanford/IG +;comments +;... +seq1 info +abcd... +efgh1 (or 2 = terminator) +;another seq +;.... +seq2 info +abcd...1 +--- for e.g. ---- +; Dro5s-T.Seq Length: 120 April 6, 1989 21:22 Check: 9487 ..
+dro5stseq +GCCAACGACCAUACCACGCUGAAUACAUCGGUUCUCGUCCGAUCACCGAAAUUAAGCAGCGUCGCGGGCG +GUUAGUACUUAGAUGGGGGACCGCUUGGGAACACCGCGUGUUGUUGGCCU1 + +; TOIG of: Dro5srna.Seq check: 9487 from: 1 to: 120 +--------------------------------------------------- + +Genbank: +LOCUS seq1 ID.. +... +ORIGIN ... +123456789abcdefg....(1st 9 columns are formatting) + hijkl... +// (end of sequence) +LOCUS seq2 ID .. +... +ORIGIN + abcd... +// +--------------------------------------------------- + +NBRF format: (from uwgcg ToNBRF) +>DL;DRO5SRNA +Iubio$Dua0:[Gilbertd.Gcg]Dro5srna.Seq;2 => DRO5SRNA + + 51 AAUUAAGCAG CGUCGCGGGC GGUUAGUACU UAGAUGGGGG ACCGCUUGGG + 101 AACACCGCGU GUUGUUGGCC U + +--------------------------------------------------- + +EMBL format +ID345 seq1 id (the 345 are spaces) +... other info +SQ345Sequence (the 3,4,5 are spaces) +abcd... +hijk... +// (! this is proper end string: 12Oct90) +ID seq2 id +... +SQ Sequence +abcd... +... +// +--------------------------------------------------- + +UW GCG Format: +comments of any form, up to ".." signal +signal line has seq id, and " Check: #### .." +only 1 seq/file + +-- e.g. --- (GCG from GenBank) +LOCUS DROEST6 1819 bp ss-mRNA INV 31-AUG-1987 + ... much more ... +ORIGIN 1 bp upstream of EcoRI site; chromosome BK9 region 69A1. + +INVERTEBRATE:DROEST6 Length: 1819 January 9, 1989 16:48 Check: 8008 .. + + 1 GAATTCGCCG GAGTGAGGAG CAACATGAAC TACGTGGGAC TGGGACTTAT + + 51 CATTGTGCTG AGCTGCCTTT GGCTCGGTTC GAACGCGAGT GATACAGATG + + +--------------------------------------------------- + +DNAStrider (Mac) = modified Stanford: +; ### from DNA Strider Friday, April 7, 1989 11:04:24 PM +; DNA sequence pBR322 4363 b.p. complete sequence +; +abcd... 
+efgh +// (end of sequence) +--------------------------------------------------- + +Fitch format: +Dro5srna.Seq + GCC AAC GAC CAU ACC ACG CUG AAU ACA UCG GUU CUC GUC CGA UCA CCG AAA UUA AGC AGC + GUC GCG GGC GGU UAG UAC UUA GAU GGG GGA CCG CUU GGG AAC ACC GCG UGU UGU UGG CCU +Droest6.Seq + GAA TTC GCC GGA GTG AGG AGC AAC ATG AAC TAC GTG GGA CTG GGA CTT ATC ATT GTG CTG + AGC TGC CTT TGG CTC GGT TCG AAC GCG AGT GAT ACA GAT GAC CCT CTG TTG GTG CAG CTG +--------------------------------------------------- + +W.Pearson/Fasta format: +>BOVPRL GenBank entry BOVPRL from omam file. 907 nucleotides. +TGCTTGGCTGAGGAGCCATAGGACGAGAGCTTCCTGGTGAAGTGTGTTTCTTGAAATCAT + +--------------------------------------------------- +Phylip version 3.2 format (e.g., DNAML): + + 5 13 YF (# seqs, #bases, YF) +Alpha AACGTGGCCAAAT + aaaagggccc... (continued sp. alpha) +Beta AAGGTCGCCAAAC + aaaagggccc... (continued sp. beta) +Gamma CATTTCGTCACAA + aaaagggccc... (continued sp. Gamma) +1234567890^-- bases must start in col 11, and run 'til #bases + (spaces & newlines are okay) +--------------------------------------------------- +Phylip version 3.3 format (e.g., DNAML): + + 5 42 YF (# seqs, #bases, YF) +Turkey AAGCTNGGGC ATTTCAGGGT +Salmo gairAAGCCTTGGC AGTGCAGGGT +H. SapiensACCGGTTGGC CGTTCAGGGT +Chimp AAACCCTTGC CGTTACGCTT +Gorilla AAACCCTTGC CGGTACGCTT +1234567890^-- bases must start in col 11 + !! this version interleaves the species -- contrary to + all other output formats. + +GAGCCCGGGC AATACAGGGT AT +GAGCCGTGGC CGGGCACGGT AT +ACAGGTTGGC CGTTCAGGGT AA +AAACCGAGGC CGGGACACTC AT +AAACCATTGC CGGTACGCTT AA + +--------------------------------------------------- +Phylip version 3.4 format (e.g., DNAML) +-- Both Interleaved and sequential are permitted + + 5 13 (# seqs, #bases) +Alpha AACGTGGCCAAAT + aaaagggccc... (continued sp. alpha) +Beta AAGGTCGCCAAAC + aaaagggccc... (continued sp. beta) +Gamma CATTTCGTCACAA + aaaagggccc... (continued sp. 
Gamma) +1234567890^-- bases must start in col 11, and run 'til #bases + (spaces, newlines and numbers are are ignored) + +--------------------------------------------------- +Gary Olsen (multiple) sequence editor /print format: + +!--------------------- +!17Oct91 -- error in original copy of olsen /print format, shifted right 1 space +! here is correct copy: + 301 40 Tb.thiop CGCAGCGAAA----------GCUNUGCUAAUACCGCAUA-CGnCCUG----------------------------------------------------- Tb.thiop +123456789012345678901 + 301 42 Rhc.purp CGUAGCGAAA----------GUUACGCUAAUACCGCAUA-UUCUGUG----------------------------------------------------- Rhc.purp + + 301 44 Rhc.gela nnngnCGAAA----------GCCGGAUUAAUACCGCAUA-CGACCUA----------------------------------------------------- Rhc.gela +!--------------------- + + RNase P RNA components. on 20-FEB-90 17:23:58 + + 1 (E.c. pr ): Base pairing in Escherichia coli RNase P RNA. + 2 (chrom ): Chromatium + : + 12 (B.brevis): Bacillus brevis RNase P RNA, B. James. + 13 ( 90% con): 90% conserved + 14 (100% con): 100% conserved + 15 (gram+ pr): pairing + +1 + RNase P RNA components. on 20-FEB-90 17:23:58 + + Posi- Sequence + tion: identity: Data: + + 1 1 E.c. pr <<<<<<<<<< {{{{{{{{<<:<<<<<<<<<<^<<<<<<====>>>> E.c. pr + 1 2 chrom GGAGUCGGCCAGACAGUCGCUUCCGUCCU------------------ chrom + : + 1 12 B.brevis AUGCAGGAAAUGCGGGUAGCCGCUGCCGCAAUCGUCU------------- B.brevis +1234567890123456789012 >>>>>^>>^>>>>:>> <<<^<<<< {{{{{ E.c. pr + 60 2 chrom -----GGUG-ACGGGGGAGGAAAGUCCGG-GCUCCAU------------- chrom + : : + 60 10 B.stearo ----UU-CG-GCCGUAGAGGAAAGUCCAUGCUCGCACGGUGCUGAGAUGC B.stearo + + +--------------------------------------------------- + GCG MSF format +Title line + +picorna.msf MSF: 100 Type: P January 17, 1991 17:53 Check: 541 +.. +Name: Cb3 Len: 100 Check: 7009 Weight: 1.00 +Name: E Len: 100 Check: 60 Weight: 1.00 + +// + + 1 50 +Cb3 ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet + E gvenae.kgv tentna.tad fvaqpvylpe .nqt...... 
kv.affynrs + + 51 100 + +Cb3 ghtsqvvpgd tmqtrhvkny hsrsestien flcrsacvyf teykn..... + E ...spi.gaf tvks...... gs.lesgfap .fsngtc.pn sviltpgpqf + +--------------------------------------------------- + PIR format +This is NBRF-PIR MAILSERVER version 1.45 +Command-> get PIR3:A31391 +\\\ +ENTRY A31391 #Type Protein +TITLE *Esterase-6 - Fruit fly (Drosophila melanogaster) + +DATE 03-Aug-1992 #Sequence 03-Aug-1992 #Text 03-Aug-1992 +PLACEMENT 0.0 0.0 0.0 0.0 0.0 +COMMENT *This entry is not verified. +SOURCE Drosophila melanogaster + +REFERENCE + #Authors Cooke P.H., Oakeshott J.G. + #Citation submitted to GenBank, April 1989 + #Reference-number A31391 + #Accession A31391 + #Cross-reference GB:J04167 + +SUMMARY #Molecular-weight 61125 #Length 544 #Checksum 1679 +SEQUENCE + 5 10 15 20 25 30 + 1 M N Y V G L G L I I V L S C L W L G S N A S D T D D P L L V + 31 Q L P Q G K L R G R D N G S Y Y S Y E S I P Y A E P P T G D + 61 L R F E A P E P Y K Q K W S D I F D A T K T P V A C L Q W D + 91 Q F T P G A N K L V G E E D C L T V S V Y K P K N S K R N S + 121 F P V V A H I H G G A F M F G A A W Q N G H E N V M R E G K + 151 F I L V K I S Y R L G P L G F V S T G D R D L P G N Y G L K + 181 D Q R L A L K W I K Q N I A S F G G E P Q N V L L V G H S A + 211 G G A S V H L Q M L R E D F G Q L A R A A F S F S G N A L D + 241 P W V I Q K G A R G R A F E L G R N V G C E S A E D S T S L + 271 K K C L K S K P A S E L V T A V R K F L I F S Y V P F A P F + 301 S P V L E P S D A P D A I I T Q D P R D V I K S G K F G Q V + 331 P W A V S Y V T E D G G Y N A A L L L K E R K S G I V I D D + 361 L N E R W L E L A P Y L L F Y R D T K T K K D M D D Y S R K + 391 I K Q E Y I G N Q R F D I E S Y S E L Q R L F T D I L F K N + 421 S T Q E S L D L H R K Y G K S P A Y A Y V Y D N P A E K G I + 451 A Q V L A N R T D Y D F G T V H G D D Y F L I F E N F V R D + 481 V E M R P D E Q I I S R N F I N M L A D F A S S D N G S L K + 511 Y G E C D F K D N V G S E K F Q L L A I Y I D G C Q N R Q H + 541 V E F P +/// 
+\\\ +--------------------------------------------------- +PAUP format: +The NEXUS Format + +Every block starts with "BEGIN blockname;" and ends with "END;". +Each block is composed of one or more statements, each +terminated by a semicolon (;). + +Comments may be included in NEXUS files by enclosing them within +square brackets, as in "[This is a comment]." + +NEXUS-conforming files are identified by a "#NEXUS" directive at +the very beginning of the file (line 1, column 1). If the +#NEXUS is omitted PAUP issues a warning but continues +processing. + +NEXUS files are entirely free-format. Blanks, tabs, and +newlines may be placed anywhere in the file. Unless RESPECTCASE +is requested, commands and data may be entered in upper case, +lower case, or a mixture of upper and lower case. + +The following conventions are used in the syntax descriptions of +the various blocks. Upper-case items are entered exactly as +shown. Lower-case items inside of angle brackets -- e.g., <x> +-- represent items to be substituted by the user. Items inside +of square brackets -- e.g., [X] -- are optional. Items inside +of curly braces and separated by vertical bars -- e.g., { X | Y +| Z } -- are mutually exclusive options. + + +The DATA Block + +The DATA block contains the data matrix and other associated +information. Its syntax is: + +BEGIN DATA; +DIMENSIONS NTAX=<number of taxa> NCHAR=<number of characters>; + [ FORMAT [ MISSING=<missing-symbol> ] + [ LABELPOS={ LEFT | RIGHT } ] + [ SYMBOLS="<symbols-list>" ] + [ INTERLEAVE ] + [ MATCHCHAR=<match-symbol> ] + [ EQUATE="<symbol>=<expansion> [<symbol>=<expansion>...]" ] + [ TRANSPOSE ] + [ RESPECTCASE ] + [ DATATYPE = { STANDARD | DNA | RNA | PROTEIN } ]; ] + [ OPTIONS [ IGNORE={ INVAR | UNINFORM } ] + [ MSTAXA = { UNCERTAIN | POLYMORPH | VARIABLE } ] + [ ZAP = "<list of zapped characters>" ] ; ] + [ CHARLABELS <label_1> <label_2> ... <label_NCHAR> ; ] + [ TAXLABELS <label_1> <label_2> ... <label_NTAX> ; ] + [ STATELABELS <state-label-definitions> ; ] + MATRIX <data-matrix> ; + END; + +--- example PAUP file + +#NEXUS + +[!Brown et al. (1982) primate mitochondrial DNA] + +begin data; + dimensions ntax=5 nchar=896; + format datatype=dna matchchar=.
interleave missing='-'; + matrix +[ 2 4 6 8 ] +[ 1 1 1 1 1 ] +human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcctcattactatt ctgcctagcaaactcaaact acgaacgcactcacagtcgc +chimp ................a.t. .c.................a ...............t.... ..................t. .t........c......... +gorilla ..................tg ....t.....t........a ........a......t.... .................... .......a..c.....c... +orang ................ac.. cc.....g..t.....t..a ..c........cc....g.. .................... .......a..c.....c... +gibbon ......t..a..t...ac.g .c.................a ..a..c..t..cc.g..... ......t............. .......a........c... + +[ 8 8 8 8 8 8 ] +[ 0 2 4 6 8 9 ] +[ 1 1 1 1 1 6 ] +human cttccccacaacaatattca tgtgcctagaccaagaagtt attatctcgaactgacactg agccacaacccaaacaaccc agctctccctaagctt +chimp t................... .a................c. ........a.....g..... ...a................ ................ +gorilla ..................tc .a................c. ........a.g......... ...a.............tt. .a.............. +orang ta....a...........t. .c.......ga......acc ..cg..a.a......tg... .a.a..c.....g...cta. .a.....a........ +gibbon a..t.......t........ ....ac...........acc .....t..a........... .a.tg..........gctag .a.............. + ; +end; +--------------------------------------------------- + + + + + + +||||||||||| Sample SMTP mail header +--------------------------------------------------- + +- - - - - - - - - +From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:56 1991 +Received: from genbank.bio.net by sunflower.bio.indiana.edu + (4.1/9.5jsm) id AA19328; Sun, 10 Nov 91 17:28:55 EST +Received: by genbank.bio.net (5.65/IG-2.0) + id AA14458; Sun, 10 Nov 91 14:30:03 -0800 +Date: Sun, 10 Nov 91 14:30:03 -0800 +Message-Id: <9111102230.AA14458@genbank.bio.net> +From: Database Server +To: gilbertd@sunflower.bio.indiana.edu +Subject: Results of Query for drorna +Status: R + +No matches on drorna. 
+- - - - - - +From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:49 1991 +Received: from genbank.bio.net by sunflower.bio.indiana.edu + (4.1/9.5jsm) id AA19323; Sun, 10 Nov 91 17:28:47 EST +Received: by genbank.bio.net (5.65/IG-2.0) + id AA14461; Sun, 10 Nov 91 14:30:03 -0800 +Date: Sun, 10 Nov 91 14:30:03 -0800 +Message-Id: <9111102230.AA14461@genbank.bio.net> +From: Database Server +To: gilbertd@sunflower.bio.indiana.edu +Subject: Results of Query for droest6 +Status: R + +LOCUS DROEST6 1819 bp ss-mRNA INV 31-AUG-1987 +DEFINITION D.melanogaster esterase-6 mRNA, complete cds. +ACCESSION M15961 + + + + + + + + + + + + +||||||||||| GCG manual discussion of sequence symbols: +--------------------------------------------------- + +III_SEQUENCE_SYMBOLS + + + GCG programs allow all upper and lower case letters, periods (.), +asterisks (*), pluses (+), ampersands (&), and ats (@) as symbols in +biological sequences. Nucleotide symbols, their complements, and the +standard one-letter amino acid symbols are shown below in separate lists. +The meanings of the symbols +, &, and @ have not been assigned at this +writing (March, 1989). + + GCG uses the letter codes for amino acid codes and nucleotide +ambiguity proposed by IUB (Nomenclature Committee, 1985, +Eur. J. Biochem. 150; 1-5). These codes are compatible with the codes +used by the EMBL, GenBank, and NBRF data libraries. + + + NUCLEOTIDES + + The meaning of each symbol, its complement, and the Cambridge and +Stanford equivalents are shown below. Cambridge files can be converted +into GCG files and vice versa with the programs FROMSTADEN and TOSTADEN. +IntelliGenetics sequence files can be interconverted with the programs +FROMIG and TOIG. 
+ +IUB/GCG Meaning Complement Staden/Sanger Stanford + + A A T A A + C C G C C + G G C G G + T/U T A T T/U + M A or C K 5 J + R A or G Y R R + W A or T W 7 L + S C or G S 8 M + Y C or T R Y Y + K G or T M 6 K + V A or C or G B not supported N + H A or C or T D not supported N + D A or G or T H not supported N + B C or G or T V not supported N + X/N G or A or T or C X -/X N + . not G or A or T or C . not supported ? + + + The frame ambiguity codes used by Staden are not supported by GCG +and are translated by FROMSTADEN as the lower case single base +equivalent. + + Staden Code Meaning GCG + + D C or CC c + V T or TT t + B A or AA a + H G or GG g + K C or CX c + L T or TX t + M A or AX a + N G or GX g + + + AMINO ACIDS + + Here is a list of the standard one-letter amino acid codes and their +three-letter equivalents. The synonymous codons and their depiction in +the IUB codes are shown. You should recognize that the codons following +semicolons (;) are not sufficiently specific to define a single amino +acid even though they represent the best possible back translation into +the IUB codes! All of the relationships in this list can be redefined by +the user in a local data file described below. 
+ + IUB +Symbol 3-letter Meaning Codons Depiction + A Ala Alanine GCT,GCC,GCA,GCG !GCX + B Asp,Asn Aspartic, + Asparagine GAT,GAC,AAT,AAC !RAY + C Cys Cysteine TGT,TGC !TGY + D Asp Aspartic GAT,GAC !GAY + E Glu Glutamic GAA,GAG !GAR + F Phe Phenylalanine TTT,TTC !TTY + G Gly Glycine GGT,GGC,GGA,GGG !GGX + H His Histidine CAT,CAC !CAY + I Ile Isoleucine ATT,ATC,ATA !ATH + K Lys Lysine AAA,AAG !AAR + L Leu Leucine TTG,TTA,CTT,CTC,CTA,CTG +!TTR,CTX,YTR;YTX + M Met Methionine ATG !ATG + N Asn Asparagine AAT,AAC !AAY + P Pro Proline CCT,CCC,CCA,CCG !CCX + Q Gln Glutamine CAA,CAG !CAR + R Arg Arginine CGT,CGC,CGA,CGG,AGA,AGG +!CGX,AGR,MGR;MGX + S Ser Serine TCT,TCC,TCA,TCG,AGT,AGC !TCX,AGY;WSX + T Thr Threonine ACT,ACC,ACA,ACG !ACX + V Val Valine GTT,GTC,GTA,GTG !GTX + W Trp Tryptophan TGG !TGG + X Xxx Unknown !XXX + Y Tyr Tyrosine TAT, TAC !TAY + Z Glu,Gln Glutamic, + Glutamine GAA,GAG,CAA,CAG !SAR + * End Terminator TAA, TAG, TGA !TAR,TRA;TRR + + + + + + + + +||||||||||| docs from PSC on sequence formats: +--------------------------------------------------- + + + Nucleic Acid and Protein Sequence File Formats + + +It will probably save you some time if you have your data in a usable +format before you send it to us. However, we do have the University of +Wisconsin Genetics Computing Group programs running on our VAXen and +this package includes several reformatting utilities. Our programs +usually recognize any of several standard formats, including GenBank, +EMBL, NBRF, and MolGen/Stanford. For the purposes of annotating an +analysis we find the GenBank and EMBL formats most useful, particularly +if you have already received an accession number from one of these +organizations for your sequence. + +Our programs do not require that all of the line types available in +GenBank, EMBL, or NBRF file formats be present for the file format to +be recognized and processed. 
The following pages outline the essential +details required for correct processing of files by our programs. +Additional information may be present but will generally be ignored. + + + GenBank File Format + +File Header + +1. The first line in the file must have "GENETIC SEQUENCE DATA BANK" + in spaces 20 through 46 (see LINE 1, below). +2. The next 8 lines may contain arbitrary text. They are ignored but + are required to maintain the GenBank format (see LINE 2 - LINE 9). + +Sequence Data Entries + +3. Each sequence entry in the file should have the following format. + a) first line: Must have LOCUS in the first 5 spaces. The + genetic locus name or identifier must be in spaces + 13 - 22. The length of the sequences is right + justified in spaces 23 through 29 (see LINE 10). + b) second line: Must have DEFINITION in the first 10 spaces. + Spaces 13 - 80 are free form text to identify the + sequence (see LINE 11). + c) third line: Must have ACCESSION in the first 9 spaces. Spaces + 13 - 18 must hold the primary accession number + (see LINE 12). + d) fourth line: Must have ORIGIN in the first 6 spaces. Nothing + else is required on this line, it indicates that + the nucleic acid sequence begins on the next line + (see LINE 13). + e) fifth line: Begins the nucleotide sequence. The first 9 + spaces of each sequence line may either be blank + or may contain the position in the sequence of the + first nucleotide on the line. The next 66 spaces + hold the nucleotide sequence in six blocks of ten + nucleotides. Each of the six blocks begins with a + blank space followed by ten nucleotides. Thus the + first nucleotide is in space eleven of the line while + the last is in space 75 (see LINE 14, LINE 15). + f) last line: Must have // in the first 2 spaces to indicate + termination of the sequence (see LINE 16). + +NOTE: Multiple sequences may appear in each file. To begin another + sequence go back to a) and start again. 
+ + + Example GenBank file + + +LINE 1 : GENETIC SEQUENCE DATA BANK +LINE 2 : +LINE 3 : +LINE 4 : +LINE 5 : +LINE 6 : +LINE 7 : +LINE 8 : +LINE 9 : +LINE 10 :LOCUS L_Name Length BP +LINE 11 :DEFINITION Describe the sequence any way you want +LINE 12 :ACCESSION Accession Number +LINE 13 :ORIGIN +LINE 14 : 1 acgtacgtac gtacgtacgt acgtacgtac gtacgtacgt a... +LINE 15 : 61 acgt... +LINE 16 :// + + + + EMBL File Format + +Unlike the GenBank file format the EMBL file format does not require +a series of header lines. Thus the first line in the file begins +the first sequence entry of the file. + +1. The first line of each sequence entry contains the two letters ID + in the first two spaces. This is followed by the EMBL identifier + in spaces 6 through 14. (See LINE 1). + +2. The second line of each sequence entry has the two letters AC in + the first two spaces. This is followed by the accession number in + spaces 6 through 11. (See LINE 2). + +3. The third line of each sequence entry has the two letters DE in the + first two spaces. This is followed by a free form text definition + in spaces 6 through 72. (See LINE 3). + +4. The fourth line in each sequence entry has the two letters SQ in + the first two spaces. This is followed by the length of the + sequence beginning at or after space 13. After the sequence length + there is a blank space and the two letters BP. (See LINE 4). + +5. The nucleotide sequence begins on the fifth line of the sequence + entry. Each line of sequence begins with four blank spaces. The + next 66 spaces hold the nucleotide sequence in six blocks of ten + nucleotides. Each of the six blocks begins with a blank space + followed by ten nucleotides. Thus the first nucleotide is in space + 6 of the line while the last is in space 70. (See LINE 5 - + LINE 6). + +6. The last line of each sequence entry in the file is a terminator + line which has the two characters // in the first two spaces. + (See LINE 7). + +7. 
Multiple sequences may appear in each file. To begin another + sequence go back to item 1 and start again. + + + Example EMBL file + +LINE 1 :ID ID_name +LINE 2 :AC Accession number +LINE 3 :DE Describe the sequence any way you want +LINE 4 :SQ Length BP +LINE 5 : ACGTACGTAC GTACGTACGT ACGTACGTAC GTACGTA... +LINE 6 : ACGT... +LINE 7 :// + + + + NBRF (protein or nucleic acid) File Format + +1. The first line of each sequence entry begins with a greater than + symbol, >. This is immediately followed by the two character + sequence type specifier. Space four must contain a semi-colon. + Beginning in space five is the sequence name or identification code + for the NBRF database. The code is from four to six letters and + numbers. (See LINE 1). + +!!!! >> add these to readseq + Specifier Sequence type + + P1 protein, complete + F1 protein, fragment + DL DNA, linear + DC DNA, circular + RL RNA, linear + RC RNA, circular + N1 functional RNA, other than tRNA + N3 tRNA + +2. The second line of each sequence entry contains two kinds of + information. First is the sequence name which is separated from + the organism or organelle name by the three character sequence + blank space, dash, blank space, " - ". There is no special + character marking the beginning of this line. (See LINE 2). + +3. Either the amino acid or nucleic acid sequence begins on line three + and can begin in any space, including the first. The sequence is + free format and may be interrupted by blanks for ease of reading. + Protein sequences may contain special punctuation to indicate + various indeterminacies in the sequence. In the NBRF data files + all lines may be up to 500 characters long. However some PSC + programs currently have a limit of 130 characters per line + (including blanks), and BitNet will not accept lines of over eighty + characters. (See LINE 3, LINE 4, and LINE 5). + + The last character in the sequence must be an asterisk, *.
+ + Example NBRF file + + LINE 1 :>P1;CBRT + LINE 2 :Cytochrome b - Rat mitochondrion (SGC1) + LINE 3 :M T N I R K S H P L F K I I N H S F I D L P A P S + LINE 4 : VTHICRDVN Y GWL IRY + LINE 5 :TWIGGQPVEHPFIIIGQLASISYFSIILILMPISGIVEDKMLKWN* + + + + MolGen/Stanford File Format + +1. The first line in a sequence file is a comment line. This line + begins with a semi-colon in the first space. This line need + not be present. If it is present it holds descriptive text. + There may be as many comment lines as desired at the start of the + sequence file. (See LINE 1). + +2. The second line must be present and contains an identifier or + name for the sequence in the first ten spaces. (See LINE 2). + +3. The sequence begins on the third line and occupies up to eighty + spaces. Spaces may be included in the sequence for ease of + reading. The sequence continues for as many lines as needed + and is terminated with a 1 or 2. 1 indicates a linear sequence + while 2 marks a circular sequence. (See LINE 3 and LINE 4). + + Example MolGen/Stanford file + +LINE 1 :; Describe the sequence any way you want +LINE 2 :ECTRNAGLY2 +LINE 3 :ACGCACGTAC ACGTACGTAC A C G T C C G T ACG TAC GTA CGT +LINE 4 : GCTTA GG G C T A1 + + + + +||||||||||| Phylip file format +--------------------------------------------------- + + Phylip 3.3 File Format (DNA sequences) + + + The input and output formats for PROTPARS and for RESTML are described in +their document files. In general their input formats are similar to those +described here, except that the one-letter codes for data are specific to those +programs and are described in those document files. Since the input formats +for the eight DNA sequence programs apply to all eight, they are described +here. Their input formats are standard: the data have A's, G's, C's and T's +(or U's). The first line of the input file contains the number of species and +the number of sites. As with the other programs, options information may +follow this.
In the case of DNAML, DNAMLK, and DNADIST an additional line +(described in the document file for these programs) may follow the first one. +Following this, each species starts on a new line. The first 10 characters of +that line are the species name. There then follows the base sequence of that +species, each character being one of the letters A, B, C, D, G, H, K, M, N, O, +R, S, T, U, V, W, X, Y, ?, or - (a period was also previously allowed but it is +no longer allowed, because it is sometimes used in aligned sequences to mean +"the same as the sequence above"). Blanks will be ignored, and so will +numerical digits. This allows GENBANK and EMBL sequence entries to be read +with minimum editing. + + These characters can be either upper or lower case. The algorithms +convert all input characters to upper case (which is how they are treated). +The characters constitute the IUPAC (IUB) nucleic acid code plus some slight +extensions. They enable input of nucleic acid sequences taking full account of +any ambiguities in the sequence. + +The sequences can continue over multiple lines; when this is done the sequences +must be either in "interleaved" format, similar to the output of alignment +programs, or "sequential" format. These are described in the main document +file. In sequential format all of one sequence is given, possibly on multiple +lines, before the next starts. In interleaved format the first part of the +file should contain the first part of each of the sequences, then possibly a +line containing nothing but a carriage-return character, then the second part +of each sequence, and so on. Only the first parts of the sequences should be +preceded by names. Here is a hypothetical example of interleaved format: + + 5 42 +Turkey AAGCTNGGGC ATTTCAGGGT +Salmo gairAAGCCTTGGC AGTGCAGGGT +H.
SapiensACCGGTTGGC CGTTCAGGGT +Chimp AAACCCTTGC CGTTACGCTT +Gorilla AAACCCTTGC CGGTACGCTT + +GAGCCCGGGC AATACAGGGT AT +GAGCCGTGGC CGGGCACGGT AT +ACAGGTTGGC CGTTCAGGGT AA +AAACCGAGGC CGGGACACTC AT +AAACCATTGC CGGTACGCTT AA + +while in sequential format the same sequences would be: + + 5 42 +Turkey AAGCTNGGGC ATTTCAGGGT +GAGCCCGGGC AATACAGGGT AT +Salmo gairAAGCCTTGGC AGTGCAGGGT +GAGCCGTGGC CGGGCACGGT AT +H. SapiensACCGGTTGGC CGTTCAGGGT +ACAGGTTGGC CGTTCAGGGT AA +Chimp AAACCCTTGC CGTTACGCTT +AAACCGAGGC CGGGACACTC AT +Gorilla AAACCCTTGC CGGTACGCTT +AAACCATTGC CGGTACGCTT AA + + +Note, of course, that a portion of a sequence like this: + + 300 AAGCGTGAAC GTTGTACTAA TRCAG + +is perfectly legal, assuming that the species name has gone before, and is +filled out to full length by blanks. The above digits and blanks will be +ignored, the sequence being taken as starting at the first base symbol (in this +case an A). + + The present versions of the programs may sometimes have difficulties with +the blank lines between groups of lines, and if so you might want to retype +those lines, making sure that they have only a carriage-return and no blank +characters on them, or you may perhaps have to eliminate them. The symptoms of +this problem are that the programs complain that the sequences are not properly +aligned, and you can find no other cause for this complaint. 
+ +------------------------------------------------ + + +||||||||||| ASN.1 file format +--------------------------------------------------- + + +ASN.1 -- see NCBI toolkit docs, source and examples (ncbi.nlm.nih.gov) + +Example asn.1 sequence file---- + +Bioseq-set ::= { +seq-set { + seq { + id { local id 1 } , -- id essential + descr { title "Dummy sequence data from nowhere" } , -- optional + inst { -- inst essential + repr raw , + mol dna , + length 156 , + topology linear , + seq-data + iupacna "GAATTCATTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA +TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG +TGGATTCAAAGCAATAGAGTTGTTCTT" + } } , + + seq { + id { local id 2 } , + descr { title "Dummy sequence 2 data from somewhere else" } , + inst { + repr raw , + mol dna , + length 150 , + topology linear , + seq-data + iupacna "TTTTTTTTTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA +TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG +TGGATTCAAAGCAATAGAGTT" + } + } + } + } + + +partial ASN.1 description from toolkit + +Bioseq ::= SEQUENCE { + id SET OF Seq-id , -- equivalent identifiers + descr Seq-descr OPTIONAL , -- descriptors + inst Seq-inst , -- the sequence data + annot SET OF Seq-annot OPTIONAL } + +Seq-inst ::= SEQUENCE { -- the sequence data itself + repr ENUMERATED { -- representation class + not-set (0) , -- empty + virtual (1) , -- no seq data + raw (2) , -- continuous sequence + seg (3) , -- segmented sequence + const (4) , -- constructed sequence + ref (5) , -- reference to another sequence + consen (6) , -- consensus sequence or pattern + map (7) , -- ordered map (genetic, restriction) + other (255) } , + mol ENUMERATED { -- molecule class in living organism + not-set (0) , -- > cdna = rna + dna (1) , + rna (2) , + aa (3) , + na (4) , -- just a nucleic acid + other (255) } , + length INTEGER OPTIONAL , -- length of sequence in residues + fuzz Int-fuzz OPTIONAL , -- length uncertainty + topology ENUMERATED 
{ -- topology of molecule + not-set (0) , + linear (1) , + circular (2) , + tandem (3) , -- some part of tandem repeat + other (255) } DEFAULT linear , + strand ENUMERATED { -- strandedness in living organism + not-set (0) , + ss (1) , -- single strand + ds (2) , -- double strand + mixed (3) , + other (255) } OPTIONAL , -- default ds for DNA, ss for RNA, pept + seq-data Seq-data OPTIONAL , -- the sequence + ext Seq-ext OPTIONAL , -- extensions for special types + hist Seq-hist OPTIONAL } -- sequence history + +------------------------------------------------ diff --git a/Make.com b/Make.com new file mode 100644 index 0000000..82da18a --- /dev/null +++ b/Make.com @@ -0,0 +1,63 @@ +$! +$!VAX-VMS cc make file for readseq +$! +$ echo := write sys$output +$ if p1.eqs."TEST" then goto tests +$ +$ echo "compiling readseq..." +$ cc readseq, ureadseq +$! +$ echo "linking readseq..." +$ link readseq, ureadseq, sys$library:vaxcrtl/lib +$! +$tests: +$! +$ echo "defining readseq symbol:" +$ dd = f$environment("default") +$ readseq :== $ 'dd'readseq.exe +$ show symbol readseq +$! +$ echo "" +$ echo "test for general read/write of all chars:" +$ readseq -p alphabet.std -otest.alpha +$ diff test.alpha alphabet.std +$! +$ echo "" +$ echo "test for valid format conversions" +$! +$ readseq -v -p -f=ig nucleic.std -otest.ig +$ readseq -v -p -f=gb test.ig -otest.gb +$ readseq -v -p -f=nbrf test.gb -otest.nbrf +$ readseq -v -p -f=embl test.nbrf -otest.embl +$ readseq -v -p -f=gcg test.embl -otest.gcg +$ readseq -v -p -f=strider test.gcg -otest.strider +$ readseq -v -p -f=fitch test.strider -otest.fitch +$ readseq -v -p -f=fasta test.fitch -otest.fasta +$ readseq -v -p -f=pir test.fasta -otest.pir +$ readseq -v -p -f=ig test.pir -otest.ig-b +$ diff test.ig test.ig-b +$! 
+$ echo "" +$ echo "Test for multiple-sequence format conversions:" +$ readseq -p -f=ig multi.std -otest.m-ig +$ readseq -p -f=gb test.m-ig -otest.m-gb +$ readseq -p -f=nbrf test.m-gb -otest.m-nbrf +$ readseq -p -f=embl test.m-nbrf -otest.m-embl +$ readseq -p -f=fasta test.m-embl -otest.m-fasta +$ readseq -p -f=pir test.m-fasta -otest.m-pir +$ readseq -p -f=msf test.m-pir -otest.m-msf +$ readseq -p -f=paup test.m-msf -otest.m-paup +$ readseq -p -f=ig test.m-paup -otest.m-ig-b +$ diff test.m-ig test.m-ig-b +$ echo "" +$ echo "Expect differences in the header lines due to" +$ echo "different format headers. If any sequence lines" +$ echo "differ, or if checksums differ, there is a problem." +$! +$! #cleanup +$! delete test.*; +$ echo "-----------" +$ echo "" +$ echo "To clean up test files, command me: +$ echo " DELETE test.*;" +$! diff --git a/Make.ncbi b/Make.ncbi new file mode 100644 index 0000000..c502210 --- /dev/null +++ b/Make.ncbi @@ -0,0 +1,109 @@ +# +# Unix Makefile for readseq +# to use, command me: +# % make -- or -- +# % make CC=your-c-compiler-name +# + +# pick an ANSI C compiler (the default Sun CC is not ANSI) +CC=gcc # Gnu C Compiler +#CC=cc # SGI Irix +#CC=vcc # some DEC Ultrix + +CFLAGS= +#CFLAGS= -DSMALLCHECKSUM # if you prefer to use a GCG-standard 13 bit checksum +# instead of a full 32 bit checksum. This may enhance compatibility w/ GCG software + +SOURCES= readseq.c ureadseq.c ureadseq.h ureadasn.c +DOCS= Readme readseq.help Formats Stdfiles Makefile Make.com add.gdemenu *.std + + +# NCBI toolkit support for ASN.1 reader + +# this is path to NCBI toolkit, you must set for your system: +NCBI=/bio/mb/ncbi +# +OTHERLIBS=-lm +LIB1=-lncbi +LIB2=-lncbiobj +LIB3=-lncbicdr +LIB4=-lvibrant +INCPATH=$(NCBI)/include +LIBPATH=$(NCBI)/lib +NCFLAGS=$(CFLAGS) -DNCBI -I$(INCPATH) +NLDFLAGS=-I$(INCPATH) -L$(LIBPATH) +NLIBS=$(LIB1) $(LIB2) $(OTHERLIBS) + + +all: build test + +#build: $(SOURCES) +# @echo "Compiling readseq..." 
+# $(CC) $(CFLAGS) -o readseq readseq.c ureadseq.c + +# if using NCBI, uncomment these lines in place of build: above +build: $(SOURCES) + @echo "Compiling readseq with NCBI toolkit support..."; + $(CC) -o readseq $(NLDFLAGS) $(NCFLAGS) readseq.c ureadseq.c ureadasn.c $(NLIBS) + +test: $(SOURCES) readseq + @echo "" + @echo "Test for general read/write of all chars:" + ./readseq -p alphabet.std -otest.alpha + -diff test.alpha alphabet.std + + @echo "" + @echo "Test for valid format conversions:" + ./readseq -v -p -f=ig nucleic.std -otest.ig + ./readseq -v -p -f=gb test.ig -otest.gb + ./readseq -v -p -f=nbrf test.gb -otest.nbrf + ./readseq -v -p -f=embl test.nbrf -otest.embl + ./readseq -v -p -f=gcg test.embl -otest.gcg + ./readseq -v -p -f=strider test.gcg -otest.strider + ./readseq -v -p -f=fitch test.strider -otest.fitch + ./readseq -v -p -f=fasta test.fitch -otest.fasta + ./readseq -v -p -f=pir test.fasta -otest.pir + ./readseq -v -p -f=ig test.pir -otest.ig-b + -diff test.ig test.ig-b + + @echo "" + @echo "Test for multiple-sequence format conversions:" + ./readseq -p -f=ig multi.std -otest.m-ig + ./readseq -p -f=gb test.m-ig -otest.m-gb + ./readseq -p -f=nbrf test.m-gb -otest.m-nbrf + ./readseq -p -f=embl test.m-nbrf -otest.m-embl + ./readseq -p -f=fasta test.m-embl -otest.m-fasta + ./readseq -p -f=pir test.m-fasta -otest.m-pir + ./readseq -p -f=msf test.m-pir -otest.m-msf + ./readseq -p -f=paup test.m-msf -otest.m-paup + ./readseq -p -f=ig test.m-paup -otest.m-ig-b + -diff test.m-ig test.m-ig-b +# +# if using NCBI, uncomment these lines + @echo "" + @echo "Test of NCBI ASN.1 conversions:" + ./readseq -p -f=asn test.m-ig -otest.m-asn + ./readseq -p -f=ig test.m-asn -otest.m-ig-c + -diff test.m-ig test.m-ig-c +# + @echo "" + @echo "Expect differences in the header lines due to" + @echo "different format headers. If any sequence lines" + @echo "differ, or if the checksums differ, there is a problem." 
+ @echo "----------------------" + @echo "" + @echo "To clean up test files, command me:" + @echo " make clean" + + +clean: + rm -f *.o core test.* + +shar: + @echo "shell archiving files..." + -rm -f readseq*.shar + mkdir readseqd + cp $(SOURCES) readseqd + cp $(DOCS) readseqd + shar -v readseqd > readseq.shar + rm -rf readseqd diff --git a/Readme b/Readme new file mode 100644 index 0000000..6efd1f4 --- /dev/null +++ b/Readme @@ -0,0 +1,160 @@ + + * ReadSeq -- 1 Feb 93 + * + * Reads and writes nucleic/protein sequences in various + * formats. Data files may have multiple sequences. + * + * Copyright 1990 by d.g.gilbert + * biology dept., indiana university, bloomington, in 47405 + * e-mail: gilbertd@bio.indiana.edu + * + * This program may be freely copied and used by anyone. + * Developers are encouraged to incorporate parts in their + * programs, rather than devise their own private sequence + * format. + * + * This should compile and run with any ANSI C compiler. + * Please advise me of any bugs, additions or corrections. + +Readseq has been updated. There have been a number of enhancements +and a few bug corrections since the previous general release in Nov 91 +(see below). If you are using earlier versions, I recommend you update to +this release. + +Readseq is particularly useful as it automatically detects many +sequence formats, and interconverts among them. +Formats added to this release include + + MSF multi sequence format used by GCG software + + PAUP's multiple sequence (NEXUS) format + + PIR/CODATA format used by PIR + + ASN.1 format used by NCBI + + Pretty print with various options for nice looking output. + +As well, Phylip format can now be used as input. Options to +reverse-complement and to degap sequences have been added. A menu +addition for users of the GDE sequence editor is included.
+ +This program is available thru Internet gopher, as + + gopher ftp.bio.indiana.edu + browse into the IUBio-Software+Data/molbio/readseq/ folder + select the readseq.shar document + +Or thru anonymous FTP in this manner: + my_computer> ftp ftp.bio.indiana.edu (or IP address 129.79.224.25) + username: anonymous + password: my_username@my_computer + ftp> cd molbio/readseq + ftp> get readseq.shar + ftp> bye + +readseq.shar is a Unix shell archive of the readseq files. +This file can be edited by any text editor to reconstitute the +original files, for those who do not have a Unix system or an +Unshar program. Read the top of this .shar file for further +instructions. + +There are also pre-compiled executables for the following computers: +Silicon Graphics Iris, Sparc (Sun Sparcstation & clones), VMS-Vax, +Macintosh. Use binary ftp to transfer these, except Macintosh. The +Mac version is just the command-line program in a window, not very +handy. + +C source files: + readseq.c ureadseq.c ureadasn.c ureadseq.h +Document files: + Readme (this doc) + Readseq.help (longer than this doc) + Formats (description of sequence file formats) + add.gdemenu (GDE program users can add this to the .GDEmenu file) + Stdfiles -- test sequence files + Makefile -- Unix make file + Make.com -- VMS make file + *.std -- files for testing validity of readseq + + +Example usage: + readseq + -- for interactive use + readseq my.1st.seq my.2nd.seq -all -format=genbank -output=my.gb + -- convert all of two input files to one genbank format output file + readseq my.seq -all -form=pretty -nameleft=3 -numleft -numright -numtop -match + -- output to standard output a file in a pretty format + readseq my.seq -item=9,8,3,2 -degap -CASE -rev -f=msf -out=my.rev + -- select 4 items from input, degap, reverse, and uppercase them + cat *.seq | readseq -pipe -all -format=asn > bunch-of.asn + -- pipe a bunch of data thru readseq, converting all to asn + + +The brief usage of readseq is as follows.
The "[]" denote +optional parts of the syntax: + + readseq -help +readSeq (27Dec92), multi-format molbio sequence reader. +usage: readseq [-options] in.seq > out.seq + options + -a[ll] select All sequences + -c[aselower] change to lower case + -C[ASEUPPER] change to UPPER CASE + -degap[=-] remove gap symbols + -i[tem=2,3,4] select Item number(s) from several + -l[ist] List sequences only + -o[utput=]out.seq redirect Output + -p[ipe] Pipe (command line, stdout) + -r[everse] change to Reverse-complement + -v[erbose] Verbose progress + -f[ormat=]# Format number for output, or + -f[ormat=]Name Format name for output: + 1. IG/Stanford 10. Olsen (in-only) + 2. GenBank/GB 11. Phylip3.2 + 3. NBRF 12. Phylip + 4. EMBL 13. Plain/Raw + 5. GCG 14. PIR/CODATA + 6. DNAStrider 15. MSF + 7. Fitch 16. ASN.1 + 8. Pearson/Fasta 17. PAUP + 9. Zuker 18. Pretty (out-only) + + Pretty format options: + -wid[th]=# sequence line width + -tab=# left indent + -col[space]=# column space within sequence line on output + -gap[count] count gap chars in sequence numbers + -nameleft, -nameright[=#] name on left/right side [=max width] + -nametop name at top/bottom + -numleft, -numright seq index on left/right side + -numtop, -numbot index on top/bottom + -match[=.] use match base for 2..n species + -inter[line=#] blank line(s) between sequence blocks + + + +Recent changes: + +4 May 92 ++ added 32 bit CRC checksum as alternative to GCG 6.5bit checksum +Aug 92 += fixed Olsen format input to handle files w/ more sequences, + not to mess up when more than one seq has same identifier, + and to convert number masks to symbols. += IG format fix to understand ^L +30 Dec 92 +* revised command-line & interactive interface. Suggested form is now + readseq infile -format=genbank -output=outfile -item=1,3,4 ... + but remains compatible with prior commandlines: + readseq infile -f2 -ooutfile -i3 ... 
++ added GCG MSF multi sequence file format ++ added PIR/CODATA format ++ added NCBI ASN.1 sequence file format ++ added Pretty, multi sequence pretty output (only) ++ added PAUP multi seq format ++ added degap option ++ added Gary Williams (GWW, G.Williams@CRC.AC.UK) reverse-complement option. ++ added support for reading Phylip formats (interleave & sequential) +* string fixes, dropped need for compiler flags NOSTR, FIXTOUPPER, NEEDSTRCASECMP +* changed 32bit checksum to default, -DSMALLCHECKSUM for GCG version + +1Feb93 += reverted Genbank output format to fixed left margin + (change in 30 Dec release), so GDE and others relying on fixed margin + can read this. diff --git a/Readseq.help b/Readseq.help new file mode 100644 index 0000000..08fdc08 --- /dev/null +++ b/Readseq.help @@ -0,0 +1,229 @@ + + * ReadSeq.Help -- 30 Dec 92 + * + * Reads and writes nucleic/protein sequences in various + * formats. Data files may have multiple sequences. + * + * Copyright 1990 by d.g.gilbert + * biology dept., indiana university, bloomington, in 47405 + * e-mail: gilbertd@bio.indiana.edu + * + * This program may be freely copied and used by anyone. + * Developers are encouraged to incorporate parts in their + * programs, rather than devise their own private sequence + * format. + * + * This should compile and run with any ANSI C compiler. + * Please advise me of any bugs, additions or corrections. + +Readseq is particularly useful as it automatically detects many +sequence formats, and interconverts among them. + +Formats which readseq currently understands: + + * IG/Stanford, used by Intelligenetics and others + * GenBank/GB, genbank flatfile format + * NBRF format + * EMBL, EMBL flatfile format + * GCG, single sequence format of GCG software + * DNAStrider, for common Mac program + * Fitch format, limited use + * Pearson/Fasta, a common format used by Fasta programs and others + * Zuker format, limited use. Input only. + * Olsen, format printed by Olsen VMS sequence editor.
Input only. + * Phylip3.2, sequential format for Phylip programs + * Phylip, interleaved format for Phylip programs (v3.3, v3.4) + * Plain/Raw, sequence data only (no name, document, numbering) + + MSF multi sequence format used by GCG software + + PAUP's multiple sequence (NEXUS) format + + PIR/CODATA format used by PIR + + ASN.1 format used by NCBI + + Pretty print with various options for nice looking output. Output only. + +See the included "Formats" file for detail on file formats. + + +Example usage: + readseq + -- for interactive use + + readseq my.1st.seq my.2nd.seq -all -format=genbank -output=my.gb + -- convert all of two input files to one genbank format output file + + readseq my.seq -all -form=pretty -nameleft=3 -numleft -numright -numtop -match + -- output to standard output a file in a pretty format + + readseq my.seq -item=9,8,3,2 -degap -CASE -rev -f=msf -out=my.rev + -- select 4 items from input, degap, reverse, and uppercase them + + cat *.seq | readseq -pipe -all -format=asn > bunch-of.asn + -- pipe a bunch of data thru readseq, converting all to asn + + +The brief usage of readseq is as follows. The "[]" denote +optional parts of the syntax: + +readseq -help +readSeq (27Dec92), multi-format molbio sequence reader. +usage: readseq [-options] in.seq > out.seq + options + -a[ll] select All sequences + -c[aselower] change to lower case + -C[ASEUPPER] change to UPPER CASE + -degap[=-] remove gap symbols + -i[tem=2,3,4] select Item number(s) from several + -l[ist] List sequences only + -o[utput=]out.seq redirect Output + -p[ipe] Pipe (command line, stdout) + -r[everse] change to Reverse-complement + -v[erbose] Verbose progress + -f[ormat=]# Format number for output, or + -f[ormat=]Name Format name for output: + 1. IG/Stanford 10. Olsen (in-only) + 2. GenBank/GB 11. Phylip3.2 + 3. NBRF 12. Phylip + 4. EMBL 13. Plain/Raw + 5. GCG 14. PIR/CODATA + 6. DNAStrider 15. MSF + 7. Fitch 16. ASN.1 + 8. Pearson/Fasta 17. PAUP + 9. Zuker 18. 
Pretty (out-only) + + Pretty format options: + -wid[th]=# sequence line width + -tab=# left indent + -col[space]=# column space within sequence line on output + -gap[count] count gap chars in sequence numbers + -nameleft, -nameright[=#] name on left/right side [=max width] + -nametop name at top/bottom + -numleft, -numright seq index on left/right side + -numtop, -numbot index on top/bottom + -match[=.] use match base for 2..n species + -inter[line=#] blank line(s) between sequence blocks + + +Notes: + +In use, readseq will respond to command line arguments, or to +interactive use. Command line arguments cannot be combined +but must each follow a switch character (-). In this release, +the command line options are now words, with an equals (=) +to separate parameter(s) from the command. You cannot put a +space between a command and its parameter, as is usual for +Unix programs (this is to preserve compatibility with VMS). +The command line syntax of the earlier versions is still +supported. + +See the file Formats for details of the sequence formats which +are supported by readseq. The auto-detection feature of +readseq which distinguishes these formats looks for some of the +unique keywords and symbols that are found in each format. It +is not infallible at this, though it attempts to exclude unknown +formats. In general, if you feed to readseq a sequence file that +you know is one of these common formats, you are okay. If you feed +it data that might be oddball formats, or non-sequence data, +you might well get garbage results. Also, different developers +are always thinking up minor twists on these common formats +(like PAUP requiring a blank line between blocks of Phylip format, +or IG adding form feeds between sequences), which may cause hassles. + +In general, output supports only minimal subsets of each format +needed for sequence data exchanges. Features, descriptions +and other format-unique information is discarded.
+ +The pretty format requires additional options to generate a +nice output. Try the various pretty options to see what you like. +Pretty format is OUTPUT only, readseq cannot read a Pretty format +file. + +Readseq is NOT optimized for LARGE files. It generally makes several +reads thru each input file (one per sequence output at present, future +version may optimize this). It should handle input and output files +and sequences of any size, but will slow down quite a bit for very large +(multi megabyte) sized files. It is NOT recommended for converting +databanks or large subsets there-of. It is primarily directed at the +small files that researchers use to maintain their personal data, which +they frequently need to interconvert for the various analysis programs +which so frequently require a special format. + +Users of Olsen multi sequence editor (VMS). The Olsen format +here is produced with the print command: + print/out=some.file +Use Genbank output from readseq to produce a format that this +editor can read, and use the command + load/genbank some.file +Dan Davison has a VMS program that will convert to/from the +Olsen native binary data format. E-mail davison@uh.edu + +Warning: Phylip format input is now supported (30Dec92), however the +auto-detection of Phylip format is very probabilistic and messy, +especially distinguishing sequential from interleaved versions. It +is not recommended that one use readseq to convert files from Phylip +format to others unless essential. + + +This program is available thru Internet gopher, as + + gopher ftp.bio.indiana.edu + browse into the IUBio-Software+Data/molbio/readseq/ folder + select the readseq.shar document + +Or thru anonymous FTP in this manner: + my_computer> ftp ftp.bio.indiana.edu (or IP address 129.79.224.25) + username: anonymous + password: my_username@my_computer + ftp> cd molbio/readseq + ftp> get readseq.shar + ftp> bye + +readseq.shar is a Unix shell archive of the readseq files.
+This file can be edited by any text editor to reconstitute the +original files, for those who do not have a Unix system or an +Unshar program. Read the top of this .shar file for further +instructions. + +There are also pre-compiled executables for the following computers: +Silicon Graphics Iris, Sparc (Sun Sparcstation & clones), VMS-Vax, +Macintosh. Use binary ftp to transfer these, except Macintosh. The +Mac version is just the command-line program in a window, not very +handy. + +C source files: + readseq.c ureadseq.c ureadasn.c ureadseq.h + +Document files: + Readme (this doc) + Formats (description of sequence file formats) + add.gdemenu (GDE program users can add this to the .GDEmenu file) + Stdfiles -- test sequence files + Makefile -- Unix make file + Make.com -- VMS make file + *.std -- files for testing validity of readseq + + +Recent changes (see also readseq.c for all history of changes): + +4 May 92 ++ added 32 bit CRC checksum as alternative to GCG 6.5bit checksum +Aug 92 += fixed Olsen format input to handle files w/ more sequences, + not to mess up when more than one seq has same identifier, + and to convert number masks to symbols. += IG format fix to understand ^L +30 Dec 92 +* revised command-line & interactive interface. Suggested form is now + readseq infile -format=genbank -output=outfile -item=1,3,4 ... + but remains compatible with prior commandlines: + readseq infile -f2 -ooutfile -i3 ... ++ added GCG MSF multi sequence file format ++ added PIR/CODATA format ++ added NCBI ASN.1 sequence file format ++ added Pretty, multi sequence pretty output (only) ++ added PAUP multi seq format ++ added degap option ++ added Gary Williams (GWW, G.Williams@CRC.AC.UK) reverse-complement option.
++ added support for reading Phylip formats (interleave & sequential) +* string fixes, dropped need for compiler flags NOSTR, FIXTOUPPER, NEEDSTRCASECMP +* changed 32bit checksum to default, -DSMALLCHECKSUM for GCG version + + diff --git a/Stdfiles b/Stdfiles new file mode 100644 index 0000000..bd7efc5 --- /dev/null +++ b/Stdfiles @@ -0,0 +1,134 @@ +/* Stdfiles + generate standard files to test readseq +*/ + +C +#include +/* no sequence formats use chars > #126, ignore these */ +main(void) +{ + int c; + puts("> alphabet['!'..'~']"); + for (c = '!'; c <= '~'; c++) putc(c,stdout); + putc('\n', stdout); +} + +link -w -t MPST -c 'MPS ' c.o � + "{Libraries}"Interface.o "{Libraries}"ToolLibs.o � + "{Libraries}"Runtime.o "{CLibraries}"StdClib.o +link.out > alphabet.orig + + +C +#include +main(void) +{ +/* note: symbols "*" and "/" removed as terminators for various formats */ +const char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ"; +const char *primenuc = "ACGTU"; +const char *allsymbols = "_.-?<>{}[]()!@#$%^&=+;:'|`~\"\\"; + + char *c, all[256]; + int count; + + strcpy(all, aminos); + strcat(all, primenuc); + strcat(all, allsymbols); + puts("> nucleic/amino test"); + for (count=0; count<4; count++) { + for (c = all; *c!=0; c++) putc(*c, stdout); + putc('\n', stdout); + } +} + +link -w -t MPST -c 'MPS ' c.o � + "{Libraries}"Interface.o "{Libraries}"ToolLibs.o � + "{Libraries}"Runtime.o "{CLibraries}"StdClib.o +link.out > nucleic.std + +#-------------------------- + +#standards (ship w/ readseq) +#note: not all alphabet.orig chars are expected to be passed by +# readseq. Numbers are dropped. +readseq -p alphabet.orig > alphabet.std +readseq -p -C alphabet.std > upper.std + +cat alphabet.orig + > alphabet['!'..'~'] + !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ + +cat alphabet.std + >alphabet['!'..'~'], 83 bases, 9429 checksum. 
+ !"#$%&'()*+-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\] + ^_`abcdefghijklmnopqrstuvwxyz{|}~ + +cat upper.std + >alphabet['!'..'~'], 83 bases, 9429 checksum. + !"#$%&'()*+-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\] + ^_`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~ + +cat nucleic.std + > nucleic/amino test + ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;:'|`~"\ + ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;:'|`~"\ + ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;:'|`~"\ + ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;:'|`~"\ + +readseq -p nucleic.std + >nucleic/amino test, 228 bases, 5952 checksum. + ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+; + :'|`~"\ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@# + $%^&=+;:'|`~"\ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{} + []()!@#$%^&=+;:'|`~"\ABCDEFGHIKLMNPQRSTVWXYZACGTU_ + .-?<>{}[]()!@#$%^&=+;:'|`~"\ + + +#---------------------------------- + +#test for general read/write of all chars: +readseq -p alphabet.std -otest.alpha +diff test.alpha alphabet.std + +#test for valid toupper, general read/write: +readseq -p -C alphabet.std -otest.upper +diff test.upper upper.std +#for vms, use "-C" to preserve case +# readseq -p "-C" alphabet.std -otest.upper + +#test for multiple sequence file conversions +# leave out gcg, raw; +# test of long seq conversion ? +# test of mail-header seq conversion ? 
+ +#test for valid format conversions +readseq -v -p -f1 nucleic.std -otest.f1 +readseq -v -p -f2 test.f1 -otest.f2 +readseq -v -p -f3 test.f2 -otest.f3 +readseq -v -p -f4 test.f3 -otest.f4 +readseq -v -p -f5 test.f4 -otest.f5 +readseq -v -p -f6 test.f5 -otest.f6 +readseq -v -p -f7 test.f6 -otest.f7 +readseq -v -p -f8 test.f7 -otest.f8 +readseq -v -p -f1 test.f8 -otest.f1b +diff test.f1 test.f1b +compare test.f1 test.f1b + +readseq -v -p -f13 test.f8 -otest.f13 # raw, drops name +readseq -v -p -f9 test.f8 -otest.f9 # zuker, little used +#readseq -v -p -f10 test.f9 -otest.f10 # olsen, input only (output=raw) +readseq -v -p -f11 test.f8 -otest.f11 # phylip 3.2, output only +readseq -v -p -f12 test.f8 -otest.f12 # phylip 3.3, output only +readseq -v -p -f14 test.f8 -otest.f14 # phylip 3.4, output only + + +#clean up +rm test.� + + +#----------------------------- +# some general tests + +readseq -h + +readseq diff --git a/add.gdemenu b/add.gdemenu new file mode 100644 index 0000000..12818f6 --- /dev/null +++ b/add.gdemenu @@ -0,0 +1,123 @@ +# +# dgg added new readseq formats, 29 dec 92 +# + +item:Export Foreign Format +itemmethod:readseq in1 -pipe -all -form=$FORMAT > $OUTPUTFILE +itemhelp:readseq.help + +arg:FORMAT +argtype:choice_menu +argchoice:GenBank:genbank +argchoice:IG/Stanford:ig +argchoice:NBRF:nbrf +argchoice:EMBL:embl +argchoice:GCG:gcg +argchoice:DNA Strider:strider +argchoice:Fitch:fitch +argchoice:Pearson/Fasta:pearson +argchoice:Zuker:zuker +argchoice:Olsen:olsen +argchoice:Phylip:phylip +#argchoice:Phylip v3.2:phylip3.2 +argchoice:Plain text:raw +argchoice:ASN.1:asn +argchoice:PIR:pir +argchoice:MSF:msf +argchoice:PAUP:paup +argchoice:Pretty:pretty -nametop -nameleft=3 -numright -nameright -numtop + +arg:OUTPUTFILE +argtype:text +arglabel:Save as? 
+ +in:in1 +informat:genbank + + +# +#dgg addition for new readseq, 24 dec 92 +# + +item:Pretty Print +itemmethod:readseq in1 -p -a -f=pretty $NAMELEFT $NAMERIGHT $NUMTOP $NUMBOT $NUMLEFT $NUMRIGHT -col=$COLS -width=$WIDTH $MATCH $GAPC > in1.pretty; (textedit in1.pretty; /bin/rm -f in1 in1.pretty)& +itemhelp:readseq.help + +#nametop is bad !? + +in:in1 +informat:genbank + +arg:NAMETOP +argtype:chooser +arglabel:Names at top ? +argchoice:No: +argchoice:Yes:-nametop + +arg:NAMELEFT +argtype:chooser +arglabel:Names at left ? +argchoice:No: +argchoice:Yes:-nameleft + +arg:NAMERIGHT +argtype:chooser +arglabel:Names at right? +argchoice:Yes:-nameright +argchoice:No: + +arg:NUMTOP +argtype:chooser +arglabel:Numbers at top ? +argchoice:Yes:-numtop +argchoice:No: + +arg:NUMBOT +argtype:chooser +arglabel:Numbers at tail ? +argchoice:No: +argchoice:Yes:-numbot + +arg:NUMLEFT +argtype:chooser +arglabel:Numbers at left ? +argchoice:Yes:-numleft +argchoice:No: + +arg:NUMRIGHT +argtype:chooser +arglabel:Numbers at right? +argchoice:Yes:-numright +argchoice:No: + +arg:MATCH +argtype:chooser +arglabel:Use match '.' for 2..n species? +argchoice:No: +argchoice:Yes:-match + +arg:GAPC +argtype:chooser +arglabel:Count gap symbols? +argchoice:No: +argchoice:Yes:-gap + +arg:WIDTH +argtype:slider +arglabel:Sequence width? +argmin:10 +argmax:200 +argvalue:50 + +arg:COLS +argtype:slider +arglabel:Column spacers? 
+argmin:0 +argmax:50 +argvalue:10 + + +### pretty print insert end +# + + diff --git a/macinit.r b/macinit.r new file mode 100644 index 0000000..3dd9c4b --- /dev/null +++ b/macinit.r @@ -0,0 +1,412 @@ +/*------------------------------------------------------------------------------ +# +# +# MultiFinder-Aware Simple Input/Output Window resource +# +# for ReadSeq +# +------------------------------------------------------------------------------*/ + +#include "systypes.r" +#include "types.r" + + +resource 'MENU' (20000, preload) { + 20000, + textMenuProc, + 0x7FFFFFFD, + enabled, + apple, + { /* array: 2 elements */ + /* [1] */ + "About ReadSeq�", noIcon, noKey, noMark, plain, + /* [2] */ + "-", noIcon, noKey, noMark, plain + } +}; + +resource 'MENU' (20001, preload) { + 20001, + textMenuProc, + 0x0, + enabled, + "File", + { /* array: 11 elements */ + /* [1] */ + "New", noIcon, "N", noMark, plain, + /* [2] */ + "Open", noIcon, "O", noMark, plain, + /* [3] */ + "-", noIcon, noKey, noMark, plain, + /* [4] */ + "Close", noIcon, "W", noMark, plain, + /* [5] */ + "Save", noIcon, "S", noMark, plain, + /* [6] */ + "Save As�", noIcon, noKey, noMark, plain, + /* [7] */ + "-", noIcon, noKey, noMark, plain, + /* [8] */ + "Page Setup�", noIcon, noKey, noMark, plain, + /* [9] */ + "Print�", noIcon, noKey, noMark, plain, + /* [10] */ + "-", noIcon, noKey, noMark, plain, + /* [11] */ + "Quit", noIcon, "Q", noMark, plain + } +}; + +resource 'MENU' (20002, preload) { + 20002, + textMenuProc, + 0x0, + enabled, + "Edit", + { /* array: 6 elements */ + /* [1] */ + "Undo", noIcon, "Z", noMark, plain, + /* [2] */ + "-", noIcon, noKey, noMark, plain, + /* [3] */ + "Cut", noIcon, "X", noMark, plain, + /* [4] */ + "Copy", noIcon, "C", noMark, plain, + /* [5] */ + "Paste", noIcon, "V", noMark, plain, + /* [6] */ + "Clear", noIcon, noKey, noMark, plain + } +}; + +resource 'MENU' (20003, preload) { + 20003, + textMenuProc, + allEnabled, + enabled, + "Font", + { /* array: 0 elements */ + } +}; + 
+resource 'ALRT' (20000, purgeable) { + {98, 108, 314, 405}, + 20000, + { /* array: 4 elements */ + /* [1] */ + OK, visible, silent, + /* [2] */ + OK, visible, silent, + /* [3] */ + OK, visible, silent, + /* [4] */ + OK, visible, silent + } +}; + +resource 'ALRT' (20001, purgeable) { + {40, 20, 150, 260}, + 20001, + { /* array: 4 elements */ + /* [1] */ + OK, visible, silent, + /* [2] */ + OK, visible, silent, + /* [3] */ + OK, visible, silent, + /* [4] */ + OK, visible, silent + } +}; + +resource 'ALRT' (20002, preload) { + {72, 64, 212, 372}, + 20002, + { /* array: 4 elements */ + /* [1] */ + OK, visible, silent, + /* [2] */ + OK, visible, silent, + /* [3] */ + OK, visible, silent, + /* [4] */ + OK, visible, silent + } +}; + +resource 'DITL' (20000, purgeable) { + { /* array DITLarray: 8 elements */ + /* [1] */ + {191, 98, 211, 178}, + Button { + enabled, + "OK" + }, + /* [2] */ + {110, 24, 130, 256}, + StaticText { + disabled, + " Copyright � 1990 by d.g.gilbert\n" + }, + /* [3] */ + {6, 93, 24, 281}, + StaticText { + disabled, + "A tool for molecular biology." + }, + /* [4] */ + {31, 25, 86, 281}, + StaticText { + disabled, + "Reads and writes nucleic or protein sequ" + "ences in various formats. Data files may" + " have multiple sequences." + }, + /* [5] */ + {6, 17, 22, 92}, + StaticText { + disabled, + "ReadSeq" + }, + /* [6] */ + {150, 28, 186, 262}, + StaticText { + disabled, + "land mail: biology dept., indiana univer" + "sity, bloomington, in 47405\n" + }, + /* [7] */ + {129, 25, 153, 258}, + StaticText { + disabled, + " e-mail: gilbertd@bio.indiana.edu\n" + }, + /* [8] */ + {86, 12, 107, 281}, + StaticText { + disabled, + "This program may be freely distributed." + } + } +}; + +resource 'DITL' (20001, purgeable) { + { /* array DITLarray: 3 elements */ + /* [1] */ + {80, 150, 100, 230}, + Button { + enabled, + "OK" + }, + /* [2] */ + {10, 60, 60, 230}, + StaticText { + disabled, + "Error. ^0." 
+ }, + /* [3] */ + {8, 8, 40, 40}, + Icon { + disabled, + 2 + } + } +}; + +resource 'DITL' (20002, preload) { + { /* array DITLarray: 4 elements */ + /* [1] */ + {58, 25, 76, 99}, + Button { + enabled, + "Yes" + }, + /* [2] */ + {86, 25, 104, 99}, + Button { + enabled, + "No" + }, + /* [3] */ + {12, 20, 45, 277}, + StaticText { + disabled, + "Save changes before closing?" + }, + /* [4] */ + {86, 195, 104, 269}, + Button { + enabled, + "Cancel" + } + } +}; + +resource 'CNTL' (20000, purgeable, preload) { + {-1, 465, 272, 481}, + 0, + invisible, + 0, + 0, + scrollBarProc, + 0, + "" +}; + +resource 'CNTL' (20001, purgeable, preload) { + {271, -1, 287, 466}, + 0, + invisible, + 0, + 0, + scrollBarProc, + 0, + "" +}; + +data 'pzza' (128, purgeable) { + $"4D50 5320" /* MPS */ +}; + +resource 'MBAR' (20000, preload) { + { /* array MenuArray: 4 elements */ + /* [1] */ + 20000, + /* [2] */ + 20001, + /* [3] */ + 20002, + /* [4] */ + 20003 + } +}; + +resource 'WIND' (20000, purgeable, preload) { + {0, 0, 286, 480}, + zoomDocProc, + invisible, + noGoAway, + 0x0, + "untitled" +}; + +resource 'STR#' (20000, purgeable) { + { /* array StringArray: 11 elements */ + /* [1] */ + "You must run on 512Ke or later", + /* [2] */ + "Application Memory Size is too small", + /* [3] */ + "Not enough memory to run SIOW", + /* [4] */ + "Not enough memory to do Cut", + /* [5] */ + "Cannot do Cut", + /* [6] */ + "Cannot do Copy", + /* [7] */ + "Cannot exceed 32,000 characters with Pas" + "te", + /* [8] */ + "Not enough memory to do Paste", + /* [9] */ + "Cannot create window", + /* [10] */ + "Cannot exceed 32,000 characters", + /* [11] */ + "Cannot do PasteFont not found" + } +}; + +resource 'SIZE' (-1) { + reserved, + acceptSuspendResumeEvents, + reserved, + canBackground, + multiFinderAware, + backgroundAndForeground, + dontGetFrontClicks, + ignoreChildDiedEvents, + not32BitCompatible, + notHighLevelEventAware, + onlyLocalHLEvents, + notStationeryAware, + dontUseTextEditServices, + reserved, + 
reserved, + reserved, + 124928, + 38912 +}; + +resource 'SIZE' (0) { + reserved, + acceptSuspendResumeEvents, + reserved, + canBackground, + multiFinderAware, + backgroundAndForeground, + dontGetFrontClicks, + ignoreChildDiedEvents, + not32BitCompatible, + notHighLevelEventAware, + onlyLocalHLEvents, + notStationeryAware, + dontUseTextEditServices, + reserved, + reserved, + reserved, + 256000, + 38912 +}; + +data 'siow' (0) { + $"0F52 6561 6453 6571 2069 6E20 5349 4F57" /* .ReadSeq in SIOW */ +}; + +resource 'BNDL' (128) { + 'siow', + 0, + { /* array TypeArray: 2 elements */ + /* [1] */ + 'ICN#', + { /* array IDArray: 1 elements */ + /* [1] */ + 0, 128 + }, + /* [2] */ + 'FREF', + { /* array IDArray: 1 elements */ + /* [1] */ + 0, 128 + } + } +}; + +resource 'FREF' (128) { + 'APPL', + 0, + "" +}; + +resource 'ICN#' (128) { + { /* array: 2 elements */ + /* [1] */ + $"0000 0000 0000 0000 0010 4100 0010 2200" + $"0020 2200 0020 2100 0020 4100 0010 4200" + $"0010 4200 0010 2200 0020 2100 0020 0100" + $"00FF FF00 03FF FFE0 0791 03F0 0ED1 0E7C" + $"1C31 321C 380D C10E 3FFF FFFE 3003 C106" + $"380D 300E 1E31 0E3C 1FC1 01F8 07FF FFE0" + $"00FF FE", + /* [2] */ + $"0000 0000 0000 0000 0010 4100 0010 2200" + $"0020 2200 0020 2100 0020 4100 0010 4200" + $"0010 4200 0010 2200 0020 2100 0020 0100" + $"00FF FF00 03FF FFE0 07FF FFF0 0FFF FFFC" + $"1FFF FFFC 3FFF FFFE 3FFF FFFE 3FFF FFFE" + $"3FFF FFFE 1FFF FFFC 1FFF FFF8 07FF FFE0" + $"00FF FE" + } +}; + diff --git a/readseqSIOW.make b/readseqSIOW.make new file mode 100644 index 0000000..480a146 --- /dev/null +++ b/readseqSIOW.make @@ -0,0 +1,42 @@ +# Macintosh MPW-C Makefile +# using Simple Input/Output Window library +# +# File: ReadseqSIOW.make +# Target: ReadseqSIOW +# Sources: readseq.c ureadseq.c ureadasn.c macinit.c +# Created: Wednesday, November 13, 1991 8:23:00 PM + + +#OBJECTS = macinit.c.o readseq.c.o ureadseq.c.o +#COptions = -D SIOW # -r + +#if NCBI is available, set path here to NCBI toolkit: +NCBI = 
"{Boot}@molbio:ncbi:" +OBJECTS = macinit.c.o readseq.c.o ureadseq.c.o ureadasn.c.o +COptions = -D SIOW -d NCBI -i "{NCBI}"include: +NCBILIBS = "{NCBI}"lib:libncbi.o "{NCBI}"lib:libncbiobj.o "{NCBI}"lib:libvibrant.o +#endif NCBI + +ReadseqSIOW �� ReadseqSIOW.make {OBJECTS} + Link -d -c '????' -t APPL � + {OBJECTS} � + "{CLibraries}"StdClib.o � + "{MPW}"Libraries:Libraries:SIOW.o � + "{Libraries}"Runtime.o � + "{Libraries}"Interface.o � +#if NCBI + {NCBILIBS} � + "{CLibraries}"CSANELib.o � + "{CLibraries}"Math.o � +#endif NCBI + -o ReadseqSIOW + +readseq.c.o � ReadseqSIOW.make readseq.c +ureadseq.c.o � ReadseqSIOW.make ureadseq.c +macinit.c.o � ReadseqSIOW.make macinit.c +#if NCBI +ureadasn.c.o � ReadseqSIOW.make ureadasn.c +#endif NCBI + +ReadseqSIOW �� macinit.r + Rez -a macinit.r -o ReadseqSIOW