init: extra
This commit is contained in:
parent
a445d20c65
commit
650d39cff9
9 changed files with 2252 additions and 0 deletions
980
Formats
Normal file
980
Formats
Normal file
|
@ -0,0 +1,980 @@
|
|||
||||||||||| ReadSeq supported formats (revised 30Dec92)
|
||||
--------------------------------------------------------
|
||||
|
||||
-f[ormat=]Name Format name for output:
|
||||
1. IG/Stanford 10. Olsen (in-only)
|
||||
2. GenBank/GB 11. Phylip3.2
|
||||
3. NBRF 12. Phylip
|
||||
4. EMBL 13. Plain/Raw
|
||||
5. GCG 14. PIR/CODATA
|
||||
6. DNAStrider 15. MSF
|
||||
7. Fitch 16. ASN.1
|
||||
8. Pearson/Fasta 17. PAUP
|
||||
9. Zuker (in-only) 18. Pretty (out-only)
|
||||
|
||||
In general, output supports only minimal subsets of each format
|
||||
needed for sequence data exchanges. Features, descriptions
|
||||
and other format-unique information are discarded.
|
||||
|
||||
Users of Olsen multi sequence editor (VMS). The Olsen format
|
||||
here is produced with the print command:
|
||||
print/out=some.file
|
||||
Use Genbank output from readseq to produce a format that this
|
||||
editor can read, and use the command
|
||||
load/genbank some.file
|
||||
Dan Davison has a VMS program that will convert to/from the
|
||||
Olsen native binary data format. E-mail davison@uh.edu
|
||||
|
||||
Warning: Phylip format input is now supported (30Dec92), however the
|
||||
auto-detection of Phylip format is very probabilistic and messy,
|
||||
especially distinguishing sequential from interleaved versions. It
|
||||
is not recommended that one use readseq to convert files from Phylip
|
||||
format to others unless essential.
|
||||
|
||||
|
||||
|
||||
||||||||||| ReadSeq usage (revised 11Nov91)
|
||||
--------------------------------------------------------
|
||||
|
||||
A. determine file format:
|
||||
|
||||
short skiplines; /* result: number of header lines to skip (or 0) */
|
||||
short error; /* error result or 0 */
|
||||
short format; /* resulting format code, see ureadseq.h */
|
||||
char *filename = "Mysequence.file";
|
||||
|
||||
format = seqFileFormat( filename, &skiplines, &error);
|
||||
if (error!=0) fail;
|
||||
|
||||
B. read number and list of sequences (optional)
|
||||
short numseqs; /* resulting number of sequences found in file */
|
||||
char *seqlist; /* list of sequence names, newline separated, 0 terminated */
|
||||
|
||||
seqlist = listSeqs( filename, skiplines, format, &numseqs, &error);
|
||||
if (error==0) display (seqlist);
|
||||
free( seqlist);
|
||||
|
||||
C. read individual sequences as desired
|
||||
short seqIndex; /* sequence index #, or == kListSeqs for listSeqs equivalent */
|
||||
long seqlen; /* length of seq */
|
||||
char seqid[256]; /* sequence name */
|
||||
char *seq; /* sequence, 0 terminated, free when done */
|
||||
|
||||
seq = readSeq( seqIndex, filename, skiplines, format,
|
||||
&seqlen, &numseqs, &error, seqid);
|
||||
if (error==0) manipulate(seq);
|
||||
free(seq);
|
||||
|
||||
D. write sequences as desired
|
||||
int nlines; /* number of lines of sequence written */
|
||||
FILE* fout; /* open file pointer (stdout or other) */
|
||||
short outform; /* output format, see ureadseq.h */
|
||||
|
||||
nlines = writeSeq( fout, seq, seqlen, format, outform, seqid);
|
||||
|
||||
|
||||
Note (30Dec92): There is various processing done by the main program (in readseq.c),
|
||||
rather than just in the subroutines (in ureadseq.c). Especially for interleaved
|
||||
output formats, the writeSeq subroutine does not handle interleaving, nor some of
|
||||
the formatting at the top and end of output files. While seqFileFormat, listSeqs,
|
||||
and readSeq subroutines are fairly self-contained, the writeSeq depends a lot on
|
||||
auxiliary processing. At some point, this may be revised so writeSeq is self-
|
||||
contained.
|
||||
|
||||
Note 2: The NCBI toolkit (ftp from ncbi.nlm.nih.gov) is needed for the ASN.1 format
|
||||
reading (see ureadasn.c). A bastard (but workable I hope) ASN.1 format is written
|
||||
by writeSeq alone.
|
||||
|
||||
|
||||
|
||||
||||||||||| sequence formats....
|
||||
---------------------------------------------------
|
||||
|
||||
stanford/IG
|
||||
;comments
|
||||
;...
|
||||
seq1 info
|
||||
abcd...
|
||||
efgh1 (or 2 = terminator)
|
||||
;another seq
|
||||
;....
|
||||
seq2 info
|
||||
abcd...1
|
||||
--- for e.g. ----
|
||||
; Dro5s-T.Seq Length: 120 April 6, 1989 21:22 Check: 9487 ..
|
||||
dro5stseq
|
||||
GCCAACGACCAUACCACGCUGAAUACAUCGGUUCUCGUCCGAUCACCGAAAUUAAGCAGCGUCGCGGGCG
|
||||
GUUAGUACUUAGAUGGGGGACCGCUUGGGAACACCGCGUGUUGUUGGCCU1
|
||||
|
||||
; TOIG of: Dro5srna.Seq check: 9487 from: 1 to: 120
|
||||
---------------------------------------------------
|
||||
|
||||
Genbank:
|
||||
LOCUS seq1 ID..
|
||||
...
|
||||
ORIGIN ...
|
||||
123456789abcdefg....(1st 9 columns are formatting)
|
||||
hijkl...
|
||||
// (end of sequence)
|
||||
LOCUS seq2 ID ..
|
||||
...
|
||||
ORIGIN
|
||||
abcd...
|
||||
//
|
||||
---------------------------------------------------
|
||||
|
||||
NBRF format: (from uwgcg ToNBRF)
|
||||
>DL;DRO5SRNA
|
||||
Iubio$Dua0:[Gilbertd.Gcg]Dro5srna.Seq;2 => DRO5SRNA
|
||||
|
||||
51 AAUUAAGCAG CGUCGCGGGC GGUUAGUACU UAGAUGGGGG ACCGCUUGGG
|
||||
101 AACACCGCGU GUUGUUGGCC U
|
||||
|
||||
---------------------------------------------------
|
||||
|
||||
EMBL format
|
||||
ID345 seq1 id (the 345 are spaces)
|
||||
... other info
|
||||
SQ345Sequence (the 3,4,5 are spaces)
|
||||
abcd...
|
||||
hijk...
|
||||
// (! this is proper end string: 12Oct90)
|
||||
ID seq2 id
|
||||
...
|
||||
SQ Sequence
|
||||
abcd...
|
||||
...
|
||||
//
|
||||
---------------------------------------------------
|
||||
|
||||
UW GCG Format:
|
||||
comments of any form, up to ".." signal
|
||||
signal line has seq id, and " Check: #### .."
|
||||
only 1 seq/file
|
||||
|
||||
-- e.g. --- (GCG from GenBank)
|
||||
LOCUS DROEST6 1819 bp ss-mRNA INV 31-AUG-1987
|
||||
... much more ...
|
||||
ORIGIN 1 bp upstream of EcoRI site; chromosome BK9 region 69A1.
|
||||
|
||||
INVERTEBRATE:DROEST6 Length: 1819 January 9, 1989 16:48 Check: 8008 ..
|
||||
|
||||
1 GAATTCGCCG GAGTGAGGAG CAACATGAAC TACGTGGGAC TGGGACTTAT
|
||||
|
||||
51 CATTGTGCTG AGCTGCCTTT GGCTCGGTTC GAACGCGAGT GATACAGATG
|
||||
|
||||
|
||||
---------------------------------------------------
|
||||
|
||||
DNAStrider (Mac) = modified Stanford:
|
||||
; ### from DNA Strider Friday, April 7, 1989 11:04:24 PM
|
||||
; DNA sequence pBR322 4363 b.p. complete sequence
|
||||
;
|
||||
abcd...
|
||||
efgh
|
||||
// (end of sequence)
|
||||
---------------------------------------------------
|
||||
|
||||
Fitch format:
|
||||
Dro5srna.Seq
|
||||
GCC AAC GAC CAU ACC ACG CUG AAU ACA UCG GUU CUC GUC CGA UCA CCG AAA UUA AGC AGC
|
||||
GUC GCG GGC GGU UAG UAC UUA GAU GGG GGA CCG CUU GGG AAC ACC GCG UGU UGU UGG CCU
|
||||
Droest6.Seq
|
||||
GAA TTC GCC GGA GTG AGG AGC AAC ATG AAC TAC GTG GGA CTG GGA CTT ATC ATT GTG CTG
|
||||
AGC TGC CTT TGG CTC GGT TCG AAC GCG AGT GAT ACA GAT GAC CCT CTG TTG GTG CAG CTG
|
||||
---------------------------------------------------
|
||||
|
||||
W.Pearson/Fasta format:
|
||||
>BOVPRL GenBank entry BOVPRL from omam file. 907 nucleotides.
|
||||
TGCTTGGCTGAGGAGCCATAGGACGAGAGCTTCCTGGTGAAGTGTGTTTCTTGAAATCAT
|
||||
|
||||
---------------------------------------------------
|
||||
Phylip version 3.2 format (e.g., DNAML):
|
||||
|
||||
5 13 YF (# seqs, #bases, YF)
|
||||
Alpha AACGTGGCCAAAT
|
||||
aaaagggccc... (continued sp. alpha)
|
||||
Beta AAGGTCGCCAAAC
|
||||
aaaagggccc... (continued sp. beta)
|
||||
Gamma CATTTCGTCACAA
|
||||
aaaagggccc... (continued sp. Gamma)
|
||||
1234567890^-- bases must start in col 11, and run 'til #bases
|
||||
(spaces & newlines are okay)
|
||||
---------------------------------------------------
|
||||
Phylip version 3.3 format (e.g., DNAML):
|
||||
|
||||
5 42 YF (# seqs, #bases, YF)
|
||||
Turkey AAGCTNGGGC ATTTCAGGGT
|
||||
Salmo gairAAGCCTTGGC AGTGCAGGGT
|
||||
H. SapiensACCGGTTGGC CGTTCAGGGT
|
||||
Chimp AAACCCTTGC CGTTACGCTT
|
||||
Gorilla AAACCCTTGC CGGTACGCTT
|
||||
1234567890^-- bases must start in col 11
|
||||
!! this version interleaves the species -- contrary to
|
||||
all other output formats.
|
||||
|
||||
GAGCCCGGGC AATACAGGGT AT
|
||||
GAGCCGTGGC CGGGCACGGT AT
|
||||
ACAGGTTGGC CGTTCAGGGT AA
|
||||
AAACCGAGGC CGGGACACTC AT
|
||||
AAACCATTGC CGGTACGCTT AA
|
||||
|
||||
---------------------------------------------------
|
||||
Phylip version 3.4 format (e.g., DNAML)
|
||||
-- Both Interleaved and sequential are permitted
|
||||
|
||||
5 13 (# seqs, #bases)
|
||||
Alpha AACGTGGCCAAAT
|
||||
aaaagggccc... (continued sp. alpha)
|
||||
Beta AAGGTCGCCAAAC
|
||||
aaaagggccc... (continued sp. beta)
|
||||
Gamma CATTTCGTCACAA
|
||||
aaaagggccc... (continued sp. Gamma)
|
||||
1234567890^-- bases must start in col 11, and run 'til #bases
|
||||
(spaces, newlines and numbers are ignored)
|
||||
|
||||
---------------------------------------------------
|
||||
Gary Olsen (multiple) sequence editor /print format:
|
||||
|
||||
!---------------------
|
||||
!17Oct91 -- error in original copy of olsen /print format, shifted right 1 space
|
||||
! here is correct copy:
|
||||
301 40 Tb.thiop CGCAGCGAAA----------GCUNUGCUAAUACCGCAUA-CGnCCUG----------------------------------------------------- Tb.thiop
|
||||
123456789012345678901
|
||||
301 42 Rhc.purp CGUAGCGAAA----------GUUACGCUAAUACCGCAUA-UUCUGUG----------------------------------------------------- Rhc.purp
|
||||
|
||||
301 44 Rhc.gela nnngnCGAAA----------GCCGGAUUAAUACCGCAUA-CGACCUA----------------------------------------------------- Rhc.gela
|
||||
!---------------------
|
||||
|
||||
RNase P RNA components. on 20-FEB-90 17:23:58
|
||||
|
||||
1 (E.c. pr ): Base pairing in Escherichia coli RNase P RNA.
|
||||
2 (chrom ): Chromatium
|
||||
:
|
||||
12 (B.brevis): Bacillus brevis RNase P RNA, B. James.
|
||||
13 ( 90% con): 90% conserved
|
||||
14 (100% con): 100% conserved
|
||||
15 (gram+ pr): pairing
|
||||
|
||||
1
|
||||
RNase P RNA components. on 20-FEB-90 17:23:58
|
||||
|
||||
Posi- Sequence
|
||||
tion: identity: Data:
|
||||
|
||||
1 1 E.c. pr <<<<<<<<<< {{{{{{{{<<:<<<<<<<<<<^<<<<<<====>>>> E.c. pr
|
||||
1 2 chrom GGAGUCGGCCAGACAGUCGCUUCCGUCCU------------------ chrom
|
||||
:
|
||||
1 12 B.brevis AUGCAGGAAAUGCGGGUAGCCGCUGCCGCAAUCGUCU------------- B.brevis
|
||||
1234567890123456789012 <! this should be 21 not 22,
|
||||
! this example must be inset on left by 1 space from olsen /print files !
|
||||
1 13 90% con G C G A CGC GC - - 90% con
|
||||
1 14 100% con G A CGC 100% con
|
||||
1 15 gram+ pr <<<<<<<<<< {{{{{{{{<<<<<<<<<<<<<=============== gram+ pr
|
||||
|
||||
60 1 E.c. pr >>>>>>^>>^>>>>:>> <<<^<<<< {{{{{ E.c. pr
|
||||
60 2 chrom -----GGUG-ACGGGGGAGGAAAGUCCGG-GCUCCAU------------- chrom
|
||||
: :
|
||||
60 10 B.stearo ----UU-CG-GCCGUAGAGGAAAGUCCAUGCUCGCACGGUGCUGAGAUGC B.stearo
|
||||
|
||||
|
||||
---------------------------------------------------
|
||||
GCG MSF format
|
||||
Title line
|
||||
|
||||
picorna.msf MSF: 100 Type: P January 17, 1991 17:53 Check: 541
|
||||
..
|
||||
Name: Cb3 Len: 100 Check: 7009 Weight: 1.00
|
||||
Name: E Len: 100 Check: 60 Weight: 1.00
|
||||
|
||||
//
|
||||
|
||||
1 50
|
||||
Cb3 ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet
|
||||
E gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs
|
||||
|
||||
51 100
|
||||
|
||||
Cb3 ghtsqvvpgd tmqtrhvkny hsrsestien flcrsacvyf teykn.....
|
||||
E ...spi.gaf tvks...... gs.lesgfap .fsngtc.pn sviltpgpqf
|
||||
|
||||
---------------------------------------------------
|
||||
PIR format
|
||||
This is NBRF-PIR MAILSERVER version 1.45
|
||||
Command-> get PIR3:A31391
|
||||
\\\
|
||||
ENTRY A31391 #Type Protein
|
||||
TITLE *Esterase-6 - Fruit fly (Drosophila melanogaster)
|
||||
|
||||
DATE 03-Aug-1992 #Sequence 03-Aug-1992 #Text 03-Aug-1992
|
||||
PLACEMENT 0.0 0.0 0.0 0.0 0.0
|
||||
COMMENT *This entry is not verified.
|
||||
SOURCE Drosophila melanogaster
|
||||
|
||||
REFERENCE
|
||||
#Authors Cooke P.H., Oakeshott J.G.
|
||||
#Citation submitted to GenBank, April 1989
|
||||
#Reference-number A31391
|
||||
#Accession A31391
|
||||
#Cross-reference GB:J04167
|
||||
|
||||
SUMMARY #Molecular-weight 61125 #Length 544 #Checksum 1679
|
||||
SEQUENCE
|
||||
5 10 15 20 25 30
|
||||
1 M N Y V G L G L I I V L S C L W L G S N A S D T D D P L L V
|
||||
31 Q L P Q G K L R G R D N G S Y Y S Y E S I P Y A E P P T G D
|
||||
61 L R F E A P E P Y K Q K W S D I F D A T K T P V A C L Q W D
|
||||
91 Q F T P G A N K L V G E E D C L T V S V Y K P K N S K R N S
|
||||
121 F P V V A H I H G G A F M F G A A W Q N G H E N V M R E G K
|
||||
151 F I L V K I S Y R L G P L G F V S T G D R D L P G N Y G L K
|
||||
181 D Q R L A L K W I K Q N I A S F G G E P Q N V L L V G H S A
|
||||
211 G G A S V H L Q M L R E D F G Q L A R A A F S F S G N A L D
|
||||
241 P W V I Q K G A R G R A F E L G R N V G C E S A E D S T S L
|
||||
271 K K C L K S K P A S E L V T A V R K F L I F S Y V P F A P F
|
||||
301 S P V L E P S D A P D A I I T Q D P R D V I K S G K F G Q V
|
||||
331 P W A V S Y V T E D G G Y N A A L L L K E R K S G I V I D D
|
||||
361 L N E R W L E L A P Y L L F Y R D T K T K K D M D D Y S R K
|
||||
391 I K Q E Y I G N Q R F D I E S Y S E L Q R L F T D I L F K N
|
||||
421 S T Q E S L D L H R K Y G K S P A Y A Y V Y D N P A E K G I
|
||||
451 A Q V L A N R T D Y D F G T V H G D D Y F L I F E N F V R D
|
||||
481 V E M R P D E Q I I S R N F I N M L A D F A S S D N G S L K
|
||||
511 Y G E C D F K D N V G S E K F Q L L A I Y I D G C Q N R Q H
|
||||
541 V E F P
|
||||
///
|
||||
\\\
|
||||
---------------------------------------------------
|
||||
PAUP format:
|
||||
The NEXUS Format
|
||||
|
||||
Every block starts with "BEGIN blockname;" and ends with "END;".
|
||||
Each block is composed of one or more statements, each
|
||||
terminated by a semicolon (;).
|
||||
|
||||
Comments may be included in NEXUS files by enclosing them within
|
||||
square brackets, as in "[This is a comment]."
|
||||
|
||||
NEXUS-conforming files are identified by a "#NEXUS" directive at
|
||||
the very beginning of the file (line 1, column 1). If the
|
||||
#NEXUS is omitted PAUP issues a warning but continues
|
||||
processing.
|
||||
|
||||
NEXUS files are entirely free-format. Blanks, tabs, and
|
||||
newlines may be placed anywhere in the file. Unless RESPECTCASE
|
||||
is requested, commands and data may be entered in upper case,
|
||||
lower case, or a mixture of upper and lower case.
|
||||
|
||||
The following conventions are used in the syntax descriptions of
|
||||
the various blocks. Upper-case items are entered exactly as
|
||||
shown. Lower-case items inside of angle brackets -- e.g., <x>
|
||||
-- represent items to be substituted by the user. Items inside
|
||||
of square brackets -- e.g., [X] -- are optional. Items inside
|
||||
of curly braces and separated by vertical bars -- e.g., { X | Y
|
||||
| Z } -- are mutually exclusive options.
|
||||
|
||||
|
||||
The DATA Block
|
||||
|
||||
The DATA block contains the data matrix and other associated
|
||||
information. Its syntax is:
|
||||
|
||||
BEGIN DATA;
|
||||
DIMENSIONS NTAX=<number of taxa> NCHAR=<number of characters>;
|
||||
[ FORMAT [ MISSING=<missing-symbol> ]
|
||||
[ LABELPOS={ LEFT | RIGHT } ]
|
||||
[ SYMBOLS="<symbols-list>" ]
|
||||
[ INTERLEAVE ]
|
||||
[ MATCHCHAR=<match-symbol> ]
|
||||
[ EQUATE="<symbol>=<expansion> [<symbol>=<expansion>...]" ]
|
||||
[ TRANSPOSE ]
|
||||
[ RESPECTCASE ]
|
||||
[ DATATYPE = { STANDARD | DNA | RNA | PROTEIN } ]; ]
|
||||
[ OPTIONS [ IGNORE={ INVAR | UNINFORM } ]
|
||||
[ MSTAXA = { UNCERTAIN | POLYMORPH | VARIABLE } ]
|
||||
[ ZAP = "<list of zapped characters>" ] ; ]
|
||||
[ CHARLABELS <label_1> <label_2> ... <label_NCHAR> ; ]
|
||||
[ TAXLABELS <label1_1> <label1_2> <label1_NTAX> ; ]
|
||||
[ STATELABELS <currently ignored by PAUP> ; ]
|
||||
MATRIX <data-matrix> ;
|
||||
END;
|
||||
|
||||
--- example PAUP file
|
||||
|
||||
#NEXUS
|
||||
|
||||
[!Brown et al. (1982) primate mitochondrial DNA]
|
||||
|
||||
begin data;
|
||||
dimensions ntax=5 nchar=896;
|
||||
format datatype=dna matchchar=. interleave missing='-';
|
||||
matrix
|
||||
[ 2 4 6 8 ]
|
||||
[ 1 1 1 1 1 ]
|
||||
human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcctcattactatt ctgcctagcaaactcaaact acgaacgcactcacagtcgc
|
||||
chimp ................a.t. .c.................a ...............t.... ..................t. .t........c.........
|
||||
gorilla ..................tg ....t.....t........a ........a......t.... .................... .......a..c.....c...
|
||||
orang ................ac.. cc.....g..t.....t..a ..c........cc....g.. .................... .......a..c.....c...
|
||||
gibbon ......t..a..t...ac.g .c.................a ..a..c..t..cc.g..... ......t............. .......a........c...
|
||||
|
||||
[ 8 8 8 8 8 8 ]
|
||||
[ 0 2 4 6 8 9 ]
|
||||
[ 1 1 1 1 1 6 ]
|
||||
human cttccccacaacaatattca tgtgcctagaccaagaagtt attatctcgaactgacactg agccacaacccaaacaaccc agctctccctaagctt
|
||||
chimp t................... .a................c. ........a.....g..... ...a................ ................
|
||||
gorilla ..................tc .a................c. ........a.g......... ...a.............tt. .a..............
|
||||
orang ta....a...........t. .c.......ga......acc ..cg..a.a......tg... .a.a..c.....g...cta. .a.....a........
|
||||
gibbon a..t.......t........ ....ac...........acc .....t..a........... .a.tg..........gctag .a..............
|
||||
;
|
||||
end;
|
||||
---------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
||||||||||| Sample SMTP mail header
|
||||
---------------------------------------------------
|
||||
|
||||
- - - - - - - - -
|
||||
From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:56 1991
|
||||
Received: from genbank.bio.net by sunflower.bio.indiana.edu
|
||||
(4.1/9.5jsm) id AA19328; Sun, 10 Nov 91 17:28:55 EST
|
||||
Received: by genbank.bio.net (5.65/IG-2.0)
|
||||
id AA14458; Sun, 10 Nov 91 14:30:03 -0800
|
||||
Date: Sun, 10 Nov 91 14:30:03 -0800
|
||||
Message-Id: <9111102230.AA14458@genbank.bio.net>
|
||||
From: Database Server <GenBank-Retrieval-System@genbank.bio.net>
|
||||
To: gilbertd@sunflower.bio.indiana.edu
|
||||
Subject: Results of Query for drorna
|
||||
Status: R
|
||||
|
||||
No matches on drorna.
|
||||
- - - - - -
|
||||
From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:49 1991
|
||||
Received: from genbank.bio.net by sunflower.bio.indiana.edu
|
||||
(4.1/9.5jsm) id AA19323; Sun, 10 Nov 91 17:28:47 EST
|
||||
Received: by genbank.bio.net (5.65/IG-2.0)
|
||||
id AA14461; Sun, 10 Nov 91 14:30:03 -0800
|
||||
Date: Sun, 10 Nov 91 14:30:03 -0800
|
||||
Message-Id: <9111102230.AA14461@genbank.bio.net>
|
||||
From: Database Server <GenBank-Retrieval-System@genbank.bio.net>
|
||||
To: gilbertd@sunflower.bio.indiana.edu
|
||||
Subject: Results of Query for droest6
|
||||
Status: R
|
||||
|
||||
LOCUS DROEST6 1819 bp ss-mRNA INV 31-AUG-1987
|
||||
DEFINITION D.melanogaster esterase-6 mRNA, complete cds.
|
||||
ACCESSION M15961
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
||||||||||| GCG manual discussion of sequence symbols:
|
||||
---------------------------------------------------
|
||||
|
||||
III_SEQUENCE_SYMBOLS
|
||||
|
||||
|
||||
GCG programs allow all upper and lower case letters, periods (.),
|
||||
asterisks (*), pluses (+), ampersands (&), and ats (@) as symbols in
|
||||
biological sequences. Nucleotide symbols, their complements, and the
|
||||
standard one-letter amino acid symbols are shown below in separate lists.
|
||||
The meanings of the symbols +, &, and @ have not been assigned at this
|
||||
writing (March, 1989).
|
||||
|
||||
GCG uses the letter codes for amino acid codes and nucleotide
|
||||
ambiguity proposed by IUB (Nomenclature Committee, 1985,
|
||||
Eur. J. Biochem. 150; 1-5). These codes are compatible with the codes
|
||||
used by the EMBL, GenBank, and NBRF data libraries.
|
||||
|
||||
|
||||
NUCLEOTIDES
|
||||
|
||||
The meaning of each symbol, its complement, and the Cambridge and
|
||||
Stanford equivalents are shown below. Cambridge files can be converted
|
||||
into GCG files and vice versa with the programs FROMSTADEN and TOSTADEN.
|
||||
IntelliGenetics sequence files can be interconverted with the programs
|
||||
FROMIG and TOIG.
|
||||
|
||||
IUB/GCG Meaning Complement Staden/Sanger Stanford
|
||||
|
||||
A A T A A
|
||||
C C G C C
|
||||
G G C G G
|
||||
T/U T A T T/U
|
||||
M A or C K 5 J
|
||||
R A or G Y R R
|
||||
W A or T W 7 L
|
||||
S C or G S 8 M
|
||||
Y C or T R Y Y
|
||||
K G or T M 6 K
|
||||
V A or C or G B not supported N
|
||||
H A or C or T D not supported N
|
||||
D A or G or T H not supported N
|
||||
B C or G or T V not supported N
|
||||
X/N G or A or T or C X -/X N
|
||||
. not G or A or T or C . not supported ?
|
||||
|
||||
|
||||
The frame ambiguity codes used by Staden are not supported by GCG
|
||||
and are translated by FROMSTADEN as the lower case single base
|
||||
equivalent.
|
||||
|
||||
Staden Code Meaning GCG
|
||||
|
||||
D C or CC c
|
||||
V T or TT t
|
||||
B A or AA a
|
||||
H G or GG g
|
||||
K C or CX c
|
||||
L T or TX t
|
||||
M A or AX a
|
||||
N G or GX g
|
||||
|
||||
|
||||
AMINO ACIDS
|
||||
|
||||
Here is a list of the standard one-letter amino acid codes and their
|
||||
three-letter equivalents. The synonymous codons and their depiction in
|
||||
the IUB codes are shown. You should recognize that the codons following
|
||||
semicolons (;) are not sufficiently specific to define a single amino
|
||||
acid even though they represent the best possible back translation into
|
||||
the IUB codes! All of the relationships in this list can be redefined by
|
||||
the user in a local data file described below.
|
||||
|
||||
IUB
|
||||
Symbol 3-letter Meaning Codons Depiction
|
||||
A Ala Alanine GCT,GCC,GCA,GCG !GCX
|
||||
B Asp,Asn Aspartic,
|
||||
Asparagine GAT,GAC,AAT,AAC !RAY
|
||||
C Cys Cysteine TGT,TGC !TGY
|
||||
D Asp Aspartic GAT,GAC !GAY
|
||||
E Glu Glutamic GAA,GAG !GAR
|
||||
F Phe Phenylalanine TTT,TTC !TTY
|
||||
G Gly Glycine GGT,GGC,GGA,GGG !GGX
|
||||
H His Histidine CAT,CAC !CAY
|
||||
I Ile Isoleucine ATT,ATC,ATA !ATH
|
||||
K Lys Lysine AAA,AAG !AAR
|
||||
L Leu Leucine TTG,TTA,CTT,CTC,CTA,CTG
|
||||
!TTR,CTX,YTR;YTX
|
||||
M Met Methionine ATG !ATG
|
||||
N Asn Asparagine AAT,AAC !AAY
|
||||
P Pro Proline CCT,CCC,CCA,CCG !CCX
|
||||
Q Gln Glutamine CAA,CAG !CAR
|
||||
R Arg Arginine CGT,CGC,CGA,CGG,AGA,AGG
|
||||
!CGX,AGR,MGR;MGX
|
||||
S Ser Serine TCT,TCC,TCA,TCG,AGT,AGC !TCX,AGY;WSX
|
||||
T Thr Threonine ACT,ACC,ACA,ACG !ACX
|
||||
V Val Valine GTT,GTC,GTA,GTG !GTX
|
||||
W Trp Tryptophan TGG !TGG
|
||||
X Xxx Unknown !XXX
|
||||
Y Tyr Tyrosine TAT, TAC !TAY
|
||||
Z Glu,Gln Glutamic,
|
||||
Glutamine GAA,GAG,CAA,CAG !SAR
|
||||
* End Terminator TAA, TAG, TGA !TAR,TRA;TRR
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
||||||||||| docs from PSC on sequence formats:
|
||||
---------------------------------------------------
|
||||
|
||||
|
||||
Nucleic Acid and Protein Sequence File Formats
|
||||
|
||||
|
||||
It will probably save you some time if you have your data in a usable
|
||||
format before you send it to us. However, we do have the University of
|
||||
Wisconsin Genetics Computing Group programs running on our VAXen and
|
||||
this package includes several reformatting utilities. Our programs
|
||||
usually recognize any of several standard formats, including GenBank,
|
||||
EMBL, NBRF, and MolGen/Stanford. For the purposes of annotating an
|
||||
analysis we find the GenBank and EMBL formats most useful, particularly
|
||||
if you have already received an accession number from one of these
|
||||
organizations for your sequence.
|
||||
|
||||
Our programs do not require that all of the line types available in
|
||||
GenBank, EMBL, or NBRF file formats be present for the file format to
|
||||
be recognized and processed. The following pages outline the essential
|
||||
details required for correct processing of files by our programs.
|
||||
Additional information may be present but will generally be ignored.
|
||||
|
||||
|
||||
GenBank File Format
|
||||
|
||||
File Header
|
||||
|
||||
1. The first line in the file must have "GENETIC SEQUENCE DATA BANK"
|
||||
in spaces 20 through 46 (see LINE 1, below).
|
||||
2. The next 8 lines may contain arbitrary text. They are ignored but
|
||||
are required to maintain the GenBank format (see LINE 2 - LINE 9).
|
||||
|
||||
Sequence Data Entries
|
||||
|
||||
3. Each sequence entry in the file should have the following format.
|
||||
a) first line: Must have LOCUS in the first 5 spaces. The
|
||||
genetic locus name or identifier must be in spaces
|
||||
13 - 22. The length of the sequence is right
|
||||
justified in spaces 23 through 29 (see LINE 10).
|
||||
b) second line: Must have DEFINITION in the first 10 spaces.
|
||||
Spaces 13 - 80 are free form text to identify the
|
||||
sequence (see LINE 11).
|
||||
c) third line: Must have ACCESSION in the first 9 spaces. Spaces
|
||||
13 - 18 must hold the primary accession number
|
||||
(see LINE 12).
|
||||
d) fourth line: Must have ORIGIN in the first 6 spaces. Nothing
|
||||
else is required on this line, it indicates that
|
||||
the nucleic acid sequence begins on the next line
|
||||
(see LINE 13).
|
||||
e) fifth line: Begins the nucleotide sequence. The first 9
|
||||
spaces of each sequence line may either be blank
|
||||
or may contain the position in the sequence of the
|
||||
first nucleotide on the line. The next 66 spaces
|
||||
hold the nucleotide sequence in six blocks of ten
|
||||
nucleotides. Each of the six blocks begins with a
|
||||
blank space followed by ten nucleotides. Thus the
|
||||
first nucleotide is in space eleven of the line while
|
||||
the last is in space 75 (see LINE 14, LINE 15).
|
||||
f) last line: Must have // in the first 2 spaces to indicate
|
||||
termination of the sequence (see LINE 16).
|
||||
|
||||
NOTE: Multiple sequences may appear in each file. To begin another
|
||||
sequence go back to a) and start again.
|
||||
|
||||
|
||||
Example GenBank file
|
||||
|
||||
|
||||
LINE 1 : GENETIC SEQUENCE DATA BANK
|
||||
LINE 2 :
|
||||
LINE 3 :
|
||||
LINE 4 :
|
||||
LINE 5 :
|
||||
LINE 6 :
|
||||
LINE 7 :
|
||||
LINE 8 :
|
||||
LINE 9 :
|
||||
LINE 10 :LOCUS L_Name Length BP
|
||||
LINE 11 :DEFINITION Describe the sequence any way you want
|
||||
LINE 12 :ACCESSION Accession Number
|
||||
LINE 13 :ORIGIN
|
||||
LINE 14 : 1 acgtacgtac gtacgtacgt acgtacgtac gtacgtacgt a...
|
||||
LINE 15 : 61 acgt...
|
||||
LINE 16 ://
|
||||
|
||||
|
||||
|
||||
EMBL File Format
|
||||
|
||||
Unlike the GenBank file format the EMBL file format does not require
|
||||
a series of header lines. Thus the first line in the file begins
|
||||
the first sequence entry of the file.
|
||||
|
||||
1. The first line of each sequence entry contains the two letters ID
|
||||
in the first two spaces. This is followed by the EMBL identifier
|
||||
in spaces 6 through 14. (See LINE 1).
|
||||
|
||||
2. The second line of each sequence entry has the two letters AC in
|
||||
the first two spaces. This is followed by the accession number in
|
||||
spaces 6 through 11. (See LINE 2).
|
||||
|
||||
3. The third line of each sequence entry has the two letters DE in the
|
||||
first two spaces. This is followed by a free form text definition
|
||||
in spaces 6 through 72. (See LINE 3).
|
||||
|
||||
4. The fourth line in each sequence entry has the two letters SQ in
|
||||
the first two spaces. This is followed by the length of the
|
||||
sequence beginning at or after space 13. After the sequence length
|
||||
there is a blank space and the two letters BP. (See LINE 4).
|
||||
|
||||
5. The nucleotide sequence begins on the fifth line of the sequence
|
||||
entry. Each line of sequence begins with four blank spaces. The
|
||||
next 66 spaces hold the nucleotide sequence in six blocks of ten
|
||||
nucleotides. Each of the six blocks begins with a blank space
|
||||
followed by ten nucleotides. Thus the first nucleotide is in space
|
||||
6 of the line while the last is in space 70. (See LINE 5 -
|
||||
LINE 6).
|
||||
|
||||
6. The last line of each sequence entry in the file is a terminator
|
||||
line which has the two characters // in the first two spaces.
|
||||
(See LINE 7).
|
||||
|
||||
7. Multiple sequences may appear in each file. To begin another
|
||||
sequence go back to item 1 and start again.
|
||||
|
||||
|
||||
Example EMBL file
|
||||
|
||||
LINE 1 :ID ID_name
|
||||
LINE 2 :AC Accession number
|
||||
LINE 3 :DE Describe the sequence any way you want
|
||||
LINE 4 :SQ Length BP
|
||||
LINE 5 : ACGTACGTAC GTACGTACGT ACGTACGTAC GTACGTA...
|
||||
LINE 6 : ACGT...
|
||||
LINE 7 ://
|
||||
|
||||
|
||||
|
||||
NBRF (protein or nucleic acid) File Format
|
||||
|
||||
1. The first line of each sequence entry begins with a greater than
|
||||
symbol, >. This is immediately followed by the two character
|
||||
sequence type specifier. Space four must contain a semi-colon.
|
||||
Beginning in space five is the sequence name or identification code
|
||||
for the NBRF database. The code is from four to six letters and
|
||||
numbers. (See LINE 1).
|
||||
|
||||
!!!! >> add these to readseq
|
||||
Specifier Sequence type
|
||||
|
||||
P1 protein, complete
|
||||
F1 protein, fragment
|
||||
DL DNA, linear
|
||||
DC DNA, circular
|
||||
RL RNA, linear
|
||||
RC RNA, circular
|
||||
N1 functional RNA, other than tRNA
|
||||
N3 tRNA
|
||||
|
||||
2. The second line of each sequence entry contains two kinds of
|
||||
information. First is the sequence name which is separated from
|
||||
the organism or organelle name by the three character sequence
|
||||
blank space, dash, blank space, " - ". There is no special
|
||||
character marking the beginning of this line. (See LINE 2).
|
||||
|
||||
3. Either the amino acid or nucleic acid sequence begins on line three
|
||||
and can begin in any space, including the first. The sequence is
|
||||
free format and may be interrupted by blanks for ease of reading.
|
||||
Protein sequences may contain special punctuation to indicate
|
||||
various indeterminacies in the sequence. In the NBRF data files
|
||||
all lines may be up to 500 characters long. However some PSC
|
||||
programs currently have a limit of 130 characters per line
|
||||
(including blanks), and BitNet will not accept lines of over eighty
|
||||
characters. (See LINE 3, LINE 4, and LINE 5).
|
||||
|
||||
The last character in the sequence must be an asterisk, *.
|
||||
|
||||
Example NBRF file
|
||||
|
||||
LINE 1 :>P1;CBRT
|
||||
LINE 2 :Cytochrome b - Rat mitochondrion (SGC1)
|
||||
LINE 3 :M T N I R K S H P L F K I I N H S F I D L P A P S
|
||||
LINE 4 : VTHICRDVN Y GWL IRY
|
||||
LINE 5 :TWIGGQPVEHPFIIIGQLASISYFSIILILMPISGIVEDKMLKWN*
|
||||
|
||||
|
||||
|
||||
MolGen/Stanford File Format
|
||||
|
||||
1. The first line in a sequence file is a comment line. This line
|
||||
begins with a semi-colon in the first space. This line need
|
||||
not be present. If it is present it holds descriptive text.
|
||||
There may be as many comment lines as desired at the start of the
|
||||
sequence file. (See LINE 1).
|
||||
|
||||
2. The second line must be present and contains an identifier or
|
||||
name for the sequence in the first ten spaces. (See LINE 2).
|
||||
|
||||
3. The sequence begins on the third line and occupies up to eighty
|
||||
spaces. Spaces may be included in the sequence for ease of
|
||||
reading. The sequence continues for as many lines as needed
|
||||
and is terminated with a 1 or 2. 1 indicates a linear sequence
|
||||
while 2 marks a circular sequence. (See LINE 3 and LINE 4).
|
||||
|
||||
Example MolGen/Stanford file
|
||||
|
||||
LINE 1 :; Describe the sequence any way you want
|
||||
LINE 2 :ECTRNAGLY2
|
||||
LINE 3 :ACGCACGTAC ACGTACGTAC A C G T C C G T ACG TAC GTA CGT
|
||||
LINE 4 : GCTTA GG G C T A1
|
||||
|
||||
|
||||
|
||||
|
||||
||||||||||| Phylip file format
|
||||
---------------------------------------------------
|
||||
|
||||
Phylip 3.3 File Format (DNA sequences)
|
||||
|
||||
|
||||
The input and output formats for PROTPARS and for RESTML are described in
|
||||
their document files. In general their input formats are similar to those
|
||||
described here, except that the one-letter codes for data are specific to those
|
||||
programs and are described in those document files. Since the input formats
|
||||
for the eight DNA sequence programs apply to all eight, they are described
|
||||
here. Their input formats are standard: the data have A's, G's, C's and T's
|
||||
(or U's). The first line of the input file contains the number of species and
|
||||
the number of sites. As with the other programs, options information may
|
||||
follow this. In the case of DNAML, DNAMLK, and DNADIST an additional line
|
||||
(described in the document file for these programs) may follow the first one.
|
||||
Following this, each species starts on a new line. The first 10 characters of
|
||||
that line are the species name. There then follows the base sequence of that
|
||||
species, each character being one of the letters A, B, C, D, G, H, K, M, N, O,
|
||||
R, S, T, U, V, W, X, Y, ?, or - (a period was also previously allowed but it is
|
||||
no longer allowed, because it sometimes is used in aligned sequences to mean
|
||||
"the same as the sequence above"). Blanks will be ignored, and so will
|
||||
numerical digits. This allows GENBANK and EMBL sequence entries to be read
|
||||
with minimum editing.
|
||||
|
||||
These characters can be either upper or lower case. The algorithms
|
||||
convert all input characters to upper case (which is how they are treated).
|
||||
The characters constitute the IUPAC (IUB) nucleic acid code plus some slight
|
||||
extensions. They enable input of nucleic acid sequences taking full account of
|
||||
any ambiguities in the sequence.
|
||||
|
||||
The sequences can continue over multiple lines; when this is done the sequences
|
||||
must be either in "interleaved" format, similar to the output of alignment
|
||||
programs, or "sequential" format. These are described in the main document
|
||||
file. In sequential format all of one sequence is given, possibly on multiple
|
||||
lines, before the next starts. In interleaved format the first part of the
|
||||
file should contain the first part of each of the sequences, then possibly a
|
||||
line containing nothing but a carriage-return character, then the second part
|
||||
of each sequence, and so on. Only the first parts of the sequences should be
|
||||
preceded by names. Here is a hypothetical example of interleaved format:
|
||||
|
||||
5 42
|
||||
Turkey AAGCTNGGGC ATTTCAGGGT
|
||||
Salmo gairAAGCCTTGGC AGTGCAGGGT
|
||||
H. SapiensACCGGTTGGC CGTTCAGGGT
|
||||
Chimp AAACCCTTGC CGTTACGCTT
|
||||
Gorilla AAACCCTTGC CGGTACGCTT
|
||||
|
||||
GAGCCCGGGC AATACAGGGT AT
|
||||
GAGCCGTGGC CGGGCACGGT AT
|
||||
ACAGGTTGGC CGTTCAGGGT AA
|
||||
AAACCGAGGC CGGGACACTC AT
|
||||
AAACCATTGC CGGTACGCTT AA
|
||||
|
||||
while in sequential format the same sequences would be:
|
||||
|
||||
5 42
|
||||
Turkey AAGCTNGGGC ATTTCAGGGT
|
||||
GAGCCCGGGC AATACAGGGT AT
|
||||
Salmo gairAAGCCTTGGC AGTGCAGGGT
|
||||
GAGCCGTGGC CGGGCACGGT AT
|
||||
H. SapiensACCGGTTGGC CGTTCAGGGT
|
||||
ACAGGTTGGC CGTTCAGGGT AA
|
||||
Chimp AAACCCTTGC CGTTACGCTT
|
||||
AAACCGAGGC CGGGACACTC AT
|
||||
Gorilla AAACCCTTGC CGGTACGCTT
|
||||
AAACCATTGC CGGTACGCTT AA
|
||||
|
||||
|
||||
Note, of course, that a portion of a sequence like this:
|
||||
|
||||
300 AAGCGTGAAC GTTGTACTAA TRCAG
|
||||
|
||||
is perfectly legal, assuming that the species name has gone before, and is
|
||||
filled out to full length by blanks. The above digits and blanks will be
|
||||
ignored, the sequence being taken as starting at the first base symbol (in this
|
||||
case an A).
|
||||
|
||||
The present versions of the programs may sometimes have difficulties with
|
||||
the blank lines between groups of lines, and if so you might want to retype
|
||||
those lines, making sure that they have only a carriage-return and no blank
|
||||
characters on them, or you may perhaps have to eliminate them. The symptoms of
|
||||
this problem are that the programs complain that the sequences are not properly
|
||||
aligned, and you can find no other cause for this complaint.
|
||||
|
||||
------------------------------------------------
|
||||
|
||||
|
||||
||||||||||| ASN.1 file format
|
||||
---------------------------------------------------
|
||||
|
||||
|
||||
ASN.1 -- see NCBI toolkit docs, source and examples (ncbi.nlm.nih.gov)
|
||||
|
||||
Example asn.1 sequence file----
|
||||
|
||||
Bioseq-set ::= {
|
||||
seq-set {
|
||||
seq {
|
||||
id { local id 1 } , -- id essential
|
||||
descr { title "Dummy sequence data from nowhere" } , -- optional
|
||||
inst { -- inst essential
|
||||
repr raw ,
|
||||
mol dna ,
|
||||
length 156 ,
|
||||
topology linear ,
|
||||
seq-data
|
||||
iupacna "GAATTCATTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA
|
||||
TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG
|
||||
TGGATTCAAAGCAATAGAGTTGTTCTT"
|
||||
} } ,
|
||||
|
||||
seq {
|
||||
id { local id 2 } ,
|
||||
descr { title "Dummy sequence 2 data from somewhere else" } ,
|
||||
inst {
|
||||
repr raw ,
|
||||
mol dna ,
|
||||
length 150 ,
|
||||
topology linear ,
|
||||
seq-data
|
||||
iupacna "TTTTTTTTTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA
|
||||
TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG
|
||||
TGGATTCAAAGCAATAGAGTT"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
partial ASN.1 description from toolkit
|
||||
|
||||
Bioseq ::= SEQUENCE {
|
||||
id SET OF Seq-id , -- equivalent identifiers
|
||||
descr Seq-descr OPTIONAL , -- descriptors
|
||||
inst Seq-inst , -- the sequence data
|
||||
annot SET OF Seq-annot OPTIONAL }
|
||||
|
||||
Seq-inst ::= SEQUENCE { -- the sequence data itself
|
||||
repr ENUMERATED { -- representation class
|
||||
not-set (0) , -- empty
|
||||
virtual (1) , -- no seq data
|
||||
raw (2) , -- continuous sequence
|
||||
seg (3) , -- segmented sequence
|
||||
const (4) , -- constructed sequence
|
||||
ref (5) , -- reference to another sequence
|
||||
consen (6) , -- consensus sequence or pattern
|
||||
map (7) , -- ordered map (genetic, restriction)
|
||||
other (255) } ,
|
||||
mol ENUMERATED { -- molecule class in living organism
|
||||
not-set (0) , -- > cdna = rna
|
||||
dna (1) ,
|
||||
rna (2) ,
|
||||
aa (3) ,
|
||||
na (4) , -- just a nucleic acid
|
||||
other (255) } ,
|
||||
length INTEGER OPTIONAL , -- length of sequence in residues
|
||||
fuzz Int-fuzz OPTIONAL , -- length uncertainty
|
||||
topology ENUMERATED { -- topology of molecule
|
||||
not-set (0) ,
|
||||
linear (1) ,
|
||||
circular (2) ,
|
||||
tandem (3) , -- some part of tandem repeat
|
||||
other (255) } DEFAULT linear ,
|
||||
strand ENUMERATED { -- strandedness in living organism
|
||||
not-set (0) ,
|
||||
ss (1) , -- single strand
|
||||
ds (2) , -- double strand
|
||||
mixed (3) ,
|
||||
other (255) } OPTIONAL , -- default ds for DNA, ss for RNA, pept
|
||||
seq-data Seq-data OPTIONAL , -- the sequence
|
||||
ext Seq-ext OPTIONAL , -- extensions for special types
|
||||
hist Seq-hist OPTIONAL } -- sequence history
|
||||
|
||||
------------------------------------------------
|
63
Make.com
Normal file
63
Make.com
Normal file
|
@ -0,0 +1,63 @@
|
|||
$!
|
||||
$!VAX-VMS cc make file for readseq
|
||||
$!
|
||||
$ echo := write sys$output
|
||||
$ if p1.eqs."TEST" then goto tests
|
||||
$
|
||||
$ echo "compiling readseq..."
|
||||
$ cc readseq, ureadseq
|
||||
$!
|
||||
$ echo "linking readseq..."
|
||||
$ link readseq, ureadseq, sys$library:vaxcrtl/lib
|
||||
$!
|
||||
$tests:
|
||||
$!
|
||||
$ echo "defining readseq symbol:"
|
||||
$ dd = f$environment("default")
|
||||
$ readseq :== $ 'dd'readseq.exe
|
||||
$ show symbol readseq
|
||||
$!
|
||||
$ echo ""
|
||||
$ echo "test for general read/write of all chars:"
|
||||
$ readseq -p alphabet.std -otest.alpha
|
||||
$ diff test.alpha alphabet.std
|
||||
$!
|
||||
$ echo ""
|
||||
$ echo "test for valid format conversions"
|
||||
$!
|
||||
$ readseq -v -p -f=ig nucleic.std -otest.ig
|
||||
$ readseq -v -p -f=gb test.ig -otest.gb
|
||||
$ readseq -v -p -f=nbrf test.gb -otest.nbrf
|
||||
$ readseq -v -p -f=embl test.nbrf -otest.embl
|
||||
$ readseq -v -p -f=gcg test.embl -otest.gcg
|
||||
$ readseq -v -p -f=strider test.gcg -otest.strider
|
||||
$ readseq -v -p -f=fitch test.strider -otest.fitch
|
||||
$ readseq -v -p -f=fasta test.fitch -otest.fasta
|
||||
$ readseq -v -p -f=pir test.fasta -otest.pir
|
||||
$ readseq -v -p -f=ig test.pir -otest.ig-b
|
||||
$ diff test.ig test.ig-b
|
||||
$!
|
||||
$ echo ""
|
||||
$ echo "Test for multiple-sequence format conversions:"
|
||||
$ readseq -p -f=ig multi.std -otest.m-ig
|
||||
$ readseq -p -f=gb test.m-ig -otest.m-gb
|
||||
$ readseq -p -f=nbrf test.m-gb -otest.m-nbrf
|
||||
$ readseq -p -f=embl test.m-nbrf -otest.m-embl
|
||||
$ readseq -p -f=fasta test.m-embl -otest.m-fasta
|
||||
$ readseq -p -f=pir test.m-fasta -otest.m-pir
|
||||
$ readseq -p -f=msf test.m-pir -otest.m-msf
|
||||
$ readseq -p -f=paup test.m-msf -otest.m-paup
|
||||
$ readseq -p -f=ig test.m-paup -otest.m-ig-b
|
||||
$ diff test.m-ig test.m-ig-b
|
||||
$ echo ""
|
||||
$ echo "Expect differences in the header lines due to"
|
||||
$ echo "different format headers. If any sequence lines"
|
||||
$ echo "differ, or if checksums differ, there is a problem."
|
||||
$!
|
||||
$! #cleanup
|
||||
$! delete test.*;
|
||||
$ echo "-----------"
|
||||
$ echo ""
|
||||
$ echo "To clean up test files, command me:
|
||||
$ echo " DELETE test.*;"
|
||||
$!
|
109
Make.ncbi
Normal file
109
Make.ncbi
Normal file
|
@ -0,0 +1,109 @@
|
|||
#
|
||||
# Unix Makefile for readseq
|
||||
# to use, command me:
|
||||
# % make -- or --
|
||||
# % make CC=your-c-compiler-name
|
||||
#
|
||||
|
||||
# pick an ANSI C compiler (the default Sun CC is not ANSI)
|
||||
CC=gcc # Gnu C Compiler
|
||||
#CC=cc # SGI Irix
|
||||
#CC=vcc # some DEC Ultrix
|
||||
|
||||
CFLAGS=
|
||||
#CFLAGS= -DSMALLCHECKSUM # if you prefer to use a GCG-standard 13 bit checksum
|
||||
# instead of a full 32 bit checksum. This may enhance compatibility w/ GCG software
|
||||
|
||||
SOURCES= readseq.c ureadseq.c ureadseq.h ureadasn.c
|
||||
DOCS= Readme readseq.help Formats Stdfiles Makefile Make.com add.gdemenu *.std
|
||||
|
||||
|
||||
# NCBI toolkit support for ASN.1 reader
|
||||
|
||||
# this is path to NCBI toolkit, you must set for your system:
|
||||
NCBI=/bio/mb/ncbi
|
||||
#
|
||||
OTHERLIBS=-lm
|
||||
LIB1=-lncbi
|
||||
LIB2=-lncbiobj
|
||||
LIB3=-lncbicdr
|
||||
LIB4=-lvibrant
|
||||
INCPATH=$(NCBI)/include
|
||||
LIBPATH=$(NCBI)/lib
|
||||
NCFLAGS=$(CFLAGS) -DNCBI -I$(INCPATH)
|
||||
NLDFLAGS=-I$(INCPATH) -L$(LIBPATH)
|
||||
NLIBS=$(LIB1) $(LIB2) $(OTHERLIBS)
|
||||
|
||||
|
||||
all: build test
|
||||
|
||||
#build: $(SOURCES)
|
||||
# @echo "Compiling readseq..."
|
||||
# $(CC) $(CFLAGS) -o readseq readseq.c ureadseq.c
|
||||
|
||||
# if using NCBI, uncomment these lines in place of build: above
|
||||
build: $(SOURCES)
|
||||
@echo "Compiling readseq with NCBI toolkit support...";
|
||||
$(CC) -o readseq $(NLDFLAGS) $(NCFLAGS) readseq.c ureadseq.c ureadasn.c $(NLIBS)
|
||||
|
||||
test: $(SOURCES) readseq
|
||||
@echo ""
|
||||
@echo "Test for general read/write of all chars:"
|
||||
./readseq -p alphabet.std -otest.alpha
|
||||
-diff test.alpha alphabet.std
|
||||
|
||||
@echo ""
|
||||
@echo "Test for valid format conversions:"
|
||||
./readseq -v -p -f=ig nucleic.std -otest.ig
|
||||
./readseq -v -p -f=gb test.ig -otest.gb
|
||||
./readseq -v -p -f=nbrf test.gb -otest.nbrf
|
||||
./readseq -v -p -f=embl test.nbrf -otest.embl
|
||||
./readseq -v -p -f=gcg test.embl -otest.gcg
|
||||
./readseq -v -p -f=strider test.gcg -otest.strider
|
||||
./readseq -v -p -f=fitch test.strider -otest.fitch
|
||||
./readseq -v -p -f=fasta test.fitch -otest.fasta
|
||||
./readseq -v -p -f=pir test.fasta -otest.pir
|
||||
./readseq -v -p -f=ig test.pir -otest.ig-b
|
||||
-diff test.ig test.ig-b
|
||||
|
||||
@echo ""
|
||||
@echo "Test for multiple-sequence format conversions:"
|
||||
./readseq -p -f=ig multi.std -otest.m-ig
|
||||
./readseq -p -f=gb test.m-ig -otest.m-gb
|
||||
./readseq -p -f=nbrf test.m-gb -otest.m-nbrf
|
||||
./readseq -p -f=embl test.m-nbrf -otest.m-embl
|
||||
./readseq -p -f=fasta test.m-embl -otest.m-fasta
|
||||
./readseq -p -f=pir test.m-fasta -otest.m-pir
|
||||
./readseq -p -f=msf test.m-pir -otest.m-msf
|
||||
./readseq -p -f=paup test.m-msf -otest.m-paup
|
||||
./readseq -p -f=ig test.m-paup -otest.m-ig-b
|
||||
-diff test.m-ig test.m-ig-b
|
||||
#
|
||||
# if using NCBI, uncomment these lines
|
||||
@echo ""
|
||||
@echo "Test of NCBI ASN.1 conversions:"
|
||||
./readseq -p -f=asn test.m-ig -otest.m-asn
|
||||
./readseq -p -f=ig test.m-asn -otest.m-ig-c
|
||||
-diff test.m-ig test.m-ig-c
|
||||
#
|
||||
@echo ""
|
||||
@echo "Expect differences in the header lines due to"
|
||||
@echo "different format headers. If any sequence lines"
|
||||
@echo "differ, or if the checksums differ, there is a problem."
|
||||
@echo "----------------------"
|
||||
@echo ""
|
||||
@echo "To clean up test files, command me:"
|
||||
@echo " make clean"
|
||||
|
||||
|
||||
clean:
|
||||
rm -f *.o core test.*
|
||||
|
||||
shar:
|
||||
@echo "shell archiving files..."
|
||||
-rm -f readseq*.shar
|
||||
mkdir readseqd
|
||||
cp $(SOURCES) readseqd
|
||||
cp $(DOCS) readseqd
|
||||
shar -v readseqd > readseq.shar
|
||||
rm -rf readseqd
|
160
Readme
Normal file
160
Readme
Normal file
|
@ -0,0 +1,160 @@
|
|||
|
||||
* ReadSeq -- 1 Feb 93
|
||||
*
|
||||
* Reads and writes nucleic/protein sequences in various
|
||||
* formats. Data files may have multiple sequences.
|
||||
*
|
||||
* Copyright 1990 by d.g.gilbert
|
||||
* biology dept., indiana university, bloomington, in 47405
|
||||
* e-mail: gilbertd@bio.indiana.edu
|
||||
*
|
||||
* This program may be freely copied and used by anyone.
|
||||
* Developers are encouraged to incorporate parts in their
|
||||
* programs, rather than devise their own private sequence
|
||||
* format.
|
||||
*
|
||||
* This should compile and run with any ANSI C compiler.
|
||||
* Please advise me of any bugs, additions or corrections.
|
||||
|
||||
Readseq has been updated. There have been a number of enhancements
|
||||
and a few bug corrections since the previous general release in Nov 91
|
||||
(see below). If you are using earlier versions, I recommend you update to
|
||||
this release.
|
||||
|
||||
Readseq is particularly useful as it automatically detects many
|
||||
sequence formats, and interconverts among them.
|
||||
Formats added to this release include
|
||||
+ MSF multi sequence format used by GCG software
|
||||
+ PAUP's multiple sequence (NEXUS) format
|
||||
+ PIR/CODATA format used by PIR
|
||||
+ ASN.1 format used by NCBI
|
||||
+ Pretty print with various options for nice looking output.
|
||||
|
||||
As well, Phylip format can now be used as input. Options to
|
||||
reverse-complement and to degap sequences have been added. A menu
|
||||
addition for users of the GDE sequence editor is included.
|
||||
|
||||
This program is available thru Internet gopher, as
|
||||
|
||||
gopher ftp.bio.indiana.edu
|
||||
browse into the IUBio-Software+Data/molbio/readseq/ folder
|
||||
select the readseq.shar document
|
||||
|
||||
Or thru anonymous FTP in this manner:
|
||||
my_computer> ftp ftp.bio.indiana.edu (or IP address 129.79.224.25)
|
||||
username: anonymous
|
||||
password: my_username@my_computer
|
||||
ftp> cd molbio/readseq
|
||||
ftp> get readseq.shar
|
||||
ftp> bye
|
||||
|
||||
readseq.shar is a Unix shell archive of the readseq files.
|
||||
This file can be edited by any text editor to reconstitute the
|
||||
original files, for those who do not have a Unix system or an
|
||||
Unshar program. Read the top of this .shar file for further
|
||||
instructions.
|
||||
|
||||
There are also pre-compiled executables for the following computers:
|
||||
Silicon Graphics Iris, Sparc (Sun Sparcstation & clones), VMS-Vax,
|
||||
Macintosh. Use binary ftp to transfer these, except Macintosh. The
|
||||
Mac version is just the command-line program in a window, not very
|
||||
handy.
|
||||
|
||||
C source files:
|
||||
readseq.c ureadseq.c ureadasn.c ureadseq.h
|
||||
Document files:
|
||||
Readme (this doc)
|
||||
Readseq.help (longer than this doc)
|
||||
Formats (description of sequence file formats)
|
||||
add.gdemenu (GDE program users can add this to the .GDEmenu file)
|
||||
Stdfiles -- test sequence files
|
||||
Makefile -- Unix make file
|
||||
Make.com -- VMS make file
|
||||
*.std -- files for testing validity of readseq
|
||||
|
||||
|
||||
Example usage:
|
||||
readseq
|
||||
-- for interactive use
|
||||
readseq my.1st.seq my.2nd.seq -all -format=genbank -output=my.gb
|
||||
-- convert all of two input files to one genbank format output file
|
||||
readseq my.seq -all -form=pretty -nameleft=3 -numleft -numright -numtop -match
|
||||
-- output to standard output a file in a pretty format
|
||||
readseq my.seq -item=9,8,3,2 -degap -CASE -rev -f=msf -out=my.rev
|
||||
-- select 4 items from input, degap, reverse, and uppercase them
|
||||
cat *.seq | readseq -pipe -all -format=asn > bunch-of.asn
|
||||
-- pipe a bunch of data thru readseq, converting all to asn
|
||||
|
||||
|
||||
The brief usage of readseq is as follows. The "[]" denote
|
||||
optional parts of the syntax:
|
||||
|
||||
readseq -help
|
||||
readSeq (27Dec92), multi-format molbio sequence reader.
|
||||
usage: readseq [-options] in.seq > out.seq
|
||||
options
|
||||
-a[ll] select All sequences
|
||||
-c[aselower] change to lower case
|
||||
-C[ASEUPPER] change to UPPER CASE
|
||||
-degap[=-] remove gap symbols
|
||||
-i[tem=2,3,4] select Item number(s) from several
|
||||
-l[ist] List sequences only
|
||||
-o[utput=]out.seq redirect Output
|
||||
-p[ipe] Pipe (command line, <stdin, >stdout)
|
||||
-r[everse] change to Reverse-complement
|
||||
-v[erbose] Verbose progress
|
||||
-f[ormat=]# Format number for output, or
|
||||
-f[ormat=]Name Format name for output:
|
||||
1. IG/Stanford 10. Olsen (in-only)
|
||||
2. GenBank/GB 11. Phylip3.2
|
||||
3. NBRF 12. Phylip
|
||||
4. EMBL 13. Plain/Raw
|
||||
5. GCG 14. PIR/CODATA
|
||||
6. DNAStrider 15. MSF
|
||||
7. Fitch 16. ASN.1
|
||||
8. Pearson/Fasta 17. PAUP
|
||||
9. Zuker 18. Pretty (out-only)
|
||||
|
||||
Pretty format options:
|
||||
-wid[th]=# sequence line width
|
||||
-tab=# left indent
|
||||
-col[space]=# column space within sequence line on output
|
||||
-gap[count] count gap chars in sequence numbers
|
||||
-nameleft, -nameright[=#] name on left/right side [=max width]
|
||||
-nametop name at top/bottom
|
||||
-numleft, -numright seq index on left/right side
|
||||
-numtop, -numbot index on top/bottom
|
||||
-match[=.] use match base for 2..n species
|
||||
-inter[line=#] blank line(s) between sequence blocks
|
||||
|
||||
|
||||
|
||||
Recent changes:
|
||||
|
||||
4 May 92
|
||||
+ added 32 bit CRC checksum as alternative to GCG 6.5bit checksum
|
||||
Aug 92
|
||||
= fixed Olsen format input to handle files w/ more sequences,
|
||||
not to mess up when more than one seq has same identifier,
|
||||
and to convert number masks to symbols.
|
||||
= IG format fix to understand ^L
|
||||
30 Dec 92
|
||||
* revised command-line & interactive interface. Suggested form is now
|
||||
readseq infile -format=genbank -output=outfile -item=1,3,4 ...
|
||||
but remains compatible with prior commandlines:
|
||||
readseq infile -f2 -ooutfile -i3 ...
|
||||
+ added GCG MSF multi sequence file format
|
||||
+ added PIR/CODATA format
|
||||
+ added NCBI ASN.1 sequence file format
|
||||
+ added Pretty, multi sequence pretty output (only)
|
||||
+ added PAUP multi seq format
|
||||
+ added degap option
|
||||
+ added Gary Williams (GWW, G.Williams@CRC.AC.UK) reverse-complement option.
|
||||
+ added support for reading Phylip formats (interleave & sequential)
|
||||
* string fixes, dropped need for compiler flags NOSTR, FIXTOUPPER, NEEDSTRCASECMP
|
||||
* changed 32bit checksum to default, -DSMALLCHECKSUM for GCG version
|
||||
|
||||
1Feb93
|
||||
= reverted Genbank output format to fixed left margin
|
||||
(change in 30 Dec release), so GDE and others relying on fixed margin
|
||||
can read this.
|
229
Readseq.help
Normal file
229
Readseq.help
Normal file
|
@ -0,0 +1,229 @@
|
|||
|
||||
* ReadSeq.Help -- 30 Dec 92
|
||||
*
|
||||
* Reads and writes nucleic/protein sequences in various
|
||||
* formats. Data files may have multiple sequences.
|
||||
*
|
||||
* Copyright 1990 by d.g.gilbert
|
||||
* biology dept., indiana university, bloomington, in 47405
|
||||
* e-mail: gilbertd@bio.indiana.edu
|
||||
*
|
||||
* This program may be freely copied and used by anyone.
|
||||
* Developers are encouraged to incorporate parts in their
|
||||
* programs, rather than devise their own private sequence
|
||||
* format.
|
||||
*
|
||||
* This should compile and run with any ANSI C compiler.
|
||||
* Please advise me of any bugs, additions or corrections.
|
||||
|
||||
Readseq is particularly useful as it automatically detects many
|
||||
sequence formats, and interconverts among them.
|
||||
|
||||
Formats which readseq currently understands:
|
||||
|
||||
* IG/Stanford, used by Intelligenetics and others
|
||||
* GenBank/GB, genbank flatfile format
|
||||
* NBRF format
|
||||
* EMBL, EMBL flatfile format
|
||||
* GCG, single sequence format of GCG software
|
||||
* DNAStrider, for common Mac program
|
||||
* Fitch format, limited use
|
||||
* Pearson/Fasta, a common format used by Fasta programs and others
|
||||
* Zuker format, limited use. Input only.
|
||||
* Olsen, format printed by Olsen VMS sequence editor. Input only.
|
||||
* Phylip3.2, sequential format for Phylip programs
|
||||
* Phylip, interleaved format for Phylip programs (v3.3, v3.4)
|
||||
* Plain/Raw, sequence data only (no name, document, numbering)
|
||||
+ MSF multi sequence format used by GCG software
|
||||
+ PAUP's multiple sequence (NEXUS) format
|
||||
+ PIR/CODATA format used by PIR
|
||||
+ ASN.1 format used by NCBI
|
||||
+ Pretty print with various options for nice looking output. Output only.
|
||||
|
||||
See the included "Formats" file for detail on file formats.
|
||||
|
||||
|
||||
Example usage:
|
||||
readseq
|
||||
-- for interactive use
|
||||
|
||||
readseq my.1st.seq my.2nd.seq -all -format=genbank -output=my.gb
|
||||
-- convert all of two input files to one genbank format output file
|
||||
|
||||
readseq my.seq -all -form=pretty -nameleft=3 -numleft -numright -numtop -match
|
||||
-- output to standard output a file in a pretty format
|
||||
|
||||
readseq my.seq -item=9,8,3,2 -degap -CASE -rev -f=msf -out=my.rev
|
||||
-- select 4 items from input, degap, reverse, and uppercase them
|
||||
|
||||
cat *.seq | readseq -pipe -all -format=asn > bunch-of.asn
|
||||
-- pipe a bunch of data thru readseq, converting all to asn
|
||||
|
||||
|
||||
The brief usage of readseq is as follows. The "[]" denote
|
||||
optional parts of the syntax:
|
||||
|
||||
readseq -help
|
||||
readSeq (27Dec92), multi-format molbio sequence reader.
|
||||
usage: readseq [-options] in.seq > out.seq
|
||||
options
|
||||
-a[ll] select All sequences
|
||||
-c[aselower] change to lower case
|
||||
-C[ASEUPPER] change to UPPER CASE
|
||||
-degap[=-] remove gap symbols
|
||||
-i[tem=2,3,4] select Item number(s) from several
|
||||
-l[ist] List sequences only
|
||||
-o[utput=]out.seq redirect Output
|
||||
-p[ipe] Pipe (command line, <stdin, >stdout)
|
||||
-r[everse] change to Reverse-complement
|
||||
-v[erbose] Verbose progress
|
||||
-f[ormat=]# Format number for output, or
|
||||
-f[ormat=]Name Format name for output:
|
||||
1. IG/Stanford 10. Olsen (in-only)
|
||||
2. GenBank/GB 11. Phylip3.2
|
||||
3. NBRF 12. Phylip
|
||||
4. EMBL 13. Plain/Raw
|
||||
5. GCG 14. PIR/CODATA
|
||||
6. DNAStrider 15. MSF
|
||||
7. Fitch 16. ASN.1
|
||||
8. Pearson/Fasta 17. PAUP
|
||||
9. Zuker 18. Pretty (out-only)
|
||||
|
||||
Pretty format options:
|
||||
-wid[th]=# sequence line width
|
||||
-tab=# left indent
|
||||
-col[space]=# column space within sequence line on output
|
||||
-gap[count] count gap chars in sequence numbers
|
||||
-nameleft, -nameright[=#] name on left/right side [=max width]
|
||||
-nametop name at top/bottom
|
||||
-numleft, -numright seq index on left/right side
|
||||
-numtop, -numbot index on top/bottom
|
||||
-match[=.] use match base for 2..n species
|
||||
-inter[line=#] blank line(s) between sequence blocks
|
||||
|
||||
|
||||
Notes:
|
||||
|
||||
In use, readseq will respond to command line arguments, or to
|
||||
interactive use. Command line arguments cannot be combined
|
||||
but must each follow a switch character (-). In this release,
|
||||
the command line options are now words, with an equals (=)
|
||||
to separate parameter(s) from the command. You cannot put a
|
||||
space between a command and its parameter, as is usual for
|
||||
Unix programs (this is to preserve compatibility with VMS).
|
||||
The command line syntax of the earlier versions is still
|
||||
supported.
|
||||
|
||||
See the file Formats for details of the sequence formats which
|
||||
are supported by readseq. The auto-detection feature of
|
||||
readseq which distinguishes these formats looks for some of the
|
||||
unique keywords and symbols that are found in each format. It
|
||||
is not infallible at this, though it attempts to exclude unknown
|
||||
formats. In general, if you feed to readseq a sequence file that
|
||||
you know is one of these common formats, you are okay. If you feed
|
||||
it data that might be oddball formats, or non-sequence data,
|
||||
you might well get garbage results. Also, different developers
|
||||
are always thinking up minor twists on these common formats
|
||||
(like PAUP requiring a blank line between blocks of Phylip format,
|
||||
or IG adding form feeds between sequences), which may cause hassles.
|
||||
|
||||
In general, output supports only minimal subsets of each format
|
||||
needed for sequence data exchanges. Features, descriptions
|
||||
and other format-unique information is discarded.
|
||||
|
||||
The pretty format requires additional options to generate a
|
||||
nice output. Try the various pretty options to see what you like.
|
||||
Pretty format is OUTPUT only, readseq cannot read a Pretty format
|
||||
file.
|
||||
|
||||
Readseq is NOT optimized for LARGE files. It generally makes several
|
||||
reads thru each input file (one per sequence output at present, future
|
||||
version may optimize this). It should handle input and output files
|
||||
and sequences of any size, but will slow down quite a bit for very large
|
||||
(multi megabyte) sized files. It is NOT recommended for converting
|
||||
databanks or large subsets there-of. It is primarily directed at the
|
||||
small files that researchers use to maintain their personal data, which
|
||||
they frequently need to interconvert for the various analysis programs
|
||||
which so frequently require a special format.
|
||||
|
||||
Users of Olsen multi sequence editor (VMS). The Olsen format
|
||||
here is produced with the print command:
|
||||
print/out=some.file
|
||||
Use Genbank output from readseq to produce a format that this
|
||||
editor can read, and use the command
|
||||
load/genbank some.file
|
||||
Dan Davison has a VMS program that will convert to/from the
|
||||
Olsen native binary data format. E-mail davison@uh.edu
|
||||
|
||||
Warning: Phylip format input is now supported (30Dec92), however the
|
||||
auto-detection of Phylip format is very probabilistic and messy,
|
||||
especially distinguishing sequential from interleaved versions. It
|
||||
is not recommended that one use readseq to convert files from Phylip
|
||||
format to others unless essential.
|
||||
|
||||
|
||||
This program is available thru Internet gopher, as
|
||||
|
||||
gopher ftp.bio.indiana.edu
|
||||
browse into the IUBio-Software+Data/molbio/readseq/ folder
|
||||
select the readseq.shar document
|
||||
|
||||
Or thru anonymous FTP in this manner:
|
||||
my_computer> ftp ftp.bio.indiana.edu (or IP address 129.79.224.25)
|
||||
username: anonymous
|
||||
password: my_username@my_computer
|
||||
ftp> cd molbio/readseq
|
||||
ftp> get readseq.shar
|
||||
ftp> bye
|
||||
|
||||
readseq.shar is a Unix shell archive of the readseq files.
|
||||
This file can be edited by any text editor to reconstitute the
|
||||
original files, for those who do not have a Unix system or an
|
||||
Unshar program. Read the top of this .shar file for further
|
||||
instructions.
|
||||
|
||||
There are also pre-compiled executables for the following computers:
|
||||
Silicon Graphics Iris, Sparc (Sun Sparcstation & clones), VMS-Vax,
|
||||
Macintosh. Use binary ftp to transfer these, except Macintosh. The
|
||||
Mac version is just the command-line program in a window, not very
|
||||
handy.
|
||||
|
||||
C source files:
|
||||
readseq.c ureadseq.c ureadasn.c ureadseq.h
|
||||
|
||||
Document files:
|
||||
Readme (this doc)
|
||||
Formats (description of sequence file formats)
|
||||
add.gdemenu (GDE program users can add this to the .GDEmenu file)
|
||||
Stdfiles -- test sequence files
|
||||
Makefile -- Unix make file
|
||||
Make.com -- VMS make file
|
||||
*.std -- files for testing validity of readseq
|
||||
|
||||
|
||||
Recent changes (see also readseq.c for all history of changes):
|
||||
|
||||
4 May 92
|
||||
+ added 32 bit CRC checksum as alternative to GCG 6.5bit checksum
|
||||
Aug 92
|
||||
= fixed Olsen format input to handle files w/ more sequences,
|
||||
not to mess up when more than one seq has same identifier,
|
||||
and to convert number masks to symbols.
|
||||
= IG format fix to understand ^L
|
||||
30 Dec 92
|
||||
* revised command-line & interactive interface. Suggested form is now
|
||||
readseq infile -format=genbank -output=outfile -item=1,3,4 ...
|
||||
but remains compatible with prior commandlines:
|
||||
readseq infile -f2 -ooutfile -i3 ...
|
||||
+ added GCG MSF multi sequence file format
|
||||
+ added PIR/CODATA format
|
||||
+ added NCBI ASN.1 sequence file format
|
||||
+ added Pretty, multi sequence pretty output (only)
|
||||
+ added PAUP multi seq format
|
||||
+ added degap option
|
||||
+ added Gary Williams (GWW, G.Williams@CRC.AC.UK) reverse-complement option.
|
||||
+ added support for reading Phylip formats (interleave & sequential)
|
||||
* string fixes, dropped need for compiler flags NOSTR, FIXTOUPPER, NEEDSTRCASECMP
|
||||
* changed 32bit checksum to default, -DSMALLCHECKSUM for GCG version
|
||||
|
||||
|
134
Stdfiles
Normal file
134
Stdfiles
Normal file
|
@ -0,0 +1,134 @@
|
|||
/* Stdfiles
|
||||
generate standard files to test readseq
|
||||
*/
|
||||
|
||||
C
|
||||
#include <stdio.h>
|
||||
/* no sequence formats use chars > #126, ignore these */
|
||||
main(void)
|
||||
{
|
||||
int c;
|
||||
puts("> alphabet['!'..'~']");
|
||||
for (c = '!'; c <= '~'; c++) putc(c,stdout);
|
||||
putc('\n', stdout);
|
||||
}
|
||||
|
||||
link -w -t MPST -c 'MPS ' c.o ∂
|
||||
"{Libraries}"Interface.o "{Libraries}"ToolLibs.o ∂
|
||||
"{Libraries}"Runtime.o "{CLibraries}"StdClib.o
|
||||
link.out > alphabet.orig
|
||||
|
||||
|
||||
C
|
||||
#include <stdio.h>
|
||||
/*
 * Write the nucleic/amino test sequence to stdout: a header line beginning
 * with '>' followed by four identical lines, each made of the amino-acid
 * letters, the primary nucleotide letters and the punctuation symbols, in
 * that order.  The worksheet redirects this output into nucleic.std.
 *
 * Returns 0.
 */
int main(void)
{
  /* note: symbols "*" and "/" removed as terminators for various formats */
  const char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ";
  const char *primenuc = "ACGTU";
  const char *allsymbols = "_.-?<>{}[]()!@#$%^&=+;:'|`~\"\\";
  int count;

  puts("> nucleic/amino test");
  for (count = 0; count < 4; count++) {
    /*
     * Emit the three pieces directly instead of concatenating them into a
     * scratch buffer: same bytes on stdout, and no strcpy/strcat calls
     * that would need <string.h>.
     */
    fputs(aminos, stdout);
    fputs(primenuc, stdout);
    fputs(allsymbols, stdout);
    putc('\n', stdout);
  }

  /* explicit status: implicit-int main with no return is not valid C99+ */
  return 0;
}
|
||||
|
||||
link -w -t MPST -c 'MPS ' c.o ∂
|
||||
"{Libraries}"Interface.o "{Libraries}"ToolLibs.o ∂
|
||||
"{Libraries}"Runtime.o "{CLibraries}"StdClib.o
|
||||
link.out > nucleic.std
|
||||
|
||||
#--------------------------
|
||||
|
||||
#standards (ship w/ readseq)
|
||||
#note: not all alphabet.orig chars are expected to be passed by
|
||||
# readseq. Numbers are dropped.
|
||||
readseq -p alphabet.orig > alphabet.std
|
||||
readseq -p -C alphabet.std > upper.std
|
||||
|
||||
cat alphabet.orig
|
||||
> alphabet['!'..'~']
|
||||
!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
|
||||
|
||||
cat alphabet.std
|
||||
>alphabet['!'..'~'], 83 bases, 9429 checksum.
|
||||
!"#$%&'()*+-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]
|
||||
^_`abcdefghijklmnopqrstuvwxyz{|}~
|
||||
|
||||
cat upper.std
|
||||
>alphabet['!'..'~'], 83 bases, 9429 checksum.
|
||||
!"#$%&'()*+-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]
|
||||
^_`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~
|
||||
|
||||
cat nucleic.std
|
||||
> nucleic/amino test
|
||||
ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;:'|`~"\
|
||||
ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;:'|`~"\
|
||||
ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;:'|`~"\
|
||||
ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;:'|`~"\
|
||||
|
||||
readseq -p nucleic.std
|
||||
>nucleic/amino test, 228 bases, 5952 checksum.
|
||||
ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#$%^&=+;
|
||||
:'|`~"\ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}[]()!@#
|
||||
$%^&=+;:'|`~"\ABCDEFGHIKLMNPQRSTVWXYZACGTU_.-?<>{}
|
||||
[]()!@#$%^&=+;:'|`~"\ABCDEFGHIKLMNPQRSTVWXYZACGTU_
|
||||
.-?<>{}[]()!@#$%^&=+;:'|`~"\
|
||||
|
||||
|
||||
#----------------------------------
|
||||
|
||||
#test for general read/write of all chars:
|
||||
readseq -p alphabet.std -otest.alpha
|
||||
diff test.alpha alphabet.std
|
||||
|
||||
#test for valid toupper, general read/write:
|
||||
readseq -p -C alphabet.std -otest.upper
|
||||
diff test.upper upper.std
|
||||
#for vms, use "-C" to preserve case
|
||||
# readseq -p "-C" alphabet.std -otest.upper
|
||||
|
||||
#test for multiple sequence file conversions
|
||||
# leave out gcg, raw;
|
||||
# test of long seq conversion ?
|
||||
# test of mail-header seq conversion ?
|
||||
|
||||
#test for valid format conversions
|
||||
readseq -v -p -f1 nucleic.std -otest.f1
|
||||
readseq -v -p -f2 test.f1 -otest.f2
|
||||
readseq -v -p -f3 test.f2 -otest.f3
|
||||
readseq -v -p -f4 test.f3 -otest.f4
|
||||
readseq -v -p -f5 test.f4 -otest.f5
|
||||
readseq -v -p -f6 test.f5 -otest.f6
|
||||
readseq -v -p -f7 test.f6 -otest.f7
|
||||
readseq -v -p -f8 test.f7 -otest.f8
|
||||
readseq -v -p -f1 test.f8 -otest.f1b
|
||||
diff test.f1 test.f1b
|
||||
compare test.f1 test.f1b
|
||||
|
||||
readseq -v -p -f13 test.f8 -otest.f13 # raw, drops name
|
||||
readseq -v -p -f9 test.f8 -otest.f9 # zuker, little used
|
||||
#readseq -v -p -f10 test.f9 -otest.f10 # olsen, input only (output=raw)
|
||||
readseq -v -p -f11 test.f8 -otest.f11 # phylip 3.2, output only
|
||||
readseq -v -p -f12 test.f8 -otest.f12 # phylip 3.3, output only
|
||||
readseq -v -p -f14 test.f8 -otest.f14 # PIR/CODATA
|
||||
|
||||
|
||||
#clean up
|
||||
rm test.≈
|
||||
|
||||
|
||||
#-----------------------------
|
||||
# some general tests
|
||||
|
||||
readseq -h
|
||||
|
||||
readseq
|
123
add.gdemenu
Normal file
123
add.gdemenu
Normal file
|
@ -0,0 +1,123 @@
|
|||
#
|
||||
# dgg added new readseq formats, 29 dec 92
|
||||
#
|
||||
|
||||
item:Export Foreign Format
|
||||
itemmethod:readseq in1 -pipe -all -form=$FORMAT > $OUTPUTFILE
|
||||
itemhelp:readseq.help
|
||||
|
||||
arg:FORMAT
|
||||
argtype:choice_menu
|
||||
argchoice:GenBank:genbank
|
||||
argchoice:IG/Stanford:ig
|
||||
argchoice:NBRF:nbrf
|
||||
argchoice:EMBL:embl
|
||||
argchoice:GCG:gcg
|
||||
argchoice:DNA Strider:strider
|
||||
argchoice:Fitch:fitch
|
||||
argchoice:Pearson/Fasta:pearson
|
||||
argchoice:Zuker:zuker
|
||||
argchoice:Olsen:olsen
|
||||
argchoice:Phylip:phylip
|
||||
#argchoice:Phylip v3.2:phylip3.2
|
||||
argchoice:Plain text:raw
|
||||
argchoice:ASN.1:asn
|
||||
argchoice:PIR:pir
|
||||
argchoice:MSF:msf
|
||||
argchoice:PAUP:paup
|
||||
argchoice:Pretty:pretty -nametop -nameleft=3 -numright -nameright -numtop
|
||||
|
||||
arg:OUTPUTFILE
|
||||
argtype:text
|
||||
arglabel:Save as?
|
||||
|
||||
in:in1
|
||||
informat:genbank
|
||||
|
||||
|
||||
#
|
||||
#dgg addition for new readseq, 24 dec 92
|
||||
#
|
||||
|
||||
item:Pretty Print
|
||||
itemmethod:readseq in1 -p -a -f=pretty $NAMELEFT $NAMERIGHT $NUMTOP $NUMBOT $NUMLEFT $NUMRIGHT -col=$COLS -width=$WIDTH $MATCH $GAPC > in1.pretty; (textedit in1.pretty; /bin/rm -f in1 in1.pretty)&
|
||||
itemhelp:readseq.help
|
||||
|
||||
#nametop is bad !?
|
||||
|
||||
in:in1
|
||||
informat:genbank
|
||||
|
||||
arg:NAMETOP
|
||||
argtype:chooser
|
||||
arglabel:Names at top ?
|
||||
argchoice:No:
|
||||
argchoice:Yes:-nametop
|
||||
|
||||
arg:NAMELEFT
|
||||
argtype:chooser
|
||||
arglabel:Names at left ?
|
||||
argchoice:No:
|
||||
argchoice:Yes:-nameleft
|
||||
|
||||
arg:NAMERIGHT
|
||||
argtype:chooser
|
||||
arglabel:Names at right?
|
||||
argchoice:Yes:-nameright
|
||||
argchoice:No:
|
||||
|
||||
arg:NUMTOP
|
||||
argtype:chooser
|
||||
arglabel:Numbers at top ?
|
||||
argchoice:Yes:-numtop
|
||||
argchoice:No:
|
||||
|
||||
arg:NUMBOT
|
||||
argtype:chooser
|
||||
arglabel:Numbers at tail ?
|
||||
argchoice:No:
|
||||
argchoice:Yes:-numbot
|
||||
|
||||
arg:NUMLEFT
|
||||
argtype:chooser
|
||||
arglabel:Numbers at left ?
|
||||
argchoice:Yes:-numleft
|
||||
argchoice:No:
|
||||
|
||||
arg:NUMRIGHT
|
||||
argtype:chooser
|
||||
arglabel:Numbers at right?
|
||||
argchoice:Yes:-numright
|
||||
argchoice:No:
|
||||
|
||||
arg:MATCH
|
||||
argtype:chooser
|
||||
arglabel:Use match '.' for 2..n species?
|
||||
argchoice:No:
|
||||
argchoice:Yes:-match
|
||||
|
||||
arg:GAPC
|
||||
argtype:chooser
|
||||
arglabel:Count gap symbols?
|
||||
argchoice:No:
|
||||
argchoice:Yes:-gap
|
||||
|
||||
arg:WIDTH
|
||||
argtype:slider
|
||||
arglabel:Sequence width?
|
||||
argmin:10
|
||||
argmax:200
|
||||
argvalue:50
|
||||
|
||||
arg:COLS
|
||||
argtype:slider
|
||||
arglabel:Column spacers?
|
||||
argmin:0
|
||||
argmax:50
|
||||
argvalue:10
|
||||
|
||||
|
||||
### pretty print insert end
|
||||
#
|
||||
|
||||
|
412
macinit.r
Normal file
412
macinit.r
Normal file
|
@ -0,0 +1,412 @@
|
|||
/*------------------------------------------------------------------------------
|
||||
#
|
||||
#
|
||||
# MultiFinder-Aware Simple Input/Output Window resource
|
||||
#
|
||||
# for ReadSeq
|
||||
#
|
||||
------------------------------------------------------------------------------*/
|
||||
|
||||
#include "systypes.r"
|
||||
#include "types.r"
|
||||