981 lines
38 KiB
Text
981 lines
38 KiB
Text
|
||||||||||| ReadSeq supported formats (revised 30Dec92)
|
|||
|
--------------------------------------------------------
|
|||
|
|
|||
|
-f[ormat=]Name Format name for output:
|
|||
|
1. IG/Stanford 10. Olsen (in-only)
|
|||
|
2. GenBank/GB 11. Phylip3.2
|
|||
|
3. NBRF 12. Phylip
|
|||
|
4. EMBL 13. Plain/Raw
|
|||
|
5. GCG 14. PIR/CODATA
|
|||
|
6. DNAStrider 15. MSF
|
|||
|
7. Fitch 16. ASN.1
|
|||
|
8. Pearson/Fasta 17. PAUP
|
|||
|
9. Zuker (in-only) 18. Pretty (out-only)
|
|||
|
|
|||
|
In general, output supports only minimal subsets of each format
|
|||
|
needed for sequence data exchanges. Features, descriptions
|
|||
|
and other format-unique information are discarded.
|
|||
|
|
|||
|
Users of Olsen multi sequence editor (VMS). The Olsen format
|
|||
|
here is produced with the print command:
|
|||
|
print/out=some.file
|
|||
|
Use Genbank output from readseq to produce a format that this
|
|||
|
editor can read, and use the command
|
|||
|
load/genbank some.file
|
|||
|
Dan Davison has a VMS program that will convert to/from the
|
|||
|
Olsen native binary data format. E-mail davison@uh.edu
|
|||
|
|
|||
|
Warning: Phylip format input is now supported (30Dec92), however the
|
|||
|
auto-detection of Phylip format is very probabilistic and messy,
|
|||
|
especially distinguishing sequential from interleaved versions. It
|
|||
|
is not recommended that one use readseq to convert files from Phylip
|
|||
|
format to others unless essential.
|
|||
|
|
|||
|
|
|||
|
|
|||
|
||||||||||| ReadSeq usage (revised 11Nov91)
|
|||
|
--------------------------------------------------------
|
|||
|
|
|||
|
A. determine file format:
|
|||
|
|
|||
|
short skiplines; /* result: number of header lines to skip (or 0) */
|
|||
|
short error; /* error result or 0 */
|
|||
|
short format; /* resulting format code, see ureadseq.h */
|
|||
|
char *filename = "Mysequence.file";
|
|||
|
|
|||
|
format = seqFileFormat( filename, &skiplines, &error);
|
|||
|
if (error!=0) fail;
|
|||
|
|
|||
|
B. read number and list of sequences (optional)
|
|||
|
short numseqs; /* resulting number of sequences found in file */
|
|||
|
char *seqlist; /* list of sequence names, newline separated, 0 terminated */
|
|||
|
|
|||
|
seqlist = listSeqs( filename, skiplines, format, &numseqs, &error);
|
|||
|
if (error==0) display (seqlist);
|
|||
|
free( seqlist);
|
|||
|
|
|||
|
C. read individual sequences as desired
|
|||
|
short seqIndex; /* sequence index #, or == kListSeqs for listSeqs equivalent */
|
|||
|
long seqlen; /* length of seq */
|
|||
|
char seqid[256]; /* sequence name */
|
|||
|
char *seq; /* sequence, 0 terminated, free when done */
|
|||
|
|
|||
|
seq = readSeq( seqIndex, filename, skiplines, format,
|
|||
|
&seqlen, &numseqs, &error, seqid);
|
|||
|
if (error==0) manipulate(seq);
|
|||
|
free(seq);
|
|||
|
|
|||
|
D. write sequences as desired
|
|||
|
int nlines; /* number of lines of sequence written */
|
|||
|
FILE* fout; /* open file pointer (stdout or other) */
|
|||
|
short outform; /* output format, see ureadseq.h */
|
|||
|
|
|||
|
nlines = writeSeq( fout, seq, seqlen, format, outform, seqid);
|
|||
|
|
|||
|
|
|||
|
Note (30Dec92): There is various processing done by the main program (in readseq.c),
|
|||
|
rather than just in the subroutines (in ureadseq.c). Especially for interleaved
|
|||
|
output formats, the writeSeq subroutine does not handle interleaving, nor some of
|
|||
|
the formatting at the top and end of output files. While seqFileFormat, listSeqs,
|
|||
|
and readSeq subroutines are fairly self-contained, the writeSeq depends a lot on
|
|||
|
auxiliary processing. At some point, this may be revised so writeSeq is self-
|
|||
|
contained.
|
|||
|
|
|||
|
Note 2: The NCBI toolkit (ftp from ncbi.nlm.nih.gov) is needed for the ASN.1 format
|
|||
|
reading (see ureadasn.c). A bastard (but workable I hope) ASN.1 format is written
|
|||
|
by writeSeq alone.
|
|||
|
|
|||
|
|
|||
|
|
|||
|
||||||||||| sequence formats....
|
|||
|
---------------------------------------------------
|
|||
|
|
|||
|
stanford/IG
|
|||
|
;comments
|
|||
|
;...
|
|||
|
seq1 info
|
|||
|
abcd...
|
|||
|
efgh1 (or 2 = terminator)
|
|||
|
;another seq
|
|||
|
;....
|
|||
|
seq2 info
|
|||
|
abcd...1
|
|||
|
--- for e.g. ----
|
|||
|
; Dro5s-T.Seq Length: 120 April 6, 1989 21:22 Check: 9487 ..
|
|||
|
dro5stseq
|
|||
|
GCCAACGACCAUACCACGCUGAAUACAUCGGUUCUCGUCCGAUCACCGAAAUUAAGCAGCGUCGCGGGCG
|
|||
|
GUUAGUACUUAGAUGGGGGACCGCUUGGGAACACCGCGUGUUGUUGGCCU1
|
|||
|
|
|||
|
; TOIG of: Dro5srna.Seq check: 9487 from: 1 to: 120
|
|||
|
---------------------------------------------------
|
|||
|
|
|||
|
Genbank:
|
|||
|
LOCUS seq1 ID..
|
|||
|
...
|
|||
|
ORIGIN ...
|
|||
|
123456789abcdefg....(1st 9 columns are formatting)
|
|||
|
hijkl...
|
|||
|
// (end of sequence)
|
|||
|
LOCUS seq2 ID ..
|
|||
|
...
|
|||
|
ORIGIN
|
|||
|
abcd...
|
|||
|
//
|
|||
|
---------------------------------------------------
|
|||
|
|
|||
|
NBRF format: (from uwgcg ToNBRF)
|
|||
|
>DL;DRO5SRNA
|
|||
|
Iubio$Dua0:[Gilbertd.Gcg]Dro5srna.Seq;2 => DRO5SRNA
|
|||
|
|
|||
|
51 AAUUAAGCAG CGUCGCGGGC GGUUAGUACU UAGAUGGGGG ACCGCUUGGG
|
|||
|
101 AACACCGCGU GUUGUUGGCC U
|
|||
|
|
|||
|
---------------------------------------------------
|
|||
|
|
|||
|
EMBL format
|
|||
|
ID345 seq1 id (the 345 are spaces)
|
|||
|
... other info
|
|||
|
SQ345Sequence (the 3,4,5 are spaces)
|
|||
|
abcd...
|
|||
|
hijk...
|
|||
|
// (! this is proper end string: 12Oct90)
|
|||
|
ID seq2 id
|
|||
|
...
|
|||
|
SQ Sequence
|
|||
|
abcd...
|
|||
|
...
|
|||
|
//
|
|||
|
---------------------------------------------------
|
|||
|
|
|||
|
UW GCG Format:
|
|||
|
comments of any form, up to ".." signal
|
|||
|
signal line has seq id, and " Check: #### .."
|
|||
|
only 1 seq/file
|
|||
|
|
|||
|
-- e.g. --- (GCG from GenBank)
|
|||
|
LOCUS DROEST6 1819 bp ss-mRNA INV 31-AUG-1987
|
|||
|
... much more ...
|
|||
|
ORIGIN 1 bp upstream of EcoRI site; chromosome BK9 region 69A1.
|
|||
|
|
|||
|
INVERTEBRATE:DROEST6 Length: 1819 January 9, 1989 16:48 Check: 8008 ..
|
|||
|
|
|||
|
1 GAATTCGCCG GAGTGAGGAG CAACATGAAC TACGTGGGAC TGGGACTTAT
|
|||
|
|
|||
|
51 CATTGTGCTG AGCTGCCTTT GGCTCGGTTC GAACGCGAGT GATACAGATG
|
|||
|
|
|||
|
|
|||
|
---------------------------------------------------
|
|||
|
|
|||
|
DNAStrider (Mac) = modified Stanford:
|
|||
|
; ### from DNA Strider Friday, April 7, 1989 11:04:24 PM
|
|||
|
; DNA sequence pBR322 4363 b.p. complete sequence
|
|||
|
;
|
|||
|
abcd...
|
|||
|
efgh
|
|||
|
// (end of sequence)
|
|||
|
---------------------------------------------------
|
|||
|
|
|||
|
Fitch format:
|
|||
|
Dro5srna.Seq
|
|||
|
GCC AAC GAC CAU ACC ACG CUG AAU ACA UCG GUU CUC GUC CGA UCA CCG AAA UUA AGC AGC
|
|||
|
GUC GCG GGC GGU UAG UAC UUA GAU GGG GGA CCG CUU GGG AAC ACC GCG UGU UGU UGG CCU
|
|||
|
Droest6.Seq
|
|||
|
GAA TTC GCC GGA GTG AGG AGC AAC ATG AAC TAC GTG GGA CTG GGA CTT ATC ATT GTG CTG
|
|||
|
AGC TGC CTT TGG CTC GGT TCG AAC GCG AGT GAT ACA GAT GAC CCT CTG TTG GTG CAG CTG
|
|||
|
---------------------------------------------------
|
|||
|
|
|||
|
W.Pearson/Fasta format:
|
|||
|
>BOVPRL GenBank entry BOVPRL from omam file. 907 nucleotides.
|
|||
|
TGCTTGGCTGAGGAGCCATAGGACGAGAGCTTCCTGGTGAAGTGTGTTTCTTGAAATCAT
|
|||
|
|
|||
|
---------------------------------------------------
|
|||
|
Phylip version 3.2 format (e.g., DNAML):
|
|||
|
|
|||
|
5 13 YF (# seqs, #bases, YF)
|
|||
|
Alpha AACGTGGCCAAAT
|
|||
|
aaaagggccc... (continued sp. alpha)
|
|||
|
Beta AAGGTCGCCAAAC
|
|||
|
aaaagggccc... (continued sp. beta)
|
|||
|
Gamma CATTTCGTCACAA
|
|||
|
aaaagggccc... (continued sp. Gamma)
|
|||
|
1234567890^-- bases must start in col 11, and run 'til #bases
|
|||
|
(spaces & newlines are okay)
|
|||
|
---------------------------------------------------
|
|||
|
Phylip version 3.3 format (e.g., DNAML):
|
|||
|
|
|||
|
5 42 YF (# seqs, #bases, YF)
|
|||
|
Turkey AAGCTNGGGC ATTTCAGGGT
|
|||
|
Salmo gairAAGCCTTGGC AGTGCAGGGT
|
|||
|
H. SapiensACCGGTTGGC CGTTCAGGGT
|
|||
|
Chimp AAACCCTTGC CGTTACGCTT
|
|||
|
Gorilla AAACCCTTGC CGGTACGCTT
|
|||
|
1234567890^-- bases must start in col 11
|
|||
|
!! this version interleaves the species -- contrary to
|
|||
|
all other output formats.
|
|||
|
|
|||
|
GAGCCCGGGC AATACAGGGT AT
|
|||
|
GAGCCGTGGC CGGGCACGGT AT
|
|||
|
ACAGGTTGGC CGTTCAGGGT AA
|
|||
|
AAACCGAGGC CGGGACACTC AT
|
|||
|
AAACCATTGC CGGTACGCTT AA
|
|||
|
|
|||
|
---------------------------------------------------
|
|||
|
Phylip version 3.4 format (e.g., DNAML)
|
|||
|
-- Both Interleaved and sequential are permitted
|
|||
|
|
|||
|
5 13 (# seqs, #bases)
|
|||
|
Alpha AACGTGGCCAAAT
|
|||
|
aaaagggccc... (continued sp. alpha)
|
|||
|
Beta AAGGTCGCCAAAC
|
|||
|
aaaagggccc... (continued sp. beta)
|
|||
|
Gamma CATTTCGTCACAA
|
|||
|
aaaagggccc... (continued sp. Gamma)
|
|||
|
1234567890^-- bases must start in col 11, and run 'til #bases
|
|||
|
(spaces, newlines and numbers are ignored)
|
|||
|
|
|||
|
---------------------------------------------------
|
|||
|
Gary Olsen (multiple) sequence editor /print format:
|
|||
|
|
|||
|
!---------------------
|
|||
|
!17Oct91 -- error in original copy of olsen /print format, shifted right 1 space
|
|||
|
! here is correct copy:
|
|||
|
301 40 Tb.thiop CGCAGCGAAA----------GCUNUGCUAAUACCGCAUA-CGnCCUG----------------------------------------------------- Tb.thiop
|
|||
|
123456789012345678901
|
|||
|
301 42 Rhc.purp CGUAGCGAAA----------GUUACGCUAAUACCGCAUA-UUCUGUG----------------------------------------------------- Rhc.purp
|
|||
|
|
|||
|
301 44 Rhc.gela nnngnCGAAA----------GCCGGAUUAAUACCGCAUA-CGACCUA----------------------------------------------------- Rhc.gela
|
|||
|
!---------------------
|
|||
|
|
|||
|
RNase P RNA components. on 20-FEB-90 17:23:58
|
|||
|
|
|||
|
1 (E.c. pr ): Base pairing in Escherichia coli RNase P RNA.
|
|||
|
2 (chrom ): Chromatium
|
|||
|
:
|
|||
|
12 (B.brevis): Bacillus brevis RNase P RNA, B. James.
|
|||
|
13 ( 90% con): 90% conserved
|
|||
|
14 (100% con): 100% conserved
|
|||
|
15 (gram+ pr): pairing
|
|||
|
|
|||
|
1
|
|||
|
RNase P RNA components. on 20-FEB-90 17:23:58
|
|||
|
|
|||
|
Posi- Sequence
|
|||
|
tion: identity: Data:
|
|||
|
|
|||
|
1 1 E.c. pr <<<<<<<<<< {{{{{{{{<<:<<<<<<<<<<^<<<<<<====>>>> E.c. pr
|
|||
|
1 2 chrom GGAGUCGGCCAGACAGUCGCUUCCGUCCU------------------ chrom
|
|||
|
:
|
|||
|
1 12 B.brevis AUGCAGGAAAUGCGGGUAGCCGCUGCCGCAAUCGUCU------------- B.brevis
|
|||
|
1234567890123456789012 <! this should be 21 not 22,
|
|||
|
! this example must be inset on left by 1 space from olsen /print files !
|
|||
|
1 13 90% con G C G A CGC GC - - 90% con
|
|||
|
1 14 100% con G A CGC 100% con
|
|||
|
1 15 gram+ pr <<<<<<<<<< {{{{{{{{<<<<<<<<<<<<<=============== gram+ pr
|
|||
|
|
|||
|
60 1 E.c. pr >>>>>>^>>^>>>>:>> <<<^<<<< {{{{{ E.c. pr
|
|||
|
60 2 chrom -----GGUG-ACGGGGGAGGAAAGUCCGG-GCUCCAU------------- chrom
|
|||
|
: :
|
|||
|
60 10 B.stearo ----UU-CG-GCCGUAGAGGAAAGUCCAUGCUCGCACGGUGCUGAGAUGC B.stearo
|
|||
|
|
|||
|
|
|||
|
---------------------------------------------------
|
|||
|
GCG MSF format
|
|||
|
Title line
|
|||
|
|
|||
|
picorna.msf MSF: 100 Type: P January 17, 1991 17:53 Check: 541
|
|||
|
..
|
|||
|
Name: Cb3 Len: 100 Check: 7009 Weight: 1.00
|
|||
|
Name: E Len: 100 Check: 60 Weight: 1.00
|
|||
|
|
|||
|
//
|
|||
|
|
|||
|
1 50
|
|||
|
Cb3 ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet
|
|||
|
E gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs
|
|||
|
|
|||
|
51 100
|
|||
|
|
|||
|
Cb3 ghtsqvvpgd tmqtrhvkny hsrsestien flcrsacvyf teykn.....
|
|||
|
E ...spi.gaf tvks...... gs.lesgfap .fsngtc.pn sviltpgpqf
|
|||
|
|
|||
|
---------------------------------------------------
|
|||
|
PIR format
|
|||
|
This is NBRF-PIR MAILSERVER version 1.45
|
|||
|
Command-> get PIR3:A31391
|
|||
|
\\\
|
|||
|
ENTRY A31391 #Type Protein
|
|||
|
TITLE *Esterase-6 - Fruit fly (Drosophila melanogaster)
|
|||
|
|
|||
|
DATE 03-Aug-1992 #Sequence 03-Aug-1992 #Text 03-Aug-1992
|
|||
|
PLACEMENT 0.0 0.0 0.0 0.0 0.0
|
|||
|
COMMENT *This entry is not verified.
|
|||
|
SOURCE Drosophila melanogaster
|
|||
|
|
|||
|
REFERENCE
|
|||
|
#Authors Cooke P.H., Oakeshott J.G.
|
|||
|
#Citation submitted to GenBank, April 1989
|
|||
|
#Reference-number A31391
|
|||
|
#Accession A31391
|
|||
|
#Cross-reference GB:J04167
|
|||
|
|
|||
|
SUMMARY #Molecular-weight 61125 #Length 544 #Checksum 1679
|
|||
|
SEQUENCE
|
|||
|
5 10 15 20 25 30
|
|||
|
1 M N Y V G L G L I I V L S C L W L G S N A S D T D D P L L V
|
|||
|
31 Q L P Q G K L R G R D N G S Y Y S Y E S I P Y A E P P T G D
|
|||
|
61 L R F E A P E P Y K Q K W S D I F D A T K T P V A C L Q W D
|
|||
|
91 Q F T P G A N K L V G E E D C L T V S V Y K P K N S K R N S
|
|||
|
121 F P V V A H I H G G A F M F G A A W Q N G H E N V M R E G K
|
|||
|
151 F I L V K I S Y R L G P L G F V S T G D R D L P G N Y G L K
|
|||
|
181 D Q R L A L K W I K Q N I A S F G G E P Q N V L L V G H S A
|
|||
|
211 G G A S V H L Q M L R E D F G Q L A R A A F S F S G N A L D
|
|||
|
241 P W V I Q K G A R G R A F E L G R N V G C E S A E D S T S L
|
|||
|
271 K K C L K S K P A S E L V T A V R K F L I F S Y V P F A P F
|
|||
|
301 S P V L E P S D A P D A I I T Q D P R D V I K S G K F G Q V
|
|||
|
331 P W A V S Y V T E D G G Y N A A L L L K E R K S G I V I D D
|
|||
|
361 L N E R W L E L A P Y L L F Y R D T K T K K D M D D Y S R K
|
|||
|
391 I K Q E Y I G N Q R F D I E S Y S E L Q R L F T D I L F K N
|
|||
|
421 S T Q E S L D L H R K Y G K S P A Y A Y V Y D N P A E K G I
|
|||
|
451 A Q V L A N R T D Y D F G T V H G D D Y F L I F E N F V R D
|
|||
|
481 V E M R P D E Q I I S R N F I N M L A D F A S S D N G S L K
|
|||
|
511 Y G E C D F K D N V G S E K F Q L L A I Y I D G C Q N R Q H
|
|||
|
541 V E F P
|
|||
|
///
|
|||
|
\\\
|
|||
|
---------------------------------------------------
|
|||
|
PAUP format:
|
|||
|
The NEXUS Format
|
|||
|
|
|||
|
Every block starts with "BEGIN blockname;" and ends with "END;".
|
|||
|
Each block is composed of one or more statements, each
|
|||
|
terminated by a semicolon (;).
|
|||
|
|
|||
|
Comments may be included in NEXUS files by enclosing them within
|
|||
|
square brackets, as in "[This is a comment]."
|
|||
|
|
|||
|
NEXUS-conforming files are identified by a "#NEXUS" directive at
|
|||
|
the very beginning of the file (line 1, column 1). If the
|
|||
|
#NEXUS is omitted PAUP issues a warning but continues
|
|||
|
processing.
|
|||
|
|
|||
|
NEXUS files are entirely free-format. Blanks, tabs, and
|
|||
|
newlines may be placed anywhere in the file. Unless RESPECTCASE
|
|||
|
is requested, commands and data may be entered in upper case,
|
|||
|
lower case, or a mixture of upper and lower case.
|
|||
|
|
|||
|
The following conventions are used in the syntax descriptions of
|
|||
|
the various blocks. Upper-case items are entered exactly as
|
|||
|
shown. Lower-case items inside of angle brackets -- e.g., <x>
|
|||
|
-- represent items to be substituted by the user. Items inside
|
|||
|
of square brackets -- e.g., [X] -- are optional. Items inside
|
|||
|
of curly braces and separated by vertical bars -- e.g., { X | Y
|
|||
|
| Z } -- are mutually exclusive options.
|
|||
|
|
|||
|
|
|||
|
The DATA Block
|
|||
|
|
|||
|
The DATA block contains the data matrix and other associated
|
|||
|
information. Its syntax is:
|
|||
|
|
|||
|
BEGIN DATA;
|
|||
|
DIMENSIONS NTAX=<number of taxa> NCHAR=<number of characters>;
|
|||
|
[ FORMAT [ MISSING=<missing-symbol> ]
|
|||
|
[ LABELPOS={ LEFT | RIGHT } ]
|
|||
|
[ SYMBOLS="<symbols-list>" ]
|
|||
|
[ INTERLEAVE ]
|
|||
|
[ MATCHCHAR=<match-symbol> ]
|
|||
|
[ EQUATE="<symbol>=<expansion> [<symbol>=<expansion>...]" ]
|
|||
|
[ TRANSPOSE ]
|
|||
|
[ RESPECTCASE ]
|
|||
|
[ DATATYPE = { STANDARD | DNA | RNA | PROTEIN } ]; ]
|
|||
|
[ OPTIONS [ IGNORE={ INVAR | UNINFORM } ]
|
|||
|
[ MSTAXA = { UNCERTAIN | POLYMORPH | VARIABLE } ]
|
|||
|
[ ZAP = "<list of zapped characters>" ] ; ]
|
|||
|
[ CHARLABELS <label_1> <label_2> ... <label_NCHAR> ; ]
|
|||
|
[ TAXLABELS <label1_1> <label1_2> <label1_NTAX> ; ]
|
|||
|
[ STATELABELS <currently ignored by PAUP> ; ]
|
|||
|
MATRIX <data-matrix> ;
|
|||
|
END;
|
|||
|
|
|||
|
--- example PAUP file
|
|||
|
|
|||
|
#NEXUS
|
|||
|
|
|||
|
[!Brown et al. (1982) primate mitochondrial DNA]
|
|||
|
|
|||
|
begin data;
|
|||
|
dimensions ntax=5 nchar=896;
|
|||
|
format datatype=dna matchchar=. interleave missing='-';
|
|||
|
matrix
|
|||
|
[ 2 4 6 8 ]
|
|||
|
[ 1 1 1 1 1 ]
|
|||
|
human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcctcattactatt ctgcctagcaaactcaaact acgaacgcactcacagtcgc
|
|||
|
chimp ................a.t. .c.................a ...............t.... ..................t. .t........c.........
|
|||
|
gorilla ..................tg ....t.....t........a ........a......t.... .................... .......a..c.....c...
|
|||
|
orang ................ac.. cc.....g..t.....t..a ..c........cc....g.. .................... .......a..c.....c...
|
|||
|
gibbon ......t..a..t...ac.g .c.................a ..a..c..t..cc.g..... ......t............. .......a........c...
|
|||
|
|
|||
|
[ 8 8 8 8 8 8 ]
|
|||
|
[ 0 2 4 6 8 9 ]
|
|||
|
[ 1 1 1 1 1 6 ]
|
|||
|
human cttccccacaacaatattca tgtgcctagaccaagaagtt attatctcgaactgacactg agccacaacccaaacaaccc agctctccctaagctt
|
|||
|
chimp t................... .a................c. ........a.....g..... ...a................ ................
|
|||
|
gorilla ..................tc .a................c. ........a.g......... ...a.............tt. .a..............
|
|||
|
orang ta....a...........t. .c.......ga......acc ..cg..a.a......tg... .a.a..c.....g...cta. .a.....a........
|
|||
|
gibbon a..t.......t........ ....ac...........acc .....t..a........... .a.tg..........gctag .a..............
|
|||
|
;
|
|||
|
end;
|
|||
|
---------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
||||||||||| Sample SMTP mail header
|
|||
|
---------------------------------------------------
|
|||
|
|
|||
|
- - - - - - - - -
|
|||
|
From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:56 1991
|
|||
|
Received: from genbank.bio.net by sunflower.bio.indiana.edu
|
|||
|
(4.1/9.5jsm) id AA19328; Sun, 10 Nov 91 17:28:55 EST
|
|||
|
Received: by genbank.bio.net (5.65/IG-2.0)
|
|||
|
id AA14458; Sun, 10 Nov 91 14:30:03 -0800
|
|||
|
Date: Sun, 10 Nov 91 14:30:03 -0800
|
|||
|
Message-Id: <9111102230.AA14458@genbank.bio.net>
|
|||
|
From: Database Server <GenBank-Retrieval-System@genbank.bio.net>
|
|||
|
To: gilbertd@sunflower.bio.indiana.edu
|
|||
|
Subject: Results of Query for drorna
|
|||
|
Status: R
|
|||
|
|
|||
|
No matches on drorna.
|
|||
|
- - - - - -
|
|||
|
From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:49 1991
|
|||
|
Received: from genbank.bio.net by sunflower.bio.indiana.edu
|
|||
|
(4.1/9.5jsm) id AA19323; Sun, 10 Nov 91 17:28:47 EST
|
|||
|
Received: by genbank.bio.net (5.65/IG-2.0)
|
|||
|
id AA14461; Sun, 10 Nov 91 14:30:03 -0800
|
|||
|
Date: Sun, 10 Nov 91 14:30:03 -0800
|
|||
|
Message-Id: <9111102230.AA14461@genbank.bio.net>
|
|||
|
From: Database Server <GenBank-Retrieval-System@genbank.bio.net>
|
|||
|
To: gilbertd@sunflower.bio.indiana.edu
|
|||
|
Subject: Results of Query for droest6
|
|||
|
Status: R
|
|||
|
|
|||
|
LOCUS DROEST6 1819 bp ss-mRNA INV 31-AUG-1987
|
|||
|
DEFINITION D.melanogaster esterase-6 mRNA, complete cds.
|
|||
|
ACCESSION M15961
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
||||||||||| GCG manual discussion of sequence symbols:
|
|||
|
---------------------------------------------------
|
|||
|
|
|||
|
III_SEQUENCE_SYMBOLS
|
|||
|
|
|||
|
|
|||
|
GCG programs allow all upper and lower case letters, periods (.),
|
|||
|
asterisks (*), pluses (+), ampersands (&), and ats (@) as symbols in
|
|||
|
biological sequences. Nucleotide symbols, their complements, and the
|
|||
|
standard one-letter amino acid symbols are shown below in separate lists.
|
|||
|
The meanings of the symbols +, &, and @ have not been assigned at this
|
|||
|
writing (March, 1989).
|
|||
|
|
|||
|
GCG uses the letter codes for amino acid codes and nucleotide
|
|||
|
ambiguity proposed by IUB (Nomenclature Committee, 1985,
|
|||
|
Eur. J. Biochem. 150; 1-5). These codes are compatible with the codes
|
|||
|
used by the EMBL, GenBank, and NBRF data libraries.
|
|||
|
|
|||
|
|
|||
|
NUCLEOTIDES
|
|||
|
|
|||
|
The meaning of each symbol, its complement, and the Cambridge and
|
|||
|
Stanford equivalents are shown below. Cambridge files can be converted
|
|||
|
into GCG files and vice versa with the programs FROMSTADEN and TOSTADEN.
|
|||
|
IntelliGenetics sequence files can be interconverted with the programs
|
|||
|
FROMIG and TOIG.
|
|||
|
|
|||
|
IUB/GCG Meaning Complement Staden/Sanger Stanford
|
|||
|
|
|||
|
A A T A A
|
|||
|
C C G C C
|
|||
|
G G C G G
|
|||
|
T/U T A T T/U
|
|||
|
M A or C K 5 J
|
|||
|
R A or G Y R R
|
|||
|
W A or T W 7 L
|
|||
|
S C or G S 8 M
|
|||
|
Y C or T R Y Y
|
|||
|
K G or T M 6 K
|
|||
|
V A or C or G B not supported N
|
|||
|
H A or C or T D not supported N
|
|||
|
D A or G or T H not supported N
|
|||
|
B C or G or T V not supported N
|
|||
|
X/N G or A or T or C X -/X N
|
|||
|
. not G or A or T or C . not supported ?
|
|||
|
|
|||
|
|
|||
|
The frame ambiguity codes used by Staden are not supported by GCG
|
|||
|
and are translated by FROMSTADEN as the lower case single base
|
|||
|
equivalent.
|
|||
|
|
|||
|
Staden Code Meaning GCG
|
|||
|
|
|||
|
D C or CC c
|
|||
|
V T or TT t
|
|||
|
B A or AA a
|
|||
|
H G or GG g
|
|||
|
K C or CX c
|
|||
|
L T or TX t
|
|||
|
M A or AX a
|
|||
|
N G or GX g
|
|||
|
|
|||
|
|
|||
|
AMINO ACIDS
|
|||
|
|
|||
|
Here is a list of the standard one-letter amino acid codes and their
|
|||
|
three-letter equivalents. The synonymous codons and their depiction in
|
|||
|
the IUB codes are shown. You should recognize that the codons following
|
|||
|
semicolons (;) are not sufficiently specific to define a single amino
|
|||
|
acid even though they represent the best possible back translation into
|
|||
|
the IUB codes! All of the relationships in this list can be redefined by
|
|||
|
the user in a local data file described below.
|
|||
|
|
|||
|
IUB
|
|||
|
Symbol 3-letter Meaning Codons Depiction
|
|||
|
A Ala Alanine GCT,GCC,GCA,GCG !GCX
|
|||
|
B Asp,Asn Aspartic,
|
|||
|
Asparagine GAT,GAC,AAT,AAC !RAY
|
|||
|
C Cys Cysteine TGT,TGC !TGY
|
|||
|
D Asp Aspartic GAT,GAC !GAY
|
|||
|
E Glu Glutamic GAA,GAG !GAR
|
|||
|
F Phe Phenylalanine TTT,TTC !TTY
|
|||
|
G Gly Glycine GGT,GGC,GGA,GGG !GGX
|
|||
|
H His Histidine CAT,CAC !CAY
|
|||
|
I Ile Isoleucine ATT,ATC,ATA !ATH
|
|||
|
K Lys Lysine AAA,AAG !AAR
|
|||
|
L Leu Leucine TTG,TTA,CTT,CTC,CTA,CTG
|
|||
|
!TTR,CTX,YTR;YTX
|
|||
|
M Met Methionine ATG !ATG
|
|||
|
N Asn Asparagine AAT,AAC !AAY
|
|||
|
P Pro Proline CCT,CCC,CCA,CCG !CCX
|
|||
|
Q Gln Glutamine CAA,CAG !CAR
|
|||
|
R Arg Arginine CGT,CGC,CGA,CGG,AGA,AGG
|
|||
|
!CGX,AGR,MGR;MGX
|
|||
|
S Ser Serine TCT,TCC,TCA,TCG,AGT,AGC !TCX,AGY;WSX
|
|||
|
T Thr Threonine ACT,ACC,ACA,ACG !ACX
|
|||
|
V Val Valine GTT,GTC,GTA,GTG !GTX
|
|||
|
W Trp Tryptophan TGG !TGG
|
|||
|
X Xxx Unknown !XXX
|
|||
|
Y Tyr Tyrosine TAT, TAC !TAY
|
|||
|
Z Glu,Gln Glutamic,
|
|||
|
Glutamine GAA,GAG,CAA,CAG !SAR
|
|||
|
* End Terminator TAA, TAG, TGA !TAR,TRA;TRR
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
||||||||||| docs from PSC on sequence formats:
|
|||
|
---------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
Nucleic Acid and Protein Sequence File Formats
|
|||
|
|
|||
|
|
|||
|
It will probably save you some time if you have your data in a usable
|
|||
|
format before you send it to us. However, we do have the University of
|
|||
|
Wisconsin Genetics Computing Group programs running on our VAXen and
|
|||
|
this package includes several reformatting utilities. Our programs
|
|||
|
usually recognize any of several standard formats, including GenBank,
|
|||
|
EMBL, NBRF, and MolGen/Stanford. For the purposes of annotating an
|
|||
|
analysis we find the GenBank and EMBL formats most useful, particularly
|
|||
|
if you have already received an accession number from one of these
|
|||
|
organizations for your sequence.
|
|||
|
|
|||
|
Our programs do not require that all of the line types available in
|
|||
|
GenBank, EMBL, or NBRF file formats be present for the file format to
|
|||
|
be recognized and processed. The following pages outline the essential
|
|||
|
details required for correct processing of files by our programs.
|
|||
|
Additional information may be present but will generally be ignored.
|
|||
|
|
|||
|
|
|||
|
GenBank File Format
|
|||
|
|
|||
|
File Header
|
|||
|
|
|||
|
1. The first line in the file must have "GENETIC SEQUENCE DATA BANK"
|
|||
|
in spaces 20 through 46 (see LINE 1, below).
|
|||
|
2. The next 8 lines may contain arbitrary text. They are ignored but
|
|||
|
are required to maintain the GenBank format (see LINE 2 - LINE 9).
|
|||
|
|
|||
|
Sequence Data Entries
|
|||
|
|
|||
|
3. Each sequence entry in the file should have the following format.
|
|||
|
a) first line: Must have LOCUS in the first 5 spaces. The
|
|||
|
genetic locus name or identifier must be in spaces
|
|||
|
13 - 22. The length of the sequences is right
|
|||
|
justified in spaces 23 through 29 (see LINE 10).
|
|||
|
b) second line: Must have DEFINITION in the first 10 spaces.
|
|||
|
Spaces 13 - 80 are free form text to identify the
|
|||
|
sequence (see LINE 11).
|
|||
|
c) third line: Must have ACCESSION in the first 9 spaces. Spaces
|
|||
|
13 - 18 must hold the primary accession number
|
|||
|
(see LINE 12).
|
|||
|
d) fourth line: Must have ORIGIN in the first 6 spaces. Nothing
|
|||
|
else is required on this line, it indicates that
|
|||
|
the nucleic acid sequence begins on the next line
|
|||
|
(see LINE 13).
|
|||
|
e) fifth line: Begins the nucleotide sequence. The first 9
|
|||
|
spaces of each sequence line may either be blank
|
|||
|
or may contain the position in the sequence of the
|
|||
|
first nucleotide on the line. The next 66 spaces
|
|||
|
hold the nucleotide sequence in six blocks of ten
|
|||
|
nucleotides. Each of the six blocks begins with a
|
|||
|
blank space followed by ten nucleotides. Thus the
|
|||
|
first nucleotide is in space eleven of the line while
|
|||
|
the last is in space 75 (see LINE 14, LINE 15).
|
|||
|
f) last line: Must have // in the first 2 spaces to indicate
|
|||
|
termination of the sequence (see LINE 16).
|
|||
|
|
|||
|
NOTE: Multiple sequences may appear in each file. To begin another
|
|||
|
sequence go back to a) and start again.
|
|||
|
|
|||
|
|
|||
|
Example GenBank file
|
|||
|
|
|||
|
|
|||
|
LINE 1 : GENETIC SEQUENCE DATA BANK
|
|||
|
LINE 2 :
|
|||
|
LINE 3 :
|
|||
|
LINE 4 :
|
|||
|
LINE 5 :
|
|||
|
LINE 6 :
|
|||
|
LINE 7 :
|
|||
|
LINE 8 :
|
|||
|
LINE 9 :
|
|||
|
LINE 10 :LOCUS L_Name Length BP
|
|||
|
LINE 11 :DEFINITION Describe the sequence any way you want
|
|||
|
LINE 12 :ACCESSION Accession Number
|
|||
|
LINE 13 :ORIGIN
|
|||
|
LINE 14 : 1 acgtacgtac gtacgtacgt acgtacgtac gtacgtacgt a...
|
|||
|
LINE 15 : 61 acgt...
|
|||
|
LINE 16 ://
|
|||
|
|
|||
|
|
|||
|
|
|||
|
EMBL File Format
|
|||
|
|
|||
|
Unlike the GenBank file format the EMBL file format does not require
|
|||
|
a series of header lines. Thus the first line in the file begins
|
|||
|
the first sequence entry of the file.
|
|||
|
|
|||
|
1. The first line of each sequence entry contains the two letters ID
|
|||
|
in the first two spaces. This is followed by the EMBL identifier
|
|||
|
in spaces 6 through 14. (See LINE 1).
|
|||
|
|
|||
|
2. The second line of each sequence entry has the two letters AC in
|
|||
|
the first two spaces. This is followed by the accession number in
|
|||
|
spaces 6 through 11. (See LINE 2).
|
|||
|
|
|||
|
3. The third line of each sequence entry has the two letters DE in the
|
|||
|
first two spaces. This is followed by a free form text definition
|
|||
|
in spaces 6 through 72. (See LINE 3).
|
|||
|
|
|||
|
4. The fourth line in each sequence entry has the two letters SQ in
|
|||
|
the first two spaces. This is followed by the length of the
|
|||
|
sequence beginning at or after space 13. After the sequence length
|
|||
|
there is a blank space and the two letters BP. (See LINE 4).
|
|||
|
|
|||
|
5. The nucleotide sequence begins on the fifth line of the sequence
|
|||
|
entry. Each line of sequence begins with four blank spaces. The
|
|||
|
next 66 spaces hold the nucleotide sequence in six blocks of ten
|
|||
|
nucleotides. Each of the six blocks begins with a blank space
|
|||
|
followed by ten nucleotides. Thus the first nucleotide is in space
|
|||
|
6 of the line while the last is in space 70. (See LINE 5 -
|
|||
|
LINE 6).
|
|||
|
|
|||
|
6. The last line of each sequence entry in the file is a terminator
|
|||
|
line which has the two characters // in the first two spaces.
|
|||
|
(See LINE 7).
|
|||
|
|
|||
|
7. Multiple sequences may appear in each file. To begin another
|
|||
|
sequence go back to item 1 and start again.
|
|||
|
|
|||
|
|
|||
|
Example EMBL file
|
|||
|
|
|||
|
LINE 1 :ID ID_name
|
|||
|
LINE 2 :AC Accession number
|
|||
|
LINE 3 :DE Describe the sequence any way you want
|
|||
|
LINE 4 :SQ Length BP
|
|||
|
LINE 5 : ACGTACGTAC GTACGTACGT ACGTACGTAC GTACGTA...
|
|||
|
LINE 6 : ACGT...
|
|||
|
LINE 7 ://
|
|||
|
|
|||
|
|
|||
|
|
|||
|
NBRF (protein or nucleic acid) File Format
|
|||
|
|
|||
|
1. The first line of each sequence entry begins with a greater than
|
|||
|
symbol, >. This is immediately followed by the two character
|
|||
|
sequence type specifier. Space four must contain a semi-colon.
|
|||
|
Beginning in space five is the sequence name or identification code
|
|||
|
for the NBRF database. The code is from four to six letters and
|
|||
|
numbers. (See LINE 1).
|
|||
|
|
|||
|
!!!! >> add these to readseq
|
|||
|
Specifier Sequence type
|
|||
|
|
|||
|
P1 protein, complete
|
|||
|
F1 protein, fragment
|
|||
|
DL DNA, linear
|
|||
|
DC DNA, circular
|
|||
|
RL RNA, linear
|
|||
|
RC RNA, circular
|
|||
|
N1 functional RNA, other than tRNA
|
|||
|
N3 tRNA
|
|||
|
|
|||
|
2. The second line of each sequence entry contains two kinds of
|
|||
|
information. First is the sequence name which is separated from
|
|||
|
the organism or organelle name by the three character sequence
|
|||
|
blank space, dash, blank space, " - ". There is no special
|
|||
|
character marking the beginning of this line. (See LINE 2).
|
|||
|
|
|||
|
3. Either the amino acid or nucleic acid sequence begins on line three
|
|||
|
and can begin in any space, including the first. The sequence is
|
|||
|
free format and may be interrupted by blanks for ease of reading.
|
|||
|
Protein sequences may contain special punctuation to indicate
|
|||
|
various indeterminacies in the sequence. In the NBRF data files
|
|||
|
all lines may be up to 500 characters long. However some PSC
|
|||
|
programs currently have a limit of 130 characters per line
|
|||
|
(including blanks), and BitNet will not accept lines of over eighty
|
|||
|
characters. (See LINE 3, LINE 4, and LINE 5).
|
|||
|
|
|||
|
The last character in the sequence must be an asterisk, *.
|
|||
|
|
|||
|
Example NBRF file
|
|||
|
|
|||
|
LINE 1 :>P1;CBRT
|
|||
|
LINE 2 :Cytochrome b - Rat mitochondrion (SGC1)
|
|||
|
LINE 3 :M T N I R K S H P L F K I I N H S F I D L P A P S
|
|||
|
LINE 4 : VTHICRDVN Y GWL IRY
|
|||
|
LINE 5 :TWIGGQPVEHPFIIIGQLASISYFSIILILMPISGIVEDKMLKWN*
|
|||
|
|
|||
|
|
|||
|
|
|||
|
MolGen/Stanford File Format
|
|||
|
|
|||
|
1. The first line in a sequence file is a comment line. This line
|
|||
|
begins with a semi-colon in the first space. This line need
|
|||
|
not be present. If it is present it holds descriptive text.
|
|||
|
There may be as many comment lines as desired at the start of the
|
|||
|
sequence file. (See LINE 1).
|
|||
|
|
|||
|
2. The second line must be present and contains an identifier or
|
|||
|
name for the sequence in the first ten spaces. (See LINE 2).
|
|||
|
|
|||
|
3. The sequence begins on the third line and occupies up to eighty
|
|||
|
spaces. Spaces may be included in the sequence for ease of
|
|||
|
    reading.  The sequence continues for as many lines as needed
|
|||
|
and is terminated with a 1 or 2. 1 indicates a linear sequence
|
|||
|
while 2 marks a circular sequence. (See LINE 3 and LINE 4).
|
|||
|
|
|||
|
Example MolGen/Stanford file
|
|||
|
|
|||
|
LINE 1 :; Describe the sequence any way you want
|
|||
|
LINE 2 :ECTRNAGLY2
|
|||
|
LINE 3 :ACGCACGTAC ACGTACGTAC A C G T C C G T ACG TAC GTA CGT
|
|||
|
LINE 4 : GCTTA GG G C T A1
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
||||||||||| Phylip file format
|
|||
|
---------------------------------------------------
|
|||
|
|
|||
|
Phylip 3.3 File Format (DNA sequences)
|
|||
|
|
|||
|
|
|||
|
The input and output formats for PROTPARS and for RESTML are described in
|
|||
|
their document files. In general their input formats are similar to those
|
|||
|
described here, except that the one-letter codes for data are specific to those
|
|||
|
programs and are described in those document files. Since the input formats
|
|||
|
for the eight DNA sequence programs apply to all eight, they are described
|
|||
|
here. Their input formats are standard: the data have A's, G's, C's and T's
|
|||
|
(or U's). The first line of the input file contains the number of species and
|
|||
|
the number of sites. As with the other programs, options information may
|
|||
|
follow this. In the case of DNAML, DNAMLK, and DNADIST an additional line
|
|||
|
(described in the document file for these programs) may follow the first one.
|
|||
|
Following this, each species starts on a new line. The first 10 characters of
|
|||
|
that line are the species name. There then follows the base sequence of that
|
|||
|
species, each character being one of the letters A, B, C, D, G, H, K, M, N, O,
|
|||
|
R, S, T, U, V, W, X, Y, ?, or - (a period was also previously allowed but it is
|
|||
|
no longer allowed, because it sometimes is used in aligned sequences to mean
|
|||
|
"the same as the sequence above"). Blanks will be ignored, and so will
|
|||
|
numerical digits. This allows GENBANK and EMBL sequence entries to be read
|
|||
|
with minimum editing.
|
|||
|
|
|||
|
These characters can be either upper or lower case. The algorithms
|
|||
|
convert all input characters to upper case (which is how they are treated).
|
|||
|
The characters constitute the IUPAC (IUB) nucleic acid code plus some slight
|
|||
|
extensions. They enable input of nucleic acid sequences taking full account of
|
|||
|
any ambiguities in the sequence.
|
|||
|
|
|||
|
The sequences can continue over multiple lines; when this is done the sequences
|
|||
|
must be either in "interleaved" format, similar to the output of alignment
|
|||
|
programs, or "sequential" format. These are described in the main document
|
|||
|
file. In sequential format all of one sequence is given, possibly on multiple
|
|||
|
lines, before the next starts. In interleaved format the first part of the
|
|||
|
file should contain the first part of each of the sequences, then possibly a
|
|||
|
line containing nothing but a carriage-return character, then the second part
|
|||
|
of each sequence, and so on. Only the first parts of the sequences should be
|
|||
|
preceded by names. Here is a hypothetical example of interleaved format:
|
|||
|
|
|||
|
5 42
|
|||
|
Turkey AAGCTNGGGC ATTTCAGGGT
|
|||
|
Salmo gairAAGCCTTGGC AGTGCAGGGT
|
|||
|
H. SapiensACCGGTTGGC CGTTCAGGGT
|
|||
|
Chimp AAACCCTTGC CGTTACGCTT
|
|||
|
Gorilla AAACCCTTGC CGGTACGCTT
|
|||
|
|
|||
|
GAGCCCGGGC AATACAGGGT AT
|
|||
|
GAGCCGTGGC CGGGCACGGT AT
|
|||
|
ACAGGTTGGC CGTTCAGGGT AA
|
|||
|
AAACCGAGGC CGGGACACTC AT
|
|||
|
AAACCATTGC CGGTACGCTT AA
|
|||
|
|
|||
|
while in sequential format the same sequences would be:
|
|||
|
|
|||
|
5 42
|
|||
|
Turkey AAGCTNGGGC ATTTCAGGGT
|
|||
|
GAGCCCGGGC AATACAGGGT AT
|
|||
|
Salmo gairAAGCCTTGGC AGTGCAGGGT
|
|||
|
GAGCCGTGGC CGGGCACGGT AT
|
|||
|
H. SapiensACCGGTTGGC CGTTCAGGGT
|
|||
|
ACAGGTTGGC CGTTCAGGGT AA
|
|||
|
Chimp AAACCCTTGC CGTTACGCTT
|
|||
|
AAACCGAGGC CGGGACACTC AT
|
|||
|
Gorilla AAACCCTTGC CGGTACGCTT
|
|||
|
AAACCATTGC CGGTACGCTT AA
|
|||
|
|
|||
|
|
|||
|
Note, of course, that a portion of a sequence like this:
|
|||
|
|
|||
|
300 AAGCGTGAAC GTTGTACTAA TRCAG
|
|||
|
|
|||
|
is perfectly legal, assuming that the species name has gone before, and is
|
|||
|
filled out to full length by blanks. The above digits and blanks will be
|
|||
|
ignored, the sequence being taken as starting at the first base symbol (in this
|
|||
|
case an A).
|
|||
|
|
|||
|
The present versions of the programs may sometimes have difficulties with
|
|||
|
the blank lines between groups of lines, and if so you might want to retype
|
|||
|
those lines, making sure that they have only a carriage-return and no blank
|
|||
|
characters on them, or you may perhaps have to eliminate them. The symptoms of
|
|||
|
this problem are that the programs complain that the sequences are not properly
|
|||
|
aligned, and you can find no other cause for this complaint.
|
|||
|
|
|||
|
------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
||||||||||| ASN.1 file format
|
|||
|
---------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
ASN.1 -- see NCBI toolkit docs, source and examples (ncbi.nlm.nih.gov)
|
|||
|
|
|||
|
Example asn.1 sequence file----
|
|||
|
|
|||
|
Bioseq-set ::= {
|
|||
|
seq-set {
|
|||
|
seq {
|
|||
|
id { local id 1 } , -- id essential
|
|||
|
descr { title "Dummy sequence data from nowhere" } , -- optional
|
|||
|
inst { -- inst essential
|
|||
|
repr raw ,
|
|||
|
mol dna ,
|
|||
|
length 156 ,
|
|||
|
topology linear ,
|
|||
|
seq-data
|
|||
|
iupacna "GAATTCATTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA
|
|||
|
TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG
|
|||
|
TGGATTCAAAGCAATAGAGTTGTTCTT"
|
|||
|
} } ,
|
|||
|
|
|||
|
seq {
|
|||
|
id { local id 2 } ,
|
|||
|
descr { title "Dummy sequence 2 data from somewhere else" } ,
|
|||
|
inst {
|
|||
|
repr raw ,
|
|||
|
mol dna ,
|
|||
|
length 150 ,
|
|||
|
topology linear ,
|
|||
|
seq-data
|
|||
|
iupacna "TTTTTTTTTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA
|
|||
|
TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG
|
|||
|
TGGATTCAAAGCAATAGAGTT"
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
partial ASN.1 description from toolkit
|
|||
|
|
|||
|
Bioseq ::= SEQUENCE {
|
|||
|
id SET OF Seq-id , -- equivalent identifiers
|
|||
|
descr Seq-descr OPTIONAL , -- descriptors
|
|||
|
inst Seq-inst , -- the sequence data
|
|||
|
annot SET OF Seq-annot OPTIONAL }
|
|||
|
|
|||
|
Seq-inst ::= SEQUENCE { -- the sequence data itself
|
|||
|
repr ENUMERATED { -- representation class
|
|||
|
not-set (0) , -- empty
|
|||
|
virtual (1) , -- no seq data
|
|||
|
raw (2) , -- continuous sequence
|
|||
|
seg (3) , -- segmented sequence
|
|||
|
const (4) , -- constructed sequence
|
|||
|
ref (5) , -- reference to another sequence
|
|||
|
consen (6) , -- consensus sequence or pattern
|
|||
|
map (7) , -- ordered map (genetic, restriction)
|
|||
|
other (255) } ,
|
|||
|
mol ENUMERATED { -- molecule class in living organism
|
|||
|
not-set (0) , -- > cdna = rna
|
|||
|
dna (1) ,
|
|||
|
rna (2) ,
|
|||
|
aa (3) ,
|
|||
|
na (4) , -- just a nucleic acid
|
|||
|
other (255) } ,
|
|||
|
length INTEGER OPTIONAL , -- length of sequence in residues
|
|||
|
fuzz Int-fuzz OPTIONAL , -- length uncertainty
|
|||
|
topology ENUMERATED { -- topology of molecule
|
|||
|
not-set (0) ,
|
|||
|
linear (1) ,
|
|||
|
circular (2) ,
|
|||
|
tandem (3) , -- some part of tandem repeat
|
|||
|
other (255) } DEFAULT linear ,
|
|||
|
strand ENUMERATED { -- strandedness in living organism
|
|||
|
not-set (0) ,
|
|||
|
ss (1) , -- single strand
|
|||
|
ds (2) , -- double strand
|
|||
|
mixed (3) ,
|
|||
|
other (255) } OPTIONAL , -- default ds for DNA, ss for RNA, pept
|
|||
|
seq-data Seq-data OPTIONAL , -- the sequence
|
|||
|
ext Seq-ext OPTIONAL , -- extensions for special types
|
|||
|
hist Seq-hist OPTIONAL } -- sequence history
|
|||
|
|
|||
|
------------------------------------------------
|