354 lines
15 KiB
Fortran
Executable file
354 lines
15 KiB
Fortran
Executable file
subroutine multid(seqid,seq,nseq,nmax,used,rnum)
|
|
c REVISED VERSION OF FORMID -- TO WORK WITH MULTIPLE ALIGNMENTS
|
|
c RESEARCHER: M. ZUKER
|
|
c JUN 1986
|
|
c DS TUDHOPE
|
|
c WRITTEN TO WORK ON VAX-11-750 UNDER VMS4.3 USING STANDARD F-77
|
|
c #### NOTE #### USE OF CARRIAGE CONTROL LINE FEED SUPPRESSION
|
|
c IS USED IN THIS PROGRAM AND IS NOT STANDARD
|
|
c FORTRAN-77
|
|
c SEE '$' IN FORMAT STATEMENT 102
|
|
implicit integer (a-z)
|
|
logical found,endfil,valid,used
|
|
integer sline(500),seqnum(500),nseq,nmax,rnum
|
|
character*8 stype
|
|
character*30 seqids(500)
|
|
c character*30 choice,seqid
|
|
character*30 seqid
|
|
character*50 filnam,fmtseq(500)
|
|
character*80 reclin
|
|
character*1 seq(nmax)
|
|
data found/.false./,
|
|
. endfil/.false./,
|
|
. valid/.false./
|
|
c **************************************************************
|
|
c subroutine MULTID
|
|
c **************************************************************
|
|
c PURPOSE:
|
|
c subroutine used to extract sequences from various
|
|
c format type files including
|
|
c STANFORD,GENBANK,EMBL,PIR,and,NRC
|
|
c This revised version of FORMID is shorter. Another
|
|
c variable RNUM is passed first into the subroutine on
|
|
c the first call to return the number of sequences in the
|
|
c file. The same variable is then used to request specific
|
|
c sequences in do_loops of the main program.
|
|
c This eliminates the task of having to select the sequences.
|
|
c **************************************************************
|
|
c variable list table:
|
|
c * -- sent down from main returned unchanged
|
|
c ** - returned to main from subroutine
|
|
c ***- sent down from main and returned changed
|
|
c INTEGERS:IDCNT -- to keep track of the number of sequence
|
|
c identifiers found in the file
|
|
c I,K -- loop counters
|
|
c LINE -- to point to the record line number of a file
|
|
c N -- counter to extract the correct number of
|
|
c sequence elements
|
|
c * NMAX -- maximum length of sequence expected by user
|
|
c ** NSEQ -- length of the sequence retrieved from file
|
|
c POINTR -- pointer to point to sequence identifier
|
|
c chosen from array of identifiers
|
|
c RNUM -- the number of identified sequences returned
|
|
c to main if first use of the subroutine,
|
|
c the number sent down from main, identifying
|
|
c a sequence if not the first time used
|
|
c SEQNUM -- an ARRAY of the length of the sequences
|
|
c for the sequence identifiers found in the
|
|
c NRC-format type files
|
|
c SLINE -- an ARRAY of the record-line numbers where
|
|
c a sequence starts in a file
|
|
c START -- defines what column of a record to start
|
|
c reading the sequence in.
|
|
c CHARACTERS:CHOICE-- character string of length 20 to read in
|
|
c choice for sequence identifier to retrieve
|
|
c FILNAM -- character string of length 50 to read in
|
|
c the filename.
|
|
c FMTSEQ -- an ARRAY of characters, each length 50,
|
|
c describing how the sequence is to be read
|
|
c RECLIN -- a record of a file, length 80 characters
|
|
c ** SEQ -- an ARRAY of characters each length 1 to
|
|
c store the sequence
|
|
c ** SEQID -- retrieved name of the sequence
|
|
c SEQIDS -- an ARRAY of retrieved names of sequences
|
|
c STYPE -- character string of length 8 defining the
|
|
c format type of the file.
|
|
c LOGICALS:FOUND -- logical variable used in looping until
|
|
c something retrieved
|
|
c *** USED -- logical variable used in determining if the
|
|
c subroutine has been previously called
|
|
c VALID -- logical variable used in looping until
|
|
c some input valid
|
|
c
|
|
c
|
|
c if the subroutine has NOT been USED then must
|
|
c input the filename and do error checking on filename
|
|
c else branch to listing the sequences available in this file
|
|
if (.not.used) then
|
|
10 found = .false.
|
|
valid = .false.
|
|
used = .false.
|
|
c initialize for a new file by setting variables back
|
|
c to zero and blanking out old names
|
|
if (nmax.eq.0) nmax = 9999
|
|
nseq = 0
|
|
stype = ' '
|
|
seqid = ' '
|
|
filnam= ' '
|
|
do 500 i = 1,100
|
|
sline(i) = 0
|
|
seqnum(i)= 0
|
|
seqids(i)=' '
|
|
fmtseq(i)= ' '
|
|
500 continue
|
|
111 write(6,102) 'Input sequence file name (/ to end) '
|
|
102 format(1x,a38,$)
|
|
read(5,110,end=999,err=111) filnam
|
|
if (filnam(1:1).eq.'/') goto 999
|
|
110 format(a50)
|
|
c open the file only after a valid filename has been retrieved
|
|
c error in filename results in prompting for the input again
|
|
open(66,file=filnam,status='OLD',err=10)
|
|
c find sequence file format type and the sequence identifiers
|
|
idcnt = 0
|
|
line = 1
|
|
c DO WHILE (.NOT.FOUND)
|
|
600 read(66,120,end = 410,err = 991) reclin
|
|
120 format(a80)
|
|
c STANFORD format, recognized by the ';' in the first column
|
|
c of the record
|
|
if (reclin(1:1).eq.';') then
|
|
c DO WHILE (.NOT.ENDFIL)
|
|
610 line = line + 1
|
|
read(66,120,end = 410,err=991) reclin
|
|
c to find the next sequence identifer scroll through the
|
|
c file until the first character in the line is not ';'
|
|
if (reclin(1:1).ne.';') then
|
|
found = .true.
|
|
stype ='STANFORD'
|
|
idcnt = idcnt + 1
|
|
sline(idcnt)= line
|
|
seqids(idcnt) = reclin(1:30)
|
|
line = line + 1
|
|
read(66,120,end = 410,err=991) reclin
|
|
c DO WHILE ((INDEX(RECLIN,'1').EQ.0).AND.
|
|
c . (INDEX(RECLIN,'2').EQ.0))
|
|
c ENDDO
|
|
615 if (index(reclin,'1').eq.0) then
|
|
if (index(reclin,'2').eq.0) then
|
|
line = line + 1
|
|
read(66,120,end = 410,err=991) reclin
|
|
goto 615
|
|
endif
|
|
endif
|
|
c assume at least one line not a sequence identifier occurs
|
|
c after last sequence read in to get around the CTRL-L problem.
|
|
c therefore, after reading in the record containing the end
|
|
c of sequence identifier, read in another record
|
|
line = line + 1
|
|
read(66,120,end = 410,err=991) reclin
|
|
endif
|
|
c ENDDO
|
|
if (.not.endfil) goto 610
|
|
c GENBANK format, recognized by the word LOCUS starting in the
|
|
c first position of the record
|
|
elseif (reclin(1:5).eq.'LOCUS') then
|
|
found = .true.
|
|
stype = 'GENBANK '
|
|
idcnt = 1
|
|
seqids(idcnt) = reclin(13:27)
|
|
c DO WHILE (.NOT.ENDFIL)
|
|
620 line = line + 1
|
|
read(66,120,end = 410,err=991) reclin
|
|
c scrolling through to find the key phrase ORIGIN because
|
|
c the line after this key phrase occurrance is where the sequence
|
|
c occurs in the file;
|
|
c read through the file obtaining other sequences by scrolling
|
|
c through to the '//' which signals the end of sequence
|
|
c Then start looking for another sequence identifier.
|
|
if (reclin(1:6).eq.'ORIGIN') then
|
|
sline(idcnt) = line
|
|
c DO WHILE (RECLIN(1:2).NE.'//')
|
|
625 if (reclin(1:2).ne.'//') then
|
|
line = line + 1
|
|
read(66,120,end = 410,err=991) reclin
|
|
goto 625
|
|
c ENDDO
|
|
endif
|
|
elseif (reclin(1:5).eq.'LOCUS') then
|
|
idcnt = idcnt + 1
|
|
seqids(idcnt) = reclin(13:27)
|
|
endif
|
|
c ENDDO
|
|
if (.not.endfil) goto 620
|
|
c PIR format, recognized by the '>' occurring in column 1 of a
|
|
c record. The sequence itself start 2 lines down from the record
|
|
c containing '>'.
|
|
elseif (reclin(1:1).eq.'>') then
|
|
found = .true.
|
|
stype = 'PIR '
|
|
c DO WHILE (.NOT.ENDFIL)
|
|
630 if (reclin(1:1).eq.'>') then
|
|
idcnt = idcnt + 1
|
|
sline(idcnt) = line + 1
|
|
seqids(idcnt) = reclin(5:25)
|
|
endif
|
|
line = line + 1
|
|
read(66,120,end = 410,err=991) reclin
|
|
c ENDDO
|
|
if (.not.endfil) goto 630
|
|
c EMBL (european) format, recognized by the key phrase ID in the
|
|
c first column of the record.
|
|
elseif (reclin(1:5).eq.'ID ') then
|
|
found = .true.
|
|
stype = 'EMBL '
|
|
c scrolling through to find the key phrase 'SQ Sequence' because
|
|
c the line after this key phrase occurrance is where the sequence
|
|
c occurs in the file;
|
|
c DO WHILE (.NOT.ENDFIL)
|
|
640 if (reclin(1:5).eq.'ID ') then
|
|
idcnt = idcnt + 1
|
|
seqids(idcnt) = reclin(6:index(reclin,' '))
|
|
elseif (reclin(1:13).eq.'SQ Sequence') then
|
|
sline(idcnt) = line
|
|
endif
|
|
line = line + 1
|
|
read(66,120,end = 410,err=991) reclin
|
|
c ENDDO
|
|
if (.not.endfil) goto 640
|
|
c NRC format, recognized by '(' occurring in the first column of
|
|
c a record. The record line following this line holds the number
|
|
c of elements in the sequence and the name of the sequence. The
|
|
c following line signals the beginning of the sequence.
|
|
c The sequence itself is read in by using variable format
|
|
c described by the '(' record line. This format statement on the
|
|
c record containing '(' ,must also be retrieved.
|
|
elseif (reclin(1:1).eq.'(') then
|
|
found = .true.
|
|
stype = 'NRC '
|
|
c DO WHILE (.NOT.ENDFIL)
|
|
650 if (reclin(1:1).eq.'(') then
|
|
idcnt = idcnt + 1
|
|
fmtseq(idcnt) = reclin(1:index(reclin,')'))
|
|
line = line + 1
|
|
read(66,140,end = 410,err = 991)
|
|
. seqnum(idcnt),seqids(idcnt)
|
|
sline(idcnt) = line
|
|
endif
|
|
line = line + 1
|
|
read(66,120,end = 410,err=991) reclin
|
|
c ENDDO
|
|
140 format(i4,5x,a30)
|
|
if (.not.endfil) goto 650
|
|
endif
|
|
c keep scrolling through the file until a key phrase signalling
|
|
c a format type is recognized or the end of file is found
|
|
line = line + 1
|
|
c ENDDO
|
|
if (.not.found) goto 600
|
|
c if a format type has not been found then this cannot be a
|
|
c sequence file and return to main.
|
|
410 if (.not.found) then
|
|
write(6,105) ' No sequence identifiers found in this file '
|
|
105 format(a50)
|
|
return
|
|
endif
|
|
c if this is a valid format type proceed
|
|
used = .true.
|
|
c if this is the first time used then return to main
|
|
c with RNUM, the number of identified sequences in the file
|
|
rnum = idcnt
|
|
endif
|
|
c IF USED this call before retreive a sequence using RNUM
|
|
seqid = ' '
|
|
pointr = rnum
|
|
if ((pointr.lt.1).or.(pointr.gt.idcnt)) then
|
|
c stop ' ERROR IN MULTID, SEQUENCE REQUESTED NOT FOUND'
|
|
call exit(1)
|
|
else
|
|
seqid = seqids(pointr)
|
|
endif
|
|
c having obtained a valid sequence choice or default of
|
|
c one available sequence, rewind the file and retrieve the
|
|
c sequence
|
|
rewind(66,err = 992)
|
|
c having retrieved the line number in the file where this
|
|
c identifier occurs from SLINE(POINTR) scroll through the
|
|
c file until this line is reached
|
|
do 550 i = 1,sline(pointr)
|
|
read(66,120,end = 410,err=991) reclin
|
|
550 continue
|
|
c if nrc type then read sequence according to format type
|
|
if (stype.eq.'NRC ') then
|
|
nseq = seqnum(pointr)
|
|
c if the number in the NRC sequence is greater than
|
|
c the maximum number sent down from main, then
|
|
c truncate to NMAX and output a message to the user.
|
|
if (nseq.gt.nmax) then
|
|
write(6,160) ' Sequence truncated to ',nseq
|
|
160 format(1x,a30,i4)
|
|
nseq = nmax
|
|
endif
|
|
read(66,fmtseq(pointr),end = 420,err=991)
|
|
. (seq(k),k=1,seqnum(pointr))
|
|
c Else if not type NRC then find the sequence by taking
|
|
c each letter of the next records until end-of-sequence
|
|
c indicator found or number in the sequence NMAX is reached
|
|
else
|
|
n = 1
|
|
found = .false.
|
|
c DO WHILE (.NOT.FOUND)
|
|
670 read(66,120,end = 420,err=991) reclin
|
|
c there is a need for the two positions to start reading the
|
|
c sequence; to accommodate the GENBANK format and the STANFORD
|
|
c end-of-sequence checks.
|
|
if (stype.eq.'GENBANK ') then
|
|
start = 10
|
|
else
|
|
start = 1
|
|
endif
|
|
do 565 i = start,80
|
|
c if the number in the sequence is less than the desired
|
|
c number, NMAX sent down from main, then retrieve sequence
|
|
if (n.le.nmax) then
|
|
if (.not.found) then
|
|
seq(n) = reclin(i:i)
|
|
c check if an early end of sequence
|
|
if ((seq(n).eq.'1').or.(seq(n).eq.'2').or.
|
|
. (reclin(1:1).eq.'/').or.(seq(n).eq.'*')) then
|
|
seq(n) = ' '
|
|
nseq = n - 1
|
|
found = .true.
|
|
c if not an end-of-sequence character but have gone far
|
|
c enough, then truncate to NMAX
|
|
elseif (n.eq.nmax) then
|
|
nseq = nmax
|
|
found = .true.
|
|
write(6,160) ' Sequence truncated to ',nseq
|
|
c if not a end-of-sequence character, check to see if it is
|
|
c not a blank character. Blank characters will not be added
|
|
c to the sequence.
|
|
elseif (seq(n).ne.' ') then
|
|
n = n + 1
|
|
endif
|
|
endif
|
|
c if the number of sequence characters found is Greater than
|
|
c NMAX then all of the sequence has been found.
|
|
else
|
|
found = .true.
|
|
nseq = n
|
|
write(6,160)
|
|
. ' Sequence truncated to ',nseq
|
|
endif
|
|
565 continue
|
|
c ENDDO
|
|
if (.not.found) goto 670
|
|
endif
|
|
420 return
|
|
c 991 stop ' ERROR IN READING FILE '
|
|
c 992 stop ' ERROR IN REWINDING FILE'
|
|
c 999 stop ' END of SESSION...GOOD BYE'
|
|
991 call exit(1)
|
|
992 call exit(1)
|
|
999 call exit(1)
|
|
end
|