gde_linux/ZUKER/formid.f
2022-03-07 20:43:05 +00:00

413 lines
16 KiB
Fortran
Executable file

subroutine formid(seqid,seq,nseq,nmax,used)
c RESEARCHER: M. ZUKER
c JUN 1986
c DS TUDHOPE
c WRITTEN TO WORK ON VAX-11-750 UNDER VMS4.3 USING STANDARD F-77
c #### NOTE #### USE OF CARRIAGE CONTROL LINE FEED SUPPRESSION
c IS USED IN THIS PROGRAM AND IS NOT STANDARD
c FORTRAN-77
c SEE '$' IN FORMAT STATEMENT 102
implicit integer*2 (a-z)
logical found,endfil,valid,used
integer*2 sline(500),seqnum(500)
integer*4 nseq,nmax
character*8 stype
character*30 seqids(500)
character*30 choice,seqid
character*50 filnam,fmtseq(500)
character*80 reclin
character*1 seq(nmax)
data found/.false./,
. endfil/.false./,
. valid/.false./
c **************************************************************
c subroutine number FS372, test main driver FP475
c biomath program catalogued as FS372.FOR named formid.for
c **************************************************************
c PURPOSE:
c subroutine used to extract sequences from various
c format type files including
c STANFORD,GENBANK,EMBL,PIR,and,NRC
c **************************************************************
c variable list table:
c * -- sent down from main returned unchanged
c ** - returned to main from subroutine
c ***- sent down from main and returned changed
c INTEGERS:IDCNT -- to keep track of the number of sequence
c identifiers found in the file
c I,K -- loop counters
c LINE -- to point to the record line number of a file
c N -- counter to extract the correct number of
c sequence elements
c * NMAX -- maximum length of sequence expected by user
c ** NSEQ -- length of the sequence retrieved from file
c POINTR -- pointer to point to sequence identifier
c chosen from array of identifiers
c SEQNUM -- an ARRAY of the length of the sequences
c for the sequence identifiers found in the
c NRC-format type files
c SLINE -- an ARRAY of the record-line numbers where
c a sequence starts in a file
c START -- defines what column of a record to start
c reading the sequence in.
c CHARACTERS:CHOICE-- character string of length 20 to read in
c choice for sequence identifier to retrieve
c FILNAM -- character string of length 50 to read in
c the filename.
c FMTSEQ -- an ARRAY of characters, each length 50,
c describing how the sequence is to be read
c RECLIN -- a record of a file, length 80 characters
c ** SEQ -- an ARRAY of characters each length 1 to
c store the sequence
c ** SEQID -- retrieved name of the sequence
c SEQIDS -- an ARRAY of retrieved names of sequences
c STYPE -- character string of length 8 defining the
c format type of the file.
c LOGICALS:FOUND -- logical variable used in looping until
c something retrieved
c *** USED -- logical variable used in determining if the
c subroutine has been previously called
c VALID -- logical variable used in looping until
c some input valid
c
c
c if the subroutine has NOT been USED then must
c input the filename and do error checking on filename
c else branch to listing the sequences available in this file
if (.not.used) then
10 found = .false.
valid = .false.
used = .false.
c initialize for a new file by setting variables back
c to zero and blanking out old names
if (nmax.eq.0) nmax = 9999
nseq = 0
stype = ' '
seqid = ' '
filnam= ' '
do 500 i = 1,500
sline(i) = 0
seqnum(i)= 0
seqids(i)=' '
fmtseq(i)=' '
500 continue
111 write(6,102) 'Input sequence file name (/ to end) '
102 format(1x,a37,$)
read(5,110,end=999,err=111) filnam
if (filnam(1:1).eq.'/') then
goto 999
endif
110 format(a50)
c open the file only after a valid filename has been retrieved
c error in filename results in prompting for the input again
open(66,file=filnam,status='OLD',err=10)
c find sequence file format type and the sequence identifiers
idcnt = 0
line = 1
c DO WHILE (.NOT.FOUND)
600 read(66,120,end = 410,err = 991) reclin
120 format(a80)
c STANFORD format, recognized by the ';' in the first column
c of the record
if (reclin(1:1).eq.';') then
c DO WHILE (.NOT.ENDFIL)
610 line = line + 1
read(66,120,end = 410,err=991) reclin
c to find the next sequence identifer scroll through the
c file until the first character in the line is not ';'
if (reclin(1:1).ne.';') then
found = .true.
stype ='STANFORD'
idcnt = idcnt + 1
sline(idcnt)= line
seqids(idcnt) = reclin(1:30)
line = line + 1
read(66,120,end = 410,err=991) reclin
c DO WHILE ((INDEX(RECLIN,'1').EQ.0).AND.
c . (INDEX(RECLIN,'2').EQ.0))
c ENDDO
615 if (index(reclin,'1').eq.0) then
if (index(reclin,'2').eq.0) then
line = line + 1
read(66,120,end = 410,err=991) reclin
goto 615
endif
endif
c assume at least one line not a sequence identifier occurs
c after last sequence read in to get around the CTRL-L problem.
c therefore, after reading in the record containing the end
c of sequence identifier, read in another record
line = line + 1
read(66,120,end = 410,err=991) reclin
endif
c ENDDO
if (.not.endfil) goto 610
c GENBANK format, recognized by the word LOCUS starting in the
c first position of the record
elseif (reclin(1:5).eq.'LOCUS') then
found = .true.
stype = 'GENBANK '
idcnt = 1
seqids(idcnt) = reclin(13:27)
c DO WHILE (.NOT.ENDFIL)
620 line = line + 1
read(66,120,end = 410,err=991) reclin
c scrolling through to find the key phrase ORIGIN because
c the line after this key phrase occurrance is where the sequence
c occurs in the file;
c read through the file obtaining other sequences by scrolling
c through to the '//' which signals the end of sequence
c Then start looking for another sequence identifier.
if (reclin(1:6).eq.'ORIGIN') then
sline(idcnt) = line
c DO WHILE (RECLIN(1:2).NE.'//')
625 if (reclin(1:2).ne.'//') then
line = line + 1
read(66,120,end = 410,err=991) reclin
goto 625
c ENDDO
endif
elseif (reclin(1:5).eq.'LOCUS') then
idcnt = idcnt + 1
seqids(idcnt) = reclin(13:27)
endif
c ENDDO
if (.not.endfil) goto 620
c PIR format, recognized by the '>' occurring in column 1 of a
c record. The sequence itself start 2 lines down from the record
c containing '>'.
elseif (reclin(1:1).eq.'>') then
found = .true.
stype = 'PIR '
c DO WHILE (.NOT.ENDFIL)
630 if (reclin(1:1).eq.'>') then
idcnt = idcnt + 1
sline(idcnt) = line + 1
seqids(idcnt) = reclin(5:25)
endif
line = line + 1
read(66,120,end = 410,err=991) reclin
c ENDDO
if (.not.endfil) goto 630
c EMBL (european) format, recognized by the key phrase ID in the
c first column of the record.
elseif (reclin(1:5).eq.'ID ') then
found = .true.
stype = 'EMBL '
c scrolling through to find the key phrase 'SQ Sequence' because
c the line after this key phrase occurrance is where the sequence
c occurs in the file;
c DO WHILE (.NOT.ENDFIL)
640 if (reclin(1:5).eq.'ID ') then
idcnt = idcnt + 1
seqids(idcnt) = reclin(6:index(reclin,' '))
elseif (reclin(1:13).eq.'SQ Sequence') then
sline(idcnt) = line
endif
line = line + 1
read(66,120,end = 410,err=991) reclin
c ENDDO
if (.not.endfil) goto 640
c NRC format, recognized by '(' occurring in the first column of
c a record. The record line following this line holds the number
c of elements in the sequence and the name of the sequence. The
c following line signals the beginning of the sequence.
c The sequence itself is read in by using variable format
c described by the '(' record line. This format statement on the
c record containing '(' ,must also be retrieved.
elseif (reclin(1:1).eq.'(') then
found = .true.
stype = 'NRC '
c DO WHILE (.NOT.ENDFIL)
650 if (reclin(1:1).eq.'(') then
idcnt = idcnt + 1
fmtseq(idcnt) = reclin(1:index(reclin,')'))
line = line + 1
read(66,140,end = 410,err = 991)
. seqnum(idcnt),seqids(idcnt)
sline(idcnt) = line
endif
line = line + 1
read(66,120,end = 410,err=991) reclin
c ENDDO
140 format(i4,5x,a30)
if (.not.endfil) goto 650
endif
c keep scrolling through the file until a key phrase signalling
c a format type is recognized or the end of file is found
line = line + 1
c ENDDO
if (.not.found) goto 600
c if a format type has not been found then this cannot be a
c sequence file and return to main.
410 if (.not.found) then
write(6,105) ' No sequence identifiers found in this file '
105 format(a50)
return
endif
c if this is a valid format type proceed
used = .true.
endif
c IF USED this call before start at listing identifiers
c listing the sequence identifiers
c allow the user to input the number requested
c or control d to finish here and return to
c entering a new filename
valid = .false.
c keep outputting the list of sequence identifiers
c while NOT VALID choice of sequence identifier is inputted
c DO WHILE (.NOT.VALID)
660 write(6,150) 'Available sequences in ',filnam
150 format(/,1x,a23,a50)
do 525 k = 1,idcnt,2
if (sline(k).ne.0) then
if (k .eq. idcnt) then
write (6,151) k,'.',seqids(k)
else
write (6,191) k,'.',seqids(k),k+1,'.',seqids(k+1)
endif
151 format(1x,i3,a1,1x,a30)
191 format(1x,i3,a1,1x,a30,3x,i3,a1,1x,a30)
endif
525 continue
choice = ' '
seqid = ' '
pointr = 0
write(6,152)
. 'Choose sequence by number or name , or ? for relist; '
write(6,152)
. ' <return> defaults to the first one, / for new file.'
152 format(1x,a60)
read(5,153,end=10) choice
153 format(a30)
c error checking of the inputted CHOICE and determining
c if this input is a number, a default value, a relist command
c or the name of the sequence identifier
i = 1
do while (choice(i:i).eq.' '.and.i.lt.6)
i = i + 1
enddo
if ((choice(i:i).ge.'1').and.(choice(i:i).le.'9')) then
c using COLLATING sequence to convert character to number value
pointr = ichar(choice(i:i)) - ichar('0')
if (choice(i+1:i+1).ne.' ') then
pointr = ichar(choice(i+1:i+1)) - ichar('0') + 10 * pointr
endif
if (choice(i+2:i+2).ne.' ') then
pointr = ichar(choice(i+2:i+2)) - ichar('0') + 10 * pointr
endif
if (pointr.gt.idcnt) then
write(6,154) ' NUMBER CHOICE BETWEEN 1 AND ',idcnt
154 format(a30,i2)
else
seqid = seqids(pointr)
valid = .true.
endif
elseif (ichar(choice(6:6)).eq.32) then
pointr = 1
seqid = seqids(1)
valid = .true.
elseif (choice(1:1).eq.'?') then
continue
else
c find out if the inputted choice is in the list of seq identifiers.
pointr = 0
do 535 k = 1,idcnt
if (seqids(k).eq.choice) then
seqid = seqids(k)
pointr = k
valid = .true.
endif
535 continue
if (pointr.eq.0) then
write(6,105) 'Does not match any in given list'
endif
endif
c ENDDO
if (.not.valid) goto 660
c having obtained a valid sequence choice or default of
c one available sequence, rewind the file and retrieve the
c sequence
rewind(66,err = 992)
c having retrieved the line number in the file where this
c identifier occurs from SLINE(POINTR) scroll through the
c file until this line is reached
c
c SUN WARNING: brancd into block.
do 550 i = 1,sline(pointr)
read(66,120,end = 410,err=991) reclin
550 continue
c if nrc type then read sequence according to format type
if (stype.eq.'NRC ') then
nseq = seqnum(pointr)
c if the number in the NRC sequence is greater than
c the maximum number sent down from main, then
c truncate to NMAX and output a message to the user.
if (nseq.gt.nmax) then
write(6,160) ' Sequence truncated to ',nseq
160 format(1x,a30,i5)
nseq = nmax
endif
read(66,fmtseq(pointr),end = 420,err=991)
. (seq(k),k=1,seqnum(pointr))
c Else if not type NRC then find the sequence by taking
c each letter of the next records until end-of-sequence
c indicator found or number in the sequence NMAX is reached
else
n = 1
found = .false.
c DO WHILE (.NOT.FOUND)
670 read(66,120,end = 420,err=991) reclin
c there is a need for the two positions to start reading the
c sequence; to accommodate the GENBANK format and the STANFORD
c end-of-sequence checks.
if (stype.eq.'GENBANK ') then
start = 10
else
start = 1
endif
do 565 i = start,80
c if the number in the sequence is less than the desired
c number, NMAX sent down from main, then retrieve sequence
if (n.le.nmax) then
if (.not.found) then
seq(n) = reclin(i:i)
c check if an early end of sequence
if ((seq(n).eq.'1').or.(seq(n).eq.'2').or.
. (reclin(1:1).eq.'/').or.(seq(n).eq.'*')) then
seq(n) = ' '
nseq = n - 1
found = .true.
c if not an end-of-sequence character but have gone far
c enough, then truncate to NMAX
elseif (n.eq.nmax) then
nseq = nmax
found = .true.
write(6,160) ' Sequence truncated to ',nseq
c if not a end-of-sequence character, check to see if it is
c not a blank character. Blank characters will not be added
c to the sequence.
elseif (seq(n).ne.' ') then
n = n + 1
endif
endif
c if the number of sequence characters found is Greater than
c NMAX then all of the sequence has been found.
else
found = .true.
nseq = n
write(6,160)
. ' Sequence truncated to ',nseq
endif
565 continue
c ENDDO
if (.not.found) goto 670
endif
420 return
c 991 stop ' ERROR IN READING FILE '
c 992 stop ' ERROR IN REWINDING FILE'
c 999 stop ' END of SESSION...GOOD BYE'
991 call exit(1)
992 call exit(1)
999 call exit(1)
end