staden-lg/src/staden/splitp3.f

393 lines
12 KiB
Fortran

C Routine to translate prosite motif library into patterns
C assumes only exact match, membership of set and NOT
C assumes no errors in library
C assumes gaps up to 40
C assumes no variation in length of motif: only variation in
C their separation ie [fred](2,3) is not dealt with (but why should it exist)
C but x(2,3) causes a new motif to be started starting 2 away with 1 extra
C position.
C the translation is crude: all go to membership of set
C nots go to membership of set for the rest of the character set
C fixed gaps are included in "weight matrices" and so will slow searches
C 18-7-91 Added titles to pattern files
C 3-2-92 Changed program so it writes pattern and weight files
C to current directory. All file names in the fofn and
C pattern files are full path names.
SUBROUTINE FMAIN()
INTEGER AP,RP,SCORE
PARAMETER (MAXSTR = 255,MAXLEN = 120, IDM = 26, MAXDEV = 5)
INTEGER WTSMAT(IDM,MAXLEN)
INTEGER DEVNOS(MAXDEV)
CHARACTER AMOS*(MAXSTR),RS*(MAXSTR),CACIDS*20,COMMAS*40
CHARACTER AAMOS*(MAXSTR)
CHARACTER*60 PATNAM,WTSNAM,HELPF,FILNAM
CHARACTER CSET*26,TITLE*80
LOGICAL ACID,NUMBER
EQUIVALENCE (AAMOS(6:),AMOS)
EXTERNAL ACID,NUMBER,INTFC
DATA COMMAS/',,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,'/
DATA CSET/'ABCDEFGHIJKLMNOPQRSTUVWXYZ'/
CALL INITLU(IDM)
CALL UNITNO(KBIN,KBOUT,DEVNOS,MAXDEV)
WRITE(KBOUT,*)
+' SPLITP3 v3.0 Feb 1992, author Rodger Staden'
WRITE(KBOUT,*)
+' Splits Prosite motif library into Staden pattern files'
WRITE(KBOUT,*)
+' and creates a file of file names. The pattern files and'
WRITE(KBOUT,*)
+' weight matrices are written to the current directory but'
WRITE(KBOUT,*)
+' full path names are included in the file of file names'
IDEV1 = DEVNOS(1)
IDEV2 = DEVNOS(2)
IDEV3 = DEVNOS(3)
IDEV4 = DEVNOS(4)
NVAR = 0
NMOTIF = 0
FILNAM = ' '
CALL OPENF1(DEVNOS(1),FILNAM,0,IOK,KBIN,KBOUT,
+'Prosite library file',
+IHELPS,IHELPE,HELPF,DEVNOS(5))
IF(IOK.NE.0) STOP
FILNAM = ' '
CALL OPENF1(DEVNOS(2),FILNAM,1,IOK,KBIN,KBOUT,
+'Name for file of pattern file names',
+IHELPS,IHELPE,HELPF,DEVNOS(5))
IF(IOK.NE.0) STOP
3 CONTINUE
LIN = 0
CALL GTSTR('Path name of motif directory',' ',PATNAM,LIN,
+KBOUT,KBIN,INFLAG)
IF(INFLAG.EQ.1) THEN
CALL HELP2(IHELPS,IHELPE,HELPF,DEVNOS(5),KBIN,KBOUT)
GO TO 3
END IF
IF(INFLAG.EQ.2) STOP
IDIR = MAX(1,INDEX(PATNAM,' '))
C
C add final slash if not present
IF(PATNAM(IDIR-1:IDIR-1).NE.'/') THEN
PATNAM(IDIR:IDIR) = '/'
IDIR = IDIR + 1
END IF
5 CONTINUE
READ(IDEV1,1000,ERR=100,END=200)AAMOS
1000 FORMAT(A)
IF(AAMOS(1:2).NE.'AC') GO TO 5
NAMLEN = INDEX(AMOS(1:),';') - 1
PATNAM(IDIR:) = AMOS(1:NAMLEN)
WTSNAM = PATNAM
PATNAM = PATNAM(1:NAMLEN+IDIR-1)//'.PAT'
CALL OPENRS(IDEV3,PATNAM(IDIR:),IOK,LRECL,1)
IF(IOK.NE.0) WRITE(*,*)'SCREAM, FILE OPENING'
6 CONTINUE
READ(IDEV1,1000,ERR=100,END=200)AAMOS
IF(AAMOS(1:2).NE.'DE') GO TO 6
TITLE = AAMOS(6:)
WRITE(IDEV3,1002)TITLE
9 CONTINUE
READ(IDEV1,1000,ERR=100,END=200)AAMOS
C Check for empty definition ie no pa line
IF(AAMOS(1:2).EQ.'//') THEN
WRITE(KBOUT,*)TITLE
WRITE(KBOUT,*)'No PA line!'
CLOSE(UNIT=IDEV3)
GO TO 5
END IF
IF(AAMOS(1:2).NE.'PA') GO TO 9
IF(INDEX(AMOS,'.').EQ.0) THEN
J = INDEX(AMOS,' ')
READ(IDEV1,1001,ERR=100,END=200)AMOS(J:)
1001 FORMAT(5X,A)
END IF
LAMOS = INDEX(AMOS,'.')
C WRITE(KBOUT,*)AMOS(1:LAMOS)
NMOTIF = NMOTIF + 1
AP = 0
RP = 0
SCORE = 0
MOTIF = 1
WRITE(IDEV3,1002)'A6 '//
+PATNAM(IDIR+2:IDIR+6)//' Class'
1002 FORMAT(A)
10 CONTINUE
AP = AP + 1
IF(AP.LT.LAMOS) THEN
IF(AMOS(AP:AP).EQ.'-') AP = AP + 1
IF(ACID(AMOS(AP:))) THEN
RP = RP + 1
RS(RP:RP) = AMOS(AP:AP)
LASTS = RP
RP = RP + 1
RS(RP:RP) = ','
SCORE = SCORE + 1
ELSE IF(AMOS(AP:AP).EQ.'[') THEN
AP = AP + 1
LSET = INDEX(AMOS(AP:),']') - 1
RP = RP + 1
RS(RP:RP+LSET-1) = AMOS(AP:AP+LSET-1)
LASTS = RP
RP = RP + LSET
AP = AP + LSET
RS(RP:RP) = ','
AP = AP + 1
SCORE = SCORE + 1
IF(AMOS(AP:AP).EQ.'(') THEN
AP = AP + 1
LSET = INDEX(AMOS(AP:),')') - 1
ICOMMA = INDEX(AMOS(AP:),',')
IF((ICOMMA.GT.0).AND.(ICOMMA.LT.LSET)) THEN
WRITE(KBOUT,*)'Variable gap in []'
NVAR = NVAR + 1
GO TO 5
END IF
NREP = INTFC(AMOS(AP:),LSET)
LREP = RP - LASTS
RP = RP + 1
CALL REPST(RS(LASTS:),RS(RP:),LREP,NREP)
RP = RP + (NREP - 1) * (LREP + 1) - 1
AP = AP + LSET
SCORE = SCORE + NREP - 1
END IF
ELSE IF(AMOS(AP:AP).EQ.'{') THEN
AP = AP + 1
LSET = INDEX(AMOS(AP:),'}') - 1
RP = RP + 1
LSETS = LSET
CALL CACID(AMOS(AP:),CACIDS,LSET)
RS(RP:RP+LSET-1) = CACIDS(1:LSET)
LASTS = RP
RP = RP + LSET
AP = AP + LSETS
RS(RP:RP) = ','
AP = AP + 1
SCORE = SCORE + 1
IF(AMOS(AP:AP).EQ.'(') THEN
AP = AP + 1
LSET = INDEX(AMOS(AP:),')') - 1
ICOMMA = INDEX(AMOS(AP:),',')
IF((ICOMMA.GT.0).AND.(ICOMMA.LT.LSET)) THEN
WRITE(*,*)'Variable gap in {}'
NVAR = NVAR + 1
GO TO 5
END IF
NREP = INTFC(AMOS(AP:),LSET)
LREP = RP - LASTS
RP = RP + 1
CALL REPST(RS(LASTS:),RS(RP:),LREP,NREP)
RP = RP + (NREP - 1) * (LREP + 1) - 1
AP = AP + LSET
SCORE = SCORE + NREP - 1
END IF
ELSE IF(AMOS(AP:AP).EQ.'x') THEN
AP = AP + 1
IF(AMOS(AP:AP).EQ.'(') THEN
AP = AP + 1
LSET = INDEX(AMOS(AP:),')') - 1
ICOMMA = INDEX(AMOS(AP:),',')
IF((ICOMMA.GT.0).AND.(ICOMMA.LT.LSET)) THEN
IF(RS(RP:RP).EQ.',') RP = RP - 1
WTSNAM = WTSNAM(1:NAMLEN+IDIR-1)//'.WTS'//CSET(MOTIF:MOTIF)
WRITE(IDEV3,1002)WTSNAM(1:NAMLEN+IDIR+4)
CALL INTRP7(RS,RP,SCORE,WTSMAT,IDM,MAXLEN,
+CUTOFF,IOK)
IF(IOK.NE.0) WRITE(*,*)'SCREAM'
CALL OPENRS(IDEV4,WTSNAM(IDIR:),IOK,LRECL,1)
IF(IOK.NE.0) WRITE(*,*)'SCREAM, FILE OPENING'
TOP = SCORE
CALL WRTSCP(TITLE,SCORE,0,CUTOFF,TOP,IDM,WTSMAT,IDEV4)
NCOMMA = INTFC(AMOS(AP:),ICOMMA-1)
JCOMMA = INTFC(AMOS(AP+ICOMMA:),LSET-ICOMMA)
WRITE(IDEV3,1002)'A6 '//
+PATNAM(IDIR+2:IDIR+6)//' Class'
WRITE(IDEV3,1008)MOTIF
1008 FORMAT(I7,' Relative motif')
WRITE(IDEV3,1003)SCORE+NCOMMA+1
1003 FORMAT(I7,' Relative start position')
WRITE(IDEV3,1004)JCOMMA-NCOMMA
1004 FORMAT(I7,' Number of extra positions')
AP = AP + LSET
RP = 0
SCORE = 0
MOTIF = MOTIF + 1
ELSE
NCOMMA = INTFC(AMOS(AP:),LSET)
RP = RP + 1
RS(RP:RP+NCOMMA-1) = COMMAS(1:NCOMMA)
RP = RP + NCOMMA - 1
AP = AP + LSET
SCORE = SCORE + NCOMMA
END IF
ELSE
RP = RP + 1
RS(RP:RP+1) = ','
SCORE = SCORE + 1
END IF
END IF
GO TO 10
END IF
RP = RP - 1
IF(RS(RP:RP).EQ.',') RP = RP - 1
WTSNAM = WTSNAM(1:NAMLEN+IDIR-1)//'.WTS'//CSET(MOTIF:MOTIF)
WRITE(IDEV3,1002)WTSNAM(1:NAMLEN+IDIR+4)
CALL INTRP7(RS,RP,SCORE,WTSMAT,IDM,MAXLEN,
+CUTOFF,IOK)
IF(IOK.NE.0) WRITE(*,*)'SCREAM'
CALL OPENRS(IDEV4,WTSNAM(IDIR:),IOK,LRECL,1)
IF(IOK.NE.0) WRITE(*,*)'SCREAM, FILE OPENING'
TOP = SCORE
CALL WRTSCP(TITLE,SCORE,0,CUTOFF,TOP,IDM,WTSMAT,IDEV4)
WRITE(IDEV2,1002)PATNAM
GO TO 5
100 CONTINUE
WRITE(*,*)'READ ERROR'
STOP
200 CONTINUE
WRITE(KBOUT,*)'Number of patterns',NMOTIF
WRITE(KBOUT,*)'Number of variable gaps',NVAR
END
SUBROUTINE REPST(S1,S2,L,N)
CHARACTER S1*(*),S2*(*)
J = 1
DO 10 I = 1,N-1
S2(J:J+L-1) = S1(1:L)
S2(J+L:J+L) = ','
J = J + L + 1
10 CONTINUE
END
INTEGER FUNCTION INTFC(STRING,LS)
CHARACTER TEMP*10,STRING*(*)
TEMP = STRING(1:LS)
CALL RJST(TEMP)
READ(TEMP,1000,ERR=10)INTFC
1000 FORMAT(I10)
RETURN
10 CONTINUE
INTFC = 0
WRITE(*,*)'Error in encode'
END
SUBROUTINE CACID(ACID,REST,LACID)
CHARACTER ACID*(*),REST*20,ACIDS*20
SAVE ACIDS
DATA ACIDS/'QWERTYIPASDFGHKLCVNM'/
J = 0
DO 10 I = 1,20
DO 5 K = 1,LACID
IF(ACID(K:K).EQ.ACIDS(I:I)) GO TO 9
5 CONTINUE
J = J + 1
REST(J:J) = ACIDS(I:I)
9 CONTINUE
10 CONTINUE
LACID = J
END
LOGICAL FUNCTION ACID(CHAR)
LOGICAL ONEOF
CHARACTER ACIDS*(20),CHAR
SAVE ACIDS
EXTERNAL ONEOF
DATA ACIDS/'QWERTYIPASDFGHKLCVNM'/
ACID = ONEOF(ACIDS,CHAR)
END
LOGICAL FUNCTION NUMBER(CHAR)
LOGICAL ONEOF
CHARACTER DIGITS*(10),CHAR
SAVE DIGITS
EXTERNAL ONEOF
DATA DIGITS/'1234567890'/
NUMBER = ONEOF(DIGITS,CHAR)
END
LOGICAL FUNCTION ONEOF(CHARS,CHAR)
CHARACTER CHARS*(*),CHAR
ONEOF = .FALSE.
IF(INDEX(CHARS,CHAR).NE.0) ONEOF = .TRUE.
END
SUBROUTINE INTRP7(STRING,ISEND,LENGTH,WT,MAXCHR,MAXLEN,
+CUTOFF,IOK)
CHARACTER STRING*(*),TERM
INTEGER WT(MAXCHR,MAXLEN)
INTEGER CTONUM
EXTERNAL CTONUM
PARAMETER (TERM = ',')
IOK = 0
C POINT TO STRING
IS = 1
ICOL = 1
C COUNT FILLED COLUMNS
CUTOFF = 0.
10 CONTINUE
CALL FILLI(WT(1,ICOL),MAXCHR,0)
CUTOFF = CUTOFF + 1.
20 CONTINUE
IF(IS.LE.ISEND)THEN
IF(STRING(IS:IS).NE.TERM)THEN
IROW = CTONUM(STRING(IS:IS))
WT(IROW,ICOL) = 1
IS = IS + 1
GO TO 20
END IF
NC = 1
30 CONTINUE
IF(IS.LE.ISEND)THEN
IF(STRING(IS:IS).EQ.TERM)THEN
ICOL = ICOL + 1
CALL FILLI(WT(1,ICOL),MAXCHR,0)
NC = NC + 1
IS = IS + 1
GO TO 30
END IF
C
C END OF TERMINATORS
C
C IF(ICOL.GT.1) GO TO 10
C ERROR TERMINATOR BEFORE ANY GOOD COLUMNS
C IOK = 1
C RETURN
GO TO 10
END IF
C STRING ENDED WITH TERMINATOR
LENGTH = ICOL - NC + 1
RETURN
END IF
C
C END REACHED WITH NO TERMINATOR (NORMAL)
IF(ICOL.GT.0)THEN
LENGTH = ICOL
IOK = 0
RETURN
END IF
IOK = 1
END
SUBROUTINE WRTSCP(TITLE,LENGTH,MIDDLE,BOT,TOP,IDM,
+SUM,IDEV)
INTEGER TOT(120)
INTEGER SUM(IDM,LENGTH)
CHARACTER CHRSET*22,TITLE*(*)
SAVE CHRSET
DATA CHRSET/'CSTPAGNDEQBZHRKMILVFYW'/
C PROTEIN MATRICES DONT WRITE ROWS FOR -X? AND SPACE SO SET DIMENSION
C TO IDM-4
CALL FILLI(TOT,120,0)
WRITE(IDEV,1018)TITLE
1018 FORMAT(' ',A)
1019 FORMAT(' P',20I4)
1020 FORMAT(' N',20I4)
1021 FORMAT(' ',A,20I4)
1022 FORMAT(' ',2I6,2F10.3)
WRITE(IDEV,1022)LENGTH,MIDDLE,BOT,TOP
NLINES=1+(LENGTH-1)/20
K1=1
DO 400 J=1,NLINES
K2=MIN((K1+19),LENGTH)
WRITE(IDEV,1019)(K,K=K1-MIDDLE,K2-MIDDLE)
WRITE(IDEV,1020)(TOT(K),K=K1,K2)
DO 390 I=1,IDM-4
WRITE(IDEV,1021)CHRSET(I:I),(SUM(I,K),K=K1,K2)
390 CONTINUE
K1=K1+20
IF(K1.GT.LENGTH)K1=LENGTH
400 CONTINUE
CLOSE(UNIT=IDEV)
END