tiger/tiger

381 lines
9.7 KiB
Python

#!/usr/bin/env python
#*********TIGER v1.02*************
#...read in options...
import time
lt_S = time.localtime()
import sys
import random
import re
from tiger_fns_102 import *
options = sys.argv
formRate = 0
formSort = 0
binNo = 10
file = ""
rate_file = ""
write_rates = False
numbered = False
doPTP = False
rands = 100
pval = 0.05
write_pvals = False
pval_file = ""
unknown = ["?"]
if len(options) < 2:
printHelp()
sys.exit(0)
for opt in range(len(options)):
if options[opt] == "-in":
file_name = options[opt + 1]
try:
file = open(file_name)
except IOError:
print ("File \"" + file_name + "\" not found...")
sys.exit(0)
elif options[opt] == "-b":
binNo = int(options[opt + 1])
elif options[opt] == "-f":
formOpts = options[opt + 1].split(",")
if "r" in formOpts:
formRate = 1
if "s" in formOpts:
formSort = 1
if "c" in formOpts:
numbered = True
elif options[opt] == "-v":
print ("TIGER version 1.02")
sys.exit(0)
elif options[opt] == "-rl":
write_rates = True
try:
rate_file = options[opt+1]
if rate_file[0] == "-":
print ("\n\nPlease specify file name for -rl option.\n\n")
sys.exit(0)
except IndexError:
print ("\n\nPlease specify file name for -rl option.\n\n")
sys.exit(0)
elif options[opt] == "-pl":
write_pvals = True
try:
pval_file = options[opt+1]
if pval_file[0] == "-":
print ("\n\nPlease specify file name for -pl option.\n\n")
sys.exit(0)
except IndexError:
print ("\n\nPlease specify file name for -pl option.\n\n")
sys.exit(0)
elif options[opt] == "-ptp":
doPTP = True
elif options[opt] == "-z":
rands = int(options[opt+1])
elif options[opt] == "-p":
pval = float(options[opt+1])
elif options[opt] == "-u":
unknown = options[opt+1].split(",")
if not file:
print ("No file specified (-in option)")
sys.exit(0)
#Parse .aln file
names = []
seqs = []
if ">" in file.readline():
tmp = FastaParse(file_name)
else:
print ("""
*******************************
File not in correct format!
TIGER accepts FastA format.
*******************************
""")
sys.exit(0)
import re
names = tmp[0]
for x, nm in (enumerate(names)):
names[x] = re.sub(" ", "", nm)
seqs = tmp[1]
lns = [len(l) for l in seqs]
lns.sort()
if lns[0] != lns[-1]:
print ("\n\nUneven sequence lengths. Ensure sequences have been aligned!\n\n")
sys.exit(0)
datatype = DNAdetect(seqs[0])
#Create array of site patterns
patterns = []
sites = []
comp_Pats = []
comp_Sites = []
for i in range(len(seqs[0])):
sites.append("".join([j[i] for j in seqs]))
patterns.append(getPattern(sites[-1], unknown))
if patterns[-1] not in comp_Pats:
comp_Pats.append(patterns[-1])
comp_Sites.append(sites[-1])
#Compare character patterns and score
ranks = list(range(len(patterns))) #ranks = range(len(patterns))
comp_ranks = []
keep = []
br = ""
if doPTP:
comp_disagree = []
disagree = list(range(len(patterns))) #disagree = range(len(patterns))
for aa, patA in enumerate(comp_Pats):
if re.search("\|", patA):
cfl = scoreConflict(aa, patA, patterns)
comp_ranks.append(1.0 - cfl)
else:
comp_ranks.append(1.0)
if doPTP:
dis = 0
for z in range(rands):
patJ = jumblePattern(comp_Sites[aa], unknown)
scr = scoreConflict(aa, patJ, patterns)
#check if site has higher conflict score than original site
if scr >= (1.0 - comp_ranks[aa]):
dis += 1
dis = float(dis)/float(rands)
comp_disagree.append(dis)
i = -1
try:
while 1:
i = patterns.index(patA, i + 1)
ranks[i] = comp_ranks[aa]
if doPTP:
disagree[i] = comp_disagree[aa]
except ValueError:
pass
#Output total alignment score to file alnRate.txt
if write_rates:
handle = open(rate_file, 'w')
handle.write("\n".join([str(r) for r in ranks]))
handle.close()
if write_pvals and not doPTP:
outF.write("-ptp option not selected. No p-values to write!")
#Find significant sites
if doPTP:
sig_dis = []
for x in range(len(disagree)):
if disagree[x] < pval:
sig_dis.append(x+1)
if write_pvals:
outF = open(pval_file, 'w')
for d in range(len(disagree)):
outF.write(str(disagree[d]))
if (d+1) in sig_dis:
outF.write(" *")
outF.write("\n")
#BINNING
maxRank = max(ranks)
minRank = min(ranks)
binStr = []
for numb, r in enumerate(ranks):
if 1 > 0:
binParts = []
for mult in range(binNo + 1):
binParts.append(minRank + (((maxRank - minRank)/binNo)*mult))
binParts.reverse()
for bin in range(1, len(binParts)):
if r < binParts[bin - 1] and r >= binParts[bin]:
binStr.append(str(bin))
break
elif round(r, 5) == round(maxRank, 5):
binStr.append("1")
break
#Make names equal lengths
filler = " "*20
filled_names = names[:]
for n, nm in enumerate(names):
nm_r = re.sub("[\s]+", "_", nm)
if len(nm_r) > 20:
filled_names[n] = nm_r[1] + "_" + nm_r[-18:]
else:
filled_names[n] = nm_r+ filler[:(20 -len(nm_r))]
#print in correct format.... (
print ("#NEXUS\n\n[This file contains data that has been analysed for site specific rates]")
print ("[using TIGER, developed by Carla Cummins in the laboratory of]")
print ("[Dr James McInerney, National University of Ireland, Maynooth]\n\n")
print ("[Histograms of number of sites in each category:]")
Hnames = []
counts = []
for b in range(binNo):
Hnames.append("Bin" + str(b+1) + "\t")
counts.append(binStr.count(str(b+1)))
histogram(counts, Hnames)
print ("\n\n")
print ("\n\n\nBEGIN TAXA;")
print ("\tDimensions NTax = "), len(seqs), ";"
print ("\tTaxLabels "), " ".join(filled_names), ";\nEND;\n"
print ("BEGIN CHARACTERS;")
print ("\tDimensions nchar = "), len(seqs[0]), ";"
print ("\tFormat datatype = "), datatype, " gap = - interleave;\nMatrix\n"
sorted = list(range(len(ranks))) #sorted = range(len(seqs[0]))
if formSort == 1:
sorted = range(len(ranks))
sr = ranks[:]
sr.sort()
sr.reverse()
s = 0
sortD = {}
for x in range(len(ranks)):
ind = sr.index(ranks[x])
sortD[x] = ind
sorted[ind] = x
sr[ind] = "|"
print (sortD)
if doPTP:
sig_sorted = []
for d in sig_dis:
sig_sorted.append(sortD[d-1]+1)
sig_dis = sig_sorted
for xy in range(0, len(seqs[0]), 60):
for xz in range(len(seqs)):
ln = filled_names[xz] + "\t"
if len(seqs[xz][xy:]) < 60:
for i in sorted[xy:]:
ln = ln + seqs[xz][i].upper()
else:
for j in sorted[xy:xy+60]:
ln = ln + seqs[xz][j].upper()
print (ln)
if formRate == 0:
for x in range(len(str(binNo))):
if x == 0:
bnls = "[Bin Numbers \t"
else:
bnls = "[" + filler + "\t"
for y in range(xy, (xy + 60)):
if y < len(binStr):
if len(binStr[sorted[y]]) - 1 < x:
bnls = bnls + " "
else:
s = sorted[y]
bnls = bnls + str(binStr[s])[x]
else:
break
print (bnls + "]")
else:
for c in range(5):
if c == 0:
rtls = "[Rank Values \t"
else:
rtls = "[" + filler + "\t"
for d in range(xy, (xy + 60)):
if d < len(ranks):
if len(str(ranks[sorted[d]])) - 1 < c:
rtls = rtls + " "
else:
rtls = rtls + str(ranks[sorted[d]])[c]
else:
break
print (rtls + "]")
if numbered:
digits = len(str(len(sorted)))
srted = sorted[:]
for sn, sv in enumerate(srted):
if len(str(sv)) < digits:
srted[sn] = str(sv) + " "*(digits-len(str(sv)))
for c in range(digits):
if c == 0:
colnms = "[Column Numbers \t"
else:
colnms = "[" + filler + "\t"
nms = [str(n)[c] for n in srted[xy:xy+60]]
colnms += "".join(nms)
print (colnms + "]")
print ("\n")
print ("\n")
print (";\nEND;\n\nBEGIN PAUP;")
if formSort:
lower_bound = 1
for c in range(1, binNo + 1):
amt = binStr.count(str(c))
if amt > 0:
charset = "\tCharset Bin" + str(c) + " = "
upper_bound = lower_bound + (binStr.count(str(c))) - 1
charset = charset + str(lower_bound) + "-" + str(upper_bound) + ";"
lower_bound = upper_bound + 1
print (charset)
else:
for x in range(1, binNo + 1):
tmpL = []
if str(x) in binStr:
for y in range(len(binStr)):
if str(binStr[y]) == str(x):
tmpL.append(str(y + 1))
print ("\tCharset Bin" + str(x) + " = ", " ".join(tmpL) + ";")
if doPTP:
print ("\tCharset Sig_Disagreement = " + " ".join([str(i) for i in sig_dis]) + ";")
print ("END;")
lt_F = time.localtime()
print ("[START TIME:", lt_S[3], ":", lt_S[4],":", lt_S[5], "]")
print ("[FINISH TIME:", lt_F[3], ":", lt_F[4],":", lt_F[5], "]")