tiger/tiger_fns_102.py

230 lines
6.0 KiB
Python

#!/usr/bin/env python
import re
# DEFINE FUNCTIONS
def shared(x, y):
sh = 0
grX = x.split("|")
grY = y.split("|")
for i in range(len(grX)):
grX[i] = set(grX[i].split(","))
for j in range(len(grY)):
grY[j] = set(grY[j].split(","))
for xXx in grX:
for yYy in grY:
if xXx.issubset(yYy):
sh += 1
break
return sh
def scoreConflict(indA, patA, patterns):
vrai = True
dividand = 0
pg = patA.split("|")
score = 0.0
pats2 = patterns[:]
overall_rank = 0.0
conflict_score = 0.0
del pats2[indA]
for bb, patB in enumerate(pats2):
if re.search("\|", patB):
dividand += 1
compB = patB.split("|")
scB = 0.0
for group in compB:
cont = 1
if vrai:
tax = group.split(",")
ch1 = tax[0]
for gp in pg:
if ch1 in gp:
for ch2 in tax[1:]:
if ch2 not in gp and cont:
scB = scB + 1.0
cont = 0
score += (scB/len(compB))
conflict_score = score/dividand
return conflict_score
def uniqify(seq):
seen = {}
result = []
for item in seq:
if item in seen : continue
seen[item] = 1
result.append(item)
return result
def getPattern(site, unknown):
considered = []
pattern = []
for x in range(len(site)):
if site[x] not in unknown:
if site[x] in considered:
pattern[considered.index(site[x])].append(str(x))
else:
considered.append(site[x])
pattern.append([str(x)])
patStr = "|".join([",".join(g) for g in pattern])
return patStr
def jumblePattern(site, unknown):
import random
siteJ = ""
while site:
pos = random.randrange(len(site))
siteJ += site[pos]
site = site[:pos] + site[(pos+1):]
return getPattern(siteJ, unknown)
def DNAdetect(seq):
seq = seq.upper()
oLen = float(len(seq))
seq_C = ""
seq_C = seq.replace("A", "")
seq_C = seq_C.replace("C", "")
seq_C = seq_C.replace("G", "")
seq_C = seq_C.replace("T", "")
nLen = float(len(seq_C))
perc = (nLen/oLen)*100
if perc < 20.0:
return "DNA"
else:
seq_C = seq.replace("0", "")
seq_C = seq_C.replace("1", "")
if len(seq_C) == 0:
return "standard"
else:
return "protein"
def histogram(num_list, name_list):
upper = float(max(num_list))
pad = len(str(upper))
parts = []
for i in range(1,61):
parts.append((upper/60)*i)
for m, n in enumerate(num_list):
pr = name_list[m] + "|"
low = 0.0
if n == 0:
pr = pr + " "*61
for p, hi in enumerate(parts):
if n > low and n <= hi:
pr = pr + "="*(p+1) + (" "*(60 - p))
break
low = hi
print ("[" + pr + "|" + str(n) + " "*(pad-len(str(n))) + "]")
def FastaParse(file_name):
file = open(file_name)
names = []
seqs = []
for line in file:
if ">" in line:
names.append(line[1:].strip())
seqs.append("")
else:
seqs[-1] += line.strip()
ret = [names, seqs]
return ret
def printHelp():
print ("""
****************
TIGER Help:
****************
TIGER: Tree-Independent Generation of Evolutionary Rates
(Developed by Carla Cummins in the lab of James Mc Inerney, NUI Maynooth, Co. Kildare, Ireland)
-Options:
-in Specify input file. File must be in FastA format and must be aligned prior.
Datasets with uneven sequence lengths will return an error.
-v Returns current TIGER version.
-f Changes output formatting options.
-f s: sorts sites depending on their agreement score
-f r: displays rank values rather than bin numbers
-f s,r: displays sorted ranks (*Be sure to put only a "," NO SPACE!)
Default prints bin numbers unsorted.
-b Set the number of bins to be used.
-b <int>: Sites will be placed into <int> number of bins. <int> is a whole number.
Default is 10
-rl A list of the rate at each site may be optionally written to a specified
file.
-rl <file.txt> : writes list of the rates at each site to file.txt.
-ptp Specifies that a PTP test should be run. *Note: this option has a huge
effect on running time!
-z Number of randomisations to be used for the PTP test.
-z <int>: each site will be randomised <int> times. <int> is a whole number.
Default is 100
-p Specify p-value which denotes significance in PTP test.
-p <float>: site will be denoted as significant if p-value is better than <float>.
<float> is a floating point number.
Default is 0.05
-pl Write a list of p-values to a specified file.
-pl <file.txt>: writes list of p-values for each site to file.txt.
-u Specify unknown characters in the alignment. Unknown characters are omitted from
site patterns and so are not considered in the analysis.
-u ?,-,*: defines ?, - and * as unknown characters. (*Be sure to put only a comma
between characters, NO SPACE!!)
Default is ? only
-Examples:
1. ./TIGER -in ExampleFile.aln -f s,r -v -rl rate_list.txt
This will run the software on "ExampleFile.aln", with sorted ranks included in the output.
The variability measure for each site will be displayed and a list of the rates at (unsorted)
sites will be written to the file "rate_list.txt".
2. ./TIGER -in ExampleFile.aln -ptp -r 1000 -p 0.01 -u ?,*
This will run the software on the file "ExampleFile.aln" with a PTP test. Sites will be
randomised 1,000 times and pass the test if their p-value is <0.01. All ? and * characters
encountered in the alignment will be ommitted from the analysis.
""")