tiger/tiger_fns_102.py

#!/usr/bin/env python

import re

# DEFINE FUNCTIONS

def shared(x, y):
    """Count the groups of pattern ``x`` that are contained in a group of ``y``.

    Both arguments are pattern strings: groups separated by "|", members
    of a group separated by ",".  Each group of ``x`` contributes at most
    one to the total, no matter how many groups of ``y`` contain it.
    """
    groups_x = [set(group.split(",")) for group in x.split("|")]
    groups_y = [set(group.split(",")) for group in y.split("|")]

    return sum(
        1
        for gx in groups_x
        if any(gx.issubset(gy) for gy in groups_y)
    )

def scoreConflict(indA, patA, patterns):
    """Return the mean conflict of every other split pattern against ``patA``.

    Patterns are strings of "|"-separated groups, each group being
    ","-separated taxon indices.  The pattern at position ``indA`` of
    ``patterns`` is left out of the comparison; patterns without a "|"
    (a single group) are ignored.

    For each remaining pattern B, a group of B counts as conflicting when
    the group of ``patA`` that holds B's first taxon is missing at least
    one of the other taxa of that B group.  B's contribution is the
    fraction of its groups that conflict, and the result is the mean of
    these fractions over all compared patterns.

    Returns 0.0 when there is no pattern to compare against (previously
    this raised ZeroDivisionError).
    """
    # Compare taxon indices as set members.  Testing against the raw
    # "1,10,12" string is a substring test, so "2" would match "12".
    groups_a = [set(group.split(",")) for group in patA.split("|")]

    others = patterns[:]
    del others[indA]

    score = 0.0
    compared = 0
    for patB in others:
        if "|" not in patB:
            continue
        compared += 1
        groups_b = patB.split("|")
        conflicts = 0.0
        for group in groups_b:
            taxa = group.split(",")
            first, rest = taxa[0], taxa[1:]
            # A group is counted once at most, however many taxa disagree.
            if any(
                first in ga and any(taxon not in ga for taxon in rest)
                for ga in groups_a
            ):
                conflicts += 1.0
        score += conflicts / len(groups_b)

    if compared == 0:
        return 0.0
    return score / compared
    
def uniqify(seq):
    """Return the items of ``seq`` as a list with later duplicates dropped.

    The first occurrence of each item keeps its position.  Items must be
    hashable.
    """
    already_seen = set()
    unique = []
    for item in seq:
        if item not in already_seen:
            already_seen.add(item)
            unique.append(item)
    return unique


def getPattern(site, unknown):
    """Encode an alignment column as a pattern string.

    Positions of ``site`` that hold the same character are put in the same
    group.  Groups appear in order of first occurrence and are joined with
    "|"; the (0-based) positions inside a group are joined with ",".
    Positions whose character is in ``unknown`` are left out entirely.

    Example: ``getPattern("AABA", ["?"])`` gives ``"0,1,3|2"``.
    """
    group_for_state = {}
    groups = []
    for pos, state in enumerate(site):
        if state in unknown:
            continue
        if state in group_for_state:
            group_for_state[state].append(str(pos))
        else:
            # First time this character is seen: it opens a new group.
            members = [str(pos)]
            group_for_state[state] = members
            groups.append(members)

    return "|".join(",".join(members) for members in groups)


def jumblePattern(site, unknown):
    """Return the pattern string of a randomly reordered copy of ``site``.

    Characters are drawn one at a time, without replacement, using
    ``random.randrange``; the reordered site is then passed to
    ``getPattern`` together with ``unknown``.
    """
    import random

    remaining = list(site) if site else []
    drawn = []
    while remaining:
        pick = random.randrange(len(remaining))
        drawn.append(remaining.pop(pick))

    return getPattern("".join(drawn), unknown)


def DNAdetect(seq):
    """Classify ``seq`` as "DNA", "standard" or "protein".

    The sequence is upper-cased first.  It is "DNA" when fewer than 20% of
    its characters are something other than A, C, G or T.  Otherwise it is
    "standard" when it consists only of the characters 0 and 1, and
    "protein" in every other case.

    An empty sequence raises ZeroDivisionError.
    """
    upper_seq = seq.upper()
    non_acgt = sum(1 for ch in upper_seq if ch not in "ACGT")
    percent_other = (float(non_acgt) / float(len(upper_seq))) * 100

    if percent_other < 20.0:
        return "DNA"
    if all(ch in "01" for ch in upper_seq):
        return "standard"
    return "protein"
        

def histogram(num_list, name_list):
    """Print a horizontal text histogram, one row per value.

    Each row has the form ``[<name>|<bar>|<value>]``.  The bar area is 61
    characters wide: a value falling in the p-th of 60 equal-width bins
    between 0 and the largest value gets p "=" characters, padded with
    spaces.  Values that fall in no bin (zero or negative) get an empty
    bar.  The value column is padded to the width of the largest value
    written as a float.

    ``name_list[i]`` labels ``num_list[i]``.  Nothing is printed for an
    empty ``num_list``.
    """
    if len(num_list) == 0:
        return

    upper = float(max(num_list))
    pad = len(str(upper))

    bounds = [(upper / 60) * i for i in range(1, 61)]
    # (upper / 60) * 60 can round to just below ``upper``, which would
    # leave the largest value outside every bin; pin the last edge.
    bounds[-1] = upper

    for m, n in enumerate(num_list):
        bar = " " * 61  # default keeps rows aligned when no bin matches
        low = 0.0
        for p, hi in enumerate(bounds):
            if low < n <= hi:
                bar = "=" * (p + 1) + " " * (60 - p)
                break
            low = hi
        value = str(n)
        print("[" + name_list[m] + "|" + bar + "|" + value
              + " " * (pad - len(value)) + "]")


def FastaParse(file_name):
    """Read a FASTA file and return ``[names, seqs]``.

    ``names`` holds the header of every record (the line with its first
    character removed and surrounding whitespace stripped) and ``seqs``
    the matching sequences, with the lines of a record concatenated.
    Any line containing ">" is treated as a header.  Blank lines are
    skipped, so a blank line before the first header no longer raises.

    Raises IndexError if sequence data appears before the first header.
    """
    names = []
    chunks = []  # one list of sequence lines per record
    # The context manager closes the file even if parsing fails.
    with open(file_name) as handle:
        for line in handle:
            if ">" in line:
                names.append(line[1:].strip())
                chunks.append([])
                continue
            residues = line.strip()
            if residues:
                chunks[-1].append(residues)

    seqs = ["".join(parts) for parts in chunks]
    return [names, seqs]


def printHelp():
    """Print the TIGER command-line help text to standard output."""
    print("""
****************
TIGER Help:
****************

TIGER: Tree-Independent Generation of Evolutionary Rates

(Developed by Carla Cummins in the lab of James Mc Inerney, NUI Maynooth, Co. Kildare, Ireland)

-Options:

-in        Specify input file. File must be in FastA format and must be aligned prior.
           Datasets with uneven sequence lengths will return an error.

-v         Returns current TIGER version.

-f         Changes output formatting options.
           -f s: sorts sites depending on their agreement score
           -f r: displays rank values rather than bin numbers
           -f s,r: displays sorted ranks (*Be sure to put only a "," NO SPACE!)
           
           Default prints bin numbers unsorted.

-b         Set the number of bins to be used.
           -b <int>: Sites will be placed into <int> number of bins. <int> is a whole number.

           Default is 10

-rl       A list of the rate at each site may be optionally written to a specified
          file. 
          -rl <file.txt> : writes list of the rates at each site to file.txt.

-ptp      Specifies that a PTP test should be run. *Note: this option has a huge 
          effect on running time!

-z        Number of randomisations to be used for the PTP test. 
          -z <int>: each site will be randomised <int> times. <int> is a whole number.

          Default is 100

-p       Specify p-value which denotes significance in PTP test.
         -p <float>: site will be denoted as significant if p-value is better than <float>.
                     <float> is a floating point number.

         Default is 0.05

-pl      Write a list of p-values to a specified file.
         -pl <file.txt>: writes list of p-values for each site to file.txt.

-u       Specify unknown characters in the alignment. Unknown characters are omitted from 
         site patterns and so are not considered in the analysis.
         -u ?,-,*: defines ?, - and * as unknown characters. (*Be sure to put only a comma
                   between characters, NO SPACE!!)
         
         Default is ? only


-Examples:
     
1.   ./TIGER -in ExampleFile.aln -f s,r -v -rl rate_list.txt

     This will run the software on "ExampleFile.aln", with sorted ranks included in the output.
     The variability measure for each site will be displayed and a list of the rates at (unsorted)
     sites will be written to the file "rate_list.txt".
  
2.   ./TIGER -in ExampleFile.aln -ptp -z 1000 -p 0.01 -u ?,*

     This will run the software on the file "ExampleFile.aln" with a PTP test. Sites will be
     randomised 1,000 times and pass the test if their p-value is <0.01. All ? and * characters
     encountered in the alignment will be omitted from the analysis.
   
     """)
add source 2022-01-30 09:07:09 +08:00			`#!/usr/bin/env python`

			`import re`

			`# DEFINE FUNCTIONS`

			`def shared(x, y):`
			`sh = 0`
			`grX = x.split("\|")`
			`grY = y.split("\|")`

			`for i in range(len(grX)):`
			`grX[i] = set(grX[i].split(","))`
			`for j in range(len(grY)):`
			`grY[j] = set(grY[j].split(","))`


			`for xXx in grX:`
			`for yYy in grY:`
			`if xXx.issubset(yYy):`
			`sh += 1`
			`break`

			`return sh`

			`def scoreConflict(indA, patA, patterns):`
			`vrai = True`

			`dividand = 0`

			`pg = patA.split("\|")`
			`score = 0.0`
			`pats2 = patterns[:]`
			`overall_rank = 0.0`
			`conflict_score = 0.0`
			`del pats2[indA]`
			`for bb, patB in enumerate(pats2):`
			`if re.search("\\|", patB):`
			`dividand += 1`
			`compB = patB.split("\|")`
			`scB = 0.0`
			`for group in compB:`
			`cont = 1`
			`if vrai:`
			`tax = group.split(",")`
			`ch1 = tax[0]`
			`for gp in pg:`
			`if ch1 in gp:`
			`for ch2 in tax[1:]:`
			`if ch2 not in gp and cont:`
			`scB = scB + 1.0`
			`cont = 0`
			`score += (scB/len(compB))`

			`conflict_score = score/dividand`

			`return conflict_score`

			`def uniqify(seq):`
			`seen = {}`
			`result = []`
			`for item in seq:`
			`if item in seen : continue`
			`seen[item] = 1`
			`result.append(item)`
			`return result`


			`def getPattern(site, unknown):`
			`considered = []`
			`pattern = []`
			`for x in range(len(site)):`
			`if site[x] not in unknown:`
			`if site[x] in considered:`
update to support python3 2022-02-06 04:34:05 +08:00			`pattern[considered.index(site[x])].append(str(x))`
			`else:`
			`considered.append(site[x])`
			`pattern.append([str(x)])`
add source 2022-01-30 09:07:09 +08:00

			`patStr = "\|".join([",".join(g) for g in pattern])`

			`return patStr`


			`def jumblePattern(site, unknown):`
			`import random`

			`siteJ = ""`
			`while site:`
			`pos = random.randrange(len(site))`
			`siteJ += site[pos]`
			`site = site[:pos] + site[(pos+1):]`

			`return getPattern(siteJ, unknown)`


			`def DNAdetect(seq):`
			`seq = seq.upper()`
			`oLen = float(len(seq))`
			`seq_C = ""`
update to support python3 2022-02-06 04:34:05 +08:00
add source 2022-01-30 09:07:09 +08:00			`seq_C = seq.replace("A", "")`
			`seq_C = seq_C.replace("C", "")`
			`seq_C = seq_C.replace("G", "")`
			`seq_C = seq_C.replace("T", "")`

			`nLen = float(len(seq_C))`
			`perc = (nLen/oLen)*100`

			`if perc < 20.0:`
			`return "DNA"`
			`else:`
			`seq_C = seq.replace("0", "")`
			`seq_C = seq_C.replace("1", "")`
			`if len(seq_C) == 0:`
			`return "standard"`
			`else:`
			`return "protein"`


			`def histogram(num_list, name_list):`
			`upper = float(max(num_list))`
			`pad = len(str(upper))`
			`parts = []`
			`for i in range(1,61):`
			`parts.append((upper/60)*i)`

			`for m, n in enumerate(num_list):`
			`pr = name_list[m] + "\|"`
			`low = 0.0`
			`if n == 0:`
			`pr = pr + " "*61`
			`for p, hi in enumerate(parts):`
			`if n > low and n <= hi:`
			`pr = pr + "="(p+1) + (" "(60 - p))`
			`break`
			`low = hi`
update to support python3 2022-02-06 04:34:05 +08:00			`print ("[" + pr + "\|" + str(n) + " "*(pad-len(str(n))) + "]")`
add source 2022-01-30 09:07:09 +08:00

			`def FastaParse(file_name):`
			`file = open(file_name)`
			`names = []`
			`seqs = []`
			`for line in file:`
			`if ">" in line:`
			`names.append(line[1:].strip())`
			`seqs.append("")`
			`else:`
			`seqs[-1] += line.strip()`

			`ret = [names, seqs]`
			`return ret`


			`def printHelp():`
update to support python3 2022-02-06 04:34:05 +08:00			`print ("""`
add source 2022-01-30 09:07:09 +08:00			`****************`
			`TIGER Help:`
			`****************`

			`TIGER: Tree-Independent Generation of Evolutionary Rates`

			`(Developed by Carla Cummins in the lab of James Mc Inerney, NUI Maynooth, Co. Kildare, Ireland)`

			`-Options:`

			`-in Specify input file. File must be in FastA format and must be aligned prior.`
			`Datasets with uneven sequence lengths will return an error.`

			`-v Returns current TIGER version.`

			`-f Changes output formatting options.`
			`-f s: sorts sites depending on their agreement score`
			`-f r: displays rank values rather than bin numbers`
			`-f s,r: displays sorted ranks (*Be sure to put only a "," NO SPACE!)`

			`Default prints bin numbers unsorted.`

			`-b Set the number of bins to be used.`
			`-b <int>: Sites will be placed into <int> number of bins. <int> is a whole number.`

			`Default is 10`

			`-rl A list of the rate at each site may be optionally written to a specified`
			`file.`
			`-rl <file.txt> : writes list of the rates at each site to file.txt.`

			`-ptp Specifies that a PTP test should be run. *Note: this option has a huge`
			`effect on running time!`

			`-z Number of randomisations to be used for the PTP test.`
			`-z <int>: each site will be randomised <int> times. <int> is a whole number.`

			`Default is 100`

			`-p Specify p-value which denotes significance in PTP test.`
			`-p <float>: site will be denoted as significant if p-value is better than <float>.`
			`<float> is a floating point number.`

			`Default is 0.05`

			`-pl Write a list of p-values to a specified file.`
			`-pl <file.txt>: writes list of p-values for each site to file.txt.`

			`-u Specify unknown characters in the alignment. Unknown characters are omitted from`
			`site patterns and so are not considered in the analysis.`
			`-u ?,-,: defines ?, - and as unknown characters. (*Be sure to put only a comma`
			`between characters, NO SPACE!!)`

			`Default is ? only`


			`-Examples:`

			`1. ./TIGER -in ExampleFile.aln -f s,r -v -rl rate_list.txt`

			`This will run the software on "ExampleFile.aln", with sorted ranks included in the output.`
			`The variability measure for each site will be displayed and a list of the rates at (unsorted)`
			`sites will be written to the file "rate_list.txt".`

			`2. ./TIGER -in ExampleFile.aln -ptp -r 1000 -p 0.01 -u ?,*`

			`This will run the software on the file "ExampleFile.aln" with a PTP test. Sites will be`
			`randomised 1,000 times and pass the test if their p-value is <0.01. All ? and * characters`
			`encountered in the alignment will be ommitted from the analysis.`

update to support python3 2022-02-06 04:34:05 +08:00			`""")`