Select Git revision
basics.component.scss
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
phybema.py 6.80 KiB
#!/usr/bin/env python3
'''
Main script for calling phybema
author: Birgitta Päuker
'''
import sys, os, shutil
import PHYLIP_format_converter as fc
import ete
import callTreeCmp as treecmp
import callProg
import concatFiles
import write_time
import parser
import create_dir
import print_comparison
from test_requirements import check_req
from estimators import DistanceEstimator, estimator_choices
from root_dir import phybema_root_dir
from dist_matrix import DistanceMatrix
from nejo_idx import NeighborJoining, gt_neighborjoining_compute, tree2newick
ROOT_DIR = phybema_root_dir()
def run_nj(newickfilepath, inputfile):
dm = DistanceMatrix(inputfile)
nejo = NeighborJoining(dm)
gt_neighborjoining_compute(nejo)
output = list()
tree2newick(output,nejo,5)
newick_string = ''.join(output)
try:
stream = open(newickfilepath,"w")
except IOError as err:
sys.stderr.write('{}: {}\n'.format(sys.argv[0],err))
exit(1)
stream.write('{}\n'.format(newick_string))
stream.close()
# Function for writing the distancefile
def compare(reftree, datafile, datafilename, tcdistmetriclist,
njconcatfilepath, newickfilepathlist, prognamelist, nrf_metric,
nrf_list, out_dir, temp_dir):
# Comparison with reftree
if reftree:
# Getting the name of the reftreefile:
reffilebase = os.path.basename(reftree)
reffilename = os.path.splitext(reffilebase)[0]
# Paths for outfiles:
distoutfilepath = ("{}/{}_ref_{}_dist.txt"
.format(out_dir,datafilename, reffilename))
tcoutfilepath = "{}/{}_Treecmp_out.txt".format(temp_dir,datafilename)
# Run TreeCmp with the given distmetrics
if tcdistmetriclist:
treecmp.run_TreeCmp(tcdistmetriclist, reftree, njconcatfilepath,
tcoutfilepath, ROOT_DIR)
# Write the file with the distances
print_comparison.print_refcomparison(datafile, reftree, distoutfilepath,
prognamelist, tcdistmetriclist, tcoutfilepath,
nrf_metric, nrf_list)
# Comparing each Tree with the others (if more than one tool was selected)
else:
if len(prognamelist) > 1:
# Paths for outfiles:
distoutfilepath = "{}/{}_dist.txt".format(out_dir,datafilename)
tcoutfilepath = "{}/{}_TreeCmp_out.txt".format(temp_dir, datafilename)
# Run TreeCmp with the given distmetrics
if tcdistmetriclist:
treecmp.run_TreeCmp(tcdistmetriclist, None, njconcatfilepath,
tcoutfilepath, ROOT_DIR)
if nrf_metric:
nrf_matrix = ete.compute_nrf_matrix(newickfilepathlist)
# Write the file with the distances
print_comparison.print_matrixcomparison(datafile, distoutfilepath,
prognamelist, tcdistmetriclist,
tcoutfilepath,
nrf_metric, nrf_matrix)
else:
sys.stderr.write(("# For datafile {} no reference tree was given and not "
"enough tools have been selected for a pairwise "
"comparison.\n").format(datafile))
args = parser.parse_command_line()
check_req()
# Variabledeclarations:
# list with datafile,reftree tuple
datatuplelist = parser.create_datatuplelist(args.datasetpaths)
# list with the chosen distance metrics for TreeCmp
tcdistmetriclist = []
# list of the valid distance metric names
valid_metrics = ["ms", "nrf", "pd", "qt", "rf"]
# Bool if nrf was chosen as metric
nrf_metric = False
# List for the computation times
timelist = []
'''
Read in the dist metrics:
if given dist metrics is empty use all metrics
'''
tcdistmetriclist = list()
if not args.dist:
nrf_metric = True
else:
for met in args.dist:
# Check if given metric is valid
if not met in valid_metrics:
sys.stderr.write("{} is not a valid distance metric\n".format(met))
exit(1)
if met == "nrf":
nrf_metric = True
else:
tcdistmetriclist.append(met)
# Generating files for every dataset
for datafile,reftree in datatuplelist:
sys.stderr.write('# datafile {}\n'.format(datafile))
path = os.path.normpath(datafile)
dirlist = path.split(os.sep)
assert len(dirlist) >= 2
testset_key = dirlist[-2]
# Getting the name of the datafile:
datafilebase = os.path.basename(datafile)
datafilename = os.path.splitext(datafilebase)[0]
estimators = estimator_choices(ROOT_DIR,args.tools, datafilename)
prognamelist = [est.name() for est in estimators]
# Variabledeclarations: list of the newickfilepaths
newickfilepathlist = []
# List with the nrf values (for cases with reftree and nrf)
nrf_list=[]
# Paths for the generated files
temp_dir = "{}/temp".format(ROOT_DIR)
out_dir = "{}/out".format(ROOT_DIR)
create_dir.create_dir(temp_dir)
create_dir.create_dir(out_dir)
njconcatfilepath = "{}/{}_nj_concatfile.nh".format(temp_dir,testset_key)
timefilepath = "{}/{}_runtime.txt".format(out_dir,testset_key)
for estimator in estimators:
progname = estimator.name()
call_list = estimator.call_list(datafile)
fixed_out_file_name = estimator.fixed_out_file_name()
# Paths for the generated files
if args.mat:
progoutfilepath = ("{}/{}_{}.mat"
.format(out_dir,testset_key,progname))
else:
progoutfilepath = ("{}/{}_{}.mat"
.format(temp_dir,testset_key,progname))
newickfilepath = ("{}/{}_{}.nh"
.format(temp_dir,testset_key,progname))
# program call
time_taken = callProg.run_prog(progname,call_list,fixed_out_file_name,
progoutfilepath)
timelist.append((progname,time_taken))
# Getting a file with the newickstring using neighborjoining
run_nj(newickfilepath, progoutfilepath)
newickfilepathlist.append(newickfilepath)
# Computing Tree from the determined newick file
if args.out_treefile_suffix != "none":
outtreefilepath = ("{}/{}_{}_tree.{}"
.format(out_dir,testset_key,progname,
args.out_treefile_suffix))
ete.show_tree(newickfilepath, outtreefilepath)
# Get the normalised robinson foulds distance from ete toolkit,
# if option was chosen
if reftree:
nrf = None
if nrf_metric:
nrf = ete.compare_tree(newickfilepath, reftree)
nrf_list.append(nrf)
# Concatenate the newickfiles
if len(newickfilepathlist) > 1:
concatFiles.concat_files(newickfilepathlist, njconcatfilepath)
else:
njconcatfilepath = newickfilepath
# write the timefile
write_time.write_time(timefilepath, timelist, testset_key)
# print the distances and write them into a file
compare(reftree, datafile, testset_key, tcdistmetriclist,
njconcatfilepath, newickfilepathlist, prognamelist, nrf_metric,
nrf_list, out_dir, temp_dir)