phybema.py

#!/usr/bin/env python3

'''
Main script for calling phybema
author: Birgitta Päuker
'''

import sys, os, shutil
import PHYLIP_format_converter as fc
import ete
import callTreeCmp as treecmp
import callProg
import concatFiles
import write_time
import parser
import create_dir
import print_comparison
from test_requirements import check_req
from estimators import DistanceEstimator, estimator_choices
from root_dir import phybema_root_dir
from dist_matrix import DistanceMatrix
from nejo_idx import NeighborJoining, gt_neighborjoining_compute, tree2newick

ROOT_DIR = phybema_root_dir()

def run_nj(newickfilepath, inputfile):
  dm = DistanceMatrix(inputfile)
  nejo = NeighborJoining(dm)
  gt_neighborjoining_compute(nejo)
  output = list()
  tree2newick(output,nejo,5)
  newick_string = ''.join(output)
  try:
    stream = open(newickfilepath,"w")
  except IOError as err:
    sys.stderr.write('{}: {}\n'.format(sys.argv[0],err))
    exit(1)
  stream.write('{}\n'.format(newick_string))
  stream.close()

# Function for writing the distancefile
def compare(reftree, datafile, datafilename, tcdistmetriclist,
            njconcatfilepath, newickfilepathlist, prognamelist, nrf_metric,
            nrf_list, out_dir, temp_dir):
  # Comparison with reftree
  if reftree:
    # Getting the name of the reftreefile:
    reffilebase = os.path.basename(reftree)
    reffilename = os.path.splitext(reffilebase)[0]

    # Paths for outfiles:
    distoutfilepath = ("{}/{}_ref_{}_dist.txt"
                       .format(out_dir,datafilename, reffilename))
    tcoutfilepath = "{}/{}_Treecmp_out.txt".format(temp_dir,datafilename)

    # Run TreeCmp with the given distmetrics
    if tcdistmetriclist:
      treecmp.run_TreeCmp(tcdistmetriclist, reftree, njconcatfilepath,
                          tcoutfilepath, ROOT_DIR)

    # Write the file with the distances
    print_comparison.print_refcomparison(datafile, reftree, distoutfilepath,
                                prognamelist, tcdistmetriclist, tcoutfilepath,
                                nrf_metric, nrf_list)
  # Comparing each Tree with the others (if more than one tool was selected)
  else:
    if len(prognamelist) > 1:
      # Paths for outfiles:
      distoutfilepath = "{}/{}_dist.txt".format(out_dir,datafilename)
      tcoutfilepath = "{}/{}_TreeCmp_out.txt".format(temp_dir, datafilename)

      # Run TreeCmp with the given distmetrics
      if tcdistmetriclist:
        treecmp.run_TreeCmp(tcdistmetriclist, None, njconcatfilepath,
                            tcoutfilepath, ROOT_DIR)
      if nrf_metric:
        nrf_matrix = ete.compute_nrf_matrix(newickfilepathlist)

      # Write the file with the distances
      print_comparison.print_matrixcomparison(datafile, distoutfilepath,
                                              prognamelist, tcdistmetriclist,
                                              tcoutfilepath,
                                              nrf_metric, nrf_matrix)
    else:
      sys.stderr.write(("# For datafile {} no reference tree was given and not "
                        "enough tools have been selected for a pairwise "
                        "comparison.\n").format(datafile))

args = parser.parse_command_line()
check_req()

# Variabledeclarations:
# list with datafile,reftree tuple
datatuplelist = parser.create_datatuplelist(args.datasetpaths)

# list with the chosen distance metrics for TreeCmp
tcdistmetriclist = []
# list of the valid distance metric names
valid_metrics = ["ms", "nrf", "pd", "qt", "rf"]
# Bool if nrf was chosen as metric
nrf_metric = False
# List for the computation times
timelist = []

'''
Read in the dist metrics:
if given dist metrics is empty use all metrics
'''
tcdistmetriclist = list()
if not args.dist:
  nrf_metric = True
else:
  for met in args.dist:
    # Check if given metric is valid
    if not met in valid_metrics:
      sys.stderr.write("{} is not a valid distance metric\n".format(met))
      exit(1)
    if met == "nrf":
      nrf_metric = True
    else:
      tcdistmetriclist.append(met)

# Generating files for every dataset
for datafile,reftree in datatuplelist:
  sys.stderr.write('# datafile {}\n'.format(datafile))
  path = os.path.normpath(datafile)
  dirlist = path.split(os.sep)
  assert len(dirlist) >= 2
  testset_key = dirlist[-2]
  # Getting the name of the datafile:
  datafilebase = os.path.basename(datafile)
  datafilename = os.path.splitext(datafilebase)[0]

  estimators = estimator_choices(ROOT_DIR,args.tools, datafilename)
  prognamelist = [est.name() for est in estimators]

  # Variabledeclarations: list of the newickfilepaths
  newickfilepathlist = []
  # List with the nrf values (for cases with reftree and nrf)
  nrf_list=[]

  # Paths for the generated files
  temp_dir = "{}/temp".format(ROOT_DIR)
  out_dir = "{}/out".format(ROOT_DIR)
  create_dir.create_dir(temp_dir)
  create_dir.create_dir(out_dir)

  njconcatfilepath = "{}/{}_nj_concatfile.nh".format(temp_dir,testset_key)
  timefilepath = "{}/{}_runtime.txt".format(out_dir,testset_key)

  for estimator in estimators:
    progname = estimator.name()
    call_list = estimator.call_list(datafile)
    fixed_out_file_name = estimator.fixed_out_file_name()

    # Paths for the generated files
    if args.mat:
      progoutfilepath = ("{}/{}_{}.mat"
                         .format(out_dir,testset_key,progname))
    else:
      progoutfilepath = ("{}/{}_{}.mat"
                         .format(temp_dir,testset_key,progname))

    newickfilepath = ("{}/{}_{}.nh"
                      .format(temp_dir,testset_key,progname))
    # program call
    time_taken = callProg.run_prog(progname,call_list,fixed_out_file_name,
                                   progoutfilepath)
    timelist.append((progname,time_taken))

    # Getting a file with the newickstring using neighborjoining
    run_nj(newickfilepath, progoutfilepath)
    newickfilepathlist.append(newickfilepath)

    # Computing Tree from the determined newick file
    if args.out_treefile_suffix != "none":
      outtreefilepath = ("{}/{}_{}_tree.{}"
                        .format(out_dir,testset_key,progname,
                                args.out_treefile_suffix))
      ete.show_tree(newickfilepath, outtreefilepath)

    # Get the normalised robinson foulds distance from ete toolkit,
    # if option was chosen
    if reftree:
      nrf = None
      if nrf_metric:
        nrf = ete.compare_tree(newickfilepath, reftree)
        nrf_list.append(nrf)

  # Concatenate the newickfiles
  if len(newickfilepathlist) > 1:
    concatFiles.concat_files(newickfilepathlist, njconcatfilepath)
  else:
    njconcatfilepath = newickfilepath

  # write the timefile
  write_time.write_time(timefilepath, timelist, testset_key)

  # print the distances and write them into a file
  compare(reftree, datafile, testset_key, tcdistmetriclist,
          njconcatfilepath, newickfilepathlist, prognamelist, nrf_metric,
          nrf_list, out_dir, temp_dir)