Skip to content
Snippets Groups Projects
Select Git revision
  • 0c1a38446a1f2ff704534f5bdcf80c071e34e5b7
  • main default
2 results

demo_final.md

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    phybema.py 6.80 KiB
    #!/usr/bin/env python3
    
    '''
    Main script for calling phybema
    author: Birgitta Päuker
    '''
    
    import sys, os, shutil
    import PHYLIP_format_converter as fc
    import ete
    import callTreeCmp as treecmp
    import callProg
    import concatFiles
    import write_time
    import parser
    import create_dir
    import print_comparison
    from test_requirements import check_req
    from estimators import DistanceEstimator, estimator_choices
    from root_dir import phybema_root_dir
    from dist_matrix import DistanceMatrix
    from nejo_idx import NeighborJoining, gt_neighborjoining_compute, tree2newick
    
    ROOT_DIR = phybema_root_dir()
    
    def run_nj(newickfilepath, inputfile):
      dm = DistanceMatrix(inputfile)
      nejo = NeighborJoining(dm)
      gt_neighborjoining_compute(nejo)
      output = list()
      tree2newick(output,nejo,5)
      newick_string = ''.join(output)
      try:
        stream = open(newickfilepath,"w")
      except IOError as err:
        sys.stderr.write('{}: {}\n'.format(sys.argv[0],err))
        exit(1)
      stream.write('{}\n'.format(newick_string))
      stream.close()
    
    # Function for writing the distancefile
    def compare(reftree, datafile, datafilename, tcdistmetriclist,
                njconcatfilepath, newickfilepathlist, prognamelist, nrf_metric,
                nrf_list, out_dir, temp_dir):
      # Comparison with reftree
      if reftree:
        # Getting the name of the reftreefile:
        reffilebase = os.path.basename(reftree)
        reffilename = os.path.splitext(reffilebase)[0]
    
        # Paths for outfiles:
        distoutfilepath = ("{}/{}_ref_{}_dist.txt"
                           .format(out_dir,datafilename, reffilename))
        tcoutfilepath = "{}/{}_Treecmp_out.txt".format(temp_dir,datafilename)
    
        # Run TreeCmp with the given distmetrics
        if tcdistmetriclist:
          treecmp.run_TreeCmp(tcdistmetriclist, reftree, njconcatfilepath,
                              tcoutfilepath, ROOT_DIR)
    
        # Write the file with the distances
        print_comparison.print_refcomparison(datafile, reftree, distoutfilepath,
                                    prognamelist, tcdistmetriclist, tcoutfilepath,
                                    nrf_metric, nrf_list)
      # Comparing each Tree with the others (if more than one tool was selected)
      else:
        if len(prognamelist) > 1:
          # Paths for outfiles:
          distoutfilepath = "{}/{}_dist.txt".format(out_dir,datafilename)
          tcoutfilepath = "{}/{}_TreeCmp_out.txt".format(temp_dir, datafilename)
    
          # Run TreeCmp with the given distmetrics
          if tcdistmetriclist:
            treecmp.run_TreeCmp(tcdistmetriclist, None, njconcatfilepath,
                                tcoutfilepath, ROOT_DIR)
          if nrf_metric:
            nrf_matrix = ete.compute_nrf_matrix(newickfilepathlist)
    
          # Write the file with the distances
          print_comparison.print_matrixcomparison(datafile, distoutfilepath,
                                                  prognamelist, tcdistmetriclist,
                                                  tcoutfilepath,
                                                  nrf_metric, nrf_matrix)
        else:
          sys.stderr.write(("# For datafile {} no reference tree was given and not "
                            "enough tools have been selected for a pairwise "
                            "comparison.\n").format(datafile))
    
    args = parser.parse_command_line()
    check_req()
    
    # Variabledeclarations:
    # list with datafile,reftree tuple
    datatuplelist = parser.create_datatuplelist(args.datasetpaths)
    
    # list with the chosen distance metrics for TreeCmp
    tcdistmetriclist = []
    # list of the valid distance metric names
    valid_metrics = ["ms", "nrf", "pd", "qt", "rf"]
    # Bool if nrf was chosen as metric
    nrf_metric = False
    # List for the computation times
    timelist = []
    
    '''
    Read in the dist metrics:
    if given dist metrics is empty use all metrics
    '''
    tcdistmetriclist = list()
    if not args.dist:
      nrf_metric = True
    else:
      for met in args.dist:
        # Check if given metric is valid
        if not met in valid_metrics:
          sys.stderr.write("{} is not a valid distance metric\n".format(met))
          exit(1)
        if met == "nrf":
          nrf_metric = True
        else:
          tcdistmetriclist.append(met)
    
    # Generating files for every dataset
    for datafile,reftree in datatuplelist:
      sys.stderr.write('# datafile {}\n'.format(datafile))
      path = os.path.normpath(datafile)
      dirlist = path.split(os.sep)
      assert len(dirlist) >= 2
      testset_key = dirlist[-2]
      # Getting the name of the datafile:
      datafilebase = os.path.basename(datafile)
      datafilename = os.path.splitext(datafilebase)[0]
    
      estimators = estimator_choices(ROOT_DIR,args.tools, datafilename)
      prognamelist = [est.name() for est in estimators]
    
      # Variabledeclarations: list of the newickfilepaths
      newickfilepathlist = []
      # List with the nrf values (for cases with reftree and nrf)
      nrf_list=[]
    
      # Paths for the generated files
      temp_dir = "{}/temp".format(ROOT_DIR)
      out_dir = "{}/out".format(ROOT_DIR)
      create_dir.create_dir(temp_dir)
      create_dir.create_dir(out_dir)
    
      njconcatfilepath = "{}/{}_nj_concatfile.nh".format(temp_dir,testset_key)
      timefilepath = "{}/{}_runtime.txt".format(out_dir,testset_key)
    
      for estimator in estimators:
        progname = estimator.name()
        call_list = estimator.call_list(datafile)
        fixed_out_file_name = estimator.fixed_out_file_name()
    
        # Paths for the generated files
        if args.mat:
          progoutfilepath = ("{}/{}_{}.mat"
                             .format(out_dir,testset_key,progname))
        else:
          progoutfilepath = ("{}/{}_{}.mat"
                             .format(temp_dir,testset_key,progname))
    
        newickfilepath = ("{}/{}_{}.nh"
                          .format(temp_dir,testset_key,progname))
        # program call
        time_taken = callProg.run_prog(progname,call_list,fixed_out_file_name,
                                       progoutfilepath)
        timelist.append((progname,time_taken))
    
        # Getting a file with the newickstring using neighborjoining
        run_nj(newickfilepath, progoutfilepath)
        newickfilepathlist.append(newickfilepath)
    
        # Computing Tree from the determined newick file
        if args.out_treefile_suffix != "none":
          outtreefilepath = ("{}/{}_{}_tree.{}"
                            .format(out_dir,testset_key,progname,
                                    args.out_treefile_suffix))
          ete.show_tree(newickfilepath, outtreefilepath)
    
        # Get the normalised robinson foulds distance from ete toolkit,
        # if option was chosen
        if reftree:
          nrf = None
          if nrf_metric:
            nrf = ete.compare_tree(newickfilepath, reftree)
            nrf_list.append(nrf)
    
      # Concatenate the newickfiles
      if len(newickfilepathlist) > 1:
        concatFiles.concat_files(newickfilepathlist, njconcatfilepath)
      else:
        njconcatfilepath = newickfilepath
    
      # write the timefile
      write_time.write_time(timefilepath, timelist, testset_key)
    
      # print the distances and write them into a file
      compare(reftree, datafile, testset_key, tcdistmetriclist,
              njconcatfilepath, newickfilepathlist, prognamelist, nrf_metric,
              nrf_list, out_dir, temp_dir)