Initial version of phybema and first test data set

Differences to Phybema as described in the B.Sc. thesis of Birgitta Päuker: 1) Use a Python implementation of neighbor joining. This saves a system call, since the neighbor-joining algorithm is called as a Python function. 2) Conversion of the output format is not necessary anymore. For all distance estimators we require that they output a matrix in PHYLIP format, either as a triangle or as a full matrix, and with or without zero values on the main diagonal.

Initial version of phybema and first test data set
1d420bbc · Kurtz, Prof. Dr. Stefan · bf814c34 · 1d420bbc · 1d420bbc · 1d420bbc
Commit 1d420bbc authored Jul 12, 2019 by Kurtz, Prof. Dr. Stefan
--- a/.gitignore
+++ b/.gitignore
+out
+temp
+__pycache__
--- a/Makefile
+++ b/Makefile
+# test and clean are declared phony, so make does not look for files
+# with these names.
+.PHONY:test clean
+
+# Run the test script on the data set testset1.
+test:testsuite/test_rep.sh
+	@testsuite/test_rep.sh testset1
+
+# Remove the directories temp and out.
+clean:
+	@${RM} -r temp out
--- a/README.md
+++ b/README.md
--- a/src/PHYLIP_format_converter.py
+++ b/src/PHYLIP_format_converter.py
+#!/usr/bin/env python3
+
+'''
+Script to convert the PHYLIP format into the
+format needed for neighbor joining.
+
+The input has to be a lower left triangular or full matrix.
+Names will be cut after 10 characters and are not allowed
+to contain blanks or other separators.
+
+author: Birgitta Päuker
+'''
+
+import sys, re, argparse
+import numpy as np
+import openFile
+
+def convert_format(progname, progoutfilepath, convoutfilepath):
+  '''
+  Convert a distance matrix in PHYLIP format into the format needed for
+  neighborjoining.
+
+  progname        -- name of the estimator, only used in messages
+  progoutfilepath -- path of the PHYLIP matrix to read (lower left
+                     triangular or full matrix)
+  convoutfilepath -- path of the converted file to write
+
+  The output consists of a header line, one line with index and name
+  (cut after 10 characters) per sequence and one line
+  "dist <i> <j> <distance>" for each pair i < j. A distance nan is
+  replaced by 1.0. If the input is malformed, a message is written to
+  stderr and the script exits with status 1.
+  '''
+  # Read in the lines
+  infile = openFile.open_file(progoutfilepath)
+  try:
+    lines = infile.readlines()
+  finally:
+    infile.close()
+
+  # Open the outputfile; the finally clause closes it on every exit path,
+  # including the error exits
+  outfile = openFile.open_file(convoutfilepath, 'w')
+  try:
+    _write_converted(progname, progoutfilepath, lines, outfile)
+  finally:
+    outfile.close()
+
+def _write_converted(progname, progoutfilepath, lines, outfile):
+  # Parse the lines of a PHYLIP matrix and write the converted matrix
+  # to the open stream outfile.
+
+  # The first word of the first line has to be the number of sequences.
+  # An empty file or a blank first line is reported like a non-digit.
+  num_genomesre = re.search(r'\w+', lines[0]) if lines else None
+  if num_genomesre is None or not num_genomesre.group(0).isdigit():
+    sys.stderr.write("{}: The first line of the distmatrix from {} needs to "
+                     "be the number of sequences".format(sys.argv[0],progname))
+    sys.exit(1)
+  outfile.write("# pairwise dist values for {} sequences\n"
+                .format(num_genomesre.group(0)))
+  num_genomes = int(num_genomesre.group(0))
+
+  distmatrix = np.zeros((num_genomes, num_genomes))
+  linegenomeidx = 0
+  testfullmatrix = False
+
+  # Convert distances into needed format
+  for line in lines[1:]:
+    # The usual separator of the PHYLIP format is a tabulator, but some
+    # programs use blanks. Split at both and drop empty strings.
+    values = [v for v in line.replace("\t", " ").strip().split(' ') if v]
+
+    # Skip blank lines, e.g. an additional newline at the end of the file
+    if not values:
+      continue
+
+    # More rows than announced would index beyond the distmatrix
+    if linegenomeidx >= num_genomes:
+      sys.stderr.write("{}: The distmatrix of {} contains more than {} "
+                       "sequences.\n"
+                       .format(sys.argv[0], progname, num_genomes))
+      sys.exit(1)
+
+    # Replace a nan in the distancematrix with distance 1.0. The value is
+    # addressed by its position, so that never an equal token at another
+    # position (e.g. a sequence named nan) is replaced instead.
+    for validx in range(1, len(values)):
+      value = values[validx]
+      if value.isalpha():
+        if value == "nan":
+          values[validx] = 1.0
+          sys.stdout.write("# Replaced the value nan in {} with distance 1.0\n"
+                           .format(progoutfilepath))
+        else:
+          sys.stderr.write("There are characters in the distancematrix in file "
+                           "{}\n".format(progoutfilepath))
+          sys.exit(1)
+
+    # Testing if there are enough distances per sequence in a line
+    if len(values) < linegenomeidx + 1:
+      sys.stderr.write("{}: In the distmatrix of {} are not enough distances "
+                       "per sequence.\n".format(sys.argv[0], progname))
+      sys.exit(1)
+
+    # A line longer than a line of a full matrix can not be stored
+    if len(values) > num_genomes + 1:
+      sys.stderr.write("{}: In the distmatrix of {} are too many distances "
+                       "per sequence.\n".format(sys.argv[0], progname))
+      sys.exit(1)
+
+    # Writing the name of the genom (10 characters) with its index
+    name = values[0]
+    outfile.write("{}\t{}\n".format(linegenomeidx, name[:10]))
+
+    # Filling the distmatrix: the values left of the main diagonal are
+    # the distances to the already processed genomes
+    for row in range(linegenomeidx):
+      distmatrix[linegenomeidx][row] = values[row + 1]
+
+    # if full matrix, fill full distmatrix for later test
+    if len(values) > linegenomeidx + 2:
+      testfullmatrix = True
+      for row in range(linegenomeidx + 1, len(values)-1):
+        distmatrix[linegenomeidx][row] = values[row + 1]
+
+    linegenomeidx = linegenomeidx + 1
+
+  # Testing for a full matrix if it is symmetric, to find cases like fswm and
+  # kmacs where there are no separators between the name and the first
+  # distance.
+  if testfullmatrix:
+    for i in range(distmatrix.shape[0]):
+      for j in range(i+1,distmatrix.shape[1]):
+        if distmatrix[i][j] != distmatrix[j][i]:
+          sys.stderr.write("{}: The distancmatrix of {} can not be converted "
+                           "into needed format.\nPlease search for the cause in"
+                           " file {}.\nA possible reason might be a missing "
+                           "blank between genomename and first distance.\n"
+                           .format(sys.argv[0], progname, progoutfilepath))
+          sys.exit(1)
+
+  # Print into outputfile:
+  for lineidx in range(linegenomeidx):
+    for rowidx in range(lineidx + 1, linegenomeidx):
+      # write dist from linegenome to rowgenome
+      dist = distmatrix[rowidx][lineidx]
+      outfile.write("dist {} {} {:.10f}\n".format(lineidx, rowidx, dist))
--- a/src/calculate_ani.py
+++ b/src/calculate_ani.py
+#!/usr/bin/env python3
+import sys
+from os.path import abspath
+from tempfile import TemporaryDirectory
+from itertools import repeat
+from multiprocessing import Pool
+import numpy as np
+import shutil
+
+from anicalc.anicalc import create_fragments, calculate_ani_with_tool
+
+def convert_raw_results_to_phybema(raw_results):
+  '''
+  Print the ANI results in the flat format read by phybema.
+
+  Each element of raw_results holds the sequence name at position 0 and
+  the ANI values to all sequences at position 1. For each pair i < j the
+  printed distance is 1.0 minus the mean of both ANI values of the pair.
+  '''
+  total = len(raw_results)
+  print('# pairwise dist values for {} sequences'.format(total))
+  for seqnum, entry in enumerate(raw_results):
+    print('{}\t{}'.format(seqnum,entry[0]))
+  for first in range(total-1):
+    for second in range(first+1,total):
+      forward = raw_results[first][1][second]
+      backward = raw_results[second][1][first]
+      mean_ani = (forward+backward)/2.0
+      print('dist {} {} {:.8f}'.format(first,second,1.0 - mean_ani))
+
+# Switch for debugging: as long as DEBUG is False, sys.tracebacklimit is
+# set to 0, which suppresses the stack trace of uncaught exceptions.
+DEBUG = False
+
+if not DEBUG:
+    sys.tracebacklimit = 0
+
+def main(args):
+    '''
+    Compute the pairwise ANI values for the sequences in args.infile.
+
+    args is the namespace delivered by the argument parser of this script.
+    The result is either printed in the format of phybema (args.phybema)
+    or written to the file args.outprefix + '_ani.csv', using pandas unless
+    args.disable_pandas is set. The script exits with status 1 if nucmer
+    or dnadiff is not found.
+    '''
+    for prog in ["nucmer","dnadiff"]:
+        if not shutil.which(prog):
+            sys.stderr.write('{}: cannot find {} executable\n'
+                             .format(sys.argv[0],prog))
+            sys.exit(1)
+    tmp_dir = abspath(args.tmpdir) if args.tmpdir else None
+    with TemporaryDirectory(dir=tmp_dir) as tmpdir:
+        files = create_fragments(args.infile, tmpdir,
+                                 args.min_fragment_size, args.fragment_size)
+
+        pool = Pool(args.threads)
+        try:
+            raw_results = pool.starmap(calculate_ani_with_tool(args.tool),
+                                       zip(files, repeat(files),
+                                           repeat(tmpdir)))
+        finally:
+            # shut down the workers also if a task failed, before the
+            # temporary directory is removed
+            pool.close()
+            pool.join()
+    if args.phybema:
+        convert_raw_results_to_phybema(raw_results)
+        return
+    index, values = zip(*raw_results)
+    results = np.stack(values, axis=0)
+    if args.disable_pandas:
+        np.savetxt(args.outprefix + '_ani.csv',
+                   results, delimiter=',', header=','.join(files))
+    else:
+        # imported here, so that main() does not depend on a global name
+        # which only exists if this file is run as a script
+        import pandas as pd
+        data = pd.DataFrame(results, index=index, columns=files)
+        data.to_csv(args.outprefix + '_ani.csv')
+
+# Script entry point: define the command line interface, parse the
+# arguments and hand them over to main().
+if __name__ == '__main__':
+    import argparse
+    PARSER = argparse.ArgumentParser(description='')
+    # nucmer is the only alignment tool that can be chosen
+    PARSER.add_argument('tool', choices=['nucmer'], help="Alignment tool")
+    # value is passed to multiprocessing.Pool in main()
+    PARSER.add_argument('--threads', type=int, default=1, help="")
+    PARSER.add_argument('--disable-pandas', action='store_true',
+                        help="disable pandas output")
+    PARSER.add_argument('--phybema', action='store_true',
+                        help="show format in output suitable for phybema")
+    PARSER.add_argument('--tmpdir', type=str, default=None,
+                        help=('Path to directory for temporary storage '
+                              ' (default=system default)'))
+    PARSER.add_argument('--outprefix', type=str, default="tmp",
+                        help="specify prefix of output file")
+    # both fragment sizes are passed to create_fragments() in main()
+    PARSER.add_argument('--min-fragment-size', type=int, default=100, help="")
+    PARSER.add_argument('--fragment-size', type=int, default=1020, help="")
+    PARSER.add_argument('infile', type=str, help=('name of file with sequences '
+                                                  'in fasta format'))
+
+    ARGS = PARSER.parse_args()
+    # pandas is only imported if it is needed; the import creates the
+    # module-level name pd
+    if not ARGS.disable_pandas:
+        import pandas as pd
+
+    main(ARGS)
--- a/src/callProg.py
+++ b/src/callProg.py
+#!/usr/bin/env python3
+
+'''
+Script for calling the estimators specified in estimators.py
+using the given path.
+
+To call the programs run from the subprocess module is used.
+Run requires python version >= 3.5
+
+author: Birgitta Päuker
+'''
+
+import subprocess, sys, os, shutil
+from datetime import datetime
+import openFile
+
+def run_prog(progname,call_list,fixed_out_file_name,progoutfilepath):
+  '''
+  Call a program and store its output in the file progoutfilepath.
+
+  progname            -- name of the program, used in messages
+  call_list           -- program call as list, passed to subprocess.run
+  fixed_out_file_name -- name of the file the program writes by itself,
+                         or None if the program writes to stdout
+  progoutfilepath     -- path of the file which receives the output
+
+  If the program writes its output into a file with fixed name, this file
+  is copied to progoutfilepath and removed. Otherwise stdout of the
+  program is written to progoutfilepath.
+
+  Returns the runtime of the program as difference of two datetime
+  values. On failure a message is written to stderr and the script exits
+  with status 1.
+  '''
+  sys.stderr.write("# starting program {}\n".format(progname))
+  start_time = datetime.now()
+
+  try:
+    output = subprocess.run(call_list,stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE, universal_newlines=True)
+  except Exception as err:
+    # no bare except here: KeyboardInterrupt and SystemExit must not be
+    # reported as a failed program call. The cause is part of the message.
+    sys.stderr.write("calling program {} failed: {}\n".format(progname,err))
+    sys.exit(1)
+
+  end_time = datetime.now()
+
+  # Check if an error turned up while running the program.
+  # andi writes to stderr even if there is an output,
+  # therefore test if stdout is empty
+  if output.returncode != 0 and not output.stdout:
+    sys.stderr.write("{}: error when executing {}:\n{}\n"
+                     .format(sys.argv[0], progname, output.stderr))
+    sys.exit(1)
+
+  sys.stderr.write("# program {} complete\n".format(progname))
+  time_taken = end_time - start_time
+  sys.stderr.write("# runtime: {}\n".format(time_taken))
+
+  if fixed_out_file_name:
+    # Check for the outfile
+    if not os.path.isfile(fixed_out_file_name):
+      sys.stderr.write("There is no output file {} for program {}\n"
+                       .format(fixed_out_file_name,progname))
+      sys.exit(1)
+    # Copy the outfile with preserved metadata into progoutfilepath
+    shutil.copy2(fixed_out_file_name, progoutfilepath)
+
+    # Remove the outfile of the program to prevent further complications
+    # with other programs
+    os.remove(fixed_out_file_name)
+  else:
+    # Create an outfile and close it also if writing fails
+    progoutfile = openFile.open_file(progoutfilepath, "w")
+    try:
+      progoutfile.write(output.stdout)
+    finally:
+      progoutfile.close()
+  return time_taken
--- a/src/callTreeCmp.py
+++ b/src/callTreeCmp.py
+#!/usr/bin/env python3
+
+'''
+Script for calling TreeCmp (https://eti.pg.edu.pl/treecmp/)
+Described in the paper "TreeCmp: Comparison of Trees in Polynomial Time"
+(doi: 10.4137/EBO.S9657)
+
+To call the programs run from the subprocess module is used.
+Run requires python version >= 3.5
+
+author: Birgitta Päuker
+'''
+
+import subprocess, sys, os
+import openFile
+
+def run_TreeCmp(tcdistmetriclist,reftree, njconcatfilepath,
+                tcoutfilepath, root_dir):
+  '''
+  Call TreeCmp with the generated concatenated newickfile.
+
+  tcdistmetriclist -- list of the names of the distance metrics
+  reftree          -- path of a reference tree or None
+  njconcatfilepath -- path of the concatenated newickfile (input)
+  tcoutfilepath    -- path of the file written by TreeCmp
+  root_dir         -- directory which contains src_external
+
+  The program is called with the -r option if reftree is not None and
+  with option -m otherwise. It is called with the -N option to
+  additionally get the distances scaled by the average distance to random
+  trees with the same number of leaves.
+
+  On failure a message is written to stderr and the script exits with
+  status 1.
+  '''
+  # Create string with the given distance metrics. join() puts exactly one
+  # blank between two metrics, also if a metric occurs more than once
+  diststr = " ".join(tcdistmetriclist)
+
+  # Check if njconcatfilepath is empty
+  if os.path.getsize(njconcatfilepath) == 0:
+    sys.stderr.write("{}: The newickfile {} is empty.\n"
+                         .format(sys.argv[0], njconcatfilepath))
+    sys.exit(1)
+
+  # Programm parameters
+  tcmp_path = root_dir + \
+              "/src_external/TreeCmp_v1.1-b308/TreeCmp/bin/TreeCmp.jar"
+  arg = ["java", "-jar", tcmp_path, "-N"]
+  if reftree:
+    arg += ["-r", "{}".format(reftree)]
+  else:
+    arg.append("-m")
+  arg += ["-d", diststr,
+          "-i", "{}".format(njconcatfilepath),
+          "-o", "{}".format(tcoutfilepath)]
+
+  # Call TreeCmp
+  try:
+    output = subprocess.run(arg,stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE, universal_newlines=True)
+  except Exception as err:
+    # no bare except here: KeyboardInterrupt and SystemExit must not be
+    # reported as a failed call. The cause is part of the message.
+    sys.stderr.write("Calling TreeCmp failed: {}\n".format(err))
+    sys.exit(1)
+
+  # Check if an Error turn up while running the programm; the report
+  # belongs to stderr like all other error messages
+  if output.stderr:
+    sys.stderr.write('{}: an Error at TreeCmp appeard:\n{}\n'
+                     .format(sys.argv[0],output.stderr))
+    sys.exit(1)
+
+  # A missing output file is handled like an empty one
+  if not os.path.isfile(tcoutfilepath) or os.path.getsize(tcoutfilepath) == 0:
+    sys.stderr.write("{}: an Error at TreeCmp appeard\n"
+                     "The Trees might contain different leaf sets.\n"
+                     .format(sys.argv[0]))
+    sys.exit(1)
--- a/src/concatFiles.py
+++ b/src/concatFiles.py
+#!/usr/bin/env python3
+
+'''
+helper script to concatenate a list of files into one file
+written at a given outfilepath.
+
+author: Birgitta Päuker
+'''
+
+import argparse, sys
+import openFile
+
+def concat_files(infiles, outfilepath):
+  '''
+  Concatenate a list of given files into a file written at the given
+  outfilepath.
+
+  infiles     -- list of at least two paths of files to read
+  outfilepath -- path of the file to write
+
+  The contents of the files are separated by a newline. If infiles has
+  less than two elements, a message is written to stderr and the script
+  exits with status 1.
+  '''
+  if len(infiles) <= 1:
+    sys.stderr.write("{}:A list of inputfiles is needed to concatenate them.\n"
+                     .format(sys.argv[0]))
+    sys.exit(1)
+  contents = list()
+  for infilepath in infiles:
+    # the with statement closes each inputfile after reading
+    with open(infilepath) as infile:
+      contents.append(infile.read())
+  concatfile = openFile.open_file(outfilepath, 'w')
+  try:
+    concatfile.write('\n'.join(contents))
+  finally:
+    concatfile.close()
+
+def parse_command_line():
+  '''
+  Parse the command line: one or more inputfiles followed by the name
+  of the file for the concatenated content.
+  '''
+  parser = argparse.ArgumentParser()
+  parser.add_argument("inputfiles", nargs = '+',
+                      help = "Specify a list of inputfiles to concatenate.\n")
+  parser.add_argument("outfilename",
+                      help = "Specify the filename of the concatenated files.\n")
+  return parser.parse_args()
+
+# Script entry point: concatenate the files given on the command line.
+if __name__ == "__main__":
+  args = parse_command_line()
+  concat_files(args.inputfiles, args.outfilename)
--- a/src/create_dir.py
+++ b/src/create_dir.py
+#!/usr/bin/env python3
+
+'''
+helper script to create a directory at a given path
+
+author: Birgitta Päuker
+'''
+
+import os,sys
+
+def create_dir(path):
+  '''
+  Create a directory at the given path, including missing parent
+  directories. Success is reported on stdout. If the path already exists,
+  a note is written to stderr instead.
+  '''
+  try:
+    os.makedirs(path)
+  except FileExistsError:
+    sys.stderr.write("# directory {} already exists\n".format(path))
+  else:
+    print("# directory {} created".format(path))
--- a/src/dist_matrix.py
+++ b/src/dist_matrix.py
+#!/usr/bin/env python3
+
+import argparse, sys
+import numpy as np
+
+class DistanceMatrix:
+  '''
+  Symmetric matrix of pairwise distances of taxa, read from a file.
+
+  The first line of the file contains the number of taxa. Each following
+  line contains the name of a taxon followed by distances: either the
+  values of the lower left triangle, with or without the value on the
+  main diagonal, or the complete row of a full matrix.
+  '''
+  def __init__(self,inputfile,mindist=None,eliminate=None):
+    '''
+    inputfile -- name of the inputfile with the distance matrix
+    mindist   -- if given, the names of all pairs of taxa with a distance
+                 smaller than mindist are written to stderr
+    eliminate -- if given, collection of names of taxa which are removed
+                 from the matrix
+
+    If the file can not be opened or is malformed, a message is written
+    to stderr and the script exits with status 1.
+    '''
+    self._precision = 5
+    self._distance_matrix = None
+    self._taxa_names = list()
+    self._first = None
+    try:
+      stream = open(inputfile)
+    except IOError as err:
+      sys.stderr.write('{}: {}\n'.format(sys.argv[0],err))
+      sys.exit(1)
+    # the with statement closes the stream on every exit path
+    with stream:
+      for line_num, line in enumerate(stream,1):
+        if line_num == 1:
+          try:
+            num_of_taxa = int(line.strip())
+          except ValueError as err:
+            sys.stderr.write('{}: {}\n'.format(sys.argv[0],err))
+            sys.exit(1)
+          self._distance_matrix = np.zeros((num_of_taxa,num_of_taxa))
+        else:
+          # line line_num holds the taxon with index line_num-2
+          if line_num - 2 >= num_of_taxa:
+            sys.stderr.write('{}: file {}, line {}: more than {} taxa\n'
+                             .format(sys.argv[0],inputfile,line_num,
+                                     num_of_taxa))
+            sys.exit(1)
+          values = line.strip().split()
+          # name + triangle with diagonal, name + triangle without
+          # diagonal, name + row of full matrix
+          possible_lengths = [line_num,line_num-1,num_of_taxa+1]
+          if len(values) not in possible_lengths:
+            # the lengths are integers and must be converted for join()
+            sys.stderr.write(('{}: file {}, line {} does not contain '
+                              'exactly {} values\n')
+                              .format(sys.argv[0],inputfile,line_num,
+                                      ' or '.join(map(str,possible_lengths))))
+            sys.exit(1)
+          taxa_name = values[0].strip()
+          self._taxa_names.append(taxa_name)
+          for idx, dist_s in enumerate(values[1:]):
+            try:
+              dist = float(dist_s)
+            except ValueError as err:
+              sys.stderr.write('{}: {}\n'.format(sys.argv[0],err))
+              sys.exit(1)
+            # store the value on both sides of the main diagonal
+            self._distance_matrix[line_num-2,idx] = dist
+            self._distance_matrix[idx,line_num-2] = dist
+    if mindist:
+      num_of_taxa = len(self._taxa_names)
+      close_pairs = [(i,j) for i in range(num_of_taxa) \
+                           for j in range(i+1,num_of_taxa)
+                           if self._distance_matrix[i,j] < mindist]
+      for i,j in close_pairs:
+        sys.stderr.write('{}\t{}\n'
+                         .format(self._taxa_names[i],self._taxa_names[j]))
+    if eliminate:
+      dlist = [i for i,taxa_name in enumerate(self._taxa_names) \
+                 if taxa_name in eliminate]
+      # delete the rows (axis 0) and the columns (axis 1) of these taxa
+      for a in range(0,2):
+        self._distance_matrix = np.delete(self._distance_matrix,dlist,axis=a)
+      self._taxa_names = [taxa for taxa in self._taxa_names \
+                               if not (taxa in eliminate)]
+
+  # return the number of taxa
+  def num_of_taxa(self):
+    return len(self._taxa_names)
+  # return the taxon name for a given taxon index idx
+  def taxon_name(self,idx):
+    assert idx < self.num_of_taxa()
+    return self._taxa_names[idx]
+  # return the distance of the two taxa with index i an j
+  def distance(self,i,j):
+    assert i < self.num_of_taxa() and j < self.num_of_taxa()
+    return self._distance_matrix[i,j]
+  # set the number of decimal places used for the output of distances
+  def set_precision(self,precision):
+    self._precision = precision
+  # restrict the output to the first taxa, first is their number
+  def first_set(self,first):
+    self._first = first
+  # return a string with one line per taxon (index and name) followed by
+  # one line 'JKD i j distance' for each pair i < j
+  def flat_output(self):
+    def float2string(f):
+      prc = '{:.' + str(self._precision) + 'f}'
+      return prc.format(f)
+    lines = list()
+    num_taxa = len(self._taxa_names)
+    if not (self._first is None):
+      num_taxa = min(self._first,num_taxa)
+    lines.append('# pairwise dist values for {} sequences'.format(num_taxa))
+    for idx in range(num_taxa):
+      lines.append('{}\t{}'.format(idx,self._taxa_names[idx]))
+    for i in range(num_taxa):
+      for j in range(i+1,num_taxa):
+        lines.append('JKD {} {} {}'
+                     .format(i,j,float2string(self._distance_matrix[j][i])))
+    return '\n'.join(lines)
+  # return the matrix as tab separated lower left triangle with 0 on the
+  # main diagonal and empty columns right of it
+  def __str__(self):
+    num_taxa = len(self._taxa_names)
+    if not (self._first is None):
+      num_taxa = min(self._first,num_taxa)
+    lines = ['{}{}'.format(num_taxa,'\t' * (num_taxa-1))]
+    for i in range(num_taxa):
+      this_line = ['{}'.format(self._taxa_names[i])]
+      for j in range(num_taxa):
+        if j < i:
+          prc = '{:.' + str(self._precision) + 'f}'
+          this_line.append(prc.format(self._distance_matrix[i,j]))
+        elif j > i:
+          this_line.append('')
+        else:
+          this_line.append('0')
+      lines.append('\t'.join(this_line))
+    return '\n'.join(lines)
+
+def parse_arguments():
+  '''
+  Define the options of the script and return the parsed command line.
+  '''
+  parser = argparse.ArgumentParser(description=('read distance matrix'))
+  parser.add_argument('-p','--precision',type=int,default=5,
+                      help='specify precision of floats output (default: 5)')
+  parser.add_argument('-f','--first',type=int,default=None,
+                      help=('specified the number of taxa for which the '
+                            'matrix is output'))
+  parser.add_argument('--flat',action='store_true',default=False,
+                      help='show distance values line by line')
+  parser.add_argument('--mindist',type=float,default=None,
+                      help=('specifiy minimum distance value, all taxa which '
+                            'have a value smaller than this are eliminated'))
+  parser.add_argument('-e','--eliminate',nargs='+',
+                      help='specify taxa to be eliminated')
+  parser.add_argument('inputfile',type=str,
+                      help='file with distance matrix')
+  parsed = parser.parse_args()
+  return parsed
+
+# Script entry point: read the distance matrix and print it, either line
+# by line (--flat) or as matrix.
+if __name__ == '__main__':
+  args = parse_arguments()
+  dm = DistanceMatrix(args.inputfile,args.mindist,args.eliminate)
+  # restrict the output to the first taxa only if the option was used
+  if args.first:
+    dm.first_set(args.first)
+  dm.set_precision(args.precision)
+  if args.flat:
+    print(dm.flat_output())
+  else:
+    print(dm)
--- a/src/estimators.py
+++ b/src/estimators.py
+#!/usr/bin/env python3
+
+'''
+ADD YOUR TOOL TO THIS SCRIPT
+
+Definition of the class DistanceEstimator to handle the specified estimators
+and their additional informations
+
+To add your tool to phybema, please insert the needed information at the
+marked location in this script.
+
+The given version of kmacs (http://kmacs.gobics.de/) and
+fswm (http://fswm.gobics.de/) are edited implementations which can be
+found in the folder src_external. The edited implementations allow to specify
+the name and path of the generated distance files. They were written to allow
+calling phybema parallel.
+
+authors: Stefan Kurtz, Birgitta Päuker
+'''
+
+import sys, re, shutil
+
+class DistanceEstimator:
+  '''
+  Description of a distance estimator: the program to call, its options,
+  the name of the file it writes by itself (if any) and arguments which
+  follow the inputfile in the call.
+  '''
+  def __init__(self,call,call_options,fixed_out_file_name,optional_end):
+    self._call = call
+    # the name is the last component of the path without the string .sh
+    self._name = re.sub(r'\.sh',"",call.split('/')[-1])
+    self._call_options = call_options
+    self._fixed_out_file_name = fixed_out_file_name
+    self._optional_end = optional_end
+  # return the complete call for the given inputfile as list
+  def call_list(self,inputfile):
+    complete = [self._call] + self._call_options + [inputfile]
+    if not self._optional_end:
+      return complete
+    return complete + self._optional_end
+  def call(self):
+    return self._call
+  def name_set(self,name):
+    self._name = name
+  def name(self):
+    return self._name
+  def fixed_out_file_name(self):
+    return self._fixed_out_file_name
+  # return the tab separated description of the estimator
+  def __str__(self):
+    parts = ["name={}".format(self.name())]
+    if len(self._call_options) > 0:
+      parts.append(' '.join(self._call_options))
+    if self._fixed_out_file_name:
+      parts.append("fixed_out_file_name={}"
+                   .format(self._fixed_out_file_name))
+    if self._optional_end:
+      parts.append("optional_end={}".format(self._optional_end))
+    return '\t'.join(parts)
+
+def estimator_choices(root_dir,choices,dataset, only_names=False):
+  '''
+  Return the chosen estimators, or only their names if only_names is set.
+
+  root_dir -- directory which contains src and src_external
+  choices  -- names of the estimators to select; if empty or None, all
+              estimators whose program is found are returned
+  dataset  -- prefix of the files written by fswm and kmacs
+
+  Only estimators whose program is found by shutil.which are available.
+  If a chosen estimator is not available, a message is written to stderr
+  and the script exits with status 1.
+  '''
+  fswm_outfile = "{}_fswm_DMat".format(dataset)
+  kmacs_outfile = "{}_kmacs_DMat".format(dataset)
+
+  andi = DistanceEstimator("andi", ["--threads=1"], None, None)
+  mash = DistanceEstimator("mash", ["triangle"], None, None)
+  seed_extend = DistanceEstimator("{}/src/seed_extend.sh".format(root_dir),
+                                  [], None, None)
+  dnadiff = DistanceEstimator("{}/src/dnadiff.sh".format(root_dir),
+                              [], None, None)
+
+  # the fswm version in phybema has been changed. There is an -o option to
+  # specify the outfilepath. It is not allowed to contain whitespaces
+  fswm = DistanceEstimator("{}/src_external/fswm/fswm".format(root_dir),
+                           ["-t 1", "-o " + fswm_outfile],
+                           fswm_outfile, None)
+
+  # The kmacs version in Phybema has been changed. It is mandatory to give an
+  # outfilepath after the k value.
+  kmacs = DistanceEstimator("{}/src_external/kmacs/kmacs".format(root_dir),
+                            [], kmacs_outfile, ["17", kmacs_outfile])
+
+  # ADD YOUR TOOL HERE
+  # specify new tool here as instance of class DistanceEstimator and
+  # add the instance to the following tuple
+  candidates = (andi,fswm,kmacs,mash,seed_extend,dnadiff)
+  available = [est for est in candidates if shutil.which(est.call())]
+
+  by_name = {est.name() : est for est in available}
+  if choices:
+    selected = list()
+    for choice in choices:
+      if choice not in by_name:
+        sys.stderr.write(('{}: estimator {} is not available, '
+                          'possible choices: {}\n')
+                          .format(sys.argv[0],choice,list(by_name.keys())))
+        exit(1)
+      selected.append(by_name[choice])
+  else:
+    selected = available
+  if only_names:
+    return [est.name() for est in selected]
+  return selected
--- a/src/ete.py
+++ b/src/ete.py
+#!/usr/bin/env python3
+
+'''
+Script to call functions from the ete toolkit (http://etetoolkit.org/)
+
+author: Birgitta Päuker
+'''
+
+from ete3 import Tree, TreeStyle
+import sys
+import argparse
+
+def load_tree(treefile):
+  '''
+  Load a tree structure from a newick file and return it. If an IOError
+  occurs, a message is written to stderr and the script exits with
+  status 1.
+  '''
+  try:
+    return Tree(treefile)
+  except IOError as err:
+    sys.stderr.write("Can't open file {}: {}\n".format(treefile,err))
+    exit(1)
+
+def show_tree(newickfilepath, outtreefilepath):
+  '''
+  Render the tree of the given newick file into the file outtreefilepath
+  (pdf, svg or png), with scale 100 and with the branch lengths shown.
+  '''
+  tree = load_tree(newickfilepath)
+  style = TreeStyle()
+  style.show_branch_length = True
+  style.scale = 100
+  tree.render(outtreefilepath, tree_style = style)
+
+def robinson_foulds_ete(t,t2):
+  '''
+  Compare the trees t and t2 as unrooted trees and return the robinson
+  foulds distance, its maximum and the normalised distance (distance
+  divided by maximum, 0.0 if the distance is 0).
+  '''
+  # the result starts with rf and max_rf, further elements are not used
+  result = t.robinson_foulds(t2, unrooted_trees = True)
+  rf_value, rf_max = result[0], result[1]
+  normalised = rf_value/rf_max if rf_value != 0 else 0.0
+  return rf_value, rf_max, normalised
+
+def compare_tree(newickfile, reftree):
+  '''
+  Compare the tree of a newick file with a reference tree and return
+  the normalised robinson foulds distance of both trees.
+  '''
+  reference = load_tree(reftree)
+  candidate = load_tree(newickfile)
+  _, _, nrf = robinson_foulds_ete(candidate,reference)
+  return nrf
+
+def compute_nrf_matrix(newickfilelist):
+  '''
+  Compare each newick tree with all trees following it in the list.
+  The result maps the index i of a tree to a dictionary, which maps each
+  index j > i to the normalised robinson foulds distance of tree i and
+  tree j. The last index has no entry of its own.
+  '''
+  count = len(newickfilelist)
+  return {i: {j: compare_tree(newickfilelist[i], newickfilelist[j])
+              for j in range(i+1, count)}
+          for i in range(count-1)}
+
+def parse_command_line():
+  '''
+  Parse the command line. At least one of the options -o and -c is
+  required, otherwise a usage message is written to stderr and the
+  script exits with status 1.
+  '''
+  parser = argparse.ArgumentParser()
+  parser.add_argument('-o', '--out', type=str,
+                      help = "specify outputfile to render tree from inputfile\n"
+                      "It has to be PDF, PNG or SVG")
+  parser.add_argument('-c', '--compare', nargs= '+', default = None,
+                      help="whitespace separated files of trees for"
+                      "comparison with inputfile tree. "
+                      "If -c is the last optional argument write --"
+                      "at the end of the list")
+  parser.add_argument("inputfile", type=str, help="specify inputfile."
+                      "Inputfile serves as reference tree for comparison")
+  parsed = parser.parse_args()
+
+  if parsed.out or parsed.compare:
+    return parsed
+  sys.stderr.write("Usage {}: Please choose an option for the given "
+                   "tree\n".format(sys.argv[0]))
+  exit(1)
+
+# Script entry point: render the tree of the inputfile (-o) and/or print
+# for each tree given with -c its normalised robinson foulds distance to
+# the tree of the inputfile.
+if __name__ == "__main__":
+  args = parse_command_line()
+  t = load_tree(args.inputfile)
+
+  if args.out:
+    show_tree(args.inputfile, args.out)
+
+  if args.compare:
+    for fp in args.compare:
+      t2 = load_tree(fp)
+      # robinson_foulds_ete is defined in this file; a module rf_dist is
+      # never imported, so the call must not be qualified with it
+      rf = robinson_foulds_ete(t,t2)
+      print("{}\t{:.2}".format(fp, rf[2]))
--- a/src/nejo_idx.py
+++ b/src/nejo_idx.py
+#!/usr/bin/env python3
+
+import sys, argparse
+import numpy as np
+from ete3 import Tree
+from dist_matrix import DistanceMatrix
+
+class NJnode:
+  '''
+  Node of the tree built by neighbor joining. It stores the left and the
+  right child and the distances to both children. All four values are
+  None until they are set.
+  '''
+  def __init__(self):
+    self._leftchild = self._rightchild = None
+    self._leftdist = self._rightdist = None
+
+  # access to the children
+  def leftchild(self):
+    return self._leftchild
+  def rightchild(self):
+    return self._rightchild
+  def leftchild_set(self,leftchild):
+    self._leftchild = leftchild
+  def rightchild_set(self,rightchild):
+    self._rightchild = rightchild
+
+  # access to the distances to the children
+  def leftdist(self):
+    return self._leftdist
+  def rightdist(self):
+    return self._rightdist
+  def leftdist_set(self,leftdist):
+    self._leftdist = leftdist
+  def rightdist_set(self,rightdist):
+    self._rightdist = rightdist
+
+class NeighborJoining:
+  '''
+  State of the neighbor joining algorithm for a distance matrix dm.
+
+  dm has to provide num_of_taxa(), distance(i,j) and taxon_name(idx).
+  For n taxa there are 2n - 2 nodes: the nodes with index < n are the
+  taxa, the remaining nodes are created by the algorithm. The distances
+  of all pairs of nodes are stored in a square matrix. A distance which
+  is not yet known is stored as nan.
+  '''
+  def __init__(self,dm):
+    self._dm = dm
+    self._num_of_taxa = dm.num_of_taxa()
+    self._numofnodes = 2 * self._num_of_taxa - 2
+    self._bigmatrix = np.zeros((self._numofnodes,self._numofnodes))
+    self._nodes = list()
+    self._finalnodeA = None
+    self._finalnodeB = None
+    self._finaldist = None
+    for i in range(0,self._numofnodes):
+      self._nodes.append(NJnode())
+      for j in range(0,i):
+        if i < self._num_of_taxa:
+          dist = dm.distance(i,j)
+          assert dist >= 0
+        else:
+          # A float matrix can not store None, it would silently be
+          # converted to nan. So nan is used explicitly as the mark for
+          # an unknown distance and distance() checks for it.
+          dist = np.nan
+        self._bigmatrix[j,i] = dist
+        self._bigmatrix[i,j] = dist
+  # return the name of the taxon with index idx
+  def taxon_name(self,idx):
+    return self._dm.taxon_name(idx)
+  def numofnodes(self):
+    return self._numofnodes
+  def num_of_taxa(self):
+    return self._num_of_taxa
+  # return the node with index idx
+  def __getitem__(self,idx):
+    assert idx < len(self._nodes)
+    return self._nodes[idx]
+  # return the distance of the different nodes i and j, which must have
+  # been set before
+  def distance(self,i,j):
+    assert i != j
+    dist = self._bigmatrix[i,j]
+    assert not np.isnan(dist), 'for {} {}'.format(i,j)
+    return dist
+  # set the distance of the different nodes i and j to d
+  def distance_set(self,i,j,d):
+    assert i != j
+    self._bigmatrix[j,i] = d
+    self._bigmatrix[i,j] = d
+
+def update_Rtab(nejo,rtab,is_active,active_nodes):
+  '''
+  Store in rtab, for each active node, the sum of its distances to all
+  other active nodes divided by active_nodes - 2. The entries of nodes
+  which are not active are not changed.
+  '''
+  assert active_nodes >= 3
+  node_count = nejo.numofnodes()
+  for node in range(node_count):
+    if not is_active[node]:
+      continue
+    others = (other for other in range(node_count)
+                    if other != node and is_active[other])
+    dist_sum = sum((nejo.distance(node,other) for other in others), 0.0)
+    rtab[node] = dist_sum/(active_nodes - 2)
+
+# Run the neighbor joining algorithm on nejo: in each step two active nodes
+# are joined by a new node, whose children, branch lengths and distances
+# to the remaining active nodes are stored in nejo. Finally the two
+# remaining nodes and their distance are stored in nejo._finalnodeA,
+# nejo._finalnodeB and nejo._finaldist.
+def gt_neighborjoining_compute(nejo):
+  # index of the next node to create; the nodes before it are the taxa
+  newnodenum = nejo.num_of_taxa()
+  assert nejo.num_of_taxa() <= nejo.numofnodes()
+  # at the beginning exactly the taxa are active
+  is_active = ([True] * nejo.num_of_taxa()) + \
+              ([False] * (nejo.numofnodes() - nejo.num_of_taxa()))
+  activenodes = nejo.num_of_taxa()
+
+  # the neighbor joining takes num_of_taxa - 2 steps
+  rtab = [None] * nejo.numofnodes()
+  for step in range(nejo.num_of_taxa() - 2):
+    mindist = None
+    min_i = None
+    min_j = None
+    update_Rtab(nejo, rtab, is_active, activenodes)
+    # determine two nodes for which the distance is minimal
+    # (distance reduced by the rtab values of both nodes); as the
+    # comparison is strict, the first pair with the minimum is chosen
+    for i in range(1,nejo.numofnodes()):
+      if is_active[i]:
+        # this node exists, check the distances
+        for j in range(i):
+          if is_active[j]:
+            this_dist = nejo.distance(i,j) - (rtab[i] + rtab[j])
+            if mindist is None or this_dist < mindist:
+              mindist = this_dist
+              min_i = i
+              min_j = j
+    # add new node to L and remove the daughters
+    assert not min_i is None
+    assert not min_j is None
+    assert min_j < min_i
+    is_active[newnodenum] = True
+    is_active[min_i] = is_active[min_j] = False
+    assert activenodes > 0
+    # two nodes became inactive, one became active
+    activenodes -= 1
+
+    # save the new node
+    min_i_min_j_dist = nejo.distance(min_i,min_j)
+    nejo[newnodenum].leftchild_set(min_i)
+    nejo[newnodenum].rightchild_set(min_j)
+    nejo[newnodenum].leftdist_set((min_i_min_j_dist + rtab[min_i] 
+                                                    - rtab[min_j])/2)
+    # both branch lengths add up to the distance of the joined nodes
+    nejo[newnodenum].rightdist_set(min_i_min_j_dist - 
+                                   nejo[newnodenum].leftdist())
+
+    # update the distances
+    # (from the new node to each node which is still active)
+    for m in range(newnodenum):
+      if is_active[m]:
+        nejo.distance_set(newnodenum,m,(nejo.distance(m,min_i) + \
+                                        nejo.distance(m,min_j) - \
+                                        min_i_min_j_dist)/2)
+    newnodenum += 1
+
+  # now only two nodes are active, save them and the distance between them
+  # finalnodeA is the active node with the smallest index
+  for i in range(nejo.numofnodes()):
+    if is_active[i]:
+      nejo._finalnodeA = i
+      break
+  # finalnodeB is the node with the largest index
+  nejo._finalnodeB = nejo.numofnodes() - 1
+  nejo._finaldist = nejo.distance(nejo._finalnodeB,nejo._finalnodeA)
+
+def prec_str(precision):
+  '''
+  Build a format template for floats with the given number of digits
+  after the decimal point, e.g. '{:.5f}' for precision 5.
+  '''
+  return '{{:.{}f}}'.format(precision)
+
+def to_newick_node(output,nejo,nodenum,precision):
+  '''
+  Append the Newick fragments for the subtree below nodenum to the list
+  output. For a node without children only '(' followed by its taxon name
+  is appended. For an inner node '(' and the two comma-separated children
+  with their branch lengths are appended; the closing bracket of the
+  node itself is left to the caller.
+  '''
+  node = nejo[nodenum]
+  if node.leftchild() is None:
+    assert node.rightchild() is None
+    output.append('({}'.format(nejo.taxon_name(nodenum)))
+    return
+  assert node.rightchild() is not None
+  assert nodenum >= nejo.num_of_taxa()
+  branch_fmt = prec_str(precision)
+  output.append('(')
+  branches = [(node.leftchild(),node.leftdist()),
+              (node.rightchild(),node.rightdist())]
+  for pos,(child,dist) in enumerate(branches):
+    if child < nejo.num_of_taxa():
+      # a taxon is written as name:length
+      output.append(('{}:' + branch_fmt).format(nejo.taxon_name(child),dist))
+    else:
+      # an inner node is expanded recursively and closed here
+      to_newick_node(output,nejo,child,precision)
+      output.append(('):' + branch_fmt).format(dist))
+    if pos == 0:
+      output.append(',')
+
+def tree2newick(output,nejo,precision):
+  '''
+  Append the Newick representation of the complete tree to the list
+  output: the subtrees of the two final nodes separated by a comma,
+  followed by the final distance and the terminating ');'.
+  '''
+  closing = '):' + prec_str(precision) + ');'
+  to_newick_node(output,nejo,nejo._finalnodeA,precision)
+  output.append(',')
+  to_newick_node(output,nejo,nejo._finalnodeB,precision)
+  output.append(closing.format(nejo._finaldist))
+
+
+def parse_arguments():
+  '''
+  Parse the command line of the neighbor joining script: the optional
+  float precision, the switch --pdf and the mandatory matrix file.
+  Returns the namespace delivered by argparse.
+  '''
+  argparser = argparse.ArgumentParser(
+    description=('determine phylogenetic tree using '
+                 'the neighbor joining algorithm'))
+  argparser.add_argument('-p','--precision',type=int,default=5,
+                         help='specify precision of floats output (default: 5)')
+  argparser.add_argument('--pdf',action='store_true',default=False,
+                         help='show tree as PDF file')
+  argparser.add_argument('inputfile',type=str,
+                         help='file with distance matrix')
+  arguments = argparser.parse_args()
+  return arguments
+
+if __name__ == '__main__':
+  # script entry: distance matrix file -> neighbor joining tree -> Newick
+  # string, which is either rendered into a PDF file or printed
+  args = parse_arguments()
+  nejo = NeighborJoining(DistanceMatrix(args.inputfile))
+  gt_neighborjoining_compute(nejo)
+  newick_parts = []
+  tree2newick(newick_parts,nejo,args.precision)
+  newick_string = ''.join(newick_parts)
+  if not args.pdf:
+    print(newick_string)
+  else:
+    # the PDF is written next to the input file
+    Tree(newick_string).render('{}.pdf'.format(args.inputfile))
--- a/src/openFile.py
+++ b/src/openFile.py
+#!/usr/bin/env python3
+
+'''
+helper script for opening a file
+
+author: Birgitta Päuker
+'''
+
+import sys
+
+# Function to open a given file for writing or reading
+def open_file(filename, mode = 'r'):
+  '''
+  Open filename in the given mode and return the file object.
+  If the file cannot be opened, the error is reported on stderr (prefixed
+  with the name of the running script) and the program terminates with
+  exit code 1.
+  '''
+  try:
+    return open(filename,mode)
+  except OSError as err:
+    sys.stderr.write('{}: {}\n'.format(sys.argv[0],err))
+    # sys.exit instead of the builtin exit: the latter is only installed
+    # by the site module and is intended for interactive sessions
+    sys.exit(1)
--- a/src/parser.py
+++ b/src/parser.py
+#!/usr/bin/env python3
+
+'''
+Script to parse the phybema options and
+to create a datatuplelist with the datasets and reftrees.
+
+author: Birgitta Päuker
+'''
+
+import argparse, os, sys
+from argparse import RawTextHelpFormatter
+from pathlib import Path
+from estimators import DistanceEstimator, estimator_choices
+from root_dir import phybema_root_dir
+
+FASTA_SUFFIXES = [".fasta",".faa",".fna"]
+REFTREE_SUFFIXES = [".nh",".tre"]
+
+# Function to parse the phybema options
+def parse_command_line():
+  '''
+  Define and evaluate the command line options of phybema.
+
+  Returns the argparse namespace with the attributes datasetpaths, tools,
+  out_treefile_suffix, dist and mat. If the script is called without any
+  argument, the help text is written to stderr and the program exits
+  with code 1.
+  '''
+  p = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter)
+  root_dir = phybema_root_dir()
+  # names of the tools which can be chosen with --tools
+  possible_tools = estimator_choices(root_dir,None,None,True)
+  p.add_argument("datasetpaths", nargs = '+',
+                 help=("Specify a list of paths to directories which contain\n"
+                       "a datafile and optionally a reference tree.\n"
+                       "A datafile must be in FASTA format ({}),\n"
+                       "a reference tree must be in newick format ({}).\n"
+                       "All genomes must be contained in one FASTA file.\n"
+                       "For datafiles with a reference tree the resulting\n"
+                       "NJ-tree (derived from the distances delivered by\n"
+                       "the corresponding tool) will be compared to the\n"
+                       "reference tree.\n"
+                       "For datafiles without reference and if at least\n"
+                       "two tools have been specified, the resulting\n"
+                       "NJ-trees are compared against each other."
+                          .format(','.join(FASTA_SUFFIXES),
+                                  ','.join(REFTREE_SUFFIXES))))
+
+  p.add_argument("--tools",nargs='+',choices=possible_tools,
+                 help=("Specify the tools to evaluate.\n"
+                       "If this is the last option, then use -- before\n"
+                       "specifying any data set.\n"
+                       "default: use all tools."))
+  p.add_argument("-o", "--out_treefile_suffix", type=str,default="pdf",
+                 choices=["pdf","png","svg","none"],
+                 help=("Specify suffix of output files\n"
+                       "with the NJ-trees computed from the distance\n"
+                       "matrices delivered by each tool. The argument \"none\"\n"
+                       "means that no graphic-file is generated. This is\n"
+                       "useful for remote sessions, where this does not work\n"
+                       "anyway, as the ete toolkit, which generates the\n"
+                       "graphics output requires Qt and an X-server\n"
+                       "default: pdf"))
+  # the metric names are validated by the caller, not by argparse
+  p.add_argument("-d", "--dist", nargs = '*',metavar="metric",
+                 help=("Specify the distance metrics to use for comparing\n"
+                       "the resulting NJ-trees with the reference tree.\n"
+                       "More than one metric is possible.\n"
+                       "Possible metrics are:\n"
+                       "ms\tMatching Split metric\n"
+                       "pd\tPath Difference metric\n"
+                       "qt\tQuartet metric\n"
+                       "nrf\tnormalised Robinson-Foulds distance\n"
+                       "rf\tRobinson-Foulds distance\n"
+                       "ms, pd, qt and rf are computed by TreeCmp,\n"
+                       "nrf is computed by using the ETE3 toolkit\n"
+                       "default: use nrf only\n"))
+  p.add_argument("-m","--mat", action='store_true',
+                 help=("Write the distance matrices originally computed by\n"
+                       "the different programs into out."))
+  if len(sys.argv)==1:
+    p.print_help(sys.stderr)
+    # sys.exit instead of the builtin exit, which is only provided by the
+    # site module for interactive use
+    sys.exit(1)
+
+  return p.parse_args()
+
+'''
+Function to create a datatuplelist with the datafiles
+and their reftree if given
+'''
+def create_datatuplelist(datasetpaths):
+  '''
+  For every directory in datasetpaths determine the single datafile
+  (suffix in FASTA_SUFFIXES) and the optional single reference tree
+  (suffix in REFTREE_SUFFIXES).
+
+  Returns a list of tuples (datafile, reftree), where reftree is None if
+  the directory contains no reference tree. The program exits with code 1
+  if a path is not a directory, if there is no datafile, if more than one
+  datafile or reference tree is found or if a found file is empty.
+  '''
+  def files_with_suffix(directory, suffixes):
+    # Paths of the regular files in directory whose extension is listed in
+    # suffixes. is_file() has to be called: the bare attribute is always
+    # true, so that directories with a matching name were accepted.
+    return [entry.path for entry in os.scandir(directory)
+            if entry.is_file()
+            and os.path.splitext(entry.name)[1] in suffixes]
+
+  datatuplelist = []
+
+  # Read in the datafile and reftree file from given datadir
+  for datasetpath in datasetpaths:
+
+    # Check if there is a directory at datasetpath
+    if not os.path.isdir(datasetpath):
+      sys.stderr.write("{}: {} is not a directory.\n"
+                       .format(sys.argv[0], datasetpath))
+      sys.exit(1)
+
+    datafilepaths = files_with_suffix(datasetpath, FASTA_SUFFIXES)
+
+    # Check if a datafile was found
+    if not datafilepaths:
+      sys.stderr.write("{}: No file with suffix {}\nin {} found\n"
+                       .format(sys.argv[0], FASTA_SUFFIXES, datasetpath))
+      sys.exit(1)
+    # Check if more than one datafile was found
+    if len(datafilepaths) > 1:
+      sys.stderr.write("{}:There is more than one file with a {} suffix "
+                       "in directory {}\n"
+                       .format(sys.argv[0], FASTA_SUFFIXES, datasetpath))
+      sys.exit(1)
+    datafile = datafilepaths[0]
+    # Check if found file is empty
+    if os.path.getsize(datafile) == 0:
+      sys.stderr.write("{}: The found datafile {} is empty.\n"
+                       .format(sys.argv[0], datafile))
+      sys.exit(1)
+
+    reftreefilepaths = files_with_suffix(datasetpath, REFTREE_SUFFIXES)
+
+    # Check if a reftree was found
+    if not reftreefilepaths:
+      reftree = None
+    # Check if more than one reftree was found
+    elif len(reftreefilepaths) > 1:
+      sys.stderr.write("{}:There is more than one file with a {} suffix "
+                       "in directory {}\n"
+                       .format(sys.argv[0], REFTREE_SUFFIXES, datasetpath))
+      sys.exit(1)
+    else:
+      reftree = reftreefilepaths[0]
+      # Check if found file is empty
+      if os.path.getsize(reftree) == 0:
+        sys.stderr.write("{}: The found reftreefile {} is empty.\n"
+                         .format(sys.argv[0], reftree))
+        sys.exit(1)
+
+      sys.stdout.write("# reftree {} found.\n# It will be used for comparison."
+                       "\n\n".format(reftree))
+
+    # Insert datatuple into datatuplelist
+    datatuplelist.append((datafile,reftree))
+
+  return datatuplelist
--- a/src/phybema.py
+++ b/src/phybema.py
+#!/usr/bin/env python3
+
+'''
+Main script for calling phybema
+author: Birgitta Päuker
+'''
+
+import sys, os, shutil
+import PHYLIP_format_converter as fc
+import ete
+import callTreeCmp as treecmp
+import callProg
+import concatFiles
+import write_time
+import parser
+import create_dir
+import print_comparison
+from test_requirements import check_req
+from estimators import DistanceEstimator, estimator_choices
+from root_dir import phybema_root_dir
+from dist_matrix import DistanceMatrix
+from nejo_idx import NeighborJoining, gt_neighborjoining_compute, tree2newick
+
+ROOT_DIR = phybema_root_dir()
+
+def run_nj(newickfilepath, inputfile, precision=5):
+  '''
+  Read the distance matrix from inputfile, compute the neighbor joining
+  tree and write its Newick string, followed by a newline, to
+  newickfilepath. Branch lengths are written with precision digits after
+  the decimal point. If the output file cannot be opened, the error is
+  reported on stderr and the program exits with code 1.
+  '''
+  nejo = NeighborJoining(DistanceMatrix(inputfile))
+  gt_neighborjoining_compute(nejo)
+  output = []
+  tree2newick(output,nejo,precision)
+  try:
+    stream = open(newickfilepath,"w")
+  except IOError as err:
+    sys.stderr.write('{}: {}\n'.format(sys.argv[0],err))
+    sys.exit(1)
+  # the with-statement closes the file even if writing fails
+  with stream:
+    stream.write('{}\n'.format(''.join(output)))
+
+# Function for writing the distancefile
+def compare(reftree, datafile, datafilename, tcdistmetriclist,
+            njconcatfilepath, newickfilepathlist, prognamelist, nrf_metric,
+            nrf_list, out_dir, temp_dir):
+  '''
+  Compare the NJ-trees of one dataset and write the distance file.
+
+  With a reference tree, every NJ-tree is compared with reftree: TreeCmp
+  is run if tcdistmetriclist is not empty, and the table is written to
+  out_dir by print_comparison.print_refcomparison. Without a reference
+  tree the NJ-trees are compared pairwise, provided that prognamelist
+  names more than one tool; otherwise only a message is written to stderr.
+  '''
+  # Comparison with reftree
+  if reftree:
+    # Getting the name of the reftreefile:
+    reffilename = os.path.splitext(os.path.basename(reftree))[0]
+
+    # Paths for outfiles:
+    distoutfilepath = ("{}/{}_ref_{}_dist.txt"
+                       .format(out_dir,datafilename, reffilename))
+    tcoutfilepath = "{}/{}_Treecmp_out.txt".format(temp_dir,datafilename)
+
+    # Run TreeCmp with the given distmetrics
+    if tcdistmetriclist:
+      treecmp.run_TreeCmp(tcdistmetriclist, reftree, njconcatfilepath,
+                          tcoutfilepath, ROOT_DIR)
+
+    # Write the file with the distances
+    print_comparison.print_refcomparison(datafile, reftree, distoutfilepath,
+                                prognamelist, tcdistmetriclist, tcoutfilepath,
+                                nrf_metric, nrf_list)
+  # Comparing each Tree with the others (if more than one tool was selected)
+  elif len(prognamelist) > 1:
+    # Paths for outfiles:
+    distoutfilepath = "{}/{}_dist.txt".format(out_dir,datafilename)
+    tcoutfilepath = "{}/{}_TreeCmp_out.txt".format(temp_dir, datafilename)
+
+    # Run TreeCmp with the given distmetrics
+    if tcdistmetriclist:
+      treecmp.run_TreeCmp(tcdistmetriclist, None, njconcatfilepath,
+                          tcoutfilepath, ROOT_DIR)
+    # nrf_matrix must be bound even if nrf was not requested, since it is
+    # passed on below; it is only read there if nrf_metric is set
+    nrf_matrix = None
+    if nrf_metric:
+      nrf_matrix = ete.compute_nrf_matrix(newickfilepathlist)
+
+    # Write the file with the distances
+    print_comparison.print_matrixcomparison(datafile, distoutfilepath,
+                                            prognamelist, tcdistmetriclist,
+                                            tcoutfilepath,
+                                            nrf_metric, nrf_matrix)
+  else:
+    sys.stderr.write(("# For datafile {} no reference tree was given and not "
+                      "enough tools have been selected for a pairwise "
+                      "comparison.\n").format(datafile))
+
+args = parser.parse_command_line()
+check_req()
+
+# list with datafile,reftree tuple
+datatuplelist = parser.create_datatuplelist(args.datasetpaths)
+
+# list of the valid distance metric names
+valid_metrics = ["ms", "nrf", "pd", "qt", "rf"]
+# list with the chosen distance metrics for TreeCmp
+tcdistmetriclist = []
+# Bool if nrf was chosen as metric
+nrf_metric = False
+
+# Read in the dist metrics: if none is given, only nrf is used (this is
+# the default announced in the help text). nrf is computed via ete, all
+# other metrics are collected for TreeCmp.
+if not args.dist:
+  nrf_metric = True
+else:
+  for met in args.dist:
+    # Check if given metric is valid
+    if met not in valid_metrics:
+      sys.stderr.write("{} is not a valid distance metric\n".format(met))
+      sys.exit(1)
+    if met == "nrf":
+      nrf_metric = True
+    else:
+      tcdistmetriclist.append(met)
+
+# Directories for the generated files
+temp_dir = "{}/temp".format(ROOT_DIR)
+out_dir = "{}/out".format(ROOT_DIR)
+create_dir.create_dir(temp_dir)
+create_dir.create_dir(out_dir)
+
+# Generating files for every dataset
+for datafile,reftree in datatuplelist:
+  sys.stderr.write('# datafile {}\n'.format(datafile))
+  # the name of the directory containing the datafile identifies the
+  # dataset in the names of all generated files
+  dirlist = os.path.normpath(datafile).split(os.sep)
+  assert len(dirlist) >= 2
+  testset_key = dirlist[-2]
+  # Getting the name of the datafile:
+  datafilename = os.path.splitext(os.path.basename(datafile))[0]
+
+  estimators = estimator_choices(ROOT_DIR,args.tools, datafilename)
+  prognamelist = [est.name() for est in estimators]
+
+  # list of the newickfilepaths
+  newickfilepathlist = []
+  # List with the nrf values (for cases with reftree and nrf)
+  nrf_list = []
+  # List for the computation times. It has to be emptied for every
+  # dataset; otherwise the runtime file of a dataset would also contain
+  # the columns of all datasets processed before.
+  timelist = []
+
+  njconcatfilepath = "{}/{}_nj_concatfile.nh".format(temp_dir,testset_key)
+  timefilepath = "{}/{}_runtime.txt".format(out_dir,testset_key)
+
+  for estimator in estimators:
+    progname = estimator.name()
+    call_list = estimator.call_list(datafile)
+    fixed_out_file_name = estimator.fixed_out_file_name()
+
+    # The matrix is kept in out only if option --mat was given
+    progoutfilepath = ("{}/{}_{}.mat"
+                       .format(out_dir if args.mat else temp_dir,
+                               testset_key,progname))
+    newickfilepath = ("{}/{}_{}.nh"
+                      .format(temp_dir,testset_key,progname))
+    # program call
+    time_taken = callProg.run_prog(progname,call_list,fixed_out_file_name,
+                                   progoutfilepath)
+    timelist.append((progname,time_taken))
+
+    # Getting a file with the newickstring using neighborjoining
+    run_nj(newickfilepath, progoutfilepath)
+    newickfilepathlist.append(newickfilepath)
+
+    # Computing Tree from the determined newick file
+    if args.out_treefile_suffix != "none":
+      outtreefilepath = ("{}/{}_{}_tree.{}"
+                        .format(out_dir,testset_key,progname,
+                                args.out_treefile_suffix))
+      ete.show_tree(newickfilepath, outtreefilepath)
+
+    # Get the normalised robinson foulds distance from ete toolkit,
+    # if option was chosen
+    if reftree and nrf_metric:
+      nrf_list.append(ete.compare_tree(newickfilepath, reftree))
+
+  # Concatenate the newickfiles; a single file is used directly
+  if len(newickfilepathlist) > 1:
+    concatFiles.concat_files(newickfilepathlist, njconcatfilepath)
+  elif newickfilepathlist:
+    njconcatfilepath = newickfilepathlist[0]
+
+  # write the timefile
+  write_time.write_time(timefilepath, timelist, testset_key)
+
+  # print the distances and write them into a file
+  compare(reftree, datafile, testset_key, tcdistmetriclist,
+          njconcatfilepath, newickfilepathlist, prognamelist, nrf_metric,
+          nrf_list, out_dir, temp_dir)
--- a/src/print_comparison.py
+++ b/src/print_comparison.py
+#!/usr/bin/env python3
+
+'''
+Script to write a file with the distances determined by TreeCmp
+ordered by the estimators.
+The script needs the format of the TreeCmp outfile of version 1.1.
+'''
+
+import openFile
+
+# Function to write the tablehead with the distance metrics
+def print_metrics_tablehead(metricnamelist, outfile, tablehead):
+  '''
+  Append the metric names, each preceded by a tab, to tablehead and emit
+  the resulting line to outfile as well as to stdout.
+  '''
+  columns = ["\t{}".format(met) for met in metricnamelist]
+  headline = tablehead + "".join(columns)
+  outfile.write(headline + "\n")
+  print(headline)
+
+# Function to create a list with the names of the metrics
+def get_metricnamelist(tcdistmetriclist, tcoutfilepath, nrf_metric):
+  '''
+  Return the list of metric names used as column heads. If
+  tcdistmetriclist is not empty, the names are taken from the first line
+  of the TreeCmp output file, beginning with its fourth tab-separated
+  column. If nrf_metric is set, "normalisedR-F" is appended.
+  '''
+  names = []
+  if tcdistmetriclist:
+    tcfile = openFile.open_file(tcoutfilepath)
+    tclines = tcfile.readlines()
+    tcfile.close()
+    names = tclines[0].rstrip().split("\t")[3:]
+  if nrf_metric:
+    names.append("normalisedR-F")
+  return names
+  
+
+# Function to write the distance file for comparing each tree with the others
+def print_matrixcomparison(datafile, distoutfilepath, prognamelist,
+                           tcdistmetriclist, tcoutfilepath, nrf_metric,
+                           nrf_matrix):
+  '''
+  Write the table of pairwise tree distances to distoutfilepath and to
+  stdout.
+
+  After a line naming the datafile and the table head, one line is
+  written for each pair of programs (in the order of prognamelist). If
+  tcdistmetriclist is not empty, the line contains the values from the
+  corresponding line of the TreeCmp output file (beginning with the
+  fourth column); the k-th pair is expected in the k-th line after the
+  head line. If nrf_metric is set, nrf_matrix[i][j] is appended.
+  '''
+  # the with-statement closes the output file also in case of an error
+  with openFile.open_file(distoutfilepath, "w") as outfile:
+    # write the datafile
+    outfile.write("datafile: {}\n".format(datafile))
+    print("datafile: {}".format(datafile))
+
+    # Get the distance metric names
+    metricnamelist = get_metricnamelist(tcdistmetriclist,tcoutfilepath,
+                                        nrf_metric)
+
+    # Write the tablehead: Tree1  Tree2   distmetric1 ...
+    print_metrics_tablehead(metricnamelist, outfile, "Program1\tProgram2")
+
+    # read the TreeCmp output once instead of once per pair of programs
+    tclines = []
+    if tcdistmetriclist:
+      with openFile.open_file(tcoutfilepath) as infile:
+        tclines = infile.readlines()
+
+    # Write Tree1 Tree2 and the distances
+    linecount = 1
+    for i, tree1 in enumerate(prognamelist):
+      for j in range(i+1, len(prognamelist)):
+        outline = "{}\t{}".format(tree1, prognamelist[j])
+
+        if tcdistmetriclist:
+          for value in tclines[linecount].rstrip().split("\t")[3:]:
+            outline = outline + "\t{}".format(value)
+
+        if nrf_metric:
+          outline = outline + "\t{}".format(nrf_matrix[i][j])
+
+        # Write into the outfile
+        outfile.write(outline + "\n")
+        print(outline)
+
+        linecount = linecount + 1
+
+# Function to write the distance file for comparing with a reference tree
+def print_refcomparison(datafile, reftree, distoutfilepath, prognamelist,
+                        tcdistmetriclist, tcoutfilepath, nrf_metric, nrf_list):
+  '''
+  Write the table of distances to the reference tree to distoutfilepath
+  and to stdout.
+
+  After the lines naming datafile and reftree and the table head, one
+  line is written for each program in prognamelist. If tcdistmetriclist
+  is not empty, the line contains the values from the corresponding line
+  of the TreeCmp output file (beginning with the fourth column); the
+  values of program number k are expected in line k+1 of that file. If
+  nrf_metric is set, nrf_list[k] is appended.
+  '''
+  # the with-statement closes the output file also in case of an error
+  with openFile.open_file(distoutfilepath, "w") as outfile:
+    # write the datafile and reftree names
+    outfile.write("datafile:{}\nreftree:{}\n".format(datafile,reftree))
+    print("datafile:{}".format(datafile))
+    print("reftree:{}".format(reftree))
+
+    # Get the distance metric names
+    metricnamelist = get_metricnamelist(tcdistmetriclist, tcoutfilepath,
+                                        nrf_metric)
+
+    # Write the tablehead: Program   distmetric1 ...
+    print_metrics_tablehead(metricnamelist, outfile, "Program")
+
+    # read the TreeCmp output once instead of once per program
+    tclines = []
+    if tcdistmetriclist:
+      with openFile.open_file(tcoutfilepath) as infile:
+        tclines = infile.readlines()
+
+    # Write the programs and their distances
+    for prognum, progname in enumerate(prognamelist):
+      outline = "{}".format(progname)
+
+      if tcdistmetriclist:
+        for value in tclines[prognum+1].rstrip("\n").split("\t")[3:]:
+          outline = outline + "\t{}".format(value)
+
+      if nrf_metric:
+        outline = outline + "\t{}".format(nrf_list[prognum])
+
+      # Write into the outfile
+      outfile.write(outline + "\n")
+      print(outline)
--- a/src/root_dir.py
+++ b/src/root_dir.py
+#!/usr/bin/env python3
+
+'''
+helper script for getting the root directory of phybema
+
+'''
+
+import os.path
+
+def phybema_root_dir():
+  '''
+  Return the absolute path of the directory which contains the directory
+  of this source file; this is used as the root directory of phybema.
+  '''
+  source_path = os.path.abspath(__file__)
+  return os.path.dirname(os.path.dirname(source_path))
--- a/src/test_requirements.py
+++ b/src/test_requirements.py
+#!/usr/bin/env python3
+
+'''
+Script to test if needed programs can be found in the path.
+
+author: Birgitta Päuker
+'''
+
+import shutil, sys, subprocess, re
+
+# Function to test if needed programs can be found in the path.
+def check_req():
+  '''
+  Check that the required external programs are available: mash in
+  version >= 2.1, ete3, and python3 in version >= 3.5. The first unmet
+  requirement is reported on stderr and the program exits with code 1.
+  '''
+  def die(message):
+    # message contains a '{}' placeholder for the name of the script
+    sys.stderr.write(message.format(sys.argv[0]))
+    sys.exit(1)
+
+  def version_of(program, failure_message):
+    # Run "<program> --version" and return (major, minor) as integers,
+    # taken from the last whitespace-separated word of the output.
+    # Tuples of integers are compared instead of floats, since a float
+    # reads 3.10 as 3.1 and cannot represent versions like 2.2.1.
+    try:
+      result = subprocess.run([program, "--version"],stdout=subprocess.PIPE,
+                              stderr=subprocess.PIPE,universal_newlines=True)
+    except (OSError, subprocess.SubprocessError):
+      die(failure_message)
+    words = result.stdout.split()
+    found = re.match(r'v?(\d+)(?:\.(\d+))?', words[-1]) if words else None
+    if found is None:
+      die(failure_message)
+    return int(found.group(1)), int(found.group(2) or 0)
+
+  # Test if mash (version >=2.1) is in the path
+  if not shutil.which('mash'):
+    die("{}: mash is not in your path.\n")
+  if version_of('mash', "{}:Checking mash version failed\n") < (2, 1):
+    die("{}: mash version >= 2.1 needed.\n")
+
+  # Test if ete3 is in the path
+  if not shutil.which('ete3'):
+    die("{}: ete3 is not in your path.\n")
+
+  # Test if python (version >=3.5) is in the path
+  if not shutil.which('python3'):
+    die("{}: python3 is not in your path.\n")
+  if version_of('python3', "{}: Checking python3 version failed\n") < (3, 5):
+    die("{}: python3 version >= 3.5 needed.\n")
--- a/src/write_time.py
+++ b/src/write_time.py
+#!/usr/bin/env python3
+
+'''
+Script to write a file with the computation times of the estimators.
+
+author: Birgitta Päuker
+'''
+
+import os
+import openFile
+
+# Function to write the computation times into a file
+def write_time(timefilepath, timelist, datafilename):
+  '''
+  Write the computation times to timefilepath as a table of two
+  tab-separated lines: "Dataset" followed by the program names, and
+  datafilename followed by the time of each program. timelist is a
+  sequence of pairs (program name, time).
+  '''
+  timefile = openFile.open_file(timefilepath, "w")
+
+  # first line: names of the estimators
+  headline = ["Dataset"]
+  headline.extend("{}".format(progname) for progname,_ in timelist)
+  # second line: the dataset and the times in the same order
+  timeline = ["{}".format(datafilename)]
+  timeline.extend("{}".format(taken) for _,taken in timelist)
+
+  timefile.write("\t".join(headline) + "\n")
+  timefile.write('{}\n'.format("\t".join(timeline)))
+  timefile.close()