#!/usr/bin/env python3

'''
Script to convert a PHYLIP distance matrix into the format
needed for neighbor joining.

The input has to be a lower-left triangular or a full matrix.
Names are cut after 10 characters and must not contain blanks
or other separators.

author: Birgitta Päuker
'''

# NOTE(review): this module text was recovered from a git diff that deleted
# src/PHYLIP_format_converter.py; the same diff also removed the line
# "import PHYLIP_format_converter as fc" from src/phybema.py.

import sys, re, argparse
import numpy as np
import openFile


def _abort(message):
    '''Write message to stderr and terminate the process with exit status 1.'''
    sys.stderr.write(message)
    sys.exit(1)


def _distance(token, progoutfilepath):
    '''Return token as a float, aborting if it is not a valid number.'''
    try:
        return float(token)
    except ValueError:
        _abort("There are characters in the distancematrix in file "
               "{}\n".format(progoutfilepath))


def _convert_lines(lines, outfile, progname, progoutfilepath):
    '''Parse the PHYLIP matrix in lines and write the converted form to outfile.'''
    # The first line has to hold the number of sequences.
    header = re.search(r'\w+', lines[0]) if lines else None
    num_genomes = header[0] if header else ''
    if not num_genomes.isdigit():
        _abort("{}: The first line of the distmatrix from {} needs to "
               "be the number of sequences\n".format(sys.argv[0], progname))
    outfile.write("# pairwise dist values for {} sequences\n"
                  .format(num_genomes))

    count = int(num_genomes)
    distmatrix = np.zeros((count, count))
    linegenomeidx = 0
    testfullmatrix = False

    for line in lines[1:]:
        # Regular PHYLIP files separate columns with "\t", but some tools
        # use blanks, so tabs are mapped to blanks before splitting.
        values = [token
                  for token in line.replace("\t", " ").strip().split(' ')
                  if token]

        # Blank lines (for example a trailing empty line) carry no data.
        if not values:
            continue

        if linegenomeidx >= count:
            _abort("{}: The distmatrix of {} contains more sequences than "
                   "the {} announced in its first line.\n"
                   .format(sys.argv[0], progname, num_genomes))

        # Replace a nan in the distance columns with distance 1.0.  The
        # replacement is done by position so that a sequence that happens
        # to be named "nan" is never overwritten.
        for pos in range(1, len(values)):
            value = values[pos]
            if not value.isalpha():
                continue
            if value != "nan":
                _abort("There are characters in the distancematrix in file "
                       "{}\n".format(progoutfilepath))
            values[pos] = 1.0
            sys.stdout.write("# Replaced the value nan in {} with distance "
                             "1.0\n".format(progoutfilepath))

        # Each row needs at least one distance per previously read sequence.
        if len(values) < linegenomeidx + 1:
            _abort("{}: In the distmatrix of {} are not enough distances "
                   "per sequence.\n".format(sys.argv[0], progname))
        if len(values) - 1 > count:
            _abort("{}: In the distmatrix of {} are too many distances "
                   "per sequence.\n".format(sys.argv[0], progname))

        # Write the name of the genome (10 characters) with its index.
        name = values[0]
        outfile.write("{}\t{}\n".format(linegenomeidx, name[:10]))

        # Lower-left triangle: distance from every already processed genome
        # to the genome of the current line.
        for col in range(linegenomeidx):
            distmatrix[linegenomeidx][col] = _distance(values[col + 1],
                                                       progoutfilepath)

        # More columns than name + lower triangle + diagonal means a full
        # matrix; keep the upper part for the symmetry test below.
        if len(values) > linegenomeidx + 2:
            testfullmatrix = True
            for col in range(linegenomeidx + 1, len(values) - 1):
                distmatrix[linegenomeidx][col] = _distance(values[col + 1],
                                                           progoutfilepath)

        linegenomeidx += 1

    # A full matrix has to be symmetric.  This catches output such as that
    # of fswm and kmacs, where no separator stands between the name and
    # the first distance.
    if testfullmatrix and (distmatrix != distmatrix.T).any():
        _abort("{}: The distancmatrix of {} can not be converted "
               "into needed format.\nPlease search for the cause in"
               " file {}.\nA possible reason might be a missing "
               "blank between genomename and first distance.\n"
               .format(sys.argv[0], progname, progoutfilepath))

    # NOTE(review): fewer data rows than announced in the first line are
    # accepted silently; only the rows actually read are written.
    for lineidx in range(linegenomeidx):
        for rowidx in range(lineidx + 1, linegenomeidx):
            # distance from the line genome to the row genome
            dist = distmatrix[rowidx][lineidx]
            outfile.write("dist {} {} {:.10f}\n".format(lineidx, rowidx, dist))


def convert_format(progname, progoutfilepath, convoutfilepath):
    '''
    Convert a PHYLIP distance matrix into the format needed for
    neighbor joining.

    progname:        name of the program that produced the matrix; only
                     used in error messages.
    progoutfilepath: path of the PHYLIP file to read.
    convoutfilepath: path of the converted file to write.

    The output starts with a "# pairwise dist values for N sequences"
    line, followed by one "index<TAB>name" line per sequence and one
    "dist i j value" line per pair with i < j.

    On malformed input a message is written to stderr and the process
    is terminated with exit status 1 (SystemExit).
    '''
    infile = openFile.open_file(progoutfilepath)
    try:
        lines = infile.readlines()
    finally:
        infile.close()

    outfile = openFile.open_file(convoutfilepath, 'w')
    try:
        _convert_lines(lines, outfile, progname, progoutfilepath)
    finally:
        # Also close (and thereby flush) the output on the error paths.
        outfile.close()