diff --git a/src/anicalc/__init__.py b/src/anicalc/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/src/anicalc/anicalc.py b/src/anicalc/anicalc.py deleted file mode 100644 index d39953b1c948f1a2881995092cafc38605766180..0000000000000000000000000000000000000000 --- a/src/anicalc/anicalc.py +++ /dev/null @@ -1,116 +0,0 @@ -""" Calculation of Average nucleotide indentity (ANI)""" -import os -import numpy as np -from anicalc.fasta_iterator import fasta_next, write_sequence -from anicalc.nucmer import compute_delta, nucmer_calculate_alignments_list - - -def extend_path(program): - """ - Function to extend the a program to the full length path - with systems file path if required and checks if program is executable. - @param program: program filepath or name - @return: full filepath or error if not found or not executable - """ - fpath, _ = os.path.split(program) - if fpath: - if os.path.isfile(program) and os.access(program, os.X_OK): - return os.path.abspath(program) - raise FileNotFoundError( - "{} not found or not executable".format(program)) - for path in os.environ["PATH"].split(os.pathsep): - exe_file = os.path.abspath(os.path.join(path, program)) - if os.path.isfile(exe_file) and os.access(exe_file, os.X_OK): - return exe_file - raise FileNotFoundError("{} not found or not executable".format(program)) - - -def create_fragments(infile, outputfilepath, min_fragment_size, fragment_size): - """ - Creates fragments based on an fasta file (for later alignments) Output files - get the suffixes '.fa' for the original sequences and '.fa.split' for - the fragments. Filenames based on FASTA-headers. - @param infile: (multi-) FASTA file with the input sequence(s) - @param outputfilepath: filepath were the output should be written to - @param min_fragment_size: minimal fragment size a fragment should have - @param fragment_size: Usual size of the output fragments - @return: list of created file prefixes - """ - files = list() - for header, sequence in fasta_next(infile): - save_header = header.split(' ')[0] - save_header = save_header.split('|')[0] - files.append(save_header) - outputfile = '{}/{}.fa'.format(outputfilepath, save_header) - with open(outputfile, 'w') as file: - file.write('>' + header + '\n') - write_sequence(file, sequence) - with open(outputfile + '.split', 'w') as file: - idx = 0 - while len(sequence) > (fragment_size + min_fragment_size): - idx += 1 - fragment = sequence[:fragment_size] - file.write('>' + str(idx) + '\n') - file.write(fragment + '\n') - sequence = sequence[fragment_size:] - file.write('>' + str(idx + 1) + '\n') - file.write(sequence + '\n') - return files - - -def calculate_ani_single(alignments_list): - """ - Calculates average nucleotide indentity (ANI) from a list of local - alignments. ANI is calculated from all alignment with are at least aligned - to 70% with maximal 30% non identities. - - @param alignments_list: list of alignments represented as triple like - (alignment length, non identities, query sequence length) - @return: Average nucleotide identity - """ - ni_sum = 0 - ani_matches = 0 - for alignment_length, non_identities, query_length in alignments_list: - if (((alignment_length - non_identities) / query_length) > 0.3) and \ - ((alignment_length / query_length) >= 0.7): - ni_sum += (alignment_length - non_identities) / alignment_length - ani_matches += 1 - ani = (ni_sum / ani_matches) if ani_matches > 0 else 0 - return ani - - -def calculate_ani_nucmer(reference, files, tmpdir): - """ - Function to create alignments with nucmer and then calculate ANI for all - reference-query-pairs. Requires to have 'nucmer' and 'delta-filter' in your - path. - @param reference: Prefix of the reference file to use - (Filename should be like reference + '.fa') - @param files: List of rrefix of query filesto use - (Filename should be like query + '.fa.split') - @param tmpdir: Name of the directory where the reference and query files - are located. - (Filenames are expanded to tmpdir + '/' + filename) - @return: reference , same as input (for multiprocessing) - @return: numpy array holding all ANI - (np.nan for the alignment of reference to reference) - """ - nucmer_path = extend_path('nucmer') - delta_filter = extend_path('delta-filter') - file_paths = [compute_delta(nucmer_path, delta_filter, - tmpdir + '/' + reference + '.fa', - tmpdir + '/' + currentfile + '.fa.split', - tmpdir) - if currentfile != reference else None - for currentfile in files] - ani = (calculate_ani_single(nucmer_calculate_alignments_list(qfile)) - if qfile else np.nan for qfile in file_paths) - return reference, np.fromiter(ani, np.float) - -_CALCULATE_ANI_WITH_TOOL = { - 'nucmer': calculate_ani_nucmer -} - -def calculate_ani_with_tool(tool): - """ Wrapper for using different alignment tools""" - return _CALCULATE_ANI_WITH_TOOL[tool] diff --git a/src/anicalc/fasta_iterator.py b/src/anicalc/fasta_iterator.py deleted file mode 100644 index eb18c4db9363d3a74ace618f3c972196b9c070de..0000000000000000000000000000000000000000 --- a/src/anicalc/fasta_iterator.py +++ /dev/null @@ -1,41 +0,0 @@ -""" Provides Iterator over (multi-) FASTA file""" -import re - - -def write_sequence(filestream, sequence): - """ Writes FASTA sequence to filestream using fixed length 60 """ - for startpos in range(0, len(sequence), 60): - filestream.write(sequence[startpos:startpos + 60] + '\n') - - -def print_sequence(seq, linelength=70): - """ Prints FASTA sequence seq to sdtout with linelength (default 70)""" - for startpos in range(0, len(seq), linelength): - print(seq[startpos:startpos + linelength]) - - -def _seq_list2sequence(seq_list): - sequence = ' '.join(seq_list) - return re.sub(r'\s', "", sequence) - - -def fasta_next(filename): - """ - Iterator to get (header, sequence) tuple - from (multi-)FASTA file with filename. - """ - with open(filename, "r") as stream: - seq_list = list() - header = None - for line in stream: - if not re.search(r'^(>|\s*$|\s*#)', line): - seq_list.append(line.rstrip()) - else: - match = re.search(r'^>(.*)', line) - if match is not None: - if header is not None: - yield header, _seq_list2sequence(seq_list) - seq_list.clear() - header = match.group(1) - if header is not None: - yield header, _seq_list2sequence(seq_list) diff --git a/src/anicalc/nucmer.py b/src/anicalc/nucmer.py deleted file mode 100644 index 259312434ef5f5db4609e1e4d528d08200c447bb..0000000000000000000000000000000000000000 --- a/src/anicalc/nucmer.py +++ /dev/null @@ -1,129 +0,0 @@ -""" Module to run nucmer and parse its output""" -import subprocess -import os -import tempfile -import re - - -class Alignment: - """ Class holding information from a .delta file""" - - def __init__(self, reference, query, ref_len, query_len): - self.reference = reference - self.query = query - self.ref_len = ref_len - self.query_len = query_len - self.alignments = list() - - def __len__(self): - return sum([len(alg) for alg in self.alignments]) - - def add_segment(self, reference_coordinates, query_coordinates, *errors): - """ - Function to add an new aligned segment to an alignment object. - (not quite sure if an alignment in delta format can - hold more than one segments) - @param reference_coordinates: tuple, start and end position - in the reference sequence - @param query_coordinates: tuple, start and end position - in the query sequence - @param: errors: triple containing non identities, similarity errors and - stop codons (if protein sequence) - """ - new = AlignmentSegment(reference_coordinates, - query_coordinates, *errors) - self.alignments.append(new) - return new - - def get_errors(self): - """ Returns sum of all non identities over all alignment segments""" - return sum([int(alg.errors) for alg in self.alignments]) - - -class AlignmentSegment: - """ Class holding an alignment segment from .delta file """ - - def __init__(self, reference_coordinates, query_coordinates, *errors): - self.reference_coordinates = reference_coordinates - self.query_coordinates = query_coordinates - self.errors = int(errors[0]) - self.similarity_errors = int(errors[1]) - self.stop_codons = int(errors[2]) - self.nexts = list() - - def __len__(self): - return abs(int(self.query_coordinates[1]) - - int(self.query_coordinates[0])) + 1 - - def add_next(self, value): - """ Adds alignment information""" - self.nexts.append(value) - - -def compute_delta(nucmer_path, delta_filter_path, reference, query, tmp_dir): - """ - Computes .delta files with nucmer based on one reference FASTA and one query - multi-FASTA file. Files are automatically filtered using 'delta-filer'. - @param nucmer_path: absolute path to nucmer - @param delta_filter: absolute path to delta-filter - @param reference: absolute path to reference FASTA file - @param query: absolute path to query multi-FASTA file - @param tmp: absolute path to working directory - @return: the absolute path to the resulting filtered .delta file - """ - with tempfile.TemporaryDirectory(dir=tmp_dir) as tmpdir: - try: - subprocess.check_call([nucmer_path, '--threads=1', reference, - query], cwd=tmpdir, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - except subprocess.CalledProcessError: - subprocess.check_call([nucmer_path, reference, query], - cwd=tmpdir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - - filtered_delta_path = '{}/{}_{}.delta'.format( - tmp_dir, os.path.basename(reference), - os.path.basename(query)) - with open(filtered_delta_path, 'w') as filtered_delta: - subprocess.check_call([delta_filter_path, '-q', tmpdir + '/out.delta'], - stdout=filtered_delta) - return filtered_delta_path - - -def read_delta(filtered_delta_path): - """ Iterator to read an .delta file to Alignment objects """ - current_alignment = None - with open(filtered_delta_path, 'r') as filtered_delta: - for line in filtered_delta: - line = line.rstrip() - if line[0] == '>': - if current_alignment is not None: - yield current_alignment - reference, query, ref_len, query_len = line.split(' ') - current_alignment = Alignment(reference[1:], query, - int(ref_len), int(query_len)) - elif re.match(r'\d+\s\d+\s\d+\s\d+\s\d+\s\d+', line): - split_line = line.split(' ') - current_segment = current_alignment.add_segment( - split_line[0:2], - split_line[2:4], - *split_line[4:] - ) - elif re.match(r'\d+', line): - split_line = line.split(' ') - if split_line[0] != '0': - current_segment.add_next(int(split_line[0])) - if current_alignment is not None: - yield current_alignment - - -def nucmer_calculate_alignments_list(file): - """ Function to create an list of triples as input for - anicalc.calculate_ani_single from an .delta file - """ - if file is not None: - output = list() - for alignment in read_delta(file): - output.append( - (len(alignment), alignment.get_errors(), alignment.query_len)) - return output - return None diff --git a/src/calculate_ani.py b/src/calculate_ani.py deleted file mode 100755 index 4beacd173f7a2245fe9a35d5d8b4e15909345824..0000000000000000000000000000000000000000 --- a/src/calculate_ani.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python3 -import sys -from os.path import abspath -from tempfile import TemporaryDirectory -from itertools import repeat -from multiprocessing import Pool -import numpy as np -import shutil - -from anicalc.anicalc import create_fragments, calculate_ani_with_tool - -def convert_raw_results_to_phybema(raw_results): - num_sequences = len(raw_results) - output = list() - output.append('{}'.format(num_sequences)) - for i in range(num_sequences): - line = list() - line.append('{}'.format(raw_results[i][0])) - line.append('0.0') - for j in range(0,i): - ani1 = raw_results[i][1][j] - ani2 = raw_results[j][1][i] - avg_ani = (ani1+ani2)/2.0 - line.append('{:.8f}'.format(1.0 - avg_ani)) - output.append('\t'.join(line)) - print('\n'.join(output)) - -DEBUG = False - -if not DEBUG: - sys.tracebacklimit = 0 - -def main(args): - for prog in ["nucmer","dnadiff"]: - if not shutil.which(prog): - sys.stderr.write('{}: cannot find {} executable\n' - .format(sys.argv[0],prog)) - exit(1) - tmp_dir = abspath(args.tmpdir) if args.tmpdir else None - with TemporaryDirectory(dir=tmp_dir) as tmpdir: - files = create_fragments(args.infile, tmpdir, - args.min_fragment_size, args.fragment_size) - - pool = Pool(args.threads) - raw_results = pool.starmap(calculate_ani_with_tool(args.tool), - zip(files, repeat(files), repeat(tmpdir))) - pool.close() - pool.join() - if args.phybema: - convert_raw_results_to_phybema(raw_results) - else: - index, values = zip(*raw_results) - results = np.stack(values, axis=0) - if not args.disable_pandas: - data = pd.DataFrame(results, index=index, columns=files) - data.to_csv(args.outprefix + '_ani.csv') - else: - np.savetxt(args.outprefix + '_ani.csv', - results, delimiter=',', header=','.join(files)) - -if __name__ == '__main__': - import argparse - PARSER = argparse.ArgumentParser(description='') - PARSER.add_argument('tool', choices=['nucmer'], help="Alignment tool") - PARSER.add_argument('--threads', type=int, default=1, help="") - PARSER.add_argument('--disable-pandas', action='store_true', - help="disable pandas output") - PARSER.add_argument('--phybema', action='store_true', - help="show format in output suitable for phybema") - PARSER.add_argument('--tmpdir', type=str, default=None, - help=('Path to directory for temporary storage ' - ' (default=system default)')) - PARSER.add_argument('--outprefix', type=str, default="tmp", - help="specify prefix of output file") - PARSER.add_argument('--min-fragment-size', type=int, default=100, help="") - PARSER.add_argument('--fragment-size', type=int, default=1020, help="") - PARSER.add_argument('infile', type=str, help=('name of file with sequences ' - 'in fasta format')) - - ARGS = PARSER.parse_args() - if not ARGS.disable_pandas: - import pandas as pd - - main(ARGS) diff --git a/src/dnadiff.sh b/src/dnadiff.sh deleted file mode 100755 index 7ac96d25fea196d34a2fafa133426ad2c0a7ad02..0000000000000000000000000000000000000000 --- a/src/dnadiff.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh - -if test $# -ne 1 -then - echo "Usage: $0 <inputfile>" - exit 1 -fi - -inputfile=$1 -src/calculate_ani.py --phybema nucmer ${inputfile} diff --git a/src/estimators.py b/src/estimators.py index 851a60098fa178393fc79e60de84947ea5192e38..21f3fc6b0d1e31837fe70be75bb148f695e31e3e 100644 --- a/src/estimators.py +++ b/src/estimators.py @@ -65,35 +65,14 @@ def estimator_choices(root_dir,choices,dataset, only_names=False): call_options=["triangle"], fixed_out_file_name=None, optional_end=None) - seed_extend = DistanceEstimator(call=("{}/src/seed_extend.sh".format(root_dir)), - call_options=[], - fixed_out_file_name=None, - optional_end=None) - dnadiff = DistanceEstimator(call="{}/src/dnadiff.sh".format(root_dir), - call_options=[], - fixed_out_file_name=None, - optional_end=None) - # the fswm version in phybema has been changed. There is an -o option to - # specify the outfilepath. It is not allowed to contain whitespaces - fswm = DistanceEstimator(call="{}/src_external/fswm/fswm".format(root_dir), - call_options=["-t 1","-o {}_fswm_DMat".format(dataset)], - fixed_out_file_name="{}_fswm_DMat".format(dataset), - optional_end=None) - - # The kmacs version in Phybema has been changed. It is mandatory to give an - # outfilepath after the k value. - kmacs = DistanceEstimator("{}/src_external/kmacs/kmacs".format(root_dir), - [], - fixed_out_file_name="{}_kmacs_DMat".format(dataset), - optional_end=["17", "{}_kmacs_DMat".format(dataset)]) ''' ADD YOUR TOOL HERE specify new tool here as instance of class DistanceEstimator and add the instance to the following list ''' - all_estimators = [andi,fswm,kmacs,mash,seed_extend,dnadiff] + all_estimators = [andi,mash] all_estimators = [a for a in all_estimators if shutil.which(a.call())] estimator_dict = {est.name() : est for est in all_estimators} diff --git a/testsuite/test_rep.sh b/testsuite/test_rep.sh index 9aee462e1dd9352c6326ec7e7c2e2b8af7bbab42..1cd24ad0e730a2582d0d85b0ec74d96ef9fc8c07 100755 --- a/testsuite/test_rep.sh +++ b/testsuite/test_rep.sh @@ -7,7 +7,7 @@ then fi subdir=$1 -src/phybema.py -o none --tools mash andi dnadiff -- testdata/${subdir}/ +src/phybema.py -o none --tools mash andi -- testdata/${subdir}/ for prog in andi mash do for suffix in mat nh diff --git a/testsuite/testset1_dnadiff.mat b/testsuite/testset1_dnadiff.mat deleted file mode 100644 index 0f65a969d688738f7a0439374db4510ff2be3fd0..0000000000000000000000000000000000000000 --- a/testsuite/testset1_dnadiff.mat +++ /dev/null @@ -1,5 +0,0 @@ -4 -tgen_1 0.0 -tgen_2 0.0 0.00090939 -tgen_3 0.0 0.00083935 0.00051365 -tgen_4 0.0 0.00101731 0.00153718 0.00200906 diff --git a/testsuite/testset1_dnadiff.nh b/testsuite/testset1_dnadiff.nh deleted file mode 100644 index a63da1249975f2de40ada68348888e78e7300f20..0000000000000000000000000000000000000000 --- a/testsuite/testset1_dnadiff.nh +++ /dev/null @@ -1 +0,0 @@ -(tgen_2,((tgen_4:0.00064,tgen_1:-0.00064):0.00022,tgen_3:0.00055):0.00029);