From 6d1cc041a83fc765b483078d3b3963e35f2b070c Mon Sep 17 00:00:00 2001 From: Stefan Kurtz <kurtz@zbh.uni-hamburg.de> Date: Fri, 12 Jul 2019 21:07:46 +0200 Subject: [PATCH] Added program to calculate ANI based on nucmer difference to original version of Phybema: distance estimator outputs result as matrix in PHYLIP format --- Makefile | 1 + src/anicalc/__init__.py | 0 src/anicalc/anicalc.py | 116 +++++++++++++++++++++++++++++ src/anicalc/fasta_iterator.py | 41 +++++++++++ src/anicalc/nucmer.py | 129 +++++++++++++++++++++++++++++++++ src/calculate_ani.py | 16 ++-- src/dnadiff.sh | 10 +++ testsuite/test_rep.sh | 2 +- testsuite/testset1_dnadiff.mat | 5 ++ testsuite/testset1_dnadiff.nh | 1 + 10 files changed, 314 insertions(+), 7 deletions(-) create mode 100644 src/anicalc/__init__.py create mode 100644 src/anicalc/anicalc.py create mode 100644 src/anicalc/fasta_iterator.py create mode 100644 src/anicalc/nucmer.py create mode 100755 src/dnadiff.sh create mode 100644 testsuite/testset1_dnadiff.mat create mode 100644 testsuite/testset1_dnadiff.nh diff --git a/Makefile b/Makefile index 16f838d..8a52da2 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ test:testsuite/test_rep.sh @testsuite/test_rep.sh testset1 + @echo "all tests passed" clean: @${RM} -r temp out diff --git a/src/anicalc/__init__.py b/src/anicalc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/anicalc/anicalc.py b/src/anicalc/anicalc.py new file mode 100644 index 0000000..d39953b --- /dev/null +++ b/src/anicalc/anicalc.py @@ -0,0 +1,116 @@ +""" Calculation of Average nucleotide indentity (ANI)""" +import os +import numpy as np +from anicalc.fasta_iterator import fasta_next, write_sequence +from anicalc.nucmer import compute_delta, nucmer_calculate_alignments_list + + +def extend_path(program): + """ + Function to extend the a program to the full length path + with systems file path if required and checks if program is executable. + @param program: program filepath or name + @return: full filepath or error if not found or not executable + """ + fpath, _ = os.path.split(program) + if fpath: + if os.path.isfile(program) and os.access(program, os.X_OK): + return os.path.abspath(program) + raise FileNotFoundError( + "{} not found or not executable".format(program)) + for path in os.environ["PATH"].split(os.pathsep): + exe_file = os.path.abspath(os.path.join(path, program)) + if os.path.isfile(exe_file) and os.access(exe_file, os.X_OK): + return exe_file + raise FileNotFoundError("{} not found or not executable".format(program)) + + +def create_fragments(infile, outputfilepath, min_fragment_size, fragment_size): + """ + Creates fragments based on an fasta file (for later alignments) Output files + get the suffixes '.fa' for the original sequences and '.fa.split' for + the fragments. Filenames based on FASTA-headers. + @param infile: (multi-) FASTA file with the input sequence(s) + @param outputfilepath: filepath were the output should be written to + @param min_fragment_size: minimal fragment size a fragment should have + @param fragment_size: Usual size of the output fragments + @return: list of created file prefixes + """ + files = list() + for header, sequence in fasta_next(infile): + save_header = header.split(' ')[0] + save_header = save_header.split('|')[0] + files.append(save_header) + outputfile = '{}/{}.fa'.format(outputfilepath, save_header) + with open(outputfile, 'w') as file: + file.write('>' + header + '\n') + write_sequence(file, sequence) + with open(outputfile + '.split', 'w') as file: + idx = 0 + while len(sequence) > (fragment_size + min_fragment_size): + idx += 1 + fragment = sequence[:fragment_size] + file.write('>' + str(idx) + '\n') + file.write(fragment + '\n') + sequence = sequence[fragment_size:] + file.write('>' + str(idx + 1) + '\n') + file.write(sequence + '\n') + return files + + +def calculate_ani_single(alignments_list): + """ + Calculates average nucleotide indentity (ANI) from a list of local + alignments. ANI is calculated from all alignment with are at least aligned + to 70% with maximal 30% non identities. + + @param alignments_list: list of alignments represented as triple like + (alignment length, non identities, query sequence length) + @return: Average nucleotide identity + """ + ni_sum = 0 + ani_matches = 0 + for alignment_length, non_identities, query_length in alignments_list: + if (((alignment_length - non_identities) / query_length) > 0.3) and \ + ((alignment_length / query_length) >= 0.7): + ni_sum += (alignment_length - non_identities) / alignment_length + ani_matches += 1 + ani = (ni_sum / ani_matches) if ani_matches > 0 else 0 + return ani + + +def calculate_ani_nucmer(reference, files, tmpdir): + """ + Function to create alignments with nucmer and then calculate ANI for all + reference-query-pairs. Requires to have 'nucmer' and 'delta-filter' in your + path. + @param reference: Prefix of the reference file to use + (Filename should be like reference + '.fa') + @param files: List of rrefix of query filesto use + (Filename should be like query + '.fa.split') + @param tmpdir: Name of the directory where the reference and query files + are located. + (Filenames are expanded to tmpdir + '/' + filename) + @return: reference , same as input (for multiprocessing) + @return: numpy array holding all ANI + (np.nan for the alignment of reference to reference) + """ + nucmer_path = extend_path('nucmer') + delta_filter = extend_path('delta-filter') + file_paths = [compute_delta(nucmer_path, delta_filter, + tmpdir + '/' + reference + '.fa', + tmpdir + '/' + currentfile + '.fa.split', + tmpdir) + if currentfile != reference else None + for currentfile in files] + ani = (calculate_ani_single(nucmer_calculate_alignments_list(qfile)) + if qfile else np.nan for qfile in file_paths) + return reference, np.fromiter(ani, np.float) + +_CALCULATE_ANI_WITH_TOOL = { + 'nucmer': calculate_ani_nucmer +} + +def calculate_ani_with_tool(tool): + """ Wrapper for using different alignment tools""" + return _CALCULATE_ANI_WITH_TOOL[tool] diff --git a/src/anicalc/fasta_iterator.py b/src/anicalc/fasta_iterator.py new file mode 100644 index 0000000..eb18c4d --- /dev/null +++ b/src/anicalc/fasta_iterator.py @@ -0,0 +1,41 @@ +""" Provides Iterator over (multi-) FASTA file""" +import re + + +def write_sequence(filestream, sequence): + """ Writes FASTA sequence to filestream using fixed length 60 """ + for startpos in range(0, len(sequence), 60): + filestream.write(sequence[startpos:startpos + 60] + '\n') + + +def print_sequence(seq, linelength=70): + """ Prints FASTA sequence seq to sdtout with linelength (default 70)""" + for startpos in range(0, len(seq), linelength): + print(seq[startpos:startpos + linelength]) + + +def _seq_list2sequence(seq_list): + sequence = ' '.join(seq_list) + return re.sub(r'\s', "", sequence) + + +def fasta_next(filename): + """ + Iterator to get (header, sequence) tuple + from (multi-)FASTA file with filename. + """ + with open(filename, "r") as stream: + seq_list = list() + header = None + for line in stream: + if not re.search(r'^(>|\s*$|\s*#)', line): + seq_list.append(line.rstrip()) + else: + match = re.search(r'^>(.*)', line) + if match is not None: + if header is not None: + yield header, _seq_list2sequence(seq_list) + seq_list.clear() + header = match.group(1) + if header is not None: + yield header, _seq_list2sequence(seq_list) diff --git a/src/anicalc/nucmer.py b/src/anicalc/nucmer.py new file mode 100644 index 0000000..2593124 --- /dev/null +++ b/src/anicalc/nucmer.py @@ -0,0 +1,129 @@ +""" Module to run nucmer and parse its output""" +import subprocess +import os +import tempfile +import re + + +class Alignment: + """ Class holding information from a .delta file""" + + def __init__(self, reference, query, ref_len, query_len): + self.reference = reference + self.query = query + self.ref_len = ref_len + self.query_len = query_len + self.alignments = list() + + def __len__(self): + return sum([len(alg) for alg in self.alignments]) + + def add_segment(self, reference_coordinates, query_coordinates, *errors): + """ + Function to add an new aligned segment to an alignment object. + (not quite sure if an alignment in delta format can + hold more than one segments) + @param reference_coordinates: tuple, start and end position + in the reference sequence + @param query_coordinates: tuple, start and end position + in the query sequence + @param: errors: triple containing non identities, similarity errors and + stop codons (if protein sequence) + """ + new = AlignmentSegment(reference_coordinates, + query_coordinates, *errors) + self.alignments.append(new) + return new + + def get_errors(self): + """ Returns sum of all non identities over all alignment segments""" + return sum([int(alg.errors) for alg in self.alignments]) + + +class AlignmentSegment: + """ Class holding an alignment segment from .delta file """ + + def __init__(self, reference_coordinates, query_coordinates, *errors): + self.reference_coordinates = reference_coordinates + self.query_coordinates = query_coordinates + self.errors = int(errors[0]) + self.similarity_errors = int(errors[1]) + self.stop_codons = int(errors[2]) + self.nexts = list() + + def __len__(self): + return abs(int(self.query_coordinates[1]) - + int(self.query_coordinates[0])) + 1 + + def add_next(self, value): + """ Adds alignment information""" + self.nexts.append(value) + + +def compute_delta(nucmer_path, delta_filter_path, reference, query, tmp_dir): + """ + Computes .delta files with nucmer based on one reference FASTA and one query + multi-FASTA file. Files are automatically filtered using 'delta-filer'. + @param nucmer_path: absolute path to nucmer + @param delta_filter: absolute path to delta-filter + @param reference: absolute path to reference FASTA file + @param query: absolute path to query multi-FASTA file + @param tmp: absolute path to working directory + @return: the absolute path to the resulting filtered .delta file + """ + with tempfile.TemporaryDirectory(dir=tmp_dir) as tmpdir: + try: + subprocess.check_call([nucmer_path, '--threads=1', reference, + query], cwd=tmpdir, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + except subprocess.CalledProcessError: + subprocess.check_call([nucmer_path, reference, query], + cwd=tmpdir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + filtered_delta_path = '{}/{}_{}.delta'.format( + tmp_dir, os.path.basename(reference), + os.path.basename(query)) + with open(filtered_delta_path, 'w') as filtered_delta: + subprocess.check_call([delta_filter_path, '-q', tmpdir + '/out.delta'], + stdout=filtered_delta) + return filtered_delta_path + + +def read_delta(filtered_delta_path): + """ Iterator to read an .delta file to Alignment objects """ + current_alignment = None + with open(filtered_delta_path, 'r') as filtered_delta: + for line in filtered_delta: + line = line.rstrip() + if line[0] == '>': + if current_alignment is not None: + yield current_alignment + reference, query, ref_len, query_len = line.split(' ') + current_alignment = Alignment(reference[1:], query, + int(ref_len), int(query_len)) + elif re.match(r'\d+\s\d+\s\d+\s\d+\s\d+\s\d+', line): + split_line = line.split(' ') + current_segment = current_alignment.add_segment( + split_line[0:2], + split_line[2:4], + *split_line[4:] + ) + elif re.match(r'\d+', line): + split_line = line.split(' ') + if split_line[0] != '0': + current_segment.add_next(int(split_line[0])) + if current_alignment is not None: + yield current_alignment + + +def nucmer_calculate_alignments_list(file): + """ Function to create an list of triples as input for + anicalc.calculate_ani_single from an .delta file + """ + if file is not None: + output = list() + for alignment in read_delta(file): + output.append( + (len(alignment), alignment.get_errors(), alignment.query_len)) + return output + return None diff --git a/src/calculate_ani.py b/src/calculate_ani.py index f06af4e..4beacd1 100755 --- a/src/calculate_ani.py +++ b/src/calculate_ani.py @@ -11,15 +11,19 @@ from anicalc.anicalc import create_fragments, calculate_ani_with_tool def convert_raw_results_to_phybema(raw_results): num_sequences = len(raw_results) - print('# pairwise dist values for {} sequences'.format(num_sequences)) - for idx, r in enumerate(raw_results): - print('{}\t{}'.format(idx,r[0])) - for i in range(num_sequences-1): - for j in range(i+1,num_sequences): + output = list() + output.append('{}'.format(num_sequences)) + for i in range(num_sequences): + line = list() + line.append('{}'.format(raw_results[i][0])) + line.append('0.0') + for j in range(0,i): ani1 = raw_results[i][1][j] ani2 = raw_results[j][1][i] avg_ani = (ani1+ani2)/2.0 - print('dist {} {} {:.8f}'.format(i,j,1.0 - avg_ani)) + line.append('{:.8f}'.format(1.0 - avg_ani)) + output.append('\t'.join(line)) + print('\n'.join(output)) DEBUG = False diff --git a/src/dnadiff.sh b/src/dnadiff.sh new file mode 100755 index 0000000..7ac96d2 --- /dev/null +++ b/src/dnadiff.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +if test $# -ne 1 +then + echo "Usage: $0 <inputfile>" + exit 1 +fi + +inputfile=$1 +src/calculate_ani.py --phybema nucmer ${inputfile} diff --git a/testsuite/test_rep.sh b/testsuite/test_rep.sh index 2672d74..3b7ffcb 100755 --- a/testsuite/test_rep.sh +++ b/testsuite/test_rep.sh @@ -7,7 +7,7 @@ then fi subdir=$1 -src/phybema.py --tools mash andi -- testdata/${subdir}/ +src/phybema.py --tools mash andi dnadiff -- testdata/${subdir}/ for prog in andi mash do for suffix in mat nh diff --git a/testsuite/testset1_dnadiff.mat b/testsuite/testset1_dnadiff.mat new file mode 100644 index 0000000..0f65a96 --- /dev/null +++ b/testsuite/testset1_dnadiff.mat @@ -0,0 +1,5 @@ +4 +tgen_1 0.0 +tgen_2 0.0 0.00090939 +tgen_3 0.0 0.00083935 0.00051365 +tgen_4 0.0 0.00101731 0.00153718 0.00200906 diff --git a/testsuite/testset1_dnadiff.nh b/testsuite/testset1_dnadiff.nh new file mode 100644 index 0000000..a63da12 --- /dev/null +++ b/testsuite/testset1_dnadiff.nh @@ -0,0 +1 @@ +(tgen_2,((tgen_4:0.00064,tgen_1:-0.00064):0.00022,tgen_3:0.00055):0.00029); -- GitLab