diff --git a/16S/Step4_merge_sequencing_runs_add_taxonomy b/16S/Step4_merge_sequencing_runs_add_taxonomy new file mode 100644 index 0000000000000000000000000000000000000000..5410b53b93bb5c343801b0c42fae67cc971803e8 --- /dev/null +++ b/16S/Step4_merge_sequencing_runs_add_taxonomy @@ -0,0 +1,81 @@ +#Steps# + #Step1 Trim-galore for Illumina adapers and general quality + #Step2 FiltN and 16SPrimers and create Quality Plots + #Step3 Run dada2 pipeline once for differnt combination of trunclen + #Step4 Merge the SeqTabs from different sequencing runs and add taxonomy + +######################################################## +#Step4 Merge the SeqTabs from different sequencing runs# +######################################################## + +#!/bin/bash +#SBATCH --job-name=dada2-merge +#SBATCH --nodes=1 +#SBATCH --tasks-per-node=16 +#SBATCH --partition=big +#SBATCH --time=7-00:00:00 +#SBATCH --export=NONE +#SBATCH --error=dada2-merge-Run-02.08.23.txt +#SBATCH --mail-user=raphael.koll@uni-hamburg.de +#SBATCH --mail-type=ALL +source /sw/batch/init.sh +source /anaconda3/etc/profile.d/conda.sh +conda activate MambaDada2 +R -q -f dada2-merge-Run-02.08.23.R > dada2-merge-Run-02.08.23.out 2>&1 + + + ################# + #Create R script# + ################# + #Steps done before: + #Cutadapt cut our Primer sequences used from Kiel + #Trim-Galore cut out Illumina adapters (in about 30% of the reads) + +path <- "/MergeBatches" +pathOut <- "/MergeBatches" +pathTrainsets <-"/16S/trainsets" +Date <- "02.08.23_MergeRun" + +library(dada2) + +#https://benjjneb.github.io/dada2/bigdata_paired.html +#https://github.com/benjjneb/dada2/issues/1406 +st1 <-readRDS(file.path(pathOut, "Batch1_Run1_02.08.23_470_seqtab.rds")) +st2 <-readRDS(file.path(pathOut, "Batch1_Run1_reseq_02.08.23_470_seqtab.rds")) +st3 <-readRDS(file.path(pathOut, "Batch1_Run2_02.08.23_470_seqtab.rds")) +st4 <-readRDS(file.path(pathOut, "Batch2_Run1_02.08.23_470_seqtab.rds")) + +rownames(st3) <- paste(rownames(st2), "Batch1_Run2", sep="_") +rownames(st4) <- paste(rownames(st2), "Batch2_Run1", sep="_") + +st.all <- mergeSequenceTables(st1, st2, st3, st4, repeats = "sum") + +################# +#Remove chimeras# +################# +seqtab.nochim <- removeBimeraDenovo(st.all, method="consensus", multithread=TRUE, verbose=TRUE) +dim(seqtab.nochim) #Check dimensions to see if the a good number of samples have made it into the table with a good number of reads +sum(seqtab.nochim)/sum(st.all) #Check the proportion of reads that made it past the chimera check + +saveRDS(seqtab.nochim, file.path(pathOut, paste(Date, "seqtab.nochim.rds", sep=""))) + +################# +#Assign Taxonomy# +################# +taxa <- assignTaxonomy(seqtab.nochim, file.path(pathTrainsets, "silva_nr99_v138.1_train_set.fa.gz"), multithread=TRUE) +taxa <- addSpecies(taxa, file.path(pathTrainsets, "silva_species_assignment_v138.1.fa.gz")) +saveRDS(taxa, file.path(pathOut, paste(Date, "merge-Taxa_Species.rds", sep=""))) +#When you want to do this it is nessessary to in the first step NOT TAKE THE _wSpecies_train_set.fa.gz otherwise we have Species twice in the data + +taxa.print <- taxa # Removing sequence rownames for display only +rownames(taxa.print) <- NULL #Superflous information, get rid +head(taxa.print) # See if it looks good + +taxa_wSpecies <- assignTaxonomy(seqtab.nochim, file.path(pathTrainsets, "silva_nr99_v138.1_wSpecies_train_set.fa.gz"), multithread=TRUE) + +#In this step species level is already included <- but only for long-reads reliable! +# Extensions: The dada2 package also implements a method to make species level assignments based on exact matching between ASVs and sequenced reference strains. Recent analysis suggests that exact matching (or 100% identity) is the only appropriate way to assign species to 16S gene fragments. Currently, species-assignment training fastas are available for the Silva and RDP 16S databases. To follow the optional species addition step, download the silva_species_assignment_v132.fa.gz file, and place it in the directory with the fastq files. +#https://benjjneb.github.io/dada2/assign.html#species-assignment +# Fast and appropriate species-level assignment from 16S data is provided by the assignSpecies method. assignSpecies uses exact string matching against a reference database to assign Genus species binomials. In short, query sequence are compared against all reference sequences that had binomial genus-species nomenclature assigned, and the genus-species of all exact matches are recorded and returned if it is unambiguous. Recent results indicate that exact matching (or 100% identity) with amplicon sequence variants (ASVs) is the only appropriate method for species-level assignment to high-throughput 16S amplicon data. + +saveRDS(taxa_wSpecies, file.path(pathOut, paste(Date, "merge-Taxa_wSpecies.rds", sep="")))