Create Step3_Run_dada2_multipletimes

We run the dada2 pipeline with different trunclen to estimate best parameters

Create Step3_Run_dada2_multipletimes
0a4911ce · Raphael Koll · GitHub · 69000fd9 · 0a4911ce
Unverified Commit 0a4911ce authored 1 year ago by Raphael Koll Committed by GitHub 1 year ago
--- a/16S/Step3_Run_dada2_multipletimes
+++ b/16S/Step3_Run_dada2_multipletimes
+#Steps#
+    #Step1 Trim-galore for Illumina adapers and general quality 
+        #Step2 FiltN and 16SPrimers and create Quality Plots
+            #Step3 Run dada2 pipeline once for differnt combination of trunclen
+                #Step4 Merge the SeqTabs from different sequencing runs and add taxonomy
+#Step3 Run dada2 pipeline below once in full with different combinations of trunclen to estimate the best outcome
+# -bash-4.2$ sbatch dada2-pipeline-460-26.07.23.sh #Trunclen 460
+#   Samples      Input                       Output
+# "GCAU21BBEB1" 28486 27300 27145 27227 26833 26347
+# "GCAU21BBEB2" 20615 19723 19677 19703 19578 19276
+# "GCAU21BBEB3" 31033 29541 29094 29382 28326 27361
+# 276   288   318   331   335   346   351   360   364   369   371   373   374
+#   1     1     1     1     1     1     1     1     1     1     1     5     9
+# 375   376   381   382   384   385   386   387   388   389   390   391   392
+#  14    14     2     1     5     4     8     8     7     5     2     3     6
+# 393   394   395   396   397   398   399   400   401   402   403   404   405
+#   6     2     1    15    10    27    28     7    19   103   155  4238  2264
+# 406   407   408   409   410   411   412   413   414   415   416   417   418
+# 999   849   386  1405   124   129    51    48    10    26    76    24    68
+# 419   420   421   422   423   424   425   426   427   428   429   430   431
+# 564   135   536   265   482  4333   212   182   117  2062 28997  7981   254
+# 432   433   434   435   436   437   438   439   440   441   442   443   444
+#  17     1     1    12    29     3     9     6     7   159     2     4    20
+# 445   446   447   448
+#  49   955   253     9
+# -bash-4.2$ sbatch dada2-pipeline-470-26.07.23.sh #Trunclen 470
+#   Samples      Input                       Output
+# "GCAU21BBEB1" 28486 27222 27067 27158 26725 26258
+# "GCAU21BBEB2" 20615 19637 19591 19596 19470 19168
+# "GCAU21BBEB3" 31033 29387 28943 29213 28134 27168
+# 276   288   318   331   335   346   351   360   364   369   373   374   375
+#   1     1     1     1     1     1     1     1     1     1     5     9    14
+# 376   381   382   384   385   386   387   388   389   390   391   392   393
+#  14     2     1     5     5     8     8     7     5     2     3     6     6
+# 394   395   396   397   398   399   400   401   402   403   404   405   406
+#   2     1    16    10    27    28     7    20   104   157  4230  2244  1000
+# 407   408   409   410   411   412   413   414   415   416   417   418   419
+# 826   337  1314   115   131    49    44    10    25    76    21    64   432
+# 420   421   422   423   424   425   426   427   428   429   430   431   432
+# 140   449   230   458  3415   199   168   110  1801 25333  6771   236    14
+# 433   434   435   436   437   438   439   440   441   442   443   444   445
+#   2     1     1    19     3     9     2     6   111     1     3    20    38
+# 446   447   448   449   450   451   452   453   455   458
+# 743   139    10     7    12    19    58     3     1     1
+# -bash-4.2$ sbatch dada2-pipeline-480-26.07.23.sh #Trunclen 480
+#   Samples      Input                       Output
+# "GCAU21BBEB1" 28486 13843 13707 13781 13440 13293
+# "GCAU21BBEB2" 20615 9963 9922 9934 9832 9759
+# "GCAU21BBEB3" 31033 13050 12688 12934 12115 11848
+# 288   335   346   351   364   373   374   375   376   381   382   384   386
+#   1     1     1     1     1     4     6     9     7     2     1     4     4
+# 387   388   389   390   391   392   393   394   395   396   397   398   399
+#   4     3     2     2     3     6     5     2     1     9     6    20    22
+# 400   401   402   403   404   405   406   407   408   409   410   411   412
+#   8     5    37   109  2931  1741   731   589   175   723    76    73    42
+# 413   414   415   416   417   418   419   420   421   422   423   424   425
+#  30     7    23    60    16    48   264   111   302   187   335  2063   144
+# 426   427   428   429   430   431   432   433   434   435   436   437   438
+# 114    97  1321 13713  3505   173    13     2     1     1    16     3     9
+# 439   440   441   442   443   444   445   446   447   448   449   450   453
+#   1     4   105     8     3     7    34   604   118    29    10    18     1
+# 459   461   462   463   465   466   468
+#   6     1     3     1    14     3     1
+#!/bin/bash
+#SBATCH --job-name=dada2
+#SBATCH --nodes=1
+#SBATCH --tasks-per-node=16
+#SBATCH --partition=stl 
+#SBATCH --time=7-00:00:00
+#SBATCH --export=NONE
+#SBATCH --error=dada2-pipeline-470-02.08.23.txt
+#SBATCH --mail-user=raphael.koll@uni-hamburg.de
+#SBATCH --mail-type=ALL
+source /sw/batch/init.sh
+source /anaconda3/etc/profile.d/conda.sh
+conda activate MambaDada2
+R -q -f dada2-pipeline-470-02.08.23.R  > dada2-pipeline-470-02.08.23.out 2>&1
+################
+#dada2-pipeline#
+################
+path          <- "/trim_galore"
+pathOut       <- "/02.08.23"
+pathTrainsets <- "/16S/trainsets"
+Date          <- "02.08.23"
+Length        <- "470"
+library(dada2)
+library(ShortRead)
+library(ggplot2)
+head(list.files(path))
+path.cut <- file.path(path, "cutadapt")
+head(list.files(path.cut))
+fnFs <- sort(list.files(path.cut, pattern = "R1_001_val_1.fq", full.names = TRUE))
+fnRs <- sort(list.files(path.cut, pattern = "R2_001_val_2.fq", full.names = TRUE))
+sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
+head(sample.names)
+########
+#Filter#
+########
+filtFs <- file.path(path.cut, paste("filtered", Length, sep="_"), paste0(sample.names, "_F_filt.fastq"))
+filtRs <- file.path(path.cut, paste("filtered", Length, sep="_"), paste0(sample.names, "_R_filt.fastq"))
+names(filtFs) <- sample.names
+names(filtRs) <- sample.names
+out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs,
+              maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
+              compress=TRUE, multithread=TRUE, verbose=TRUE,truncLen=c(270,200)) #Amplicon + 20 biological.length.variation
+out
+saveRDS(out, file.path(pathOut,paste(Date, Length, "out.rds", sep="_")))
+filtFs <- filtFs[file.exists(filtFs)]
+filtRs <- filtRs[file.exists(filtRs)]
+###################
+#Learn Error Rates#
+###################
+errF <- learnErrors(filtFs, multithread=TRUE)
+saveRDS(errF, file.path(pathOut, paste(Date, Length,  "errF.rds", sep="_")))
+errR <- learnErrors(filtRs, multithread=TRUE)
+saveRDS(errR, file.path(pathOut, paste(Date, Length,"errR.rds", sep="_")))
+library(ggplot2)
+A<- plotErrors(errF, nominalQ=TRUE)
+ggsave(A, filename = paste(Date, Length, "errF.png", sep="_"),path= pathOut, device='png', dpi=300, width = 10,height = 10)
+B<- plotErrors(errR, nominalQ=TRUE)
+ggsave(B, filename = paste(Date, Length, "errR.png", sep="_"),path=pathOut, device='png', dpi=300, width = 10,height = 10)
+###############
+#Infering ASVs#
+###############
+errF<-readRDS(file.path(pathOut, paste(Date,Length,"errF.rds", sep="_")))
+errR<-readRDS(file.path(pathOut, paste(Date,Length,"errR.rds", sep="_")))
+#Inferring ASVs
+dadaFs <- dada(filtFs, err=errF, multithread=TRUE, pool=TRUE)
+saveRDS(dadaFs, file.path(pathOut, paste(Date,Length,"dadaFs.rds", sep="_")))
+dadaFs<-readRDS(file.path(pathOut, paste(Date,Length,"dadaFs.rds", sep="_")))
+dadaRs <- dada(filtRs, err=errR, multithread=TRUE, pool=TRUE)
+saveRDS(dadaRs, file.path(pathOut, paste(Date,Length,"dadaRs.rds", sep="_")))
+dadaRs<-readRDS(file.path(pathOut, paste(Date,Length,"dadaRs.rds", sep="_")))
+dadaFs[[1]]
+####################
+#merge paired reads#
+####################
+mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)
+saveRDS(mergers, file.path(pathOut, paste(Date,Length,"mergers.rds", sep="_")))
+# Inspect the merger data.frame from the first sample
+head(mergers[[1]])
+#Construct sequence table
+seqtab <- makeSequenceTable(mergers)
+dim(seqtab)
+saveRDS(seqtab, file.path(pathOut, paste(Date,Length,"seqtab.rds", sep="_")))
+seqtab<-readRDS(file.path(pathOut, paste(Date,Length,"seqtab.rds", sep="_")))
+#Inspect distribution of sequence lengths
+table(nchar(getSequences(seqtab)))
+#################
+#Remove chimeras#
+#################
+seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
+dim(seqtab.nochim) #Check dimensions to see if the a good number of samples have made it into the table with a good number of reads
+sum(seqtab.nochim)/sum(seqtab) #Check the proportion of reads that made it past the chimera check
+saveRDS(seqtab.nochim, file.path(pathOut, paste(Date,Length,"seqtab.nochim.rds", sep="_")))
+seqtab.nochim<-readRDS(file.path(pathOut, paste(Date,Length,"seqtab.nochim.rds", sep="_")))
+##################################
+#Track reads through the pipeline#
+##################################
+dadaFs  <-readRDS(file.path(pathOut, paste(Date, Length, "dadaFs.rds", sep="_")))
+dadaRs  <-readRDS(file.path(pathOut, paste(Date, Length,"dadaRs.rds", sep="_")))
+out     <-readRDS(file.path(pathOut, paste(Date, Length,"out.rds", sep="_")))
+mergers <-readRDS(file.path(pathOut, paste(Date, Length,"mergers.rds", sep="_")))
+getN <- function(x) sum(getUniques(x)) #See how many unique sequences there are? Very tired, will check later.
+track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim)) #As it says on the box
+# If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
+colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim") #Set column names
+rownames(track) <- sample.names #Label rownames for each sample
+head(track) #Check to see the proportion of samples that made it through the whole process.
+write.table(track, file.path(pathOut, paste(Date, Length,"Track_reads_through_pipeline.txt", sep="_")))
+ReadNum<- read.table(file.path(pathOut, paste(Date, Length,"Track_reads_through_pipeline.txt", sep="_")))
+write.csv2(track, file.path(pathOut, paste(Date, Length,"Track_reads_through_pipeline.csv", sep="_")), row.names = T, sep=";")
+#################
+#Assign Taxonomy#
+#################
+taxa_wSpecies <- assignTaxonomy(seqtab.nochim, file.path(pathTrainsets, "silva_nr99_v138.1_wSpecies_train_set.fa.gz"), multithread=TRUE)
+#In this step species level is already included <- but only for long-reads reliable!
+# Extensions: The dada2 package also implements a method to make species level assignments based on exact matching between ASVs and sequenced reference strains. Recent analysis suggests that exact matching (or 100% identity) is the only appropriate way to assign species to 16S gene fragments. Currently, species-assignment training fastas are available for the Silva and RDP 16S databases. To follow the optional species addition step, download the silva_species_assignment_v132.fa.gz file, and place it in the directory with the fastq files.
+#https://benjjneb.github.io/dada2/assign.html#species-assignment
+# Fast and appropriate species-level assignment from 16S data is provided by the assignSpecies method. assignSpecies uses exact string matching against a reference database to assign Genus species binomials. In short, query sequence are compared against all reference sequences that had binomial genus-species nomenclature assigned, and the genus-species of all exact matches are recorded and returned if it is unambiguous. Recent results indicate that exact matching (or 100% identity) with amplicon sequence variants (ASVs) is the only appropriate method for species-level assignment to high-throughput 16S amplicon data.
+taxa <- assignTaxonomy(seqtab.nochim, file.path(pathTrainsets, "silva_nr99_v138.1_train_set.fa.gz"), multithread=TRUE)
+taxa <- addSpecies(taxa, file.path(pathTrainsets, "silva_species_assignment_v138.1.fa.gz"))
+#When you want to do this it is nessessary to in the first step NOT TAKE THE _wSpecies_train_set.fa.gz otherwise we have Species twice in the data
+taxa.print <- taxa # Removing sequence rownames for display only
+rownames(taxa.print) <- NULL #Superflous information, get rid
+taxa.print # See if it looks good
+saveRDS(taxa, file.path(pathOut, paste(Date,Length, "Taxa_Species.rds", sep="_")))
+saveRDS(taxa_wSpecies, file.path(pathOut, paste(Date,Length, "Taxa_wSpecies.rds", sep="_")))