From 0a4911ceab049714ffca828eb3d9aee683cf23ab Mon Sep 17 00:00:00 2001
From: Raphael Koll <82929865+vollkorrn@users.noreply.github.com>
Date: Wed, 3 Jan 2024 21:35:22 -0600
Subject: [PATCH] Create Step3_Run_dada2_multipletimes

We run the dada2 pipeline with different truncLen values to estimate the best parameters

---
 16S/Step3_Run_dada2_multipletimes | 221 ++++++++++++++++++++++++++++++
 1 file changed, 221 insertions(+)
 create mode 100644 16S/Step3_Run_dada2_multipletimes

diff --git a/16S/Step3_Run_dada2_multipletimes b/16S/Step3_Run_dada2_multipletimes
new file mode 100644
index 0000000..9c83a2b
--- /dev/null
+++ b/16S/Step3_Run_dada2_multipletimes
@@ -0,0 +1,221 @@
#Steps#
    #Step1 Trim Galore for Illumina adapters and general quality
    #Step2 Filter Ns, remove 16S primers, and create quality plots
    #Step3 Run the dada2 pipeline once per combination of truncLen
    #Step4 Merge the seqtabs from the different sequencing runs and add taxonomy


#Step3 Run the dada2 pipeline below once in full per combination of truncLen to estimate which gives the best outcome

# -bash-4.2$ sbatch dada2-pipeline-460-26.07.23.sh  #truncLen 460
# Samples        input filtered denoisedF denoisedR merged nonchim
# "GCAU21BBEB1"  28486    27300     27145     27227  26833   26347
# "GCAU21BBEB2"  20615    19723     19677     19703  19578   19276
# "GCAU21BBEB3"  31033    29541     29094     29382  28326   27361
# Sequence-length distribution, from table(nchar(getSequences(seqtab))):
# 276 288 318 331 335 346 351 360 364 369 371 373 374
#   1   1   1   1   1   1   1   1   1   1   1   5   9
# 375 376 381 382 384 385 386 387 388 389 390 391 392
#  14  14   2   1   5   4   8   8   7   5   2   3   6
# 393 394 395 396 397 398 399 400 401 402 403 404 405
#   6   2   1  15  10  27  28   7  19 103 155 4238 2264
# 406 407 408 409 410 411 412 413 414 415 416 417 418
# 999 849 386 1405 124 129  51  48  10  26  76  24  68
# 419 420 421 422 423 424 425 426 427 428 429 430 431
# 564 135 536 265 482 4333 212 182 117 2062 28997 7981 254
# 432 433 434 435 436 437 438 439 440 441 442 443 444
#  17   1   1  12  29   3   9   6   7 159   2   4  20
# 445 446 447 448
#  49 955 253   9

# -bash-4.2$ sbatch dada2-pipeline-470-26.07.23.sh  #truncLen 470
# Samples        input filtered denoisedF denoisedR merged nonchim
# "GCAU21BBEB1"  28486    27222     27067     27158  26725   26258
# "GCAU21BBEB2"  20615    19637     19591     19596  19470   19168
# "GCAU21BBEB3"  31033    29387     28943     29213  28134   27168
# Sequence-length distribution:
# 276 288 318 331 335 346 351 360 364 369 373 374 375
#   1   1   1   1   1   1   1   1   1   1   5   9  14
# 376 381 382 384 385 386 387 388 389 390 391 392 393
#  14   2   1   5   5   8   8   7   5   2   3   6   6
# 394 395 396 397 398 399 400 401 402 403 404 405 406
#   2   1  16  10  27  28   7  20 104 157 4230 2244 1000
# 407 408 409 410 411 412 413 414 415 416 417 418 419
# 826 337 1314 115 131  49  44  10  25  76  21  64 432
# 420 421 422 423 424 425 426 427 428 429 430 431 432
# 140 449 230 458 3415 199 168 110 1801 25333 6771 236  14
# 433 434 435 436 437 438 439 440 441 442 443 444 445
#   2   1   1  19   3   9   2   6 111   1   3  20  38
# 446 447 448 449 450 451 452 453 455 458
# 743 139  10   7  12  19  58   3   1   1

# -bash-4.2$ sbatch dada2-pipeline-480-26.07.23.sh  #truncLen 480
# Samples        input filtered denoisedF denoisedR merged nonchim
# "GCAU21BBEB1"  28486    13843     13707     13781  13440   13293
# "GCAU21BBEB2"  20615     9963      9922      9934   9832    9759
# "GCAU21BBEB3"  31033    13050     12688     12934  12115   11848
# Sequence-length distribution:
# 288 335 346 351 364 373 374 375 376 381 382 384 386
#   1   1   1   1   1   4   6   9   7   2   1   4   4
# 387 388 389 390 391 392 393 394 395 396 397 398 399
#   4   3   2   2   3   6   5   2   1   9   6  20  22
# 400 401 402 403 404 405 406 407 408 409 410 411 412
#   8   5  37 109 2931 1741 731 589 175 723  76  73  42
# 413 414 415 416 417 418 419 420 421 422 423 424 425
#  30   7  23  60  16  48 264 111 302 187 335 2063 144
# 426 427 428 429 430 431 432 433 434 435 436 437 438
# 114  97 1321 13713 3505 173  13   2   1   1  16   3   9
# 439 440 441 442 443 444 445 446 447 448 449 450 453
#   1   4 105   8   3   7  34 604 118  29  10  18   1
# 459 461 462 463 465 466 468
#   6   1   3   1  14   3   1
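
# The runs above can also be compared programmatically. The sketch below is an
# illustration and not part of the original pipeline: it assumes each run wrote
# its "<Date>_<Length>_Track_reads_through_pipeline.txt" file (see the tracking
# step further down) into one output directory; compareOut and runDate are
# hypothetical and must be adjusted to your own runs.
compareOut <- "/02.08.23"   # assumed output directory (pathOut of the runs)
runDate    <- "26.07.23"    # the Date used when the runs were submitted
lengths    <- c("460", "470", "480")
tracks <- lapply(lengths, function(len)
  read.table(file.path(compareOut, paste(runDate, len, "Track_reads_through_pipeline.txt", sep = "_"))))
names(tracks) <- lengths
# Fraction of input reads surviving to the non-chimeric table, per truncLen
sapply(tracks, function(tr) sum(tr$nonchim) / sum(tr$input))
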

#!/bin/bash
#SBATCH --job-name=dada2
#SBATCH --nodes=1
#SBATCH --tasks-per-node=16
#SBATCH --partition=stl
#SBATCH --time=7-00:00:00
#SBATCH --export=NONE
#SBATCH --error=dada2-pipeline-470-02.08.23.txt
#SBATCH --mail-user=raphael.koll@uni-hamburg.de
#SBATCH --mail-type=ALL
source /sw/batch/init.sh
source /anaconda3/etc/profile.d/conda.sh
conda activate MambaDada2
R -q -f dada2-pipeline-470-02.08.23.R > dada2-pipeline-470-02.08.23.out 2>&1

################
#dada2-pipeline#
################
path <- "/trim_galore"
pathOut <- "/02.08.23"
pathTrainsets <- "/16S/trainsets"
Date <- "02.08.23"
Length <- "470"

library(dada2)
library(ShortRead)
library(ggplot2)

head(list.files(path))
path.cut <- file.path(path, "cutadapt")
head(list.files(path.cut))
fnFs <- sort(list.files(path.cut, pattern = "R1_001_val_1.fq", full.names = TRUE))
fnRs <- sort(list.files(path.cut, pattern = "R2_001_val_2.fq", full.names = TRUE))
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
head(sample.names)

########
#Filter#
########
filtFs <- file.path(path.cut, paste("filtered", Length, sep="_"), paste0(sample.names, "_F_filt.fastq"))
filtRs <- file.path(path.cut, paste("filtered", Length, sep="_"), paste0(sample.names, "_R_filt.fastq"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names

out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs,
                     maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
                     compress=TRUE, multithread=TRUE, verbose=TRUE,
                     truncLen=c(270,200)) #270 + 200 = 470 nt (Length): amplicon length plus ~20 nt of biological length variation
out

saveRDS(out, file.path(pathOut, paste(Date, Length, "out.rds", sep="_")))

filtFs <- filtFs[file.exists(filtFs)]
filtRs <- filtRs[file.exists(filtRs)]

###################
#Learn Error Rates#
###################
errF <- learnErrors(filtFs, multithread=TRUE)
saveRDS(errF, file.path(pathOut, paste(Date, Length, "errF.rds", sep="_")))
errR <- learnErrors(filtRs, multithread=TRUE)
saveRDS(errR, file.path(pathOut, paste(Date, Length, "errR.rds", sep="_")))

A <- plotErrors(errF, nominalQ=TRUE)
ggsave(A, filename = paste(Date, Length, "errF.png", sep="_"), path = pathOut, device='png', dpi=300, width = 10, height = 10)
B <- plotErrors(errR, nominalQ=TRUE)
ggsave(B, filename = paste(Date, Length, "errR.png", sep="_"), path = pathOut, device='png', dpi=300, width = 10, height = 10)

################
#Inferring ASVs#
################
errF <- readRDS(file.path(pathOut, paste(Date, Length, "errF.rds", sep="_")))
errR <- readRDS(file.path(pathOut, paste(Date, Length, "errR.rds", sep="_")))

dadaFs <- dada(filtFs, err=errF, multithread=TRUE, pool=TRUE)
saveRDS(dadaFs, file.path(pathOut, paste(Date, Length, "dadaFs.rds", sep="_"))) #Checkpoint: save, then reload from disk
dadaFs <- readRDS(file.path(pathOut, paste(Date, Length, "dadaFs.rds", sep="_")))

dadaRs <- dada(filtRs, err=errR, multithread=TRUE, pool=TRUE)
saveRDS(dadaRs, file.path(pathOut, paste(Date, Length, "dadaRs.rds", sep="_")))
dadaRs <- readRDS(file.path(pathOut, paste(Date, Length, "dadaRs.rds", sep="_")))

dadaFs[[1]]

####################
#Merge paired reads#
####################
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)
saveRDS(mergers, file.path(pathOut, paste(Date, Length, "mergers.rds", sep="_")))
#Inspect the merger data.frame from the first sample
head(mergers[[1]])
#Construct the sequence table
seqtab <- makeSequenceTable(mergers)
dim(seqtab)
saveRDS(seqtab, file.path(pathOut, paste(Date, Length, "seqtab.rds", sep="_")))
seqtab <- readRDS(file.path(pathOut, paste(Date, Length, "seqtab.rds", sep="_")))
#Inspect the distribution of sequence lengths
table(nchar(getSequences(seqtab)))
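
# Optional, and an assumption rather than part of the original pipeline: the
# length table above typically shows a few sequences far outside the expected
# amplicon range, and those can be dropped before chimera removal. The 400:450
# window and the name seqtab.len are hypothetical, based on the distributions
# of the test runs above; adjust them to your own data, and substitute
# seqtab.len for seqtab downstream if you use this.
seqtab.len <- seqtab[, nchar(getSequences(seqtab)) %in% 400:450]
dim(seqtab.len)
table(nchar(getSequences(seqtab.len)))
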
#################
#Remove chimeras#
#################
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
dim(seqtab.nochim) #Check the dimensions to see whether a good number of samples made it into the table with a good number of ASVs

sum(seqtab.nochim)/sum(seqtab) #Check the proportion of reads that made it past the chimera check

saveRDS(seqtab.nochim, file.path(pathOut, paste(Date, Length, "seqtab.nochim.rds", sep="_")))
seqtab.nochim <- readRDS(file.path(pathOut, paste(Date, Length, "seqtab.nochim.rds", sep="_")))

##################################
#Track reads through the pipeline#
##################################
dadaFs  <- readRDS(file.path(pathOut, paste(Date, Length, "dadaFs.rds", sep="_")))
dadaRs  <- readRDS(file.path(pathOut, paste(Date, Length, "dadaRs.rds", sep="_")))
out     <- readRDS(file.path(pathOut, paste(Date, Length, "out.rds", sep="_")))
mergers <- readRDS(file.path(pathOut, paste(Date, Length, "mergers.rds", sep="_")))

getN <- function(x) sum(getUniques(x)) #Sum of the read counts in a dada-class or merger object
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim)) #One row per sample, one column per pipeline step
#If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
head(track) #Check which proportion of reads made it through each step
write.table(track, file.path(pathOut, paste(Date, Length, "Track_reads_through_pipeline.txt", sep="_")))
ReadNum <- read.table(file.path(pathOut, paste(Date, Length, "Track_reads_through_pipeline.txt", sep="_"))) #Re-read to verify the table was written
write.csv2(track, file.path(pathOut, paste(Date, Length, "Track_reads_through_pipeline.csv", sep="_")), row.names = TRUE) #write.csv2 always uses ";" as separator; passing sep= is ignored with a warning
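
# A small convenience sketch (not in the original script): express the counts
# in track as percentages of each sample's input reads, which makes the
# truncLen runs easier to compare at a glance. track.pct is a hypothetical name.
track.pct <- round(track / track[, "input"] * 100, 1)
head(track.pct)
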

#################
#Assign Taxonomy#
#################
taxa_wSpecies <- assignTaxonomy(seqtab.nochim, file.path(pathTrainsets, "silva_nr99_v138.1_wSpecies_train_set.fa.gz"), multithread=TRUE)

#Species level is already included in this training set, but it is only reliable for long reads!
#From the dada2 documentation (https://benjjneb.github.io/dada2/assign.html#species-assignment):
#dada2 also implements a method to make species-level assignments based on exact matching between ASVs and sequenced reference strains. Recent analysis suggests that exact matching (or 100% identity) is the only appropriate way to assign species to 16S gene fragments; species-assignment training fastas are available for the Silva and RDP 16S databases.
#Fast and appropriate species-level assignment from 16S data is provided by the assignSpecies method, which uses exact string matching against a reference database to assign Genus-species binomials: query sequences are compared against all reference sequences with binomial genus-species nomenclature, and the genus-species of all exact matches is recorded and returned if it is unambiguous.

taxa <- assignTaxonomy(seqtab.nochim, file.path(pathTrainsets, "silva_nr99_v138.1_train_set.fa.gz"), multithread=TRUE)
taxa <- addSpecies(taxa, file.path(pathTrainsets, "silva_species_assignment_v138.1.fa.gz"))

#When using addSpecies, it is necessary NOT to use the _wSpecies_train_set.fa.gz in the first step, otherwise Species appears twice in the data

taxa.print <- taxa #Copy for display only
rownames(taxa.print) <- NULL #Drop the long sequence rownames for readability
taxa.print #Sanity-check the assignments

saveRDS(taxa, file.path(pathOut, paste(Date, Length, "Taxa_Species.rds", sep="_")))
saveRDS(taxa_wSpecies, file.path(pathOut, paste(Date, Length, "Taxa_wSpecies.rds", sep="_")))
-- 
GitLab