Skip to content
Snippets Groups Projects
Unverified Commit 0a4911ce authored by Raphael Koll's avatar Raphael Koll Committed by GitHub
Browse files

Create Step3_Run_dada2_multipletimes

We run the dada2 pipeline with different trunclen to estimate best parameters
parent 69000fd9
Branches
Tags
No related merge requests found
#Steps#
#Step1 Trim-galore for Illumina adapers and general quality
#Step2 FiltN and 16SPrimers and create Quality Plots
#Step3 Run dada2 pipeline once for differnt combination of trunclen
#Step4 Merge the SeqTabs from different sequencing runs and add taxonomy
#Step3 Run dada2 pipeline below once in full with different combinations of trunclen to estimate the best outcome
# -bash-4.2$ sbatch dada2-pipeline-460-26.07.23.sh #Trunclen 460
# Samples Input Output
# "GCAU21BBEB1" 28486 27300 27145 27227 26833 26347
# "GCAU21BBEB2" 20615 19723 19677 19703 19578 19276
# "GCAU21BBEB3" 31033 29541 29094 29382 28326 27361
# 276 288 318 331 335 346 351 360 364 369 371 373 374
# 1 1 1 1 1 1 1 1 1 1 1 5 9
# 375 376 381 382 384 385 386 387 388 389 390 391 392
# 14 14 2 1 5 4 8 8 7 5 2 3 6
# 393 394 395 396 397 398 399 400 401 402 403 404 405
# 6 2 1 15 10 27 28 7 19 103 155 4238 2264
# 406 407 408 409 410 411 412 413 414 415 416 417 418
# 999 849 386 1405 124 129 51 48 10 26 76 24 68
# 419 420 421 422 423 424 425 426 427 428 429 430 431
# 564 135 536 265 482 4333 212 182 117 2062 28997 7981 254
# 432 433 434 435 436 437 438 439 440 441 442 443 444
# 17 1 1 12 29 3 9 6 7 159 2 4 20
# 445 446 447 448
# 49 955 253 9
# -bash-4.2$ sbatch dada2-pipeline-470-26.07.23.sh #Trunclen 470
# Samples Input Output
# "GCAU21BBEB1" 28486 27222 27067 27158 26725 26258
# "GCAU21BBEB2" 20615 19637 19591 19596 19470 19168
# "GCAU21BBEB3" 31033 29387 28943 29213 28134 27168
# 276 288 318 331 335 346 351 360 364 369 373 374 375
# 1 1 1 1 1 1 1 1 1 1 5 9 14
# 376 381 382 384 385 386 387 388 389 390 391 392 393
# 14 2 1 5 5 8 8 7 5 2 3 6 6
# 394 395 396 397 398 399 400 401 402 403 404 405 406
# 2 1 16 10 27 28 7 20 104 157 4230 2244 1000
# 407 408 409 410 411 412 413 414 415 416 417 418 419
# 826 337 1314 115 131 49 44 10 25 76 21 64 432
# 420 421 422 423 424 425 426 427 428 429 430 431 432
# 140 449 230 458 3415 199 168 110 1801 25333 6771 236 14
# 433 434 435 436 437 438 439 440 441 442 443 444 445
# 2 1 1 19 3 9 2 6 111 1 3 20 38
# 446 447 448 449 450 451 452 453 455 458
# 743 139 10 7 12 19 58 3 1 1
# -bash-4.2$ sbatch dada2-pipeline-480-26.07.23.sh #Trunclen 480
# Samples Input Output
# "GCAU21BBEB1" 28486 13843 13707 13781 13440 13293
# "GCAU21BBEB2" 20615 9963 9922 9934 9832 9759
# "GCAU21BBEB3" 31033 13050 12688 12934 12115 11848
# 288 335 346 351 364 373 374 375 376 381 382 384 386
# 1 1 1 1 1 4 6 9 7 2 1 4 4
# 387 388 389 390 391 392 393 394 395 396 397 398 399
# 4 3 2 2 3 6 5 2 1 9 6 20 22
# 400 401 402 403 404 405 406 407 408 409 410 411 412
# 8 5 37 109 2931 1741 731 589 175 723 76 73 42
# 413 414 415 416 417 418 419 420 421 422 423 424 425
# 30 7 23 60 16 48 264 111 302 187 335 2063 144
# 426 427 428 429 430 431 432 433 434 435 436 437 438
# 114 97 1321 13713 3505 173 13 2 1 1 16 3 9
# 439 440 441 442 443 444 445 446 447 448 449 450 453
# 1 4 105 8 3 7 34 604 118 29 10 18 1
# 459 461 462 463 465 466 468
# 6 1 3 1 14 3 1
#!/bin/bash
#SBATCH --job-name=dada2
#SBATCH --nodes=1
#SBATCH --tasks-per-node=16
#SBATCH --partition=stl
#SBATCH --time=7-00:00:00
#SBATCH --export=NONE
#SBATCH --error=dada2-pipeline-470-02.08.23.txt
#SBATCH --mail-user=raphael.koll@uni-hamburg.de
#SBATCH --mail-type=ALL
source /sw/batch/init.sh
source /anaconda3/etc/profile.d/conda.sh
conda activate MambaDada2
R -q -f dada2-pipeline-470-02.08.23.R > dada2-pipeline-470-02.08.23.out 2>&1
################
#dada2-pipeline#
################
path <- "/trim_galore"
pathOut <- "/02.08.23"
pathTrainsets <- "/16S/trainsets"
Date <- "02.08.23"
Length <- "470"
library(dada2)
library(ShortRead)
library(ggplot2)
head(list.files(path))
path.cut <- file.path(path, "cutadapt")
head(list.files(path.cut))
fnFs <- sort(list.files(path.cut, pattern = "R1_001_val_1.fq", full.names = TRUE))
fnRs <- sort(list.files(path.cut, pattern = "R2_001_val_2.fq", full.names = TRUE))
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
head(sample.names)
########
#Filter#
########
filtFs <- file.path(path.cut, paste("filtered", Length, sep="_"), paste0(sample.names, "_F_filt.fastq"))
filtRs <- file.path(path.cut, paste("filtered", Length, sep="_"), paste0(sample.names, "_R_filt.fastq"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs,
maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
compress=TRUE, multithread=TRUE, verbose=TRUE,truncLen=c(270,200)) #Amplicon + 20 biological.length.variation
out
saveRDS(out, file.path(pathOut,paste(Date, Length, "out.rds", sep="_")))
filtFs <- filtFs[file.exists(filtFs)]
filtRs <- filtRs[file.exists(filtRs)]
###################
#Learn Error Rates#
###################
errF <- learnErrors(filtFs, multithread=TRUE)
saveRDS(errF, file.path(pathOut, paste(Date, Length, "errF.rds", sep="_")))
errR <- learnErrors(filtRs, multithread=TRUE)
saveRDS(errR, file.path(pathOut, paste(Date, Length,"errR.rds", sep="_")))
library(ggplot2)
A<- plotErrors(errF, nominalQ=TRUE)
ggsave(A, filename = paste(Date, Length, "errF.png", sep="_"),path= pathOut, device='png', dpi=300, width = 10,height = 10)
B<- plotErrors(errR, nominalQ=TRUE)
ggsave(B, filename = paste(Date, Length, "errR.png", sep="_"),path=pathOut, device='png', dpi=300, width = 10,height = 10)
###############
#Infering ASVs#
###############
errF<-readRDS(file.path(pathOut, paste(Date,Length,"errF.rds", sep="_")))
errR<-readRDS(file.path(pathOut, paste(Date,Length,"errR.rds", sep="_")))
#Inferring ASVs
dadaFs <- dada(filtFs, err=errF, multithread=TRUE, pool=TRUE)
saveRDS(dadaFs, file.path(pathOut, paste(Date,Length,"dadaFs.rds", sep="_")))
dadaFs<-readRDS(file.path(pathOut, paste(Date,Length,"dadaFs.rds", sep="_")))
dadaRs <- dada(filtRs, err=errR, multithread=TRUE, pool=TRUE)
saveRDS(dadaRs, file.path(pathOut, paste(Date,Length,"dadaRs.rds", sep="_")))
dadaRs<-readRDS(file.path(pathOut, paste(Date,Length,"dadaRs.rds", sep="_")))
dadaFs[[1]]
####################
#merge paired reads#
####################
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)
saveRDS(mergers, file.path(pathOut, paste(Date,Length,"mergers.rds", sep="_")))
# Inspect the merger data.frame from the first sample
head(mergers[[1]])
#Construct sequence table
seqtab <- makeSequenceTable(mergers)
dim(seqtab)
saveRDS(seqtab, file.path(pathOut, paste(Date,Length,"seqtab.rds", sep="_")))
seqtab<-readRDS(file.path(pathOut, paste(Date,Length,"seqtab.rds", sep="_")))
#Inspect distribution of sequence lengths
table(nchar(getSequences(seqtab)))
#################
#Remove chimeras#
#################
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
dim(seqtab.nochim) #Check dimensions to see if the a good number of samples have made it into the table with a good number of reads
sum(seqtab.nochim)/sum(seqtab) #Check the proportion of reads that made it past the chimera check
saveRDS(seqtab.nochim, file.path(pathOut, paste(Date,Length,"seqtab.nochim.rds", sep="_")))
seqtab.nochim<-readRDS(file.path(pathOut, paste(Date,Length,"seqtab.nochim.rds", sep="_")))
##################################
#Track reads through the pipeline#
##################################
dadaFs <-readRDS(file.path(pathOut, paste(Date, Length, "dadaFs.rds", sep="_")))
dadaRs <-readRDS(file.path(pathOut, paste(Date, Length,"dadaRs.rds", sep="_")))
out <-readRDS(file.path(pathOut, paste(Date, Length,"out.rds", sep="_")))
mergers <-readRDS(file.path(pathOut, paste(Date, Length,"mergers.rds", sep="_")))
getN <- function(x) sum(getUniques(x)) #See how many unique sequences there are? Very tired, will check later.
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim)) #As it says on the box
# If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim") #Set column names
rownames(track) <- sample.names #Label rownames for each sample
head(track) #Check to see the proportion of samples that made it through the whole process.
write.table(track, file.path(pathOut, paste(Date, Length,"Track_reads_through_pipeline.txt", sep="_")))
ReadNum<- read.table(file.path(pathOut, paste(Date, Length,"Track_reads_through_pipeline.txt", sep="_")))
write.csv2(track, file.path(pathOut, paste(Date, Length,"Track_reads_through_pipeline.csv", sep="_")), row.names = T, sep=";")
#################
#Assign Taxonomy#
#################
taxa_wSpecies <- assignTaxonomy(seqtab.nochim, file.path(pathTrainsets, "silva_nr99_v138.1_wSpecies_train_set.fa.gz"), multithread=TRUE)
#In this step species level is already included <- but only for long-reads reliable!
# Extensions: The dada2 package also implements a method to make species level assignments based on exact matching between ASVs and sequenced reference strains. Recent analysis suggests that exact matching (or 100% identity) is the only appropriate way to assign species to 16S gene fragments. Currently, species-assignment training fastas are available for the Silva and RDP 16S databases. To follow the optional species addition step, download the silva_species_assignment_v132.fa.gz file, and place it in the directory with the fastq files.
#https://benjjneb.github.io/dada2/assign.html#species-assignment
# Fast and appropriate species-level assignment from 16S data is provided by the assignSpecies method. assignSpecies uses exact string matching against a reference database to assign Genus species binomials. In short, query sequence are compared against all reference sequences that had binomial genus-species nomenclature assigned, and the genus-species of all exact matches are recorded and returned if it is unambiguous. Recent results indicate that exact matching (or 100% identity) with amplicon sequence variants (ASVs) is the only appropriate method for species-level assignment to high-throughput 16S amplicon data.
taxa <- assignTaxonomy(seqtab.nochim, file.path(pathTrainsets, "silva_nr99_v138.1_train_set.fa.gz"), multithread=TRUE)
taxa <- addSpecies(taxa, file.path(pathTrainsets, "silva_species_assignment_v138.1.fa.gz"))
#When you want to do this it is nessessary to in the first step NOT TAKE THE _wSpecies_train_set.fa.gz otherwise we have Species twice in the data
taxa.print <- taxa # Removing sequence rownames for display only
rownames(taxa.print) <- NULL #Superflous information, get rid
taxa.print # See if it looks good
saveRDS(taxa, file.path(pathOut, paste(Date,Length, "Taxa_Species.rds", sep="_")))
saveRDS(taxa_wSpecies, file.path(pathOut, paste(Date,Length, "Taxa_wSpecies.rds", sep="_")))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment