Update and rename RNAseq to 16S/trimming_adapterfilter

raw amplicon data are trimmed for adapter contamination and quality plotted

Update and rename RNAseq to 16S/trimming_adapterfilter
69000fd9 · Raphael Koll · GitHub · d2f68b76 · 69000fd9 · d2f68b76
Unverified Commit 69000fd9 authored Jan 4, 2024 by Raphael Koll Committed by GitHub Jan 4, 2024
--- a/16S/trimming_adapterfilter
+++ b/16S/trimming_adapterfilter
+#Steps#
+    #Step1 Trim-galore for Illumina adapers and general quality 
+        #Step2 FiltN and 16SPrimers and create Quality Plots
+            #Step3 Run dada2 pipeline once for differnt combination of trunclen
+                #Step4 Merge the SeqTabs from different sequencing runs and add taxonomy
+    ############################################################
+    #Step1 Trim-galore for Illumina adapers and general quality#
+    ############################################################
+        #############
+        #Trim-Galore#
+        #############
+#!/bin/bash
+#SBATCH --job-name=Trim
+#SBATCH --nodes=1
+#SBATCH --partition=std
+#SBATCH --tasks-per-node=16
+#SBATCH --time=12:00:00
+#SBATCH --export=NONE
+#SBATCH --error=error_Trim.txt
+#SBATCH --mail-user=raphael.koll@uni-hamburg.de
+#SBATCH --mail-type=ALL
+source /sw/batch/init.sh
+source /anaconda3/etc/profile.d/conda.sh
+conda activate CondaTrim
+for f1 in *_R1_001.fastq.gz
+do
+        f2=${f1%%_R1_001.fastq.gz}"_R2_001.fastq.gz"
+        trim_galore --paired --fastqc -o trim_galore/ $f1 $f2
+done
+#########
+#MultiQC#
+#########
+source /usw/baz7490/anaconda3/etc/profile.d/conda.sh
+conda activate MambaRNAseq
+multiqc .
+rm -r multiqc_data
+mv multiqc_report Batch2_Kiel_l6fbp_multiqc_report
+#######
+#Unzip#
+#######
+gunzip *fq.gz
+#Step2 FiltN and 16SPrimers and create Quality Plots
+#Steps done before:
+    #Trim-Galore cut out Illumina adapters (in about 30% of the reads)
+#!/bin/bash
+#SBATCH --job-name=dada2
+#SBATCH --nodes=1
+#SBATCH --tasks-per-node=16
+#SBATCH --partition=stl 
+#SBATCH --time=7-00:00:00
+#SBATCH --export=NONE
+#SBATCH --error=error_filterNandPrimer-Batch2-Run1-02.08.23.txt
+#SBATCH --mail-user=raphael.koll@uni-hamburg.de
+#SBATCH --mail-type=ALL
+source /sw/batch/init.sh
+source /anaconda3/etc/profile.d/conda.sh
+conda activate MambaDada2
+R -q -f cutadapt-filterNandPrimer-02.08.23.R > filterNandPrimer-02.08.23.out 2>&1
+######################################
+#cutadapt-filterNandPrimer-26.07.23.R#
+######################################
+path <- "/trim_galore"
+pathOut <- "/02.08.23"
+pathTrainsets <-"/16S/trainsets"
+Date <- "02.08.23"
+library(dada2)
+library(ShortRead)
+library(ggplot2)
+head(list.files(path))
+# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
+fnFs <- sort(list.files(path, pattern="R1_001_val_1.fq", full.names = TRUE))
+fnRs <- sort(list.files(path, pattern="R2_001_val_2.fq", full.names = TRUE))
+# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
+sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
+##cutadapt - identify and remove primer sequences: #Primer sequences: 341F and 806R used
+FWD = "CCTACGGGAGGCAGCAG"
+REV = "GGACTACHVGGGTWTCTAAT"
+##########START cutadapt PRIMER SCREENING AND REMOVAL##################
+allOrients <- function(primer) {
+  # Create all orientations of the input sequence
+  require(Biostrings)
+  dna <- DNAString(primer)  # The Biostrings works w/ DNAString objects rather than character vectors
+  orients <- c(Forward = dna, Complement = Biostrings::complement(dna), Reverse = Biostrings::reverse(dna),
+               RevComp = Biostrings::reverseComplement(dna))
+  return(sapply(orients, toString))  # Convert back to character vector
+}
+FWD.orients <- allOrients(FWD)
+REV.orients <- allOrients(REV)
+FWD.orients
+fnFs.filtN <- file.path(path, "filtN", basename(fnFs)) # Put N-filtered files in filtN/ subdirectory
+fnRs.filtN <- file.path(path, "filtN", basename(fnRs))
+##Takes some minutes when not running via batch system##
+filterAndTrim(fnFs, fnFs.filtN, fnRs, fnRs.filtN, maxN = 0, multithread = FALSE)
+primerHits <- function(primer, fn) {
+  # Counts number of reads in which the primer is found
+  nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
+  return(sum(nhits > 0))
+}
+rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.filtN[[1]]),
+      FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.filtN[[1]]),
+      REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.filtN[[1]]),
+      REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.filtN[[1]]))
+#Forward Complement Reverse RevComp
+#FWD.ForwardReads   27814          0       0       0
+#FWD.ReverseReads       0          0       0      21
+#REV.ForwardReads       0          0       0       2
+#REV.ReverseReads   26861          0       0       0
+cutadapt <- paste0("/anaconda3/envs/CondaTrim/bin/cutadapt") #Conda path to cutadapt
+system2(cutadapt, args = "--version") # Run shell commands from R
+path.cut <- file.path(path, "cutadapt")
+if(!dir.exists(path.cut)) dir.create(path.cut)
+fnFs.cut <- file.path(path.cut, basename(fnFs))
+fnRs.cut <- file.path(path.cut, basename(fnRs))
+FWD.RC <- dada2:::rc(FWD)
+REV.RC <- dada2:::rc(REV)
+# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
+R1.flags <- paste("-g", FWD, "-a", REV.RC, "--minimum-length 100")
+# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
+R2.flags <- paste("-G", REV, "-A", FWD.RC, "--minimum-length 100")
+# Run Cutadapt
+for(i in seq_along(fnFs)) {
+  system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
+                             "-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files
+                             fnFs.filtN[i], fnRs.filtN[i])) # input files
+}
+##TAKES some minutes when not run via batch##
+path.cut <- file.path(path, "cutadapt")
+if(!dir.exists(path.cut)) dir.create(path.cut)
+fnFs.cut <- file.path(path.cut, basename(fnFs))
+fnRs.cut <- file.path(path.cut, basename(fnRs))
+FWD.RC <- dada2:::rc(FWD)
+REV.RC <- dada2:::rc(REV)
+# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
+R1.flags <- paste("-g", FWD, "-a", REV.RC,"--minimum-length 100")
+# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
+R2.flags <- paste("-G", REV, "-A", FWD.RC,"--minimum-length 100")
+# Run Cutadapt
+for(i in seq_along(fnFs)) {
+  system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
+                             "-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files
+                             fnFs.filtN[i], fnRs.filtN[i])) # input files
+}
+rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]),
+      FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]),
+      REV.ForwardReads = sapply(REV.orients, primerHits,fn = fnFs.cut[[1]]),
+      REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))
+# Forward and reverse fastq filenames have the format:
+cutFs <- sort(list.files(path.cut, pattern = "R1_001_val_1.fq", full.names = TRUE))
+cutRs <- sort(list.files(path.cut, pattern = "R2_001_val_2.fq", full.names = TRUE))
+# Extract sample names, assuming filenames have format:
+get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1]
+sample.names <- unname(sapply(cutFs, get.sample.name))
+head(sample.names)
+############FINISH CUTADAPT PRIMER REMOVAL######################
+############ Plot selected QualityPlot #########################
+library(dada2)
+library(ShortRead)
+library(ggplot2)
+head(list.files(path))
+# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
+path.cut <- file.path(path, "cutadapt")
+head(list.files(path.cut))
+cutFs <- sort(list.files(path.cut, pattern = "R1_001_val_1.fq", full.names = TRUE))
+cutRs <- sort(list.files(path.cut, pattern = "R2_001_val_2.fq", full.names = TRUE))
+for(i in 1:length(cutFs[c(1,10,20,50,100,110,150)])){
+    plot.quals <- plotQualityProfile(cutFs[[i]])
+    g <- gsub( "_.*$", "", basename(cutFs[[i]]))
+    ggsave(plot.quals, filename = paste(g,"F.png", sep="_"), path = pathOut, device='png', dpi=300, width = 8,
+    height = 6)}
+for(i in 1:length(cutRs[c(1,10,20,50,100,110,150)])){
+    plot.quals <- plotQualityProfile(cutRs[[i]])
+    g <- gsub( "_.*$", "", basename(cutRs[[i]]))
+    ggsave(plot.quals, filename = paste(g,"R.png", sep="_"), path = pathOut, device='png', dpi=300, width = 8,
+    height = 6)}
--- a/RNAseq
+++ b/RNAseq