Skip to content
Snippets Groups Projects
Unverified Commit 69000fd9 authored by Raphael Koll's avatar Raphael Koll Committed by GitHub
Browse files

Update and rename RNAseq to 16S/trimming_adapterfilter

raw amplicon data are trimmed for adapter contamination and quality plotted
parent d2f68b76
No related branches found
No related tags found
No related merge requests found
#Steps#
#Step1 Trim-galore for Illumina adapers and general quality
#Step2 FiltN and 16SPrimers and create Quality Plots
#Step3 Run dada2 pipeline once for differnt combination of trunclen
#Step4 Merge the SeqTabs from different sequencing runs and add taxonomy
############################################################
#Step1 Trim-galore for Illumina adapers and general quality#
############################################################
#############
#Trim-Galore#
#############
#!/bin/bash
#SBATCH --job-name=Trim
#SBATCH --nodes=1
#SBATCH --partition=std
#SBATCH --tasks-per-node=16
#SBATCH --time=12:00:00
#SBATCH --export=NONE
#SBATCH --error=error_Trim.txt
#SBATCH --mail-user=raphael.koll@uni-hamburg.de
#SBATCH --mail-type=ALL
source /sw/batch/init.sh
source /anaconda3/etc/profile.d/conda.sh
conda activate CondaTrim
for f1 in *_R1_001.fastq.gz
do
f2=${f1%%_R1_001.fastq.gz}"_R2_001.fastq.gz"
trim_galore --paired --fastqc -o trim_galore/ $f1 $f2
done
#########
#MultiQC#
#########
source /usw/baz7490/anaconda3/etc/profile.d/conda.sh
conda activate MambaRNAseq
multiqc .
rm -r multiqc_data
mv multiqc_report Batch2_Kiel_l6fbp_multiqc_report
#######
#Unzip#
#######
gunzip *fq.gz
#Step2 FiltN and 16SPrimers and create Quality Plots
#Steps done before:
#Trim-Galore cut out Illumina adapters (in about 30% of the reads)
#!/bin/bash
#SBATCH --job-name=dada2
#SBATCH --nodes=1
#SBATCH --tasks-per-node=16
#SBATCH --partition=stl
#SBATCH --time=7-00:00:00
#SBATCH --export=NONE
#SBATCH --error=error_filterNandPrimer-Batch2-Run1-02.08.23.txt
#SBATCH --mail-user=raphael.koll@uni-hamburg.de
#SBATCH --mail-type=ALL
source /sw/batch/init.sh
source /anaconda3/etc/profile.d/conda.sh
conda activate MambaDada2
R -q -f cutadapt-filterNandPrimer-02.08.23.R > filterNandPrimer-02.08.23.out 2>&1
######################################
#cutadapt-filterNandPrimer-26.07.23.R#
######################################
path <- "/trim_galore"
pathOut <- "/02.08.23"
pathTrainsets <-"/16S/trainsets"
Date <- "02.08.23"
library(dada2)
library(ShortRead)
library(ggplot2)
head(list.files(path))
# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
fnFs <- sort(list.files(path, pattern="R1_001_val_1.fq", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="R2_001_val_2.fq", full.names = TRUE))
# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
##cutadapt - identify and remove primer sequences: #Primer sequences: 341F and 806R used
FWD = "CCTACGGGAGGCAGCAG"
REV = "GGACTACHVGGGTWTCTAAT"
##########START cutadapt PRIMER SCREENING AND REMOVAL##################
allOrients <- function(primer) {
# Create all orientations of the input sequence
require(Biostrings)
dna <- DNAString(primer) # The Biostrings works w/ DNAString objects rather than character vectors
orients <- c(Forward = dna, Complement = Biostrings::complement(dna), Reverse = Biostrings::reverse(dna),
RevComp = Biostrings::reverseComplement(dna))
return(sapply(orients, toString)) # Convert back to character vector
}
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)
FWD.orients
fnFs.filtN <- file.path(path, "filtN", basename(fnFs)) # Put N-filtered files in filtN/ subdirectory
fnRs.filtN <- file.path(path, "filtN", basename(fnRs))
##Takes some minutes when not running via batch system##
filterAndTrim(fnFs, fnFs.filtN, fnRs, fnRs.filtN, maxN = 0, multithread = FALSE)
primerHits <- function(primer, fn) {
# Counts number of reads in which the primer is found
nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
return(sum(nhits > 0))
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.filtN[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.filtN[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.filtN[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.filtN[[1]]))
#Forward Complement Reverse RevComp
#FWD.ForwardReads 27814 0 0 0
#FWD.ReverseReads 0 0 0 21
#REV.ForwardReads 0 0 0 2
#REV.ReverseReads 26861 0 0 0
cutadapt <- paste0("/anaconda3/envs/CondaTrim/bin/cutadapt") #Conda path to cutadapt
system2(cutadapt, args = "--version") # Run shell commands from R
path.cut <- file.path(path, "cutadapt")
if(!dir.exists(path.cut)) dir.create(path.cut)
fnFs.cut <- file.path(path.cut, basename(fnFs))
fnRs.cut <- file.path(path.cut, basename(fnRs))
FWD.RC <- dada2:::rc(FWD)
REV.RC <- dada2:::rc(REV)
# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
R1.flags <- paste("-g", FWD, "-a", REV.RC, "--minimum-length 100")
# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
R2.flags <- paste("-G", REV, "-A", FWD.RC, "--minimum-length 100")
# Run Cutadapt
for(i in seq_along(fnFs)) {
system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
"-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files
fnFs.filtN[i], fnRs.filtN[i])) # input files
}
##TAKES some minutes when not run via batch##
path.cut <- file.path(path, "cutadapt")
if(!dir.exists(path.cut)) dir.create(path.cut)
fnFs.cut <- file.path(path.cut, basename(fnFs))
fnRs.cut <- file.path(path.cut, basename(fnRs))
FWD.RC <- dada2:::rc(FWD)
REV.RC <- dada2:::rc(REV)
# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
R1.flags <- paste("-g", FWD, "-a", REV.RC,"--minimum-length 100")
# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
R2.flags <- paste("-G", REV, "-A", FWD.RC,"--minimum-length 100")
# Run Cutadapt
for(i in seq_along(fnFs)) {
system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
"-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files
fnFs.filtN[i], fnRs.filtN[i])) # input files
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits,fn = fnFs.cut[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))
# Forward and reverse fastq filenames have the format:
cutFs <- sort(list.files(path.cut, pattern = "R1_001_val_1.fq", full.names = TRUE))
cutRs <- sort(list.files(path.cut, pattern = "R2_001_val_2.fq", full.names = TRUE))
# Extract sample names, assuming filenames have format:
get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1]
sample.names <- unname(sapply(cutFs, get.sample.name))
head(sample.names)
############FINISH CUTADAPT PRIMER REMOVAL######################
############ Plot selected QualityPlot #########################
library(dada2)
library(ShortRead)
library(ggplot2)
head(list.files(path))
# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
path.cut <- file.path(path, "cutadapt")
head(list.files(path.cut))
cutFs <- sort(list.files(path.cut, pattern = "R1_001_val_1.fq", full.names = TRUE))
cutRs <- sort(list.files(path.cut, pattern = "R2_001_val_2.fq", full.names = TRUE))
for(i in 1:length(cutFs[c(1,10,20,50,100,110,150)])){
plot.quals <- plotQualityProfile(cutFs[[i]])
g <- gsub( "_.*$", "", basename(cutFs[[i]]))
ggsave(plot.quals, filename = paste(g,"F.png", sep="_"), path = pathOut, device='png', dpi=300, width = 8,
height = 6)}
for(i in 1:length(cutRs[c(1,10,20,50,100,110,150)])){
plot.quals <- plotQualityProfile(cutRs[[i]])
g <- gsub( "_.*$", "", basename(cutRs[[i]]))
ggsave(plot.quals, filename = paste(g,"R.png", sep="_"), path = pathOut, device='png', dpi=300, width = 8,
height = 6)}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment