diff --git a/16S/trimming_adapterfilter b/16S/trimming_adapterfilter new file mode 100644 index 0000000000000000000000000000000000000000..bfa3116f820b3247944c141aff9960ecdda228c9 --- /dev/null +++ b/16S/trimming_adapterfilter @@ -0,0 +1,199 @@ +#Steps# + #Step1 Trim-galore for Illumina adapers and general quality + #Step2 FiltN and 16SPrimers and create Quality Plots + #Step3 Run dada2 pipeline once for differnt combination of trunclen + #Step4 Merge the SeqTabs from different sequencing runs and add taxonomy + + ############################################################ + #Step1 Trim-galore for Illumina adapers and general quality# + ############################################################ + ############# + #Trim-Galore# + ############# +#!/bin/bash +#SBATCH --job-name=Trim +#SBATCH --nodes=1 +#SBATCH --partition=std +#SBATCH --tasks-per-node=16 +#SBATCH --time=12:00:00 +#SBATCH --export=NONE +#SBATCH --error=error_Trim.txt +#SBATCH --mail-user=raphael.koll@uni-hamburg.de +#SBATCH --mail-type=ALL +source /sw/batch/init.sh +source /anaconda3/etc/profile.d/conda.sh +conda activate CondaTrim +for f1 in *_R1_001.fastq.gz +do + f2=${f1%%_R1_001.fastq.gz}"_R2_001.fastq.gz" + trim_galore --paired --fastqc -o trim_galore/ $f1 $f2 +done + +######### +#MultiQC# +######### +source /usw/baz7490/anaconda3/etc/profile.d/conda.sh +conda activate MambaRNAseq +multiqc . + +rm -r multiqc_data +mv multiqc_report Batch2_Kiel_l6fbp_multiqc_report + + +####### +#Unzip# +####### +gunzip *fq.gz + +#Step2 FiltN and 16SPrimers and create Quality Plots +#Steps done before: + #Trim-Galore cut out Illumina adapters (in about 30% of the reads) + +#!/bin/bash +#SBATCH --job-name=dada2 +#SBATCH --nodes=1 +#SBATCH --tasks-per-node=16 +#SBATCH --partition=stl +#SBATCH --time=7-00:00:00 +#SBATCH --export=NONE +#SBATCH --error=error_filterNandPrimer-Batch2-Run1-02.08.23.txt +#SBATCH --mail-user=raphael.koll@uni-hamburg.de +#SBATCH --mail-type=ALL +source /sw/batch/init.sh +source /anaconda3/etc/profile.d/conda.sh +conda activate MambaDada2 +R -q -f cutadapt-filterNandPrimer-02.08.23.R > filterNandPrimer-02.08.23.out 2>&1 + +###################################### +#cutadapt-filterNandPrimer-26.07.23.R# +###################################### +path <- "/trim_galore" +pathOut <- "/02.08.23" +pathTrainsets <-"/16S/trainsets" +Date <- "02.08.23" + +library(dada2) +library(ShortRead) +library(ggplot2) +head(list.files(path)) +# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq +fnFs <- sort(list.files(path, pattern="R1_001_val_1.fq", full.names = TRUE)) +fnRs <- sort(list.files(path, pattern="R2_001_val_2.fq", full.names = TRUE)) +# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq +sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1) + +##cutadapt - identify and remove primer sequences: #Primer sequences: 341F and 806R used +FWD = "CCTACGGGAGGCAGCAG" +REV = "GGACTACHVGGGTWTCTAAT" + +##########START cutadapt PRIMER SCREENING AND REMOVAL################## +allOrients <- function(primer) { + # Create all orientations of the input sequence + require(Biostrings) + dna <- DNAString(primer) # The Biostrings works w/ DNAString objects rather than character vectors + orients <- c(Forward = dna, Complement = Biostrings::complement(dna), Reverse = Biostrings::reverse(dna), + RevComp = Biostrings::reverseComplement(dna)) + return(sapply(orients, toString)) # Convert back to character vector +} +FWD.orients <- allOrients(FWD) +REV.orients <- allOrients(REV) +FWD.orients + +fnFs.filtN <- file.path(path, "filtN", basename(fnFs)) # Put N-filtered files in filtN/ subdirectory +fnRs.filtN <- file.path(path, "filtN", basename(fnRs)) + +##Takes some minutes when not running via batch system## +filterAndTrim(fnFs, fnFs.filtN, fnRs, fnRs.filtN, maxN = 0, multithread = FALSE) + + +primerHits <- function(primer, fn) { + # Counts number of reads in which the primer is found + nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE) + return(sum(nhits > 0)) +} +rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.filtN[[1]]), + FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.filtN[[1]]), + REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.filtN[[1]]), + REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.filtN[[1]])) +#Forward Complement Reverse RevComp +#FWD.ForwardReads 27814 0 0 0 +#FWD.ReverseReads 0 0 0 21 +#REV.ForwardReads 0 0 0 2 +#REV.ReverseReads 26861 0 0 0 + +cutadapt <- paste0("/anaconda3/envs/CondaTrim/bin/cutadapt") #Conda path to cutadapt +system2(cutadapt, args = "--version") # Run shell commands from R + +path.cut <- file.path(path, "cutadapt") +if(!dir.exists(path.cut)) dir.create(path.cut) +fnFs.cut <- file.path(path.cut, basename(fnFs)) +fnRs.cut <- file.path(path.cut, basename(fnRs)) + +FWD.RC <- dada2:::rc(FWD) +REV.RC <- dada2:::rc(REV) +# Trim FWD and the reverse-complement of REV off of R1 (forward reads) +R1.flags <- paste("-g", FWD, "-a", REV.RC, "--minimum-length 100") +# Trim REV and the reverse-complement of FWD off of R2 (reverse reads) +R2.flags <- paste("-G", REV, "-A", FWD.RC, "--minimum-length 100") +# Run Cutadapt +for(i in seq_along(fnFs)) { + system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads + "-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files + fnFs.filtN[i], fnRs.filtN[i])) # input files +} +##TAKES some minutes when not run via batch## +path.cut <- file.path(path, "cutadapt") +if(!dir.exists(path.cut)) dir.create(path.cut) +fnFs.cut <- file.path(path.cut, basename(fnFs)) +fnRs.cut <- file.path(path.cut, basename(fnRs)) + +FWD.RC <- dada2:::rc(FWD) +REV.RC <- dada2:::rc(REV) +# Trim FWD and the reverse-complement of REV off of R1 (forward reads) +R1.flags <- paste("-g", FWD, "-a", REV.RC,"--minimum-length 100") +# Trim REV and the reverse-complement of FWD off of R2 (reverse reads) +R2.flags <- paste("-G", REV, "-A", FWD.RC,"--minimum-length 100") +# Run Cutadapt +for(i in seq_along(fnFs)) { + system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads + "-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files + fnFs.filtN[i], fnRs.filtN[i])) # input files +} + +rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]), + FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]), + REV.ForwardReads = sapply(REV.orients, primerHits,fn = fnFs.cut[[1]]), + REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]])) + +# Forward and reverse fastq filenames have the format: +cutFs <- sort(list.files(path.cut, pattern = "R1_001_val_1.fq", full.names = TRUE)) +cutRs <- sort(list.files(path.cut, pattern = "R2_001_val_2.fq", full.names = TRUE)) + +# Extract sample names, assuming filenames have format: +get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1] +sample.names <- unname(sapply(cutFs, get.sample.name)) +head(sample.names) + +############FINISH CUTADAPT PRIMER REMOVAL###################### + + +############ Plot selected QualityPlot ######################### +library(dada2) +library(ShortRead) +library(ggplot2) +head(list.files(path)) +# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq +path.cut <- file.path(path, "cutadapt") +head(list.files(path.cut)) +cutFs <- sort(list.files(path.cut, pattern = "R1_001_val_1.fq", full.names = TRUE)) +cutRs <- sort(list.files(path.cut, pattern = "R2_001_val_2.fq", full.names = TRUE)) +for(i in 1:length(cutFs[c(1,10,20,50,100,110,150)])){ + plot.quals <- plotQualityProfile(cutFs[[i]]) + g <- gsub( "_.*$", "", basename(cutFs[[i]])) + ggsave(plot.quals, filename = paste(g,"F.png", sep="_"), path = pathOut, device='png', dpi=300, width = 8, + height = 6)} +for(i in 1:length(cutRs[c(1,10,20,50,100,110,150)])){ + plot.quals <- plotQualityProfile(cutRs[[i]]) + g <- gsub( "_.*$", "", basename(cutRs[[i]])) + ggsave(plot.quals, filename = paste(g,"R.png", sep="_"), path = pathOut, device='png', dpi=300, width = 8, + height = 6)} diff --git a/RNAseq b/RNAseq deleted file mode 100644 index 8b137891791fe96927ad78e64b0aad7bded08bdc..0000000000000000000000000000000000000000 --- a/RNAseq +++ /dev/null @@ -1 +0,0 @@ -