- Upstream analysis of RNAseq
- DOWNLOAD TOOLS
- SETUP WORKING DIR
- PATH
- RNASEQ: RAW DATA PROCESSING & ALIGNMENT
- COMMAND TO DOWNLOAD RAW DATA OF THE PROJECT
- CHANGE THE FILE NAME
- RAW DATA PROCESSING: QUALITY CONTROL
- RAW DATA PROCESSING: TRIMMING & FILTERING
- ALIGNMENT WITH STAR
- POST-PROCESSING ALIGNMENT DATA: REMOVE DUPLICATE
- POST-PROCESSING ALIGNMENT DATA: QUALITY CONTROL USING RSEQQC
- QUANTIFICATION
mkdir tools
sudo apt update
sudo apt install sra-toolkit
fastq-dump --version
## Install & unzip
wget https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.12.1.zip
unzip fastqc_v0.12.1.zip
## Write at the end of the .bashrc file. This command will let you export the path to FastQC, and
execute it everywhere
nano ~/.bashrc
export PATH='path/to/FastQC/':$PATH
## For example:
export PATH='/home/duydao/dnaseq_work/tools/FastQC':$PATH
source ~/.bashrc
## Try it by running
fastqc
wget http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.39.zip
unzip Trimmomatic-0.39.zip
# Get latest STAR source from releases
wget https://github.com/alexdobin/STAR/archive/2.7.10b.tar.gz
tar -xzf 2.7.10b.tar.gz
cd STAR-2.7.10b
# Compile
cd source
make STAR
# Export path to STAR
nano ~/.bashrc
export PATH='path/to/tools/STAR-2.7.10b/bin/Linux_x86_64/':$PATH
git clone https://github.com/ShiLab-Bioinformatics/subread.git
cd subread/src
make -f Makefile.Linux
nano ~/.bashrc
export PATH='path/to/tools/subread/bin':$PATH
Download at this link: https://sourceforge.net/projects/rseqc/files/RSeQC-5.0.1.tar.gz/download
mv ~/Downloads/RSeQC-5.0.1.tar.gz tools/
gunzip RSeQC-5.0.1.tar.gz
tar -xvf RSeQC-5.0.1.tar.gz
git clone https://github.com/broadinstitute/picard.git
cd picard/
./gradlew shadowJar
mkdir sacCer2
mkdir -p sacCer2/sra/
mkdir -p sacCer2/ref/
mkdir -p sacCer2/ref/annotation/
mkdir -p sacCer2/ref/genome/
mkdir -p sacCer2/raw/
mkdir -p sacCer2/raw/qc_check
mkdir -p sacCer2/trim/
mkdir -p sacCer2/trim/qc_check
mkdir -p sacCer2/align
mkdir -p sacCer2/align/bam
mkdir -p sacCer2/counts/
.
├── align
├── counts
├── raw
│ └── qc_check
├── ref
│ ├── annotation
│ └── genome
├── sra
└── trim
└── qc_check
cd $p_ref/genome/
wget https://hgdownload.soe.ucsc.edu/goldenPath/sacCer3/bigZips/sacCer3.fa.gz
gunzip sacCer3.fa.gz
cd $p_ref/annotation/
wget https://hgdownload.soe.ucsc.edu/goldenPath/sacCer3/bigZips/genes/sacCer3.ensGene.gtf.gz
gunzip sacCer3.ensGene.gtf.gz
Follow the guide at this link: https://www.biostars.org/p/465274/
Download the BED of sacCer3 at this link: https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=1648314330_jwKWTJMxFAfhUJN4aLNKfho0wfgN&clade=other&org=S.+cerevisiae&db=sacCer3&hgta_group=genes&hgta_track=ensGene&hgta_table=0&hgta_regionType=genome&position=chrIV%3A765%2C966-775%2C965&hgta_outputType=primaryTable&hgta_outFileName=
p_sra='your/path/to/sra'
p_raw='your/path/to/raw'
p_trim="your/path/to/trim"
trimmomatic_jar='your/path/to/trimmomatic-0.39.jar'
p_ref='your/path/to/ref'
p_align='your/path/to/align'
p_counts='your/path/to/counts'
p_bam='your/path/to/bam'
cd $p_sra
nano SraAccList.csv
SRR23867632
SRR23867633
SRR23867634
SRR23867635
SRR23867636
SRR23867637
for sra_acc in $(cat $p_sra/SraAccList.csv); do
# Get the SRA
prefetch -v $sra_acc --output-directory $p_sra
echo "=== $sra_acc sra file downloaded ==="
# Download fastq file
fastq-dump \
--outdir $p_raw/ \
--split-files $p_sra/${sra_acc}/${sra_acc}.sra \
&& gzip -f $p_raw/*.fastq
echo "=== Split $sra_acc file completed ==="
done
cd $p_raw
mv SRR23867633_1.fastq.gz WT_C_1_R1.fastq.gz
mv SRR23867633_2.fastq.gz WT_C_1_R2.fastq.gz
mv SRR23867634_1.fastq.gz WT_E_1_R1.fastq.gz
mv SRR23867634_2.fastq.gz WT_E_1_R2.fastq.gz
mv SRR23867635_1.fastq.gz WT_C_2_R1.fastq.gz
mv SRR23867635_2.fastq.gz WT_C_2_R2.fastq.gz
mv SRR23867632_1.fastq.gz WT_E_2_R1.fastq.gz
mv SRR23867632_2.fastq.gz WT_E_2_R2.fastq.gz
mv SRR23867636_1.fastq.gz WT_C_3_R1.fastq.gz
mv SRR23867636_2.fastq.gz WT_C_3_R2.fastq.gz
mv SRR23867637_1.fastq.gz WT_E_3_R1.fastq.gz
mv SRR23867637_2.fastq.gz WT_E_3_R2.fastq.gz
tree -h raw/
├── [4.0K] qc_check
├── [450M] WT_C_1_R1.fastq.gz
├── [448M] WT_C_1_R2.fastq.gz
├── [445M] WT_C_2_R1.fastq.gz
├── [449M] WT_C_2_R2.fastq.gz
├── [463M] WT_C_3_R1.fastq.gz
├── [468M] WT_C_3_R2.fastq.gz
├── [472M] WT_E_1_R1.fastq.gz
├── [472M] WT_E_1_R2.fastq.gz
├── [493M] WT_E_2_R1.fastq.gz
├── [560M] WT_E_2_R2.fastq.gz
├── [485M] WT_E_3_R1.fastq.gz
└── [483M] WT_E_3_R2.fastq.gz
Use FastQC
for file in $(ls $p_raw/*gz); do
fastqc $file -o $p_raw/qc_check/
done
Trim and remove bad quality sequence with Trimmomatic
## Loop over the files ending with ".gz"
cd $p_raw
for file in $(ls *_R1*.gz);do
# Extract the base name without the extension
base_name="${file%_*.*.*}"
# Construct the input and output file names
read_1=${base_name}_R1.fastq.gz
read_2=${base_name}_R2.fastq.gz
output_paired_1="${base_name}_R1_paired.fastq.gz"
output_unpaired_1="${base_name}_R1_unpaired.fastq.gz"
output_paired_2="${base_name}_R2_paired.fastq.gz"
output_unpaired_2="${base_name}_R2_unpaired.fastq.gz"
# Run Trimmomatic with the specified parameters
java -jar $trimmomatic_jar PE \
-threads 14 \
-trimlog "$p_trim/trim_test.log" \
"$p_raw/$read_1" \
"$p_raw/$read_2" \
"$p_trim/$output_paired_1" \
"$p_trim/$output_unpaired_1" \
"$p_trim/$output_paired_2" \
"$p_trim/$output_unpaired_2" \
HEADCROP:10 \
CROP:65 \
LEADING:3 \
TRAILING:3 \
SLIDINGWINDOW:4:30 \
MINLEN:36
# Remove unpaired file
rm $p_trim/$output_unpaired_1 $p_trim/$output_unpaired_2
# Check fastqc again
fastqc $p_trim/$output_paired_1 -o $p_trim/qc_check
fastqc $p_trim/$output_paired_2 -o $p_trim/qc_check
done
# Using STAR to index the genome
STAR \
--runThreadN 12 \
--runMode genomeGenerate \
--genomeDir $p_ref/genome \
--genomeFastaFiles $p_ref/genome/sacCer3.fa
cd $p_trim
#
for file in $(ls *_R1*.gz);do
# Extract the base name without the extension
base_name="${file%_*_*.*.*}"
# Construct the input and output file names
read_1=${base_name}_R1_paired.fastq.gz
read_2=${base_name}_R2_paired.fastq.gz
# Align with STAR
STAR \
--runThreadN 12 \
--runMode alignReads \
--readFilesType Fastx \
--readFilesCommand zcat \
--genomeDir $p_ref/genome \
--sjdbGTFfile $p_ref/annotation/sacCer3.ensGene.gtf \
--sjdbOverhang 100 \
--readFilesIn \
$read_1 \
$read_2 \
--outSAMtype BAM SortedByCoordinate \
--outSAMmode Full \
--outFileNamePrefix $p_align/${base_name}_
done
cd $p_align
mkdir bam
mv *.bam bam/
cd $p_align/bam/
# Create path to your picard tool
picard_jar='path/to/picard.jar'
# Using for loop to remove all duplicate in bam file
for file in $(ls *.bam);do
base_name="${file%Aligned*}"
java -jar $picard_jar MarkDuplicates \
--INPUT ${base_name}Aligned.sortedByCoord.out.bam \
--OUTPUT ${base_name}_rmdup.bam \
--METRICS_FILE ${base_name}_rmdup.metrics2 \
--REMOVE_DUPLICATES true
done
Try to explore the quality of all of your BAM file using RSeqQC package
-
Index bam
samtools index WT_C_1_rmdup.bam
-
Genebody coverage
geneBody_coverage.py \ -r $p_ref/annotation/sacCer3.ens.bed \ -i $p_bam/WT_C_1_rmdup.bam \ -o gene_coverage
-
Junction annotation
junction_annotation.py \ -r $p_ref/annotation/sacCer3.ens.bed \ -i $p_bam/WT_C_1_rmdup.bam \ -o WT_C_1
-
Junction saturation
junction_saturation.py \ -r $p_ref/annotation/sacCer3.ens.bed \ -i $p_bam/WT_C_1_rmdup.bam \ -o WT_C_1
-
Read Distribution
read_distribution.py \ -i $p_bam/WT_C_1_rmdup.bam \ -r $p_ref/annotation/sacCer3.ens.bed
featureCounts counts how many reads map to genomic features, such as genes, exon, promoter and genomic bins.
featureCounts \
-p \
-a $p_annotation/sacCer3.ensGene.gtf \
-o $p_counts/all.featureCounts.txt \
$p_bam/WT_C_1_rmdup.bam \
$p_bam/WT_C_2_rmdup.bam \
$p_bam/WT_C_3_rmdup.bam \
$p_bam/WT_E_1_rmdup.bam \
$p_bam/WT_E_2_rmdup.bam \
$p_bam/WT_E_3_rmdup.bam