mirror of
https://github.com/gladstone-institutes/Bioinformatics-Workshops.git
synced 2025-11-30 09:45:43 -08:00
141 lines
7.4 KiB
Text
141 lines
7.4 KiB
Text
#Commands to run on wynton in session 2 of the Intro to RNA-seq data analysis workshop
|
|
|
|
#login to the wynton cluster
|
|
{local}$ ssh alice@log2.wynton.ucsf.edu
|
|
#enter your wynton password when prompted and hit enter
|
|
|
|
#once you are logged in to wynton,
|
|
#login to the development node
|
|
[alice@log2 ~]$ ssh dev3
|
|
|
|
#go to the Intro_to_RNA-seq_data_analysis folder that we uploaded in session 1
|
|
[alice@dev3 ~]$ cd Intro_to_RNA-seq_data_analysis/
|
|
|
|
#list the contents of the Intro_to_RNA-seq_data_analysis folder
|
|
#the singularity container "rna_seq_container.sif" that we created in session 1 should appear in the result
|
|
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls
|
|
|
|
#run fastqc on the Bacteria_GATTACA_L001_R1_001.fastq file using the singularity container
|
|
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif fastqc Bacteria_GATTACA_L001_R1_001.fastq
|
|
|
|
#once the above command completes running,
|
|
#check the output - there should be 2 output files
|
|
# 1. Bacteria_GATTACA_L001_R1_001_fastqc.html
|
|
# 2. Bacteria_GATTACA_L001_R1_001_fastqc.zip
|
|
#singularity container is just another way of running the same tool
|
|
#so the output files should match the ones we got when we ran fastqc without singularity container in session 1
|
|
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls
|
|
|
|
#download the fastqc html result file from wynton to the Downloads folder on local computer
|
|
#open a new terminal (MacOS) or command prompt (Windows) window
|
|
{local}$ scp alice@dt2.wynton.ucsf.edu:~/Intro_to_RNA-seq_data_analysis/Bacteria_GATTACA_L001_R1_001_fastqc.html Downloads
|
|
|
|
#once the above command completes running, open the downloaded Bacteria_GATTACA_L001_R1_001_fastqc.html file
|
|
#inspect the fastqc html file for different QC parameters
|
|
#you will notice adapter content reported in the fastqc html file
|
|
#so now, we will trim the reads in the fastq file to remove these adapters using cutadapt
|
|
|
|
#go back to the terminal (MacOS) or command prompt (Windows) window where you are logged in to wynton
|
|
#run cutadapt to trim the adapters from the Bacteria_GATTACA_L001_R1_001.fastq file using the singularity container
|
|
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif cutadapt \
|
|
-a file:Adapter_Sequence.fasta \
|
|
-o trimmed.fastq \
|
|
--minimum-length 20 \
|
|
--quality-cutoff 20 \
|
|
Bacteria_GATTACA_L001_R1_001.fastq
|
|
|
|
#once the above command completes running,
|
|
#check the output - there should be a trimmed.fastq output file
|
|
#the trimmed.fastq file is the trimmed fastq file generated by cutadapt after removing the adapter content
|
|
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls
|
|
|
|
#run fastqc on the trimmed.fastq file using the singularity container to check if the adapter content is gone
|
|
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif fastqc trimmed.fastq
|
|
|
|
#once the above command completes running,
|
|
#check the output - there should be 2 output files
|
|
# 1. trimmed_fastqc.html
|
|
# 2. trimmed_fastqc.zip
|
|
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls
|
|
|
|
#go back to the terminal (MacOS) or command prompt (Windows) window where you are on your local computer
|
|
#download the fastqc html result file from wynton to the Downloads folder on local computer
|
|
{local}$ scp alice@dt2.wynton.ucsf.edu:~/Intro_to_RNA-seq_data_analysis/trimmed_fastqc.html Downloads
|
|
|
|
#open and inspect the trimmed_fastqc.html file - the adapter content should be gone
|
|
#if all looks good in the html file, then run STAR to align the trimmed reads to the reference genome
|
|
|
|
#Before we can map reads to the reference genome using STAR,
|
|
#we need to index it. This will generate a transformed version of the
|
|
#genome that allows STAR to efficiently map sequences to it.
|
|
#go back to the terminal (MacOS) or command prompt (Windows) window where you are logged in to wynton
|
|
#run the below command to make a folder to store the star index files
|
|
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ mkdir star_index
|
|
|
|
#run the below command to generate the star index
|
|
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif STAR \
|
|
--runMode genomeGenerate \
|
|
--genomeDir ./star_index \
|
|
--genomeFastaFiles rDNA_sequence.fasta \
|
|
--genomeSAindexNbases 3
|
|
|
|
#once the above command completes running,
|
|
#check the contents of the star_index folder - there should be 8 output files
|
|
# 1. chrLength.txt
|
|
# 2. chrNameLength.txt
|
|
# 3. chrName.txt
|
|
# 4. chrStart.txt
|
|
# 5. Genome
|
|
# 6. genomeParameters.txt
|
|
# 7. SA
|
|
# 8. SAindex
|
|
#for more details on STAR go to:
|
|
#https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/STARmanual.pdf
|
|
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls star_index/
|
|
|
|
#run the star alignment on the trimmed.fastq file using the star index generated in the previous commands
|
|
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif STAR \
|
|
--genomeDir ./star_index \
|
|
--readFilesIn ./trimmed.fastq
|
|
|
|
#once the above command completes running,
|
|
#check the output - there should be 5 output files
|
|
# 1. Log.final.out - a summary of mapping statistics for the sample
|
|
# 2. Aligned.out.sam - the aligned reads, in SAM format
|
|
# 3. Log.out - a running log from STAR, with information about the run
|
|
# 4. Log.progress.out - job progress with the number of processed reads, % of mapped reads etc., updated every ~1 minute
|
|
# 5. SJ.out.tab - high confidence collapsed splice junctions in tab-delimited format. Only junctions supported by uniquely mapping reads are reported
|
|
#for more details on STAR go to:
|
|
#https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/STARmanual.pdf
|
|
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls
|
|
|
|
#look at the contents of the Log.final.out file to check the mapping statistics for the sample
|
|
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ less Log.final.out
|
|
|
|
#if the mapping statistics look good then run featureCounts to generate the read count matrices
|
|
#if the mapping statistics do not look good then troubleshoot the issues by seeking help online or contacting the Gladstone Bioinformatics Core
|
|
#run the featureCounts on the Aligned.out.sam file
|
|
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif featureCounts \
|
|
-a rDNA.gtf \
|
|
-t CDS \
|
|
-o counts.txt \
|
|
Aligned.out.sam
|
|
|
|
#once the above command completes running,
|
|
#check the output - there should be 2 output files
|
|
# 1. counts.txt - the counts matrix with genes as rows and samples as columns.
|
|
#    there might be additional columns for gene-related information (e.g., start position, end position, strand, etc.)
|
|
# 2. counts.txt.summary - tabulates how many reads were “assigned” (counted) and, for the rest, the reason they remained “unassigned”
|
|
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls
|
|
|
|
#look at the contents of the counts.txt.summary file to check the number of reads assigned to a gene
|
|
#make sure most of the reads are assigned to a gene; if not, try to troubleshoot by seeking help online or contacting the Gladstone Bioinformatics Core
|
|
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ less counts.txt.summary
|
|
|
|
#the counts.txt file can be used for further analysis (such as differential gene expression)
|
|
#go back to the terminal (MacOS) or command prompt (Windows) window where you are on your local computer
|
|
#download the counts.txt file from wynton to the Downloads folder on local computer
|
|
{local}$ scp alice@dt2.wynton.ucsf.edu:~/Intro_to_RNA-seq_data_analysis/counts.txt Downloads
|
|
|
|
|
|
############## END SESSION 2 ##############
|