Gladstone-Bioinformatics-Wo.../intro-rna-seq/steps_on_wynton_part2.txt
ayushi-agrawal-gladstone e43c3505b0 v4 May 2023
2023-05-11 15:06:17 -07:00

141 lines
7.4 KiB
Text

#Commands to run on wynton in session 2 of the Intro to RNA-seq data analysis workshop
#login to the wynton cluster
{local}$ ssh alice@log2.wynton.ucsf.edu
#enter your wynton password when prompted and hit enter
#once you are logged in to wynton,
#login to the development node
[alice@log2 ~]$ ssh dev3
#go to the Intro_to_RNA-seq_data_analysis folder that we uploaded in session 1
[alice@dev3 ~]$ cd Intro_to_RNA-seq_data_analysis/
#list the contents of the Intro_to_RNA-seq_data_analysis folder
#the singularity container "rna_seq_container.sif" that we created in session 1 should appear in the result
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls
#run fastqc on the Bacteria_GATTACA_L001_R1_001.fastq file using the singularity container
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif fastqc Bacteria_GATTACA_L001_R1_001.fastq
#once the above command completes running,
#check the output - there should be 2 output files
# 1. Bacteria_GATTACA_L001_R1_001_fastqc.html
# 2. Bacteria_GATTACA_L001_R1_001_fastqc.zip
#singularity container is just another way of running the same tool
#so the output files should match the ones we got when we ran fastqc without singularity container in session 1
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls
#download the fastqc html result file from wynton to the Downloads folder on local computer
#open a new terminal (MacOS) or command prompt (Windows) window
{local}$ scp alice@dt2.wynton.ucsf.edu:~/Intro_to_RNA-seq_data_analysis/Bacteria_GATTACA_L001_R1_001_fastqc.html Downloads
#once the above command completes running, open the downloaded Bacteria_GATTACA_L001_R1_001_fastqc.html file
#inspect the fastqc html file for different QC parameters
#you will notice adapter content reported in the fastqc html file
#so now, we will trim the reads in the fastq file to remove these adapters using cutadapt
#go back to the terminal (MacOS) or command prompt (Windows) window where you are logged in to wynton
#run cutadapt to trim the adapters from the Bacteria_GATTACA_L001_R1_001.fastq file using the singularity container
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif cutadapt \
-a file:Adapter_Sequence.fasta \
-o trimmed.fastq \
--minimum-length 20 \
--quality-cutoff 20 \
Bacteria_GATTACA_L001_R1_001.fastq
#once the above command completes running,
#check the output - there should be a trimmed.fastq output file
#the trimmed.fastq file is the trimmed fastq file generated by cutadapt after removing the adapter content
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls
#run fastqc on the trimmed.fastq file using the singularity container to check if the adapter content is gone
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif fastqc trimmed.fastq
#once the above command completes running,
#check the output - there should be 2 output files
# 1. trimmed_fastqc.html
# 2. trimmed_fastqc.zip
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls
#go back to the terminal (MacOS) or command prompt (Windows) window where you are on your local computer
#download the fastqc html result file from wynton to the Downloads folder on local computer
{local}$ scp alice@dt2.wynton.ucsf.edu:~/Intro_to_RNA-seq_data_analysis/trimmed_fastqc.html Downloads
#open and inspect the trimmed_fastqc.html file - the adapter content should be gone
#if all looks good in the html file, then run STAR to align the trimmed reads to the reference genome
#Before we can map reads to the reference genome using STAR,
#we need to index it. This will generate a transformed version of the
#genome that allows STAR to efficiently map sequences to it.
#go back to the terminal (MacOS) or command prompt (Windows) window where you are logged in to wynton
#run the below command to make a folder to store the star index files
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ mkdir star_index
#run the below command to generate the star index
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif STAR \
--runMode genomeGenerate \
--genomeDir ./star_index \
--genomeFastaFiles rDNA_sequence.fasta \
--genomeSAindexNbases 3
#once the above command completes running,
#check the contents of the star_index folder - there should be 8 output files
# 1. chrLength.txt
# 2. chrNameLength.txt
# 3. chrName.txt
# 4. chrStart.txt
# 5. Genome
# 6. genomeParameters.txt
# 7. SA
# 8. SAindex
#for more details on STAR go to:
#https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/STARmanual.pdf
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls star_index/
#run the star alignment on the trimmed.fastq file using the star index generated in the previous commands
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif STAR \
--genomeDir ./star_index \
--readFilesIn ./trimmed.fastq
#once the above command completes running,
#check the output - there should be 5 output files
# 1. Log.final.out - a summary of mapping statistics for the sample
# 2. Aligned.out.sam - the aligned reads, in SAM format
# 3. Log.out - a running log from STAR, with information about the run
# 4. Log.progress.out - job progress with the number of processed reads, % of mapped reads etc., updated every ~1 minute
# 5. SJ.out.tab - high confidence collapsed splice junctions in tab-delimited format. Only junctions supported by uniquely mapping reads are reported
#for more details on STAR go to:
#https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/STARmanual.pdf
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls
#look at the contents of the Log.final.out file to check the mapping statistics for the sample
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ less Log.final.out
#if the mapping statistics look good then run featureCounts to generate the read count matrices
#if the mapping statistics do not look good, then troubleshoot the issues by seeking help online or contacting the Gladstone Bioinformatics Core
#run featureCounts on the Aligned.out.sam file
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif featureCounts \
-a rDNA.gtf \
-t CDS \
-o counts.txt \
Aligned.out.sam
#once the above command completes running,
#check the output - there should be 2 output files
# 1. counts.txt - the counts matrix with genes as rows and samples as columns.
# there might be additional columns for gene related information (e.g., start position, end position, strand etc.)
# 2. counts.txt.summary - tabulates how many reads were “assigned” or counted and the reason they remained “unassigned”
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls
#look at the contents of the counts.txt.summary file to check the number of reads assigned to a gene
#make sure most of the reads are assigned to a gene; if not, try to troubleshoot by seeking help online or contacting the Gladstone Bioinformatics Core
[alice@dev3 Intro_to_RNA-seq_data_analysis]$ less counts.txt.summary
#the counts.txt file can be used for further analysis (such as differential gene expression)
#go back to the terminal (MacOS) or command prompt (Windows) window where you are on your local computer
#download the counts.txt file from wynton to the Downloads folder on local computer
{local}$ scp alice@dt2.wynton.ucsf.edu:~/Intro_to_RNA-seq_data_analysis/counts.txt Downloads
############## END SESSION 2 ##############