From cd68477b9d4e2216d8f28c1d6572eec1ca0a6deb Mon Sep 17 00:00:00 2001 From: Ayushi Agrawal <88406934+ayushi-agrawal-gladstone@users.noreply.github.com> Date: Mon, 21 Mar 2022 11:46:20 -0700 Subject: [PATCH] Add files via upload --- intro-rna-seq/steps_on_wynton_session2.txt | 141 +++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 intro-rna-seq/steps_on_wynton_session2.txt diff --git a/intro-rna-seq/steps_on_wynton_session2.txt b/intro-rna-seq/steps_on_wynton_session2.txt new file mode 100644 index 0000000..124ed86 --- /dev/null +++ b/intro-rna-seq/steps_on_wynton_session2.txt @@ -0,0 +1,141 @@ +#Commands to run on wynton in session 2 of the Intro to RNA-seq data analysis workshop + +#login to the wynton cluster +{local}$ ssh alice@log2.wynton.ucsf.edu +#enter your wynton password when prompted and hit enter + +#once you are logged in to wynton, +#login to the development node +[alice@log2 ~]$ ssh dev3 + +#go to the Intro_to_RNA-seq_data_analysis folder that we uploaded in session 1 +[alice@dev3 ~]$ cd Intro_to_RNA-seq_data_analysis/ + +#list the contents of the Intro_to_RNA-seq_data_analysis folder +#the singularity container "rna_seq_container.sif" that we created in session 1 should appear in the result +[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls + +#run fastqc on the Bacteria_GATTACA_L001_R1_001.fastq file using the singularity container +[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif fastqc Bacteria_GATTACA_L001_R1_001.fastq + +#once the above command completes running, +#check the output - there should be 2 output files +# 1. Bacteria_GATTACA_L001_R1_001_fastqc.html +# 2. 
Bacteria_GATTACA_L001_R1_001_fastqc.zip +#singularity container is just another way of running the same tool +#so the output files should match the ones we got when we ran fastqc without singularity container in session 1 +[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls + +#download the fastqc html result file from wynton to the Downloads folder on local computer +#open a new terminal (MacOS) or command prompt (Windows) window +{local}$ scp alice@dt2.wynton.ucsf.edu:~/Intro_to_RNA-seq_data_analysis/Bacteria_GATTACA_L001_R1_001_fastqc.html Downloads + +#once the above command completes running, open the downloaded Bacteria_GATTACA_L001_R1_001_fastqc.html file +#inspect the fastqc html file for different QC parameters +#you will notice adapter content reported in the fastqc html file +#so now, we will trim the reads in the fastq file to remove these adapters using cutadapt + +#go back to the terminal (MacOS) or command prompt (Windows) window where you are logged in to wynton +#run cutadapt to trim the adapters from the Bacteria_GATTACA_L001_R1_001.fastq file using the singularity container +[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif cutadapt \ +-a file:Adapter_Sequence.fasta \ +-o trimmed.fastq \ +--minimum-length 20 \ +--quality-cutoff 20 \ +Bacteria_GATTACA_L001_R1_001.fastq + +#once the above command completes running, +#check the output - there should be a trimmed.fastq output file +#the trimmed.fastq file is the trimmed fastq file generated by cutadapt after removing the adapter content +[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls + +#run fastqc on the trimmed.fastq file using the singularity container to check if the adapter content is gone +[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif fastqc trimmed.fastq + +#once the above command completes running, +#check the output - there should be 2 output files +# 1. trimmed_fastqc.html +# 2. 
trimmed_fastqc.zip +[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls + +#go back to the terminal (MacOS) or command prompt (Windows) window where you are on your local computer +#download the fastqc html result file from wynton to the Downloads folder on local computer +{local}$ scp alice@dt2.wynton.ucsf.edu:~/Intro_to_RNA-seq_data_analysis/trimmed_fastqc.html Downloads + +#open and inspect the trimmed_fastqc.html file - the adapter content should be gone +#if all looks good in the html file, then run STAR to align the trimmed reads to the reference genome + +#Before we can map reads to the reference genome using STAR, +#we need to index it. This will generate a transformed version of the +#genome that allows STAR to efficiently map sequences to it. +#go back to the terminal (MacOS) or command prompt (Windows) window where you are logged in to wynton +#run the below command to make a folder to store the star index files +[alice@dev3 Intro_to_RNA-seq_data_analysis]$ mkdir star_index + +#run the below command to generate the star index +[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif STAR \ +--runMode genomeGenerate \ +--genomeDir ./star_index \ +--genomeFastaFiles rDNA_sequence.fasta \ +--genomeSAindexNbases 3 + +#once the above command completes running, +#check the contents of the star_index folder - there should be 8 output files +# 1. chrLength.txt +# 2. chrNameLength.txt +# 3. chrName.txt +# 4. chrStart.txt +# 5. Genome +# 6. genomeParameters.txt +# 7. SA +# 8. 
SAindex +#for more details on STAR go to: +#https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/STARmanual.pdf +[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls star_index/ + +#run the star alignment on the trimmed.fastq file using the star index generated in the previous commands +[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif STAR \ +--genomeDir ./star_index \ +--readFilesIn ./trimmed.fastq + +#once the above command completes running, +#check the output - there should be 5 output files +# 1. Log.final.out - a summary of mapping statistics for the sample +# 2. Aligned.out.sam - the aligned reads, in SAM format +# 3. Log.out - a running log from STAR, with information about the run +# 4. Log.progress.out - job progress with the number of processed reads, % of mapped reads etc., updated every ~1 minute +# 5. SJ.out.tab - high confidence collapsed splice junctions in tab-delimited format. Only junctions supported by uniquely mapping reads are reported +#for more details on STAR go to: +#https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/STARmanual.pdf +[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls + +#look at the contents of the Log.final.out file to check the mapping statistics for the sample +[alice@dev3 Intro_to_RNA-seq_data_analysis]$ less Log.final.out + +#if the mapping statistics look good then run featureCounts to generate the read count matrices +#if the mapping statistics do not look good then troubleshoot the issues by seeking help online or contacting the Gladstone Bioinformatics Core +#run the featureCounts on the Aligned.out.sam file +[alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif featureCounts \ +-a rDNA.gtf \ +-t CDS \ +-o counts.txt \ +Aligned.out.sam + +#once the above command completes running, +#check the output - there should be 2 output files +# 1. 
counts.txt - the counts matrix with genes as rows and samples as columns. +# there might be additional columns for gene related information (e.g., start position, end position, strand etc.) +# 2. counts.txt.summary - tabulates how many reads were “assigned” or counted and the reason they remained “unassigned” +[alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls + +#look at the contents of the counts.txt.summary file to check the number of reads assigned to a gene +#make sure most of the reads are assigned to a gene, if not then try to troubleshoot by seeking help online or contacting the Gladstone Bioinformatics Core +[alice@dev3 Intro_to_RNA-seq_data_analysis]$ less counts.txt.summary + +#the counts.txt file can be used for further analysis (such as differential gene expression) +#go back to the terminal (MacOS) or command prompt (Windows) window where you are on your local computer +#download the counts.txt file from wynton to the Downloads folder on local computer +{local}$ scp alice@dt2.wynton.ucsf.edu:~/Intro_to_RNA-seq_data_analysis/counts.txt Downloads + + +############## END SESSION 2 ##############