#Commands to run on wynton in session 2 of the Intro to RNA-seq data analysis workshop #log in to the wynton cluster {local}$ ssh alice@log2.wynton.ucsf.edu #enter your wynton password when prompted and hit enter #once you are logged in to wynton, #log in to the development node [alice@log2 ~]$ ssh dev3 #go to the Intro_to_RNA-seq_data_analysis folder that we uploaded in session 1 [alice@dev3 ~]$ cd Intro_to_RNA-seq_data_analysis/ #list the contents of the Intro_to_RNA-seq_data_analysis folder #the singularity container "rna_seq_container.sif" that we created in session 1 should appear in the result [alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls #run fastqc on the Bacteria_GATTACA_L001_R1_001.fastq file using the singularity container [alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif fastqc Bacteria_GATTACA_L001_R1_001.fastq #once the above command completes running, #check the output - there should be 2 output files # 1. Bacteria_GATTACA_L001_R1_001_fastqc.html # 2. 
Bacteria_GATTACA_L001_R1_001_fastqc.zip #singularity container is just another way of running the same tool #so the output files should match the ones we got when we ran fastqc without singularity container in session 1 [alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls #download the fastqc html result file from wynton to the Downloads folder on local computer #open a new terminal (MacOS) or command prompt (Windows) window {local}$ scp alice@dt2.wynton.ucsf.edu:~/Intro_to_RNA-seq_data_analysis/Bacteria_GATTACA_L001_R1_001_fastqc.html Downloads #once the above command completes running, open the downloaded Bacteria_GATTACA_L001_R1_001_fastqc.html file #inspect the fastqc html file for different QC parameters #you will notice adapter content reported in the fastqc html file #so now, we will trim the reads in the fastq file to remove these adapters using cutadapt #go back to the terminal (MacOS) or command prompt (Windows) window where you are logged in to wynton #run cutadapt to trim the adapters from the Bacteria_GATTACA_L001_R1_001.fastq file using the singularity container [alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif cutadapt \ -a file:Adapter_Sequence.fasta \ -o trimmed.fastq \ --minimum-length 20 \ --quality-cutoff 20 \ Bacteria_GATTACA_L001_R1_001.fastq #once the above command completes running, #check the output - there should be a trimmed.fastq output file #the trimmed.fastq file is the trimmed fastq file generated by cutadapt after removing the adapter content [alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls #run fastqc on the trimmed.fastq file using the singularity container to check if the adapter content is gone [alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif fastqc trimmed.fastq #once the above command completes running, #check the output - there should be 2 output files # 1. trimmed_fastqc.html # 2. 
trimmed_fastqc.zip [alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls #go back to the terminal (MacOS) or command prompt (Windows) window where you are on your local computer #download the fastqc html result file from wynton to the Downloads folder on local computer {local}$ scp alice@dt2.wynton.ucsf.edu:~/Intro_to_RNA-seq_data_analysis/trimmed_fastqc.html Downloads #open and inspect the trimmed_fastqc.html file - the adapter content should be gone #if all looks good in the html file, then run STAR to align the trimmed reads to the reference genome #Before we can map reads to the reference genome using STAR, #we need to index it. This will generate a transformed version of the #genome that allows STAR to efficiently map sequences to it. #go back to the terminal (MacOS) or command prompt (Windows) window where you are logged in to wynton #run the below command to make a folder to store the star index files [alice@dev3 Intro_to_RNA-seq_data_analysis]$ mkdir star_index #run the below command to generate the star index [alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif STAR \ --runMode genomeGenerate \ --genomeDir ./star_index \ --genomeFastaFiles rDNA_sequence.fasta \ --genomeSAindexNbases 3 #once the above command completes running, #check the contents of the star_index folder - there should be 8 output files # 1. chrLength.txt # 2. chrNameLength.txt # 3. chrName.txt # 4. chrStart.txt # 5. Genome # 6. genomeParameters.txt # 7. SA # 8. 
SAindex #for more details on STAR go to: #https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/STARmanual.pdf [alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls star_index/ #run the star alignment on the trimmed.fastq file using the star index generated in the previous commands [alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif STAR \ --genomeDir ./star_index \ --readFilesIn ./trimmed.fastq #once the above command completes running, #check the output - there should be 5 output files # 1. Log.final.out - a summary of mapping statistics for the sample # 2. Aligned.out.sam - the aligned reads, in SAM format # 3. Log.out - a running log from STAR, with information about the run # 4. Log.progress.out - job progress with the number of processed reads, % of mapped reads etc., updated every ~1 minute # 5. SJ.out.tab - high confidence collapsed splice junctions in tab-delimited format. Only junctions supported by uniquely mapping reads are reported #for more details on STAR go to: #https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/STARmanual.pdf [alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls #look at the contents of the Log.final.out file to check the mapping statistics for the sample [alice@dev3 Intro_to_RNA-seq_data_analysis]$ less Log.final.out #if the mapping statistics look good then run featureCounts to generate the read count matrices #if the mapping statistics do not look good then troubleshoot the issues by seeking help online or contacting the Gladstone Bioinformatics Core #run the featureCounts on the Aligned.out.sam file [alice@dev3 Intro_to_RNA-seq_data_analysis]$ singularity exec rna_seq_container.sif featureCounts \ -a rDNA.gtf \ -t CDS \ -o counts.txt \ Aligned.out.sam #once the above command completes running, #check the output - there should be 2 output files # 1. counts.txt - the counts matrix with genes as rows and samples as columns. 
# there might be additional columns for gene related information (e.g., start position, end position, strand etc.) # 2. counts.txt.summary - tabulates how many reads were “assigned” or counted and the reason they remained “unassigned” [alice@dev3 Intro_to_RNA-seq_data_analysis]$ ls #look at the contents of the counts.txt.summary file to check the number of reads assigned to a gene #make sure most of the reads are assigned to a gene, if not then try to troubleshoot by seeking help online or contacting the Gladstone Bioinformatics Core [alice@dev3 Intro_to_RNA-seq_data_analysis]$ less counts.txt.summary #the counts.txt file can be used for further analysis (such as differential gene expression) #go back to the terminal (MacOS) or command prompt (Windows) window where you are on your local computer #download the counts.txt file from wynton to the Downloads folder on local computer {local}$ scp alice@dt2.wynton.ucsf.edu:~/Intro_to_RNA-seq_data_analysis/counts.txt Downloads ############## END SESSION 2 ##############