diff --git a/single-cell-analysis/Sessions_1-2/Sessions_1-2.pptx b/single-cell-analysis/Sessions_1-2/Sessions_1-2.pptx index 4491af7..48d7818 100644 Binary files a/single-cell-analysis/Sessions_1-2/Sessions_1-2.pptx and b/single-cell-analysis/Sessions_1-2/Sessions_1-2.pptx differ diff --git a/single-cell-analysis/Sessions_1-2/hands_on_component.Rmd b/single-cell-analysis/Sessions_1-2/hands_on_component.Rmd deleted file mode 100644 index 5e932c2..0000000 --- a/single-cell-analysis/Sessions_1-2/hands_on_component.Rmd +++ /dev/null @@ -1,198 +0,0 @@ ---- -title: 'Hands-on component of Single-cell RNA-seq workshop: Sessions 1-2' -author: "Krishna Choudhary" -date: "3/27/2021" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -##Introduction - -We'll review the current practices for some of the common steps in scRNA-seq data analysis in the Sessions 1-2. We'll discuss different practices for each step and the assumptions underlying various tools and their limitations. This document provides an exposure to one of the popular tools for such analysis. As we'll discuss, the right choice of method in any application depends on a number of factors, including the biological systems under study and the characteristics of the data in hand. For the purpose of our workshop, we'll limit the hands-on component to the Seurat package in R. In general, analysis might require multiple tools in different languages and/or novel development. For more, please see the slide deck in materials. - -The following is based on [this vignette](https://satijalab.org/seurat/articles/pbmc3k_tutorial.html) from the Seurat developers. - -## Setup the working environment - -```{r message=FALSE, warning=FALSE} -library(dplyr) -library(Seurat) -library(patchwork) -``` - -## Load the data - -Please ensure that the directory named "pbmc3k_data" in the workshop materials is in the same directory as this .Rmd file. - -```{r} -data <- CreateSeuratObject(counts = Read10X("pbmc3k_data"), - project = "Hello_scWorld", #Name this whatever. - min.cells = 3, # Don't keep genes observed in fewer than 3 cells - min.features = 200) # Don't keep cells with fewer than 200 genes -``` - -## Filter poor quality or uninteresting cells - -```{r} -#Assess percent of mitochondrial counts in each cell -data[["percent_mt"]] <- PercentageFeatureSet(object = data, - pattern = "^MT-") - -#Violin plot -VlnPlot(object = data, - features = c("nFeature_RNA", - "nCount_RNA", - "percent_mt"), - ncol = 3) - -#Other plotting otions -plot1 <- FeatureScatter(object = data, - feature1 = "nCount_RNA", - feature2 = "percent_mt") -plot2 <- FeatureScatter(object = data, - feature1 = "nCount_RNA", - feature2 = "nFeature_RNA") -plot1 + plot2 - -#Subset data -data <- subset(x = data, - subset = nFeature_RNA > 200 & - nFeature_RNA < 2500 & - percent_mt < 5) - -VlnPlot(object = data, - features = c("nFeature_RNA", - "nCount_RNA", - "percent_mt"), - ncol = 3) - -``` - -## Normalization - -```{r} -data <- NormalizeData(object = data, - normalization.method = "LogNormalize", - scale.factor = 10000) - -``` - -## Feature selection - -```{r} -data <- FindVariableFeatures( object = data, - selection.method = "vst", - nfeatures = 2000) - -#View the 10 most highly variable genes -top10 <- head(x = VariableFeatures(object = data), 10) -print(top10) - -#Seurat allows plotting variable features with and without labels -plot1 <- VariableFeaturePlot(object = data) -plot2 <- LabelPoints(plot = plot1, - points = top10, - repel = TRUE) -plot1 -plot2 - -``` - -## Dimensionality reduction - -### Linear dimensionality reduction -```{r} -#By default Seurat only scales the variable features. -#Explicit input required to rescale all the genes -scale_genes <- rownames(data) -data <- ScaleData(object = data, - features = scale_genes) - -# Use the highly variable genes to find principal components -data <- RunPCA(object = data, - features = VariableFeatures(object = data), - verbose = FALSE) - -#Examine and visualize PCA results a few different ways -print(x = data[["pca"]], dims = 1:5, nfeatures = 5) - -DimPlot(data, reduction = "pca") - -data <- JackStraw(object = data, num.replicate = 100) -data <- ScoreJackStraw(object = data, dims = 1:20) -JackStrawPlot(data, dims = 1:15) - -ElbowPlot(data) - -VizDimLoadings(object = data, dims = 1:2, reduction = "pca") - -``` - -### Nonlinear dimensionality reduction -```{r} -data <- RunUMAP(data, dims = 1:10) -data <- RunTSNE(data, dims = 1:10) - -DimPlot(data, reduction = "umap") -DimPlot(data, reduction = "tsne") -``` - -## Clustering -```{r} -data <- FindNeighbors(data, dims = 1:10) -data <- FindClusters(data, resolution = 0.5) - -DimPlot(data, reduction = "umap", label = TRUE) -DimPlot(data, reduction = "tsne", label = TRUE) -DimPlot(data, reduction = "pca", label = TRUE) -``` - -### Save the Seurat object -```{r} -test <- data[, 1:10 ] -saveRDS(test, file = "hello_scWorld.rds") -``` - -## Find markers -```{r} -# find all markers of cluster 1 -cluster1.markers <- FindMarkers(data, - ident.1 = 1, - min.pct = 0.25) -head(cluster1.markers, n = 5) - -# find all markers distinguishing cluster 5 from clusters 0 and 3 -cluster5.markers <- FindMarkers(data, - ident.1 = 5, - ident.2 = c(0, 3), - min.pct = 0.25) -head(cluster5.markers, - n = 5) - -# find markers for every cluster compared to all remaining cells, report only the positive ones -data.markers <- FindAllMarkers(data, - only.pos = TRUE, - min.pct = 0.25, - logfc.threshold = 0.25) -data.markers %>% - group_by(., cluster) %>% - top_n(., n = 2, wt = avg_log2FC) - -``` - -## Additional visualizations - -```{r} -VlnPlot(data, features = c("MS4A1", "CD79A")) - -FeaturePlot(data, features = c("MS4A1", "CD79A")) -``` - -```{r} -sessionInfo() -``` - -## THE END. diff --git a/single-cell-analysis/Sessions_1-2/hands_on_component.html b/single-cell-analysis/Sessions_1-2/hands_on_component.html deleted file mode 100644 index 0c7aeaa..0000000 --- a/single-cell-analysis/Sessions_1-2/hands_on_component.html +++ /dev/null @@ -1,565 +0,0 @@ - - - - - - - - - - - - - - -Hands-on component of Single-cell RNA-seq workshop: Sessions 1-2 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Introduction

-

We'll review the current practices for some of the common steps in scRNA-seq data analysis in the Sessions 1-2. We'll discuss different practices for each step and the assumptions underlying various tools and their limitations. This document provides an exposure to one of the popular tools for such analysis. As we'll discuss, the right choice of method in any application depends on a number of factors, including the biological systems under study and the characteristics of the data in hand. For the purpose of our workshop, we'll limit the hands-on component to the Seurat package in R. In general, analysis might require multiple tools in different languages and/or novel development. For more, please see the slide deck in materials.

-

The following is based on this vignette from the Seurat developers.

-
-
-

Setup the working environment

-
library(dplyr)
-library(Seurat)
-library(patchwork)
-
-
-

Load the data

-

Please ensure that the directory named "pbmc3k_data" in the workshop materials is in the same directory as this .Rmd file.

-
data <- CreateSeuratObject(counts = Read10X("pbmc3k_data"),
-                           project = "Hello_scWorld",  #Name this whatever.
-                           min.cells = 3,  # Don't keep genes observed in fewer than 3 cells
-                           min.features = 200)  # Don't keep cells with fewer than 200 genes
-
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
-## ('-')
-
-
-

Filter poor quality or uninteresting cells

-
#Assess percent of mitochondrial counts in each cell 
-data[["percent_mt"]] <- PercentageFeatureSet(object = data, 
-                                             pattern = "^MT-")
-
-#Violin plot
-VlnPlot(object = data, 
-        features = c("nFeature_RNA", 
-                   "nCount_RNA", 
-                   "percent_mt"), 
-        ncol = 3)
-

-
#Other plotting otions
-plot1 <- FeatureScatter(object = data, 
-                        feature1 = "nCount_RNA", 
-                        feature2 = "percent_mt")
-plot2 <- FeatureScatter(object = data,
-                        feature1 = "nCount_RNA", 
-                        feature2 = "nFeature_RNA")
-plot1 + plot2
-

-
#Subset data
-data <- subset(x = data, 
-               subset =  nFeature_RNA > 200 & 
-                 nFeature_RNA < 2500 & 
-                 percent_mt < 5)
-
-VlnPlot(object = data, 
-        features = c("nFeature_RNA",
-                   "nCount_RNA",
-                   "percent_mt"),
-        ncol = 3)
-

-
-
-

Normalization

-
data <- NormalizeData(object = data, 
-                      normalization.method = "LogNormalize", 
-                      scale.factor = 10000)
-
-
-

Feature selection

-
data <- FindVariableFeatures( object = data, 
-                              selection.method = "vst", 
-                              nfeatures = 2000)
-
-#View the 10 most highly variable genes
-top10 <- head(x = VariableFeatures(object = data), 10)
-print(top10)
-
##  [1] "PPBP"   "LYZ"    "S100A9" "IGLL5"  "GNLY"   "FTL"    "PF4"    "FTH1"  
-##  [9] "GNG11"  "S100A8"
-
#Seurat allows plotting variable features with and without labels
-plot1 <- VariableFeaturePlot(object = data)
-plot2 <- LabelPoints(plot = plot1, 
-                     points = top10, 
-                     repel = TRUE)
-
## When using repel, set xnudge and ynudge to 0 for optimal results
-
plot1 
-
## Warning: Transformation introduced infinite values in continuous x-axis
-

-
plot2
-
## Warning: Transformation introduced infinite values in continuous x-axis
-

-
-
-

Dimensionality reduction

-
-

Linear dimensionality reduction

-
#By default Seurat only scales the variable features.
-#Explicit input required to rescale all the genes
-scale_genes <- rownames(data)
-data <- ScaleData(object = data, 
-                  features = scale_genes)
-
## Centering and scaling data matrix
-
# Use the highly variable genes to find principal components
-data <- RunPCA(object = data,
-               features = VariableFeatures(object = data),  
-               verbose = FALSE)  
-
-#Examine and visualize PCA results a few different ways
-print(x = data[["pca"]], dims = 1:5, nfeatures = 5)
-
## PC_ 1 
-## Positive:  CST3, TYROBP, LST1, AIF1, FTL 
-## Negative:  MALAT1, LTB, IL32, IL7R, CD2 
-## PC_ 2 
-## Positive:  CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1 
-## Negative:  NKG7, PRF1, CST7, GZMB, GZMA 
-## PC_ 3 
-## Positive:  HLA-DQA1, CD79A, CD79B, HLA-DQB1, HLA-DPB1 
-## Negative:  PPBP, PF4, SDPR, SPARC, GNG11 
-## PC_ 4 
-## Positive:  HLA-DQA1, CD79B, CD79A, MS4A1, HLA-DQB1 
-## Negative:  VIM, IL7R, S100A6, IL32, S100A8 
-## PC_ 5 
-## Positive:  GZMB, NKG7, S100A8, FGFBP2, GNLY 
-## Negative:  LTB, IL7R, CKB, VIM, MS4A7
-
DimPlot(data, reduction = "pca")
-

-
data <- JackStraw(object = data, num.replicate = 100)
-data <- ScoreJackStraw(object = data, dims = 1:20)
-JackStrawPlot(data, dims = 1:15)
-
## Warning: Removed 23496 rows containing missing values (geom_point).
-

-
ElbowPlot(data)
-

-
VizDimLoadings(object = data, dims = 1:2, reduction = "pca")
-

-
-
-

Nonlinear dimensionality reduction

-
data <- RunUMAP(data, dims = 1:10)
-
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
-## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
-## This message will be shown once per session
-
## 05:36:09 UMAP embedding parameters a = 0.9922 b = 1.112
-
## 05:36:09 Read 2638 rows and found 10 numeric columns
-
## 05:36:09 Using Annoy for neighbor search, n_neighbors = 30
-
## 05:36:09 Building Annoy index with metric = cosine, n_trees = 50
-
## 0%   10   20   30   40   50   60   70   80   90   100%
-
## [----|----|----|----|----|----|----|----|----|----|
-
## **************************************************|
-## 05:36:09 Writing NN index file to temp file /var/folders/7z/my8thnc51jv128q_9jfpwxb80000gp/T//RtmpiC4ILv/file641570ebcbaa
-## 05:36:09 Searching Annoy index using 1 thread, search_k = 3000
-## 05:36:10 Annoy recall = 100%
-## 05:36:10 Commencing smooth kNN distance calibration using 1 thread
-## 05:36:11 Initializing from normalized Laplacian + noise
-## 05:36:11 Commencing optimization for 500 epochs, with 105124 positive edges
-## 05:36:14 Optimization finished
-
data <- RunTSNE(data, dims = 1:10)
-
-DimPlot(data, reduction = "umap")
-

-
DimPlot(data, reduction = "tsne")
-

-
-
-
-

Clustering

-
data <- FindNeighbors(data, dims = 1:10)
-
## Computing nearest neighbor graph
-
## Computing SNN
-
data <- FindClusters(data, resolution = 0.5)
-
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
-## 
-## Number of nodes: 2638
-## Number of edges: 95965
-## 
-## Running Louvain algorithm...
-## Maximum modularity in 10 random starts: 0.8723
-## Number of communities: 9
-## Elapsed time: 0 seconds
-
DimPlot(data, reduction = "umap", label = TRUE) 
-

-
DimPlot(data, reduction = "tsne", label = TRUE)
-

-
DimPlot(data, reduction = "pca", label = TRUE)
-

-
-

Save the Seurat object

-
test <- data[, 1:10 ]
-saveRDS(test, file = "hello_scWorld.rds")
-
-
-
-

Find markers

-
# find all markers of cluster 1
-cluster1.markers <- FindMarkers(data, 
-                                ident.1 = 1, 
-                                min.pct = 0.25)
-head(cluster1.markers, n = 5)
-
##                p_val avg_log2FC pct.1 pct.2     p_val_adj
-## S100A9  0.000000e+00   5.570063 0.996 0.215  0.000000e+00
-## S100A8  0.000000e+00   5.477394 0.975 0.121  0.000000e+00
-## FCN1    0.000000e+00   3.394219 0.952 0.151  0.000000e+00
-## LGALS2  0.000000e+00   3.800484 0.908 0.059  0.000000e+00
-## CD14   2.856582e-294   2.815626 0.667 0.028 3.917516e-290
-
# find all markers distinguishing cluster 5 from clusters 0 and 3
-cluster5.markers <- FindMarkers(data, 
-                                ident.1 = 5, 
-                                ident.2 = c(0, 3), 
-                                min.pct = 0.25)
-head(cluster5.markers,
-     n = 5)
-
##                       p_val avg_log2FC pct.1 pct.2     p_val_adj
-## FCGR3A        2.150929e-209   4.267579 0.975 0.039 2.949784e-205
-## IFITM3        6.103366e-199   3.877105 0.975 0.048 8.370156e-195
-## CFD           8.891428e-198   3.411039 0.938 0.037 1.219370e-193
-## CD68          2.374425e-194   3.014535 0.926 0.035 3.256286e-190
-## RP11-290F20.3 9.308287e-191   2.722684 0.840 0.016 1.276538e-186
-
# find markers for every cluster compared to all remaining cells, report only the positive ones
-data.markers <- FindAllMarkers(data, 
-                               only.pos = TRUE, 
-                               min.pct = 0.25, 
-                               logfc.threshold = 0.25)
-
## Calculating cluster 0
-
## Calculating cluster 1
-
## Calculating cluster 2
-
## Calculating cluster 3
-
## Calculating cluster 4
-
## Calculating cluster 5
-
## Calculating cluster 6
-
## Calculating cluster 7
-
## Calculating cluster 8
-
data.markers %>%
-  group_by(., cluster) %>% 
-  top_n(., n = 2, wt = avg_log2FC)
-
## Registered S3 method overwritten by 'cli':
-##   method     from         
-##   print.boxx spatstat.geom
-
## # A tibble: 18 x 7
-## # Groups:   cluster [9]
-##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene    
-##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>   
-##  1 1.74e-109       1.07 0.897 0.593 2.39e-105 0       LDHB    
-##  2 1.17e- 83       1.33 0.435 0.108 1.60e- 79 0       CCR7    
-##  3 0.              5.57 0.996 0.215 0.        1       S100A9  
-##  4 0.              5.48 0.975 0.121 0.        1       S100A8  
-##  5 7.99e- 87       1.28 0.981 0.644 1.10e- 82 2       LTB     
-##  6 2.61e- 59       1.24 0.424 0.111 3.58e- 55 2       AQP3    
-##  7 0.              4.31 0.936 0.041 0.        3       CD79A   
-##  8 9.48e-271       3.59 0.622 0.022 1.30e-266 3       TCL1A   
-##  9 1.17e-178       2.97 0.957 0.241 1.60e-174 4       CCL5    
-## 10 4.93e-169       3.01 0.595 0.056 6.76e-165 4       GZMK    
-## 11 3.51e-184       3.31 0.975 0.134 4.82e-180 5       FCGR3A  
-## 12 2.03e-125       3.09 1     0.315 2.78e-121 5       LST1    
-## 13 1.05e-265       4.89 0.986 0.071 1.44e-261 6       GZMB    
-## 14 6.82e-175       4.92 0.958 0.135 9.36e-171 6       GNLY    
-## 15 1.48e-220       3.87 0.812 0.011 2.03e-216 7       FCER1A  
-## 16 1.67e- 21       2.87 1     0.513 2.28e- 17 7       HLA-DPB1
-## 17 7.73e-200       7.24 1     0.01  1.06e-195 8       PF4     
-## 18 3.68e-110       8.58 1     0.024 5.05e-106 8       PPBP
-
-
-

Additional visualizations

-
VlnPlot(data, features = c("MS4A1", "CD79A"))
-

-
FeaturePlot(data, features = c("MS4A1", "CD79A"))
-

-
sessionInfo()
-
## R version 4.0.2 (2020-06-22)
-## Platform: x86_64-apple-darwin17.0 (64-bit)
-## Running under: macOS Mojave 10.14.6
-## 
-## Matrix products: default
-## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
-## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
-## 
-## locale:
-## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
-## 
-## attached base packages:
-## [1] stats     graphics  grDevices utils     datasets  methods   base     
-## 
-## other attached packages:
-## [1] patchwork_1.1.0    SeuratObject_4.0.0 Seurat_4.0.1       dplyr_1.0.2       
-## 
-## loaded via a namespace (and not attached):
-##   [1] Rtsne_0.15            colorspace_2.0-0      deldir_0.2-3         
-##   [4] ellipsis_0.3.1        ggridges_0.5.2        spatstat.data_2.1-0  
-##   [7] leiden_0.3.5          listenv_0.8.0         farver_2.0.3         
-##  [10] ggrepel_0.8.2         RSpectra_0.16-0       fansi_0.4.1          
-##  [13] codetools_0.2-18      splines_4.0.2         knitr_1.30           
-##  [16] polyclip_1.10-0       jsonlite_1.7.1        ica_1.0-2            
-##  [19] cluster_2.1.0         png_0.1-7             uwot_0.1.10          
-##  [22] shiny_1.5.0           sctransform_0.3.2     spatstat.sparse_2.0-0
-##  [25] compiler_4.0.2        httr_1.4.2            assertthat_0.2.1     
-##  [28] Matrix_1.2-18         fastmap_1.0.1         lazyeval_0.2.2       
-##  [31] limma_3.46.0          cli_2.1.0             later_1.1.0.1        
-##  [34] htmltools_0.5.0       tools_4.0.2           igraph_1.2.6         
-##  [37] gtable_0.3.0          glue_1.4.2            RANN_2.6.1           
-##  [40] reshape2_1.4.4        Rcpp_1.0.5            scattermore_0.7      
-##  [43] vctrs_0.3.4           nlme_3.1-150          lmtest_0.9-38        
-##  [46] xfun_0.19             stringr_1.4.0         globals_0.13.1       
-##  [49] mime_0.9              miniUI_0.1.1.1        lifecycle_0.2.0      
-##  [52] irlba_2.3.3           goftest_1.2-2         future_1.20.1        
-##  [55] MASS_7.3-53           zoo_1.8-8             scales_1.1.1         
-##  [58] spatstat.core_2.0-0   promises_1.1.1        spatstat.utils_2.1-0 
-##  [61] parallel_4.0.2        RColorBrewer_1.1-2    yaml_2.2.1           
-##  [64] reticulate_1.18       pbapply_1.4-3         gridExtra_2.3        
-##  [67] ggplot2_3.3.2         rpart_4.1-15          stringi_1.5.3        
-##  [70] rlang_0.4.8           pkgconfig_2.0.3       matrixStats_0.57.0   
-##  [73] evaluate_0.14         lattice_0.20-41       ROCR_1.0-11          
-##  [76] purrr_0.3.4           tensor_1.5            htmlwidgets_1.5.2    
-##  [79] labeling_0.4.2        cowplot_1.1.0         tidyselect_1.1.0     
-##  [82] parallelly_1.21.0     RcppAnnoy_0.0.18      plyr_1.8.6           
-##  [85] magrittr_1.5          R6_2.5.0              generics_0.1.0       
-##  [88] DBI_1.1.0             pillar_1.4.6          withr_2.3.0          
-##  [91] mgcv_1.8-33           fitdistrplus_1.1-1    survival_3.2-7       
-##  [94] abind_1.4-5           tibble_3.0.4          future.apply_1.6.0   
-##  [97] crayon_1.3.4          KernSmooth_2.23-18    utf8_1.1.4           
-## [100] spatstat.geom_2.0-1   plotly_4.9.2.1        rmarkdown_2.5        
-## [103] grid_4.0.2            data.table_1.13.2     digest_0.6.27        
-## [106] xtable_1.8-4          tidyr_1.1.2           httpuv_1.5.4         
-## [109] munsell_0.5.0         viridisLite_0.3.0
-
-
-

THE END.

-
- - - - -
- - - - - - - - - - - - - - - diff --git a/single-cell-analysis/Sessions_1-2/hello_scWorld.rds b/single-cell-analysis/Sessions_1-2/hello_scWorld.rds deleted file mode 100644 index 4f39061..0000000 Binary files a/single-cell-analysis/Sessions_1-2/hello_scWorld.rds and /dev/null differ diff --git a/single-cell-analysis/Sessions_1-2/link_to_download.txt b/single-cell-analysis/Sessions_1-2/link_to_download.txt new file mode 100644 index 0000000..5136e9a --- /dev/null +++ b/single-cell-analysis/Sessions_1-2/link_to_download.txt @@ -0,0 +1,3 @@ +Download zip folder containing workshop code and data: + +https://www.dropbox.com/sh/175caaess2l307v/AADBb3chB1tFedBuz3qE4i_3a?dl=0 \ No newline at end of file diff --git a/single-cell-analysis/Sessions_1-2/pbmc3k_filtered_gene_bc_matrices.tar.gz b/single-cell-analysis/Sessions_1-2/pbmc3k_filtered_gene_bc_matrices.tar.gz deleted file mode 100644 index a28c842..0000000 Binary files a/single-cell-analysis/Sessions_1-2/pbmc3k_filtered_gene_bc_matrices.tar.gz and /dev/null differ