From cda43e36db217ea8978070d6af179f3e812e72e6 Mon Sep 17 00:00:00 2001 From: Ayushi Agrawal <88406934+ayushi-agrawal-gladstone@users.noreply.github.com> Date: Tue, 11 Apr 2023 18:23:25 -0700 Subject: [PATCH] Update scRNAseq.html --- docs/scRNAseq.html | 740 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 736 insertions(+), 4 deletions(-) diff --git a/docs/scRNAseq.html b/docs/scRNAseq.html index 0729bf5..8bf78cf 100644 --- a/docs/scRNAseq.html +++ b/docs/scRNAseq.html @@ -1,6 +1,738 @@ + + -
-##Introduction
+We’ll review the current practices for some of the common steps in +scRNA-seq data analysis in the Sessions 1-2. We’ll discuss different +practices for each step and the assumptions underlying various tools and +their limitations. This document provides an exposure to one of the +popular tools for such analysis. As we’ll discuss, the right choice of +method in any application depends on a number of factors, including the +biological systems under study and the characteristics of the data in +hand. For the purpose of our workshop, we’ll limit the hands-on +component to the Seurat package in R. In general, analysis might require +multiple tools in different languages and/or novel development. For +more, please see the slide deck in materials.
+The following is based on this +vignette from the Seurat developers.
+library(dplyr)
+library(Seurat)
+library(patchwork)
+Please ensure that the directory named “pbmc3k_data” in the workshop +materials is in the same directory as this .Rmd file.
+data <- CreateSeuratObject(counts = Read10X("data"),
+ project = "Hello_scWorld", #Name this whatever.
+ min.cells = 3, # Don't keep genes observed in fewer than 3 cells
+ min.features = 200, # Don't keep cells with fewer than 200 genes
+ names.delim = NULL) # Don't try and parse the sample names
+#Assess percent of mitochondrial counts in each cell
+data[["percent_mt"]] <- PercentageFeatureSet(object = data,
+ pattern = "^mt-")
+
+#Violin plot
+VlnPlot(object = data,
+ features = c("nFeature_RNA",
+ "nCount_RNA",
+ "percent_mt"),
+ ncol = 3)
+#Other plotting otions
+plot1 <- FeatureScatter(object = data,
+ feature1 = "nCount_RNA",
+ feature2 = "percent_mt")
+plot2 <- FeatureScatter(object = data,
+ feature1 = "nCount_RNA",
+ feature2 = "nFeature_RNA")
+plot1 + plot2
+#Subset data
+data <- subset(x = data,
+ subset = nFeature_RNA > 200 &
+ nCount_RNA > 950000 &
+ percent_mt < 20)
+
+VlnPlot(object = data,
+ features = c("nFeature_RNA",
+ "nCount_RNA",
+ "percent_mt"),
+ ncol = 3)
+data <- NormalizeData(object = data,
+ normalization.method = "LogNormalize",
+ scale.factor = 10000)
+data <- FindVariableFeatures( object = data,
+ selection.method = "vst",
+ nfeatures = 2000)
+
+#View the 10 most highly variable genes
+top10 <- head(x = VariableFeatures(object = data), 10)
+print(top10)
+## [1] "Gm10800" "Ccl21a" "Ccl19" "Gm21541" "Glycam1" "Gm2564" "Ccl19.1"
+## [8] "Gm10801" "Ccl20" "Gzma"
+#Seurat allows plotting variable features with and without labels
+plot1 <- VariableFeaturePlot(object = data)
+plot2 <- LabelPoints(plot = plot1,
+ points = top10,
+ repel = TRUE)
+## When using repel, set xnudge and ynudge to 0 for optimal results
+plot1
+## Warning: Transformation introduced infinite values in continuous x-axis
+plot2
+## Warning: Transformation introduced infinite values in continuous x-axis
+#By default Seurat only scales the variable features.
+#Explicit input required to rescale all the genes
+scale_genes <- rownames(data)
+data <- ScaleData(object = data,
+ features = scale_genes)
+## Centering and scaling data matrix
+# Use the highly variable genes to find principal components
+data <- RunPCA(object = data,
+ features = VariableFeatures(object = data),
+ verbose = FALSE)
+
+#Examine and visualize PCA results a few different ways
+print(x = data[["pca"]], dims = 1:5, nfeatures = 5)
+## PC_ 1
+## Positive: Serping1, Col3a1, Col1a2, Col1a1, Serpinf1
+## Negative: Cd74, Tyrobp, H2-Aa, H2-Ab1, Fcer1g
+## PC_ 2
+## Positive: Plvap, Lrg1, Cldn5, Ctla2a, Fabp4
+## Negative: Lyz2, Cd68, Fcer1g, Tyrobp, Lgals3
+## PC_ 3
+## Positive: Nkg7, Ctsw, Icos, Gm26917, Cd27
+## Negative: Plvap, Lrg1, Ms4a6d, Lyz2, Tmem88
+## PC_ 4
+## Positive: Slc45a2, Tyrp1, Car14, Dct, Tspan10
+## Negative: Cfb, Lrg1, Clu, Gm26905, Plvap
+## PC_ 5
+## Positive: Nkg7, Actb, Ctsw, AW112010, Gzmb
+## Negative: Gm11168, Gm26870, Gm10717, Gm21738, Gm10719
+DimPlot(data, reduction = "pca")
+data <- JackStraw(object = data, num.replicate = 100)
+data <- ScoreJackStraw(object = data, dims = 1:20)
+JackStrawPlot(data, dims = 1:15)
+## Warning: Removed 21000 rows containing missing values (`geom_point()`).
+ElbowPlot(data)
+VizDimLoadings(object = data, dims = 1:2, reduction = "pca")
+data <- RunUMAP(data, dims = 1:10)
+## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
+## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
+## This message will be shown once per session
+## 10:18:04 UMAP embedding parameters a = 0.9922 b = 1.112
+## 10:18:04 Read 5438 rows and found 10 numeric columns
+## 10:18:04 Using Annoy for neighbor search, n_neighbors = 30
+## 10:18:04 Building Annoy index with metric = cosine, n_trees = 50
+## 0% 10 20 30 40 50 60 70 80 90 100%
+## [----|----|----|----|----|----|----|----|----|----|
+## **************************************************|
+## 10:18:05 Writing NN index file to temp file /var/folders/1t/y84b3_bs2wq0l8ks9pr15jmw0000gq/T//RtmpEYxVdI/file687b79e58625
+## 10:18:05 Searching Annoy index using 1 thread, search_k = 3000
+## 10:18:06 Annoy recall = 100%
+## 10:18:06 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
+## 10:18:06 Initializing from normalized Laplacian + noise (using irlba)
+## 10:18:07 Commencing optimization for 500 epochs, with 228306 positive edges
+## 10:18:15 Optimization finished
+data <- RunTSNE(data, dims = 1:10)
+
+DimPlot(data, reduction = "umap")
+DimPlot(data, reduction = "tsne")
+data <- FindNeighbors(data, dims = 1:10)
+## Computing nearest neighbor graph
+## Computing SNN
+data <- FindClusters(data, resolution = 0.5)
+## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
+##
+## Number of nodes: 5438
+## Number of edges: 177873
+##
+## Running Louvain algorithm...
+## Maximum modularity in 10 random starts: 0.9331
+## Number of communities: 18
+## Elapsed time: 0 seconds
+DimPlot(data, reduction = "umap", label = TRUE)
+DimPlot(data, reduction = "tsne", label = TRUE)
+DimPlot(data, reduction = "pca", label = TRUE)
+test <- data[, 1:10 ]
+saveRDS(test, file = "hello_scWorld.rds")
+# find all markers of cluster 1
+cluster1.markers <- FindMarkers(data,
+ ident.1 = 1,
+ min.pct = 0.25)
+head(cluster1.markers, n = 5)
+## p_val avg_log2FC pct.1 pct.2 p_val_adj
+## Spi1 0 1.672875 0.914 0.208 0
+## Cybb 0 1.862620 0.857 0.147 0
+## Fcgr1 0 1.924539 0.572 0.034 0
+## Cd68 0 3.266696 0.923 0.191 0
+## Ccl6 0 4.016911 0.819 0.091 0
+# find all markers distinguishing cluster 5 from clusters 0 and 3
+cluster5.markers <- FindMarkers(data,
+ ident.1 = 5,
+ ident.2 = c(0, 3),
+ min.pct = 0.25)
+head(cluster5.markers,
+ n = 5)
+## p_val avg_log2FC pct.1 pct.2 p_val_adj
+## Sparc 0.000000e+00 4.614014 0.927 0.019 0.000000e+00
+## Igfbp7 5.582004e-293 5.709793 0.850 0.021 1.191870e-288
+## Ptrf 3.510770e-272 3.342912 0.774 0.009 7.496196e-268
+## Clu 1.710715e-265 3.763025 0.784 0.016 3.652719e-261
+## Plvap 1.616214e-262 4.448397 0.740 0.004 3.450939e-258
+# find markers for every cluster compared to all remaining cells, report only the positive ones
+data.markers <- FindAllMarkers(data,
+ only.pos = TRUE,
+ min.pct = 0.25,
+ logfc.threshold = 0.25)
+## Calculating cluster 0
+## Calculating cluster 1
+## Calculating cluster 2
+## Calculating cluster 3
+## Calculating cluster 4
+## Calculating cluster 5
+## Calculating cluster 6
+## Calculating cluster 7
+## Calculating cluster 8
+## Calculating cluster 9
+## Calculating cluster 10
+## Calculating cluster 11
+## Calculating cluster 12
+## Calculating cluster 13
+## Calculating cluster 14
+## Calculating cluster 15
+## Calculating cluster 16
+## Calculating cluster 17
+data.markers %>%
+ group_by(., cluster) %>%
+ top_n(., n = 2, wt = avg_log2FC)
+## # A tibble: 36 × 7
+## # Groups: cluster [18]
+## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
+## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
+## 1 0 2.73 0.735 0.139 0 0 Tcf7
+## 2 3.68e-180 2.45 0.907 0.55 7.85e-176 0 Vps37b
+## 3 0 5.82 0.984 0.332 0 1 Lyz2
+## 4 2.35e-202 4.02 0.407 0.049 5.01e-198 1 Chil3
+## 5 5.55e-239 2.82 0.811 0.24 1.19e-234 2 Plbd1
+## 6 1.13e-150 3.19 0.938 0.789 2.42e-146 2 Cst3
+## 7 0 3.76 0.745 0.112 0 3 Cxcr6
+## 8 0 3.40 0.838 0.155 0 3 Icos
+## 9 0 4.46 0.935 0.038 0 4 Gpihbp1
+## 10 0 3.68 0.938 0.104 0 4 Esam
+## # … with 26 more rows
+VlnPlot(data, features = c("Vps37b", "Tcf7"))
+FeaturePlot(data, features = c("Vps37b", "Tcf7"))
+sessionInfo()
+## R version 4.1.2 (2021-11-01)
+## Platform: x86_64-apple-darwin17.0 (64-bit)
+## Running under: macOS Big Sur 10.16
+##
+## Matrix products: default
+## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
+## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
+##
+## locale:
+## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
+##
+## attached base packages:
+## [1] stats graphics grDevices utils datasets methods base
+##
+## other attached packages:
+## [1] patchwork_1.1.2 SeuratObject_4.1.3 Seurat_4.3.0 dplyr_1.0.10
+##
+## loaded via a namespace (and not attached):
+## [1] ggbeeswarm_0.7.1 Rtsne_0.16 colorspace_2.0-3
+## [4] deldir_1.0-6 ellipsis_0.3.2 ggridges_0.5.4
+## [7] rstudioapi_0.14 spatstat.data_3.0-0 farver_2.1.1
+## [10] leiden_0.4.3 listenv_0.9.0 ggrepel_0.9.2
+## [13] fansi_1.0.3 codetools_0.2-18 splines_4.1.2
+## [16] cachem_1.0.6 knitr_1.41 polyclip_1.10-4
+## [19] jsonlite_1.8.4 ica_1.0-3 cluster_2.1.4
+## [22] png_0.1-8 uwot_0.1.14 shiny_1.7.4
+## [25] sctransform_0.3.5 spatstat.sparse_3.0-0 compiler_4.1.2
+## [28] httr_1.4.4 assertthat_0.2.1 Matrix_1.5-1
+## [31] fastmap_1.1.0 lazyeval_0.2.2 limma_3.50.3
+## [34] cli_3.5.0 later_1.3.0 htmltools_0.5.4
+## [37] tools_4.1.2 igraph_1.3.5 gtable_0.3.1
+## [40] glue_1.6.2 RANN_2.6.1 reshape2_1.4.4
+## [43] Rcpp_1.0.9 scattermore_0.8 jquerylib_0.1.4
+## [46] vctrs_0.5.1 nlme_3.1-161 spatstat.explore_3.0-5
+## [49] progressr_0.12.0 lmtest_0.9-40 spatstat.random_3.0-1
+## [52] xfun_0.35 stringr_1.5.0 globals_0.16.2
+## [55] mime_0.12 miniUI_0.1.1.1 lifecycle_1.0.3
+## [58] irlba_2.3.5.1 goftest_1.2-3 future_1.30.0
+## [61] MASS_7.3-58.1 zoo_1.8-11 scales_1.2.1
+## [64] promises_1.2.0.1 spatstat.utils_3.0-1 parallel_4.1.2
+## [67] RColorBrewer_1.1-3 yaml_2.3.6 reticulate_1.26
+## [70] pbapply_1.6-0 gridExtra_2.3 ggrastr_1.0.1
+## [73] ggplot2_3.4.0 sass_0.4.4 stringi_1.7.8
+## [76] highr_0.9 rlang_1.0.6 pkgconfig_2.0.3
+## [79] matrixStats_0.63.0 evaluate_0.19 lattice_0.20-45
+## [82] ROCR_1.0-11 purrr_1.0.0 tensor_1.5
+## [85] labeling_0.4.2 htmlwidgets_1.6.0 cowplot_1.1.1
+## [88] tidyselect_1.2.0 parallelly_1.33.0 RcppAnnoy_0.0.20
+## [91] plyr_1.8.8 magrittr_2.0.3 R6_2.5.1
+## [94] generics_0.1.3 DBI_1.1.3 withr_2.5.0
+## [97] pillar_1.8.1 fitdistrplus_1.1-8 survival_3.4-0
+## [100] abind_1.4-5 sp_1.5-1 tibble_3.1.8
+## [103] future.apply_1.10.0 crayon_1.5.2 KernSmooth_2.23-20
+## [106] utf8_1.2.2 spatstat.geom_3.0-3 plotly_4.10.1
+## [109] rmarkdown_2.19 grid_4.1.2 data.table_1.14.6
+## [112] digest_0.6.31 xtable_1.8-4 tidyr_1.2.1
+## [115] httpuv_1.6.7 munsell_0.5.0 beeswarm_0.4.0
+## [118] viridisLite_0.4.1 vipor_0.4.5 bslib_0.4.2
+