closes #20

2025-11-30 09:45:43 -08:00 · 2024-05-18 08:18:12 -07:00 · 2024-05-18 08:18:12 -07:00 · fd2fc5b190
commit fd2fc5b190
parent 094d41cd0d
6 changed files with 1040 additions and 1502 deletions
--- a/intro-r-data-analysis/Intro_to_R_data_analysis_part_1.Rmd
+++ b/intro-r-data-analysis/Intro_to_R_data_analysis_part_1.Rmd
@ -1,7 +1,8 @@
 ---
-title: "Introduction to R Data Analysis - Part 1"
+title: "Introduction to R Data Analysis"
+subtitle: "Part 1"
 author: "Natalie Elphick"
-date: "January 22nd, 2024"
+date: "May 20th, 2024"
 knit: (function(input, ...) {
    rmarkdown::render(
      input,
@ -16,6 +17,7 @@ output:

 ```{r, setup, include=FALSE}
 library(tidyverse)
+knitr::opts_chunk$set(comment = "")
 ```

 ## 
@ -25,10 +27,10 @@ library(tidyverse)
 ## Introductions

 **Natalie Elphick**    
-Bioinformatician I   
+Bioinformatician I  

-**Michela Traglia (TA)**     
-Senior Statistician    
+**Yihang Xin (Online TA)**    
+Software Engineer III 


 ## Poll 1
@ -36,7 +38,7 @@ Senior Statistician
 **What is your level of experience with coding/data analysis?**

 1.  I know another data analysis programming language (Python, Matlab etc.)
-2.  I can use Excel to do linear regression
+2.  I can use Excel
 3.  I know some R
 4.  All of the above
 5.  None of the above
@ -50,8 +52,8 @@ Senior Statistician
 1.  What is R and why should you use it?
 2.  The RStudio interface
 3.  File types
-4.  Error messages
-5.  Variables
+4.  Variables
+5.  Error and warning messages
 6.  Types & data structures
 7.  Math and logic operations
 8.  Functions and packages
@ -86,9 +88,9 @@ functionality
 # RStudio

 ## RStudio
- RStudio is an integrated development
+-   RStudio is an integrated development
 environment (IDE)  
- It makes R code easier to write by providing a
+-   It is an app that makes R code easier to write by providing a
 feature rich graphical user interface (GUI)

 <br>
@ -131,8 +133,6 @@ feature rich graphical user interface (GUI)
 ## Variable definition
 -    Variables store information that is referenced and manipulated
 in a computer program
-    In contrast to the mathematical definition of a variable,
-variables in computer science are _mutable_
 -    There are 3 ways to define variables in R, but one is preferred:
 ```{r}
 x <- 1  # Preferred way
@ -141,7 +141,59 @@ x = 1
 print(x)
 ```

-## Variable naming
+## Example
+-   Run the following in the R console:
+
+```{r}
+x <- 1  
+y <- 4
+z <- y
+x + y + z
+```
+
+# Error and Warning Messages
+
+## Errors
+-   **Errors**: Stop the execution of your code and must be fixed for the code to run successfully
+
+```{r, eval=FALSE}
+x <- 5
+y <- 10
+z <- x + a
+```
+
+```{r,echo=FALSE}
+message("Error: object 'a' not found")
+```
+
+
+## Common Errors
+
+- **Syntax Error:** Invalid R code syntax (e.g. misplaced parentheses)
+```{r,echo=FALSE}
+message('Error: unexpected ")"')
+```
+
+- **Object not found:** This variable is not defined (e.g. misspelled variables)
+
+```{r,echo=FALSE}
+message('Error: object "a" not found')
+```
+
+
+See this [article](https://statsandr.com/blog/top-10-errors-in-r/) for more common errors and how to fix them.
+
+## Warnings
+-   Do not stop the execution but indicate potential issues that you should be aware of and might need to address
+
+```{r}
+a <- c(1, 2, 3, 4, 5)
+b <- c(6, 7, 8, 9)
+result <- a + b
+```
+
+
+## Variable Naming

 -   Variables names must start with a letter and can contain
 underscores and periods
@ -176,7 +228,7 @@ DogBreeds <- c("Labrador Retriever", "Akita", "Bulldog")

 ## Data Types
 - Integer
-    - Whole numbers (in R denote with L ex. 1L,2L)
+    - Whole numbers (in R denoted with L, e.g. 1L, 2L)
 - Numeric
    - Decimal numbers
 - Logical
@ -191,7 +243,7 @@ DogBreeds <- c("Labrador Retriever", "Akita", "Bulldog")
 **Which of these is not the correct data type for the value?**

 1.  1.5 - Numeric
-2.  "Labrador Retriever" - Character
+2.  "1" - Character
 3.  NA - Logical
 4.  1 - Integer

@ -227,6 +279,8 @@ DogBreeds <- c("Labrador Retriever", "Akita", "Bulldog")
 countdown::countdown(minutes = 10,
                     seconds = 0,
                     color_border = "black",
+                     color_running_background = "#47d193",
+                     color_finished_background = "#a3184e",
                     padding = "50px",
                     margin = "5%",
                     font_size = "5em",
@ -297,6 +351,7 @@ x & !y
 execution of code

 ```{r}
+dog_breeds <- c("Labrador Retriever", "Akita", "Bulldog")
 if ("Akita" %in% dog_breeds) {
  print("dog_breeds already contains Akita")
 } else {
@ -379,11 +434,17 @@ library(ggplot2) # Makes all of the ggplot2 functions available
 -    The tidyverse is a collection of commonly used data analysis
 packages  
      -   Learning curve is less steep
-      -   Lots of useful packages for data analysis
+      -   Lots of useful packages for cleaning and "wrangling" data into the correct format

-##
+## Why use Tidyverse Packages?

-![tidyverse](assets/tidyverse.png)
+-   Most of the work in data analysis is getting data into the correct format to create outputs
+-   The tidyverse collection of packages simplifies this process
+    -   Intuitive syntax
+    -   Comprehensive (data manipulation, cleaning, modeling and graphics)
+    -   Consistent data structure
+    -   Strong community support
+    

 # End of Part 1

@ -392,11 +453,8 @@ packages

 ## Upcoming Workshops

-1. [Introduction to Statistics, Experimental Design, and Hypothesis Testing](https://gladstone.org/index.php/events/introduction-statistics-experimental-design-and-hypothesis-testing-0) 
-    -   Jan 25, 2024 (Session 1 - 10am–12pm) (Session 2 - 1pm–3pm)
-    -   Jan 26, 2024 (Session 3 - 10am–12pm)
+[Single Cell ATAC-Seq Data Analysis Part 2](https://gladstone.org/events/single-cell-atac-seq-data-analysis-part-2-1)

-2. [Intermediate RNA-Seq Analysis Using R](https://gladstone.org/index.php/events/intermediate-rna-seq-analysis-using-r-4)
-    -   Feb 1, 2024 (9:30am-12:00pm)
+- Check [this link](https://gladstone.org/events?series=data-science-training-program) at the end of the summer for our fall workshop schedule


--- a/intro-r-data-analysis/Intro_to_R_data_analysis_part_2.Rmd
+++ b/intro-r-data-analysis/Intro_to_R_data_analysis_part_2.Rmd
@ -1,7 +1,8 @@
 ---
-title: "Introduction to R Data Analysis - Part 2"
+title: "Introduction to R Data Analysis"
+subtitle: "Part 2"
 author: "Natalie Elphick"
-date: "January 23rd, 2024"
+date: "May 21st, 2024"
 knit: (function(input, ...) {
    rmarkdown::render(
      input,
@ -19,6 +20,7 @@ library(kableExtra)
 library(tidyverse)
 library(readxl)
 theme_set(theme_grey(base_size = 16))
+knitr::opts_chunk$set(comment = "")
 ```

 ## 
@ -29,7 +31,10 @@ theme_set(theme_grey(base_size = 16))
 **Natalie Elphick**    
 Bioinformatician I  

-**Yihang Xin (TA)**     
+**Michela Traglia (In Person TA)**    
+Senior Statistician
+
+**Yihang Xin (Online TA)**     
 Software Engineer III   

 # Schedule
@ -46,11 +51,11 @@ Software Engineer III
 -   The tidyverse packages work well together because they share
 common data representations and design principles
    -   Rows = observations, columns = variables
-   [ggplot2](), for data visualization.
-   [dplyr](), for data manipulation.
-   [tidyr](), for data tidying.
-   [readr](), for data import.
-   [purrr](), for iteration.
+-   [ggplot2](https://ggplot2.tidyverse.org/), for data visualization.
+-   [dplyr](https://dplyr.tidyverse.org/), for data manipulation.
+-   [tidyr](https://tidyr.tidyverse.org/), for data tidying.
+-   [readr](https://readr.tidyverse.org/), for data import.
+-   [purrr](https://purrr.tidyverse.org/), for iteration.
 -   and more..

 ## dplyr
@ -67,66 +72,38 @@ common data representations and design principles

 ## Example Dataframe
 -   mpg is a dataframe built into the ggplot2 package
-```{r, eval = FALSE}
+```{r}
 head(mpg)
 ```

-```{r, echo = FALSE}
-head(mpg) |>
-  kable() |>
-  kable_styling("striped") |>
-  scroll_box(width = "100%")
-```

 ## Select Columns

-```{r, eval = FALSE}
+```{r}
 select(.data = mpg,
       year, cty, hwy, manufacturer)
 ```

-```{r, echo = FALSE}
-select(.data = mpg,
-       year, cty, hwy, manufacturer) |>
-  head() |>
-  kable() |>
-  kable_styling("striped") |>
-  scroll_box(width = "100%")
-```
-

 ## Filter Rows


-```{r, eval = FALSE}
+```{r}
 filter(.data = mpg,
       year == 2008)
 ```

-```{r, echo = FALSE}
-filter(.data = mpg,
-       year == 2008) |>
-  head() |>
-  kable() |>
-  kable_styling("striped") |>
-  scroll_box(width = "100%")
-```
+
 ## Arrange Rows

 -   desc() is used to arrange rows in descending order, the default is ascending
-```{r, eval = FALSE}
+```{r}
 arrange(.data = mpg,
-        desc(cyl))
+        desc(cty))
 ```

-```{r, echo = FALSE}
-arrange(.data = mpg,
-        desc(cyl)) |>
-  head(n = 3) |>
-  kable() |>
-  kable_styling("striped") |>
-  scroll_box(width = "100%")
-```
+
+
 ## Summarising data
 -    The dplyr **summarise()** function computes a table of
 summaries for a data frame
@ -136,6 +113,9 @@ variable(s)
 different categorical groupings

 ## Group and Summarise
+
+- Get the mean and median city mileage for each manufacturer
+
 ```{r, eval = FALSE}
 summarise(group_by(.data = mpg,
                   manufacturer),
@ -144,37 +124,27 @@ summarise(group_by(.data = mpg,
 ```

 ```{r, echo = FALSE}
-summarise(group_by(.data = mpg,
+summarise(.data = group_by(.data = mpg,
                   manufacturer),
          mean_cty = mean(cty),
          median_cty = median(cty)) |>
-  head() |>
-  kable() |>
-  kable_styling("striped") |>
-  scroll_box(width = "100%")
+  head(10)
 ```

+
+
 ## The pipe operator |>
 -   Allows "chaining" of function calls to make code more readable
-```{r, eval = FALSE}
-mpg |>
-  group_by(manufacturer) |>
-  summarise(mean_cty = mean(cty),
-            median_cty = median(cty))
-```
-
-```{r, echo = FALSE}
+```{r}
 mpg |>
  group_by(manufacturer) |>
  summarise(mean_cty = mean(cty),
            median_cty = median(cty)) |>
-  head(n = 4) |>
-  kable() |>
-  kable_styling("striped") |>
-  scroll_box(width = "100%")
+  head(5)
 ```


+
 # Plotting

 ## ggplot2
@ -204,11 +174,10 @@ ggplot(data = mpg,                         # Input dataframe
 ## Adding and Modifying Layers 

 ```{r, fig.dim=c(10,4)}
-ggplot(data = mpg,
-       mapping = aes(x = class, y = cty, fill = class)) +
-  geom_violin() +
-  geom_boxplot(width = 0.1,
-               fill = "white")
+ggplot(data = mpg,                         
+       mapping = aes(x = cty, y = hwy)) +  
+  geom_point() +
+  geom_smooth(formula = y ~ x, method = "lm")
 ```


@ -221,6 +190,8 @@ ggplot(data = mpg,
 countdown::countdown(minutes = 10,
                     seconds = 0,
                     color_border = "black",
+                     color_running_background = "#47d193",
+                     color_finished_background = "#a3184e",
                     padding = "50px",
                     margin = "5%",
                     font_size = "5em",
@ -234,8 +205,8 @@ countdown::countdown(minutes = 10,

 ## Dataset Description
 -   PanTHERIA
-    -   A global species-level data set of key life-history, ecological and geographical traits of all known extant and recently extinct mammals compiled from the literature
-    -   Macroecological and macroevolutionary research projects
+    -   A global species-level data set of key traits of all known extant and recently extinct mammals compiled from literature
+    -   Used in macroecological and macroevolutionary research projects
    -   Data is organized by taxonomic rank

 ## Taxonomic Rank
@ -252,10 +223,20 @@ read_xlsx("Intro_to_R_workshop_materials/PanTHERIA.xlsx") |>
  scroll_box(width = "100%")
 ```

+## Hands-on Analysis

+-   We will read in the data and explore if the trophic level has a significant impact on the adult body mass of mammals  
+
+
+Steps:    
+1.    Combine and clean the data   
+2.    Visualize adult body mass by trophic level  
+3.    Check for overrepresented groups  
+4.    Fit a simple linear model  

 ## Hands-on Analysis
 -   Open part_2.Rmd
+-   If you just want to follow along and not run code, open part2_filled_out.html



@ -263,63 +244,34 @@ read_xlsx("Intro_to_R_workshop_materials/PanTHERIA.xlsx") |>

 ## General Tips

+-   Follow any relevant institutional guidelines on using LLMs
 -   Always confirm ChatGPT's outputs are correct
 -   Provide as much detail as possible about the problem in the 1st prompt
 -   Use separate chats for separate tasks/projects
-   Try the 'Custom Instructions' function that adds additional information to every prompt
-   Can visit webpages (GPT 4 only), which can help get more specific answers
+-   Try the 'Custom Instructions' function

 ## Code Tips

-   Commented R code yields better responses in my experience
+-   Commented R code yields better responses
 -   Provide the code and error message in the same prompt
 -   ChatGPT can work well to convert syntax and improve your code:
    -   "Turn this loop into a function : [your code]"
    -   "Is there a better way to do this : [your code]"
 -   Check out the file: `example_code/1_convert_syntax_example.R` for an example use case

-# Finding R Packages
-
-## Key Questions
-
-   What assay was the package designed for?
-   When was the last release?
-   Is it maintained (frequent updates)?
-   Does it work on all operating systems?
-   Are other people using it? (citations)
-   Do they respond to github issues?
-   Is there a benchmarking paper?
-
-## BioConductor and CRAN
-
-   Both of these have stringent requirements for packages they host (eg. for BioConductor they have to run on all major operating systems)
-
-   Prefer BioConductor packages if available over CRAN
-
-   Prefer CRAN packages over ones only hosted on GitHub
-
-## Start with the Assay
-
-   Click [here](https://www.bioconductor.org/packages/release/BiocViews.html#___Sequencing) to go to BioC views
-   Pick the assay you want to analyse
-   Pick the type of analysis you want to do
-   Find a package that does it
-   Find benchmarking papers to narrow the list of packages down
-   Find the vignette on the package page and refer to the manual for any questions not covered by it
-

 # Additional Resources

 ## R
-
-   [R Markdown: The Definitive Guide](https://bookdown.org/yihui/rmarkdown/how-to-read-this-book.html) : Excellent R markdown reference
-
 -   [R for Data Science](https://r4ds.hadley.nz/)
-
+-   [Top 10 R Errors and How to Fix them](https://statsandr.com/blog/top-10-errors-in-r/)
+-   [R Markdown: The Definitive Guide](https://bookdown.org/yihui/rmarkdown/how-to-read-this-book.html) : Excellent R markdown reference
 -   [ggplot2: elegant graphics for data analysis](https://ggplot2-book.org/)

 -   [Advanced R](https://adv-r.hadley.nz/)

+
+
 ## Statistics

 -   [Data Analysis in R](https://bookdown.org/steve_midway/DAR) : This book has more statistics details than *R for Data Science*
@ -346,10 +298,9 @@ read_xlsx("Intro_to_R_workshop_materials/PanTHERIA.xlsx") |>

 ## Upcoming Workshops

-1. [Introduction to Statistics, Experimental Design, and Hypothesis Testing](https://gladstone.org/index.php/events/introduction-statistics-experimental-design-and-hypothesis-testing-0) 
-    -   Jan 25, 2024 (Session 1 - 10am–12pm) (Session 2 - 1pm–3pm)
-    -   Jan 26, 2024 (Session 3 - 10am–12pm)
+[Single Cell ATAC-Seq Data Analysis Part 2](https://gladstone.org/events/single-cell-atac-seq-data-analysis-part-2-1)
+
+- Check [this link](https://gladstone.org/events?series=data-science-training-program) at the end of the summer for our fall workshop schedule
+

-2. [Intermediate RNA-Seq Analysis Using R](https://gladstone.org/index.php/events/intermediate-rna-seq-analysis-using-r-4)
-    -   Feb 1, 2024 (9:30am-12:00pm)

--- a/intro-r-data-analysis/renv.lock
+++ b/intro-r-data-analysis/renv.lock
@ -22,7 +22,7 @@
    },
    "MASS": {
      "Package": "MASS",
-      "Version": "7.3-60",
+      "Version": "7.3-60.0.1",
      "Source": "Repository",
      "Repository": "CRAN",
      "Requirements": [
@ -33,11 +33,11 @@
        "stats",
        "utils"
      ],
-      "Hash": "a56a6365b3fa73293ea8d084be0d9bb0"
+      "Hash": "b765b28387acc8ec9e9c1530713cb19c"
    },
    "Matrix": {
      "Package": "Matrix",
-      "Version": "1.6-1.1",
+      "Version": "1.6-5",
      "Source": "Repository",
      "Repository": "CRAN",
      "Requirements": [
@ -50,7 +50,7 @@
        "stats",
        "utils"
      ],
-      "Hash": "1a00d4828f33a9d690806e98bd17150c"
+      "Hash": "8c7115cd3a0e048bda2a7cd110549f7a"
    },
    "R6": {
      "Package": "R6",
@ -872,7 +872,7 @@
    },
    "lattice": {
      "Package": "lattice",
-      "Version": "0.21-9",
+      "Version": "0.22-6",
      "Source": "Repository",
      "Repository": "CRAN",
      "Requirements": [
@ -883,7 +883,7 @@
        "stats",
        "utils"
      ],
-      "Hash": "5558c61e0136e247252f5f952cdaad6a"
+      "Hash": "cc5ac1ba4c238c7ca9fa6a87ca11a7e2"
    },
    "learnr": {
      "Package": "learnr",
@ -977,7 +977,7 @@
    },
    "mgcv": {
      "Package": "mgcv",
-      "Version": "1.9-0",
+      "Version": "1.9-1",
      "Source": "Repository",
      "Repository": "CRAN",
      "Requirements": [
@ -990,7 +990,7 @@
        "stats",
        "utils"
      ],
-      "Hash": "086028ca0460d0c368028d3bda58f31b"
+      "Hash": "110ee9d83b496279960e162ac97764ce"
    },
    "mime": {
      "Package": "mime",
@ -1033,7 +1033,7 @@
    },
    "nlme": {
      "Package": "nlme",
-      "Version": "3.1-163",
+      "Version": "3.1-164",
      "Source": "Repository",
      "Repository": "CRAN",
      "Requirements": [
@ -1043,7 +1043,7 @@
        "stats",
        "utils"
      ],
-      "Hash": "8d1938040a05566f4f7a14af4feadd6b"
+      "Hash": "a623a2239e642806158bc4dc3f51565d"
    },
    "openssl": {
      "Package": "openssl",
--- a/intro-r-data-analysis/style.css
+++ b/intro-r-data-analysis/style.css
@ -129,4 +129,14 @@ small {
 .big-picture img{
    max-width: 70%;
    border: 1px solid black !important;
-}
+}
+
+/* Change link color to sky blue */
+.reveal a {
+    color: #0c74dc;
+}
+
+/* Change link color to magenta on hover */
+.reveal a:hover {
+    color: #9c0366 !important;
+}