Introduction to R Data Analysis - Part 1
+ Introduction to R Data Analysis
@@ -1085,7 +1085,7 @@ visibility: hidden; }
+}
+
+.reveal a {
+color: #0c74dc;
+}
+
+.reveal a:hover {
+color: #9c0366 !important;
+}
+
-
-
-
Introduction to R Data Analysis - Part 2
+
Introduction to R Data Analysis
+
Part 2
Natalie Elphick
-
January 23rd, 2024
+
May 21st, 2024
@@ -2799,7 +2809,9 @@ class CountdownTimer {
Introductions
Natalie Elphick
Bioinformatician I
-
Yihang Xin (TA)
+
Michela Traglia (In Person TA)
+Senior Statistician
+
Yihang Xin (Online TA)
Software Engineer III
@@ -2826,11 +2838,16 @@ data representations and design principles
@@ -2860,628 +2877,53 @@ allows any operation to be done “by group”
mpg is a dataframe built into the ggplot2 package
head(mpg)
-
-
-
-
-
-manufacturer
-
-
-model
-
-
-displ
-
-
-year
-
-
-cyl
-
-
-trans
-
-
-drv
-
-
-cty
-
-
-hwy
-
-
-fl
-
-
-class
-
-
-
-
-
-
-audi
-
-
-a4
-
-
-1.8
-
-
-1999
-
-
-4
-
-
-auto(l5)
-
-
-f
-
-
-18
-
-
-29
-
-
-p
-
-
-compact
-
-
-
-
-audi
-
-
-a4
-
-
-1.8
-
-
-1999
-
-
-4
-
-
-manual(m5)
-
-
-f
-
-
-21
-
-
-29
-
-
-p
-
-
-compact
-
-
-
-
-audi
-
-
-a4
-
-
-2.0
-
-
-2008
-
-
-4
-
-
-manual(m6)
-
-
-f
-
-
-20
-
-
-31
-
-
-p
-
-
-compact
-
-
-
-
-audi
-
-
-a4
-
-
-2.0
-
-
-2008
-
-
-4
-
-
-auto(av)
-
-
-f
-
-
-21
-
-
-30
-
-
-p
-
-
-compact
-
-
-
-
-audi
-
-
-a4
-
-
-2.8
-
-
-1999
-
-
-6
-
-
-auto(l5)
-
-
-f
-
-
-16
-
-
-26
-
-
-p
-
-
-compact
-
-
-
-
-audi
-
-
-a4
-
-
-2.8
-
-
-1999
-
-
-6
-
-
-manual(m5)
-
-
-f
-
-
-18
-
-
-26
-
-
-p
-
-
-compact
-
-
-
-
-
+
# A tibble: 6 × 11
+ manufacturer model displ year cyl trans drv cty hwy fl class
+ <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
+1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compa…
+2 audi a4 1.8 1999 4 manual(m5) f 21 29 p compa…
+3 audi a4 2 2008 4 manual(m6) f 20 31 p compa…
+4 audi a4 2 2008 4 auto(av) f 21 30 p compa…
+5 audi a4 2.8 1999 6 auto(l5) f 16 26 p compa…
+6 audi a4 2.8 1999 6 manual(m5) f 18 26 p compa…
# A tibble: 234 × 4
+ year cty hwy manufacturer
+ <int> <int> <int> <chr>
+ 1 1999 18 29 audi
+ 2 1999 21 29 audi
+ 3 2008 20 31 audi
+ 4 2008 21 30 audi
+ 5 1999 16 26 audi
+ 6 1999 18 26 audi
+ 7 2008 18 27 audi
+ 8 1999 18 26 audi
+ 9 1999 16 25 audi
+10 2008 20 28 audi
+# ℹ 224 more rows
Filter Rows
-
filter(.data = mpg,
- year ==2008)
-
-
-
-
-
-manufacturer
-
-
-model
-
-
-displ
-
-
-year
-
-
-cyl
-
-
-trans
-
-
-drv
-
-
-cty
-
-
-hwy
-
-
-fl
-
-
-class
-
-
-
-
-
-
-audi
-
-
-a4
-
-
-2.0
-
-
-2008
-
-
-4
-
-
-manual(m6)
-
-
-f
-
-
-20
-
-
-31
-
-
-p
-
-
-compact
-
-
-
-
-audi
-
-
-a4
-
-
-2.0
-
-
-2008
-
-
-4
-
-
-auto(av)
-
-
-f
-
-
-21
-
-
-30
-
-
-p
-
-
-compact
-
-
-
-
-audi
-
-
-a4
-
-
-3.1
-
-
-2008
-
-
-6
-
-
-auto(av)
-
-
-f
-
-
-18
-
-
-27
-
-
-p
-
-
-compact
-
-
-
-
-audi
-
-
-a4 quattro
-
-
-2.0
-
-
-2008
-
-
-4
-
-
-manual(m6)
-
-
-4
-
-
-20
-
-
-28
-
-
-p
-
-
-compact
-
-
-
-
-audi
-
-
-a4 quattro
-
-
-2.0
-
-
-2008
-
-
-4
-
-
-auto(s6)
-
-
-4
-
-
-19
-
-
-27
-
-
-p
-
-
-compact
-
-
-
-
-audi
-
-
-a4 quattro
-
-
-3.1
-
-
-2008
-
-
-6
-
-
-auto(s6)
-
-
-4
-
-
-17
-
-
-25
-
-
-p
-
-
-compact
-
-
-
-
-
+
filter(.data = mpg,
+ year ==2008)
+
# A tibble: 117 × 11
+ manufacturer model displ year cyl trans drv cty hwy fl class
+ <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
+ 1 audi a4 2 2008 4 manu… f 20 31 p comp…
+ 2 audi a4 2 2008 4 auto… f 21 30 p comp…
+ 3 audi a4 3.1 2008 6 auto… f 18 27 p comp…
+ 4 audi a4 quattro 2 2008 4 manu… 4 20 28 p comp…
+ 5 audi a4 quattro 2 2008 4 auto… 4 19 27 p comp…
+ 6 audi a4 quattro 3.1 2008 6 auto… 4 17 25 p comp…
+ 7 audi a4 quattro 3.1 2008 6 manu… 4 15 25 p comp…
+ 8 audi a6 quattro 3.1 2008 6 auto… 4 17 25 p mids…
+ 9 audi a6 quattro 4.2 2008 8 auto… 4 16 23 p mids…
+10 chevrolet c1500 sub… 5.3 2008 8 auto… r 14 20 r suv
+# ℹ 107 more rows
Arrange Rows
@@ -3489,156 +2931,22 @@ compact
desc() is used to arrange rows in descending order, the default is
ascending
-
arrange(.data = mpg,
-desc(cyl))
-
-
-
-
-
-manufacturer
-
-
-model
-
-
-displ
-
-
-year
-
-
-cyl
-
-
-trans
-
-
-drv
-
-
-cty
-
-
-hwy
-
-
-fl
-
-
-class
-
-
-
-
-
-
-audi
-
-
-a6 quattro
-
-
-4.2
-
-
-2008
-
-
-8
-
-
-auto(s6)
-
-
-4
-
-
-16
-
-
-23
-
-
-p
-
-
-midsize
-
-
-
-
-chevrolet
-
-
-c1500 suburban 2wd
-
-
-5.3
-
-
-2008
-
-
-8
-
-
-auto(l4)
-
-
-r
-
-
-14
-
-
-20
-
-
-r
-
-
-suv
-
-
-
-
-chevrolet
-
-
-c1500 suburban 2wd
-
-
-5.3
-
-
-2008
-
-
-8
-
-
-auto(l4)
-
-
-r
-
-
-11
-
-
-15
-
-
-e
-
-
-suv
-
-
-
-
-
+
arrange(.data = mpg,
+desc(cty))
+
# A tibble: 234 × 11
+ manufacturer model displ year cyl trans drv cty hwy fl class
+ <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
+ 1 volkswagen new beetle 1.9 1999 4 manu… f 35 44 d subc…
+ 2 volkswagen jetta 1.9 1999 4 manu… f 33 44 d comp…
+ 3 volkswagen new beetle 1.9 1999 4 auto… f 29 41 d subc…
+ 4 honda civic 1.6 1999 4 manu… f 28 33 r subc…
+ 5 toyota corolla 1.8 2008 4 manu… f 28 37 r comp…
+ 6 honda civic 1.8 2008 4 manu… f 26 34 r subc…
+ 7 toyota corolla 1.8 1999 4 manu… f 26 35 r comp…
+ 8 toyota corolla 1.8 2008 4 auto… f 26 35 r comp…
+ 9 honda civic 1.6 1999 4 manu… f 25 32 r subc…
+10 honda civic 1.8 2008 4 auto… f 25 36 r subc…
+# ℹ 224 more rows
Summarising data
@@ -3653,168 +2961,45 @@ different categorical groupings
# A tibble: 5 × 3
+ manufacturer mean_cty median_cty
+ <chr> <dbl> <dbl>
+1 audi 17.6 17.5
+2 chevrolet 15 15
+3 dodge 13.1 13
+4 ford 14 14
+5 honda 24.4 24
@@ -3841,28 +3026,27 @@ modified by adding layers
Creating ggplots
-
+
Plot Example
-
ggplot(data = mpg, # Input dataframe
-mapping =aes(x = cty, y = hwy)) +# Aesthetic mapping
-geom_point() # Point graph
-
+
ggplot(data = mpg, # Input dataframe
+mapping =aes(x = cty, y = hwy)) +# Aesthetic mapping
+geom_point() # Point graph
+
Adding and Modifying Layers
-
ggplot(data = mpg,
-mapping =aes(x = class, y = cty, fill = class)) +
-geom_violin() +
-geom_boxplot(width =0.1,
-fill ="white")
-
+
ggplot(data = mpg,
+mapping =aes(x = cty, y = hwy)) +
+geom_point() +
+geom_smooth(formula = y ~ x, method ="lm")
+
10 min break
-
+
10:00
@@ -3879,17 +3063,317 @@ modified by adding layers
PanTHERIA
-
A global species-level data set of key life-history, ecological and
-geographical traits of all known extant and recently extinct mammals
-compiled from the literature
-
Macroecological and macroevolutionary research projects
+
A global species-level data set of key traits of all known extant
+and recently extinct mammals compiled from literature
+
Used in macroecological and macroevolutionary research projects
Data is organized by taxonomic rank
Taxonomic Rank
-
+
Data Preview
@@ -5053,7 +4537,21 @@ NA
Hands-on Analysis
+
We will read in the data and explore if the trophic level has a
+significant impact on the adult body mass of mammals
+
+
Steps:
+1. Combine and clean the data
+2. Visualize adult body mass by trophic level
+3. Check for overrepresented groups
+4. Fit a simple linear model
+
+
+
Hands-on Analysis
+
Open part_2.Rmd
+
If you just want to follow along and not run code, open
+part2_filled_out.html
@@ -5064,20 +4562,18 @@ NA
General Tips
+
Follow any relevant institutional guidelines on using LLMs
Always confirm ChatGPT’s outputs are correct
Provide as much detail as possible about the problem in the 1st
prompt
Use separate chats for separate tasks/projects
-
Try the ‘Custom Instructions’ function that adds additional
-information to every prompt
-
Can visit webpages (GPT 4 only), which can help get more specific
-answers
+
Try the ‘Custom Instructions’ function
Code Tips
-
Commented R code yields better responses in my experience
+
Commented R code yields better responses
Provide the code and error message in the same prompt
ChatGPT can work well to convert syntax and improve your code:
@@ -5090,46 +4586,6 @@ case
-
-
Finding R Packages
-
-
-
-
Key Questions
-
-
What assay was the package designed for?
-
When was the last release?
-
Is it maintained (frequent updates)?
-
Does it work on all operating systems?
-
Are other people using it? (citations)
-
Do they respond to github issues?
-
Is there a benchmarking paper?
-
-
-
-
BioConductor and CRAN
-
-
Both of these have stringent requirements for packages they host
-(eg. for BioConductor they have to run on all major operating
-systems)
-
Prefer BioConductor packages if available over CRAN
-
Prefer CRAN packages over ones only hosted on GitHub
Check this
+link at the end of the summer for our fall workshop schedule
+
diff --git a/intro-r-data-analysis/Intro_to_R_data_analysis_part_1.Rmd b/intro-r-data-analysis/Intro_to_R_data_analysis_part_1.Rmd
index 34b6af1..1909ed2 100644
--- a/intro-r-data-analysis/Intro_to_R_data_analysis_part_1.Rmd
+++ b/intro-r-data-analysis/Intro_to_R_data_analysis_part_1.Rmd
@@ -1,7 +1,8 @@
---
-title: "Introduction to R Data Analysis - Part 1"
+title: "Introduction to R Data Analysis"
+subtitle: "Part 1"
author: "Natalie Elphick"
-date: "January 22nd, 2024"
+date: "May 20th, 2024"
knit: (function(input, ...) {
rmarkdown::render(
input,
@@ -16,6 +17,7 @@ output:
```{r, setup, include=FALSE}
library(tidyverse)
+knitr::opts_chunk$set(comment = "")
```
##
@@ -25,10 +27,10 @@ library(tidyverse)
## Introductions
**Natalie Elphick**
-Bioinformatician I
+Bioinformatician I
-**Michela Traglia (TA)**
-Senior Statistician
+**Yihang Xin (Online TA)**
+Software Engineer III
## Poll 1
@@ -36,7 +38,7 @@ Senior Statistician
**What is your level of experience with coding/data analysis?**
1. I know another data analysis programming language (Python, Matlab etc.)
-2. I can use Excel to do linear regression
+2. I can use Excel
3. I know some R
4. All of the above
5. None of the above
@@ -50,8 +52,8 @@ Senior Statistician
1. What is R and why should you use it?
2. The RStudio interface
3. File types
-4. Error messages
-5. Variables
+4. Variables
+5. Error and warning messages
6. Types & data structures
7. Math and logic operations
8. Functions and packages
@@ -86,9 +88,9 @@ functionality
# RStudio
## RStudio
-- RStudio is an integrated development
+- RStudio is an integrated development
environment (IDE)
-- It makes R code easier to write by providing a
+- It is an app that makes R code easier to write by providing a
feature rich graphical user interface (GUI)
@@ -131,8 +133,6 @@ feature rich graphical user interface (GUI)
## Variable definition
- Variables store information that is referenced and manipulated
in a computer program
-- In contrast to the mathematical definition of a variable,
-variables in computer science are _mutable_
- There are 3 ways to define variables in R, but one is preferred:
```{r}
x <- 1 # Preferred way
@@ -141,7 +141,59 @@ x = 1
print(x)
```
-## Variable naming
+## Example
+- Run the following in the R console:
+
+```{r}
+x <- 1
+y <- 4
+z <- y
+x + y + z
+```
+
+# Error and Warning Messages
+
+## Errors
+- **Errors**: Stop the execution of your code and must be fixed for the code to run successfully
+
+```{r, eval=FALSE}
+x <- 5
+y <- 10
+z <- x + a
+```
+
+```{r,echo=FALSE}
+message("Error: object 'a' not found")
+```
+
+
+## Common Errors
+
+- **Syntax Error:** Invalid R code syntax (e.g. misplaced parentheses)
+```{r,echo=FALSE}
+message('Error: unexpected ")"')
+```
+
+- **Object not found:** This variable is not defined (e.g. misspelled variables)
+
+```{r,echo=FALSE}
+message('Error: object "a" not found')
+```
+
+
+See this [article](https://statsandr.com/blog/top-10-errors-in-r/) for more common errors and how to fix them.
+
+## Warnings
+- Do not stop the execution but indicate potential issues that you should be aware of and might need to address
+
+```{r}
+a <- c(1, 2, 3, 4, 5)
+b <- c(6, 7, 8, 9)
+result <- a + b
+```
+
+
+## Variable Naming
- Variables names must start with a letter and can contain
underscores and periods
@@ -176,7 +228,7 @@ DogBreeds <- c("Labrador Retriever", "Akita", "Bulldog")
## Data Types
- Integer
- - Whole numbers (in R denote with L ex. 1L,2L)
+ - Whole numbers (in R denoted with L ex. 1L,2L)
- Numeric
- Decimal numbers
- Logical
@@ -191,7 +243,7 @@ DogBreeds <- c("Labrador Retriever", "Akita", "Bulldog")
**Which of these is not the correct data type for the value?**
1. 1.5 - Numeric
-2. "Labrador Retriever" - Character
+2. "1" - Character
3. NA - Logical
4. 1 - Integer
@@ -227,6 +279,8 @@ DogBreeds <- c("Labrador Retriever", "Akita", "Bulldog")
countdown::countdown(minutes = 10,
seconds = 0,
color_border = "black",
+ color_running_background = "#47d193",
+ color_finished_background = "#a3184e",
padding = "50px",
margin = "5%",
font_size = "5em",
@@ -297,6 +351,7 @@ x & !y
execution of code
```{r}
+dog_breeds <- c("Labrador Retriever", "Akita", "Bulldog")
if ("Akita" %in% dog_breeds) {
print("dog_breeds already contains Akita")
} else {
@@ -379,11 +434,17 @@ library(ggplot2) # Makes all of the ggplot2 functions available
- The tidyverse is a collection of commonly used data analysis
packages
- Learning curve is less steep
- - Lots of useful packages for data analysis
+ - Lots of useful packages for cleaning and "wrangling" data into the correct format
-##
+## Why use Tidyverse Packages?
-
+- Most of the work in data analysis is getting data into the correct format to create outputs
+- The tidyverse collection of packages simplifies this process
+ - Intuitive syntax
+ - Comprehensive (data manipulation, cleaning, modeling and graphics)
+ - Consistent data structure
+ - Strong community support
+
# End of Part 1
@@ -392,11 +453,8 @@ packages
## Upcoming Workshops
-1. [Introduction to Statistics, Experimental Design, and Hypothesis Testing](https://gladstone.org/index.php/events/introduction-statistics-experimental-design-and-hypothesis-testing-0)
- - Jan 25, 2024 (Session 1 - 10am–12pm) (Session 2 - 1pm–3pm)
- - Jan 26, 2024 (Session 3 - 10am–12pm)
+[Single Cell ATAC-Seq Data Analysis Part 2](https://gladstone.org/events/single-cell-atac-seq-data-analysis-part-2-1)
-2. [Intermediate RNA-Seq Analysis Using R](https://gladstone.org/index.php/events/intermediate-rna-seq-analysis-using-r-4)
- - Feb 1, 2024 (9:30am-12:00pm)
+- Check [this link](https://gladstone.org/events?series=data-science-training-program) at the end of the summer for our fall workshop schedule
diff --git a/intro-r-data-analysis/Intro_to_R_data_analysis_part_2.Rmd b/intro-r-data-analysis/Intro_to_R_data_analysis_part_2.Rmd
index 2e1218e..a950c3d 100644
--- a/intro-r-data-analysis/Intro_to_R_data_analysis_part_2.Rmd
+++ b/intro-r-data-analysis/Intro_to_R_data_analysis_part_2.Rmd
@@ -1,7 +1,8 @@
---
-title: "Introduction to R Data Analysis - Part 2"
+title: "Introduction to R Data Analysis"
+subtitle: "Part 2"
author: "Natalie Elphick"
-date: "January 23rd, 2024"
+date: "May 21st, 2024"
knit: (function(input, ...) {
rmarkdown::render(
input,
@@ -19,6 +20,7 @@ library(kableExtra)
library(tidyverse)
library(readxl)
theme_set(theme_grey(base_size = 16))
+knitr::opts_chunk$set(comment = "")
```
##
@@ -29,7 +31,10 @@ theme_set(theme_grey(base_size = 16))
**Natalie Elphick**
Bioinformatician I
-**Yihang Xin (TA)**
+**Michela Traglia (In Person TA)**
+Senior Statistician
+
+**Yihang Xin (Online TA)**
Software Engineer III
# Schedule
@@ -46,11 +51,11 @@ Software Engineer III
- The tidyverse packages work well together because they share
common data representations and design principles
- Rows = observations, columns = variables
-- [ggplot2](), for data visualization.
-- [dplyr](), for data manipulation.
-- [tidyr](), for data tidying.
-- [readr](), for data import.
-- [purrr](), for iteration.
+- [ggplot2](https://ggplot2.tidyverse.org/), for data visualization.
+- [dplyr](https://dplyr.tidyverse.org/), for data manipulation.
+- [tidyr](https://tidyr.tidyverse.org/), for data tidying.
+- [readr](https://readr.tidyverse.org/), for data import.
+- [purrr](https://purrr.tidyverse.org/), for iteration.
- and more..
## dplyr
@@ -67,66 +72,38 @@ common data representations and design principles
## Example Dataframe
- mpg is a dataframe built into the ggplot2 package
-```{r, eval = FALSE}
+```{r}
head(mpg)
```
-```{r, echo = FALSE}
-head(mpg) |>
- kable() |>
- kable_styling("striped") |>
- scroll_box(width = "100%")
-```
## Select Columns
-```{r, eval = FALSE}
+```{r}
select(.data = mpg,
year, cty, hwy, manufacturer)
```
-```{r, echo = FALSE}
-select(.data = mpg,
- year, cty, hwy, manufacturer) |>
- head() |>
- kable() |>
- kable_styling("striped") |>
- scroll_box(width = "100%")
-```
-
## Filter Rows
-```{r, eval = FALSE}
+```{r}
filter(.data = mpg,
year == 2008)
```
-```{r, echo = FALSE}
-filter(.data = mpg,
- year == 2008) |>
- head() |>
- kable() |>
- kable_styling("striped") |>
- scroll_box(width = "100%")
-```
+
## Arrange Rows
- desc() is used to arrange rows in descending order, the default is ascending
-```{r, eval = FALSE}
+```{r}
arrange(.data = mpg,
- desc(cyl))
+ desc(cty))
```
-```{r, echo = FALSE}
-arrange(.data = mpg,
- desc(cyl)) |>
- head(n = 3) |>
- kable() |>
- kable_styling("striped") |>
- scroll_box(width = "100%")
-```
+
+
## Summarising data
- The dplyr **summarise()** function computes a table of
summaries for a data frame
@@ -136,6 +113,9 @@ variable(s)
different categorical groupings
## Group and Summarise
+
+- Get the mean and median city mileage for each manufacturer
+
```{r, eval = FALSE}
summarise(group_by(.data = mpg,
manufacturer),
@@ -144,37 +124,27 @@ summarise(group_by(.data = mpg,
```
```{r, echo = FALSE}
-summarise(group_by(.data = mpg,
+summarise(.data = group_by(.data = mpg,
manufacturer),
mean_cty = mean(cty),
median_cty = median(cty)) |>
- head() |>
- kable() |>
- kable_styling("striped") |>
- scroll_box(width = "100%")
+ head(10)
```
+
+
## The pipe operator |>
- Allows "chaining" of function calls to make code more readable
-```{r, eval = FALSE}
-mpg |>
- group_by(manufacturer) |>
- summarise(mean_cty = mean(cty),
- median_cty = median(cty))
-```
-
-```{r, echo = FALSE}
+```{r}
mpg |>
group_by(manufacturer) |>
summarise(mean_cty = mean(cty),
median_cty = median(cty)) |>
- head(n = 4) |>
- kable() |>
- kable_styling("striped") |>
- scroll_box(width = "100%")
+ head(5)
```
+
# Plotting
## ggplot2
@@ -204,11 +174,10 @@ ggplot(data = mpg, # Input dataframe
## Adding and Modifying Layers
```{r, fig.dim=c(10,4)}
-ggplot(data = mpg,
- mapping = aes(x = class, y = cty, fill = class)) +
- geom_violin() +
- geom_boxplot(width = 0.1,
- fill = "white")
+ggplot(data = mpg,
+ mapping = aes(x = cty, y = hwy)) +
+ geom_point() +
+ geom_smooth(formula = y ~ x, method = "lm")
```
@@ -221,6 +190,8 @@ ggplot(data = mpg,
countdown::countdown(minutes = 10,
seconds = 0,
color_border = "black",
+ color_running_background = "#47d193",
+ color_finished_background = "#a3184e",
padding = "50px",
margin = "5%",
font_size = "5em",
@@ -234,8 +205,8 @@ countdown::countdown(minutes = 10,
## Dataset Description
- PanTHERIA
- - A global species-level data set of key life-history, ecological and geographical traits of all known extant and recently extinct mammals compiled from the literature
- - Macroecological and macroevolutionary research projects
+ - A global species-level data set of key traits of all known extant and recently extinct mammals compiled from literature
+ - Used in macroecological and macroevolutionary research projects
- Data is organized by taxonomic rank
## Taxonomic Rank
@@ -252,10 +223,20 @@ read_xlsx("Intro_to_R_workshop_materials/PanTHERIA.xlsx") |>
scroll_box(width = "100%")
```
+## Hands-on Analysis
+- We will read in the data and explore if the trophic level has a significant impact on the adult body mass of mammals
+
+
+Steps:
+1. Combine and clean the data
+2. Visualize adult body mass by trophic level
+3. Check for overrepresented groups
+4. Fit a simple linear model
## Hands-on Analysis
- Open part_2.Rmd
+- If you just want to follow along and not run code, open part2_filled_out.html
@@ -263,63 +244,34 @@ read_xlsx("Intro_to_R_workshop_materials/PanTHERIA.xlsx") |>
## General Tips
+- Follow any relevant institutional guidelines on using LLMs
- Always confirm ChatGPT's outputs are correct
- Provide as much detail as possible about the problem in the 1st prompt
- Use separate chats for separate tasks/projects
-- Try the 'Custom Instructions' function that adds additional information to every prompt
-- Can visit webpages (GPT 4 only), which can help get more specific answers
+- Try the 'Custom Instructions' function
## Code Tips
-- Commented R code yields better responses in my experience
+- Commented R code yields better responses
- Provide the code and error message in the same prompt
- ChatGPT can work well to convert syntax and improve your code:
- "Turn this loop into a function : [your code]"
- "Is there a better way to do this : [your code]"
- Check out the file: `example_code/1_convert_syntax_example.R` for an example use case
-# Finding R Packages
-
-## Key Questions
-
-- What assay was the package designed for?
-- When was the last release?
-- Is it maintained (frequent updates)?
-- Does it work on all operating systems?
-- Are other people using it? (citations)
-- Do they respond to github issues?
-- Is there a benchmarking paper?
-
-## BioConductor and CRAN
-
-- Both of these have stringent requirements for packages they host (eg. for BioConductor they have to run on all major operating systems)
-
-- Prefer BioConductor packages if available over CRAN
-
-- Prefer CRAN packages over ones only hosted on GitHub
-
-## Start with the Assay
-
-- Click [here](https://www.bioconductor.org/packages/release/BiocViews.html#___Sequencing) to go to BioC views
-- Pick the assay you want to analyse
-- Pick the type of analysis you want to do
-- Find a package that does it
-- Find benchmarking papers to narrow the list of packages down
-- Find the vignette on the package page and refer to the manual for any questions not covered by it
-
# Additional Resources
## R
-
-- [R Markdown: The Definitive Guide](https://bookdown.org/yihui/rmarkdown/how-to-read-this-book.html) : Excellent R markdown reference
-
- [R for Data Science](https://r4ds.hadley.nz/)
-
+- [Top 10 R Errors and How to Fix them](https://statsandr.com/blog/top-10-errors-in-r/)
+- [R Markdown: The Definitive Guide](https://bookdown.org/yihui/rmarkdown/how-to-read-this-book.html) : Excellent R markdown reference
- [ggplot2: elegant graphics for data analysis](https://ggplot2-book.org/)
- [Advanced R](https://adv-r.hadley.nz/)
+
+
## Statistics
- [Data Analysis in R](https://bookdown.org/steve_midway/DAR) : This book has more statistics details than *R for Data Science*
@@ -346,10 +298,9 @@ read_xlsx("Intro_to_R_workshop_materials/PanTHERIA.xlsx") |>
## Upcoming Workshops
-1. [Introduction to Statistics, Experimental Design, and Hypothesis Testing](https://gladstone.org/index.php/events/introduction-statistics-experimental-design-and-hypothesis-testing-0)
- - Jan 25, 2024 (Session 1 - 10am–12pm) (Session 2 - 1pm–3pm)
- - Jan 26, 2024 (Session 3 - 10am–12pm)
+[Single Cell ATAC-Seq Data Analysis Part 2](https://gladstone.org/events/single-cell-atac-seq-data-analysis-part-2-1)
+
+- Check [this link](https://gladstone.org/events?series=data-science-training-program) at the end of the summer for our fall workshop schedule
+
-2. [Intermediate RNA-Seq Analysis Using R](https://gladstone.org/index.php/events/intermediate-rna-seq-analysis-using-r-4)
- - Feb 1, 2024 (9:30am-12:00pm)
diff --git a/intro-r-data-analysis/renv.lock b/intro-r-data-analysis/renv.lock
index 52c9d40..6812de9 100644
--- a/intro-r-data-analysis/renv.lock
+++ b/intro-r-data-analysis/renv.lock
@@ -22,7 +22,7 @@
},
"MASS": {
"Package": "MASS",
- "Version": "7.3-60",
+ "Version": "7.3-60.0.1",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
@@ -33,11 +33,11 @@
"stats",
"utils"
],
- "Hash": "a56a6365b3fa73293ea8d084be0d9bb0"
+ "Hash": "b765b28387acc8ec9e9c1530713cb19c"
},
"Matrix": {
"Package": "Matrix",
- "Version": "1.6-1.1",
+ "Version": "1.6-5",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
@@ -50,7 +50,7 @@
"stats",
"utils"
],
- "Hash": "1a00d4828f33a9d690806e98bd17150c"
+ "Hash": "8c7115cd3a0e048bda2a7cd110549f7a"
},
"R6": {
"Package": "R6",
@@ -872,7 +872,7 @@
},
"lattice": {
"Package": "lattice",
- "Version": "0.21-9",
+ "Version": "0.22-6",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
@@ -883,7 +883,7 @@
"stats",
"utils"
],
- "Hash": "5558c61e0136e247252f5f952cdaad6a"
+ "Hash": "cc5ac1ba4c238c7ca9fa6a87ca11a7e2"
},
"learnr": {
"Package": "learnr",
@@ -977,7 +977,7 @@
},
"mgcv": {
"Package": "mgcv",
- "Version": "1.9-0",
+ "Version": "1.9-1",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
@@ -990,7 +990,7 @@
"stats",
"utils"
],
- "Hash": "086028ca0460d0c368028d3bda58f31b"
+ "Hash": "110ee9d83b496279960e162ac97764ce"
},
"mime": {
"Package": "mime",
@@ -1033,7 +1033,7 @@
},
"nlme": {
"Package": "nlme",
- "Version": "3.1-163",
+ "Version": "3.1-164",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
@@ -1043,7 +1043,7 @@
"stats",
"utils"
],
- "Hash": "8d1938040a05566f4f7a14af4feadd6b"
+ "Hash": "a623a2239e642806158bc4dc3f51565d"
},
"openssl": {
"Package": "openssl",
diff --git a/intro-r-data-analysis/style.css b/intro-r-data-analysis/style.css
index 6e7ace6..9928aff 100644
--- a/intro-r-data-analysis/style.css
+++ b/intro-r-data-analysis/style.css
@@ -129,4 +129,14 @@ small {
.big-picture img{
max-width: 70%;
border: 1px solid black !important;
-}
\ No newline at end of file
+}
+
+/* Change link color to sky blue */
+.reveal a {
+ color: #0c74dc;
+}
+
+/* Change link color to magenta on hover */
+.reveal a:hover {
+ color: #9c0366 !important;
+}