From 719fd6fb2300b970ba366668ae543b034c0cbaf9 Mon Sep 17 00:00:00 2001 From: Natalie Elphick Date: Thu, 18 Jan 2024 11:18:04 -0800 Subject: [PATCH] make some changes to the hands on --- .../Intro_to_R_workshop_materials/part_2.Rmd | 128 +- .../Intro_to_R_workshop_materials/part_2.html | 1341 ----------------- .../part_2_filled_out.Rmd | 102 +- .../part_2_filled_out.html | 122 +- 4 files changed, 173 insertions(+), 1520 deletions(-) mode change 100755 => 100644 intro-r-data-analysis/Intro_to_R_workshop_materials/part_2.Rmd delete mode 100644 intro-r-data-analysis/Intro_to_R_workshop_materials/part_2.html diff --git a/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2.Rmd b/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2.Rmd old mode 100755 new mode 100644 index 960c09c..0ba2db5 --- a/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2.Rmd +++ b/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2.Rmd @@ -10,9 +10,9 @@ knitr::opts_chunk$set(echo = TRUE) # Load packages library(dplyr) # tidyverse data frame manipulation package library(tidyr) # functions to help clean data -library(magrittr) # this package provides the pipe operator %>% library(readxl) # read excel files library(ggplot2) # highly customizable plots +library(broom) # clean up linear model results ``` ## R Markdown @@ -30,11 +30,9 @@ hist(val, breaks = 20) ```{r} # The ggplot version of the same plot - ggplot(data = tibble(values = val), - mapping = aes(x = values))+ - geom_histogram(bins = 20) - + mapping = aes(x = values)) + +geom_histogram(bins = 20) ``` @@ -53,29 +51,21 @@ The data we will be analyzing is from the PanTHERIA database which is "a global # na = "NA" tells read_xlsx how missing values appear in the data # the default is empty cells. Run "?read_xlsx" for more info -sheet1 <- read_xlsx(path = "PanTHERIA.xlsx", sheet = 1, na = "NA") -sheet2 <- read_xlsx(path = "PanTHERIA.xlsx", sheet = 2, na = "NA") -sheet3 <- read_xlsx(path = "PanTHERIA.xlsx", sheet = 3, na = "NA") - - -colnames(sheet1) == colnames(sheet2) & colnames(sheet2) == colnames(sheet3) # rbind (row-bind) combines data frames by row -pantheria <- rbind(sheet1, sheet2, sheet3) ``` ```{r} # How many rows and columns are there? -nrow(pantheria) -ncol(pantheria) + ``` ```{r} # What does the data look like? -head(pantheria) + ``` @@ -88,66 +78,36 @@ We will exploring adult body mass from these mammals as it relates to their trop Let's start by subsetting the data with `select()` ```{r} -# Pipes (%>%) work by passing the data in front of the pipe to the first argument +# Pipes (|>) work by passing the data in front of the pipe to the first argument # of the function after it, this prevents a lot of nested function calls and makes # code easier to read. -pantheria <- pantheria %>% - select(Order, - Family, - Genus, - Species, - TrophicLevel, - AdultBodyMass_g) %>% - drop_na() %>% - distinct() - -pantheria <- select(pantheria, - Order, - Family, - Genus, - Species, - TrophicLevel, - AdultBodyMass_g) -pantheria <- drop_na(pantheria) -pantheria <- distinct(pantheria) - - - ``` Data is almost never clean, for example there should be only 3 trophic levels: ```{r} -# unique elements of a vector -unique(pantheria$TrophicLevel) + ``` Let's fix the TrophicLevel column using `mutate()` ```{r} # mutate allows us to add columns or modify existing ones -pantheria <- mutate(pantheria,TrophicLevel = tolower(TrophicLevel)) -pantheria %>% - mutate(AdultBodyMass_g = gsub("\\.", "-",AdultBodyMass_g)) ``` + ## Exercise 5: Summarizing data Now we can summarize the adult body mass by trophic level by computing standard metrics like mean and standard deviation. ```{r} -pantheria %>% - group_by(TrophicLevel) %>% - summarize(Mean = mean(AdultBodyMass_g), - Median = median(AdultBodyMass_g), - `Standard Deviation` = sd(AdultBodyMass_g), - Min = min(AdultBodyMass_g), - Max = max(AdultBodyMass_g)) + ``` + ## Exercise 6: Plotting Data According to the table above, body masses have a really wide range across trophic levels. Let's visualize the distribution of adult body masses. @@ -156,29 +116,13 @@ According to the table above, body masses have a really wide range across trophi # ggplot2 constructs graphics in layers, each layer is separated by "+" # x and y values are supplied in aes(), the type of plot is specified using # the "geom" functions -ggplot(data = pantheria, mapping = aes(x = log10(AdultBodyMass_g))) + - geom_histogram(bins = 50, fill ="#CE3274") + - xlab(label = "log10 Adult Body Mass") + - ylab(label = "Frequency") + - ggtitle(label = "Histogram of Adult Body Mass") + - theme_classic() + - theme(plot.title = element_text(hjust = .5, face = "bold"), - axis.title.x = element_text(face = "bold")) ``` -The data looks skewed even after log10 transformation. Let's view the distribution by trophic level. +The data looks skewed even after log2 transformation. Let's view the distribution by trophic level. ```{r} -pantheria %>% - ggplot(mapping = aes(x = log10(AdultBodyMass_g),fill = TrophicLevel)) + - geom_histogram(bins = 50) + - facet_grid(rows = vars(TrophicLevel)) + - xlab(label = "log10 Adult Body Mass") + - ylab(label = "Frequency") + - ggtitle(label = "Histogram of Adult Body Mass") + - theme_classic() + - theme(plot.title = element_text(hjust = .5)) + ``` It is clear that trophic level does have an impact on the distribution of adult body mass, carnivores tend to be smaller which makes sense because carnivores have higher metabolic demands and so there might be a selection pressure towards smaller carnivores. If we wanted to confirm this by fitting a model, we could use the `lm()` function to fit a linear model. @@ -189,10 +133,8 @@ An important caveat of the data is that some Orders of mammals are more biodiver ```{r} # Your code -pantheria %>% - group_by(Order) %>% - summarise(n = (n()/nrow(pantheria)) * 100) %>% - arrange(desc(n)) + + ``` @@ -232,49 +174,47 @@ pantheria %>% *Exercise 7.1 hint: group_by + summarize(n = n()) + arrange* -## Exercise 7.2: Hands on coding +## Exercise 7.2: Visualize Top Orders Now that we see what the over represented Orders are, we can plot their body masses by trophic level to see if they are skewing the overall distributions. ```{r} -# Character vector of the top 2 Orders from above -top_orders <- c("Rodentia", "Chiroptera") -# filter uses a conditional to select rows from the data - -pantheria %>% - filter(Order %in% top_orders) %>% - ggplot(mapping = aes(x = log10(AdultBodyMass_g), fill = Order)) + - geom_histogram(bins = ) + - facet_grid(rows = vars(TrophicLevel), - cols = vars(Order)) ``` It looks like one of them is mostly made up of small carnivores. Let's remove it and redo the plot of body mass distribution by trophic level. ```{r} -pantheria %>% - filter(Order != "Chiroptera") %>% - ggplot(mapping = aes(x = log10(AdultBodyMass_g),fill = TrophicLevel)) + - geom_histogram(bins = 40 ) + - facet_grid(rows = vars(TrophicLevel)) -pantheria %>% - ggplot(mapping = aes(x = log10(AdultBodyMass_g),fill = TrophicLevel)) ``` -We can see now that body mass of carnivorous mammals is much less skewed than the initial plots show. There is still an effect of trophic level on body mass, but the effect size is likely much smaller than we would estimate by including all `r sum(pantheria$Order == "Chiroptera")` *Chiroptera*. Now that we have generated these plots, we can generate a full report that contains all of the text and code, click `Knit` to render the HTML report. +We can see now that body mass of carnivorous mammals is much less skewed than the initial plots show. There is still an effect of trophic level on body mass, but the effect size is likely much smaller than we would estimate by including all `r sum(pantheria$Order == "Chiroptera")` *Chiroptera*. + + +## Exercise 7.3: Fit a linear model + +```{r} + +``` + + +```{r} + +``` + + +Now that we have generated these plots and results, we can generate a full report that contains all of the text and code, click `Knit` to render the HTML report. ## End of workshop exercises Hopefully this workshop has provided a good foundation for you to learn R. If you would like some additional practice, check out the resources on the [workshop wiki](https://github.com/gladstone-institutes/Bioinformatics-Workshops/wiki/Introduction-to-R-for-Data-Analysis). R also contains many built in datasets you can use for practice: ```{r, echo=FALSE} available_datasets <- data() -available_datasets$results %>% - as_tibble() %>% - select(-LibPath) %>% knitr::kable() +available_datasets$results |> + as_tibble() |> + select(-LibPath) |> knitr::kable() ``` diff --git a/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2.html b/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2.html deleted file mode 100644 index d25ea3e..0000000 --- a/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2.html +++ /dev/null @@ -1,1341 +0,0 @@ - - - - - - - - - - - - - - -Intro to R Data Analysis: Part 2 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

R Markdown

-

This is an R Markdown document. Markdown is a simple formatting -syntax for authoring HTML, PDF, and MS Word documents. For more details -on using R Markdown see http://rmarkdown.rstudio.com. Guide to markdown syntax -https://www.markdownguide.org/basic-syntax/.

-

When you click the Knit button a document will be -generated that includes both content as well as the output of any -embedded R code chunks within the document. You can embed an R code -chunk like this:

-
# Simulates 100 observations from a normal distribution
-# and plots a histogram
-val <- rnorm(n = 100)
-hist(val) 
-

-
# The ggplot version of the same plot
-  ggplot(data = tibble(values = val),
-         mapping = aes(x = values))+
-  geom_histogram(bins = 20)
-

-

Important: before running the code below, click Session -> -Set Working Directory -> To Source File Location

-
-
-

Exercise 3: Reading in Data

-

The data we will be analyzing is from the PanTHERIA database which is -“a global species-level data set of key life-history, ecological and -geographical traits of all known extant and recently extinct mammals -(PanTHERIA) developed for a number of macroecological and -macroevolutionary research projects.”

-
# The data is spread across 3 sheets in an excel file. We need to 
-# combine these data into one table/data frame.
-
-# na = "NA" tells read_xlsx how missing values appear in the data
-# the default is empty cells. Run "?read_xlsx" for more info
-sheet1 <- read_xlsx(path = "PanTHERIA.xlsx", sheet = 1, na = "NA")
-sheet2 <- read_xlsx(path = "PanTHERIA.xlsx", sheet = 2, na = "NA")
-sheet3 <- read_xlsx(path = "PanTHERIA.xlsx", sheet = 3, na = "NA")
-
-# rbind (row-bind) combines data frames by row
-pantheria <- rbind(sheet1, sheet2, sheet3)
-
# How many rows and columns are there?
-nrow(pantheria)
-
## [1] 2161
-
ncol(pantheria)
-
## [1] 54
-
# What does the data look like?
-head(pantheria)
-
## # A tibble: 6 × 54
-##   Order     Family          Genus Species Binomial ActivityCycle AdultBodyMass_g
-##   <chr>     <chr>           <chr> <chr>   <chr>    <chr>                   <dbl>
-## 1 Carnivora Canidae         Canis latrans Canis l… crepuscular            11989.
-## 2 Carnivora Canidae         Canis lupus   Canis l… crepuscular            31757.
-## 3 Carnivora Canidae         Canis simens… Canis s… diurnal                14362.
-## 4 Carnivora Canidae         Atel… microt… Atelocy… <NA>                    8363.
-## 5 Cetacea   Balaenopteridae Bala… muscul… Balaeno… <NA>               154321304.
-## 6 Cetacea   Balaenopteridae Bala… physal… Balaeno… <NA>                47506008.
-## # ℹ 47 more variables: AdultForearmLen_mm <dbl>, AdultHeadBodyLen_mm <dbl>,
-## #   AgeatEyeOpening_d <dbl>, AgeatFirstBirth_d <dbl>,
-## #   BasalMetRate_mLO2hr <dbl>, BasalMetRateMass_g <dbl>, DietBreadth <dbl>,
-## #   DispersalAge_d <dbl>, GestationLen_d <dbl>, HabitatBreadth <dbl>,
-## #   HomeRange_km2 <dbl>, HomeRange_Indiv_km2 <dbl>, InterbirthInterval_d <dbl>,
-## #   LitterSize <dbl>, LittersPerYear <dbl>, MaxLongevity_m <dbl>,
-## #   NeonateBodyMass_g <dbl>, NeonateHeadBodyLen_mm <dbl>, …
-
-
-

Exercise 4: Filtering and Reformatting Data

-

We will exploring adult body mass from these mammals as it relates to -their trophic level using dpylr and ggplot2. -Download the cheatsheets for these packages at the following links:

- -

Let’s start by subsetting the data with select()

-
# Pipes (%>%) work by passing the data in front of the pipe to the first argument
-# of the function after it, this prevents a lot of nested function calls and makes
-# code easier to read.
-
-pantheria <- pantheria %>%     # Passes pantheria as the first argument of select
-  select(Order,   
-         Family,               # select returns the specified columns
-         Genus,                
-         Species,
-         TrophicLevel,
-         AdultBodyMass_g) %>%
-  drop_na() %>%                # Remove any rows that have NAs
-  distinct()                   # Remove any duplicate rows
-

Data is almost never clean, for example there should be only 3 -trophic levels:

-
unique(pantheria$TrophicLevel) # unique elements of a vector
-
## [1] "carnivore" "herbivore" "omnivore"  "Omnivore"
-

Let’s fix the TrophicLevel column using mutate()

-
# mutate allows us to add columns or modify existing ones
-pantheria <- pantheria %>%
-  mutate(TrophicLevel = tolower(TrophicLevel)) # Make column lowercase
-
-
-

Exercise 5: Summarizing data

-

Now we can summarize the adult body mass by trophic level by -computing standard metrics like mean and standard deviation.

-
pantheria %>%
-  group_by(TrophicLevel) %>%                            # Group observations by this column
-  summarize(Mean = mean(AdultBodyMass_g),               # Summarize will calculate these group wise
-            `Standard Deviation` = sd(AdultBodyMass_g), # Quasi quotation lets us add spaces to column names
-            Min = min(AdultBodyMass_g),
-            Max = max(AdultBodyMass_g)) %>%
-  ungroup() %>%
-  arrange(desc(Mean))                                   # Order the data frame by descending mean body mass
-
## # A tibble: 3 × 5
-##   TrophicLevel    Mean `Standard Deviation`   Min        Max
-##   <chr>          <dbl>                <dbl> <dbl>      <dbl>
-## 1 carnivore    824948.             7963841.  1.96 154321304.
-## 2 omnivore      77419.             1173853.  3.29  27324024.
-## 3 herbivore     59854.              305498.  5.55   4750000.
-
-
-

Exercise 6: Plotting Data

-

According to the table above, body masses have a really wide range -across trophic levels. Let’s visualize the distribution of adult body -masses.

-
# ggplot2 constructs graphics in layers, each layer is separated by "+"
-# x and y values are supplied in aes(), the type of plot is specified using
-# the "geom" functions
-
-ggplot(data = pantheria,                             # input data
-       mapping =  aes(x = log10(AdultBodyMass_g))) + # log10 transform adult body mass
-  geom_histogram(fill = "#CE3274",                   # type of plot
-                 bins = 40) + 
-  xlab(label = "log10 Adult Body Mass (g)") +        # x label
-  ylab(label = "Frequency") +                        # y label
-  labs(title = "Histogram of log10 Adult Body Mass") # title 
-

-

The data looks skewed even after log10 transformation. Let’s view the -distribution by trophic level.

-
pantheria %>%
-  ggplot(aes(x = log10(AdultBodyMass_g), fill = TrophicLevel)) + # Color by trophic level
-  geom_histogram(bins = 40) +
-  facet_grid(rows = vars(TrophicLevel)) +                        # Split the plot into rows by trophic level
-  ylab(label = "Frequency") + 
-  xlab(label = "log10 Adult Body Mass (g)") + 
-  labs(title = "Histograms of log10 Adult Body Mass") +
-  theme(plot.title = element_text(hjust = 0.5))                  # Center the plot title
-

-

It is clear that trophic level does have an impact on the -distribution of adult body mass, carnivores tend to be smaller which -makes sense because carnivores have higher metabolic demands and so -there might be a selection pressure towards smaller carnivores. If we -wanted to confirm this by fitting a model, we could use the -lm() function to fit a linear model.

-
-
-

Exercise 7.1: Hands on coding

-

An important caveat of the data is that some Orders of mammals are -more biodiverse than others and are therefore over represented in the -dataset. Using the dplyr cheatsheet, write code generates a -table of Orders and what percentage of the data they are. Scroll down to -see the hint if you are having trouble.

-
# Your code 
-

Exercise 7.1 hint: group_by + summarize + arrange

-
-
-

Exercise 7.2: Hands on coding

-

Now that we see what the over represented Orders are, we can plot -their body masses by trophic level to see if they are skewing the -overall distributions.

-
top_orders <- c("Rodentia", "Chiroptera") # Character vector of the top 2 Orders from above
-# filter uses a conditional to select rows from the data
-pantheria %>%
-  filter(Order %in% top_orders) %>%
-  ggplot(aes(x = log10(AdultBodyMass_g), fill = Order)) +
-  geom_histogram(bins = 40) +
-  facet_grid(rows = vars(TrophicLevel),
-             cols = vars(Order)) +
-  ylab(label = "Frequency") + 
-  xlab(label = "log10 Adult Body Mass (g)") + 
-  theme(plot.title = element_text(hjust = 0.5))   
-

-

It looks like one of them is mostly made up of small cornivores. -Let’s remove it and redo the plot of body mass distribution by trophic -level.

-
pantheria %>%
-  filter(Order != "Chiroptera") %>%
-  ggplot(aes(x = log10(AdultBodyMass_g),fill = TrophicLevel)) + # Color by trophic level
-  geom_histogram(bins = 40) +
-  facet_grid(rows = vars(TrophicLevel)) +                       # Split the plot into rows by trophic level
-  ylab(label = "Frequency") + 
-  xlab(label = "log10 Adult Body Mass (g)") + 
-  labs(title = "Histograms of log10 Adult Body Mass")
-

-

We can see now that body mass of carnivorous mammals is much less -skewed than the initial plots show. There is still an effect of trophic -level on body mass, but the effect size is likely much smaller than we -would estimate by including all 393 Chiroptera. Now that we -have generated these plots, we can generate a full report that contains -all of the text and code, click Knit to render the HTML -report.

-
-
-

End of workshop exercises

-

Hopefully this workshop has provided a good foundation for you to -learn R. If you would like some additional practice, check out the -resources on the workshop -wiki. R also contains many built in datasets you can use for -practice:

- ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PackageItemTitle
ggplot2diamondsPrices of over 50,000 round cut diamonds
ggplot2economicsUS economic time series
ggplot2economics_longUS economic time series
ggplot2faithfuld2d density estimate of Old Faithful data
ggplot2luv_colours‘colors()’ in Luv space
ggplot2midwestMidwest demographics
ggplot2mpgFuel economy data from 1999 to 2008 for 38 popular -models of cars
ggplot2msleepAn updated and expanded version of the mammals sleep -dataset
ggplot2presidentialTerms of 12 presidents from Eisenhower to Trump
ggplot2sealsVector field of seal movements
ggplot2txhousingHousing sales in TX
tidyrbillboardSong rankings for Billboard top 100 in the year -2000
tidyrcms_patient_careData from the Centers for Medicare & Medicaid -Services
tidyrcms_patient_experienceData from the Centers for Medicare & Medicaid -Services
tidyrconstructionCompleted construction in the US in 2018
tidyrfish_encountersFish encounters
tidyrhouseholdHousehold data
tidyrpopulationWorld Health Organization TB data
tidyrrelig_incomePew religion and income survey
tidyrsmithsSome data about the Smith family
tidyrtable1Example tabular representations
tidyrtable2Example tabular representations
tidyrtable3Example tabular representations
tidyrtable4aExample tabular representations
tidyrtable4bExample tabular representations
tidyrtable5Example tabular representations
tidyrus_rent_incomeUS rent and income data
tidyrwhoWorld Health Organization TB data
tidyrwho2World Health Organization TB data
tidyrworld_bank_popPopulation data from the world bank
dplyrband_instrumentsBand membership
dplyrband_instruments2Band membership
dplyrband_membersBand membership
dplyrstarwarsStarwars characters
dplyrstormsStorm tracks data
datasetsAirPassengersMonthly Airline Passenger Numbers 1949-1960
datasetsBJsalesSales Data with Leading Indicator
datasetsBJsales.lead (BJsales)Sales Data with Leading Indicator
datasetsBODBiochemical Oxygen Demand
datasetsCO2Carbon Dioxide Uptake in Grass Plants
datasetsChickWeightWeight versus age of chicks on different diets
datasetsDNaseElisa assay of DNase
datasetsEuStockMarketsDaily Closing Prices of Major European Stock Indices, -1991-1998
datasetsFormaldehydeDetermination of Formaldehyde
datasetsHairEyeColorHair and Eye Color of Statistics Students
datasetsHarman23.corHarman Example 2.3
datasetsHarman74.corHarman Example 7.4
datasetsIndomethPharmacokinetics of Indomethacin
datasetsInsectSpraysEffectiveness of Insect Sprays
datasetsJohnsonJohnsonQuarterly Earnings per Johnson & Johnson Share
datasetsLakeHuronLevel of Lake Huron 1875-1972
datasetsLifeCycleSavingsIntercountry Life-Cycle Savings Data
datasetsLoblollyGrowth of Loblolly pine trees
datasetsNileFlow of the River Nile
datasetsOrangeGrowth of Orange Trees
datasetsOrchardSpraysPotency of Orchard Sprays
datasetsPlantGrowthResults from an Experiment on Plant Growth
datasetsPuromycinReaction Velocity of an Enzymatic Reaction
datasetsSeatbeltsRoad Casualties in Great Britain 1969-84
datasetsTheophPharmacokinetics of Theophylline
datasetsTitanicSurvival of passengers on the Titanic
datasetsToothGrowthThe Effect of Vitamin C on Tooth Growth in Guinea -Pigs
datasetsUCBAdmissionsStudent Admissions at UC Berkeley
datasetsUKDriverDeathsRoad Casualties in Great Britain 1969-84
datasetsUKgasUK Quarterly Gas Consumption
datasetsUSAccDeathsAccidental Deaths in the US 1973-1978
datasetsUSArrestsViolent Crime Rates by US State
datasetsUSJudgeRatingsLawyers’ Ratings of State Judges in the US Superior -Court
datasetsUSPersonalExpenditurePersonal Expenditure Data
datasetsUScitiesDDistances Between European Cities and Between US -Cities
datasetsVADeathsDeath Rates in Virginia (1940)
datasetsWWWusageInternet Usage per Minute
datasetsWorldPhonesThe World’s Telephones
datasetsability.covAbility and Intelligence Tests
datasetsairmilesPassenger Miles on Commercial US Airlines, -1937-1960
datasetsairqualityNew York Air Quality Measurements
datasetsanscombeAnscombe’s Quartet of ‘Identical’ Simple Linear -Regressions
datasetsattenuThe Joyner-Boore Attenuation Data
datasetsattitudeThe Chatterjee-Price Attitude Data
datasetsaustresQuarterly Time Series of the Number of Australian -Residents
datasetsbeaver1 (beavers)Body Temperature Series of Two Beavers
datasetsbeaver2 (beavers)Body Temperature Series of Two Beavers
datasetscarsSpeed and Stopping Distances of Cars
datasetschickwtsChicken Weights by Feed Type
datasetsco2Mauna Loa Atmospheric CO2 Concentration
datasetscrimtabStudent’s 3000 Criminals Data
datasetsdiscoveriesYearly Numbers of Important Discoveries
datasetsesophSmoking, Alcohol and (O)esophageal Cancer
datasetseuroConversion Rates of Euro Currencies
datasetseuro.cross (euro)Conversion Rates of Euro Currencies
datasetseurodistDistances Between European Cities and Between US -Cities
datasetsfaithfulOld Faithful Geyser Data
datasetsfdeaths (UKLungDeaths)Monthly Deaths from Lung Diseases in the UK
datasetsfreenyFreeny’s Revenue Data
datasetsfreeny.x (freeny)Freeny’s Revenue Data
datasetsfreeny.y (freeny)Freeny’s Revenue Data
datasetsinfertInfertility after Spontaneous and Induced Abortion
datasetsirisEdgar Anderson’s Iris Data
datasetsiris3Edgar Anderson’s Iris Data
datasetsislandsAreas of the World’s Major Landmasses
datasetsldeaths (UKLungDeaths)Monthly Deaths from Lung Diseases in the UK
datasetslhLuteinizing Hormone in Blood Samples
datasetslongleyLongley’s Economic Regression Data
datasetslynxAnnual Canadian Lynx trappings 1821-1934
datasetsmdeaths (UKLungDeaths)Monthly Deaths from Lung Diseases in the UK
datasetsmorleyMichelson Speed of Light Data
datasetsmtcarsMotor Trend Car Road Tests
datasetsnhtempAverage Yearly Temperatures in New Haven
datasetsnottemAverage Monthly Temperatures at Nottingham, -1920-1939
datasetsnpkClassical N, P, K Factorial Experiment
datasetsoccupationalStatusOccupational Status of Fathers and their Sons
datasetsprecipAnnual Precipitation in US Cities
datasetspresidentsQuarterly Approval Ratings of US Presidents
datasetspressureVapor Pressure of Mercury as a Function of -Temperature
datasetsquakesLocations of Earthquakes off Fiji
datasetsranduRandom Numbers from Congruential Generator RANDU
datasetsriversLengths of Major North American Rivers
datasetsrockMeasurements on Petroleum Rock Samples
datasetssleepStudent’s Sleep Data
datasetsstack.loss (stackloss)Brownlee’s Stack Loss Plant Data
datasetsstack.x (stackloss)Brownlee’s Stack Loss Plant Data
datasetsstacklossBrownlee’s Stack Loss Plant Data
datasetsstate.abb (state)US State Facts and Figures
datasetsstate.area (state)US State Facts and Figures
datasetsstate.center (state)US State Facts and Figures
datasetsstate.division (state)US State Facts and Figures
datasetsstate.name (state)US State Facts and Figures
datasetsstate.region (state)US State Facts and Figures
datasetsstate.x77 (state)US State Facts and Figures
datasetssunspot.monthMonthly Sunspot Data, from 1749 to “Present”
datasetssunspot.yearYearly Sunspot Data, 1700-1988
datasetssunspotsMonthly Sunspot Numbers, 1749-1983
datasetsswissSwiss Fertility and Socioeconomic Indicators (1888) -Data
datasetstreeringYearly Treering Data, -6000-1979
datasetstreesDiameter, Height and Volume for Black Cherry Trees
datasetsuspopPopulations Recorded by the US Census
datasetsvolcanoTopographic Information on Auckland’s Maunga Whau -Volcano
datasetswarpbreaksThe Number of Breaks in Yarn during Weaving
datasetswomenAverage Heights and Weights for American Women
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2_filled_out.Rmd b/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2_filled_out.Rmd index c97fae2..9de5dc0 100644 --- a/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2_filled_out.Rmd +++ b/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2_filled_out.Rmd @@ -10,9 +10,9 @@ knitr::opts_chunk$set(echo = TRUE) # Load packages library(dplyr) # tidyverse data frame manipulation package library(tidyr) # functions to help clean data -library(magrittr) # this package provides the pipe operator %>% library(readxl) # read excel files library(ggplot2) # highly customizable plots +library(broom) # clean up linear model results ``` ## R Markdown @@ -30,9 +30,9 @@ hist(val, breaks = 20) ```{r} # The ggplot version of the same plot - ggplot(data = tibble(values = val), - mapping = aes(x = values))+ - geom_histogram(bins = 20) +ggplot(data = tibble(values = val), + mapping = aes(x = values)) + +geom_histogram(bins = 20) ``` @@ -81,18 +81,18 @@ We will exploring adult body mass from these mammals as it relates to their trop Let's start by subsetting the data with `select()` ```{r} -# Pipes (%>%) work by passing the data in front of the pipe to the first argument +# Pipes (|>) work by passing the data in front of the pipe to the first argument # of the function after it, this prevents a lot of nested function calls and makes # code easier to read. -pantheria <- pantheria %>% # Passes pantheria as the first argument of select +pantheria <- pantheria |> # Passes pantheria as the first argument of select select(Order, Family, # select returns the specified columns Genus, Species, TrophicLevel, - AdultBodyMass_g) %>% - drop_na() %>% # Remove any rows that have NAs + AdultBodyMass_g) |> + drop_na() |> # Remove any rows that have NAs distinct() # Remove any duplicate rows ``` @@ -105,7 +105,7 @@ Let's fix the TrophicLevel column using `mutate()` ```{r} # mutate allows us to add columns or modify existing ones -pantheria <- pantheria %>% +pantheria <- pantheria |> mutate(TrophicLevel = tolower(TrophicLevel)) # Make column lowercase ``` @@ -115,13 +115,13 @@ pantheria <- pantheria %>% Now we can summarize the adult body mass by trophic level by computing standard metrics like mean and standard deviation. ```{r} -pantheria %>% - group_by(TrophicLevel) %>% # Group observations by this column +pantheria |> + group_by(TrophicLevel) |> # Group observations by this column summarize(Mean = mean(AdultBodyMass_g), # Summarize will calculate these group wise `Standard Deviation` = sd(AdultBodyMass_g), # Quasi quotation lets us add spaces to column names Min = min(AdultBodyMass_g), - Max = max(AdultBodyMass_g)) %>% - ungroup() %>% + Max = max(AdultBodyMass_g)) |> + ungroup() |> arrange(desc(Mean)) # Order the data frame by descending mean body mass ``` @@ -137,24 +137,24 @@ According to the table above, body masses have a really wide range across trophi # the "geom" functions ggplot(data = pantheria, # input data - mapping = aes(x = log10(AdultBodyMass_g))) + # log10 transform adult body mass + mapping = aes(x = log2(AdultBodyMass_g))) + # log2 transform adult body mass geom_histogram(fill = "#CE3274", # type of plot bins = 40) + - xlab(label = "log10 Adult Body Mass (g)") + # x label + xlab(label = "log2 Adult Body Mass (g)") + # x label ylab(label = "Frequency") + # y label - labs(title = "Histogram of log10 Adult Body Mass") # title + labs(title = "Histogram of log2 Adult Body Mass") # title ``` -The data looks skewed even after log10 transformation. Let's view the distribution by trophic level. +The data looks skewed even after log2 transformation. Let's view the distribution by trophic level. ```{r} -pantheria %>% - ggplot(aes(x = log10(AdultBodyMass_g), fill = TrophicLevel)) + # Color by trophic level +pantheria |> + ggplot(aes(x = log2(AdultBodyMass_g), fill = TrophicLevel)) + # Color by trophic level geom_histogram(bins = 40) + facet_grid(rows = vars(TrophicLevel)) + # Split the plot into rows by trophic level ylab(label = "Frequency") + - xlab(label = "log10 Adult Body Mass (g)") + - labs(title = "Histograms of log10 Adult Body Mass") + + xlab(label = "log2 Adult Body Mass (g)") + + labs(title = "Histograms of log2 Adult Body Mass") + theme(plot.title = element_text(hjust = 0.5)) # Center the plot title ``` @@ -166,9 +166,9 @@ An important caveat of the data is that some Orders of mammals are more biodiver ```{r} # Your code -pantheria %>% - group_by(Order) %>% - summarise(n = n()) %>% +pantheria |> + group_by(Order) |> + summarise(n = n()) |> arrange(desc(n)) ``` @@ -210,7 +210,7 @@ pantheria %>% *Exercise 7.1 hint: group_by + summarize(n = n()) + arrange* -## Exercise 7.2: Hands on coding +## Exercise 7.2: Visualize Top Orders Now that we see what the over represented Orders are, we can plot their body masses by trophic level to see if they are skewing the overall distributions. @@ -219,40 +219,66 @@ Now that we see what the over represented Orders are, we can plot their body mas top_orders <- c("Rodentia", "Chiroptera") # Character vector of the top 2 Orders from above # filter uses a conditional to select rows from the data -pantheria %>% - filter(Order %in% top_orders) %>% - ggplot(aes(x = log10(AdultBodyMass_g), fill = Order)) + +pantheria |> + filter(Order %in% top_orders) |> + ggplot(aes(x = log2(AdultBodyMass_g), fill = Order)) + geom_histogram(bins = 40) + facet_grid(rows = vars(TrophicLevel), cols = vars(Order)) + ylab(label = "Frequency") + - xlab(label = "log10 Adult Body Mass (g)") + + xlab(label = "log2 Adult Body Mass (g)") + theme(plot.title = element_text(hjust = 0.5)) ``` It looks like one of them is mostly made up of small carnivores. Let's remove it and redo the plot of body mass distribution by trophic level. ```{r} -pantheria %>% - filter(Order != "Chiroptera") %>% - ggplot(aes(x = log10(AdultBodyMass_g),fill = TrophicLevel)) + # Color by trophic level +pantheria |> + filter(Order != "Chiroptera") |> + ggplot(aes(x = log2(AdultBodyMass_g),fill = TrophicLevel)) + # Color by trophic level geom_histogram(bins = 40) + facet_grid(rows = vars(TrophicLevel)) + # Split the plot into rows by trophic level ylab(label = "Frequency") + - xlab(label = "log10 Adult Body Mass (g)") + - labs(title = "Histograms of log10 Adult Body Mass") + xlab(label = "log2 Adult Body Mass (g)") + + labs(title = "Histograms of log2 Adult Body Mass") ``` -We can see now that body mass of carnivorous mammals is much less skewed than the initial plots show. There is still an effect of trophic level on body mass, but the effect size is likely much smaller than we would estimate by including all `r sum(pantheria$Order == "Chiroptera")` *Chiroptera*. Now that we have generated these plots, we can generate a full report that contains all of the text and code, click `Knit` to render the HTML report. +We can see now that body mass of carnivorous mammals is much less skewed than the initial plots show. There is still an effect of trophic level on body mass, but the effect size is likely much smaller than we would estimate by including all `r sum(pantheria$Order == "Chiroptera")` *Chiroptera*. + + +## Exercise 7.3: Fit a linear model + +```{r} +# Set the reference level to herbivore +pantheria <- pantheria |> + mutate(TrophicLevel = factor(TrophicLevel, + levels = c("herbivore","omnivore","carnivore"))) + +lm(data = pantheria, log2(AdultBodyMass_g) ~ TrophicLevel) |> + tidy() +``` + + +```{r} +# Remove Chiroptera +pantheria <- pantheria |> + filter(Order != "Chiroptera") + +lm(data = pantheria, log2(AdultBodyMass_g) ~ TrophicLevel) |> + tidy() +``` + + +Now that we have generated these plots and results, we can generate a full report that contains all of the text and code, click `Knit` to render the HTML report. ## End of workshop exercises Hopefully this workshop has provided a good foundation for you to learn R. If you would like some additional practice, check out the resources on the [workshop wiki](https://github.com/gladstone-institutes/Bioinformatics-Workshops/wiki/Introduction-to-R-for-Data-Analysis). R also contains many built in datasets you can use for practice: ```{r, echo=FALSE} available_datasets <- data() -available_datasets$results %>% - as_tibble() %>% - select(-LibPath) %>% knitr::kable() +available_datasets$results |> + as_tibble() |> + select(-LibPath) |> knitr::kable() ``` diff --git a/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2_filled_out.html b/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2_filled_out.html index daa94b5..73ff8ff 100644 --- a/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2_filled_out.html +++ b/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2_filled_out.html @@ -10,7 +10,7 @@ - + Intro to R Data Analysis: Part 2 @@ -350,7 +350,7 @@ display: none;

Intro to R Data Analysis: Part 2

-

2023-03-29

+

2024-01-18

@@ -369,12 +369,12 @@ chunk like this:

# and plots a histogram val <- rnorm(n = 100) hist(val, breaks = 20) -

+

# The ggplot version of the same plot
-  ggplot(data = tibble(values = val),
-         mapping = aes(x = values))+
-  geom_histogram(bins = 20)
-

+ggplot(data = tibble(values = val), + mapping = aes(x = values)) + +geom_histogram(bins = 20) +

Important: before running the code below, click Session -> Set Working Directory -> To Source File Location

@@ -432,18 +432,18 @@ cheatsheet cheatsheet

Let’s start by subsetting the data with select()

-
# Pipes (%>%) work by passing the data in front of the pipe to the first argument
+
# Pipes (|>) work by passing the data in front of the pipe to the first argument
 # of the function after it, this prevents a lot of nested function calls and makes
 # code easier to read.
 
-pantheria <- pantheria %>%     # Passes pantheria as the first argument of select
+pantheria <- pantheria |>     # Passes pantheria as the first argument of select
   select(Order,   
          Family,               # select returns the specified columns
          Genus,                
          Species,
          TrophicLevel,
-         AdultBodyMass_g) %>%
-  drop_na() %>%                # Remove any rows that have NAs
+         AdultBodyMass_g) |>
+  drop_na() |>                # Remove any rows that have NAs
   distinct()                   # Remove any duplicate rows

Data is almost never clean, for example there should be only 3 trophic levels:

@@ -451,20 +451,20 @@ trophic levels:

## [1] "carnivore" "herbivore" "omnivore"  "Omnivore"

Let’s fix the TrophicLevel column using mutate()

# mutate allows us to add columns or modify existing ones
-pantheria <- pantheria %>%
+pantheria <- pantheria |>
   mutate(TrophicLevel = tolower(TrophicLevel)) # Make column lowercase

Exercise 5: Summarizing data

Now we can summarize the adult body mass by trophic level by computing standard metrics like mean and standard deviation.

-
pantheria %>%
-  group_by(TrophicLevel) %>%                            # Group observations by this column
+
pantheria |>
+  group_by(TrophicLevel) |>                            # Group observations by this column
   summarize(Mean = mean(AdultBodyMass_g),               # Summarize will calculate these group wise
             `Standard Deviation` = sd(AdultBodyMass_g), # Quasi quotation lets us add spaces to column names
             Min = min(AdultBodyMass_g),
-            Max = max(AdultBodyMass_g)) %>%
-  ungroup() %>%
+            Max = max(AdultBodyMass_g)) |>
+  ungroup() |>
   arrange(desc(Mean))                                   # Order the data frame by descending mean body mass
## # A tibble: 3 × 5
 ##   TrophicLevel    Mean `Standard Deviation`   Min        Max
@@ -483,24 +483,24 @@ masses.

# the "geom" functions ggplot(data = pantheria, # input data - mapping = aes(x = log10(AdultBodyMass_g))) + # log10 transform adult body mass + mapping = aes(x = log2(AdultBodyMass_g))) + # log2 transform adult body mass geom_histogram(fill = "#CE3274", # type of plot bins = 40) + - xlab(label = "log10 Adult Body Mass (g)") + # x label + xlab(label = "log2 Adult Body Mass (g)") + # x label ylab(label = "Frequency") + # y label - labs(title = "Histogram of log10 Adult Body Mass") # title
-

-

The data looks skewed even after log10 transformation. Let’s view the + labs(title = "Histogram of log2 Adult Body Mass") # title

+

+

The data looks skewed even after log2 transformation. Let’s view the distribution by trophic level.

-
pantheria %>%
-  ggplot(aes(x = log10(AdultBodyMass_g), fill = TrophicLevel)) + # Color by trophic level
+
pantheria |>
+  ggplot(aes(x = log2(AdultBodyMass_g), fill = TrophicLevel)) + # Color by trophic level
   geom_histogram(bins = 40) +
   facet_grid(rows = vars(TrophicLevel)) +                        # Split the plot into rows by trophic level
   ylab(label = "Frequency") + 
-  xlab(label = "log10 Adult Body Mass (g)") + 
-  labs(title = "Histograms of log10 Adult Body Mass") +
+  xlab(label = "log2 Adult Body Mass (g)") + 
+  labs(title = "Histograms of log2 Adult Body Mass") +
   theme(plot.title = element_text(hjust = 0.5))                  # Center the plot title
-

+

It is clear that trophic level does have an impact on the distribution of adult body mass, carnivores tend to be smaller which makes sense because carnivores have higher metabolic demands and so @@ -516,9 +516,9 @@ dataset. Using the dplyr cheatsheet, write code generates a table of Orders and what percentage of the data they are. Scroll down to see the hint if you are having trouble.

# Your code 
-pantheria %>%
-  group_by(Order) %>%
-  summarise(n = n()) %>%
+pantheria |>
+  group_by(Order) |>
+  summarise(n = n()) |>
   arrange(desc(n))
## # A tibble: 29 × 2
 ##    Order               n
@@ -537,42 +537,70 @@ pantheria %>%
 

Exercise 7.1 hint: group_by + summarize(n = n()) + arrange

-
-

Exercise 7.2: Hands on coding

+
+

Exercise 7.2: Visualize Top Orders

Now that we see what the over represented Orders are, we can plot their body masses by trophic level to see if they are skewing the overall distributions.

top_orders <- c("Rodentia", "Chiroptera") # Character vector of the top 2 Orders from above
 # filter uses a conditional to select rows from the data
-pantheria %>%
-  filter(Order %in% top_orders) %>%
-  ggplot(aes(x = log10(AdultBodyMass_g), fill = Order)) +
+pantheria |>
+  filter(Order %in% top_orders) |>
+  ggplot(aes(x = log2(AdultBodyMass_g), fill = Order)) +
   geom_histogram(bins = 40) +
   facet_grid(rows = vars(TrophicLevel),
              cols = vars(Order)) +
   ylab(label = "Frequency") + 
-  xlab(label = "log10 Adult Body Mass (g)") + 
+  xlab(label = "log2 Adult Body Mass (g)") + 
   theme(plot.title = element_text(hjust = 0.5))   
-

-

It looks like one of them is mostly made up of small cornivores. +

+

It looks like one of them is mostly made up of small carnivores. Let’s remove it and redo the plot of body mass distribution by trophic level.

-
pantheria %>%
-  filter(Order != "Chiroptera") %>%
-  ggplot(aes(x = log10(AdultBodyMass_g),fill = TrophicLevel)) + # Color by trophic level
+
pantheria |>
+  filter(Order != "Chiroptera") |>
+  ggplot(aes(x = log2(AdultBodyMass_g),fill = TrophicLevel)) + # Color by trophic level
   geom_histogram(bins = 40) +
   facet_grid(rows = vars(TrophicLevel)) +                       # Split the plot into rows by trophic level
   ylab(label = "Frequency") + 
-  xlab(label = "log10 Adult Body Mass (g)") + 
-  labs(title = "Histograms of log10 Adult Body Mass")
-

+ xlab(label = "log2 Adult Body Mass (g)") + + labs(title = "Histograms of log2 Adult Body Mass")
+

We can see now that body mass of carnivorous mammals is much less skewed than the initial plots show. There is still an effect of trophic level on body mass, but the effect size is likely much smaller than we -would estimate by including all 393 Chiroptera. Now that we -have generated these plots, we can generate a full report that contains -all of the text and code, click Knit to render the HTML -report.

+would estimate by including all 393 Chiroptera.

+
+
+

Exercise 7.3: Fit a linear model

+
# Set the reference level to herbivore
+pantheria <- pantheria |>
+  mutate(TrophicLevel = factor(TrophicLevel,
+                               levels = c("herbivore","omnivore","carnivore")))
+
+lm(data = pantheria, log2(AdultBodyMass_g) ~ TrophicLevel) |>
+  tidy()
+
## # A tibble: 3 × 5
+##   term                  estimate std.error statistic  p.value
+##   <chr>                    <dbl>     <dbl>     <dbl>    <dbl>
+## 1 (Intercept)              10.2      0.166     61.6  0       
+## 2 TrophicLevelomnivore     -1.72     0.238     -7.22 7.52e-13
+## 3 TrophicLevelcarnivore    -2.82     0.253    -11.2  4.66e-28
+
# Remove Chiroptera
+pantheria <- pantheria |>
+  filter(Order != "Chiroptera")
+
+lm(data = pantheria, log2(AdultBodyMass_g) ~ TrophicLevel) |>
+  tidy()
+
## # A tibble: 3 × 5
+##   term                  estimate std.error statistic  p.value
+##   <chr>                    <dbl>     <dbl>     <dbl>    <dbl>
+## 1 (Intercept)             11.0       0.173     63.6  0       
+## 2 TrophicLevelomnivore    -2.08      0.243     -8.53 3.26e-17
+## 3 TrophicLevelcarnivore   -0.813     0.296     -2.74 6.16e- 3
+

Now that we have generated these plots and results, we can generate a +full report that contains all of the text and code, click +Knit to render the HTML report.

End of workshop exercises