diff --git a/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2.Rmd b/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2.Rmd old mode 100755 new mode 100644 index 960c09c..0ba2db5 --- a/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2.Rmd +++ b/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2.Rmd @@ -10,9 +10,9 @@ knitr::opts_chunk$set(echo = TRUE) # Load packages library(dplyr) # tidyverse data frame manipulation package library(tidyr) # functions to help clean data -library(magrittr) # this package provides the pipe operator %>% library(readxl) # read excel files library(ggplot2) # highly customizable plots +library(broom) # clean up linear model results ``` ## R Markdown @@ -30,11 +30,9 @@ hist(val, breaks = 20) ```{r} # The ggplot version of the same plot - ggplot(data = tibble(values = val), - mapping = aes(x = values))+ - geom_histogram(bins = 20) - + mapping = aes(x = values)) + +geom_histogram(bins = 20) ``` @@ -53,29 +51,21 @@ The data we will be analyzing is from the PanTHERIA database which is "a global # na = "NA" tells read_xlsx how missing values appear in the data # the default is empty cells. Run "?read_xlsx" for more info -sheet1 <- read_xlsx(path = "PanTHERIA.xlsx", sheet = 1, na = "NA") -sheet2 <- read_xlsx(path = "PanTHERIA.xlsx", sheet = 2, na = "NA") -sheet3 <- read_xlsx(path = "PanTHERIA.xlsx", sheet = 3, na = "NA") - - -colnames(sheet1) == colnames(sheet2) & colnames(sheet2) == colnames(sheet3) # rbind (row-bind) combines data frames by row -pantheria <- rbind(sheet1, sheet2, sheet3) ``` ```{r} # How many rows and columns are there? -nrow(pantheria) -ncol(pantheria) + ``` ```{r} # What does the data look like? -head(pantheria) + ``` @@ -88,66 +78,36 @@ We will exploring adult body mass from these mammals as it relates to their trop Let's start by subsetting the data with `select()` ```{r} -# Pipes (%>%) work by passing the data in front of the pipe to the first argument +# Pipes (|>) work by passing the data in front of the pipe to the first argument # of the function after it, this prevents a lot of nested function calls and makes # code easier to read. -pantheria <- pantheria %>% - select(Order, - Family, - Genus, - Species, - TrophicLevel, - AdultBodyMass_g) %>% - drop_na() %>% - distinct() - -pantheria <- select(pantheria, - Order, - Family, - Genus, - Species, - TrophicLevel, - AdultBodyMass_g) -pantheria <- drop_na(pantheria) -pantheria <- distinct(pantheria) - - - ``` Data is almost never clean, for example there should be only 3 trophic levels: ```{r} -# unique elements of a vector -unique(pantheria$TrophicLevel) + ``` Let's fix the TrophicLevel column using `mutate()` ```{r} # mutate allows us to add columns or modify existing ones -pantheria <- mutate(pantheria,TrophicLevel = tolower(TrophicLevel)) -pantheria %>% - mutate(AdultBodyMass_g = gsub("\\.", "-",AdultBodyMass_g)) ``` + ## Exercise 5: Summarizing data Now we can summarize the adult body mass by trophic level by computing standard metrics like mean and standard deviation. ```{r} -pantheria %>% - group_by(TrophicLevel) %>% - summarize(Mean = mean(AdultBodyMass_g), - Median = median(AdultBodyMass_g), - `Standard Deviation` = sd(AdultBodyMass_g), - Min = min(AdultBodyMass_g), - Max = max(AdultBodyMass_g)) + ``` + ## Exercise 6: Plotting Data According to the table above, body masses have a really wide range across trophic levels. Let's visualize the distribution of adult body masses. @@ -156,29 +116,13 @@ According to the table above, body masses have a really wide range across trophi # ggplot2 constructs graphics in layers, each layer is separated by "+" # x and y values are supplied in aes(), the type of plot is specified using # the "geom" functions -ggplot(data = pantheria, mapping = aes(x = log10(AdultBodyMass_g))) + - geom_histogram(bins = 50, fill ="#CE3274") + - xlab(label = "log10 Adult Body Mass") + - ylab(label = "Frequency") + - ggtitle(label = "Histogram of Adult Body Mass") + - theme_classic() + - theme(plot.title = element_text(hjust = .5, face = "bold"), - axis.title.x = element_text(face = "bold")) ``` -The data looks skewed even after log10 transformation. Let's view the distribution by trophic level. +The data looks skewed even after log2 transformation. Let's view the distribution by trophic level. ```{r} -pantheria %>% - ggplot(mapping = aes(x = log10(AdultBodyMass_g),fill = TrophicLevel)) + - geom_histogram(bins = 50) + - facet_grid(rows = vars(TrophicLevel)) + - xlab(label = "log10 Adult Body Mass") + - ylab(label = "Frequency") + - ggtitle(label = "Histogram of Adult Body Mass") + - theme_classic() + - theme(plot.title = element_text(hjust = .5)) + ``` It is clear that trophic level does have an impact on the distribution of adult body mass, carnivores tend to be smaller which makes sense because carnivores have higher metabolic demands and so there might be a selection pressure towards smaller carnivores. If we wanted to confirm this by fitting a model, we could use the `lm()` function to fit a linear model. @@ -189,10 +133,8 @@ An important caveat of the data is that some Orders of mammals are more biodiver ```{r} # Your code -pantheria %>% - group_by(Order) %>% - summarise(n = (n()/nrow(pantheria)) * 100) %>% - arrange(desc(n)) + + ``` @@ -232,49 +174,47 @@ pantheria %>% *Exercise 7.1 hint: group_by + summarize(n = n()) + arrange* -## Exercise 7.2: Hands on coding +## Exercise 7.2: Visualize Top Orders Now that we see what the over represented Orders are, we can plot their body masses by trophic level to see if they are skewing the overall distributions. ```{r} -# Character vector of the top 2 Orders from above -top_orders <- c("Rodentia", "Chiroptera") -# filter uses a conditional to select rows from the data - -pantheria %>% - filter(Order %in% top_orders) %>% - ggplot(mapping = aes(x = log10(AdultBodyMass_g), fill = Order)) + - geom_histogram(bins = ) + - facet_grid(rows = vars(TrophicLevel), - cols = vars(Order)) ``` It looks like one of them is mostly made up of small carnivores. Let's remove it and redo the plot of body mass distribution by trophic level. ```{r} -pantheria %>% - filter(Order != "Chiroptera") %>% - ggplot(mapping = aes(x = log10(AdultBodyMass_g),fill = TrophicLevel)) + - geom_histogram(bins = 40 ) + - facet_grid(rows = vars(TrophicLevel)) -pantheria %>% - ggplot(mapping = aes(x = log10(AdultBodyMass_g),fill = TrophicLevel)) ``` -We can see now that body mass of carnivorous mammals is much less skewed than the initial plots show. There is still an effect of trophic level on body mass, but the effect size is likely much smaller than we would estimate by including all `r sum(pantheria$Order == "Chiroptera")` *Chiroptera*. Now that we have generated these plots, we can generate a full report that contains all of the text and code, click `Knit` to render the HTML report. +We can see now that body mass of carnivorous mammals is much less skewed than the initial plots show. There is still an effect of trophic level on body mass, but the effect size is likely much smaller than we would estimate by including all `r sum(pantheria$Order == "Chiroptera")` *Chiroptera*. + + +## Exercise 7.3: Fit a linear model + +```{r} + +``` + + +```{r} + +``` + + +Now that we have generated these plots and results, we can generate a full report that contains all of the text and code, click `Knit` to render the HTML report. ## End of workshop exercises Hopefully this workshop has provided a good foundation for you to learn R. If you would like some additional practice, check out the resources on the [workshop wiki](https://github.com/gladstone-institutes/Bioinformatics-Workshops/wiki/Introduction-to-R-for-Data-Analysis). R also contains many built in datasets you can use for practice: ```{r, echo=FALSE} available_datasets <- data() -available_datasets$results %>% - as_tibble() %>% - select(-LibPath) %>% knitr::kable() +available_datasets$results |> + as_tibble() |> + select(-LibPath) |> knitr::kable() ``` diff --git a/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2.html b/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2.html deleted file mode 100644 index d25ea3e..0000000 --- a/intro-r-data-analysis/Intro_to_R_workshop_materials/part_2.html +++ /dev/null @@ -1,1341 +0,0 @@ - - - - -
- - - - - - - - - -This is an R Markdown document. Markdown is a simple formatting -syntax for authoring HTML, PDF, and MS Word documents. For more details -on using R Markdown see http://rmarkdown.rstudio.com. Guide to markdown syntax -https://www.markdownguide.org/basic-syntax/.
-When you click the Knit button a document will be -generated that includes both content as well as the output of any -embedded R code chunks within the document. You can embed an R code -chunk like this:
-# Simulates 100 observations from a normal distribution
-# and plots a histogram
-val <- rnorm(n = 100)
-hist(val)
-# The ggplot version of the same plot
- ggplot(data = tibble(values = val),
- mapping = aes(x = values))+
- geom_histogram(bins = 20)
-Important: before running the code below, click Session -> -Set Working Directory -> To Source File Location
-The data we will be analyzing is from the PanTHERIA database which is -“a global species-level data set of key life-history, ecological and -geographical traits of all known extant and recently extinct mammals -(PanTHERIA) developed for a number of macroecological and -macroevolutionary research projects.”
-# The data is spread across 3 sheets in an excel file. We need to
-# combine these data into one table/data frame.
-
-# na = "NA" tells read_xlsx how missing values appear in the data
-# the default is empty cells. Run "?read_xlsx" for more info
-sheet1 <- read_xlsx(path = "PanTHERIA.xlsx", sheet = 1, na = "NA")
-sheet2 <- read_xlsx(path = "PanTHERIA.xlsx", sheet = 2, na = "NA")
-sheet3 <- read_xlsx(path = "PanTHERIA.xlsx", sheet = 3, na = "NA")
-
-# rbind (row-bind) combines data frames by row
-pantheria <- rbind(sheet1, sheet2, sheet3)
-# How many rows and columns are there?
-nrow(pantheria)
-## [1] 2161
-ncol(pantheria)
-## [1] 54
-# What does the data look like?
-head(pantheria)
-## # A tibble: 6 × 54
-## Order Family Genus Species Binomial ActivityCycle AdultBodyMass_g
-## <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
-## 1 Carnivora Canidae Canis latrans Canis l… crepuscular 11989.
-## 2 Carnivora Canidae Canis lupus Canis l… crepuscular 31757.
-## 3 Carnivora Canidae Canis simens… Canis s… diurnal 14362.
-## 4 Carnivora Canidae Atel… microt… Atelocy… <NA> 8363.
-## 5 Cetacea Balaenopteridae Bala… muscul… Balaeno… <NA> 154321304.
-## 6 Cetacea Balaenopteridae Bala… physal… Balaeno… <NA> 47506008.
-## # ℹ 47 more variables: AdultForearmLen_mm <dbl>, AdultHeadBodyLen_mm <dbl>,
-## # AgeatEyeOpening_d <dbl>, AgeatFirstBirth_d <dbl>,
-## # BasalMetRate_mLO2hr <dbl>, BasalMetRateMass_g <dbl>, DietBreadth <dbl>,
-## # DispersalAge_d <dbl>, GestationLen_d <dbl>, HabitatBreadth <dbl>,
-## # HomeRange_km2 <dbl>, HomeRange_Indiv_km2 <dbl>, InterbirthInterval_d <dbl>,
-## # LitterSize <dbl>, LittersPerYear <dbl>, MaxLongevity_m <dbl>,
-## # NeonateBodyMass_g <dbl>, NeonateHeadBodyLen_mm <dbl>, …
-We will exploring adult body mass from these mammals as it relates to
-their trophic level using dpylr and ggplot2.
-Download the cheatsheets for these packages at the following links:
Let’s start by subsetting the data with select()
# Pipes (%>%) work by passing the data in front of the pipe to the first argument
-# of the function after it, this prevents a lot of nested function calls and makes
-# code easier to read.
-
-pantheria <- pantheria %>% # Passes pantheria as the first argument of select
- select(Order,
- Family, # select returns the specified columns
- Genus,
- Species,
- TrophicLevel,
- AdultBodyMass_g) %>%
- drop_na() %>% # Remove any rows that have NAs
- distinct() # Remove any duplicate rows
-Data is almost never clean, for example there should be only 3 -trophic levels:
-unique(pantheria$TrophicLevel) # unique elements of a vector
-## [1] "carnivore" "herbivore" "omnivore" "Omnivore"
-Let’s fix the TrophicLevel column using mutate()
# mutate allows us to add columns or modify existing ones
-pantheria <- pantheria %>%
- mutate(TrophicLevel = tolower(TrophicLevel)) # Make column lowercase
-Now we can summarize the adult body mass by trophic level by -computing standard metrics like mean and standard deviation.
-pantheria %>%
- group_by(TrophicLevel) %>% # Group observations by this column
- summarize(Mean = mean(AdultBodyMass_g), # Summarize will calculate these group wise
- `Standard Deviation` = sd(AdultBodyMass_g), # Quasi quotation lets us add spaces to column names
- Min = min(AdultBodyMass_g),
- Max = max(AdultBodyMass_g)) %>%
- ungroup() %>%
- arrange(desc(Mean)) # Order the data frame by descending mean body mass
-## # A tibble: 3 × 5
-## TrophicLevel Mean `Standard Deviation` Min Max
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 carnivore 824948. 7963841. 1.96 154321304.
-## 2 omnivore 77419. 1173853. 3.29 27324024.
-## 3 herbivore 59854. 305498. 5.55 4750000.
-According to the table above, body masses have a really wide range -across trophic levels. Let’s visualize the distribution of adult body -masses.
-# ggplot2 constructs graphics in layers, each layer is separated by "+"
-# x and y values are supplied in aes(), the type of plot is specified using
-# the "geom" functions
-
-ggplot(data = pantheria, # input data
- mapping = aes(x = log10(AdultBodyMass_g))) + # log10 transform adult body mass
- geom_histogram(fill = "#CE3274", # type of plot
- bins = 40) +
- xlab(label = "log10 Adult Body Mass (g)") + # x label
- ylab(label = "Frequency") + # y label
- labs(title = "Histogram of log10 Adult Body Mass") # title
-The data looks skewed even after log10 transformation. Let’s view the -distribution by trophic level.
-pantheria %>%
- ggplot(aes(x = log10(AdultBodyMass_g), fill = TrophicLevel)) + # Color by trophic level
- geom_histogram(bins = 40) +
- facet_grid(rows = vars(TrophicLevel)) + # Split the plot into rows by trophic level
- ylab(label = "Frequency") +
- xlab(label = "log10 Adult Body Mass (g)") +
- labs(title = "Histograms of log10 Adult Body Mass") +
- theme(plot.title = element_text(hjust = 0.5)) # Center the plot title
-It is clear that trophic level does have an impact on the
-distribution of adult body mass, carnivores tend to be smaller which
-makes sense because carnivores have higher metabolic demands and so
-there might be a selection pressure towards smaller carnivores. If we
-wanted to confirm this by fitting a model, we could use the
-lm() function to fit a linear model.
An important caveat of the data is that some Orders of mammals are
-more biodiverse than others and are therefore over represented in the
-dataset. Using the dplyr cheatsheet, write code generates a
-table of Orders and what percentage of the data they are. Scroll down to
-see the hint if you are having trouble.
# Your code
-Exercise 7.1 hint: group_by + summarize + arrange
-Now that we see what the over represented Orders are, we can plot -their body masses by trophic level to see if they are skewing the -overall distributions.
-top_orders <- c("Rodentia", "Chiroptera") # Character vector of the top 2 Orders from above
-# filter uses a conditional to select rows from the data
-pantheria %>%
- filter(Order %in% top_orders) %>%
- ggplot(aes(x = log10(AdultBodyMass_g), fill = Order)) +
- geom_histogram(bins = 40) +
- facet_grid(rows = vars(TrophicLevel),
- cols = vars(Order)) +
- ylab(label = "Frequency") +
- xlab(label = "log10 Adult Body Mass (g)") +
- theme(plot.title = element_text(hjust = 0.5))
-It looks like one of them is mostly made up of small cornivores. -Let’s remove it and redo the plot of body mass distribution by trophic -level.
-pantheria %>%
- filter(Order != "Chiroptera") %>%
- ggplot(aes(x = log10(AdultBodyMass_g),fill = TrophicLevel)) + # Color by trophic level
- geom_histogram(bins = 40) +
- facet_grid(rows = vars(TrophicLevel)) + # Split the plot into rows by trophic level
- ylab(label = "Frequency") +
- xlab(label = "log10 Adult Body Mass (g)") +
- labs(title = "Histograms of log10 Adult Body Mass")
-We can see now that body mass of carnivorous mammals is much less
-skewed than the initial plots show. There is still an effect of trophic
-level on body mass, but the effect size is likely much smaller than we
-would estimate by including all 393 Chiroptera. Now that we
-have generated these plots, we can generate a full report that contains
-all of the text and code, click Knit to render the HTML
-report.
Hopefully this workshop has provided a good foundation for you to -learn R. If you would like some additional practice, check out the -resources on the workshop -wiki. R also contains many built in datasets you can use for -practice:
-| Package | -Item | -Title | -
|---|---|---|
| ggplot2 | -diamonds | -Prices of over 50,000 round cut diamonds | -
| ggplot2 | -economics | -US economic time series | -
| ggplot2 | -economics_long | -US economic time series | -
| ggplot2 | -faithfuld | -2d density estimate of Old Faithful data | -
| ggplot2 | -luv_colours | -‘colors()’ in Luv space | -
| ggplot2 | -midwest | -Midwest demographics | -
| ggplot2 | -mpg | -Fuel economy data from 1999 to 2008 for 38 popular -models of cars | -
| ggplot2 | -msleep | -An updated and expanded version of the mammals sleep -dataset | -
| ggplot2 | -presidential | -Terms of 12 presidents from Eisenhower to Trump | -
| ggplot2 | -seals | -Vector field of seal movements | -
| ggplot2 | -txhousing | -Housing sales in TX | -
| tidyr | -billboard | -Song rankings for Billboard top 100 in the year -2000 | -
| tidyr | -cms_patient_care | -Data from the Centers for Medicare & Medicaid -Services | -
| tidyr | -cms_patient_experience | -Data from the Centers for Medicare & Medicaid -Services | -
| tidyr | -construction | -Completed construction in the US in 2018 | -
| tidyr | -fish_encounters | -Fish encounters | -
| tidyr | -household | -Household data | -
| tidyr | -population | -World Health Organization TB data | -
| tidyr | -relig_income | -Pew religion and income survey | -
| tidyr | -smiths | -Some data about the Smith family | -
| tidyr | -table1 | -Example tabular representations | -
| tidyr | -table2 | -Example tabular representations | -
| tidyr | -table3 | -Example tabular representations | -
| tidyr | -table4a | -Example tabular representations | -
| tidyr | -table4b | -Example tabular representations | -
| tidyr | -table5 | -Example tabular representations | -
| tidyr | -us_rent_income | -US rent and income data | -
| tidyr | -who | -World Health Organization TB data | -
| tidyr | -who2 | -World Health Organization TB data | -
| tidyr | -world_bank_pop | -Population data from the world bank | -
| dplyr | -band_instruments | -Band membership | -
| dplyr | -band_instruments2 | -Band membership | -
| dplyr | -band_members | -Band membership | -
| dplyr | -starwars | -Starwars characters | -
| dplyr | -storms | -Storm tracks data | -
| datasets | -AirPassengers | -Monthly Airline Passenger Numbers 1949-1960 | -
| datasets | -BJsales | -Sales Data with Leading Indicator | -
| datasets | -BJsales.lead (BJsales) | -Sales Data with Leading Indicator | -
| datasets | -BOD | -Biochemical Oxygen Demand | -
| datasets | -CO2 | -Carbon Dioxide Uptake in Grass Plants | -
| datasets | -ChickWeight | -Weight versus age of chicks on different diets | -
| datasets | -DNase | -Elisa assay of DNase | -
| datasets | -EuStockMarkets | -Daily Closing Prices of Major European Stock Indices, -1991-1998 | -
| datasets | -Formaldehyde | -Determination of Formaldehyde | -
| datasets | -HairEyeColor | -Hair and Eye Color of Statistics Students | -
| datasets | -Harman23.cor | -Harman Example 2.3 | -
| datasets | -Harman74.cor | -Harman Example 7.4 | -
| datasets | -Indometh | -Pharmacokinetics of Indomethacin | -
| datasets | -InsectSprays | -Effectiveness of Insect Sprays | -
| datasets | -JohnsonJohnson | -Quarterly Earnings per Johnson & Johnson Share | -
| datasets | -LakeHuron | -Level of Lake Huron 1875-1972 | -
| datasets | -LifeCycleSavings | -Intercountry Life-Cycle Savings Data | -
| datasets | -Loblolly | -Growth of Loblolly pine trees | -
| datasets | -Nile | -Flow of the River Nile | -
| datasets | -Orange | -Growth of Orange Trees | -
| datasets | -OrchardSprays | -Potency of Orchard Sprays | -
| datasets | -PlantGrowth | -Results from an Experiment on Plant Growth | -
| datasets | -Puromycin | -Reaction Velocity of an Enzymatic Reaction | -
| datasets | -Seatbelts | -Road Casualties in Great Britain 1969-84 | -
| datasets | -Theoph | -Pharmacokinetics of Theophylline | -
| datasets | -Titanic | -Survival of passengers on the Titanic | -
| datasets | -ToothGrowth | -The Effect of Vitamin C on Tooth Growth in Guinea -Pigs | -
| datasets | -UCBAdmissions | -Student Admissions at UC Berkeley | -
| datasets | -UKDriverDeaths | -Road Casualties in Great Britain 1969-84 | -
| datasets | -UKgas | -UK Quarterly Gas Consumption | -
| datasets | -USAccDeaths | -Accidental Deaths in the US 1973-1978 | -
| datasets | -USArrests | -Violent Crime Rates by US State | -
| datasets | -USJudgeRatings | -Lawyers’ Ratings of State Judges in the US Superior -Court | -
| datasets | -USPersonalExpenditure | -Personal Expenditure Data | -
| datasets | -UScitiesD | -Distances Between European Cities and Between US -Cities | -
| datasets | -VADeaths | -Death Rates in Virginia (1940) | -
| datasets | -WWWusage | -Internet Usage per Minute | -
| datasets | -WorldPhones | -The World’s Telephones | -
| datasets | -ability.cov | -Ability and Intelligence Tests | -
| datasets | -airmiles | -Passenger Miles on Commercial US Airlines, -1937-1960 | -
| datasets | -airquality | -New York Air Quality Measurements | -
| datasets | -anscombe | -Anscombe’s Quartet of ‘Identical’ Simple Linear -Regressions | -
| datasets | -attenu | -The Joyner-Boore Attenuation Data | -
| datasets | -attitude | -The Chatterjee-Price Attitude Data | -
| datasets | -austres | -Quarterly Time Series of the Number of Australian -Residents | -
| datasets | -beaver1 (beavers) | -Body Temperature Series of Two Beavers | -
| datasets | -beaver2 (beavers) | -Body Temperature Series of Two Beavers | -
| datasets | -cars | -Speed and Stopping Distances of Cars | -
| datasets | -chickwts | -Chicken Weights by Feed Type | -
| datasets | -co2 | -Mauna Loa Atmospheric CO2 Concentration | -
| datasets | -crimtab | -Student’s 3000 Criminals Data | -
| datasets | -discoveries | -Yearly Numbers of Important Discoveries | -
| datasets | -esoph | -Smoking, Alcohol and (O)esophageal Cancer | -
| datasets | -euro | -Conversion Rates of Euro Currencies | -
| datasets | -euro.cross (euro) | -Conversion Rates of Euro Currencies | -
| datasets | -eurodist | -Distances Between European Cities and Between US -Cities | -
| datasets | -faithful | -Old Faithful Geyser Data | -
| datasets | -fdeaths (UKLungDeaths) | -Monthly Deaths from Lung Diseases in the UK | -
| datasets | -freeny | -Freeny’s Revenue Data | -
| datasets | -freeny.x (freeny) | -Freeny’s Revenue Data | -
| datasets | -freeny.y (freeny) | -Freeny’s Revenue Data | -
| datasets | -infert | -Infertility after Spontaneous and Induced Abortion | -
| datasets | -iris | -Edgar Anderson’s Iris Data | -
| datasets | -iris3 | -Edgar Anderson’s Iris Data | -
| datasets | -islands | -Areas of the World’s Major Landmasses | -
| datasets | -ldeaths (UKLungDeaths) | -Monthly Deaths from Lung Diseases in the UK | -
| datasets | -lh | -Luteinizing Hormone in Blood Samples | -
| datasets | -longley | -Longley’s Economic Regression Data | -
| datasets | -lynx | -Annual Canadian Lynx trappings 1821-1934 | -
| datasets | -mdeaths (UKLungDeaths) | -Monthly Deaths from Lung Diseases in the UK | -
| datasets | -morley | -Michelson Speed of Light Data | -
| datasets | -mtcars | -Motor Trend Car Road Tests | -
| datasets | -nhtemp | -Average Yearly Temperatures in New Haven | -
| datasets | -nottem | -Average Monthly Temperatures at Nottingham, -1920-1939 | -
| datasets | -npk | -Classical N, P, K Factorial Experiment | -
| datasets | -occupationalStatus | -Occupational Status of Fathers and their Sons | -
| datasets | -precip | -Annual Precipitation in US Cities | -
| datasets | -presidents | -Quarterly Approval Ratings of US Presidents | -
| datasets | -pressure | -Vapor Pressure of Mercury as a Function of -Temperature | -
| datasets | -quakes | -Locations of Earthquakes off Fiji | -
| datasets | -randu | -Random Numbers from Congruential Generator RANDU | -
| datasets | -rivers | -Lengths of Major North American Rivers | -
| datasets | -rock | -Measurements on Petroleum Rock Samples | -
| datasets | -sleep | -Student’s Sleep Data | -
| datasets | -stack.loss (stackloss) | -Brownlee’s Stack Loss Plant Data | -
| datasets | -stack.x (stackloss) | -Brownlee’s Stack Loss Plant Data | -
| datasets | -stackloss | -Brownlee’s Stack Loss Plant Data | -
| datasets | -state.abb (state) | -US State Facts and Figures | -
| datasets | -state.area (state) | -US State Facts and Figures | -
| datasets | -state.center (state) | -US State Facts and Figures | -
| datasets | -state.division (state) | -US State Facts and Figures | -
| datasets | -state.name (state) | -US State Facts and Figures | -
| datasets | -state.region (state) | -US State Facts and Figures | -
| datasets | -state.x77 (state) | -US State Facts and Figures | -
| datasets | -sunspot.month | -Monthly Sunspot Data, from 1749 to “Present” | -
| datasets | -sunspot.year | -Yearly Sunspot Data, 1700-1988 | -
| datasets | -sunspots | -Monthly Sunspot Numbers, 1749-1983 | -
| datasets | -swiss | -Swiss Fertility and Socioeconomic Indicators (1888) -Data | -
| datasets | -treering | -Yearly Treering Data, -6000-1979 | -
| datasets | -trees | -Diameter, Height and Volume for Black Cherry Trees | -
| datasets | -uspop | -Populations Recorded by the US Census | -
| datasets | -volcano | -Topographic Information on Auckland’s Maunga Whau -Volcano | -
| datasets | -warpbreaks | -The Number of Breaks in Yarn during Weaving | -
| datasets | -women | -Average Heights and Weights for American Women | -
# The ggplot version of the same plot
- ggplot(data = tibble(values = val),
- mapping = aes(x = values))+
- geom_histogram(bins = 20)
-Important: before running the code below, click Session -> Set Working Directory -> To Source File Location
@@ -432,18 +432,18 @@ cheatsheet cheatsheetLet’s start by subsetting the data with select()
# Pipes (%>%) work by passing the data in front of the pipe to the first argument
+# Pipes (|>) work by passing the data in front of the pipe to the first argument
# of the function after it, this prevents a lot of nested function calls and makes
# code easier to read.
-pantheria <- pantheria %>% # Passes pantheria as the first argument of select
+pantheria <- pantheria |> # Passes pantheria as the first argument of select
select(Order,
Family, # select returns the specified columns
Genus,
Species,
TrophicLevel,
- AdultBodyMass_g) %>%
- drop_na() %>% # Remove any rows that have NAs
+ AdultBodyMass_g) |>
+ drop_na() |> # Remove any rows that have NAs
distinct() # Remove any duplicate rows
Data is almost never clean, for example there should be only 3
trophic levels:
@@ -451,20 +451,20 @@ trophic levels:
## [1] "carnivore" "herbivore" "omnivore" "Omnivore"
Let’s fix the TrophicLevel column using mutate()
# mutate allows us to add columns or modify existing ones
-pantheria <- pantheria %>%
+pantheria <- pantheria |>
mutate(TrophicLevel = tolower(TrophicLevel)) # Make column lowercase
Exercise 5: Summarizing data
Now we can summarize the adult body mass by trophic level by
computing standard metrics like mean and standard deviation.
-pantheria %>%
- group_by(TrophicLevel) %>% # Group observations by this column
+pantheria |>
+ group_by(TrophicLevel) |> # Group observations by this column
summarize(Mean = mean(AdultBodyMass_g), # Summarize will calculate these group wise
`Standard Deviation` = sd(AdultBodyMass_g), # Quasi quotation lets us add spaces to column names
Min = min(AdultBodyMass_g),
- Max = max(AdultBodyMass_g)) %>%
- ungroup() %>%
+ Max = max(AdultBodyMass_g)) |>
+ ungroup() |>
arrange(desc(Mean)) # Order the data frame by descending mean body mass
## # A tibble: 3 × 5
## TrophicLevel Mean `Standard Deviation` Min Max
@@ -483,24 +483,24 @@ masses.
# the "geom" functions
ggplot(data = pantheria, # input data
- mapping = aes(x = log10(AdultBodyMass_g))) + # log10 transform adult body mass
+ mapping = aes(x = log2(AdultBodyMass_g))) + # log2 transform adult body mass
geom_histogram(fill = "#CE3274", # type of plot
bins = 40) +
- xlab(label = "log10 Adult Body Mass (g)") + # x label
+ xlab(label = "log2 Adult Body Mass (g)") + # x label
ylab(label = "Frequency") + # y label
- labs(title = "Histogram of log10 Adult Body Mass") # title
-
-The data looks skewed even after log10 transformation. Let’s view the
+ labs(title = "Histogram of log2 Adult Body Mass") # title
+
+The data looks skewed even after log2 transformation. Let’s view the
distribution by trophic level.
-pantheria %>%
- ggplot(aes(x = log10(AdultBodyMass_g), fill = TrophicLevel)) + # Color by trophic level
+pantheria |>
+ ggplot(aes(x = log2(AdultBodyMass_g), fill = TrophicLevel)) + # Color by trophic level
geom_histogram(bins = 40) +
facet_grid(rows = vars(TrophicLevel)) + # Split the plot into rows by trophic level
ylab(label = "Frequency") +
- xlab(label = "log10 Adult Body Mass (g)") +
- labs(title = "Histograms of log10 Adult Body Mass") +
+ xlab(label = "log2 Adult Body Mass (g)") +
+ labs(title = "Histograms of log2 Adult Body Mass") +
theme(plot.title = element_text(hjust = 0.5)) # Center the plot title
-
+
It is clear that trophic level does have an impact on the
distribution of adult body mass, carnivores tend to be smaller which
makes sense because carnivores have higher metabolic demands and so
@@ -516,9 +516,9 @@ dataset. Using the dplyr cheatsheet, write code generates a
table of Orders and what percentage of the data they are. Scroll down to
see the hint if you are having trouble.
# Your code
-pantheria %>%
- group_by(Order) %>%
- summarise(n = n()) %>%
+pantheria |>
+ group_by(Order) |>
+ summarise(n = n()) |>
arrange(desc(n))
## # A tibble: 29 × 2
## Order n
@@ -537,42 +537,70 @@ pantheria %>%
Exercise 7.1 hint: group_by + summarize(n = n()) +
arrange
-
-Exercise 7.2: Hands on coding
+
+Exercise 7.2: Visualize Top Orders
Now that we see what the over represented Orders are, we can plot
their body masses by trophic level to see if they are skewing the
overall distributions.
top_orders <- c("Rodentia", "Chiroptera") # Character vector of the top 2 Orders from above
# filter uses a conditional to select rows from the data
-pantheria %>%
- filter(Order %in% top_orders) %>%
- ggplot(aes(x = log10(AdultBodyMass_g), fill = Order)) +
+pantheria |>
+ filter(Order %in% top_orders) |>
+ ggplot(aes(x = log2(AdultBodyMass_g), fill = Order)) +
geom_histogram(bins = 40) +
facet_grid(rows = vars(TrophicLevel),
cols = vars(Order)) +
ylab(label = "Frequency") +
- xlab(label = "log10 Adult Body Mass (g)") +
+ xlab(label = "log2 Adult Body Mass (g)") +
theme(plot.title = element_text(hjust = 0.5))
-
-It looks like one of them is mostly made up of small cornivores.
+

+It looks like one of them is mostly made up of small carnivores.
Let’s remove it and redo the plot of body mass distribution by trophic
level.
-pantheria %>%
- filter(Order != "Chiroptera") %>%
- ggplot(aes(x = log10(AdultBodyMass_g),fill = TrophicLevel)) + # Color by trophic level
+pantheria |>
+ filter(Order != "Chiroptera") |>
+ ggplot(aes(x = log2(AdultBodyMass_g),fill = TrophicLevel)) + # Color by trophic level
geom_histogram(bins = 40) +
facet_grid(rows = vars(TrophicLevel)) + # Split the plot into rows by trophic level
ylab(label = "Frequency") +
- xlab(label = "log10 Adult Body Mass (g)") +
- labs(title = "Histograms of log10 Adult Body Mass")
-
+ xlab(label = "log2 Adult Body Mass (g)") +
+ labs(title = "Histograms of log2 Adult Body Mass")
+
We can see now that body mass of carnivorous mammals is much less
skewed than the initial plots show. There is still an effect of trophic
level on body mass, but the effect size is likely much smaller than we
-would estimate by including all 393 Chiroptera. Now that we
-have generated these plots, we can generate a full report that contains
-all of the text and code, click Knit to render the HTML
-report.
+would estimate by including all 393 Chiroptera.
+
+
+Exercise 7.3: Fit a linear model
+# Set the reference level to herbivore
+pantheria <- pantheria |>
+ mutate(TrophicLevel = factor(TrophicLevel,
+ levels = c("herbivore","omnivore","carnivore")))
+
+lm(data = pantheria, log2(AdultBodyMass_g) ~ TrophicLevel) |>
+ tidy()
+## # A tibble: 3 × 5
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) 10.2 0.166 61.6 0
+## 2 TrophicLevelomnivore -1.72 0.238 -7.22 7.52e-13
+## 3 TrophicLevelcarnivore -2.82 0.253 -11.2 4.66e-28
+# Remove Chiroptera
+pantheria <- pantheria |>
+ filter(Order != "Chiroptera")
+
+lm(data = pantheria, log2(AdultBodyMass_g) ~ TrophicLevel) |>
+ tidy()
+## # A tibble: 3 × 5
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) 11.0 0.173 63.6 0
+## 2 TrophicLevelomnivore -2.08 0.243 -8.53 3.26e-17
+## 3 TrophicLevelcarnivore -0.813 0.296 -2.74 6.16e- 3
+Now that we have generated these plots and results, we can generate a
+full report that contains all of the text and code, click
+Knit to render the HTML report.
End of workshop exercises