From fd2fc5b190f8ea138d4e50094fae3f4f8f168171 Mon Sep 17 00:00:00 2001 From: Natalie Elphick Date: Sat, 18 May 2024 08:18:12 -0700 Subject: [PATCH] closes #20 --- docs/Intro_to_R_data_analysis_part_1.html | 296 ++- docs/Intro_to_R_data_analysis_part_2.html | 1941 ++++++----------- .../Intro_to_R_data_analysis_part_1.Rmd | 104 +- .../Intro_to_R_data_analysis_part_2.Rmd | 169 +- intro-r-data-analysis/renv.lock | 20 +- intro-r-data-analysis/style.css | 12 +- 6 files changed, 1040 insertions(+), 1502 deletions(-) diff --git a/docs/Intro_to_R_data_analysis_part_1.html b/docs/Intro_to_R_data_analysis_part_1.html index b6cabad..be6348b 100644 --- a/docs/Intro_to_R_data_analysis_part_1.html +++ b/docs/Intro_to_R_data_analysis_part_1.html @@ -4,7 +4,7 @@ - Introduction to R Data Analysis - Part 1 + Introduction to R Data Analysis @@ -1085,7 +1085,7 @@ visibility: hidden; } +} + +.reveal a { +color: #0c74dc; +} + +.reveal a:hover { +color: #9c0366 !important; +} + - -
-

Introduction to R Data Analysis - Part 2

+

Introduction to R Data Analysis

+

Part 2

Natalie Elphick

-

January 23rd, 2024

+

May 21st, 2024

@@ -2799,7 +2809,9 @@ class CountdownTimer {

Introductions

Natalie Elphick
Bioinformatician I

-

Yihang Xin (TA)
+

Michela Traglia (In Person TA)
+Senior Statistician

+

Yihang Xin (Online TA)
Software Engineer III

@@ -2826,11 +2838,16 @@ data representations and design principles
  • Rows = observations, columns = variables
-
  • ggplot2, for data visualization.
  • -
  • dplyr, for data manipulation.
  • -
  • tidyr, for data tidying.
  • -
  • readr, for data import.
  • -
  • purrr, for iteration.
  • +
  • ggplot2, for data +visualization.
  • +
  • dplyr, for data +manipulation.
  • +
  • tidyr, for data +tidying.
  • +
  • readr, for data +import.
  • +
  • purrr, for +iteration.
  • and more..
  • @@ -2860,628 +2877,53 @@ allows any operation to be done “by group”
  • mpg is a dataframe built into the ggplot2 package
  • head(mpg)
    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -manufacturer - -model - -displ - -year - -cyl - -trans - -drv - -cty - -hwy - -fl - -class -
    -audi - -a4 - -1.8 - -1999 - -4 - -auto(l5) - -f - -18 - -29 - -p - -compact -
    -audi - -a4 - -1.8 - -1999 - -4 - -manual(m5) - -f - -21 - -29 - -p - -compact -
    -audi - -a4 - -2.0 - -2008 - -4 - -manual(m6) - -f - -20 - -31 - -p - -compact -
    -audi - -a4 - -2.0 - -2008 - -4 - -auto(av) - -f - -21 - -30 - -p - -compact -
    -audi - -a4 - -2.8 - -1999 - -6 - -auto(l5) - -f - -16 - -26 - -p - -compact -
    -audi - -a4 - -2.8 - -1999 - -6 - -manual(m5) - -f - -18 - -26 - -p - -compact -
    -
    +
    # A tibble: 6 × 11
    +  manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
    +  <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
    +1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
    +2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
    +3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
    +4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…
    +5 audi         a4      2.8  1999     6 auto(l5)   f        16    26 p     compa…
    +6 audi         a4      2.8  1999     6 manual(m5) f        18    26 p     compa…

    Select Columns

    -
    select(.data = mpg,
    -       year, cty, hwy, manufacturer)
    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -year - -cty - -hwy - -manufacturer -
    -1999 - -18 - -29 - -audi -
    -1999 - -21 - -29 - -audi -
    -2008 - -20 - -31 - -audi -
    -2008 - -21 - -30 - -audi -
    -1999 - -16 - -26 - -audi -
    -1999 - -18 - -26 - -audi -
    -
    +
    select(.data = mpg,
    +       year, cty, hwy, manufacturer)
    +
    # A tibble: 234 × 4
    +    year   cty   hwy manufacturer
    +   <int> <int> <int> <chr>       
    + 1  1999    18    29 audi        
    + 2  1999    21    29 audi        
    + 3  2008    20    31 audi        
    + 4  2008    21    30 audi        
    + 5  1999    16    26 audi        
    + 6  1999    18    26 audi        
    + 7  2008    18    27 audi        
    + 8  1999    18    26 audi        
    + 9  1999    16    25 audi        
    +10  2008    20    28 audi        
    +# ℹ 224 more rows

    Filter Rows

    -
    filter(.data = mpg,
    -       year == 2008)
    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -manufacturer - -model - -displ - -year - -cyl - -trans - -drv - -cty - -hwy - -fl - -class -
    -audi - -a4 - -2.0 - -2008 - -4 - -manual(m6) - -f - -20 - -31 - -p - -compact -
    -audi - -a4 - -2.0 - -2008 - -4 - -auto(av) - -f - -21 - -30 - -p - -compact -
    -audi - -a4 - -3.1 - -2008 - -6 - -auto(av) - -f - -18 - -27 - -p - -compact -
    -audi - -a4 quattro - -2.0 - -2008 - -4 - -manual(m6) - -4 - -20 - -28 - -p - -compact -
    -audi - -a4 quattro - -2.0 - -2008 - -4 - -auto(s6) - -4 - -19 - -27 - -p - -compact -
    -audi - -a4 quattro - -3.1 - -2008 - -6 - -auto(s6) - -4 - -17 - -25 - -p - -compact -
    -
    +
    filter(.data = mpg,
    +       year == 2008)
    +
    # A tibble: 117 × 11
    +   manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
    +   <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
    + 1 audi         a4           2    2008     4 manu… f        20    31 p     comp…
    + 2 audi         a4           2    2008     4 auto… f        21    30 p     comp…
    + 3 audi         a4           3.1  2008     6 auto… f        18    27 p     comp…
    + 4 audi         a4 quattro   2    2008     4 manu… 4        20    28 p     comp…
    + 5 audi         a4 quattro   2    2008     4 auto… 4        19    27 p     comp…
    + 6 audi         a4 quattro   3.1  2008     6 auto… 4        17    25 p     comp…
    + 7 audi         a4 quattro   3.1  2008     6 manu… 4        15    25 p     comp…
    + 8 audi         a6 quattro   3.1  2008     6 auto… 4        17    25 p     mids…
    + 9 audi         a6 quattro   4.2  2008     8 auto… 4        16    23 p     mids…
    +10 chevrolet    c1500 sub…   5.3  2008     8 auto… r        14    20 r     suv  
    +# ℹ 107 more rows

    Arrange Rows

    @@ -3489,156 +2931,22 @@ compact
  • desc() is used to arrange rows in descending order, the default is ascending
  • -
    arrange(.data = mpg,
    -        desc(cyl))
    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -manufacturer - -model - -displ - -year - -cyl - -trans - -drv - -cty - -hwy - -fl - -class -
    -audi - -a6 quattro - -4.2 - -2008 - -8 - -auto(s6) - -4 - -16 - -23 - -p - -midsize -
    -chevrolet - -c1500 suburban 2wd - -5.3 - -2008 - -8 - -auto(l4) - -r - -14 - -20 - -r - -suv -
    -chevrolet - -c1500 suburban 2wd - -5.3 - -2008 - -8 - -auto(l4) - -r - -11 - -15 - -e - -suv -
    -
    +
    arrange(.data = mpg,
    +        desc(cty))
    +
    # A tibble: 234 × 11
    +   manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
    +   <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
    + 1 volkswagen   new beetle   1.9  1999     4 manu… f        35    44 d     subc…
    + 2 volkswagen   jetta        1.9  1999     4 manu… f        33    44 d     comp…
    + 3 volkswagen   new beetle   1.9  1999     4 auto… f        29    41 d     subc…
    + 4 honda        civic        1.6  1999     4 manu… f        28    33 r     subc…
    + 5 toyota       corolla      1.8  2008     4 manu… f        28    37 r     comp…
    + 6 honda        civic        1.8  2008     4 manu… f        26    34 r     subc…
    + 7 toyota       corolla      1.8  1999     4 manu… f        26    35 r     comp…
    + 8 toyota       corolla      1.8  2008     4 auto… f        26    35 r     comp…
    + 9 honda        civic        1.6  1999     4 manu… f        25    32 r     subc…
    +10 honda        civic        1.8  2008     4 auto… f        25    36 r     subc…
    +# ℹ 224 more rows

    Summarising data

    @@ -3653,168 +2961,45 @@ different categorical groupings

    Group and Summarise

    -
    summarise(group_by(.data = mpg,
    -                   manufacturer),
    -          mean_cty = mean(cty),
    -          median_cty = median(cty))
    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -manufacturer - -mean_cty - -median_cty -
    -audi - -17.61111 - -17.5 -
    -chevrolet - -15.00000 - -15.0 -
    -dodge - -13.13514 - -13.0 -
    -ford - -14.00000 - -14.0 -
    -honda - -24.44444 - -24.0 -
    -hyundai - -18.64286 - -18.5 -
    -
    +
      +
    • Get the mean and median city mileage within manufacturer
    • +
    +
    summarise(group_by(.data = mpg,
    +                   manufacturer),
    +          mean_cty = mean(cty),
    +          median_cty = median(cty))
    +
    # A tibble: 10 × 3
    +   manufacturer mean_cty median_cty
    +   <chr>           <dbl>      <dbl>
    + 1 audi             17.6       17.5
    + 2 chevrolet        15         15  
    + 3 dodge            13.1       13  
    + 4 ford             14         14  
    + 5 honda            24.4       24  
    + 6 hyundai          18.6       18.5
    + 7 jeep             13.5       14  
    + 8 land rover       11.5       11.5
    + 9 lincoln          11.3       11  
    +10 mercury          13.2       13  

    The pipe operator |>

    • Allows “chaining” of function calls to make code more readable
    -
    mpg |>
    -  group_by(manufacturer) |>
    -  summarise(mean_cty = mean(cty),
    -            median_cty = median(cty))
    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -manufacturer - -mean_cty - -median_cty -
    -audi - -17.61111 - -17.5 -
    -chevrolet - -15.00000 - -15.0 -
    -dodge - -13.13514 - -13.0 -
    -ford - -14.00000 - -14.0 -
    -
    +
    mpg |>
    +  group_by(manufacturer) |>
    +  summarise(mean_cty = mean(cty),
    +            median_cty = median(cty)) |>
    +  head(5)
    +
    # A tibble: 5 × 3
    +  manufacturer mean_cty median_cty
    +  <chr>           <dbl>      <dbl>
    +1 audi             17.6       17.5
    +2 chevrolet        15         15  
    +3 dodge            13.1       13  
    +4 ford             14         14  
    +5 honda            24.4       24  
    @@ -3841,28 +3026,27 @@ modified by adding layers

    Creating ggplots

    -



    Plotting

    +



    Plotting

    Plot Example

    -
    ggplot(data = mpg,                         # Input dataframe
    -       mapping = aes(x = cty, y = hwy)) +  # Aesthetic mapping
    -  geom_point()                             # Point graph
    -

    +
    ggplot(data = mpg,                         # Input dataframe
    +       mapping = aes(x = cty, y = hwy)) +  # Aesthetic mapping
    +  geom_point()                             # Point graph
    +

    Adding and Modifying Layers

    -
    ggplot(data = mpg,
    -       mapping = aes(x = class, y = cty, fill = class)) +
    -  geom_violin() +
    -  geom_boxplot(width = 0.1,
    -               fill = "white")
    -

    +
    ggplot(data = mpg,                         
    +       mapping = aes(x = cty, y = hwy)) +  
    +  geom_point() +
    +  geom_smooth(formula = y ~ x, method = "lm")
    +

    10 min break

    -
    +
    10:00
    @@ -3879,17 +3063,317 @@ modified by adding layers
    • PanTHERIA
        -
      • A global species-level data set of key life-history, ecological and -geographical traits of all known extant and recently extinct mammals -compiled from the literature
      • -
      • Macroecological and macroevolutionary research projects
      • +
      • A global species-level data set of key traits of all known extant +and recently extinct mammals compiled from literature
      • +
      • Used in macroecological and macroevolutionary research projects
      • Data is organized by taxonomic rank

    Taxonomic Rank

    -

    Taxonomy

    +

    + + + + + + + + + + + + + লাল শিয়াল + (ভালপেস ভালপেস) + Rotfuchs + (Vulpes vulpes) + Zorro rojo + (Vulpes vulpes) + + မြေခွေးနီ (Vulpes vulpes) + обыкновенная лисица + (Vulpes vulpes) + + රතු හිවලා (වුල්පෙස් වුල්පෙස්) + Rödräv + (Vulpes vulpes) + + लाल लोमड़ी + वुल्पेस वुल्पेस + Црвена лисица + (Vulpes vulpes) + Red fox + (Vulpes vulpes) + + + অধিজগৎজগৎপর্বশ্রেণিবর্গগোত্রগণ + প্রজাতি + DomäneReichStammKlasseOrdnungFamilieGattung + Art + DominioReinoFiloClaseOrdenFamiliaGénero + Especie + နယ်ပယ်လောကမျိုးပေါင်းစုမျိုးပေါင်းမျိုးစဉ်မျိုးရင်းမျိုးစု + မျိုးစိတ် + ДоменЦарствоТипКлассОтрядСемействоРод + Вид + වසමරාජධානියවංශයවර්ගයගෝත්‍රයකුලයගණය + විශේෂය + DomänRikeFylumKlassOrdningFamiljSläkte + Art + + अधिजगत्जगत् संघवर्गगणकुटुम्बवंश + जाति + ДоменЦарствоКоленоКласаРедСемејствоРод + Вид + DomainKingdomPhylumClassOrderFamilyGenus + Species + + + সুকেন্দ্রিকপ্রাণীমেরুদণ্ডীস্তন্যপায়ীশ্বাপদক্যানিডেভালপেস + ভালপেস ভালপেস + EucariotaAnimaliaCordadosMamíferosCarnívoraCánidosVulpes + Vulpes vulpes + ယူကာရုတ်တိရစ္ဆာန်ကော်ဒိတ်နို့တိုက်သတ္တဝါကာနီဗိုရာခွေးမျိုးရင်းVulpes + Vuples vulpes + ЭукариотыЖивотныеХордовыеМлекопитающиеХищныеПсовыеVulpes + Vulpes vulpes + යුකේරියාඇනිමේලියාකෝඩේටාමමාලියාකානිවෝරාකානිඩේවුල්පෙස් + Vuples vulpes + EukaryoterDjurRyggsträngsdjurDäggdjurRovdjurHunddjurVulpes + Vulpes vulpes + + सुकेन्द्रकप्राणीरज्जुकीस्तनधारी मांसाहारीश्वानवुल्पेस + वुल्पेस वुल्पेस + ЕукариотиЖивотниХордовиЦицачиЅверовиКучињаЛисици + Црвена лисица + EukaryaAnimaliaChordataMammaliaCarnivoraCanidaeVulpes + Vulpes vulpes + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Data Preview

    @@ -5053,7 +4537,21 @@ NA

    Hands-on Analysis

      +
    • We will read in the data and explore if the trophic level has a +significant impact on the adult body mass of mammals
    • +
    +

    Steps:
    +1. Combine and clean the data
    +2. Visualize adult body mass by trophic level
    +3. Check for overrepresented groups
    +4. Fit a simple linear model

    +
    +
    +

    Hands-on Analysis

    +
    • Open part_2.Rmd
    • +
    • If you just want to follow along and not run code, open +part2_filled_out.html
    @@ -5064,20 +4562,18 @@ NA

    General Tips

      +
    • Follow any relevant institutional guidelines on using LLMs
    • Always confirm ChatGPT’s outputs are correct
    • Provide as much detail as possible about the problem in the 1st prompt
    • Use separate chats for separate tasks/projects
    • -
    • Try the ‘Custom Instructions’ function that adds additional -information to every prompt
    • -
    • Can visit webpages (GPT 4 only), which can help get more specific -answers
    • +
    • Try the ‘Custom Instructions’ function

    Code Tips

      -
    • Commented R code yields better responses in my experience
    • +
    • Commented R code yields better responses
    • Provide the code and error message in the same prompt
    • ChatGPT can work well to convert syntax and improve your code:
        @@ -5090,46 +4586,6 @@ case
    -
    -

    Finding R Packages

    - -
    -
    -

    Key Questions

    -
      -
    • What assay was the package designed for?
    • -
    • When was the last release?
    • -
    • Is it maintained (frequent updates)?
    • -
    • Does it work on all operating systems?
    • -
    • Are other people using it? (citations)
    • -
    • Do they respond to github issues?
    • -
    • Is there a benchmarking paper?
    • -
    -
    -
    -

    BioConductor and CRAN

    -
      -
    • Both of these have stringent requirements for packages they host -(eg. for BioConductor they have to run on all major operating -systems)

    • -
    • Prefer BioConductor packages if available over CRAN

    • -
    • Prefer CRAN packages over ones only hosted on GitHub

    • -
    -
    -
    -

    Start with the Assay

    -
      -
    • Click here -to go to BioC views
    • -
    • Pick the assay you want to analyse
    • -
    • Pick the type of analysis you want to do
    • -
    • Find a package that does it
    • -
    • Find benchmarking papers to narrow the list of packages down
    • -
    • Find the vignette on the package page and refer to the manual for -any questions not covered by it
    • -
    -
    -

    Additional Resources

    @@ -5137,10 +4593,12 @@ any questions not covered by it

    R

    Upcoming Workshops

    -
      -
    1. Introduction -to Statistics, Experimental Design, and Hypothesis Testing +

      Single +Cell ATAC-Seq Data Analysis Part 2

        -
      • Jan 25, 2024 (Session 1 - 10am–12pm) (Session 2 - 1pm–3pm)
      • -
      • Jan 26, 2024 (Session 3 - 10am–12pm)
      • -
    2. -
    3. Intermediate -RNA-Seq Analysis Using R -
        -
      • Feb 1, 2024 (9:30am-12:00pm)
      • -
    4. -
    +
      • Check this +link at the end of the summer for our fall workshop schedule
    
  • +
    diff --git a/intro-r-data-analysis/Intro_to_R_data_analysis_part_1.Rmd b/intro-r-data-analysis/Intro_to_R_data_analysis_part_1.Rmd index 34b6af1..1909ed2 100644 --- a/intro-r-data-analysis/Intro_to_R_data_analysis_part_1.Rmd +++ b/intro-r-data-analysis/Intro_to_R_data_analysis_part_1.Rmd @@ -1,7 +1,8 @@ --- -title: "Introduction to R Data Analysis - Part 1" +title: "Introduction to R Data Analysis" +subtitle: "Part 1" author: "Natalie Elphick" -date: "January 22nd, 2024" +date: "May 20th, 2024" knit: (function(input, ...) { rmarkdown::render( input, @@ -16,6 +17,7 @@ output: ```{r, setup, include=FALSE} library(tidyverse) +knitr::opts_chunk$set(comment = "") ``` ## @@ -25,10 +27,10 @@ library(tidyverse) ## Introductions **Natalie Elphick** -Bioinformatician I +Bioinformatician I -**Michela Traglia (TA)** -Senior Statistician +**Yihang Xin (Online TA)** +Software Engineer III ## Poll 1 @@ -36,7 +38,7 @@ Senior Statistician **What is your level of experience with coding/data analysis?** 1. I know another data analysis programming language (Python, Matlab etc.) -2. I can use Excel to do linear regression +2. I can use Excel 3. I know some R 4. All of the above 5. None of the above @@ -50,8 +52,8 @@ Senior Statistician 1. What is R and why should you use it? 2. The RStudio interface 3. File types -4. Error messages -5. Variables +4. Variables +5. Error and warning messages 6. Types & data structures 7. Math and logic operations 8. Functions and packages @@ -86,9 +88,9 @@ functionality # RStudio ## RStudio -- RStudio is an integrated development +- RStudio is an integrated development environment (IDE) -- It makes R code easier to write by providing a +- It is an app that makes R code easier to write by providing a feature rich graphical user interface (GUI)
    @@ -131,8 +133,6 @@ feature rich graphical user interface (GUI) ## Variable definition - Variables store information that is referenced and manipulated in a computer program -- In contrast to the mathematical definition of a variable, -variables in computer science are _mutable_ - There are 3 ways to define variables in R, but one is preferred: ```{r} x <- 1 # Preferred way @@ -141,7 +141,59 @@ x = 1 print(x) ``` -## Variable naming +## Example +- Run the following in the R console: + +```{r} +x <- 1 +y <- 4 +z <- y +x + y + z +``` + +# Error and Warning Messages + +## Errors +- **Errors**: Stop the execution of your code and must be fixed for the code to run successfully + +```{r, eval=FALSE} +x <- 5 +y <- 10 +z <- x + a +``` + +```{r,echo=FALSE} +message("Error: object 'a' not found") +``` + + +## Common Errors + +- **Syntax Error:** Invalid R code syntax (e.g. misplaced parentheses) +```{r,echo=FALSE} +message('Error: unexpected ")"') +``` + +- **Object not found:** This variable is not defined (e.g. misspelled variables) + +```{r,echo=FALSE} +message('Error: object "a" not found') +``` + + +See this [article](https://statsandr.com/blog/top-10-errors-in-r/) for more common errors and how to fix them. + +## Warnings +- Do not stop the execution but indicate potential issues that you should be aware of and might need to address + +```{r} +a <- c(1, 2, 3, 4, 5) +b <- c(6, 7, 8, 9) +result <- a + b +``` + + +## Variable Naming - Variables names must start with a letter and can contain underscores and periods @@ -176,7 +228,7 @@ DogBreeds <- c("Labrador Retriever", "Akita", "Bulldog") ## Data Types - Integer - - Whole numbers (in R denote with L ex. 1L,2L) + - Whole numbers (in R denoted with L ex. 1L,2L) - Numeric - Decimal numbers - Logical @@ -191,7 +243,7 @@ DogBreeds <- c("Labrador Retriever", "Akita", "Bulldog") **Which of these is not the correct data type for the value?** 1. 1.5 - Numeric -2. "Labrador Retriever" - Character +2. "1" - Character 3. 
NA - Logical 4. 1 - Integer @@ -227,6 +279,8 @@ DogBreeds <- c("Labrador Retriever", "Akita", "Bulldog") countdown::countdown(minutes = 10, seconds = 0, color_border = "black", + color_running_background = "#47d193", + color_finished_background = "#a3184e", padding = "50px", margin = "5%", font_size = "5em", @@ -297,6 +351,7 @@ x & !y execution of code ```{r} +dog_breeds <- c("Labrador Retriever", "Akita", "Bulldog") if ("Akita" %in% dog_breeds) { print("dog_breeds already contains Akita") } else { @@ -379,11 +434,17 @@ library(ggplot2) # Makes all of the ggplot2 functions available - The tidyverse is a collection of commonly used data analysis packages - Learning curve is less steep - - Lots of useful packages for data analysis + - Lots of useful packages for cleaning and "wrangling" data into the correct format -## +## Why use Tidyverse Packages? -![tidyverse](assets/tidyverse.png) +- Most of the work in data analysis is getting data into the correct format to create outputs +- The tidyverse collection of packages simplifies this process + - Intuitive syntax + - Comprehensive (data manipulation, cleaning, modeling and graphics) + - Consistent data structure + - Strong community support + # End of Part 1 @@ -392,11 +453,8 @@ packages ## Upcoming Workshops -1. [Introduction to Statistics, Experimental Design, and Hypothesis Testing](https://gladstone.org/index.php/events/introduction-statistics-experimental-design-and-hypothesis-testing-0) - - Jan 25, 2024 (Session 1 - 10am–12pm) (Session 2 - 1pm–3pm) - - Jan 26, 2024 (Session 3 - 10am–12pm) +[Single Cell ATAC-Seq Data Analysis Part 2](https://gladstone.org/events/single-cell-atac-seq-data-analysis-part-2-1) -2. 
    [Intermediate RNA-Seq Analysis Using R](https://gladstone.org/index.php/events/intermediate-rna-seq-analysis-using-r-4) - - Feb 1, 2024 (9:30am-12:00pm) +- Check [this link](https://gladstone.org/events?series=data-science-training-program) at the end of the summer for our fall workshop schedule diff --git a/intro-r-data-analysis/Intro_to_R_data_analysis_part_2.Rmd b/intro-r-data-analysis/Intro_to_R_data_analysis_part_2.Rmd index 2e1218e..a950c3d 100644 --- a/intro-r-data-analysis/Intro_to_R_data_analysis_part_2.Rmd +++ b/intro-r-data-analysis/Intro_to_R_data_analysis_part_2.Rmd @@ -1,7 +1,8 @@ --- -title: "Introduction to R Data Analysis - Part 2" +title: "Introduction to R Data Analysis" +subtitle: "Part 2" author: "Natalie Elphick" -date: "January 23rd, 2024" +date: "May 21st, 2024" knit: (function(input, ...) { rmarkdown::render( input, @@ -19,6 +20,7 @@ library(kableExtra) library(tidyverse) library(readxl) theme_set(theme_grey(base_size = 16)) +knitr::opts_chunk$set(comment = "") ``` ## @@ -29,7 +31,10 @@ theme_set(theme_grey(base_size = 16)) **Natalie Elphick** Bioinformatician I -**Yihang Xin (TA)** +**Michela Traglia (In Person TA)** +Senior Statistician + +**Yihang Xin (Online TA)** Software Engineer III # Schedule @@ -46,11 +51,11 @@ Software Engineer III - The tidyverse packages work well together because they share common data representations and design principles - Rows = observations, columns = variables -- [ggplot2](), for data visualization. -- [dplyr](), for data manipulation. -- [tidyr](), for data tidying. -- [readr](), for data import. -- [purrr](), for iteration. +- [ggplot2](https://ggplot2.tidyverse.org/), for data visualization. +- [dplyr](https://dplyr.tidyverse.org/), for data manipulation. +- [tidyr](https://tidyr.tidyverse.org/), for data tidying. +- [readr](https://readr.tidyverse.org/), for data import. +- [purrr](https://purrr.tidyverse.org/), for iteration. - and more.. 
    
## dplyr @@ -67,66 +72,38 @@ common data representations and design principles ## Example Dataframe - mpg is a dataframe built into the ggplot2 package -```{r, eval = FALSE} +```{r} head(mpg) ``` -```{r, echo = FALSE} -head(mpg) |> - kable() |> - kable_styling("striped") |> - scroll_box(width = "100%") -``` ## Select Columns -```{r, eval = FALSE} +```{r} select(.data = mpg, year, cty, hwy, manufacturer) ``` -```{r, echo = FALSE} -select(.data = mpg, - year, cty, hwy, manufacturer) |> - head() |> - kable() |> - kable_styling("striped") |> - scroll_box(width = "100%") -``` - ## Filter Rows -```{r, eval = FALSE} +```{r} filter(.data = mpg, year == 2008) ``` -```{r, echo = FALSE} -filter(.data = mpg, - year == 2008) |> - head() |> - kable() |> - kable_styling("striped") |> - scroll_box(width = "100%") -``` + ## Arrange Rows - desc() is used to arrange rows in descending order, the default is ascending -```{r, eval = FALSE} +```{r} arrange(.data = mpg, - desc(cyl)) + desc(cty)) ``` -```{r, echo = FALSE} -arrange(.data = mpg, - desc(cyl)) |> - head(n = 3) |> - kable() |> - kable_styling("striped") |> - scroll_box(width = "100%") -``` + + ## Summarising data - The dplyr **summarise()** function computes a table of summaries for a data frame @@ -136,6 +113,9 @@ variable(s) different categorical groupings ## Group and Summarise + +- Get the mean and median city mileage within manufacturer + ```{r, eval = FALSE} summarise(group_by(.data = mpg, manufacturer), @@ -144,37 +124,27 @@ summarise(group_by(.data = mpg, ``` ```{r, echo = FALSE} -summarise(group_by(.data = mpg, +summarise(.data = group_by(.data = mpg, manufacturer), mean_cty = mean(cty), median_cty = median(cty)) |> - head() |> - kable() |> - kable_styling("striped") |> - scroll_box(width = "100%") + head(10) ``` + + ## The pipe operator |> - Allows "chaining" of function calls to make code more readable -```{r, eval = FALSE} -mpg |> - group_by(manufacturer) |> - summarise(mean_cty = mean(cty), - median_cty = 
median(cty)) -``` - -```{r, echo = FALSE} +```{r} mpg |> group_by(manufacturer) |> summarise(mean_cty = mean(cty), median_cty = median(cty)) |> - head(n = 4) |> - kable() |> - kable_styling("striped") |> - scroll_box(width = "100%") + head(5) ``` + # Plotting ## ggplot2 @@ -204,11 +174,10 @@ ggplot(data = mpg, # Input dataframe ## Adding and Modifying Layers ```{r, fig.dim=c(10,4)} -ggplot(data = mpg, - mapping = aes(x = class, y = cty, fill = class)) + - geom_violin() + - geom_boxplot(width = 0.1, - fill = "white") +ggplot(data = mpg, + mapping = aes(x = cty, y = hwy)) + + geom_point() + + geom_smooth(formula = y ~ x, method = "lm") ``` @@ -221,6 +190,8 @@ ggplot(data = mpg, countdown::countdown(minutes = 10, seconds = 0, color_border = "black", + color_running_background = "#47d193", + color_finished_background = "#a3184e", padding = "50px", margin = "5%", font_size = "5em", @@ -234,8 +205,8 @@ countdown::countdown(minutes = 10, ## Dataset Description - PanTHERIA - - A global species-level data set of key life-history, ecological and geographical traits of all known extant and recently extinct mammals compiled from the literature - - Macroecological and macroevolutionary research projects + - A global species-level data set of key traits of all known extant and recently extinct mammals compiled from literature + - Used in macroecological and macroevolutionary research projects - Data is organized by taxonomic rank ## Taxonomic Rank @@ -252,10 +223,20 @@ read_xlsx("Intro_to_R_workshop_materials/PanTHERIA.xlsx") |> scroll_box(width = "100%") ``` +## Hands-on Analysis +- We will read in the data and explore if the trophic level has a significant impact on the adult body mass of mammals + + +Steps: +1. Combine and clean the data +2. Visualize adult body mass by trophic level +3. Check for overrepresented groups +4. 
Fit a simple linear model ## Hands-on Analysis - Open part_2.Rmd +- If you just want to follow along and not run code, open part2_filled_out.html @@ -263,63 +244,34 @@ read_xlsx("Intro_to_R_workshop_materials/PanTHERIA.xlsx") |> ## General Tips +- Follow any relevant institutional guidelines on using LLMs - Always confirm ChatGPT's outputs are correct - Provide as much detail as possible about the problem in the 1st prompt - Use separate chats for separate tasks/projects -- Try the 'Custom Instructions' function that adds additional information to every prompt -- Can visit webpages (GPT 4 only), which can help get more specific answers +- Try the 'Custom Instructions' function ## Code Tips -- Commented R code yields better responses in my experience +- Commented R code yields better responses - Provide the code and error message in the same prompt - ChatGPT can work well to convert syntax and improve your code: - "Turn this loop into a function : [your code]" - "Is there a better way to do this : [your code]" - Check out the file: `example_code/1_convert_syntax_example.R` for an example use case -# Finding R Packages - -## Key Questions - -- What assay was the package designed for? -- When was the last release? -- Is it maintained (frequent updates)? -- Does it work on all operating systems? -- Are other people using it? (citations) -- Do they respond to github issues? -- Is there a benchmarking paper? - -## BioConductor and CRAN - -- Both of these have stringent requirements for packages they host (eg. 
    for BioConductor they have to run on all major operating systems) - -- Prefer BioConductor packages if available over CRAN - -- Prefer CRAN packages over ones only hosted on GitHub - -## Start with the Assay - -- Click [here](https://www.bioconductor.org/packages/release/BiocViews.html#___Sequencing) to go to BioC views -- Pick the assay you want to analyse -- Pick the type of analysis you want to do -- Find a package that does it -- Find benchmarking papers to narrow the list of packages down -- Find the vignette on the package page and refer to the manual for any questions not covered by it - # Additional Resources ## R - -- [R Markdown: The Definitive Guide](https://bookdown.org/yihui/rmarkdown/how-to-read-this-book.html) : Excellent R markdown reference - - [R for Data Science](https://r4ds.hadley.nz/) - +- [Top 10 R Errors and How to Fix them](https://statsandr.com/blog/top-10-errors-in-r/) +- [R Markdown: The Definitive Guide](https://bookdown.org/yihui/rmarkdown/how-to-read-this-book.html) : Excellent R markdown reference - [ggplot2: elegant graphics for data analysis](https://ggplot2-book.org/) - [Advanced R](https://adv-r.hadley.nz/) + + ## Statistics - [Data Analysis in R](https://bookdown.org/steve_midway/DAR) : This book has more statistics details than *R for Data Science* @@ -346,10 +298,9 @@ read_xlsx("Intro_to_R_workshop_materials/PanTHERIA.xlsx") |> ## Upcoming Workshops -1. [Introduction to Statistics, Experimental Design, and Hypothesis Testing](https://gladstone.org/index.php/events/introduction-statistics-experimental-design-and-hypothesis-testing-0) - - Jan 25, 2024 (Session 1 - 10am–12pm) (Session 2 - 1pm–3pm) - - Jan 26, 2024 (Session 3 - 10am–12pm) +[Single Cell ATAC-Seq Data Analysis Part 2](https://gladstone.org/events/single-cell-atac-seq-data-analysis-part-2-1) + +- Check [this link](https://gladstone.org/events?series=data-science-training-program) at the end of the summer for our fall workshop schedule + -2. 
    
[Intermediate RNA-Seq Analysis Using R](https://gladstone.org/index.php/events/intermediate-rna-seq-analysis-using-r-4) - - Feb 1, 2024 (9:30am-12:00pm) diff --git a/intro-r-data-analysis/renv.lock b/intro-r-data-analysis/renv.lock index 52c9d40..6812de9 100644 --- a/intro-r-data-analysis/renv.lock +++ b/intro-r-data-analysis/renv.lock @@ -22,7 +22,7 @@ }, "MASS": { "Package": "MASS", - "Version": "7.3-60", + "Version": "7.3-60.0.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -33,11 +33,11 @@ "stats", "utils" ], - "Hash": "a56a6365b3fa73293ea8d084be0d9bb0" + "Hash": "b765b28387acc8ec9e9c1530713cb19c" }, "Matrix": { "Package": "Matrix", - "Version": "1.6-1.1", + "Version": "1.6-5", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -50,7 +50,7 @@ "stats", "utils" ], - "Hash": "1a00d4828f33a9d690806e98bd17150c" + "Hash": "8c7115cd3a0e048bda2a7cd110549f7a" }, "R6": { "Package": "R6", @@ -872,7 +872,7 @@ }, "lattice": { "Package": "lattice", - "Version": "0.21-9", + "Version": "0.22-6", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -883,7 +883,7 @@ "stats", "utils" ], - "Hash": "5558c61e0136e247252f5f952cdaad6a" + "Hash": "cc5ac1ba4c238c7ca9fa6a87ca11a7e2" }, "learnr": { "Package": "learnr", @@ -977,7 +977,7 @@ }, "mgcv": { "Package": "mgcv", - "Version": "1.9-0", + "Version": "1.9-1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -990,7 +990,7 @@ "stats", "utils" ], - "Hash": "086028ca0460d0c368028d3bda58f31b" + "Hash": "110ee9d83b496279960e162ac97764ce" }, "mime": { "Package": "mime", @@ -1033,7 +1033,7 @@ }, "nlme": { "Package": "nlme", - "Version": "3.1-163", + "Version": "3.1-164", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1043,7 +1043,7 @@ "stats", "utils" ], - "Hash": "8d1938040a05566f4f7a14af4feadd6b" + "Hash": "a623a2239e642806158bc4dc3f51565d" }, "openssl": { "Package": "openssl", diff --git a/intro-r-data-analysis/style.css 
    b/intro-r-data-analysis/style.css index 6e7ace6..9928aff 100644 --- a/intro-r-data-analysis/style.css +++ b/intro-r-data-analysis/style.css @@ -129,4 +129,14 @@ small { .big-picture img{ max-width: 70%; border: 1px solid black !important; -} \ No newline at end of file +} + +/* Change link color to sky blue */ +.reveal a { + color: #0c74dc; +} + +/* Change link color to magenta on hover */ +.reveal a:hover { + color: #9c0366 !important; +}
    