refactor: simplify setup chunk by loading tidyverse and organizing code.

This commit is contained in:
Nick Heppler 2025-05-07 09:50:43 -04:00
parent 62fbbce3f6
commit 75e0c840a4

View File

@@ -23,50 +23,59 @@ geometry: margin=1in
 ---
 ```{r setup, include=FALSE}
+# Global setup
 knitr::opts_chunk$set(
   echo = TRUE,
   message = FALSE,
   warning = FALSE
 )
+set.seed(123) # For reproducibility
-# Load necessary libraries
-#library(dplyr)
-#library(ggplot2)
-library(knitr)
-library(tidyverse)
-library(sf)
-library(tigris)
-library(scales)
-library(RColorBrewer)
-library(viridis)
+# Load tidyverse and additional necessary libraries
+library(lubridate)
+library(RColorBrewer)
+library(scales)
+library(sf)
+library(tidyverse)
+library(tigris)
+library(viridis)
-# Load survey data files from CSV as tibbles.
-survey_data <- read_csv("data/_25_Million_Trees_Initiative_Survey_0.csv")
-location_points <- read_csv("data/location_points_1.csv")
-location_polygons <- read_csv("data/location_polygons_2.csv")
-participant_organizations <- read_csv("data/participant_organizations_3.csv")
-species_planted <- read_csv("data/species_planted_4.csv")
-vendors <- read_csv("data/vendors_5.csv")
+# Define file paths
+survey_path <- "data/_25_Million_Trees_Initiative_Survey_0.csv"
+locations_pt_path <- "data/location_points_1.csv"
+locations_poly_path <- "data/location_polygons_2.csv"
+participants_path <- "data/participant_organizations_3.csv"
+species_path <- "data/species_planted_4.csv"
+vendors_path <- "data/vendors_5.csv"
+# Check for expected files
+stopifnot(file.exists(survey_path))
+stopifnot(file.exists(locations_pt_path))
+stopifnot(file.exists(locations_poly_path))
+stopifnot(file.exists(participants_path))
+stopifnot(file.exists(species_path))
+stopifnot(file.exists(vendors_path))
+# Load survey and related datasets
+survey_data <- read_csv(survey_path)
+location_points <- read_csv(locations_pt_path)
+location_polygons <- read_csv(locations_poly_path)
+participant_organizations <- read_csv(participants_path)
+species_planted <- read_csv(species_path)
+vendors <- read_csv(vendors_path)
-# Transform date stored as character or numeric vectors to POSIXct objects.
+# Convert character dates to POSIXct
 survey_data <- survey_data %>%
   mutate(CreationDate = mdy_hms(CreationDate))
-# Count the records to be excluded (Exclude Result == 1)
-excluded_count <- survey_data %>%
-  filter(`Exclude Result` == 1) %>%
-  nrow()
-# Count the records that are used (Exclude Result == 0)
-used_count <- survey_data %>%
-  filter(`Exclude Result` == 0) %>%
-  nrow()
-# Ignore excluded data.
+# Count and filter records based on exclusion flag
+excluded_count <- survey_data %>% filter(`Exclude Result` == 1) %>% nrow()
+used_count <- survey_data %>% filter(`Exclude Result` == 0) %>% nrow()
 survey_data <- survey_data %>%
   filter(`Exclude Result` == 0)
-# Join the data based on the ParentGlobalID, ensuring all rows from survey_data are retained
+# Join related datasets by GlobalID
 combined_data <- survey_data %>%
   left_join(location_points, by = c("GlobalID" = "ParentGlobalID")) %>%
   left_join(location_polygons, by = c("GlobalID" = "ParentGlobalID")) %>%