refactor: simplify setup chunk by loading tidyverse and organizing code.

2025-05-07 09:50:43 -04:00 · 2025-05-07 09:50:43 -04:00 · 75e0c840a4
commit 75e0c840a4
parent 62fbbce3f6
1 changed files with 39 additions and 30 deletions
--- a/report.Rmd
+++ b/report.Rmd
@ -23,50 +23,59 @@ geometry: margin=1in
 ---

 ```{r setup, include=FALSE}
+# Global setup
 knitr::opts_chunk$set(
-	echo = TRUE,
-	message = FALSE,
-	warning = FALSE
+  echo = TRUE,
+  message = FALSE,
+  warning = FALSE
 )
-# Load necessary libraries
-#library(dplyr)
-#library(ggplot2)
-library(knitr)
-library(lubridate)
-library(RColorBrewer)
-library(scales)
-library(sf)
+
+set.seed(123)  # For reproducibility
+
+# Load tidyverse (>= 2.0.0 attaches lubridate, needed for mdy_hms) and other libraries
 library(tidyverse)
+library(sf)
 library(tigris)
+library(scales)
+library(RColorBrewer)
 library(viridis)

-# Load survey data files from CSV as tibbles.
-survey_data <- read_csv("data/_25_Million_Trees_Initiative_Survey_0.csv")
-location_points <- read_csv("data/location_points_1.csv")
-location_polygons <- read_csv("data/location_polygons_2.csv")
-participant_organizations <- read_csv("data/participant_organizations_3.csv")
-species_planted <- read_csv("data/species_planted_4.csv")
-vendors <- read_csv("data/vendors_5.csv")
+# Define file paths
+survey_path <- "data/_25_Million_Trees_Initiative_Survey_0.csv"
+locations_pt_path <- "data/location_points_1.csv"
+locations_poly_path <- "data/location_polygons_2.csv"
+participants_path <- "data/participant_organizations_3.csv"
+species_path <- "data/species_planted_4.csv"
+vendors_path <- "data/vendors_5.csv"

-# Transform date stored as character or numeric vectors to POSIXct objects.
+# Check for expected files
+stopifnot(file.exists(survey_path))
+stopifnot(file.exists(locations_pt_path))
+stopifnot(file.exists(locations_poly_path))
+stopifnot(file.exists(participants_path))
+stopifnot(file.exists(species_path))
+stopifnot(file.exists(vendors_path))
+
+# Load survey and related datasets
+survey_data <- read_csv(survey_path)
+location_points <- read_csv(locations_pt_path)
+location_polygons <- read_csv(locations_poly_path)
+participant_organizations <- read_csv(participants_path)
+species_planted <- read_csv(species_path)
+vendors <- read_csv(vendors_path)
+
+# Convert character dates to POSIXct
 survey_data <- survey_data %>%
  mutate(CreationDate = mdy_hms(CreationDate))

-# Count the records to be excluded (Exclude Result == 1)
-excluded_count <- survey_data %>%
-  filter(`Exclude Result` == 1) %>%
-  nrow()
+# Count excluded (`Exclude Result` == 1) and used (== 0) records, then keep only the used ones
+excluded_count <- survey_data %>% filter(`Exclude Result` == 1) %>% nrow()
+used_count <- survey_data %>% filter(`Exclude Result` == 0) %>% nrow()

-# Count the records that are used (Exclude Result == 0)
-used_count <- survey_data %>%
-  filter(`Exclude Result` == 0) %>%
-  nrow()
-
-# Ignore excluded data.
 survey_data <- survey_data %>%
  filter(`Exclude Result` == 0)

-# Join the data based on the ParentGlobalID, ensuring all rows from survey_data are retained
+# Left-join child tables onto survey_data (GlobalID = ParentGlobalID), keeping every survey row
 combined_data <- survey_data %>%
  left_join(location_points, by = c("GlobalID" = "ParentGlobalID")) %>%
  left_join(location_polygons, by = c("GlobalID" = "ParentGlobalID")) %>%