Clean and standardize column names across survey data tibbles.

This commit is contained in:
Nick Heppler 2025-05-16 14:34:28 -04:00
parent d40c874921
commit 2f58126369

View File

@ -65,12 +65,25 @@ participant_organizations <- read_csv(participants_path)
species_planted <- read_csv(species_path) species_planted <- read_csv(species_path)
vendors <- read_csv(vendors_path) vendors <- read_csv(vendors_path)
# Strip survey-form markers from a data frame's column names.
#
# Every "(Required)" or "(Optional)" marker, together with any whitespace
# directly in front of it, is deleted from each column name. The remaining
# text is then trimmed of leading and trailing whitespace with str_trim().
#
# df: a data frame or tibble whose column names should be cleaned.
# Returns df with only its column names changed.
clean_column_names <- function(df) {
  marker_pattern <- "\\s*\\(Required\\)|\\s*\\(Optional\\)"
  colnames(df) <- str_trim(gsub(marker_pattern, "", colnames(df)))
  df
}
# Run clean_column_names() over the survey, species, and
# participant-organization tibbles; vendors is not cleaned here.
survey_data <- survey_data %>% clean_column_names()
species_planted <- species_planted %>% clean_column_names()
participant_organizations <- participant_organizations %>% clean_column_names()
# Convert relevant date columns to datetime format and recode planting agency responses to standardized labels # Convert relevant date columns to datetime format and recode planting agency responses to standardized labels
survey_data <- survey_data %>% survey_data <- survey_data %>%
mutate(CreationDate = mdy_hms(CreationDate)) %>% mutate(CreationDate = mdy_hms(CreationDate)) %>%
mutate(`Start Date of Planting (Required)` = mdy_hms(`Start Date of Planting (Required)`)) %>% mutate(`Start Date of Planting` = mdy_hms(`Start Date of Planting`)) %>%
mutate(`End Date of Planting (Required)` = mdy_hms(`End Date of Planting (Required)`)) %>% mutate(`End Date of Planting` = mdy_hms(`End Date of Planting`)) %>%
mutate(`Who Planted The Tree(s)? (Required)` = recode(`Who Planted The Tree(s)? (Required)`, mutate(`Who Planted The Tree(s)?` = recode(`Who Planted The Tree(s)?`,
"agency" = "State Agency", "agency" = "State Agency",
"community" = "Community Organization", "community" = "Community Organization",
"landowner" = "Private Landowner", "landowner" = "Private Landowner",
@ -85,7 +98,7 @@ survey_data <- survey_data %>%
Submitted_Date_Str = if_else( Submitted_Date_Str = if_else(
!is.na(Submitted_Date_Str), !is.na(Submitted_Date_Str),
paste0("20", Submitted_Date_Str), # add "20" prefix to "24-11-07" paste0("20", Submitted_Date_Str),
NA_character_ NA_character_
), ),
@ -119,30 +132,30 @@ subtitle: "`r format(min(survey_data$CreationDate, na.rm = TRUE), "%B %d, %Y")`
## Key Findings ## Key Findings
```{r key-findings-summary} ```{r key-findings-summary}
kf_date_planting_start <- format(min(survey_data$`Start Date of Planting (Required)`, na.rm = TRUE), "%B %d, %Y") kf_date_planting_start <- format(min(survey_data$`Start Date of Planting`, na.rm = TRUE), "%B %d, %Y")
kf_date_planting_end <- format(max(survey_data$`End Date of Planting (Required)`, na.rm = TRUE), "%B %d, %Y") kf_date_planting_end <- format(max(survey_data$`End Date of Planting`, na.rm = TRUE), "%B %d, %Y")
kf_total_trees <- format(sum(survey_data$`Number of Trees Planted (Required)`), big.mark = ",") kf_total_trees <- format(sum(survey_data$`Number of Trees Planted`), big.mark = ",")
kf_region_total_trees_ranked <- survey_data %>% kf_region_total_trees_ranked <- survey_data %>%
group_by(Region) %>% group_by(Region) %>%
summarise(Total_Trees = sum(`Number of Trees Planted (Required)`, na.rm = TRUE)) %>% summarise(Total_Trees = sum(`Number of Trees Planted`, na.rm = TRUE)) %>%
arrange(desc(Total_Trees)) arrange(desc(Total_Trees))
kf_participant_total_trees_ranked <- survey_data %>% kf_participant_total_trees_ranked <- survey_data %>%
group_by(`Who Planted The Tree(s)? (Required)`) %>% group_by(`Who Planted The Tree(s)?`) %>%
summarise(Total_Trees = sum(`Number of Trees Planted (Required)`, na.rm = TRUE)) %>% summarise(Total_Trees = sum(`Number of Trees Planted`, na.rm = TRUE)) %>%
arrange(desc(Total_Trees)) arrange(desc(Total_Trees))
kf_dac_total_trees <- sum(survey_data$`Number of Trees Planted (Required)`[!is.na(survey_data$`Disadvantaged Communities Indicator`)], na.rm = TRUE) kf_dac_total_trees <- sum(survey_data$`Number of Trees Planted`[!is.na(survey_data$`Disadvantaged Communities Indicator`)], na.rm = TRUE)
kf_dac_percent <- (kf_dac_total_trees / sum(survey_data$`Number of Trees Planted (Required)`, na.rm = TRUE)) * 100 kf_dac_percent <- (kf_dac_total_trees / sum(survey_data$`Number of Trees Planted`, na.rm = TRUE)) * 100
kf_dac_percent_display <- round(kf_dac_percent, 1) kf_dac_percent_display <- round(kf_dac_percent, 1)
kf_generic_tree_type_ranked <- species_planted %>% kf_generic_tree_type_ranked <- species_planted %>%
filter(!is.na(`Generic Type of Tree (Optional)`)) %>% filter(!is.na(`Generic Type of Tree`)) %>%
count(`Generic Type of Tree (Optional)`, name = "Survey_Count") %>% count(`Generic Type of Tree`, name = "Survey_Count") %>%
arrange(desc(Survey_Count)) arrange(desc(Survey_Count))
kf_most_common_generic_tree_type <- kf_generic_tree_type_ranked$`Generic Type of Tree (Optional)`[1] kf_most_common_generic_tree_type <- kf_generic_tree_type_ranked$`Generic Type of Tree`[1]
kf_most_common_generic_tree_type_count <- kf_generic_tree_type_ranked$Survey_Count[1] kf_most_common_generic_tree_type_count <- kf_generic_tree_type_ranked$Survey_Count[1]
kf_most_common_generic_tree_type_count_formatted <- format(kf_most_common_generic_tree_type_count, big.mark = ",") kf_most_common_generic_tree_type_count_formatted <- format(kf_most_common_generic_tree_type_count, big.mark = ",")
@ -161,9 +174,9 @@ Between **`r kf_date_planting_start` and `r kf_date_planting_end`**, a total of
These efforts reflect broad collaboration between **municipal governments**, **community organizations**, **private landowners**, and other stakeholders. These efforts reflect broad collaboration between **municipal governments**, **community organizations**, **private landowners**, and other stakeholders.
- **Most Trees Planted**: The highest number of trees were reported in **`r kf_region_total_trees_ranked$Region[1]`**, followed by **`r kf_region_total_trees_ranked$Region[2]`**. - **Most Trees Planted**: The highest number of trees were reported in **`r kf_region_total_trees_ranked$Region[1]`**, followed by **`r kf_region_total_trees_ranked$Region[2]`**.
- **Top Planting Groups**: The most trees, approximately **`r scales::comma(kf_participant_total_trees_ranked$Total_Trees[1])`**, were planted by **`r kf_participant_total_trees_ranked$"Who Planted The Tree(s)? (Required)"[1]`**, followed by **`r kf_participant_total_trees_ranked$"Who Planted The Tree(s)? (Required)"[2]`**, which contributed **`r scales::comma(kf_participant_total_trees_ranked$Total_Trees[2])`** trees. - **Top Planting Groups**: The most trees, approximately **`r scales::comma(kf_participant_total_trees_ranked$Total_Trees[1])`**, were planted by **`r kf_participant_total_trees_ranked$"Who Planted The Tree(s)?"[1]`**, followed by **`r kf_participant_total_trees_ranked$"Who Planted The Tree(s)?"[2]`**, which contributed **`r scales::comma(kf_participant_total_trees_ranked$Total_Trees[2])`** trees.
- **Disadvantaged Communities**: Approximately **`r kf_dac_percent_display`%** of all trees were planted in **Disadvantaged Communities**, as defined by New York State's Climate Act. - **Disadvantaged Communities**: Approximately **`r kf_dac_percent_display`%** of all trees were planted in **Disadvantaged Communities**, as defined by New York State's Climate Act.
- **Most Reported Tree Genus**: **`r kf_most_common_generic_tree_type`** appeared most frequently, reported in **`r kf_most_common_generic_tree_type_count_formatted`** surveys. - **Most Reported Tree**: **`r kf_most_common_generic_tree_type`** appeared most frequently, reported in **`r kf_most_common_generic_tree_type_count_formatted`** surveys.
- The project received data from **`r kf_total_surveys_formatted` unique surveys**, representing **`r kf_unique_counties_formatted` counties** and **`r kf_unique_municipalities_formatted` municipalities**. - The project received data from **`r kf_total_surveys_formatted` unique surveys**, representing **`r kf_unique_counties_formatted` counties** and **`r kf_unique_municipalities_formatted` municipalities**.
These findings help track progress toward equity-centered climate goals, highlight areas of strong participation, and support data-driven planning for future tree planting across the state. These findings help track progress toward equity-centered climate goals, highlight areas of strong participation, and support data-driven planning for future tree planting across the state.
@ -399,8 +412,8 @@ calculate_response_rates <- function(survey_data, fields, caption) {
``` ```
```{r response-rate-table-optional, echo=TRUE, message=FALSE, fig.height=6, fig.width=8} ```{r response-rate-table-optional, echo=TRUE, message=FALSE, fig.height=6, fig.width=8}
fields <- c("Planter Contact Email (Optional)", "Funding Source (Optional)", "Land Ownership (Optional)", fields <- c("Planter Contact Email", "Funding Source", "Land Ownership",
"Tree Size Planted (Optional)", "Source of Trees (Optional)", "Total Number of Species Planted") "Tree Size Planted", "Source of Trees", "Total Number of Species Planted")
calculate_response_rates(survey_data, fields, "Response Rates for Key Survey Questions") calculate_response_rates(survey_data, fields, "Response Rates for Key Survey Questions")
``` ```
@ -461,7 +474,7 @@ create_histogram <- function(data, field, x_labels = NULL, color_palette = c("#1
create_histogram( create_histogram(
survey_data, survey_data,
field = "Who Planted The Tree(s)? (Required)", field = "Who Planted The Tree(s)?",
x_labels = c( x_labels = c(
"agency" = "State Agency", "agency" = "State Agency",
"community" = "Community Organization", "community" = "Community Organization",
@ -524,8 +537,8 @@ create_bar_chart <- function(data, field, sum_field = NULL, x_labels = NULL, col
create_bar_chart( create_bar_chart(
survey_data, survey_data,
field = "Who Planted The Tree(s)? (Required)", field = "Who Planted The Tree(s)?",
sum_field = "Number of Trees Planted (Required)", sum_field = "Number of Trees Planted",
x_labels = c( x_labels = c(
"agency" = "State Agency", "agency" = "State Agency",
"community" = "Community Organization", "community" = "Community Organization",
@ -586,7 +599,7 @@ This table presents a detailed summary of tree planting activity by participant
```{r participant-type-table, echo=TRUE} ```{r participant-type-table, echo=TRUE}
survey_data %>% survey_data %>%
create_summary_table("Who Planted The Tree(s)? (Required)", "Number of Trees Planted (Required)", remove_na = FALSE, table_font_size = 16) create_summary_table("Who Planted The Tree(s)?", "Number of Trees Planted", remove_na = FALSE, table_font_size = 16)
``` ```
## Named User Activity ## Named User Activity
@ -596,7 +609,7 @@ This table breaks down the number of submissions and trees planted by named user
```{r named-user-activity-table} ```{r named-user-activity-table}
survey_data %>% survey_data %>%
mutate(Creator = ifelse(is.na(Creator), "Public User", Creator)) %>% mutate(Creator = ifelse(is.na(Creator), "Public User", Creator)) %>%
create_summary_table("Creator", "Number of Trees Planted (Required)", remove_na = FALSE, table_font_size = 16) create_summary_table("Creator", "Number of Trees Planted", remove_na = FALSE, table_font_size = 16)
``` ```
## Unique E-mail Activity ## Unique E-mail Activity
@ -605,8 +618,8 @@ This table summarizes the planting activity associated with unique email address
```{r unique-email-activity-table} ```{r unique-email-activity-table}
survey_data %>% survey_data %>%
mutate(`Planter Contact Email (Optional)` = ifelse(is.na(`Planter Contact Email (Optional)`), "Not Provided", `Planter Contact Email (Optional)`)) %>% mutate(`Planter Contact Email` = ifelse(is.na(`Planter Contact Email`), "Not Provided", `Planter Contact Email`)) %>%
create_summary_table("Planter Contact Email (Optional)", "Number of Trees Planted (Required)", remove_na = FALSE, table_font_size = 16) create_summary_table("Planter Contact Email", "Number of Trees Planted", remove_na = FALSE, table_font_size = 16)
``` ```
### Municipal Activity ### Municipal Activity
@ -615,13 +628,13 @@ This table presents the number of trees planted by self-reported municipality. I
```{r municipal-activity-table} ```{r municipal-activity-table}
survey_data %>% survey_data %>%
mutate(`Participant Municipality (Optional)` = case_when( mutate(`Participant Municipality` = case_when(
str_starts(`Participant Municipality (Optional)`, "c_") ~ str_replace(`Participant Municipality (Optional)`, "^c_", "") %>% paste0(" (city)"), str_starts(`Participant Municipality`, "c_") ~ str_replace(`Participant Municipality`, "^c_", "") %>% paste0(" (city)"),
str_starts(`Participant Municipality (Optional)`, "v_") ~ str_replace(`Participant Municipality (Optional)`, "^v_", "") %>% paste0(" (village)"), str_starts(`Participant Municipality`, "v_") ~ str_replace(`Participant Municipality`, "^v_", "") %>% paste0(" (village)"),
str_starts(`Participant Municipality (Optional)`, "t_") ~ str_replace(`Participant Municipality (Optional)`, "^t_", "") %>% paste0(" (town)"), str_starts(`Participant Municipality`, "t_") ~ str_replace(`Participant Municipality`, "^t_", "") %>% paste0(" (town)"),
TRUE ~ `Participant Municipality (Optional)` TRUE ~ `Participant Municipality`
)) %>% )) %>%
create_summary_table("Participant Municipality (Optional)", "Number of Trees Planted (Required)", remove_na = FALSE, table_font_size = 16) create_summary_table("Participant Municipality", "Number of Trees Planted", remove_na = FALSE, table_font_size = 16)
``` ```
@ -632,15 +645,15 @@ This table highlights planting contributions by named organizations, either sele
```{r organization-activity-table} ```{r organization-activity-table}
survey_data %>% survey_data %>%
inner_join(participant_organizations, by = c("GlobalID" = "ParentGlobalID")) %>% inner_join(participant_organizations, by = c("GlobalID" = "ParentGlobalID")) %>%
filter(!(is.na(`Participant Organization (Optional)`) & is.na(`Other (Optional)`))) %>% filter(!(is.na(`Participant Organization`) & is.na(`Other`))) %>%
filter(!(tolower(`Participant Organization (Optional)`) == "other" & is.na(`Other (Optional)`))) %>% filter(!(tolower(`Participant Organization`) == "other" & is.na(`Other`))) %>%
mutate(`Participant Organization (Optional)` = ifelse( mutate(`Participant Organization` = ifelse(
tolower(`Participant Organization (Optional)`) == "other" & !is.na(`Other (Optional)`), tolower(`Participant Organization`) == "other" & !is.na(`Other`),
`Other (Optional)`, `Other`,
`Participant Organization (Optional)` `Participant Organization`
)) %>% )) %>%
mutate(`Participant Organization (Optional)` = str_replace_all(`Participant Organization (Optional)`, "_", " ")) %>% mutate(`Participant Organization` = str_replace_all(`Participant Organization`, "_", " ")) %>%
create_summary_table("Participant Organization (Optional)", "Number of Trees Planted (Required)", remove_na = FALSE, table_font_size = 16) create_summary_table("Participant Organization", "Number of Trees Planted", remove_na = FALSE, table_font_size = 16)
``` ```
# Location Analysis {.tabset} # Location Analysis {.tabset}
@ -700,7 +713,7 @@ Use this map to identify which regions are leading in planting activity, and whe
```{r create-region-choropleth-map, echo=TRUE, message=FALSE, fig.height=6, fig.width=8} ```{r create-region-choropleth-map, echo=TRUE, message=FALSE, fig.height=6, fig.width=8}
survey_data_aggregated <- survey_data %>% survey_data_aggregated <- survey_data %>%
group_by(Region) %>% group_by(Region) %>%
summarise(total_trees = sum(`Number of Trees Planted (Required)`, na.rm = TRUE)) summarise(total_trees = sum(`Number of Trees Planted`, na.rm = TRUE))
shapefile_path <- "/home/nick/gitea/tree-tracker-report/data/redc/redc.shp" shapefile_path <- "/home/nick/gitea/tree-tracker-report/data/redc/redc.shp"
@ -732,7 +745,7 @@ plot_geographic_data(joined_data = survey_data_joined,
The table below breaks down the total number of trees planted by region. It also shows each region's percentage contribution to overall planting activity across New York State. The table below breaks down the total number of trees planted by region. It also shows each region's percentage contribution to overall planting activity across New York State.
```{r create-summary-table-region, echo=TRUE, message=FALSE, fig.height=6, fig.width=8} ```{r create-summary-table-region, echo=TRUE, message=FALSE, fig.height=6, fig.width=8}
create_summary_table(survey_data, "Region", "Number of Trees Planted (Required)", remove_na = FALSE, table_font_size = 16) create_summary_table(survey_data, "Region", "Number of Trees Planted", remove_na = FALSE, table_font_size = 16)
``` ```
## By County ## By County
@ -744,7 +757,7 @@ This visual helps uncover local patterns within regions, and may guide localized
```{r create-county-choropleth-map, echo=TRUE, message=FALSE, fig.height=6, fig.width=8} ```{r create-county-choropleth-map, echo=TRUE, message=FALSE, fig.height=6, fig.width=8}
survey_data_aggregated <- survey_data %>% survey_data_aggregated <- survey_data %>%
group_by(County) %>% group_by(County) %>%
summarise(total_trees = sum(`Number of Trees Planted (Required)`, na.rm = TRUE)) summarise(total_trees = sum(`Number of Trees Planted`, na.rm = TRUE))
geographic_data <- counties(state = "NY", cb = TRUE, progress = FALSE) %>% geographic_data <- counties(state = "NY", cb = TRUE, progress = FALSE) %>%
st_as_sf() %>% st_as_sf() %>%
@ -769,7 +782,7 @@ plot_geographic_data(joined_data = survey_data_joined,
This table provides a detailed breakdown of trees planted by county. Use it alongside the map to validate trends or investigate specific areas. This table provides a detailed breakdown of trees planted by county. Use it alongside the map to validate trends or investigate specific areas.
```{r create-summary-table-county, echo=TRUE, message=FALSE, fig.height=6, fig.width=8} ```{r create-summary-table-county, echo=TRUE, message=FALSE, fig.height=6, fig.width=8}
create_summary_table(survey_data, "County", "Number of Trees Planted (Required)", remove_na = FALSE, table_font_size = 16) create_summary_table(survey_data, "County", "Number of Trees Planted", remove_na = FALSE, table_font_size = 16)
``` ```
# Tree Analysis {.tabset} # Tree Analysis {.tabset}
@ -847,7 +860,7 @@ This table summarizes the number and percentage of surveys by **tree genus**. It
* **"Not Provided"**: Includes submissions where the genus was not specified. * **"Not Provided"**: Includes submissions where the genus was not specified.
```{r create-summary-table-genus, echo=TRUE, message=FALSE, fig.height=6, fig.width=8} ```{r create-summary-table-genus, echo=TRUE, message=FALSE, fig.height=6, fig.width=8}
create_species_summary_table(species_planted, "Generic Type of Tree (Optional)", "Tree Genus") create_species_summary_table(species_planted, "Generic Type of Tree", "Tree Genus")
``` ```
--- ---
@ -861,7 +874,7 @@ This table provides a breakdown of survey submissions by **tree species**. It of
* **"Not Provided"**: Surveys that omitted species details. * **"Not Provided"**: Surveys that omitted species details.
```{r create-summary-table-species, echo=TRUE, message=FALSE, fig.height=6, fig.width=8} ```{r create-summary-table-species, echo=TRUE, message=FALSE, fig.height=6, fig.width=8}
create_species_summary_table(species_planted, "Tree Species (Optional)", "Tree Species") create_species_summary_table(species_planted, "Tree Species", "Tree Species")
``` ```
# Disadvantaged Communities {.tabset} # Disadvantaged Communities {.tabset}
@ -883,7 +896,7 @@ This table presents the total number of trees planted within DACs, grouped by Ne
```{r create-summary-table-region-dac, echo=TRUE, message=FALSE, fig.height=6, fig.width=8} ```{r create-summary-table-region-dac, echo=TRUE, message=FALSE, fig.height=6, fig.width=8}
survey_data %>% survey_data %>%
filter(!is.na(`Disadvantaged Communities Indicator`)) %>% filter(!is.na(`Disadvantaged Communities Indicator`)) %>%
create_summary_table("Region", "Number of Trees Planted (Required)", remove_na = FALSE, table_font_size = 16) create_summary_table("Region", "Number of Trees Planted", remove_na = FALSE, table_font_size = 16)
``` ```
--- ---
@ -895,7 +908,7 @@ This table summarizes tree planting within DACs by **county**. It provides a mor
```{r create-summary-table-county-dac, echo=TRUE, message=FALSE, fig.height=6, fig.width=8} ```{r create-summary-table-county-dac, echo=TRUE, message=FALSE, fig.height=6, fig.width=8}
survey_data %>% survey_data %>%
filter(!is.na(`Disadvantaged Communities Indicator`)) %>% filter(!is.na(`Disadvantaged Communities Indicator`)) %>%
create_summary_table("County", "Number of Trees Planted (Required)", remove_na = FALSE, table_font_size = 16) create_summary_table("County", "Number of Trees Planted", remove_na = FALSE, table_font_size = 16)
``` ```
--- ---
@ -907,5 +920,5 @@ This table breaks down the number of trees planted within DACs by **municipality
```{r create-summary-table-county-municipality, echo=TRUE, message=FALSE, fig.height=6, fig.width=8} ```{r create-summary-table-county-municipality, echo=TRUE, message=FALSE, fig.height=6, fig.width=8}
survey_data %>% survey_data %>%
filter(!is.na(`Disadvantaged Communities Indicator`)) %>% filter(!is.na(`Disadvantaged Communities Indicator`)) %>%
create_summary_table("Municipality", "Number of Trees Planted (Required)", remove_na = FALSE, table_font_size = 16) create_summary_table("Municipality", "Number of Trees Planted", remove_na = FALSE, table_font_size = 16)
``` ```