Refactor report

Add RProject files to ignore.
2025-02-14 14:00:19 -05:00 · 2025-02-14 08:38:28 -05:00
2 changed files with 180 additions and 197 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,3 @@
 data/
 report.html
 .Rproj.user
--- a/report.Rmd
+++ b/report.Rmd
@ -98,84 +98,103 @@ By applying these validation checks, the integrity and consistency of the data i
 ## Submission Analysis {.tabset}
-### Submission Trend Analysis
+### Submissions by Day of Week
 The histogram presented below visualizes the number of survey submissions based on the day of the week. Each bar represents the frequency of submissions for a particular day, with the x-axis displaying the days (Monday through Sunday) and the y-axis showing the number of submissions for each corresponding day.
-```{r submission-trend-stats, echo=FALSE, message=FALSE}
+This chart helps identify any trends in survey participation, such as whether submissions are more frequent at the beginning or end of the week. This could be valuable for understanding user behavior and improving survey timing or outreach strategies.
-## library(dplyr)
+
 ```{r submission-histogram-survey-submissions-day-of-week, echo=FALSE, message=FALSE, fig.height=6, fig.width=8}
 library(dplyr)
 library(ggplot2)
 # Bar chart of the number of submissions per weekday, Monday through Sunday.
 # Expects 'survey_data' (created earlier in the report) to have a Date or
 # date-time 'CreationDate' column.
 #
 # weekdays() returns day names in the session locale, so matching them against
 # English factor levels turns every value into NA on a non-English system.
 # format(x, "%u") yields the ISO weekday number (1 = Monday ... 7 = Sunday)
 # regardless of locale, so it is used to index fixed English labels instead.
 day_labels <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
 survey_data %>%
   mutate(
     DayOfWeek = factor(
       day_labels[as.integer(format(CreationDate, "%u"))],
       levels = day_labels  # Fixes the Monday-to-Sunday order on the x-axis
     )
   ) %>%
   ggplot(aes(x = DayOfWeek)) +
   geom_bar(stat = "count") +  # One bar per weekday, height = number of rows
   geom_text(aes(label = after_stat(count)), stat = "count", vjust = -0.25) +  # Count labels just above the bars
   xlab("Day of the Week") +
   ylab("Number of Submissions") +
   theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Angle labels for better readability
 ```
 ```{r func-plot_submission_trends, echo=FALSE}
 # Load necessary libraries
 library(tidyverse)
 # Custom color palette
 custom_palette <- c(
  "#233f28", # primary
  "#7e9084", # secondary
  "#d9e1dd", # tertiary
  "#face00"  # accent
 )
 # Ensure CreationDate is in Date format
 survey_data$CreationDate <- as.Date(survey_data$CreationDate)
-# Summarize the data to calculate the total number of submissions by CreationDate
+# Plot daily submission counts for the last `days_ago` days (default 30) as a line with points plus a dashed loess trend; reads `custom_palette` from the enclosing environment
-summary_data <- survey_data %>%
+plot_submission_trends <- function(data, days_ago = 30) {
-  filter(`Exclude Result` == 0) %>%
+  
-  group_by(CreationDate) %>%
+  # Calculate the start date (days_ago days before today)
-  summarise(total_submissions = n(), .groups = "drop")
+  start_date <- Sys.Date() - days_ago
-
+  
-# Number of days that have elapsed between the first and last submission date
+  # Keep rows dated on or after start_date (no upper bound is applied), then count rows per date
-date_range <- range(summary_data$CreationDate)
+  submission_trends <- data %>%
-elapsed_days <- as.integer(difftime(date_range[2], date_range[1], units = "days"))
+    filter(CreationDate >= start_date) %>%
-
+    group_by(CreationDate) %>%
-# Number of days with 0 submissions
+    summarize(submissions = n())
-all_dates <- data.frame(CreationDate = seq.Date(date_range[1], date_range[2], by = "day"))
+  
-merged_data <- left_join(all_dates, summary_data, by = "CreationDate")
+  # Create the plot
-days_with_0_submissions <- sum(is.na(merged_data$total_submissions))
+  ggplot(submission_trends, aes(x = CreationDate, y = submissions)) +
-
+    geom_line(color = custom_palette[1], linewidth = 1) +  # Line color from palette
-# Summary statistics based on the count of submissions
+    geom_point(color = custom_palette[1], size = 3, shape = 16) +  # Points for visibility
-submission_summary <- summary(merged_data$total_submissions, na.rm = TRUE)
+    labs(
-
+      title = "Survey Submission Trends by Date",
-# Dates where submissions exceeded the 3rd quartile
+      subtitle = paste("Tracking submissions for the last", days_ago, "days"),
-third_quartile <- quantile(merged_data$total_submissions, 0.75, na.rm = TRUE)
+      x = "Submission Date",
-dates_above_3rd_quartile <- merged_data %>%
+      y = "Number of Submissions"
-  filter(total_submissions > third_quartile) %>%
+    ) +
-  pull(CreationDate)
+    theme_minimal() +
-
+    theme(
      plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
      plot.subtitle = element_text(hjust = 0.5, size = 12, color = "grey40"),
      axis.title.x = element_text(color = "black", size = 12),
      axis.title.y = element_text(color = "black", size = 12),
      axis.text = element_text(color = "black", size = 10),
      panel.grid.major = element_line(color = "grey90"),
      panel.grid.minor = element_blank(),
      axis.text.x = element_text(angle = 45, hjust = 1)  # Rotate x-axis labels
    ) +
    # Add a smoothed trend line (loess)
    geom_smooth(method = "loess", color = custom_palette[4], linewidth = 1, linetype = "dashed")
 }
 ```
-The survey has been active for **`r elapsed_days`** days. During this period, **`r days_with_0_submissions`** days had no submissions.
+### 30 Day Trend
 The plot below visualizes the survey submission trends for the past 30 days. It shows the number of submissions made each day, highlighting variations over the last month. This type of plot is helpful for understanding trends in user activity, such as identifying peak submission days, periods of low activity, or gradual changes over time.
-The following visualization illustrates the trend in the total number of submissions throughout the survey period, providing insights into any patterns or changes in submission activity.
+The data used for this plot is filtered to include only submissions made in the last 30 days, with the submission count for each date represented by both the line and the points on the graph. A smoothed trend line (dashed) has been added to help visualize the overall submission pattern over this period.
 ```{r submission-trend-plot, echo=FALSE, message=FALSE, fig.height=6, fig.width=8}
 #library(ggplot2)
 # Plot Submission Trend
 ggplot(summary_data, aes(x = CreationDate, y = total_submissions)) + 
  geom_line(color = "#233f28", linewidth = 1) +
  geom_point(color = "#7e9084", size = 3) +
  geom_smooth(method = "loess", color = "#face00", linewidth = 1, linetype = "dashed") +
  labs(
    title = "Total Number of Submissions by Date",
    x = "Submission Date",
    y = "Total Number of Submissions"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(size = 16, face = "bold", color = "#233f28"),
    axis.title = element_text(size = 12, color = "#233f28"),
    axis.text = element_text(size = 10, color = "#233f28"),  
    plot.margin = margin(10, 10, 10, 10),
    panel.grid.major = element_line(color = "#d9e1dd", linewidth = 0.3),
    panel.background = element_rect(fill = "#d9e1dd"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "1 months")
 ```{r plot-submission-trends-30d, echo=FALSE, message=FALSE, fig.height=6, fig.width=8}
 plot_submission_trends(survey_data, days_ago = 30)
 ```
-### Survey Response Rates by Field
+### 90 Day Trend
-The table below shows the response rates for a selection of optional fields within the survey. Each field represents a different aspect of the survey, and the response rate reflects the percentage of respondents who provided valid answers for each field.
+The plot below visualizes the survey submission trends for the past 90 days. It shows the number of submissions made each day, highlighting variations over the last three months. This type of plot is helpful for understanding trends in user activity, such as identifying peak submission days, periods of low activity, or gradual changes over time.
- **Planter Contact Email**: The percentage of respondents who provided their email address.
+The data used for this plot is filtered to include only submissions made in the last 90 days, with the submission count for each date represented by both the line and the points on the graph. A smoothed trend line (dashed) has been added to help visualize the overall submission pattern over this period.
 - **Funding Source**: The percentage of respondents who identified their funding source.
 - **Land Ownership**: The percentage of respondents who indicated their land ownership status.
 - **Tree Size Planted**: The percentage of respondents who specified the size of trees they planted.
 - **Source of Trees**: The percentage of respondents who reported the source of the trees they planted.
 - **Species Planted**: The percentage of respondents who provided the species of tree(s) they planted.
-This breakdown helps identify which survey fields received higher levels of engagement, and which may require further clarification or encouragement to improve response rates.
+```{r plot-submission-trends-90d, echo=FALSE, message=FALSE}
 plot_submission_trends(survey_data, days_ago = 90)
 ```
-```{r response-rate, echo=FALSE, message=FALSE}
+### Response Rates to Top-Level Optional Questions
 The table below summarizes the response rates for optional key top-level questions in the survey. These are the questions that all participants are asked, with some triggering additional follow-up questions based on responses. The response rate is the percentage of participants who provided an answer for each question.
 The "Total Number of Species Planted" question has special handling—only responses greater than 0 are considered valid, whereas for other questions, any non-NA value counts as a response.
 ```{r optonal-top-level-question-response-rate-table, echo=FALSE, message=FALSE, fig.height=6, fig.width=8}
 # List of fields to check for response rates, with special handling for 'Total Number of Species Planted'
 fields <- c("Planter Contact Email", "Funding Source", "Land Ownership", 
            "Tree Size Planted", "Source of Trees", "Total Number of Species Planted")
@ -197,27 +216,47 @@ response_rates_rounded <- round(response_rates, 2)
 # Sort the response rates in descending order (highest to lowest)
 sorted_response_rates <- sort(response_rates_rounded, decreasing = TRUE)
-# Print the sorted, rounded response rates
+# Create a clean data frame with the field names and their response rates
-sorted_response_rates
+response_rate_table <- data.frame(
  "Field" = names(sorted_response_rates),
  "Response Rate (%)" = sorted_response_rates,
  stringsAsFactors = FALSE  # Ensure the "Field" column is treated as character, not factor
 )
 # Remove the row names (the extra column that appears as a result of conversion)
 rownames(response_rate_table) <- NULL
 # Fix column names to ensure proper headers
 colnames(response_rate_table) <- c("Field", "Response Rate (%)")
 # Display the table using kable for better formatting
 library(knitr)
 kable(response_rate_table, caption = "Response Rates for Key Survey Questions", align = "l")
 ```
-## Participant Type Analysis {.tabset}
+The following provides additional context for each survey question/field, detailing what the percentage represents.
-### Number of Submissions
+- **Planter Contact Email**: The percentage of respondents who provided their email address.
-The first visualization shows the distribution of the number of tree planting surveys based on the participant type. This breakdown helps highlight which groups are contributing most to the tree planting initiative.
+- **Funding Source**: The percentage of respondents who identified their funding source.
 - **Land Ownership**: The percentage of respondents who indicated their land ownership status.
 - **Tree Size Planted**: The percentage of respondents who specified the size of trees they planted.
 - **Source of Trees**: The percentage of respondents who reported the source of the trees they planted.
 - **Total Number of Species Planted**: The percentage of respondents who provided the species of tree(s) they planted (only values greater than 0 count as a response).
-```{r participant-type-surveys, echo=FALSE, message=FALSE}
+## Participant Analysis {.tabset}
-#library(ggplot2)
+The following section contains an analysis of tree planting by participant type.
 #library(dplyr)
 ### Submissions
 The following plot shows the distribution of survey submissions based on participant type. This breakdown highlights the contributions of each participant group to the tree planting initiative.
 ```{r participant-type-surveys, echo=FALSE, message=FALSE, fig.height=6, fig.width=8}
 ggplot(survey_data, aes(x = `Who Planted The Tree(s)?`)) + 
  geom_bar(fill = "#233f28", color = "#7e9084") +
  geom_text(stat = "count", aes(label = scales::comma(after_stat(count))), 
            position = position_stack(vjust = 0.5),  # Places text in the middle of the bars
-            color = "#ffffff", size = 5) +  # Adjust label size
+            color = "#face00", size = 5) +  # Use accent color for text labels
  labs(
-    title = "Number of Tree Planting Submissions by Participant Type",
+    title = "Distribution of Tree Planting Submissions by Participant Type",
    x = "Participant Type",
    y = "Number of Submissions"
  ) +
@ -241,28 +280,22 @@ ggplot(survey_data, aes(x = `Who Planted The Tree(s)?`)) +
 ```
-### Total Trees Planted
+### Trees Planted
-This second plot provides a breakdown of the total number of trees planted by participant type. This visualization helps to assess the contribution of each participant group to the overall impact of the tree planting program.
+This plot visualizes the total number of trees planted by each participant type, helping to evaluate the overall impact of different groups in the tree planting program.
-```{r participant-type-planted, echo=FALSE, message=FALSE}
+```{r participant-type-planted, echo=FALSE, message=FALSE, fig.height=6, fig.width=8}
 library(ggplot2)
 library(dplyr)
 summary_data <- survey_data %>%
  group_by(`Who Planted The Tree(s)?`) %>%
  summarise(total_trees = sum(`Number of Trees Planted`, na.rm = TRUE))
 library(ggplot2)
 library(dplyr)
 # Assuming 'summary_data' is already defined
 ggplot(summary_data, aes(x = `Who Planted The Tree(s)?`, y = total_trees)) + 
  geom_bar(stat = "identity", fill = "#233f28", color = "#7e9084") +
  geom_text(aes(label = scales::comma(total_trees)), 
            position = position_stack(vjust = 0.5),  # Places text in the middle of the bars
-            color = "#ffffff", size = 5) +  # Accent color for text labels
+            color = "#face00", size = 5) +  # Accent color for text labels
  labs(
-    title = "Total Number of Trees Planted by Participant Type",
+    title = "Contribution of Each Participant Type to Total Trees Planted",
    x = "Participant Type",
    y = "Total Number of Trees Planted"
  ) +
@ -285,7 +318,9 @@ ggplot(summary_data, aes(x = `Who Planted The Tree(s)?`, y = total_trees)) +
  )
 ```
-```{r participant-type-table, echo=FALSE, message=FALSE}
+The following table provides a breakdown of the total number of trees planted by participant type. It shows both the total number of trees planted by each group and their proportional contribution to the overall planting efforts. This information helps assess which participant types have contributed the most to the tree planting program.
 ```{r participant-type-table, echo=FALSE, message=FALSE, fig.height=6, fig.width=8}
 # Summarize the data to calculate the total number of trees planted by participant type
 summary_data <- survey_data %>%
  group_by(`Who Planted The Tree(s)?`) %>%
@ -312,127 +347,74 @@ summary_data_formatted <- summary_data %>%
    percentage = paste0(round(percentage, 1), "%")  # Round percentage and append '%'
  )
 # Print the table
 summary_data_formatted %>%
  knitr::kable(col.names = c("Participant Type", "Total Trees Planted", "Percentage of Total Trees"),
-               caption = "Total Number of Trees Planted by Participant Type and their Proportional Contribution") %>%
+               caption = "Breakdown of Total Trees Planted by Participant Type and Their Contribution to the Overall Tree Planting Effort",
-  kableExtra::kable_styling(full_width = F, position = "center", bootstrap_options = c("striped", "hover"))
+               align = c("l", "c", "c")) %>%  # Align Participant Type left, and others center
-```
+  kableExtra::kable_styling(
-
+    full_width = F, 
-
+    position = "center", 
-## Region Overview
+    bootstrap_options = c("striped", "hover"),
-This section provides an overview of the regions involved in the tree planting survey and their responses.
+    font_size = 14,
-
+    fixed_thead = TRUE
 In the table below, we aggregate plantings by Region. The results are provided in descending order of Total Trees Planted.
 ```{r region-summary, echo=FALSE, warning=FALSE, message=FALSE}
 # Summarize the data by Region
 region_summary_data <- survey_data %>%
  group_by(Region) %>%
  summarise(
    total_records = n(),  # Count the number of records in each region
    total_trees_planted = sum(`Number of Trees Planted`, na.rm = TRUE),  # Sum of trees planted in each region
    mean_trees_planted = mean(`Number of Trees Planted`, na.rm = TRUE),  # Mean number of trees planted
    median_trees_planted = median(`Number of Trees Planted`, na.rm = TRUE)  # Median number of trees planted
  ) %>%
-  arrange(desc(total_trees_planted))  # Sort by total trees planted in descending order
+  kableExtra::column_spec(1, width = "20em", bold = TRUE) %>%  # Participant Type column bold and wider
-
+  kableExtra::column_spec(2, width = "12em", color = "black") %>%  # Total Trees column
-# Format the table to display the total number of records and trees planted
+  kableExtra::column_spec(3, width = "12em", color = "black") %>%  # Percentage column
-region_summary_data_formatted <- region_summary_data %>%
+  kableExtra::add_footnote("Total number of trees and percentage represent each participant's contribution to the overall tree planting effort.")
  mutate(
    total_trees_planted = scales::comma(total_trees_planted),  # Add commas to the total number of trees
    total_records = scales::comma(total_records),  # Add commas to the total number of records
    mean_trees_planted = round(mean_trees_planted, 1),  # Round mean for readability
    median_trees_planted = round(median_trees_planted, 1)  # Round median for readability
  )
 # Print the summary table
 region_summary_data_formatted %>%
  knitr::kable(col.names = c("Region", "Total Submissions", "Total Trees Planted", "Mean", "Median"),
               caption = "Total Records, Trees Planted, Mean, and Median by Region (Sorted by Trees Planted)") %>%
  kableExtra::kable_styling(full_width = F, position = "center", bootstrap_options = c("striped", "hover"))
 ```
 ## County Overview
 This section provides an overview of the counties involved in the tree planting survey and their responses.
 In the table below, we aggregate plantings by County. The results are provided in descending order of Total Trees Planted.
 ```{r county-summary, echo=FALSE, warning=FALSE, message=FALSE}
 # Per-county summary of the survey: number of submissions plus the total,
 # mean and median of `Number of Trees Planted`. Reads 'survey_data' from
 # earlier in the report and renders an HTML table via kable/kableExtra.
 county_summary_data <- survey_data %>%
   group_by(County) %>%
   summarise(
     total_records = n(),  # Count the number of records in each county
     total_trees_planted = sum(`Number of Trees Planted`, na.rm = TRUE),  # Sum of trees planted in each county
     mean_trees_planted = mean(`Number of Trees Planted`, na.rm = TRUE),  # Mean number of trees planted
     median_trees_planted = median(`Number of Trees Planted`, na.rm = TRUE)  # Median number of trees planted
   ) %>%
   arrange(desc(total_trees_planted))  # Sort by total trees planted in descending order
 # Format for display only after sorting: scales::comma() returns character
 # values, which would no longer sort numerically.
 county_summary_data_formatted <- county_summary_data %>%
   mutate(
     total_trees_planted = scales::comma(total_trees_planted),  # Add commas to the total number of trees
     total_records = scales::comma(total_records),  # Add commas to the total number of records
     mean_trees_planted = round(mean_trees_planted, 1),  # Round mean for readability
     median_trees_planted = round(median_trees_planted, 1)  # Round median for readability
   )
 # Print the summary table (FALSE spelled out: `F` is an ordinary variable that can be reassigned)
 county_summary_data_formatted %>%
   knitr::kable(col.names = c("County", "Total Submissions", "Total Trees Planted", "Mean", "Median"),
                caption = "Total Records, Trees Planted, Mean, and Median by County (Sorted by Trees Planted)") %>%
   kableExtra::kable_styling(full_width = FALSE, position = "center", bootstrap_options = c("striped", "hover"))
 ```
-## Species Overview
+## Location Analysis {.tabset}
 The following section contains details on species plantings. These results indicate the number of occurrences where the tree species was planted. They are not necessarily the number of those trees planted, but can be used to indicate popularity.
-```{r species-detail, echo=FALSE, message=FALSE}
+```{r func-create_summary_table, echo=FALSE}
-#library(tidyverse)
+create_summary_table <- function(data, field) {
-# Count unique values in 'Generic.Species.of.Tree' and 'Precise.Species.of.Tree', handling NA and sorting
+  # Summarize submissions and trees planted per distinct value of the column named by `field` (a string)
-generic_species_count <- species_data %>%
+  summary_data <- data %>%
-  count(`Generic.Species.of.Tree`) %>%
+    group_by(!!sym(field)) %>%  # Dynamically use the provided field name
-  mutate(
+    summarise(
-    `Generic.Species.of.Tree` = if_else(is.na(`Generic.Species.of.Tree`), "Null Response", `Generic.Species.of.Tree`),
+      submissions = n(),  # Count of submissions
-    `Generic.Species.of.Tree` = str_replace_all(`Generic.Species.of.Tree`, "_", " "), # Replace underscores with spaces
+      total_trees = sum(`Number of Trees Planted`, na.rm = TRUE)  # Sum of trees planted
-    `Generic.Species.of.Tree` = str_to_title(`Generic.Species.of.Tree`) # Convert to Title Case
+    ) %>%
-  ) %>%
+    mutate(
-  arrange(desc(n)) # Sort by count in descending order
+      submissions_percentage = submissions / sum(submissions) * 100,  # Proportion of submissions
      trees_percentage = total_trees / sum(total_trees) * 100  # Proportion of trees planted
    )
-precise_species_count <- species_data %>%
+  # Format the table to display commas for the totals and round percentages
-  count(`Precise.Species.of.Tree`) %>%
+  summary_data_formatted <- summary_data %>%
-  mutate(
+    mutate(
-    `Precise.Species.of.Tree` = if_else(is.na(`Precise.Species.of.Tree`), "Null Response", `Precise.Species.of.Tree`),
+      submissions = scales::comma(submissions),
-    `Precise.Species.of.Tree` = str_replace_all(`Precise.Species.of.Tree`, "_", " "), # Replace underscores with spaces
+      total_trees = scales::comma(total_trees),
-    `Precise.Species.of.Tree` = str_to_title(`Precise.Species.of.Tree`) # Convert to Title Case
+      submissions_percentage = paste0(round(submissions_percentage, 1), "%"),
-  ) %>%
+      trees_percentage = paste0(round(trees_percentage, 1), "%")
-  arrange(desc(n)) # Sort by count in descending order
+    )
-# Print the results
+  # Create and style the table
-print(generic_species_count)
+  summary_data_formatted %>%
-print(precise_species_count)
+    knitr::kable(col.names = c(field, "Number of Submissions", "Number of Trees Planted", "Proportion of Submissions (%)", "Proportion of Trees Planted (%)"),
                 caption = paste("Summary of Submissions and Trees Planted by", field),
                 align = c("l", "c", "c", "c", "c")) %>%
    kableExtra::kable_styling(
      full_width = F,
      position = "center",
      bootstrap_options = c("striped", "hover"),
      font_size = 14
    ) %>%
    kableExtra::column_spec(1, width = "20em", bold = TRUE) %>%  # First column bold and wider
    kableExtra::column_spec(2, width = "12em") %>%  # Number of Submissions column
    kableExtra::column_spec(3, width = "12em") %>%  # Number of Trees Planted column (columns 4-5 get no explicit width)
    kableExtra::add_footnote("The proportions represent the percentage of submissions and trees planted for each category relative to the overall dataset.")
 }
 ```
-## Tree Count
+### By Region
-In this section, we present summary statistics for the number of trees planted by all participants in various tree planting surveys.
+```{r create-summary-table-region, echo=FALSE, message=FALSE}
-
+create_summary_table(survey_data, "Region")
 ```{r summary-stats, echo=FALSE, warning=FALSE, message=FALSE}
 # Calculate summary statistics
 summary_stats <- summary(survey_data$`Number of Trees Planted`, na.rm = TRUE)
 ```
-Below is a summary of the `Number of Trees Planted` across participants:
+### By County
-
+```{r create-summary-table-county, echo=FALSE, message=FALSE}
-| Statistic   | Value       |
+create_summary_table(survey_data, "County")
-|-------------|-------------|
+```
 | Min         | `r summary_stats["Min"]`  |
 | 1st Qu.     | `r summary_stats["1st Qu."]` |
 | Median      | `r summary_stats["Median"]` |
 | Mean        | `r summary_stats["Mean"]` |
 | 3rd Qu.     | `r summary_stats["3rd Qu."]` |
 | Max         | `r summary_stats["Max"]` |
 The summary statistics for the number of trees planted provide insight into the distribution of trees planted by all participants in the tree planting surveys. While the median value gives us a sense of the "typical" number of trees planted, the mean might be skewed by a few participants planting a very large number of trees.
Author	SHA1	Message	Date
Nick Heppler	cfd4ec0113	Refactor report	2025-02-14 14:00:19 -05:00
Nick Heppler	529fe505b6	Add RProject files to ignore.	2025-02-14 08:38:28 -05:00