library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.4     ✔ readr     2.1.5
#> ✔ forcats   1.0.0     ✔ stringr   1.5.1
#> ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
#> ✔ purrr     1.0.2     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
#> here() starts at /Users/libaijie/ma415-lab0/ma4615-sp25-final-project-team-9
# Reserved codes that should be treated as missing
reserved_codes <- c(-3, -4, -5, -6, -9, -12, -13)

# Load datasets using here::here()
harassment_data <- read_csv(here("dataset", "Harassment and Bullying.csv"), show_col_types = FALSE)
enrollment_data <- read_csv(here("dataset", "Enrollment.csv"), show_col_types = FALSE)
support_data <- read_csv(here("dataset", "School Support.csv"), show_col_types = FALSE)

# Variables to extract
harassment_vars <- c(
  "SCH_HBALLEGATIONS_SEX", "SCH_HBALLEGATIONS_RAC", "SCH_HBALLEGATIONS_DIS", "SCH_HBALLEGATIONS_REL",
  "SCH_HBREPORTED_RAC_HI_M", "SCH_HBREPORTED_RAC_HI_F",
  "SCH_HBREPORTED_RAC_AM_M", "SCH_HBREPORTED_RAC_AM_F",
  "SCH_HBREPORTED_RAC_AS_M", "SCH_HBREPORTED_RAC_AS_F",
  "SCH_HBREPORTED_RAC_HP_M", "SCH_HBREPORTED_RAC_HP_F",
  "SCH_HBREPORTED_RAC_BL_M", "SCH_HBREPORTED_RAC_BL_F",
  "SCH_HBREPORTED_RAC_WH_M", "SCH_HBREPORTED_RAC_WH_F",
  "SCH_HBREPORTED_RAC_TR_M", "SCH_HBREPORTED_RAC_TR_F"
)

enrollment_vars <- c(
  "SCH_ENR_HI_M", "SCH_ENR_HI_F", "SCH_ENR_AM_M", "SCH_ENR_AM_F",
  "SCH_ENR_AS_M", "SCH_ENR_AS_F", "SCH_ENR_HP_M", "SCH_ENR_HP_F",
  "SCH_ENR_BL_M", "SCH_ENR_BL_F", "SCH_ENR_WH_M", "SCH_ENR_WH_F",
  "SCH_ENR_TR_M", "SCH_ENR_TR_F"
)

support_vars <- c("SCH_FTECOUNSELORS", "SCH_FTESERVICES_PSY", "SCH_FTESECURITY_GUA")

# Clean harassment data
harassment_clean <- harassment_data %>%
  select(all_of(harassment_vars)) %>%
  rename_with(~ str_remove(., "^SCH_")) %>%
  mutate(across(everything(), ~ ifelse(.x %in% reserved_codes, NA, .x)))

# Clean enrollment data
enrollment_clean <- enrollment_data %>%
  select(all_of(enrollment_vars)) %>%
  rename_with(~ str_remove(., "^SCH_")) %>%
  mutate(across(everything(), ~ ifelse(.x %in% reserved_codes, NA, .x)))

# Clean support data
support_clean <- support_data %>%
  select(all_of(support_vars)) %>%
  rename_with(~ str_remove(., "^SCH_")) %>%
  mutate(across(everything(), ~ ifelse(.x %in% reserved_codes, NA, .x)))

# Merge datasets together (row-wise binding by row order)
final_clean_data <- bind_cols(harassment_clean, enrollment_clean, support_clean)

# Save the cleaned dataset
write_rds(final_clean_data, here("dataset", "final_clean_data.rds"))
write_csv(final_clean_data, here("dataset", "final_clean_data.csv"))