6.5 Conclusion
This chapter walked through a complete data cleaning pipeline — from raw event-level data to a polished employee-level data frame. Along the way, we:
- Filtered the data to focus on employees with sufficient service history
- Aggregated 740 absence events into per-employee summaries
- Recoded numeric variables into readable categorical labels
- Created new derived variables (
College,High absenteeism) - Converted character columns to factors for modeling
The cleaned dataset is now saved and ready for use. In the next chapters, we will visualize these employee-level patterns (Chapters 7–8) and build models to predict high absenteeism (Chapters 9–10).
6.5.1 Complete Code Listing
Below is the complete R code used in this chapter as a single script. You can copy this code into a Quarto document (.qmd file), render it, and reproduce all of the data cleaning from this chapter.
Complete Code
# -------------------------------------------
# 1. Load packages and data
# -------------------------------------------
if(!require("pacman")) install.packages("pacman")
pacman::p_load("tidyverse", "psych", "lubridate")
# Load the raw dataset
absenteeism <- read_delim(
"https://ljkelly3141.github.io/datasets/bi-book/Absenteeism_at_work.csv",
delim = ";"
)
# -------------------------------------------
# 2. Create a custom MODE function
# -------------------------------------------
# Returns the most frequently occurring value
MODE <- function(x) {
names(sort(-table(x)))[1] |> as.numeric()
}
# -------------------------------------------
# 3. Filter by service time
# -------------------------------------------
# Keep only employees with 6+ months of service
Absenteeism.by.employee <- absenteeism |>
filter(`Service time` >= 6)
# -------------------------------------------
# 4. Aggregate data by employee
# -------------------------------------------
# Compute per-employee summaries within groups
Absenteeism.by.employee <- Absenteeism.by.employee |>
group_by(ID) |>
mutate(`Most common reason for absence` = MODE(`Reason for absence`),
`Month of absence` = MODE(`Month of absence`),
`Day of the week` = MODE(`Day of the week`),
Education = max(Education),
`Absenteeism time in hours` =
sum(`Absenteeism time in hours`)/`Service time`,
`Number of absence` = n()/`Service time`)
# -------------------------------------------
# 5. Collapse to one row per employee
# -------------------------------------------
Absenteeism.by.employee <- Absenteeism.by.employee |>
group_by(ID) |>
summarise(across(everything(), mean))
# -------------------------------------------
# 6. Convert day/month to readable labels
# -------------------------------------------
Absenteeism.by.employee <- Absenteeism.by.employee |>
mutate(
`Day of the week` =
wday(`Day of the week` + 1, label = TRUE) |> as.character(),
`Month of absence` =
month(
ymd(010101) + months(`Month of absence` - 1),
label = TRUE, abbr = TRUE
) |> as.character()
)
# -------------------------------------------
# 7. Select key variables
# -------------------------------------------
Absenteeism.by.employee <- Absenteeism.by.employee |>
select(ID, `Number of absence`, `Absenteeism time in hours`,
`Most common reason for absence`, `Month of absence`,
`Day of the week`, `Body mass index`, Age,
`Social smoker`, `Social drinker`, Son, Pet, Education)
# -------------------------------------------
# 8. Recode binary/count variables
# -------------------------------------------
Absenteeism.by.employee <- Absenteeism.by.employee |>
mutate(`Social smoker` = ifelse(`Social smoker` == 0,
"Non-smoker", "Smoker"),
`Social drinker` = ifelse(`Social drinker` == 0,
"Non-drinker", "Social drinker"),
Children = ifelse(Son == 0, "Non-parent", "Parent"),
Pet = ifelse(Pet == 0, "No Pet(s)", "Pet(s)")) |>
select(-Son, -ID)
# -------------------------------------------
# 9. Simplify education levels
# -------------------------------------------
Absenteeism.by.employee <- Absenteeism.by.employee |>
mutate(
College = ifelse(Education >= 2, "college", "high school"),
Education = factor(Education) |>
fct_recode("high school" = "1", "graduate" = "2",
"postgraduate" = "3", "master and doctor" = "4")
)
# -------------------------------------------
# 10. Categorize absenteeism (high vs. low)
# -------------------------------------------
Absenteeism.by.employee <- Absenteeism.by.employee |>
mutate(`High absenteeism` =
ifelse(`Absenteeism time in hours` >= median(`Absenteeism time in hours`),
"High Absenteeism", "Low Absenteeism"))
# -------------------------------------------
# 11. Convert characters to factors
# -------------------------------------------
Absenteeism.by.employee <- Absenteeism.by.employee |>
mutate(across(where(is_character), as_factor))
# -------------------------------------------
# 12. Save the cleaned data
# -------------------------------------------
write.csv(Absenteeism.by.employee,
file = "data/Absenteeism_by_employee_1.csv",
row.names = FALSE)