#install libraries
install.packages(c("tidyverse", "lubridate", "data.table", "leaflet", "nnet", "DT", "ggplot2"))

The downloaded binary packages are in
	/var/folders/r0/14zsy__j0js721wf9chnqr9w0000gn/T//RtmpdQ3nlo/downloaded_packages

#load libraries
library(tidyverse)
library(lubridate)
library(data.table)  # For fast loading if needed
library(leaflet)     # Maps
library(nnet)        # Neural nets
library(DT)          # Interactive tables
library(ggplot2)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Attaching package: ‘data.table’


The following objects are masked from ‘package:lubridate’:

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year


The following objects are masked from ‘package:dplyr’:

    between, first, last


The following object is masked from ‘package:purrr’:

    transpose

#load data
mta_data <- fread("/Users/jhwinniec/Downloads/MTA_Bus_Automated_Camera_Enforcement_Violations__Beginning_October_2019_20250921.csv") 
mta_data <- as_tibble(mta_data)

qc_lat <- 40.736
qc_long <- -73.817
qc_routes <- c("Q44+")

head(mta_data)
summary(mta_data)
dim(mta_data)
names(mta_data)

  Violation ID        Vehicle ID        First Occurrence   Last Occurrence   
 Min.   :215520220   Length:3778568     Length:3778568     Length:3778568    
 1st Qu.:428897558   Class :character   Class :character   Class :character  
 Median :448461566   Mode  :character   Mode  :character   Mode  :character  
 Mean   :431477090                                                           
 3rd Qu.:465921364                                                           
 Max.   :489749182                                                           
 Violation Status   Violation Type     Bus Route ID       Violation Latitude
 Length:3778568     Length:3778568     Length:3778568     Min.   :40.53     
 Class :character   Class :character   Class :character   1st Qu.:40.70     
 Mode  :character   Mode  :character   Mode  :character   Median :40.78     
                                                          Mean   :40.76     
                                                          3rd Qu.:40.83     
                                                          Max.   :40.88     
 Violation Longitude    Stop ID        Stop Name         Bus Stop Latitude
 Min.   :-74.18      Min.   :100017   Length:3778568     Min.   :40.53    
 1st Qu.:-73.95      1st Qu.:104344   Class :character   1st Qu.:40.70    
 Median :-73.93      Median :401610   Mode  :character   Median :40.78    
 Mean   :-73.92      Mean   :341673                      Mean   :40.76    
 3rd Qu.:-73.89      3rd Qu.:404820                      3rd Qu.:40.83    
 Max.   :-73.70      Max.   :984012                      Max.   :40.88    
 Bus Stop Longitude Violation Georeference Bus Stop Georeference
 Min.   :-74.19     Length:3778568         Length:3778568       
 1st Qu.:-73.95     Class :character       Class :character     
 Median :-73.93     Mode  :character       Mode  :character     
 Mean   :-73.92                                                 
 3rd Qu.:-73.89                                                 
 Max.   :-73.70

# Check column types
str(mta_data[, c("Violation Latitude", "Violation Longitude", "Violation Status")])

# Check status values
unique(mta_data$`Violation Status`)

# Sample lat/long values
head(mta_data$`Violation Latitude`, 10)
head(mta_data$`Violation Longitude`, 10)

tibble [3,778,568 × 3] (S3: tbl_df/tbl/data.frame)
 $ Violation Latitude : num [1:3778568] 40.8 40.9 40.7 40.8 40.8 ...
 $ Violation Longitude: num [1:3778568] -73.9 -73.9 -73.9 -73.8 -74 ...
 $ Violation Status   : chr [1:3778568] "TECHNICAL ISSUE/OTHER" "EXEMPT - BUS/PARATRANSIT" "TECHNICAL ISSUE/OTHER" "EXEMPT - OTHER" ...
 - attr(*, ".internal.selfref")=<externalptr>

# Check Bus Route ID values
unique(mta_data$`Bus Route ID`)

# Check exempt status counts
table(mta_data$`Violation Status`)

# Check timestamp parsing
sum(is.na(mdy_hms(mta_data$`First Occurrence`)))
sum(is.na(mdy_hms(mta_data$`Last Occurrence`)))

# Check distance range
summary(sqrt((mta_data$`Violation Latitude` - 40.736)^2 + (mta_data$`Violation Longitude` - (-73.817))^2) * 111)

 DRIVER/VEHICLE INFO MISSING     EXEMPT - BUS/PARATRANSIT 
                      273968                       190192 
EXEMPT - COMMERCIAL UNDER 20   EXEMPT - EMERGENCY VEHICLE 
                      257374                       286253 
              EXEMPT - OTHER        TECHNICAL ISSUE/OTHER 
                      136991                       320912 
            VIOLATION ISSUED 
                     2312878

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.8903 14.0271 16.0810 15.2345 17.8585 44.1360

violations_qc <- mta_data %>%
  mutate(
    first_occurrence = mdy_hms(`First Occurrence`, truncated = 3), #time
    is_exempt = str_detect(`Violation Status`, "(?i)EXEMPT"), #finding to see if it is exempt
    dist_to_campus_km = sqrt((`Violation Latitude` - qc_lat)^2 + (`Violation Longitude` - qc_long)^2) * 111
  ) %>% #finding the distance away from campus
  filter(
    `Bus Route ID` %in% qc_routes,  # Q44+ only
    is_exempt,                      # Exempt vehicles
    dist_to_campus_km <= 5          # Within 5 km
  )

dim(violations_qc)
head(violations_qc)

repeats_qc <- violations_qc %>%
  group_by(`Vehicle ID`) %>% #group by vehicle ID
  summarise(
    violation_count = n(), # compute the number of violations n() for each vehicle
    .groups = "drop" # removes all groups so the repeats_qc is a regular data frame
  ) %>%
  filter(violation_count > 1) %>%
  arrange(desc(violation_count)) %>%
  slice_head(n = 250)

dim(repeats_qc)
head(repeats_qc, 50)

#Estimate delays (1.5 min/violation) and student impact (45 students/bus) for the top 250 repeat offenders.

# Calculate delays
repeats_with_delay <- repeats_qc %>%
  mutate(
    total_delay_min = violation_count * 1.5,  # 1.5 min per violation
    students_affected = total_delay_min * 45   # 45 students per bus
  )

# Summarize
total_delay_hours <- sum(repeats_with_delay$total_delay_min) / 60
total_student_hours <- sum(repeats_with_delay$students_affected) / 60
cat("Total delay from top 250 repeats:", round(total_delay_hours, 1), "hours\n")
cat("Total student-hours lost:", round(total_student_hours, 1), "\n")

# Display table
DT::datatable(repeats_with_delay, options = list(pageLength = 10))

Total delay from top 250 repeats: 279 hours
Total student-hours lost: 12552.8

install.packages(c("leaflet.extras", "geosphere"))
library(leaflet.extras)
library(geosphere)

The downloaded binary packages are in
	/var/folders/r0/14zsy__j0js721wf9chnqr9w0000gn/T//RtmpdQ3nlo/downloaded_packages

library(leaflet)
library(leaflet.extras)

mock_ai_lat <- 40.737827 
mock_ai_long <- -73.825019 

map <- leaflet(data = violations_qc) %>%
  addTiles() %>%
  addHeatmap(
    lng = ~`Violation Longitude`, 
    lat = ~`Violation Latitude`, 
    blur = 20,
    max = 0.05,
    radius = 15
  ) %>%
  addMarkers(
    lng = qc_long, 
    lat = qc_lat, 
    popup = "Queens College",
    label = "Queens College"
  ) %>%
  addMarkers(
    lng = mock_ai_long, 
    lat = mock_ai_lat, 
    popup = "Mock AI Camera Location",
    label = "Mock AI Camera"
  ) %>%
  setView(lng = qc_long, lat = qc_lat, zoom = 14)

# Save to HTML
library(htmlwidgets)
saveWidget(map, "bus_lane_map.html", selfcontained = TRUE)

library(plotly)

# Calculate violation type counts
violation_counts <- violations_qc %>%
  count(`Violation Type`) %>%
  mutate(label = paste(`Violation Type`, ": ", n))

# Bar chart with counts
plot_ly(data = violation_counts, x = ~`Violation Type`, y = ~n, type = "bar", 
        marker = list(color = "steelblue"),
        text = ~n, textposition = "auto") %>%
  layout(title = "Distribution of Violation Types Near Queens College",
         xaxis = list(title = "Violation Type"), yaxis = list(title = "Count"))
plotly::export(p = last_plot(), file = "violation_types_bar.png")

Attaching package: ‘plotly’


The following object is masked from ‘package:ggplot2’:

    last_plot


The following object is masked from ‘package:stats’:

    filter


The following object is masked from ‘package:graphics’:

    layout

Warning message:
“'plotly::export' is deprecated.
Use 'orca' instead.
See help("Deprecated")”

Error: Package `webshot` required for `export`.
Please install and try again.
Traceback:

1. plotly::export(p = last_plot(), file = "violation_types_bar.png")
2. try_library("webshot", "export")
3. stop("Package `", pkg, "` required", if (!is.null(fun)) paste0(" for `", 
 .     fun, "`"), ".\n", "Please install and try again.", call. = FALSE)

# 2. Histogram: Violation Counts for Repeat Offenders
ggplot(repeats_qc, aes(x = violation_count)) +
  geom_histogram(binwidth = 5, fill = "orange") +
  labs(title = "Histogram of Violation Counts for Top 250 Repeat Offenders", x = "Violation Count", y = "Frequency") +
  theme_minimal()

# 3. Time Series: Violations Over Time (per day)
violations_qc %>%
  mutate(date = date(first_occurrence)) %>%
  group_by(date) %>%
  summarise(daily_violations = n(), .groups = "drop") %>%
  ggplot(aes(x = date, y = daily_violations)) +
  geom_line(color = "purple") +
  labs(title = "Time Series of Violations Over Time Near Queens College", x = "Date", y = "Daily Violations") +
  theme_minimal()

library(plotly)

# Density data
density_data <- violations_qc %>%
  left_join(repeats_qc %>% select(`Vehicle ID`, violation_count), by = "Vehicle ID") %>%
  mutate(is_repeat = ifelse(violation_count > 1, "Repeat", "Single"))

plot_ly(data = density_data, x = ~dist_to_campus_km, type = "histogram", histnorm = "probability density", 
        color = ~is_repeat, opacity = 0.5, bingroup = 1) %>%
  layout(title = "Density of Distance to Campus by Repeat Offender Status",
         xaxis = list(title = "Distance to Campus (km)"), yaxis = list(title = "Density"),
         barmode = "overlay")
plotly::export(p = last_plot(), file = "distance_density.png")

Warning message in RColorBrewer::brewer.pal(N, "Set2"):
“minimal value for n is 3, returning requested palette with 3 different levels
”
Warning message in RColorBrewer::brewer.pal(N, "Set2"):
“minimal value for n is 3, returning requested palette with 3 different levels
”
Warning message in RColorBrewer::brewer.pal(N, "Set2"):
“minimal value for n is 3, returning requested palette with 3 different levels
”
Warning message in RColorBrewer::brewer.pal(N, "Set2"):
“minimal value for n is 3, returning requested palette with 3 different levels
”

Warning message:
“'plotly::export' is deprecated.
Use 'orca' instead.
See help("Deprecated")”
Warning message in RColorBrewer::brewer.pal(N, "Set2"):
“minimal value for n is 3, returning requested palette with 3 different levels
”
Warning message in RColorBrewer::brewer.pal(N, "Set2"):
“minimal value for n is 3, returning requested palette with 3 different levels
”
Warning message in RColorBrewer::brewer.pal(N, "Set2"):
“minimal value for n is 3, returning requested palette with 3 different levels
”
Warning message in RColorBrewer::brewer.pal(N, "Set2"):
“minimal value for n is 3, returning requested palette with 3 different levels
”
Warning message in RColorBrewer::brewer.pal(N, "Set2"):
“minimal value for n is 3, returning requested palette with 3 different levels
”
Warning message in RColorBrewer::brewer.pal(N, "Set2"):
“minimal value for n is 3, returning requested palette with 3 different levels
”

Error: Package `webshot` required for `export`.
Please install and try again.
Traceback:

1. plotly::export(p = last_plot(), file = "distance_density.png")
2. try_library("webshot", "export")
3. stop("Package `", pkg, "` required", if (!is.null(fun)) paste0(" for `", 
 .     fun, "`"), ".\n", "Please install and try again.", call. = FALSE)

violations_qc %>%
  mutate(month = floor_date(as.Date(`First Occurrence`, format = "%m/%d/%Y"), "month")) %>%
  group_by(month) %>%
  summarise(avg_dist_km = mean(dist_to_campus_km, na.rm = TRUE)) %>%
  ggplot(aes(x = month, y = avg_dist_km)) +
  geom_line() +
  labs(title = "Average Distance to Campus Over Time", x = "Month", y = "Avg Distance (km)")

# Create a dataset with vehicle violation counts and other features
regression_data <- violations_qc %>%
  group_by(`Vehicle ID`) %>%
  summarise(
    violation_count = n(),
    avg_distance = mean(dist_to_campus_km),
    first_violation = min(first_occurrence),
    last_violation = max(first_occurrence),
    violation_types = n_distinct(`Violation Type`),
    stop_count = n_distinct(`Stop ID`),
    .groups = "drop"
  ) %>%
  # Add time-based features
  mutate(
    activity_duration = as.numeric(difftime(last_violation, first_violation, units = "days")),
    is_repeat_offender = violation_count > 1
  ) %>%
  # Join with original data to get additional features
  left_join(
    violations_qc %>% 
      select(`Vehicle ID`, `Violation Status`, `Violation Type`, dist_to_campus_km) %>%
      distinct(`Vehicle ID`, .keep_all = TRUE),
    by = "Vehicle ID"
  )

# Split data into training and testing sets
set.seed(123) # For reproducibility
train_indices <- sample(1:nrow(regression_data), size = 0.7 * nrow(regression_data))
train <- regression_data[train_indices, ]
test <- regression_data[-train_indices, ]

# Model 1: Basic model with distance and violation type
repeat_model1 <- lm(violation_count ~ avg_distance + violation_types, data = train)
summary(repeat_model1)

Call:
lm(formula = violation_count ~ avg_distance + violation_types, 
    data = train)

Residuals:
   Min     1Q Median     3Q    Max 
-14.59  -1.50  -0.96   0.00 542.33 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -7.0642     0.6430 -10.986  < 2e-16 ***
avg_distance      0.5022     0.1588   3.162  0.00157 ** 
violation_types   7.5195     0.3214  23.399  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.15 on 6567 degrees of freedom
Multiple R-squared:  0.07997,	Adjusted R-squared:  0.07968 
F-statistic: 285.4 on 2 and 6567 DF,  p-value: < 2.2e-16

# Model 2: Add activity duration
repeat_model2 <- lm(violation_count ~ avg_distance + violation_types + activity_duration, data = train)
summary(repeat_model2)

Call:
lm(formula = violation_count ~ avg_distance + violation_types + 
    activity_duration, data = train)

Residuals:
   Min     1Q Median     3Q    Max 
-16.39  -0.82  -0.60  -0.07 541.92 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)       -2.6590498  0.7068447  -3.762  0.00017 ***
avg_distance       0.2715476  0.1573451   1.726  0.08443 .  
violation_types    3.3358454  0.4345641   7.676 1.88e-14 ***
activity_duration  0.0128234  0.0009122  14.057  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 11.98 on 6566 degrees of freedom
Multiple R-squared:  0.1068,	Adjusted R-squared:  0.1064 
F-statistic: 261.8 on 3 and 6566 DF,  p-value: < 2.2e-16

# Model 3: Add stop count
repeat_model3 <- lm(violation_count ~ avg_distance + violation_types + activity_duration + stop_count, data = train)
summary(repeat_model3)

Call:
lm(formula = violation_count ~ avg_distance + violation_types + 
    activity_duration + stop_count, data = train)

Residuals:
   Min     1Q Median     3Q    Max 
-33.07  -0.38  -0.18  -0.11 520.64 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)        0.8604348  0.6679098   1.288    0.198    
avg_distance      -0.0846454  0.1470299  -0.576    0.565    
violation_types   -2.5012190  0.4450080  -5.621 1.98e-08 ***
activity_duration  0.0040909  0.0008937   4.578 4.79e-06 ***
stop_count         3.1039736  0.0982029  31.608  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 11.16 on 6565 degrees of freedom
Multiple R-squared:  0.2248,	Adjusted R-squared:  0.2243 
F-statistic:   476 on 4 and 6565 DF,  p-value: < 2.2e-16

# Make predictions on test set
repeat_model1_prediction <- predict(repeat_model1, newdata = test)
repeat_model2_prediction <- predict(repeat_model2, newdata = test)
repeat_model3_prediction <- predict(repeat_model3, newdata = test)

# Calculate RMSE for each model
repeat_model1_error <- test$violation_count - repeat_model1_prediction
repeat_model2_error <- test$violation_count - repeat_model2_prediction
repeat_model3_error <- test$violation_count - repeat_model3_prediction

cat("Model 1 RMSE:", sqrt(mean(repeat_model1_error^2)), "\n")
cat("Model 2 RMSE:", sqrt(mean(repeat_model2_error^2)), "\n")
cat("Model 3 RMSE:", sqrt(mean(repeat_model3_error^2)), "\n")

Model 1 RMSE: 10.54162 
Model 2 RMSE: 10.22466 
Model 3 RMSE: 9.500314

# Interpret the best model
cat("\n=== Most Important Factors for Repeat Offenders ===\n")
best_model <- repeat_model3  # Assuming model 4 is best based on RMSE
summary(best_model)

=== Most Important Factors for Repeat Offenders ===

Call:
lm(formula = violation_count ~ avg_distance + violation_types + 
    activity_duration + stop_count, data = train)

Residuals:
   Min     1Q Median     3Q    Max 
-33.07  -0.38  -0.18  -0.11 520.64 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)        0.8604348  0.6679098   1.288    0.198    
avg_distance      -0.0846454  0.1470299  -0.576    0.565    
violation_types   -2.5012190  0.4450080  -5.621 1.98e-08 ***
activity_duration  0.0040909  0.0008937   4.578 4.79e-06 ***
stop_count         3.1039736  0.0982029  31.608  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 11.16 on 6565 degrees of freedom
Multiple R-squared:  0.2248,	Adjusted R-squared:  0.2243 
F-statistic:   476 on 4 and 6565 DF,  p-value: < 2.2e-16

# Extract and interpret coefficients
coefficients_df <- as.data.frame(summary(best_model)$coefficients)
coefficients_df$Variable <- rownames(coefficients_df)
coefficients_df <- coefficients_df %>%
  arrange(desc(abs(Estimate))) %>%
  select(Variable, Estimate, `Std. Error`, `t value`, `Pr(>|t|)`)

print(coefficients_df)

                           Variable    Estimate   Std. Error    t value
stop_count               stop_count  3.10397365 0.0982028933 31.6077617
violation_types     violation_types -2.50121902 0.4450079735 -5.6206162
(Intercept)             (Intercept)  0.86043484 0.6679098395  1.2882500
avg_distance           avg_distance -0.08464537 0.1470299177 -0.5757016
activity_duration activity_duration  0.00409090 0.0008936921  4.5775270
                       Pr(>|t|)
stop_count        3.119302e-204
violation_types    1.980931e-08
(Intercept)        1.977044e-01
avg_distance       5.648366e-01
activity_duration  4.791708e-06

# Create a visualization of important factors
library(ggplot2)

# Plot coefficient magnitudes
ggplot(coefficients_df %>% filter(Variable != "(Intercept)"), 
       aes(x = reorder(Variable, abs(Estimate)), y = abs(Estimate), fill = Estimate > 0)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Factor Importance for Predicting Repeat Offenses",
       x = "Variable",
       y = "Coefficient Magnitude (Absolute Value)") +
  theme_minimal()

Violation ID	Vehicle ID	First Occurrence	Last Occurrence	Violation Status	Violation Type	Bus Route ID	Violation Latitude	Violation Longitude	Stop ID	Stop Name	Bus Stop Latitude	Bus Stop Longitude	Violation Georeference	Bus Stop Georeference
<int>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<dbl>	<dbl>	<int>	<chr>	<dbl>	<dbl>	<chr>	<chr>
489749182	c5ae1411153b52556a1e648cc80d718aa519a4bdd189abdbd715dc9058027231	08/20/2025 11:12:08 PM	08/21/2025 12:24:08 AM	TECHNICAL ISSUE/OTHER	MOBILE BUS STOP	BX36	40.84051	-73.88119	102498	EAST TREMONT AV/VYSE AV	40.84108	-73.88248	POINT (-73.881189 40.840509)	POINT (-73.882483 40.841076)
489744714	df9044acf85cf55488aea4cd3ce1d0e17ef050551726b632252f1b3dbc871f5e	08/20/2025 11:48:59 PM	08/20/2025 11:54:47 PM	EXEMPT - BUS/PARATRANSIT	MOBILE BUS STOP	BX28	40.87402	-73.89065	100080	PAUL AV/BEDFORD PARK BLVD	40.87463	-73.89154	POINT (-73.890646 40.874017)	POINT (-73.891539 40.874629)
489743631	eb5a337966ba65f66ab1db8e169d2446a4fb429b0efc6374052434be832c610d	08/20/2025 10:33:13 PM	08/20/2025 11:56:02 PM	TECHNICAL ISSUE/OTHER	MOBILE DOUBLE PARKED	Q53+	40.72197	-73.86714	550473	WOODHAVEN BLVD/PENELOPE AV	40.72249	-73.86774	POINT (-73.867136 40.721971)	POINT (-73.867736 40.722487)
489741945	3f877f70d9b253515a945be807c9c62d5814949f810310f6fe3f8bbe33a39104	08/20/2025 10:50:45 PM	08/20/2025 11:32:43 PM	EXEMPT - OTHER	MOBILE BUS STOP	Q44+	40.76253	-73.83173	501140	UNION ST/35 AV	40.76542	-73.82794	POINT (-73.831728 40.762529)	POINT (-73.827944 40.765422)
489741940	7feac037b62d591ffb1214e356157f3dd197fc22fee5bbde47b1b65af6f0d4fd	08/20/2025 10:52:57 AM	08/20/2025 11:16:57 AM	EXEMPT - EMERGENCY VEHICLE	MOBILE BUS STOP	M101	40.81511	-73.95504	401458	AMSTERDAM AV/W 131 ST	40.81601	-73.95442	POINT (-73.95504 40.815113)	POINT (-73.954424 40.816009)
489741935	e9122b4e6dac9160ed4fca952a3815fe4c737bfc70cb97e4120a5dbe9a25967f	08/20/2025 11:35:29 PM	08/20/2025 11:39:32 PM	EXEMPT - EMERGENCY VEHICLE	MOBILE BUS STOP	M101	40.79563	-73.94168	402704	3 AV/E 109 ST	40.79377	-73.94300	POINT (-73.941683 40.79563)	POINT (-73.943005 40.793765)

Violation ID	Vehicle ID	First Occurrence	Last Occurrence	Violation Status	Violation Type	Bus Route ID	Violation Latitude	Violation Longitude	Stop ID	Stop Name	Bus Stop Latitude	Bus Stop Longitude	Violation Georeference	Bus Stop Georeference	first_occurrence	is_exempt	dist_to_campus_km
<int>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<dbl>	<dbl>	<int>	<chr>	<dbl>	<dbl>	<chr>	<chr>	<dttm>	<lgl>	<dbl>
489741945	3f877f70d9b253515a945be807c9c62d5814949f810310f6fe3f8bbe33a39104	08/20/2025 10:50:45 PM	08/20/2025 11:32:43 PM	EXEMPT - OTHER	MOBILE BUS STOP	Q44+	40.76253	-73.83173	501140	UNION ST/35 AV	40.76542	-73.82794	POINT (-73.831728 40.762529)	POINT (-73.827944 40.765422)	2025-08-20 22:50:45	TRUE	3.368081
489741818	9efab913d2329aae1a294cc0316679b86dcdccc7e7294f3ec6b96d79db49850f	08/20/2025 11:15:31 PM	08/20/2025 11:19:56 PM	EXEMPT - BUS/PARATRANSIT	MOBILE BUS STOP	Q44+	40.70529	-73.80949	502137	SUTPHIN BLVD/HILLSIDE AV	40.70538	-73.80959	POINT (-73.809492 40.705292)	POINT (-73.809594 40.705382)	2025-08-20 23:15:31	TRUE	3.508990
489740582	e9f61fd7d4d8df0a7d76bf2ac1b8c8b0d808cc51376310d77627c6f28f474911	08/20/2025 10:12:48 PM	08/20/2025 10:19:47 PM	EXEMPT - BUS/PARATRANSIT	MOBILE BUS STOP	Q44+	40.70259	-73.79940	503964	MERRICK BLVD/ARCHER AV	40.70491	-73.79331	POINT (-73.799399 40.702592)	POINT (-73.793306 40.704907)	2025-08-20 22:12:48	TRUE	4.191466
489740523	b6e7951e7990abd20c90f8de0b99d0f402d064c8880ad45ea48ef3a56f683669	08/20/2025 10:00:53 PM	08/20/2025 10:06:10 PM	EXEMPT - BUS/PARATRANSIT	MOBILE BUS STOP	Q44+	40.70527	-73.80951	502137	SUTPHIN BLVD/HILLSIDE AV	40.70538	-73.80959	POINT (-73.80951 40.705266)	POINT (-73.809594 40.705382)	2025-08-20 22:00:53	TRUE	3.511319
489739375	3f877f70d9b253515a945be807c9c62d5814949f810310f6fe3f8bbe33a39104	08/20/2025 08:04:45 PM	08/20/2025 08:22:51 PM	EXEMPT - OTHER	MOBILE BUS STOP	Q44+	40.76253	-73.83172	501140	UNION ST/35 AV	40.76542	-73.82794	POINT (-73.831717 40.762534)	POINT (-73.827944 40.765422)	2025-08-20 20:04:45	TRUE	3.367973
489738933	e9f61fd7d4d8df0a7d76bf2ac1b8c8b0d808cc51376310d77627c6f28f474911	08/20/2025 07:48:43 PM	08/20/2025 08:32:30 PM	EXEMPT - BUS/PARATRANSIT	MOBILE BUS STOP	Q44+	40.70270	-73.79886	500552	MERRICK BLVD/218 ST	40.67950	-73.75116	POINT (-73.798864 40.7027)	POINT (-73.751159 40.679505)	2025-08-20 19:48:43	TRUE	4.208942

Vehicle ID	violation_count
<chr>	<int>
729afe2bc01420ab8c66a36692cfc829ea5a0f829b17c705e85beb53caf45423	552
d3394e8be16cf7189dccc9b3621153fcffb272574bc3079a8a3aa63839f249e0	499
80228a16bca871e024130c63ddf5552024720ac5f9b583df1c7b35bfdb52d630	444
1ff22289fd4c31a9404d9470c08cddf9bec229181bf1054605ed0afafaaa7605	247
b274337d1e7fcd20a3dd7a23f4c43553b2234a9c4935028d947e368e8156a83d	232
58428b9d48d2ac0c9c2213135b018758add2fbc612d48c082dff2624cddbfd16	196
a0d302f41d928ef753f174c829474b0a90c3ae6d76ceb2bbce002370b7f7b264	167
c61d7f2b095752319ea064c9fdaa4013066a46cc6ad274376d01d100ba65dfd0	138
1ef67d49a3ae37531196dd43a654bd3ccbc14ba05c9847ada319c21d8b0078ef	130
7eb12b78f11bc97acf8ce0fa70650184e2a34f23e5eab879ebbef0eada899757	128
fcebef166d0d27d7af8d0ed51a9dc28b0ae5ae4c02a12508b2da74a3434166c0	125
3c66ad50a027b798cb4f3c199c8b50e8294ee2f9405161a8c275b2df4f8d515a	108
1d4072d3c03d5741fdfe8ab5a031a5151f2834dfbdf4d71a39776534fae7afe3	106
3f877f70d9b253515a945be807c9c62d5814949f810310f6fe3f8bbe33a39104	105
51bf6a492c89eced90e9664cd5f440b3e6cd57ae5409a5d875f6a0af041ceadb	101
e0bfcfa239ec3fa0a85f07efc0b55732572c7097e77440c3688df52f1aea4fa1	101
d77aedb500177384438bd4025334ae5ce637cd48e3eea7a8a167b19d5e867507	98
37bfd59a56e24a86baee38446c5125414aacbd4cc33482712fd1f04df7de1f0e	84
68811b888539bf4b911cb3de0a2e84bc9ec26fae74f9f5ed4557c731e4ef43b8	83
050b16d1a9fd29b7da492be0800620a1f5f500a7f871a6be1f59c7727b38941e	82
293c511ae6a173bc6545ef7e304b2ceeb1ed034dbfa89441d8b5bc9d995a1438	78
288ec97b8554e213ec32f7f9b14f81027ceb2e98c49de4f92b7099f467e0a274	75
4fb920e4d0ceff3e5665e309ea41bc845137b674f9de16e11c7263a2e368675b	75
10de3ff906a4ac43145e78ea516dd318731643ff2644bcec6d9646c7748d0468	72
e020820037319aaca4e4ed8d3c29e907f203d722ea46293949ae6e1c9b3ca446	72
e622eae5783223d64f6142c05dbc6af93e67ee488589422d1f532ec821855e79	72
883d92b57a8d209aa5bc83f563a9a658c76db1d1f67e4f463b22702312ee52f2	71
800f52a98f35c40bb1384f0154e44d447b561f0d1da0552376d955cb73eea63d	70
38a80ae4e3559673e8645d2a51d64d26d9e0f0e9c334b5c51dc651c2fb187a7b	69
d60e8afa5ebe3e9ba18700d8a656d864fd0342d2eafee26d30d0b46f474106ef	68
0aa238d1b03e67c9b37b79d34e5a239a44e6668daac6e062d3b6cc70dc987762	67
6153161619a06155dd2b0b9e0f42ac3265d65eb2fd1e3b87f1fef9c347952e7e	67
0f80c19299bbfe391d80051e220432b98dae22078a8031a5f64f1c132ea529e9	64
dfaecbe7ed788f771a1ee8f32344f660801b839911dbca164d174759b7ebbf5f	61
26d2ba9350b185f807be38c20d050cb5e67e88d231148c96c5bded4721b56fdf	60
c4d60bb6d53366f62f12c54670b813452b4f73c90221d6a537c8412f3783e89b	58
11819a6b9cdfe432764940fba65c7c833ea154a85f5b91571a93d1d9dd394e21	57
8d2d022d2ddb4cffe800131bfeff86258de50524a9d68ed4110ea6b09ace9e2a	57
446248f385cb17f236107f4a6528d56b6b147140a7a4a71b7459aa81a96cc914	56
c02faad25804e77e9d3b623ce7315fa75db6d07b9d48d5176d572bbbd5ea72d1	56
ceb64885b26b987de830a7f76b48ece9ae2acca01ab3091ab7f20e7ddb5751e7	56
628442d1ea38d201b49b99bd25d91909ede7bcc75548e135b30f311da15893b0	55
e71fb98053b33b72590cb6cd5aaf2a850d047fc61d5581702bdc26635fcbee14	55
19f773c8af4bea22b46c12768bf513a6bee17de0cccf068b1e35be57d8ab8f8e	54
78faa93a02ed9724c64a76b28a5c2a5ef7c361eda0edf8dc64f52311f1f278fc	54
e9f61fd7d4d8df0a7d76bf2ac1b8c8b0d808cc51376310d77627c6f28f474911	53
55b73572002bcfb2de7bea93a1761bd01d65b49069d3d64f52c6b959847fec3a	52
999a4044be40f3b4844875a99fab21f1b76d4e219c295ab25bcda463b09d3692	52
3bef07147dbb7b1699e9b95c26e01393591a73b72b0d9d490bc4c68c0b15d810	51
7004ca4b89c88c3ccb204a485ce23c35bd69d32e9cfd13d335d68b4ac44f2e02	51

Queens College Bus Violations Analysis¶

Data Preprocessing & Filtering Violations¶

Identifying top 250 repeat offenders¶

Calculate delays and student impact¶

Heatmap for violation hotspots¶

Graphs/plots for data visualizations¶

Advanced Models - Regression to find to see which factors appear to be most important for predicting repeat offender likelihood¶