Are there differences in car crash rates by county?#

import numpy as np
import pandas as pd
import json
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
def plot_choroplethmap(counties, fips, value, value_min, value_max, text, title):
    fig = go.Figure(go.Choroplethmapbox(geojson=counties, locations=fips, z=value,
                                    colorscale="Viridis", zmin=value_min, zmax = value_max,
                                    marker_opacity=0.8, marker_line_width=1, 
                                    text=text))

    fig.update_layout(mapbox_style="carto-positron",
                      mapbox_zoom=4.3, mapbox_center = {"lat": 37.5, "lon": -120})
    fig.update_layout(margin={"r":150,"t":30,"l":150,"b":0}, 
                      title={"text":title})
    fig.show()
# relative path to data & figures directory
from paths import DATA_PATH, FIGURES_PATH
# Organizing the dataset and abtract features we want
county_data_2019 = pd.read_excel(f"{DATA_PATH}/location_2019.xlsm", header=2, sheet_name="Table 8A - Crashes")
county_data_2019 = county_data_2019[county_data_2019["County"].notna()].reset_index()
county_data_2019["Total Crashes"] = county_data_2019.iloc[:, 4:7].sum(axis=1)
county_data_2019 = county_data_2019.iloc[:, [1, 7, 8, 15]].drop(58)

fatality_df = pd.read_excel(f"{DATA_PATH}/location_2019.xlsm", header=3, sheet_name="Table 8I").drop(58)
county_data_2019["Total Fatal"] = fatality_df["Total"]

injury_df = pd.read_excel(f"{DATA_PATH}/location_2019.xlsm", header=3, sheet_name="Table 8K").drop(58)
county_data_2019["Total Injury"] = injury_df["Total"]

alcohol_df = pd.read_excel(f"{DATA_PATH}/location_2019.xlsm", header=4, sheet_name="Table 8L")
county_data_2019["Alcohol Involved Crashes"] = alcohol_df.fillna(0).iloc[0:58:, [9, 10]].sum(axis=1).astype(int)

population_df = pd.read_excel(f"{DATA_PATH}/location_2019.xlsm", header=4, sheet_name="Table 8B").iloc[0:58, 4].astype(int)
county_data_2019["Population"] = population_df
# Display the first 10 rows of the dataset
county_data_2019.head(10)
County Alcohol Involved Fatal Alcohol Involved Injury Total Crashes Total Fatal Total Injury Alcohol Involved Crashes Population
0 Alameda 28 655 22768 96 10441 469 1668965
1 Alpine 0 4 77 2 66 1 1123
2 Amador 4 38 521 16 276 29 37724
3 Butte 9 143 2226 34 1262 130 214532
4 Calaveras 4 45 598 11 305 44 44403
5 Colusa 1 21 352 12 205 22 22045
6 Contra Costa 27 492 12195 77 5982 383 1147269
7 Del Norte 1 19 285 11 217 20 27207
8 El Dorado 11 103 1644 29 861 96 188818
9 Fresno 34 305 7848 135 4247 297 1018437
# Load the fips code of each county in California
county_fips = pd.read_excel(f"{DATA_PATH}/county_fips.xlsx")
county_fips.head(10)
county fips
0 Alameda 1
1 Alpine 3
2 Amador 5
3 Butte 7
4 Calaveras 9
5 Colusa 11
6 Contra Costa 13
7 Del Norte 15
8 El Dorado 17
9 Fresno 19
# Combine the county fips with the California State fips to create the full fips
cal_fip = "06"
def create_full_fip(county_fip, state_fip):
    if len(county_fip) > 3:
        raise("Invalid county_fip")
    while len(county_fip) < 3:
        county_fip = "0" + county_fip
        
    return state_fip + county_fip

county_fips["fips"] = county_fips["fips"].astype(str)
county_fips["fips"] = county_fips["fips"].apply(lambda fips: create_full_fip(fips, cal_fip))
county_fips = county_fips.rename(columns={"county":"County"})
county_fips.head()
County fips
0 Alameda 06001
1 Alpine 06003
2 Amador 06005
3 Butte 06007
4 Calaveras 06009
# Organize the dataset again by rearanging column order
county_data_2019 = pd.merge(county_fips, county_data_2019, on="County")
county_data_2019 = county_data_2019.iloc[:, [0, 1, 8, 4, 5, 6, 7, 2, 3]]
county_data_2019.head(10)
County fips Population Total Crashes Total Fatal Total Injury Alcohol Involved Crashes Alcohol Involved Fatal Alcohol Involved Injury
0 Alameda 06001 1668965 22768 96 10441 469 28 655
1 Alpine 06003 1123 77 2 66 1 0 4
2 Amador 06005 37724 521 16 276 29 4 38
3 Butte 06007 214532 2226 34 1262 130 9 143
4 Calaveras 06009 44403 598 11 305 44 4 45
5 Colusa 06011 22045 352 12 205 22 1 21
6 Contra Costa 06013 1147269 12195 77 5982 383 27 492
7 Del Norte 06015 27207 285 11 217 20 1 19
8 El Dorado 06017 188818 1644 29 861 96 11 103
9 Fresno 06019 1018437 7848 135 4247 297 34 305
# Import json file that contains all geographical shapes of counties in the US
counties = json.load(open(f"{DATA_PATH}/CA-flips.json"))
    
# Select only the countries in California
CA_counties = []
for county in counties["features"]:
    if county["properties"]["STATE"] == "06":
        CA_counties.append(county)

counties["features"] = CA_counties
# Save the json file for California counties for main notebook
with open(f"{DATA_PATH}/CA-flips.json", "w") as file:
    file.write(json.dumps(counties))
# plot collision count by county choroplethmap
plot_choroplethmap(counties=counties, 
                   fips=county_data_2019["fips"], 
                   value=county_data_2019["Total Crashes"].tolist(), 
                   value_min=0, 
                   value_max=20000, 
                   text=county_data_2019["County"].tolist(), 
                   title="Collision Count by County in California")
# Take the log transformation of both variables
x = np.log(county_data_2019["Population"])
y = np.log(county_data_2019["Total Crashes"])

# Plot the scatter plot of log(population) vs log(#incidence per 1000 capita)
plt.scatter(x, y, s=15)
plt.ylabel('total')

# Find the best-fit line
a, b = np.polyfit(x, y, 1)
plt.plot(x, a*x+b, color="red")
plt.title("Relationship between county population and collision count")
plt.xlabel("log(Population)")
plt.ylabel("log(collision count)")
plt.savefig(f"{FIGURES_PATH}/scatter3_1.png")
plt.show()
../_images/58340e2416ad6b0b0587f2d0a9415aea692ee7cb9fb76e9d13b7814843bfe908.png
# plot collision frequency (per 1000 capita) by county choroplethmap
county_data_2019["Crashes per 1000 capita"] = county_data_2019["Total Crashes"].astype(int).divide(county_data_2019["Population"].astype(int))*1000
plot_choroplethmap(counties=counties, 
                   fips=county_data_2019["fips"], 
                   value=county_data_2019["Crashes per 1000 capita"].tolist(), 
                   value_min=5, 
                   value_max=20, 
                   text=county_data_2019["County"].tolist(), 
                   title="Collision Count per 1000 Capita by County in California")
# Take the log transformation of both variables
x = np.log(county_data_2019["Population"])
y = np.log(county_data_2019["Crashes per 1000 capita"])

# Plot the scatter plot of log(population) vs log(#incidence per 1000 capita)
plt.scatter(x, y, s=15)
plt.ylabel('total')

# Find the best-fit line
a, b = np.polyfit(x, y, 1)
plt.plot(x, a*x+b, color="red") 
plt.title("Relationship between county population and collision rate")
plt.xlabel("log(Population)")
plt.ylabel("log(Collision count per 1000 capita)")
plt.savefig(f"{FIGURES_PATH}/scatter3_2.png")

plt.show()
../_images/dd7c7ecb00d835486a466a82912c2c1253e49280786ead83d3831601145245f9.png
# plot fatal rate by county choroplethmap
county_data_2019["Overall Fatal Rate"] = county_data_2019["Total Fatal"].astype(int).divide(county_data_2019["Total Crashes"].astype(int))*100
plot_choroplethmap(counties=counties, 
                   fips=county_data_2019["fips"], 
                   value=county_data_2019["Overall Fatal Rate"].tolist(), 
                   value_min=0, 
                   value_max=3, 
                   text=county_data_2019["County"].tolist(), 
                   title="Collision Fatal Rate by County in California")
import matplotlib.pyplot as plt

# Take the log transformation of population
x = np.log(county_data_2019["Population"])
y = county_data_2019["Overall Fatal Rate"]

# Plot the scatter plot of log(population) vs Fatal Rate
plt.scatter(x, y, s=15)
plt.ylabel('total')

# Find the best-fit line
a, b = np.polyfit(x, y, 1)
plt.plot(x, a*x+b, color="red") 
plt.title("Relationship between county population and collision fatal rate")
plt.xlabel("log(population)")
plt.ylabel("Fatal Rate (Fatal Collision Count/Total Collision Count)")
plt.savefig(f"{FIGURES_PATH}/scatter3_3.png")

plt.show()
../_images/25bcc0f11f362b2bc934ac3d1735e4dd37a70b45231d8cad7c1a4904b54b3777.png
# use barplot to compare the distribution of overall fatal rate and alcohol-involved fatal rate
county_data_2019["Alcohol Involved Fatal Rate"] = county_data_2019["Alcohol Involved Fatal"].divide(county_data_2019["Alcohol Involved Crashes"])*100

sns.boxplot(data=county_data_2019.loc[:, ["Overall Fatal Rate", "Alcohol Involved Fatal Rate"]], showfliers = False).set(
    ylabel='Fatal Rate in %',
    title="Comparison of Overall Fatal Rate and Alcohol-Involved Fatal Rate"
);
plt.savefig(f"{FIGURES_PATH}/boxplot3_4.png")
../_images/eabee3f52c4ec22dea1cddef47ffb181b943d1e404be4e6586069991bb5004a3.png
# Save the dataset for main notebook
county_data_2019.to_csv(f"{DATA_PATH}/county_data_2019.csv")