Are there differences in car crash rates by county?

Are there differences in car crash rates by county?#

import numpy as np
import pandas as pd
import json
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

def plot_choroplethmap(counties, fips, value, value_min, value_max, text, title):
    fig = go.Figure(go.Choroplethmapbox(geojson=counties, locations=fips, z=value,
                                    colorscale="Viridis", zmin=value_min, zmax = value_max,
                                    marker_opacity=0.8, marker_line_width=1, 
                                    text=text))

    fig.update_layout(mapbox_style="carto-positron",
                      mapbox_zoom=4.3, mapbox_center = {"lat": 37.5, "lon": -120})
    fig.update_layout(margin={"r":150,"t":30,"l":150,"b":0}, 
                      title={"text":title})
    fig.show()

# relative path to data & figures directory
from paths import DATA_PATH, FIGURES_PATH

# Organizing the dataset and abtract features we want
county_data_2019 = pd.read_excel(f"{DATA_PATH}/location_2019.xlsm", header=2, sheet_name="Table 8A - Crashes")
county_data_2019 = county_data_2019[county_data_2019["County"].notna()].reset_index()
county_data_2019["Total Crashes"] = county_data_2019.iloc[:, 4:7].sum(axis=1)
county_data_2019 = county_data_2019.iloc[:, [1, 7, 8, 15]].drop(58)

fatality_df = pd.read_excel(f"{DATA_PATH}/location_2019.xlsm", header=3, sheet_name="Table 8I").drop(58)
county_data_2019["Total Fatal"] = fatality_df["Total"]

injury_df = pd.read_excel(f"{DATA_PATH}/location_2019.xlsm", header=3, sheet_name="Table 8K").drop(58)
county_data_2019["Total Injury"] = injury_df["Total"]

alcohol_df = pd.read_excel(f"{DATA_PATH}/location_2019.xlsm", header=4, sheet_name="Table 8L")
county_data_2019["Alcohol Involved Crashes"] = alcohol_df.fillna(0).iloc[0:58:, [9, 10]].sum(axis=1).astype(int)

population_df = pd.read_excel(f"{DATA_PATH}/location_2019.xlsm", header=4, sheet_name="Table 8B").iloc[0:58, 4].astype(int)
county_data_2019["Population"] = population_df

# Display the first 10 rows of the dataset
county_data_2019.head(10)

	County	Alcohol Involved Fatal	Alcohol Involved Injury	Total Crashes	Total Fatal	Total Injury	Alcohol Involved Crashes	Population
0	Alameda	28	655	22768	96	10441	469	1668965
1	Alpine	0	4	77	2	66	1	1123
2	Amador	4	38	521	16	276	29	37724
3	Butte	9	143	2226	34	1262	130	214532
4	Calaveras	4	45	598	11	305	44	44403
5	Colusa	1	21	352	12	205	22	22045
6	Contra Costa	27	492	12195	77	5982	383	1147269
7	Del Norte	1	19	285	11	217	20	27207
8	El Dorado	11	103	1644	29	861	96	188818
9	Fresno	34	305	7848	135	4247	297	1018437

# Load the fips code of each county in California
county_fips = pd.read_excel(f"{DATA_PATH}/county_fips.xlsx")
county_fips.head(10)

	county	fips
0	Alameda	1
1	Alpine	3
2	Amador	5
3	Butte	7
4	Calaveras	9
5	Colusa	11
6	Contra Costa	13
7	Del Norte	15
8	El Dorado	17
9	Fresno	19

# Combine the county fips with the California State fips to create the full fips
cal_fip = "06"
def create_full_fip(county_fip, state_fip):
    if len(county_fip) > 3:
        raise("Invalid county_fip")
    while len(county_fip) < 3:
        county_fip = "0" + county_fip
        
    return state_fip + county_fip

county_fips["fips"] = county_fips["fips"].astype(str)
county_fips["fips"] = county_fips["fips"].apply(lambda fips: create_full_fip(fips, cal_fip))
county_fips = county_fips.rename(columns={"county":"County"})
county_fips.head()

	County	fips
0	Alameda	06001
1	Alpine	06003
2	Amador	06005
3	Butte	06007
4	Calaveras	06009

# Organize the dataset again by rearanging column order
county_data_2019 = pd.merge(county_fips, county_data_2019, on="County")
county_data_2019 = county_data_2019.iloc[:, [0, 1, 8, 4, 5, 6, 7, 2, 3]]
county_data_2019.head(10)

	County	fips	Population	Total Crashes	Total Fatal	Total Injury	Alcohol Involved Crashes	Alcohol Involved Fatal	Alcohol Involved Injury
0	Alameda	06001	1668965	22768	96	10441	469	28	655
1	Alpine	06003	1123	77	2	66	1	0	4
2	Amador	06005	37724	521	16	276	29	4	38
3	Butte	06007	214532	2226	34	1262	130	9	143
4	Calaveras	06009	44403	598	11	305	44	4	45
5	Colusa	06011	22045	352	12	205	22	1	21
6	Contra Costa	06013	1147269	12195	77	5982	383	27	492
7	Del Norte	06015	27207	285	11	217	20	1	19
8	El Dorado	06017	188818	1644	29	861	96	11	103
9	Fresno	06019	1018437	7848	135	4247	297	34	305

# Import json file that contains all geographical shapes of counties in the US
counties = json.load(open(f"{DATA_PATH}/CA-flips.json"))
    
# Select only the countries in California
CA_counties = []
for county in counties["features"]:
    if county["properties"]["STATE"] == "06":
        CA_counties.append(county)

counties["features"] = CA_counties

# Save the json file for California counties for main notebook
with open(f"{DATA_PATH}/CA-flips.json", "w") as file:
    file.write(json.dumps(counties))

# plot collision count by county choroplethmap
plot_choroplethmap(counties=counties, 
                   fips=county_data_2019["fips"], 
                   value=county_data_2019["Total Crashes"].tolist(), 
                   value_min=0, 
                   value_max=20000, 
                   text=county_data_2019["County"].tolist(), 
                   title="Collision Count by County in California")

# Take the log transformation of both variables
x = np.log(county_data_2019["Population"])
y = np.log(county_data_2019["Total Crashes"])

# Plot the scatter plot of log(population) vs log(#incidence per 1000 capita)
plt.scatter(x, y, s=15)
plt.ylabel('total')

# Find the best-fit line
a, b = np.polyfit(x, y, 1)
plt.plot(x, a*x+b, color="red")
plt.title("Relationship between county population and collision count")
plt.xlabel("log(Population)")
plt.ylabel("log(collision count)")
plt.savefig(f"{FIGURES_PATH}/scatter3_1.png")
plt.show()

../_images/58340e2416ad6b0b0587f2d0a9415aea692ee7cb9fb76e9d13b7814843bfe908.png

# plot collision frequency (per 1000 capita) by county choroplethmap
county_data_2019["Crashes per 1000 capita"] = county_data_2019["Total Crashes"].astype(int).divide(county_data_2019["Population"].astype(int))*1000
plot_choroplethmap(counties=counties, 
                   fips=county_data_2019["fips"], 
                   value=county_data_2019["Crashes per 1000 capita"].tolist(), 
                   value_min=5, 
                   value_max=20, 
                   text=county_data_2019["County"].tolist(), 
                   title="Collision Count per 1000 Capita by County in California")

# Take the log transformation of both variables
x = np.log(county_data_2019["Population"])
y = np.log(county_data_2019["Crashes per 1000 capita"])

# Plot the scatter plot of log(population) vs log(#incidence per 1000 capita)
plt.scatter(x, y, s=15)
plt.ylabel('total')

# Find the best-fit line
a, b = np.polyfit(x, y, 1)
plt.plot(x, a*x+b, color="red") 
plt.title("Relationship between county population and collision rate")
plt.xlabel("log(Population)")
plt.ylabel("log(Collision count per 1000 capita)")
plt.savefig(f"{FIGURES_PATH}/scatter3_2.png")

plt.show()

../_images/dd7c7ecb00d835486a466a82912c2c1253e49280786ead83d3831601145245f9.png

# plot fatal rate by county choroplethmap
county_data_2019["Overall Fatal Rate"] = county_data_2019["Total Fatal"].astype(int).divide(county_data_2019["Total Crashes"].astype(int))*100
plot_choroplethmap(counties=counties, 
                   fips=county_data_2019["fips"], 
                   value=county_data_2019["Overall Fatal Rate"].tolist(), 
                   value_min=0, 
                   value_max=3, 
                   text=county_data_2019["County"].tolist(), 
                   title="Collision Fatal Rate by County in California")

import matplotlib.pyplot as plt

# Take the log transformation of population
x = np.log(county_data_2019["Population"])
y = county_data_2019["Overall Fatal Rate"]

# Plot the scatter plot of log(population) vs Fatal Rate
plt.scatter(x, y, s=15)
plt.ylabel('total')

# Find the best-fit line
a, b = np.polyfit(x, y, 1)
plt.plot(x, a*x+b, color="red") 
plt.title("Relationship between county population and collision fatal rate")
plt.xlabel("log(population)")
plt.ylabel("Fatal Rate (Fatal Collision Count/Total Collision Count)")
plt.savefig(f"{FIGURES_PATH}/scatter3_3.png")

plt.show()

../_images/25bcc0f11f362b2bc934ac3d1735e4dd37a70b45231d8cad7c1a4904b54b3777.png

# use barplot to compare the distribution of overall fatal rate and alcohol-involved fatal rate
county_data_2019["Alcohol Involved Fatal Rate"] = county_data_2019["Alcohol Involved Fatal"].divide(county_data_2019["Alcohol Involved Crashes"])*100

sns.boxplot(data=county_data_2019.loc[:, ["Overall Fatal Rate", "Alcohol Involved Fatal Rate"]], showfliers = False).set(
    ylabel='Fatal Rate in %',
    title="Comparison of Overall Fatal Rate and Alcohol-Involved Fatal Rate"
);
plt.savefig(f"{FIGURES_PATH}/boxplot3_4.png")

../_images/eabee3f52c4ec22dea1cddef47ffb181b943d1e404be4e6586069991bb5004a3.png

# Save the dataset for main notebook
county_data_2019.to_csv(f"{DATA_PATH}/county_data_2019.csv")