Predicting Mental Health Performance
We build regression models that predict the mental health `performance` score from the other features in the dataset.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import roc_curve, roc_auc_score
# Load the cleaned panel-study export and preview its first rows.
df = pd.read_csv(
    '../data/CitieSHealth_BCN_DATA_PanelStudy_20220414_Clean.csv'
)
df.head()
| | no2 | pm25 | black carbon | performance | occurrence_mental | wellbeing | stress | sleep quality | gender | diet | alcohol | drug_use | education | access_greenbluespaces | smoke | age |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 28.54 | 8.16 | 0.50 | 51.24 | 2 | 8 | 5 | 8 | Female | Yes | No | No | university | Yes | No | 37 |
| 1 | 44.51 | 13.38 | 1.51 | 56.01 | 10 | 9 | 8 | 9 | Female | Yes | Yes | No | university | Yes | Yes | 28 |
| 2 | 33.81 | 16.53 | 1.17 | 58.18 | 14 | 3 | 5 | 2 | Female | Yes | No | No | university | Yes | No | 29 |
| 3 | 37.86 | 15.67 | 1.21 | 71.48 | 12 | 9 | 6 | 2 | Female | No | Yes | No | university | Yes | No | 33 |
| 4 | 37.82 | 18.45 | 1.56 | 46.05 | 9 | 3 | 5 | 10 | Female | Yes | Yes | No | baccalaureate | No | No | 45 |
# Split the column names by dtype: float64/int64 columns are numeric, everything
# else is treated as categorical.
# Index.union / Index.intersection replace the deprecated `|` and `&` set
# operators on Index objects (see the FutureWarning pandas emits for them).
# union() returns the labels sorted, intersection() keeps the left-hand order,
# which matches what the operators produced.
_dtypes = df.dtypes
numerical = list(
    _dtypes[_dtypes == 'float64'].index.union(_dtypes[_dtypes == 'int64'].index)
)
categorical = list(
    _dtypes[_dtypes != 'float64'].index.intersection(
        _dtypes[_dtypes != 'int64'].index
    )
)
/tmp/ipykernel_618/2314732951.py:1: FutureWarning: Index.__or__ operating as a set operation is deprecated, in the future this will be a logical operation matching Series.__or__. Use index.union(other) instead.
numerical = list((df.dtypes[df.dtypes == 'float64'].index) | (df.dtypes[df.dtypes == 'int64'].index))
/tmp/ipykernel_618/2314732951.py:2: FutureWarning: Index.__and__ operating as a set operation is deprecated, in the future this will be a logical operation matching Series.__and__. Use index.intersection(other) instead.
categorical = list((df.dtypes[df.dtypes != 'float64'].index) & (df.dtypes[df.dtypes != 'int64'].index))
def ohe(data, column):
    """One-hot encode the selected columns of ``data``.

    Args:
        data: DataFrame holding the columns to encode.
        column: Selector passed to ``data[...]``; the caller in this file
            passes a list of column names.

    Returns:
        DataFrame of integer indicator columns, named by the encoder's
        ``get_feature_names_out()`` and sharing the index of ``data``.
    """
    selected = data[column]
    encoder = OneHotEncoder().fit(selected)
    # The transformed output is densified with toarray() and cast to int
    # before it is wrapped in a DataFrame.
    indicators = encoder.transform(selected).toarray().astype(int)
    return pd.DataFrame(
        indicators,
        index=data.index,
        columns=encoder.get_feature_names_out(),
    )
# Numeric columns are kept as-is; categorical columns are replaced by their
# one-hot indicator columns, joined side by side on the shared row index.
encoded_categoricals = ohe(df[categorical], categorical)
ohe_df = pd.concat([df[numerical], encoded_categoricals], axis=1)
ohe_df.head()
| | age | black carbon | no2 | occurrence_mental | performance | pm25 | sleep quality | stress | wellbeing | gender_Female | ... | alcohol_Yes | drug_use_No | drug_use_Yes | education_baccalaureate | education_primary or less | education_university | access_greenbluespaces_No | access_greenbluespaces_Yes | smoke_No | smoke_Yes |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 37 | 0.50 | 28.54 | 2 | 51.24 | 8.16 | 8 | 5 | 8 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
| 1 | 28 | 1.51 | 44.51 | 10 | 56.01 | 13.38 | 9 | 8 | 9 | 1 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 2 | 29 | 1.17 | 33.81 | 14 | 58.18 | 16.53 | 2 | 5 | 3 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
| 3 | 33 | 1.21 | 37.86 | 12 | 71.48 | 15.67 | 2 | 6 | 9 | 1 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
| 4 | 45 | 1.56 | 37.82 | 9 | 46.05 | 18.45 | 10 | 5 | 3 | 1 | ... | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
5 rows × 24 columns
# 'performance' is the regression target; every remaining column is a predictor.
y = ohe_df['performance']
X = ohe_df.drop(columns='performance')
# Hold out 30% of the rows for evaluation, with a fixed seed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
Linear Regression
# Fit ordinary least squares on the training split and report its parameters.
model = LinearRegression().fit(X_train, y_train)
print('model intercept :', model.intercept_)
print('model coefficients : ', model.coef_)
# Report R^2 on the held-out split only: scoring on the full X, y would mix in
# the rows the model was fitted on and overstate how well it generalizes.
print('Model score : ', model.score(X_test, y_test))
model intercept : 58.31824436597431
model coefficients : [-0.44714672 1.42349775 -0.03183401 0.76227076 -0.11147233 -0.01605531
-0.57671488 -0.00766175 -0.25765266 0.25765266 -0.96103905 0.96103905
-0.06319105 0.06319105 2.95368591 -2.95368591 -0.91066547 -1.62917906
2.53984453 -0.29977911 0.29977911 -0.44647134 0.44647134]
Model score : 0.28852599872359175
# Root-mean-squared error of the linear model on the held-out split.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error is deprecated in recent scikit-learn releases.
y_pred = model.predict(X_test)
lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
lr_rmse
8.826256209196428
# RMSE expressed as a fraction of the target's observed range in the test split.
lr_rmse / (y_test.max() - y_test.min())
0.10816490452446603
Random Forest
# Fit a 100-tree forest with depth capped at 4 (seeded for reproducibility)
# and evaluate it on the held-out split.
model = RandomForestRegressor(n_estimators=100, max_depth=4, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error is deprecated in recent scikit-learn releases.
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rf_rmse
8.723155813250099
# RMSE expressed as a fraction of the target's observed range in the test split.
rf_rmse / (y_test.max() - y_test.min())
0.10690141928002572
# Draw the first tree of the fitted forest on a single small, high-DPI axes.
fig, axes = plt.subplots(figsize=(4, 4), dpi=800)
tree.plot_tree(model.estimators_[0], ax=axes);