EDA and Data Cleaning

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

Data from https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction

# Load the raw heart-failure dataset from disk and preview its first rows
raw_data = pd.read_csv(filepath_or_buffer='../data/heart.csv')
raw_data.head(n=5)
Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR ExerciseAngina Oldpeak ST_Slope HeartDisease
0 40 M ATA 140 289 0 Normal 172 N 0.0 Up 0
1 49 F NAP 160 180 0 Normal 156 N 1.0 Flat 1
2 37 M ATA 130 283 0 ST 98 N 0.0 Up 0
3 48 F ASY 138 214 0 Normal 108 Y 1.5 Flat 1
4 54 M NAP 150 195 0 Normal 122 N 0.0 Up 0

Check for NaN values in columns

# Flag every missing cell, then total the flags per column and print them
missing_mask = raw_data.isnull()
nan_count = missing_mask.sum(axis=0)
print(nan_count)
Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64
# Draw the per-column histograms on one figure, tidy the layout and save it
raw_data.hist(figsize=(14, 10))
overview_fig = plt.gcf()
overview_fig.tight_layout()
overview_fig.savefig('../figures/eda_histograms.png')
../_images/3b283683a54dcc6f336050c835f1021a29f3839031c6abc7d5b9cfb432fe3f4f.png

Looking at class distributions

# Overlay, for every feature column, one histogram per HeartDisease class.
target_column = 'HeartDisease'
feature_columns = raw_data.columns.drop(target_column)

# groupby draws the groups in sorted key order, so sorted class values line up
# with the histograms in the order they are added to each axis.
class_labels = [str(value) for value in sorted(raw_data[target_column].unique())]

# Size the grid from the number of features instead of hard-coding it.
n_cols = 5
n_rows = -(-len(feature_columns) // n_cols)  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(16, 10),
                         squeeze=False)

# loop through each feature column in the dataframe
for i, col in enumerate(feature_columns):
    # select the appropriate subplot axis based on the index i
    ax = axes[i // n_cols, i % n_cols]
    # plot a histogram of the column for each group in 'HeartDisease'
    raw_data.groupby(target_column)[col].hist(alpha=0.4, ax=ax)
    # set the title, y-axis label, and x-axis label for the subplot
    ax.set_title(col)
    ax.set_ylabel("Count")
    ax.set_xlabel("Value")
    # Label by class value. Passing the raw target column labelled the
    # histograms with whatever the first rows of the data happened to contain.
    ax.legend(class_labels)

# Remove the grid cells that did not receive a feature.
for unused_ax in axes.ravel()[len(feature_columns):]:
    fig.delaxes(unused_ax)

# adjust the spacing between the subplots for better readability
plt.tight_layout()
plt.savefig('../figures/eda_class_distributions.png')
../_images/367c6a0d476f79a6271dea727f8f7e2e162db11b962874be36a03626f857a061.png

PCA

# Principal component analysis of the non-categorical feature columns.
categorical_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
target_column = 'HeartDisease'

# Drop the target together with the categorical columns: leaving
# 'HeartDisease' in X leaks the label into the components that are
# later coloured by that same label.
X = raw_data.drop(columns=categorical_columns + [target_column])
y = raw_data[target_column]

# Standardize the features before PCA.
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Keep one component per feature column. A fixed count breaks as soon as it
# exceeds the number of columns in X.
pca = PCA(n_components=X.shape[1])
# fit_transform both fits the model and projects the data, so a separate
# fit() call beforehand is not needed.
p_c = pca.fit_transform(X_standardized)

# Name the component columns from the actual width of the projection.
pc_names = [f'PC{i}' for i in range(1, p_c.shape[1] + 1)]
principal_df = pd.DataFrame(data=p_c, columns=pc_names)

# Attach the target so the components can be plotted per class.
final_df = pd.concat([principal_df, y], axis=1)
final_df.head()
PC1 PC2 PC3 PC4 PC5 PC6 PC7 HeartDisease
0 -2.417053 -0.508044 -0.536672 0.492483 -0.592140 0.263087 0.035454 0
1 0.138073 -0.813523 -0.245526 0.495047 -1.577625 -0.012082 0.836385 1
2 -1.346575 0.120716 0.050249 -1.194085 -0.405928 1.707199 -1.379748 0
3 0.768445 -0.416296 0.876559 -0.468257 -0.780828 0.872467 -0.231420 1
4 -0.519631 -0.324544 -1.160150 -1.158588 -0.430199 0.032315 -0.483574 0
# Scatter the first two principal components, one colour per HeartDisease class
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('2 Component PCA of Non Categorical Data', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)

targets = [0, 1]
colors = ['r', 'b']
for class_value, class_color in zip(targets, colors):
    # Rows of the component table that belong to this class
    class_rows = final_df[final_df['HeartDisease'] == class_value]
    ax.scatter(class_rows['PC1'], class_rows['PC2'], c=class_color, s=50)

ax.legend(targets)
ax.grid()
plt.savefig('../figures/2_component_pca.png')
../_images/e88bf1377cbeff6fc02a9a58630bd9ad417d1a7ed2727c51d139af7cacae3349.png
## Variability of components
# Cumulative explained variance, with a leading 0 for "no components kept".
cum_var = pca.explained_variance_ratio_.cumsum()
cum_var = np.insert(cum_var, 0, 0)

# Take the x positions from the fitted PCA itself rather than from the feature
# table, so the two plotted sequences always have the same length.
n_components = len(pca.explained_variance_ratio_)
k_features = list(range(n_components + 1))

# Plot figure
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.plot(k_features, cum_var)
ax.set_xlabel("Number of Eigenvalues")
ax.set_ylabel("Cumulative Sum of Variance")
ax.set_title("Cumulative Sum of Variance vs Number of Eigenvalues")
# Span exactly the fitted components instead of a hard-coded upper limit.
ax.set_xlim([0, n_components])
# Slight headroom above 1 so the final point is not clipped by the frame.
ax.set_ylim([0, 1.01])
ax.grid()
plt.savefig('../figures/sum_of_variance.png')
../_images/1e7ff1c2b5f88d65c477f2170375ccf733c1dd12e96e60d76471c2a6e3c323c7.png