Data Cleaning#
import numpy as np
import pandas as pd
# import original data
health = pd.read_csv('data/CitieSHealth_BCN_DATA_PanelStudy_20220414.csv')
np.shape(health)
(3348, 95)
# The list of variables that we are intersted in
variables = ['no2bcn_24h', 'pm25bcn','BCμg','performance',
'occurrence_mental', 'bienestar', 'estres', 'sueno',
'gender', 'dieta', 'alcohol','drogas', 'education',
'access_greenbluespaces_300mbuff','smoke', 'age_yrs']
renamed_variables = {'no2bcn_24h':'no2', 'pm25bcn':'pm25', 'BCμg':'black carbon',
'bienestar': 'wellbeing', 'estres':'stress', 'sueno':'sleep quality',
'dieta':'diet', 'drogas':'drug_use', 'age_yrs':'age',
'access_greenbluespaces_300mbuff': 'access_greenbluespaces'}
health_clean = (
health
.loc[:, variables]
.convert_dtypes() #automatically convert variable type
.rename(columns = renamed_variables)
.round(2)
.dropna()
)
health_clean.shape
(2221, 16)
health_clean['gender'].value_counts()
Mujer 1588
Hombre 628
Otra 5
Name: gender, dtype: Int64
health_clean['education'].value_counts()
Universitario 1879
Bachillerato 300
Primario o menos 42
Name: education, dtype: Int64
# translate some non-English contents
health_clean = health_clean.replace({'gender':{'Mujer':'Female', 'Hombre':'Male', 'Otra':'Other'},
'education':{'Universitario':'university',
'Bachillerato':'baccalaureate',
'Primario o menos':'primary or less'}})
# Omit 'Other' in gender column since the sample size for other was too small
health_clean = health_clean[health_clean['gender'] != 'Other']
health_clean.shape
(2216, 16)
health_clean.head()
| no2 | pm25 | black carbon | performance | occurrence_mental | wellbeing | stress | sleep quality | gender | diet | alcohol | drug_use | education | access_greenbluespaces | smoke | age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 28.54 | 8.16 | 0.5 | 51.24 | 2 | 8 | 5 | 8 | Female | Yes | No | No | university | Yes | No | 37 |
| 2 | 44.51 | 13.38 | 1.51 | 56.01 | 10 | 9 | 8 | 9 | Female | Yes | Yes | No | university | Yes | Yes | 28 |
| 3 | 33.81 | 16.53 | 1.17 | 58.18 | 14 | 3 | 5 | 2 | Female | Yes | No | No | university | Yes | No | 29 |
| 4 | 37.86 | 15.67 | 1.21 | 71.48 | 12 | 9 | 6 | 2 | Female | No | Yes | No | university | Yes | No | 33 |
| 6 | 37.82 | 18.45 | 1.56 | 46.05 | 9 | 3 | 5 | 10 | Female | Yes | Yes | No | baccalaureate | No | No | 45 |
# save the file after cleaning
health_clean.to_csv('data/CitieSHealth_BCN_DATA_PanelStudy_20220414_Clean.csv', index=False)