Data Cleaning#

import numpy as np
import pandas as pd
# import original data
health = pd.read_csv('data/CitieSHealth_BCN_DATA_PanelStudy_20220414.csv')
np.shape(health)
(3348, 95)
# The list of variables that we are intersted in
variables = ['no2bcn_24h', 'pm25bcn','BCμg','performance',
             'occurrence_mental', 'bienestar', 'estres', 'sueno', 
             'gender', 'dieta', 'alcohol','drogas', 'education', 
             'access_greenbluespaces_300mbuff','smoke', 'age_yrs']

renamed_variables = {'no2bcn_24h':'no2', 'pm25bcn':'pm25', 'BCμg':'black carbon',
                     'bienestar': 'wellbeing', 'estres':'stress', 'sueno':'sleep quality',
                    'dieta':'diet', 'drogas':'drug_use', 'age_yrs':'age',
                     'access_greenbluespaces_300mbuff': 'access_greenbluespaces'}

health_clean = (
    health
    .loc[:, variables]
    .convert_dtypes()    #automatically convert variable type
    .rename(columns = renamed_variables)
    .round(2)
    .dropna()
)
health_clean.shape
(2221, 16)
health_clean['gender'].value_counts()
Mujer     1588
Hombre     628
Otra         5
Name: gender, dtype: Int64
health_clean['education'].value_counts()
Universitario       1879
Bachillerato         300
Primario o menos      42
Name: education, dtype: Int64
# translate some non-English contents
health_clean = health_clean.replace({'gender':{'Mujer':'Female', 'Hombre':'Male', 'Otra':'Other'},
                                    'education':{'Universitario':'university', 
                                                 'Bachillerato':'baccalaureate',
                                                 'Primario o menos':'primary or less'}})

# Omit 'Other' in gender column since the sample size for other was too small
health_clean = health_clean[health_clean['gender'] != 'Other']
health_clean.shape
(2216, 16)
health_clean.head()
no2 pm25 black carbon performance occurrence_mental wellbeing stress sleep quality gender diet alcohol drug_use education access_greenbluespaces smoke age
1 28.54 8.16 0.5 51.24 2 8 5 8 Female Yes No No university Yes No 37
2 44.51 13.38 1.51 56.01 10 9 8 9 Female Yes Yes No university Yes Yes 28
3 33.81 16.53 1.17 58.18 14 3 5 2 Female Yes No No university Yes No 29
4 37.86 15.67 1.21 71.48 12 9 6 2 Female No Yes No university Yes No 33
6 37.82 18.45 1.56 46.05 9 3 5 10 Female Yes Yes No baccalaureate No No 45
# save the file after cleaning
health_clean.to_csv('data/CitieSHealth_BCN_DATA_PanelStudy_20220414_Clean.csv', index=False)