Education and Performance

Education and Performance

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the study CSV into a DataFrame. The path is relative, so it resolves
# against the current working directory; pandas raises FileNotFoundError when
# the file is not there.
data_path = 'data/CitieSHealth_BCN_DATA_PanelStudy_20220414_Clean.csv'
data = pd.read_csv(data_path)
# Bare expression: in a notebook cell this displays the loaded frame.
data

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[2], line 1
----> 1 data = pd.read_csv('data/CitieSHealth_BCN_DATA_PanelStudy_20220414_Clean.csv')
data

File /srv/conda/envs/notebook/lib/python3.10/site-packages/pandas/util/_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
   else:
       kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)

File /srv/conda/envs/notebook/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
if len(args) > num_allow_args:
   warnings.warn(
       msg.format(arguments=_format_argument_list(allow_args)),
       FutureWarning,
       stacklevel=find_stack_level(),
   )
--> 331 return func(*args, **kwargs)

File /srv/conda/envs/notebook/lib/python3.10/site-packages/pandas/io/parsers/readers.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
kwds_defaults = _refine_defaults_read(
   dialect,
   delimiter,
   (...)
   defaults={"delimiter": ","},
)
kwds.update(kwds_defaults)
--> 950 return _read(filepath_or_buffer, kwds)

File /srv/conda/envs/notebook/lib/python3.10/site-packages/pandas/io/parsers/readers.py:605, in _read(filepath_or_buffer, kwds)
_validate_names(kwds.get("names", None))
# Create the parser.
--> 605 parser = TextFileReader(filepath_or_buffer, **kwds)
if chunksize or iterator:
   return parser

File /srv/conda/envs/notebook/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1442, in TextFileReader.__init__(self, f, engine, **kwds)
   self.options["has_index_names"] = kwds["has_index_names"]
self.handles: IOHandles | None = None
-> 1442 self._engine = self._make_engine(f, self.engine)

File /srv/conda/envs/notebook/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1735, in TextFileReader._make_engine(self, f, engine)
   if "b" not in mode:
       mode += "b"
-> 1735 self.handles = get_handle(
   f,
   mode,
   encoding=self.options.get("encoding", None),
   compression=self.options.get("compression", None),
   memory_map=self.options.get("memory_map", False),
   is_text=is_text,
   errors=self.options.get("encoding_errors", "strict"),
   storage_options=self.options.get("storage_options", None),
)
assert self.handles is not None
f = self.handles.handle

File /srv/conda/envs/notebook/lib/python3.10/site-packages/pandas/io/common.py:856, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
elif isinstance(handle, str):
   # Check whether the filename is to be opened in binary mode.
   # Binary mode does not support 'encoding' and 'newline'.
   if ioargs.encoding and "b" not in ioargs.mode:
       # Encoding
--> 856         handle = open(
           handle,
           ioargs.mode,
           encoding=ioargs.encoding,
           errors=errors,
           newline="",
       )
   else:
       # Binary mode
       handle = open(handle, ioargs.mode)

FileNotFoundError: [Errno 2] No such file or directory: 'data/CitieSHealth_BCN_DATA_PanelStudy_20220414_Clean.csv'

# Draw every plot on its own freshly created figure and save through that
# figure object. Relying on pyplot's implicit "current figure" stacks
# successive plots on the same axes when this code runs as a plain script, and
# can save an empty canvas if the figure was already shown and closed before
# savefig is reached.
#
# Each entry is (seaborn plotting function, column mapping, output path).
# NOTE: the 'outputs' directory is assumed to exist already.
plot_specs = [
    (sns.histplot,
     {'x': 'performance', 'hue': 'education'},
     'outputs/histogram_for_performance'),
    (sns.histplot,
     {'x': 'occurrence_mental', 'hue': 'education'},
     'outputs/histogram_for_occurrence_mental'),
    (sns.histplot,
     {'x': 'stress', 'hue': 'education'},
     'outputs/histogram_for_stress'),
    (sns.barplot,
     {'x': 'education', 'y': 'stress'},
     'outputs/barplot_for_stressVsEducation'),
    (sns.barplot,
     {'x': 'education', 'y': 'occurrence_mental'},
     'outputs/barplot_for_occurrenceVsEducation'),
    (sns.barplot,
     {'x': 'education', 'y': 'performance'},
     'outputs/barplot_for_performanceVsEducation'),
    (sns.scatterplot,
     {'x': 'stress', 'y': 'performance', 'hue': 'education'},
     'outputs/scatterplot_for_stressVsperformance'),
    (sns.scatterplot,
     {'x': 'occurrence_mental', 'y': 'performance', 'hue': 'education'},
     'outputs/scatterplot_for_occurrenceVsperformance'),
]

for plot_func, plot_kwargs, output_path in plot_specs:
    fig, ax = plt.subplots()
    plot_func(data=data, ax=ax, **plot_kwargs)
    # The paths carry no extension, so matplotlib uses its default savefig
    # format, exactly as the original plt.savefig calls did.
    fig.savefig(output_path)

# Mean of 'occurrence_mental' within each education level. These three names
# are read again further down when the observed differences are computed.
average_mental_health_issue_occurence_univ = (
    data.loc[data['education'] == 'university', 'occurrence_mental'].mean()
)
average_mental_health_issue_occurence_bac = (
    data.loc[data['education'] == 'baccalaureate', 'occurrence_mental'].mean()
)
average_mental_health_issue_occurence_pri_less = (
    data.loc[data['education'] == 'primary or less', 'occurrence_mental'].mean()
)

# Report the three means rounded to three decimals.
print(
    'Average Mental Health Issue Occurence for people with University education is {:0.3f}'
    .format(average_mental_health_issue_occurence_univ)
)
print(
    'Average Mental Health Issue Occurence for people with Baccalaureate education is {:0.3f}'
    .format(average_mental_health_issue_occurence_bac)
)
print(
    'Average Mental Health Issue Occurence for people with Primary or less education is {:0.3f}'
    .format(average_mental_health_issue_occurence_pri_less)
)

# Mean of 'stress' within each education level. These three names are read
# again further down when the observed differences are computed.
average_stress_univ = (
    data.loc[data['education'] == 'university', 'stress'].mean()
)
average_stress_bac = (
    data.loc[data['education'] == 'baccalaureate', 'stress'].mean()
)
average_stress_pri_less = (
    data.loc[data['education'] == 'primary or less', 'stress'].mean()
)

# Report the three means rounded to three decimals.
print(
    'Average Stress Level for people with University education is {:0.3f}'
    .format(average_stress_univ)
)
print(
    'Average Stress Level for people with Baccalaureate education is {:0.3f}'
    .format(average_stress_bac)
)
print(
    'Average Stress Level for people with Primary or less education is {:0.3f}'
    .format(average_stress_pri_less)
)

# Mean of 'performance' within each education level. These three names are
# read again further down when the observed differences are computed.
average_performance_univ = (
    data.loc[data['education'] == 'university', 'performance'].mean()
)
average_performance_bac = (
    data.loc[data['education'] == 'baccalaureate', 'performance'].mean()
)
average_performance_pri_less = (
    data.loc[data['education'] == 'primary or less', 'performance'].mean()
)

# Report the three means rounded to three decimals.
print(
    'Average Stroop Test score for people with University education is {:0.3f}'
    .format(average_performance_univ)
)
print(
    'Average Stroop Test score for people with Baccalaureate education is {:0.3f}'
    .format(average_performance_bac)
)
print(
    'Average Stroop Test score for people with Primary or less education is {:0.3f}'
    .format(average_performance_pri_less)
)

def hypothesis_testing(educ1, educ2, column, data, observed_diff, cutoff,
                       n_iter=20000, seed=None):
    """Print the result of a permutation test on a difference of group means.

    The 'education' labels of ``data`` are shuffled ``n_iter`` times. For each
    shuffle, the absolute difference between the mean of ``column`` over rows
    labelled ``educ1`` and over rows labelled ``educ2`` is recorded. The
    p-value is the fraction of shuffles whose difference is at least
    ``observed_diff``. The p-value and the decision at ``cutoff`` are printed.

    Parameters
    ----------
    educ1, educ2
        Values of the 'education' column that identify the two groups.
    column : str
        Name of the column whose group means are compared.
    data : pandas.DataFrame
        Must contain 'education' and ``column``. It is only read; no column
        is added to or removed from the caller's frame.
    observed_diff : float
        Absolute difference of means measured on the unshuffled data.
    cutoff : float
        The null hypothesis is rejected when the p-value is <= ``cutoff``.
    n_iter : int, optional
        Number of shuffles (default 20000).
    seed : int or None, optional
        Seed for a private random generator, for reproducible runs. With the
        default ``None`` the shuffles draw from NumPy's global random state.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``n_iter`` is smaller than 1.
    """
    if n_iter < 1:
        raise ValueError('n_iter must be at least 1')

    # Only two columns are needed. Resetting the index lets the shuffled
    # labels (also re-indexed below) line up with the values by position.
    education = data['education'].reset_index(drop=True)
    values = data[column].reset_index(drop=True)

    # One generator for the whole run; re-seeding inside the loop would
    # produce the same permutation every time.
    rng = None if seed is None else np.random.RandomState(seed)

    diffs = np.empty(n_iter)
    for i in range(n_iter):
        shuffled = education.sample(
            frac=1, replace=False, random_state=rng
        ).reset_index(drop=True)
        mean1 = values[shuffled == educ1].mean()
        mean2 = values[shuffled == educ2].mean()
        diffs[i] = np.abs(mean1 - mean2)

    # Array comparison works for both Python and NumPy scalars.
    pvalue = np.mean(diffs >= observed_diff)
    if pvalue <= cutoff:
        print('P value is {:0.3f}'.format(pvalue) + '.' + ' ' + 'Thus, Reject the Null Hypothesis')
    else:
        print('P value is {:0.3f}'.format(pvalue) + '.' + ' ' + 'Thus, Fail to Rejct the Null Hypothesis')

# Pairwise permutation tests: for each outcome column, every pair of education
# levels is compared at a 0.05 cutoff. Each row is
# (educ1, educ2, column, mean for educ1, mean for educ2).
pairwise_comparisons = [
    ('university', 'baccalaureate', 'occurrence_mental',
     average_mental_health_issue_occurence_univ,
     average_mental_health_issue_occurence_bac),
    ('university', 'primary or less', 'occurrence_mental',
     average_mental_health_issue_occurence_univ,
     average_mental_health_issue_occurence_pri_less),
    ('baccalaureate', 'primary or less', 'occurrence_mental',
     average_mental_health_issue_occurence_bac,
     average_mental_health_issue_occurence_pri_less),
    ('university', 'baccalaureate', 'stress',
     average_stress_univ, average_stress_bac),
    ('university', 'primary or less', 'stress',
     average_stress_univ, average_stress_pri_less),
    ('primary or less', 'baccalaureate', 'stress',
     average_stress_pri_less, average_stress_bac),
    ('university', 'baccalaureate', 'performance',
     average_performance_univ, average_performance_bac),
    ('university', 'primary or less', 'performance',
     average_performance_univ, average_performance_pri_less),
    ('baccalaureate', 'primary or less', 'performance',
     average_performance_bac, average_performance_pri_less),
]

for first_level, second_level, outcome, first_mean, second_mean in pairwise_comparisons:
    # The test statistic is the absolute gap between the two group means.
    observed_diff = np.abs(first_mean - second_mean)
    hypothesis_testing(first_level, second_level, outcome, data, observed_diff, 0.05)