import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%%capture
# Load the GSS 2018 CSV; the response codes listed below are parsed as missing values.
gss_url = "https://github.com/jkropko/DS-6001/raw/master/localdata/gss2018.csv"
missing_codes = [
    'IAP',
    'IAP,DK,NA,uncodeable',
    'NOT SURE',
    'DK',
    'IAP, DK, NA, uncodeable',
    '.a',
    "CAN'T CHOOSE",
]
gss = pd.read_csv(gss_url, encoding='cp1252', na_values=missing_codes)
# Clean the data and get it ready to be used for the visualizations.
# Only the survey columns listed here are kept.
mycols = ['id', 'wtss', 'sex', 'educ', 'region', 'age', 'coninc',
          'prestg10', 'mapres10', 'papres10', 'sei10', 'satjob',
          'fechld', 'fefam', 'fepol', 'fepresch', 'meovrwrk']

# Take an explicit copy: gss_clean is modified below and must not be a
# selection that still refers back to gss.
gss_clean = gss[mycols].copy()

# Give the kept columns descriptive names ('id', 'sex', 'region', 'age' and
# 'satjob' keep their original names).
gss_clean = gss_clean.rename(columns={
    'wtss': 'weight',
    'educ': 'education',
    'coninc': 'income',
    'prestg10': 'job_prestige',
    'mapres10': 'mother_job_prestige',
    'papres10': 'father_job_prestige',
    'sei10': 'socioeconomic_index',
    'fechld': 'relationship',
    'fefam': 'male_breadwinner',
    'fepol': 'men_bettersuited',
    'fepresch': 'child_suffer',
    'meovrwrk': 'men_overwork',
})

# The value '89 or older' is replaced by '89' so the whole column can be
# cast to float.
gss_clean['age'] = gss_clean['age'].replace({'89 or older': '89'}).astype('float')
The dataframe now contains the following features:
- `id` - a numeric unique ID for each person who responded to the survey
- `weight` - survey sample weights
- `sex` - male or female
- `education` - years of formal education
- `region` - region of the country where the respondent lives
- `age` - age
- `income` - the respondent's personal annual income
- `job_prestige` - the respondent's occupational prestige score, as measured by the GSS using the methodology described above
- `mother_job_prestige` - the respondent's mother's occupational prestige score, as measured by the GSS using the methodology described above
- `father_job_prestige` - the respondent's father's occupational prestige score, as measured by the GSS using the methodology described above
- `socioeconomic_index` - an index measuring the respondent's socioeconomic status
- `satjob` - responses to "On the whole, how satisfied are you with the work you do?"
- `relationship` - agree or disagree with: "A working mother can establish just as warm and secure a relationship with her children as a mother who does not work."
- `male_breadwinner` - agree or disagree with: "It is much better for everyone involved if the man is the achiever outside the home and the woman takes care of the home and family."
- `men_bettersuited` - agree or disagree with: "Most men are better suited emotionally for politics than are most women."
- `child_suffer` - agree or disagree with: "A preschool child is likely to suffer if his or her mother works."
- `men_overwork` - agree or disagree with: "Family life often suffers because men concentrate too much on their work."
#reorder categories
# Horizontal seaborn bar chart of the answers to: "A working mother can
# establish just as warm and secure a relationship with her children as a
# mother who does not work."

# Store the answers as a categorical with an explicit agree -> disagree order.
response_levels = ['strongly agree', 'agree', 'disagree', 'strongly disagree']
gss_clean['relationship'] = (
    gss_clean['relationship']
    .astype('category')
    .cat.reorder_categories(response_levels)
)

# Frequency of each answer, indexed in descending category order.
mybars = gss_clean['relationship'].value_counts().sort_index(ascending=False)

# The colour list is reversed before it is handed to seaborn.
palette = ['red', 'red', 'blue', 'blue']
palette = palette[::-1]

plt.figure(figsize=(10, 8))
ax = sns.barplot(
    x=mybars.values,
    y=mybars.index,
    orient='h',
    palette=palette,
    alpha=0.5,
)
ax.set_title('Responses for the relationship question')
ax.set_xlabel('Frequency')
ax.set_ylabel('Response');
# Grouped bar chart: number of respondents giving each 'relationship' answer,
# split by sex, with the count printed above every bar.
temp_df = gss_clean[['sex', 'relationship']]
temp_df_g = temp_df.groupby(['sex', 'relationship']).size().reset_index(name='count')

plt.figure(figsize=(10, 8))
s = sns.barplot(x='relationship', y='count', hue='sex', data=temp_df_g)
plt.title('Relationship response counts by Sex')

# Label the bars through the axes' bar containers instead of walking every
# patch on the axes, so only drawn data bars receive a label.
for bars in s.containers:
    s.bar_label(bars, fmt='%d', fontsize=14)
# Small multiples: one horizontal bar chart of 'relationship' answer counts
# per region, three panels per row.
tempdf_reg = gss_clean[['region', 'relationship']]
tempdf_reg_g = (
    tempdf_reg
    .groupby(['region', 'relationship'])
    .size()
    .sort_index(ascending=False)
    .reset_index(name='count')
)

g = sns.FacetGrid(data=tempdf_reg_g, col='region', hue='region', col_wrap=3)
g.map(plt.barh, 'relationship', 'count')
g.set_titles('{col_name}')
g.set_axis_labels('Counts', 'Relationship response')

# Leave room above the panels for the overall title.
g.fig.subplots_adjust(top=.8)
g.fig.suptitle('Relationship response counts by region', fontsize=16);
# Line plot: mean socioeconomic index and mean occupational prestige for each
# number of years of education (rows with fewer than 8 years are excluded).
temp_line = gss_clean[['socioeconomic_index', 'job_prestige', 'education']]
temp_line = temp_line.loc[temp_line['education'] >= 8]

temp_line_g = (
    temp_line
    .groupby('education')
    .mean()
    .round(2)
    .rename(columns={
        'socioeconomic_index': 'Socioeconomic Status',
        'job_prestige': 'Occupational Prestige',
    })
)

ax = temp_line_g.plot.line(
    color=['orange', 'lightblue'],
    style=['--', '-'],
    figsize=[15, 8],
)
ax.set_title('Occupational Prestige and Socioeconomic Status vs. Years of Education')
ax.set_xlabel('Years of formal education')
ax.set_ylabel('Value')
ax.legend(title='Index')
ax.set_ylim(0, 80)
# Annotated heatmap of the pairwise correlations between the economic
# indicators, with the colour scale centred on zero.
economic_columns = [
    'job_prestige',
    'father_job_prestige',
    'mother_job_prestige',
    'income',
    'socioeconomic_index',
]
corr_df = gss_clean[economic_columns]
corr_matrix = corr_df.corr()

plt.figure(figsize=(15, 10))
c = sns.heatmap(corr_matrix, annot=True, center=0.)
c.set_title('Correlation Between Personal Economic indicators');
# Scatter plots with regression lines of income against occupational
# prestige, one panel per education group.

# Explicit copy so the new column is added to temp_edu itself rather than to
# a selection of gss_clean.
temp_edu = gss_clean[['income', 'education', 'job_prestige']].copy()

# Map years of education onto group labels:
#   0-10  -> '10 years or fewer'
#   11-16 -> '<n> years'
#   17-20 -> 'More than 16 years'
# Any other value (including missing) is left unchanged.
educ_labels = {years: '10 years or fewer' for years in range(0, 11)}
educ_labels.update({years: str(years) + ' years' for years in range(11, 17)})
educ_labels.update({years: 'More than 16 years' for years in range(17, 21)})

# One replace whose result is assigned back to the frame; an in-place replace
# on the `temp_edu.educ_cat` accessor is not guaranteed to update temp_edu.
temp_edu['educ_cat'] = temp_edu['education'].astype('object').replace(educ_labels)

# Sorting by years of education puts the panels in ascending education order.
temp_edu = temp_edu.sort_values(by='education')

g = sns.FacetGrid(temp_edu, col='educ_cat', height=4, aspect=1, col_wrap=2)
g.map(sns.regplot, 'job_prestige', 'income')
g.set_titles('{col_name}')
g.set_axis_labels('Occupational Prestige', 'Income')
# Leave room above the panels for the overall title.
g.fig.subplots_adjust(top=.95)
g.fig.suptitle('Income vs. Prestige by Education', fontsize=16);
from scipy import stats
def trim_mean(x):
    """Return the 10% trimmed mean of *x*, ignoring missing values.

    Missing values (NaN) are removed before trimming. Otherwise they would be
    counted among the observations that get cut, which shifts the trim
    towards the low end or turns the result into NaN.

    Parameters
    ----------
    x : array-like of numbers
        The observations; may contain NaN.

    Returns
    -------
    float
        Mean of the values left after dropping NaN and cutting 10% of the
        observations from each end; NaN when no non-missing value remains.
    """
    values = np.asarray(x, dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        return np.nan
    return stats.trim_mean(values, .1)
# Working frame for the income-by-sex comparisons. An explicit copy is taken
# so that columns can be added to temp_gen_inc without it being a selection
# that still refers back to gss_clean.
temp_gen_inc = gss_clean[['income', 'sex', 'job_prestige']].copy()

# Trimmed mean income for each sex (computed with trim_mean), rounded to two
# decimals.
temp_gen_inc.groupby('sex')['income'].agg([trim_mean]).round(2)
# Distribution of income for each sex, drawn twice on separate figures:
# first as a box plot, then as a violin plot.
for draw_distribution in (sns.boxplot, sns.violinplot):
    plt.figure(figsize=(10, 5))
    dist_ax = draw_distribution(
        data=temp_gen_inc,
        x='income',
        y='sex',
        hue='sex',
        orient='h',
    )
    dist_ax.set(title="Income by Gender")
# Scatter plots with regression lines of income against occupational
# prestige, one panel per sex. FacetGrid creates its own figure, so no
# separate plt.figure() call is made (it would only leave an empty figure).
g = sns.FacetGrid(temp_gen_inc, col='sex', height=6, aspect=1, col_wrap=2)
g.map(sns.regplot, 'job_prestige', 'income')
g.set_titles('{col_name}')
g.set_axis_labels('Occupational Prestige', 'Income')
# Leave room above the panels for the overall title.
g.fig.subplots_adjust(top=.95)
g.fig.suptitle('Income vs. Prestige by Sex', fontsize=16);
# Mean income of men and women within four equal-width bins of the
# occupational prestige score.

# Create job_prestige bins
temp_gen_inc['job_prestige_groups'] = pd.cut(temp_gen_inc.job_prestige, 4)
# Inspect the resulting bins (does not modify the data).
temp_gen_inc['job_prestige_groups'].unique()

# Group by prestige bin and sex; bin/sex combinations without a mean income
# are dropped.
temp_inc_g = temp_gen_inc.groupby(['job_prestige_groups', 'sex'])['income'].mean().round(0)
temp_inc_g = temp_inc_g.reset_index().dropna()
temp_inc_g.columns = ['Job prestige groups', 'Sex', 'Mean Income']

plt.figure(figsize=(10, 8))
s = sns.barplot(y='Mean Income', x='Job prestige groups', data=temp_inc_g, hue='Sex')

# Label the bars through the axes' bar containers instead of walking every
# patch on the axes, so only drawn data bars receive a label.
for bars in s.containers:
    s.bar_label(bars, fmt='%d', fontsize=14)
plt.title('Income by job prestige score bins - Male vs Female');
# Mean occupational prestige of men and women within five equal-width income
# bins.

# Create income bins
temp_gen_inc['income_group'] = pd.cut(temp_gen_inc.income, 5)

# Group by income bin and sex; bin/sex combinations without a mean prestige
# score are dropped.
temp_inc_job_g = temp_gen_inc.groupby(['income_group', 'sex'])['job_prestige'].mean().round(0)
temp_inc_g_i_df = temp_inc_job_g.reset_index().dropna()
# The aggregated value is the mean of job_prestige, so the column (and with
# it the y-axis label) is named accordingly.
temp_inc_g_i_df.columns = ['Income group', 'Sex', 'Mean Job Prestige']

plt.figure(figsize=(20, 6))
s = sns.barplot(y='Mean Job Prestige', x='Income group', data=temp_inc_g_i_df, hue='Sex')

# Label the bars through the axes' bar containers instead of walking every
# patch on the axes, so only drawn data bars receive a label.
for bars in s.containers:
    s.bar_label(bars, fmt='%d', fontsize=14)
plt.title('Prestige scores by income bins - Male vs Female');