Data Visualization - Plots Examples¶

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

Read data¶

%%capture
#read the dataset
gss = pd.read_csv("https://github.com/jkropko/DS-6001/raw/master/localdata/gss2018.csv",
                 encoding='cp1252', na_values=['IAP','IAP,DK,NA,uncodeable', 'NOT SURE',
                                               'DK', 'IAP, DK, NA, uncodeable', '.a', "CAN'T CHOOSE"])

Basic Cleaning¶

rename columns
check data types

# Clean the data and get it ready for the visualizations: keep only the
# columns used below, give them readable names, and make age numeric.
mycols = ['id', 'wtss', 'sex', 'educ', 'region', 'age', 'coninc',
          'prestg10', 'mapres10', 'papres10', 'sei10', 'satjob',
          'fechld', 'fefam', 'fepol', 'fepresch', 'meovrwrk']

# .copy() makes gss_clean an independent frame, so the assignments below
# modify it directly rather than a selection taken from gss.
gss_clean = gss[mycols].copy()

# Only columns that are actually selected in mycols are listed here.
gss_clean = gss_clean.rename(columns={'wtss': 'weight',
                                      'educ': 'education',
                                      'coninc': 'income',
                                      'prestg10': 'job_prestige',
                                      'mapres10': 'mother_job_prestige',
                                      'papres10': 'father_job_prestige',
                                      'sei10': 'socioeconomic_index',
                                      'fechld': 'relationship',
                                      'fefam': 'male_breadwinner',
                                      'fepol': 'men_bettersuited',
                                      'fepresch': 'child_suffer',
                                      'meovrwrk': 'men_overwork'})

# The text value '89 or older' is recoded to '89' so the column can be cast to float.
gss_clean['age'] = gss_clean['age'].replace({'89 or older': '89'}).astype('float')

The dataframe now contains the following features:

id - a numeric unique ID for each person who responded to the survey
weight - survey sample weights
sex - male or female
education - years of formal education
region - region of the country where the respondent lives
age - age
income - the respondent's personal annual income
job_prestige - the respondent's occupational prestige score, as measured by the GSS using the methodology described above
mother_job_prestige - the respondent's mother's occupational prestige score, as measured by the GSS using the methodology described above
father_job_prestige - the respondent's father's occupational prestige score, as measured by the GSS using the methodology described above
socioeconomic_index - an index measuring the respondent's socioeconomic status
satjob - responses to "On the whole, how satisfied are you with the work you do?"
relationship - agree or disagree with: "A working mother can establish just as warm and secure a relationship with her children as a mother who does not work."
male_breadwinner - agree or disagree with: "It is much better for everyone involved if the man is the achiever outside the home and the woman takes care of the home and family."
men_bettersuited - agree or disagree with: "Most men are better suited emotionally for politics than are most women."
child_suffer - agree or disagree with: "A preschool child is likely to suffer if his or her mother works."
men_overwork - agree or disagree with: "Family life often suffers because men concentrate too much on their work."

Barplot¶

# Bar chart of the responses to: "A working mother can establish just as warm and
# secure a relationship with her children as a mother who does not work."
palette = ['red', 'red', 'blue', 'blue']

# Store the answers as a categorical whose categories run from
# 'strongly agree' to 'strongly disagree'.
response_order = ['strongly agree', 'agree', 'disagree', 'strongly disagree']
relationship = gss_clean['relationship'].astype('category')
gss_clean['relationship'] = relationship.cat.reorder_categories(response_order)

# Response frequencies, indexed from the last category back to the first.
mybars = gss_clean['relationship'].value_counts().sort_index(ascending=False)

# seaborn
plt.figure(figsize=(10, 8));
palette.reverse()  # the colour list is reversed in place before plotting
sns.barplot(y=mybars.index, x=mybars.values, orient='h',
            palette=palette, alpha=0.5);
plt.title('Responses for the relationship question');
plt.ylabel('Response');
plt.xlabel('Frequency');

Clustered Barplot¶

# Count respondents for every sex / relationship-response combination.
temp_df = gss_clean[['sex', 'relationship']]
temp_df_g = (temp_df.groupby(['sex', 'relationship'])
                    .size()
                    .reset_index(name='count'))

plt.figure(figsize=(10, 8))
s = sns.barplot(x='relationship', y='count', hue='sex', data=temp_df_g);
plt.title('Relationship response counts by Sex')
# Write each bar's height just above its top edge, centred horizontally.
for bar in s.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + .5 * bar.get_width()
    plt.text(bar_centre, bar_height, str(int(bar_height)),
             horizontalalignment='center',
             verticalalignment='bottom',
             fontsize=14)

Grid barplot¶

# Count responses for every region / relationship-response combination,
# sorted by descending index before being flattened into columns.
tempdf_reg = gss_clean[['region', 'relationship']]
region_counts = tempdf_reg.groupby(['region', 'relationship']).size()
tempdf_reg_g = region_counts.sort_index(ascending=False).reset_index(name='count')

# One horizontal bar chart per region, three panels per row.
g = sns.FacetGrid(data=tempdf_reg_g, col='region', hue='region', col_wrap=3);
g.map(plt.barh, 'relationship', 'count');
g.set_titles('{col_name}');
g.set_axis_labels('Counts', 'Relationship response');
# Leave room above the panels for the overall title.
g.fig.subplots_adjust(top=.8);
g.fig.suptitle('Relationship response counts by region', fontsize=16);

Lineplot¶

# Line plot: mean socioeconomic index and job prestige per year of education.
temp_line = gss_clean[['socioeconomic_index', 'job_prestige', 'education']]
# Keep respondents with at least 8 years of education.
temp_line = temp_line.loc[temp_line['education'] >= 8]

# Average both indices within each education level and give them display names.
temp_line_g = temp_line.groupby('education').mean().round(2)
temp_line_g = temp_line_g.rename(columns={'socioeconomic_index': 'Socioeconomic Status',
                                          'job_prestige': 'Occupational Prestige'})

temp_line_g.plot.line(color=['orange', 'lightblue'], style=['--', '-'], figsize=[15, 8])
plt.title('Occupational Prestige and Socioeconomic Status vs. Years of Education');
plt.xlabel('Years of formal education');
plt.ylabel('Value');
plt.legend(title='Index');
plt.ylim(0, 80)

(0.0, 80.0)

Correlation plot¶

# Heatmap of the pairwise correlations between the economic indicators.
economic_cols = ['job_prestige', 'father_job_prestige', 'mother_job_prestige',
                 'income', 'socioeconomic_index']
corr_df = gss_clean[economic_cols]
plt.figure(figsize=(15, 10))
# annot=True prints each coefficient in its cell; the colour scale is centred on 0.
c = sns.heatmap(corr_df.corr(), annot=True, center=0.)
plt.title('Correlation Between Personal Economic indicators');

Facet grid plot¶

# Income vs. occupational prestige, one regression panel per education bracket.
# .copy() makes temp_edu an independent frame so the new column is added to it
# directly rather than to a selection taken from gss_clean.
temp_edu = gss_clean[['income', 'education', 'job_prestige']].copy()

# Lookup from years of education (0-20) to the bracket label shown on each panel.
educ_labels = {years: '10 years or fewer' for years in range(0, 11)}
educ_labels.update({years: str(years) + ' years' for years in range(11, 17)})
educ_labels.update({years: 'More than 16 years' for years in range(17, 21)})

# A single replace() whose result is assigned back to the column. Calling
# replace(..., inplace=True) on temp_edu.educ_cat is chained assignment, which
# does not update temp_edu when pandas copy-on-write is in effect.
temp_edu['educ_cat'] = temp_edu['education'].astype('object').replace(educ_labels)
temp_edu = temp_edu.sort_values(by='education')

g = sns.FacetGrid(temp_edu, col='educ_cat', height=4, aspect=1, col_wrap=2)
g.map(sns.regplot, 'job_prestige', 'income')
g.set_titles('{col_name}')
g.set_axis_labels('Occupational Prestige', 'Income')
# Leave room above the panels for the overall title.
g.fig.subplots_adjust(top=.95)
g.fig.suptitle('Income vs. Prestige by Education', fontsize=16);

Gender Inequality - Income using Boxplot and Violin plot¶

from scipy import stats


def trim_mean(x):
    """Return the 10% trimmed mean of *x*, ignoring missing values.

    Missing values (NaN) are removed before trimming so that they cannot
    occupy the slots that are supposed to be cut from the ends of the sorted
    data. The remaining values are passed to ``scipy.stats.trim_mean`` with a
    proportion of 0.1 cut from each end.

    Parameters
    ----------
    x : array-like of numbers
        Values to average, e.g. a list, NumPy array or pandas Series.

    Returns
    -------
    float
        The trimmed mean, or NaN when *x* holds no non-missing values.
    """
    values = np.asarray(x, dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        # Nothing left to average; report the result as missing.
        return np.nan
    return stats.trim_mean(values, .1)

# Income by gender: trimmed mean per sex, then box and violin plots.
# .copy() gives an independent frame, so the columns that are added to
# temp_gen_inc further down are written to it directly rather than to a
# selection taken from gss_clean.
temp_gen_inc = gss_clean[['income', 'sex', 'job_prestige']].copy()
temp_gen_inc.groupby('sex')['income'].agg([trim_mean]).round(2)

plt.figure(figsize=(10, 5))
sns.boxplot(data=temp_gen_inc, y='sex', x='income', orient='h', hue='sex').set(title="Income by Gender");

plt.figure(figsize=(10, 5))
sns.violinplot(data=temp_gen_inc, y='sex', x='income', orient='h', hue='sex').set(title="Income by Gender");

Relationship between income and job prestige¶

# Regression of income on occupational prestige, one panel per sex.
# FacetGrid creates its own figure (sized through height/aspect), so no
# separate plt.figure() call is made here; such a call only produces an
# extra, empty figure.
g = sns.FacetGrid(temp_gen_inc, col='sex', height=6, aspect=1, col_wrap=2);
g.map(sns.regplot, 'job_prestige', 'income');
g.set_titles('{col_name}')
g.set_axis_labels('Occupational Prestige', 'Income')
# Leave room above the panels for the overall title.
g.fig.subplots_adjust(top=.95);
g.fig.suptitle('Income vs. Prestige by Sex', fontsize=16);

<Figure size 1080x1080 with 0 Axes>

# Cut job prestige into four bins and list the distinct intervals that occur.
temp_gen_inc['job_prestige_groups'] = pd.cut(temp_gen_inc['job_prestige'], bins=4)
temp_gen_inc['job_prestige_groups'].unique()

[(32.0, 48.0], (15.936, 32.0], (48.0, 64.0], (64.0, 80.0], NaN]
Categories (4, interval[float64]): [(15.936, 32.0] < (32.0, 48.0] < (48.0, 64.0] < (64.0, 80.0]]

# Mean income for every job-prestige bin / sex combination, rounded to whole
# units; combinations without a value are dropped.
temp_inc_g = temp_gen_inc.groupby(['job_prestige_groups', 'sex'])['income'].mean().round(0)
temp_inc_g = temp_inc_g.reset_index().dropna()

temp_inc_g.columns = ['Job prestige groups', 'Sex', 'Mean Income']

plt.figure(figsize=(10, 8))
s = sns.barplot(y='Mean Income', x='Job prestige groups', data=temp_inc_g, hue='Sex');
# Write each bar's height just above its top edge, centred horizontally.
for bar in s.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + .5 * bar.get_width()
    plt.text(bar_centre, bar_height, str(int(bar_height)),
             horizontalalignment='center',
             verticalalignment='bottom',
             fontsize=14)
plt.title('Income by job prestige score bins - Male vs Female');

# Cut income into five bins.
temp_gen_inc['income_group'] = pd.cut(temp_gen_inc['income'], bins=5)

# Mean job prestige for every income bin / sex combination, rounded to whole
# units; combinations without a value are dropped.
temp_inc_job_g = temp_gen_inc.groupby(['income_group', 'sex'])['job_prestige'].agg(['mean']).round(0)
temp_inc_g_i_df = temp_inc_job_g.reset_index().dropna()

# The aggregated value is job prestige, so the column (and with it the
# y-axis label) is named accordingly rather than 'Mean Income'.
temp_inc_g_i_df.columns = ['Income group', 'Sex', 'Mean Job Prestige']

plt.figure(figsize=(20, 6))
s = sns.barplot(y='Mean Job Prestige', x='Income group', data=temp_inc_g_i_df, hue='Sex');
# Write each bar's height just above its top edge, centred horizontally.
for rect in s.patches:
    xcoor = rect.get_x() + .5 * rect.get_width()
    ycoor = rect.get_height()
    plt.text(xcoor, ycoor, str(int(ycoor)),
             horizontalalignment='center',
             verticalalignment='bottom',
             fontsize=14)
plt.title('Prestige scores by income bins - Male vs Female');

	trim_mean
sex
female	51759.14
male	55496.02

Data Visualization - Plots Examples¶

Data: 2018 General Social Survey (https://gss.norc.org/)¶

Read data¶

Basic Cleaning¶

Barplot¶

Clustered Barplot¶

Grid barplot¶

Lineplot¶

Correlation plot¶

Facet grid plot¶

Gender Inequality - Income using Boxplot and Violin plot¶

Relationship between income and job prestige¶