import pandas as pd
from matplotlib import pyplot as plt
import re
benefits = pd.read_csv("../../data/BenefitsCostSharing.csv",dtype={'BenefitName':'str',
'CoinsInnTier1':'str',
'CopayInnTier1':'str',
'ImportDate':'str'})
def eda(dataframe):
missing_value = pd.DataFrame(dataframe.isnull().sum(), columns = ['IsNull'])
typevals = pd.DataFrame(dataframe.dtypes, columns = ['DType'])
print "dataframe index \n", dataframe.index
print "dataframe shape \n", dataframe.shape
print "dataframe describe \n", dataframe.describe()
nunique = []
for i in dataframe:
nunique.append([i,dataframe[i].nunique()])
nunique = pd.DataFrame(nunique, columns = ['index','NUnique'])
nunique.set_index('index', inplace = True)
return pd.concat([missing_value, typevals, nunique], axis=1)
eda(benefits)
busrules = pd.read_csv("../../data/BusinessRules.csv")
eda(busrules)
planatts = pd.read_csv("../../data/PlanAttributes.csv")
eda(planatts)
benefits.fillna('0', inplace = True)
print benefits.shape
benefits.head()
# def extractor(i):
# try:
# return int(re.findall(r'\d+', i))
# except:
# return 0
# benefits['CoinsInnTier1'].apply(extractor, converty_type = 'int')
benefits['Deductible'] = benefits.CoinsInnTier1.str.contains('deductible', case = False)
benefits.Deductible.value_counts()
#The number of records for each year
benefits.groupby('BusinessYear')['BenefitName'].count()
print '- Number of individual benefit plans: ', len(benefits.groupby('PlanId')['BusinessYear'].count())
print '\n- Average number of benefits per plan: ', benefits.shape[0]/benefits.PlanId.nunique()
benefits.groupby('PlanId')['BusinessYear'].count().hist(figsize = (12,8), color = 'green')
plt.title('How many plans coverting a certain number of benefits, \n Years 2014 - 2016')
_ = plt.xlabel('Number of benefits covered by insurance')
_ = plt.ylabel('Frequencey of insurance plans with a certain coverage')
plt.show()
# Checking a cross-section with a single state (Missour in this case)
benefits.groupby(['StateCode','IssuerId'])['IssuerId'].count().xs('MO',axis = 0)
z = pd.DataFrame(benefits.groupby(['StateCode','IssuerId'])['IssuerId'].count())
# z.to_csv('../../data/groupbystateissuerid.csv')
benefits.groupby(['StateCode','IssuerId'])['IssuerId'].count().to_frame(name = 'z').reset_index(inplace = True)
z.reset_index(level=0, inplace=True)
z.head()
z_1 = pd.DataFrame(benefits.groupby(['StateCode'])['IssuerId'].count())
z_1.reset_index(inplace=True)
z_1.head()
type(z_1)
z_concat = z.merge(z_1, how = 'outer', left_on='StateCode', right_on='StateCode')
z_concat['Share'] = z_concat.IssuerId_x/z_concat.IssuerId_y
z_concat.head()
a = z_concat[z_concat.Share>.10].groupby('StateCode').count()
a.to_csv('../../data/numberofinscons.csv')
a.Share.head()
rates_1.head(3)
meanrates = pd.DataFrame(rates_1.groupby('StateCode')['IndividualRate'].mean())
meanrates.head()
c = pd.concat([a.Share,meanrates], axis = 1)
c.head()
plt.figure(figsize = (12,8))
plt.scatter(c.Share, c.IndividualRate, color = 'green')
_ = plt.title("Number of Major Insurers per State vs Individual Premium ")
plt.show()
# benefits.fillna(0, inplace = True)
Note: </br> Check what share of all observations do take the first 100 benefits.
benefits.shape
benefits.BenefitName.value_counts(sort=True)[:100].sum()
benefits.BenefitName.value_counts().plot(figsize = (16,10),color = 'green')
plt.xticks(rotation=45, fontsize = 14)
plt.title("Frequency of Medical Procedures covered by Insurance")
plt.show()
rates = pd.read_csv('../../data/Rate.csv')
rates.shape
rates.head()
rates.groupby(rates.StateCode).IndividualRate.mean().head(4)
rates.groupby([rates.StateCode, rates.BusinessYear]).IndividualRate.mean()
# export_.to_csv('../data/groupbystateindrates.csv')
rates_1 = rates[rates.IndividualRate < 9000]
rates_1.IndividualRate.value_counts().head(5)
rates_1 = rates[rates.IndividualRate < 9000]
rates_1.IndividualRate.value_counts().head(5)
rates_1.columns
rates_1[rates_1.CoupleAndTwoDependents.isnull() == False].head()
export_2 = rates_1.groupby([rates_1.StateCode, rates_1.BusinessYear]).IndividualRate.mean()
export_2.to_csv('../../data/export_2.csv')
rates_1.IndividualTobaccoRate.unique()
rates_1.IndividualRate.count()
rates_1.IndividualTobaccoRate.count()
rates_1.shape
Additional information can be found here: https://www.healthmarkets.com/content/smoking-and-health-insurance
d.head()
d = pd.DataFrame(rates_1[rates_1.IndividualTobaccoRate.isnull() != True][[
'StateCode','IndividualRate',
'IndividualTobaccoRate']].groupby('StateCode').sum())
d['SurCharge'] = d.IndividualTobaccoRate/d.IndividualRate
d['SurCharge'].sort_values().plot(figsize = (16,12),kind = 'bar', color = 'green')
plt.title('Average surcharge on individual medical insurance premiums for tobacco smokers, \n Years 2014 - 2016')
plt.xlabel('State', fontsize = 14)
plt.ylabel('Surcharge coefficient Smoker/Non-Smoker, x times')
plt.show()
pop_df = pd.read_csv('../../data/US_Population.csv')
pop_df.head()
HICovs_2013 = pd.read_csv('../../data/HICoverage_2013.csv')
HICovs_2014 = pd.read_csv('../../data/HICoverage_2014.csv')
HICovs_2015 = pd.read_csv('../../data/HICoverage_2015.csv')
HICovs_2016 = pd.read_csv('../../data/HICoverage_2016.csv')
HICovs_2013.head(3)
HICovs_2014.head(3)
HICovs_2015.head(3)
HICovs_2016.head(3)
HICovs = pd.concat([HICovs_2013, HICovs_2014, HICovs_2015, HICovs_2016], axis=1, join='inner')
HICovs.drop('Location', axis = 1, inplace=True)
HICovs['State'] = HICovs_2014.Location
HICovs.dropna(inplace = True)
print HICovs.shape
HICovs.head(3)
HICovs['UnInsured_Pct_2013'] = HICovs.Uninsured/HICovs.Total
HICovs['UnInsured_Pct_2014'] = HICovs.Uninsured_2014/HICovs.Total_2014
HICovs['UnInsured_Pct_2015'] = HICovs.Uninsured_2014/HICovs.Total_2015
HICovs['UnInsured_Pct_2016'] = HICovs.Uninsured_2014/HICovs.Total_2016
HICovs.head(3)
HICovs[['State','UnInsured_Pct_2013']].sort_values('UnInsured_Pct_2013',ascending=False).head(5)
HICovs[['State','UnInsured_Pct_2014']].sort_values('UnInsured_Pct_2014',ascending=False).head(5)
HICovs[['State','UnInsured_Pct_2015']].sort_values('UnInsured_Pct_2015a',ascending=False).head(5)
HICovs[['State','UnInsured_Pct_2016']].sort_values('UnInsured_Pct_2016',ascending=False).head(5)
print HICovs.Uninsured.sum()
print HICovs.Total.sum()
HICovs.Uninsured.sum()/HICovs.Total.sum()
print 'Percent of uninsured people in the US '
print '2013 ', round(HICovs.Uninsured.sum()/float(HICovs.Total.sum()),3)*100,'%'
print '2014 ', round(HICovs.Uninsured_2014.sum()/HICovs.Total_2014.sum(),3)*100,'%'
print '2015 ', round(HICovs.Uninsured_2015.sum()/HICovs.Total_2015.sum(),3)*100,'%'
print '2016 ', round(HICovs.Uninsured_2016.sum()/HICovs.Total_2016.sum(),3)*100,'%'
print '\n Overall number of uninsured 2013 vs 2016'
print HICovs.Uninsured.sum(), '- in 2013 vs', int(HICovs.Uninsured_2016.sum()), '- in 2016'
HICovs['Change_2010_16'] = 2*(HICovs.Uninsured_2016-HICovs.Uninsured)/(HICovs.Total_2016+HICovs.Total)
print '\n The top 7 states where the number of uninsured \n people has significantly decreased between 2010 - 2016'
HICovs[['State', 'Uninsured', 'Uninsured_2016','Change_2010_16']].sort_values('Change_2010_16', ascending = True).head(7)
print '\n The states where the number of uninsured people \n has increased between 2010 - 2016'
HICovs[['State', 'Uninsured', 'Uninsured_2016','Change_2010_16']].sort_values('Change_2010_16', ascending = False).head(5)
medcosts_2012 = pd.read_csv('../../data/Medical_costs_2012.csv', dtype = {'Hospital Care':'int',
'Physician and Clinical Services':'int',
'Prescription Drugs and Other Medical Nondurables':'int'})
medcosts_2012['Year'] = 2012
medcosts_2013 = pd.read_csv('../../data/Medical_costs_2013.csv', dtype = {'Hospital Care':'int',
'Physician and Clinical Services':'int',
'Prescription Drugs and Other Medical Nondurables':'int'})
medcosts_2013['Year'] = 2013
medcosts_2014 = pd.read_csv('../../data/Medical_costs_2014.csv', dtype = {'Hospital Care':'int',
'Physician and Clinical Services':'int',
'Prescription Drugs and Other Medical Nondurables':'int'})
medcosts_2014['Year'] = 2014
medcosts = pd.concat([medcosts_2012, medcosts_2013, medcosts_2014], ignore_index=True)
medcosts.columns = medcosts.columns.str.strip()
medcosts
eda(medcosts)
medcosts['Total'] = medcosts['Hospital Care'] + medcosts['Physician and Clinical Services'] + medcosts['Other Professional Services'] + medcosts['Prescription Drugs and Other Medical Nondurables'] + medcosts['Nursing Home Care'] + medcosts['Dental Services'] + medcosts['Home Health Care'] + medcosts['Medical Durables'] + medcosts['Other Health, Residential, and Personal Care']
medcosts[medcosts.Year == 2012]['Total'].head(3)
print '\n Top 5 most expensive states for medical costs per capita 2012'
medcosts[medcosts.Year == 2012].sort_values('Total', ascending = False).head(5)
print '\n Top 5 most expensive states for medical costs per capita 2014'
medcosts[medcosts.Year == 2014].sort_values('Total', ascending = False).head(5)
US_medcosts = medcosts[medcosts['Location'].str.contains('United')]
medcosts = medcosts.drop(medcosts['Location'].str.contains('United'))
medcosts.head(5)
medcosts.shape
US_medcosts
for i in US_medcosts.columns[1:-2]:
US_medcosts[i+'_pct'] = US_medcosts[i]/US_medcosts.Total
US_medcosts
# for i in range(US_medcosts.shape[0]):
#US_medcosts[0,2:12]#.plot(figsize = (16,8),kind = 'pie', subplots=True)
#plt.show()
US_medcosts.mean()[:9].plot.pie(fontsize = 20,figsize = (12,12), subplots = True, autopct = '%.1f' )
plt.show()
mean_rates = pd.DataFrame(rates_1.groupby('StateCode')['IndividualRate'].mean())
mean_rates.head()
State_Codes = pd.read_csv('../../data/US_codes.csv')
State_Codes.set_index('StateCode',inplace=True)
State_Codes.State.str.strip()
State_Codes.head()
ind_ann_rates = pd.concat([mean_rates, State_Codes], axis = 1, join='inner', ignore_index=True)
ind_ann_rates.columns = ['MeanIndPremium','State']
ind_ann_rates['MeanAnnualPremium'] = ind_ann_rates.MeanIndPremium*12
print ind_ann_rates.shape
ind_ann_rates.head()
ind_ann_rates.set_index('State', inplace = True)
ind_ann_rates.head()
meanmedcosts = pd.DataFrame(medcosts.groupby('Location')['Total'].mean())
meanmedcosts.head()
ind_ann_rates = ind_ann_rates.merge(meanmedcosts, left_index=True, right_index=True, how='inner')
ind_ann_rates.drop('MeanIndPremium', axis = 1, inplace = True)
ind_ann_rates.rename(columns={'Total':'HealthCosts'}, inplace = True)
plt.figure(figsize = (12,8))
plt.scatter(ind_ann_rates.HealthCosts, ind_ann_rates.MeanAnnualPremium, color = 'green')
plt.title('Mean Annual Premiums vs Mean Annual Health Costs Per Capita')
plt.xlabel('Mean Annual Health Costs Per Capita')
plt.ylabel('Mean Annual Premiums')
plt.show()
ind_ann_rates.corr('pearson')
stats = pd.read_csv('../../data/states.csv')
stats = stats[['State','MeanHHIncome_15']]
stats.State = stats.State.str.strip()
stats.set_index('State', inplace = True)
ind_ann_rates_1 = ind_ann_rates.merge(stats, left_index=True, right_index=True, how='left')
ind_ann_rates_1.head()
ind_ann_rates_1.corr('pearson')
plt.figure(figsize = (12,8))
plt.scatter(ind_ann_rates_1.MeanHHIncome_15, ind_ann_rates_1.MeanAnnualPremium, color = 'green')
plt.title('Mean Annual Premiums vs Mean Annual Household Incomes')
plt.xlabel('Mean Annual Household Income')
plt.ylabel('Mean Annual Premium')
plt.show()