In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import re

Dataset Benefits

In [227]:
benefits = pd.read_csv("../../data/BenefitsCostSharing.csv",dtype={'BenefitName':'str',
                                                                  'CoinsInnTier1':'str',
                                                                   'CopayInnTier1':'str',
                                                                  'ImportDate':'str'})
In [228]:
def eda(dataframe):
    missing_value =  pd.DataFrame(dataframe.isnull().sum(), columns = ['IsNull']) 
    typevals = pd.DataFrame(dataframe.dtypes, columns = ['DType'])
    print "dataframe index \n", dataframe.index

    print "dataframe shape \n", dataframe.shape
    print "dataframe describe \n", dataframe.describe()
    nunique = []
    for i in dataframe:
        nunique.append([i,dataframe[i].nunique()])
    nunique = pd.DataFrame(nunique, columns = ['index','NUnique'])
    nunique.set_index('index', inplace = True)
    return pd.concat([missing_value, typevals, nunique], axis=1)
eda(benefits)
dataframe index 
RangeIndex(start=0, stop=5048408, step=1)
dataframe shape 
(5048408, 32)
dataframe describe 
       BusinessYear      IssuerId     IssuerId2       LimitQty   MinimumStay  \
count  5.048408e+06  5.048408e+06  5.048408e+06  687683.000000  16727.000000   
mean   2.015127e+03  5.220360e+04  5.220360e+04     142.399374     47.937048   
std    7.563664e-01  2.592498e+04  2.592498e+04    1723.372817      5.892297   
min    2.014000e+03  1.004600e+04  1.004600e+04       1.000000      5.000000   
25%    2.015000e+03  3.253600e+04  3.253600e+04       1.000000     48.000000   
50%    2.015000e+03  4.839600e+04  4.839600e+04      20.000000     48.000000   
75%    2.016000e+03  7.498000e+04  7.498000e+04      40.000000     48.000000   
max    2.016000e+03  9.996900e+04  9.996900e+04   75000.000000     90.000000   

          RowNumber    VersionNum  
count  5.048408e+06  5.048408e+06  
mean   9.839018e+01  7.637094e+00  
std    2.177890e+01  3.803627e+00  
min    6.100000e+01  1.000000e+00  
25%    8.000000e+01  5.000000e+00  
50%    9.900000e+01  7.000000e+00  
75%    1.170000e+02  9.000000e+00  
max    1.560000e+02  2.400000e+01  
Out[228]:
IsNull DType NUnique
index
BenefitName 0 object 861
BusinessYear 0 int64 3
CoinsInnTier1 1113847 object 115
CoinsInnTier2 4571587 object 37
CoinsOutofNet 1113847 object 60
CopayInnTier1 1113847 object 516
CopayInnTier2 4571587 object 278
CopayOutofNet 1113849 object 232
EHBVarReason 3020737 object 12
Exclusions 4572247 object 1720
Explanation 4075700 object 6227
ImportDate 0 object 266
IsCovered 215980 object 3
IsEHB 1817362 object 1
IsExclFromInnMOOP 983986 object 5
IsExclFromOonMOOP 982626 object 4
IsStateMandate 4250463 object 1
IsSubjToDedTier1 2465877 object 5
IsSubjToDedTier2 2466054 object 4
IssuerId 0 int64 910
IssuerId2 0 int64 910
LimitQty 4360725 float64 94
LimitUnit 4360539 object 88
MinimumStay 5031681 float64 3
PlanId 0 object 52394
QuantLimitOnSvc 3264532 object 5
RowNumber 0 int64 96
SourceName 0 object 3
StandardComponentId 0 object 16807
StateCode 0 object 39
StateCode2 0 object 39
VersionNum 0 int64 23
In [49]:
busrules = pd.read_csv("../../data/BusinessRules.csv")
In [52]:
eda(busrules)
dataframe index 
RangeIndex(start=0, stop=21085, step=1)
dataframe shape 
(21085, 23)
dataframe describe 
       BusinessYear      IssuerId    VersionNum     IssuerId2     RowNumber
count  21085.000000  21085.000000  21085.000000  21085.000000  21085.000000
mean    2015.321745  52857.554517      6.791036  52857.554517     11.050036
std        0.646315  26097.535028      4.092377  26097.535028      2.863195
min     2014.000000  10046.000000      1.000000  10046.000000     10.000000
25%     2015.000000  31609.000000      4.000000  31609.000000     10.000000
50%     2015.000000  49046.000000      6.000000  49046.000000     10.000000
75%     2016.000000  76526.000000      8.000000  76526.000000     11.000000
max     2016.000000  99969.000000     24.000000  99969.000000     49.000000
Out[52]:
IsNull DType NUnique
index
BusinessYear 0 int64 3
StateCode 0 object 39
IssuerId 0 int64 910
SourceName 0 object 3
VersionNum 0 int64 23
ImportDate 0 object 268
IssuerId2 0 int64 910
TIN 0 object 343
ProductId 1683 object 3034
StandardComponentId 1012 object 14189
EnrolleeContractRateDeterminationRule 247 object 2
TwoParentFamilyMaxDependentsRule 539 object 3
SingleParentFamilyMaxDependentsRule 539 object 3
DependentMaximumAgRule 183 object 12
ChildrenOnlyContractMaxChildrenRule 669 object 2
DomesticPartnerAsSpouseIndicator 510 object 2
SameSexPartnerAsSpouseIndicator 510 object 2
AgeDeterminationRule 411 object 3
MinimumTobaccoFreeMonthsRule 404 object 7
CohabitationRule 355 object 378
RowNumber 0 int64 38
MarketCoverage 1208 object 2
DentalOnlyPlan 1208 object 2
In [53]:
planatts = pd.read_csv("../../data/PlanAttributes.csv")
/Users/baurjansafi/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (0,7,8,9,10,11,12,13,14,15,18,19,20,22,23,24,25,26,27,28,29,31,32,33,34,39,40,41,42,66,67,68,70,71,72,75,76,77,79,80,81,82,84,85,86,88,89,90,92,93,94,96,97,98,136,137,138,140,141,142,145,146,147,150,151,152,154,155,156,158,159,160,162,163,164,166,167,168) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
In [54]:
eda(planatts)
dataframe index 
RangeIndex(start=0, stop=77353, step=1)
dataframe shape 
(77353, 176)
dataframe describe 
       BeginPrimaryCareCostSharingAfterNumberOfVisits  \
count                                    77353.000000   
mean                                         0.154202   
std                                          0.666628   
min                                          0.000000   
25%                                          0.000000   
50%                                          0.000000   
75%                                          0.000000   
max                                          6.000000   

       BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays  \
count                                       77353.000000          
mean                                            0.298877          
std                                             1.204669          
min                                             0.000000          
25%                                             0.000000          
50%                                             0.000000          
75%                                             0.000000          
max                                            10.000000          

       BenefitPackageId  BusinessYear  DEHBInnTier2FamilyMOOP  \
count       77353.00000  77353.000000                     0.0   
mean            4.90586   2015.111980                     NaN   
std             6.50454      0.763831                     NaN   
min             1.00000   2014.000000                     NaN   
25%             1.00000   2015.000000                     NaN   
50%             3.00000   2015.000000                     NaN   
75%             6.00000   2016.000000                     NaN   
max            84.00000   2016.000000                     NaN   

       DEHBInnTier2FamilyPerGroupMOOP  DEHBInnTier2FamilyPerPersonMOOP  \
count                             0.0                              0.0   
mean                              NaN                              NaN   
std                               NaN                              NaN   
min                               NaN                              NaN   
25%                               NaN                              NaN   
50%                               NaN                              NaN   
75%                               NaN                              NaN   
max                               NaN                              NaN   

       DEHBInnTier2IndividualMOOP  EHBPercentPremiumS4  \
count                         0.0         41877.000000   
mean                          NaN             0.994479   
std                           NaN             0.035219   
min                           NaN             0.000000   
25%                           NaN             0.998277   
50%                           NaN             1.000000   
75%                           NaN             1.000000   
max                           NaN             1.000000   

       EHBPercentTotalPremium          HPID  InpatientCopaymentMaximumDays  \
count            23110.000000  1.185000e+04                   77353.000000   
mean                 0.994788  7.433721e+09                       0.151216   
std                  0.014080  2.677375e+08                       0.782707   
min                  0.883550  7.003819e+09                       0.000000   
25%                  0.997000  7.205839e+09                       0.000000   
50%                  1.000000  7.376547e+09                       0.000000   
75%                  1.000000  7.720082e+09                       0.000000   
max                  1.000000  7.992708e+09                      10.000000   

           IssuerId     IssuerId2     RowNumber    VersionNum  
count  77353.000000  77353.000000  77353.000000  77353.000000  
mean   52708.787429  52708.787429     18.618295      7.267785  
std    25999.008907  25999.008907     24.165791      3.785002  
min    10046.000000  10046.000000      4.000000      1.000000  
25%    32536.000000  32536.000000      5.000000      5.000000  
50%    49046.000000  49046.000000      9.000000      7.000000  
75%    75605.000000  75605.000000     21.000000      9.000000  
max    99969.000000  99969.000000    245.000000     24.000000  
Out[54]:
IsNull DType NUnique
index
AVCalculatorOutputNumber 22825 object 8075
BeginPrimaryCareCostSharingAfterNumberOfVisits 0 int64 7
BeginPrimaryCareDeductibleCoinsuranceAfterNumberOfCopays 0 int64 8
BenefitPackageId 0 int64 66
BusinessYear 0 int64 3
CSRVariationType 0 object 21
ChildOnlyOffering 0 object 4
ChildOnlyPlanId 76916 object 66
CompositeRatingOffered 49972 object 2
DEHBCombInnOonFamilyMOOP 76425 object 6
DEHBCombInnOonFamilyPerGroupMOOP 77130 object 3
DEHBCombInnOonFamilyPerPersonMOOP 77130 object 3
DEHBCombInnOonIndividualMOOP 76202 object 10
DEHBDedCombInnOonFamily 58680 object 15
DEHBDedCombInnOonFamilyPerGroup 67927 object 14
DEHBDedCombInnOonFamilyPerPerson 67927 object 21
DEHBDedCombInnOonIndividual 49254 object 36
DEHBDedInnTier1Coinsurance 49254 object 23
DEHBDedInnTier1Family 58680 object 27
DEHBDedInnTier1FamilyPerGroup 67927 object 29
DEHBDedInnTier1FamilyPerPerson 67927 object 29
DEHBDedInnTier1Individual 49254 object 53
DEHBDedInnTier2Coinsurance 74592 object 13
DEHBDedInnTier2Family 76397 object 19
DEHBDedInnTier2FamilyPerGroup 75548 object 11
DEHBDedInnTier2FamilyPerPerson 75548 object 11
DEHBDedInnTier2Individual 74592 object 29
DEHBDedOutOfNetFamily 58680 object 23
DEHBDedOutOfNetFamilyPerGroup 67927 object 25
DEHBDedOutOfNetFamilyPerPerson 67927 object 25
... ... ... ...
TEHBDedInnTier1FamilyPerGroup 63253 object 162
TEHBDedInnTier1FamilyPerPerson 63253 object 152
TEHBDedInnTier1Individual 39748 object 296
TEHBDedInnTier2Coinsurance 72037 object 18
TEHBDedInnTier2Family 75052 object 82
TEHBDedInnTier2FamilyPerGroup 74338 object 78
TEHBDedInnTier2FamilyPerPerson 74338 object 73
TEHBDedInnTier2Individual 72037 object 151
TEHBDedOutOfNetFamily 53848 object 149
TEHBDedOutOfNetFamilyPerGroup 63253 object 165
TEHBDedOutOfNetFamilyPerPerson 63253 object 154
TEHBDedOutOfNetIndividual 39748 object 290
TEHBInnTier1FamilyMOOP 36103 object 205
TEHBInnTier1FamilyPerGroupMOOP 54050 object 154
TEHBInnTier1FamilyPerPersonMOOP 54050 object 146
TEHBInnTier1IndividualMOOP 12800 object 305
TEHBInnTier2FamilyMOOP 74096 object 87
TEHBInnTier2FamilyPerGroupMOOP 72533 object 79
TEHBInnTier2FamilyPerPersonMOOP 72533 object 74
TEHBInnTier2IndividualMOOP 69276 object 161
TEHBOutOfNetFamilyMOOP 36103 object 166
TEHBOutOfNetFamilyPerGroupMOOP 54050 object 169
TEHBOutOfNetFamilyPerPersonMOOP 54050 object 170
TEHBOutOfNetIndividualMOOP 12800 object 329
TIN 0 object 346
URLForEnrollmentPayment 15853 object 463
URLForSummaryofBenefitsCoverage 7124 object 20321
UniquePlanDesign 11649 object 3
VersionNum 0 int64 23
WellnessProgramOffered 11637 object 7

176 rows × 3 columns

Let's start looking into the datasets

  • Replace all NA data with zeros
  • Try to remove all % signs and see the coverage
  • Check frequency of benefits
In [206]:
benefits.fillna('0', inplace = True)
print benefits.shape
benefits.head()
(5048408, 32)
Out[206]:
BenefitName BusinessYear CoinsInnTier1 CoinsInnTier2 CoinsOutofNet CopayInnTier1 CopayInnTier2 CopayOutofNet EHBVarReason Exclusions ... LimitUnit MinimumStay PlanId QuantLimitOnSvc RowNumber SourceName StandardComponentId StateCode StateCode2 VersionNum
0 Routine Dental Services (Adult) 2014 20% 0 20% No Charge 0 No Charge Above EHB 0 ... Dollars per Year 0 21989AK0010001-00 Yes 68 HIOS 21989AK0010001 AK AK 6
1 Dental Check-Up for Children 2014 20% 0 20% No Charge 0 No Charge Substantially Equal 0 ... Visit(s) per 6 Months 0 21989AK0010001-00 Yes 104 HIOS 21989AK0010001 AK AK 6
2 Basic Dental Care - Child 2014 40% 0 40% No Charge 0 No Charge Substantially Equal 0 ... 0 0 21989AK0010001-00 0 110 HIOS 21989AK0010001 AK AK 6
3 Orthodontia - Child 2014 50% 0 50% No Charge 0 No Charge Additional EHB Benefit 0 ... 0 0 21989AK0010001-00 0 111 HIOS 21989AK0010001 AK AK 6
4 Major Dental Care - Child 2014 50% 0 50% No Charge 0 No Charge Substantially Equal 0 ... 0 0 21989AK0010001-00 0 112 HIOS 21989AK0010001 AK AK 6

5 rows × 32 columns

In [245]:
# def extractor(i):
#     try:
#         return int(re.findall(r'\d+', i))
#     except:
#         return 0
# benefits['CoinsInnTier1'].apply(extractor, converty_type = 'int')
In [246]:
benefits['Deductible'] = benefits.CoinsInnTier1.str.contains('deductible', case = False)
In [247]:
benefits.Deductible.value_counts()
Out[247]:
True     2261169
False    1673392
Name: Deductible, dtype: int64
In [282]:
#The number of records for each year

benefits.groupby('BusinessYear')['BenefitName'].count()
Out[282]:
BusinessYear
2014    1164869
2015    2079286
2016    1804253
Name: BenefitName, dtype: int64
In [392]:
print '- Number of individual benefit plans: ', len(benefits.groupby('PlanId')['BusinessYear'].count())
print '\n- Average number of benefits per plan: ', benefits.shape[0]/benefits.PlanId.nunique()
benefits.groupby('PlanId')['BusinessYear'].count().hist(figsize = (12,8), color = 'green')
plt.title('How many plans coverting a certain number of benefits, \n Years 2014 - 2016')
_ = plt.xlabel('Number of benefits covered by insurance')
_ = plt.ylabel('Frequencey of insurance plans with a certain coverage')
plt.show()
- Number of individual benefit plans:  52394

- Average number of benefits per plan:  96
In [251]:
# Checking a cross-section with a single state (Missour in this case)
benefits.groupby(['StateCode','IssuerId'])['IssuerId'].count().xs('MO',axis = 0)
Out[251]:
IssuerId
16049     6636
28214      200
29416      612
30494      108
30613    10182
32753    22260
32904      216
34762    16650
35853      108
39371      117
40807      324
44240    17356
44527    12493
48616       54
48786      238
53204       90
56603       36
59976      190
68265      144
68396      108
71691       45
74483     6607
81313      144
82905      108
86182       54
86779       36
94322       36
Name: IssuerId, dtype: int64
In [327]:
z = pd.DataFrame(benefits.groupby(['StateCode','IssuerId'])['IssuerId'].count())
# z.to_csv('../../data/groupbystateissuerid.csv')
In [328]:
benefits.groupby(['StateCode','IssuerId'])['IssuerId'].count().to_frame(name = 'z').reset_index(inplace = True)
z.reset_index(level=0, inplace=True)
In [329]:
z.head()
Out[329]:
StateCode IssuerId
IssuerId
21989 AK 207
38344 AK 16546
38536 AK 70
42507 AK 112
45858 AK 108
In [330]:
z_1 = pd.DataFrame(benefits.groupby(['StateCode'])['IssuerId'].count())

z_1.reset_index(inplace=True)
z_1.head()
Out[330]:
StateCode IssuerId
0 AK 41320
1 AL 28417
2 AR 68064
3 AZ 221612
4 DE 31370
In [331]:
type(z_1)
Out[331]:
pandas.core.frame.DataFrame
In [332]:
z_concat = z.merge(z_1, how = 'outer', left_on='StateCode', right_on='StateCode')
In [333]:
z_concat['Share'] = z_concat.IssuerId_x/z_concat.IssuerId_y
z_concat.head()
Out[333]:
StateCode IssuerId_x IssuerId_y Share
0 AK 207 41320 0.005010
1 AK 16546 41320 0.400436
2 AK 70 41320 0.001694
3 AK 112 41320 0.002711
4 AK 108 41320 0.002614
In [592]:
a = z_concat[z_concat.Share>.10].groupby('StateCode').count()
a.to_csv('../../data/numberofinscons.csv')
a.Share.head()
Out[592]:
StateCode
AK    2
AL    3
AR    4
AZ    3
DE    3
Name: Share, dtype: int64
In [593]:
rates_1.head(3)
Out[593]:
BusinessYear StateCode IssuerId SourceName VersionNum ImportDate IssuerId2 FederalTIN RateEffectiveDate RateExpirationDate ... IndividualRate IndividualTobaccoRate Couple PrimarySubscriberAndOneDependent PrimarySubscriberAndTwoDependents PrimarySubscriberAndThreeOrMoreDependents CoupleAndOneDependent CoupleAndTwoDependents CoupleAndThreeOrMoreDependents RowNumber
0 2014 AK 21989 HIOS 6 2014-03-19 07:06:49 21989 93-0438772 2014-01-01 2014-12-31 ... 29.00 NaN NaN NaN NaN NaN NaN NaN NaN 14
1 2014 AK 21989 HIOS 6 2014-03-19 07:06:49 21989 93-0438772 2014-01-01 2014-12-31 ... 36.95 NaN 73.9 107.61 107.61 107.61 144.56 144.56 144.56 14
2 2014 AK 21989 HIOS 6 2014-03-19 07:06:49 21989 93-0438772 2014-01-01 2014-12-31 ... 36.95 NaN 73.9 107.61 107.61 107.61 144.56 144.56 144.56 15

3 rows × 24 columns

In [608]:
meanrates = pd.DataFrame(rates_1.groupby('StateCode')['IndividualRate'].mean())
meanrates.head()
Out[608]:
IndividualRate
StateCode
AK 631.788031
AL 250.278098
AR 212.122069
AZ 352.059498
DE 359.123401
In [609]:
c = pd.concat([a.Share,meanrates], axis = 1)
c.head()
Out[609]:
Share IndividualRate
StateCode
AK 2 631.788031
AL 3 250.278098
AR 4 212.122069
AZ 3 352.059498
DE 3 359.123401
In [610]:
plt.figure(figsize = (12,8))
plt.scatter(c.Share, c.IndividualRate, color = 'green')
_ = plt.title("Number of Major Insurers per State vs Individual Premium ")
plt.show()
In [106]:
# benefits.fillna(0, inplace = True)

Note: </br> Check what share of all observations do take the first 100 benefits.

In [337]:
benefits.shape
Out[337]:
(5048408, 33)
In [338]:
benefits.BenefitName.value_counts(sort=True)[:100].sum()
Out[338]:
4952493L
In [391]:
benefits.BenefitName.value_counts().plot(figsize = (16,10),color = 'green')
plt.xticks(rotation=45, fontsize = 14)
plt.title("Frequency of Medical Procedures covered by Insurance")
plt.show()

The following dataset describes rates of monthly premiums for each signed insurance in the dataset

In [341]:
rates = pd.read_csv('../../data/Rate.csv')
In [342]:
rates.shape
Out[342]:
(12694445, 24)
In [343]:
rates.head()
Out[343]:
BusinessYear StateCode IssuerId SourceName VersionNum ImportDate IssuerId2 FederalTIN RateEffectiveDate RateExpirationDate ... IndividualRate IndividualTobaccoRate Couple PrimarySubscriberAndOneDependent PrimarySubscriberAndTwoDependents PrimarySubscriberAndThreeOrMoreDependents CoupleAndOneDependent CoupleAndTwoDependents CoupleAndThreeOrMoreDependents RowNumber
0 2014 AK 21989 HIOS 6 2014-03-19 07:06:49 21989 93-0438772 2014-01-01 2014-12-31 ... 29.00 NaN NaN NaN NaN NaN NaN NaN NaN 14
1 2014 AK 21989 HIOS 6 2014-03-19 07:06:49 21989 93-0438772 2014-01-01 2014-12-31 ... 36.95 NaN 73.9 107.61 107.61 107.61 144.56 144.56 144.56 14
2 2014 AK 21989 HIOS 6 2014-03-19 07:06:49 21989 93-0438772 2014-01-01 2014-12-31 ... 36.95 NaN 73.9 107.61 107.61 107.61 144.56 144.56 144.56 15
3 2014 AK 21989 HIOS 6 2014-03-19 07:06:49 21989 93-0438772 2014-01-01 2014-12-31 ... 32.00 NaN NaN NaN NaN NaN NaN NaN NaN 15
4 2014 AK 21989 HIOS 6 2014-03-19 07:06:49 21989 93-0438772 2014-01-01 2014-12-31 ... 32.00 NaN NaN NaN NaN NaN NaN NaN NaN 16

5 rows × 24 columns

In [344]:
rates.groupby(rates.StateCode).IndividualRate.mean().head(4)
Out[344]:
StateCode
AK     7024.934674
AL    12131.738408
AR    15069.197977
AZ     3976.503845
Name: IndividualRate, dtype: float64
In [345]:
rates.groupby([rates.StateCode, rates.BusinessYear]).IndividualRate.mean()
Out[345]:
StateCode  BusinessYear
AK         2014            26667.898057
           2015              623.849621
           2016              628.555865
AL         2014            30376.926656
           2015              215.844990
           2016              231.690536
AR         2014            40102.113094
           2015              196.282286
           2016              297.828922
AZ         2014             9103.873065
           2015              393.848115
           2016              353.645713
DE         2014              280.979500
           2015              368.509249
           2016              395.512996
FL         2014              267.417355
           2015              342.451145
           2016              327.351469
GA         2014            25214.679575
           2015              297.344257
           2016              382.078460
HI         2016              256.341476
IA         2014              341.813347
           2015              261.334281
           2016              336.530705
ID         2014            18457.376815
IL         2014              379.076225
           2015              375.995723
           2016              362.115567
IN         2014             9811.632657
                               ...     
PA         2014             7402.982039
           2015              312.545187
           2016              324.729081
SC         2014            34540.400559
           2015              390.375389
           2016              411.545330
SD         2014            39368.231782
           2015              349.074508
           2016              348.996351
TN         2014            44975.596390
           2015              345.751294
           2016              407.263990
TX         2014            17043.163987
           2015              274.799288
           2016              277.089626
UT         2014            20333.377578
           2015              403.987676
           2016              282.916561
VA         2014              399.604184
           2015              282.240625
           2016              258.568196
WI         2014              474.703744
           2015              460.238902
           2016              495.136374
WV         2014              206.104756
           2015              377.256669
           2016              396.895335
WY         2014            73648.582224
           2015              398.847285
           2016              389.786427
Name: IndividualRate, Length: 111, dtype: float64
In [346]:
# export_.to_csv('../data/groupbystateindrates.csv')
In [347]:
rates_1 = rates[rates.IndividualRate < 9000]
rates_1.IndividualRate.value_counts().head(5)
Out[347]:
0.00     682484
99.99     87941
99.00     36455
1.50      24660
18.49     15032
Name: IndividualRate, dtype: int64
In [348]:
rates_1 = rates[rates.IndividualRate < 9000]
rates_1.IndividualRate.value_counts().head(5)
Out[348]:
0.00     682484
99.99     87941
99.00     36455
1.50      24660
18.49     15032
Name: IndividualRate, dtype: int64
In [349]:
rates_1.columns
Out[349]:
Index([u'BusinessYear', u'StateCode', u'IssuerId', u'SourceName',
       u'VersionNum', u'ImportDate', u'IssuerId2', u'FederalTIN',
       u'RateEffectiveDate', u'RateExpirationDate', u'PlanId', u'RatingAreaId',
       u'Tobacco', u'Age', u'IndividualRate', u'IndividualTobaccoRate',
       u'Couple', u'PrimarySubscriberAndOneDependent',
       u'PrimarySubscriberAndTwoDependents',
       u'PrimarySubscriberAndThreeOrMoreDependents', u'CoupleAndOneDependent',
       u'CoupleAndTwoDependents', u'CoupleAndThreeOrMoreDependents',
       u'RowNumber'],
      dtype='object')
In [350]:
rates_1[rates_1.CoupleAndTwoDependents.isnull() == False].head()
Out[350]:
BusinessYear StateCode IssuerId SourceName VersionNum ImportDate IssuerId2 FederalTIN RateEffectiveDate RateExpirationDate ... IndividualRate IndividualTobaccoRate Couple PrimarySubscriberAndOneDependent PrimarySubscriberAndTwoDependents PrimarySubscriberAndThreeOrMoreDependents CoupleAndOneDependent CoupleAndTwoDependents CoupleAndThreeOrMoreDependents RowNumber
1 2014 AK 21989 HIOS 6 2014-03-19 07:06:49 21989 93-0438772 2014-01-01 2014-12-31 ... 36.95 NaN 73.9 107.61 107.61 107.61 144.56 144.56 144.56 14
2 2014 AK 21989 HIOS 6 2014-03-19 07:06:49 21989 93-0438772 2014-01-01 2014-12-31 ... 36.95 NaN 73.9 107.61 107.61 107.61 144.56 144.56 144.56 15
5 2014 AK 21989 HIOS 6 2014-03-19 07:06:49 21989 93-0438772 2014-01-01 2014-12-31 ... 36.95 NaN 73.9 107.61 107.61 107.61 144.56 144.56 144.56 16
6 2014 AK 21989 HIOS 6 2014-03-19 07:06:49 21989 93-0438772 2014-01-01 2014-12-31 ... 32.45 NaN 64.9 94.50 94.50 94.50 126.95 126.95 126.95 17
9 2014 AK 21989 HIOS 6 2014-03-19 07:06:49 21989 93-0438772 2014-01-01 2014-12-31 ... 32.45 NaN 64.9 94.50 94.50 94.50 126.95 126.95 126.95 18

5 rows × 24 columns

The mean Individual Rates for each state across 2014 - 2016

In [353]:
export_2 = rates_1.groupby([rates_1.StateCode, rates_1.BusinessYear]).IndividualRate.mean()
In [354]:
export_2.to_csv('../../data/export_2.csv')
In [355]:
rates_1.IndividualTobaccoRate.unique()
Out[355]:
array([     nan,   111.74,   193.59, ...,  1331.4 ,  1150.62,  1191.32])
In [356]:
rates_1.IndividualRate.count()
Out[356]:
12621995
In [357]:
rates_1.IndividualTobaccoRate.count()
Out[357]:
4932349
In [358]:
rates_1.shape
Out[358]:
(12621995, 24)

Difference for smokers and non-smokers

Additional information can be found here: https://www.healthmarkets.com/content/smoking-and-health-insurance

In [384]:
d.head()
Out[384]:
IndividualRate IndividualTobaccoRate SurCharge
StateCode
AK 4651114.00 4.997045e+06 1.074376
AL 10168914.53 1.134159e+07 1.115319
AR 9572833.60 1.130994e+07 1.181462
AZ 31734321.95 3.611266e+07 1.137969
DE 4947674.23 5.262465e+06 1.063624
In [390]:
d = pd.DataFrame(rates_1[rates_1.IndividualTobaccoRate.isnull() != True][[ 
                                                         'StateCode','IndividualRate', 
                                                         'IndividualTobaccoRate']].groupby('StateCode').sum())

d['SurCharge'] = d.IndividualTobaccoRate/d.IndividualRate
d['SurCharge'].sort_values().plot(figsize = (16,12),kind = 'bar', color = 'green')
plt.title('Average surcharge on individual medical insurance premiums for tobacco smokers, \n Years 2014 - 2016')
plt.xlabel('State', fontsize = 14)
plt.ylabel('Surcharge coefficient Smoker/Non-Smoker, x times')
plt.show()

Adding more data to check possible correlations

In [364]:
pop_df = pd.read_csv('../../data/US_Population.csv')
In [363]:
pop_df.head()
Out[363]:
State Population
0 California 39144818
1 Texas 27469114
2 Florida 20271272
3 New York 19795791
4 Illinois 12859995
In [89]:
HICovs_2013 = pd.read_csv('../../data/HICoverage_2013.csv')
HICovs_2014 = pd.read_csv('../../data/HICoverage_2014.csv')
HICovs_2015 = pd.read_csv('../../data/HICoverage_2015.csv')
HICovs_2016 = pd.read_csv('../../data/HICoverage_2016.csv')
In [90]:
HICovs_2013.head(3)
Out[90]:
Location Employer Non-Group Medicaid Medicare Other Public Uninsured Total
0 Alabama 2126500 174200 869700 783000 85600 724800 4763800
1 Alaska 364900 24000 95000 55200 60600 102200 701900
2 Arizona 2883800 170800 1346100 842000 137400 1223000 6603100
In [67]:
HICovs_2014.head(3)
Out[67]:
Location Employer_2014 Non-Group_2014 Medicaid_2014 Medicare_2014 Other Public_2014 Uninsured_2014 Total_2014
0 Alabama 2202800.0 288900.0 891900.0 718400.0 143900.0 522200.0 4768100.0
1 Alaska 345300.0 26800.0 130100.0 55300.0 37300.0 100800.0 695600.0
2 Arizona 2835200.0 333500.0 1639400.0 911100.0 1765100.0 827100.0 8311400.0
In [68]:
HICovs_2015.head(3)
Out[68]:
Location Employer_2015 Non-Group_2015 Medicaid_2015 Medicare_2015 Other Public_2015 Uninsured_2015 Total_2015
0 Alabama 2218000.0 291500.0 911400.0 719100.0 174500.0 519400.0 4833900.0
1 Alaska 355700.0 22300.0 128100.0 60900.0 47700.0 90500.0 705200.0
2 Arizona 2766500.0 278400.0 1711500.0 949000.0 189300.0 844800.0 6739500.0
In [69]:
HICovs_2016.head(3)
Out[69]:
Location Employer_2016 Non-Group_2016 Medicaid_2016 Medicare_2016 OtherPublic_2016 Uninsured_2016 Total_2016
0 Alabama 2263800.0 262400.0 997000.0 761200.0 128800.0 420800.0 4834000.0
1 Alaska 324400.0 20300.0 145400.0 68200.0 55600.0 96900.0 710800.0
2 Arizona 3010700.0 377000.0 1468400.0 1028000.0 172500.0 833700.0 6890300.0
In [91]:
HICovs = pd.concat([HICovs_2013, HICovs_2014, HICovs_2015, HICovs_2016], axis=1, join='inner')
HICovs.drop('Location', axis = 1, inplace=True)
HICovs['State'] = HICovs_2014.Location
HICovs.dropna(inplace = True)
In [92]:
print HICovs.shape
HICovs.head(3)
(51, 29)
Out[92]:
Employer Non-Group Medicaid Medicare Other Public Uninsured Total Employer_2014 Non-Group_2014 Medicaid_2014 ... Uninsured_2015 Total_2015 Employer_2016 Non-Group_2016 Medicaid_2016 Medicare_2016 OtherPublic_2016 Uninsured_2016 Total_2016 State
0 2126500 174200 869700 783000 85600 724800 4763800 2202800.0 288900.0 891900.0 ... 519400.0 4833900.0 2263800.0 262400.0 997000.0 761200.0 128800.0 420800.0 4834000.0 Alabama
1 364900 24000 95000 55200 60600 102200 701900 345300.0 26800.0 130100.0 ... 90500.0 705200.0 324400.0 20300.0 145400.0 68200.0 55600.0 96900.0 710800.0 Alaska
2 2883800 170800 1346100 842000 137400 1223000 6603100 2835200.0 333500.0 1639400.0 ... 844800.0 6739500.0 3010700.0 377000.0 1468400.0 1028000.0 172500.0 833700.0 6890300.0 Arizona

3 rows × 29 columns

In [93]:
HICovs['UnInsured_Pct_2013'] = HICovs.Uninsured/HICovs.Total
HICovs['UnInsured_Pct_2014'] = HICovs.Uninsured_2014/HICovs.Total_2014
HICovs['UnInsured_Pct_2015'] = HICovs.Uninsured_2014/HICovs.Total_2015
HICovs['UnInsured_Pct_2016'] = HICovs.Uninsured_2014/HICovs.Total_2016
HICovs.head(3)
Out[93]:
Employer Non-Group Medicaid Medicare Other Public Uninsured Total Employer_2014 Non-Group_2014 Medicaid_2014 ... Medicaid_2016 Medicare_2016 OtherPublic_2016 Uninsured_2016 Total_2016 State UnInsured_Pct_2013 UnInsured_Pct_2014 UnInsured_Pct_2015 UnInsured_Pct_2016
0 2126500 174200 869700 783000 85600 724800 4763800 2202800.0 288900.0 891900.0 ... 997000.0 761200.0 128800.0 420800.0 4834000.0 Alabama 0.152147 0.109520 0.108029 0.108026
1 364900 24000 95000 55200 60600 102200 701900 345300.0 26800.0 130100.0 ... 145400.0 68200.0 55600.0 96900.0 710800.0 Alaska 0.145605 0.144911 0.142938 0.141812
2 2883800 170800 1346100 842000 137400 1223000 6603100 2835200.0 333500.0 1639400.0 ... 1468400.0 1028000.0 172500.0 833700.0 6890300.0 Arizona 0.185216 0.099514 0.122724 0.120038

3 rows × 33 columns

In [94]:
HICovs[['State','UnInsured_Pct_2013']].sort_values('UnInsured_Pct_2013',ascending=False).head(5)
Out[94]:
State UnInsured_Pct_2013
43 Texas 0.204889
28 Nevada 0.193235
9 Florida 0.186306
2 Arizona 0.185216
31 New Mexico 0.164416
In [77]:
HICovs[['State','UnInsured_Pct_2014']].sort_values('UnInsured_Pct_2014',ascending=False).head(5)
Out[77]:
State UnInsured_Pct_2014
43 Texas 0.168600
36 Oklahoma 0.155914
10 Georgia 0.155192
9 Florida 0.145757
1 Alaska 0.144911
In [76]:
HICovs[['State','UnInsured_Pct_2015']].sort_values('UnInsured_Pct_2015a',ascending=False).head(5)
Out[76]:
State UnInsured_Pct_2015
43 Texas 0.164009
10 Georgia 0.153048
36 Oklahoma 0.149475
9 Florida 0.143145
1 Alaska 0.142938
In [75]:
HICovs[['State','UnInsured_Pct_2016']].sort_values('UnInsured_Pct_2016',ascending=False).head(5)
Out[75]:
State UnInsured_Pct_2016
43 Texas 0.162571
10 Georgia 0.150428
36 Oklahoma 0.148785
1 Alaska 0.141812
9 Florida 0.139924
In [99]:
print HICovs.Uninsured.sum()
print HICovs.Total.sum()
HICovs.Uninsured.sum()/HICovs.Total.sum()
41795400
313789500
Out[99]:
0L
In [134]:
print 'Percent of uninsured people in the US '
print '2013 ', round(HICovs.Uninsured.sum()/float(HICovs.Total.sum()),3)*100,'%'
print '2014 ', round(HICovs.Uninsured_2014.sum()/HICovs.Total_2014.sum(),3)*100,'%'
print '2015 ', round(HICovs.Uninsured_2015.sum()/HICovs.Total_2015.sum(),3)*100,'%'
print '2016 ', round(HICovs.Uninsured_2016.sum()/HICovs.Total_2016.sum(),3)*100,'%'
print '\n Overall number of uninsured 2013 vs 2016'
print HICovs.Uninsured.sum(), '- in 2013 vs', int(HICovs.Uninsured_2016.sum()), '- in 2016'
Percent of uninsured people in the US 
2013  13.3 %
2014  10.2 %
2015  9.1 %
2016  8.8 %

 Overall number of uninsured 2013 vs 2016
41795400 - in 2013 vs 28051700 - in 2016
In [135]:
HICovs['Change_2010_16'] = 2*(HICovs.Uninsured_2016-HICovs.Uninsured)/(HICovs.Total_2016+HICovs.Total)
print '\n The top 7 states where the number of uninsured \n people has significantly decreased between 2010 - 2016'
HICovs[['State', 'Uninsured', 'Uninsured_2016','Change_2010_16']].sort_values('Change_2010_16', ascending = True).head(7)
 The top 7 states where the number of uninsured 
 people has significantly decreased between 2010 - 2016
Out[135]:
State Uninsured Uninsured_2016 Change_2010_16
28 Nevada 534700 255600.0 -0.097753
26 Montana 158000 72700.0 -0.084114
17 Kentucky 624100 271400.0 -0.080266
3 Arkansas 436800 225500.0 -0.072237
4 California 5594100 3030800.0 -0.066265
37 Oregon 481400 218400.0 -0.065030
40 South Carolina 758000 448200.0 -0.064560
In [137]:
print '\n The states where the number of uninsured people \n has increased between 2010 - 2016'
HICovs[['State', 'Uninsured', 'Uninsured_2016','Change_2010_16']].sort_values('Change_2010_16', ascending = False).head(5)
 The states where the number of uninsured people 
 has increased between 2010 - 2016
Out[137]:
State Uninsured Uninsured_2016 Change_2010_16
21 Massachusetts 208700 379100.0 0.025206
7 Delaware 62700 84100.0 0.023040
44 Utah 359700 373900.0 0.004755
11 Hawaii 68600 75000.0 0.004658
1 Alaska 102200 96900.0 -0.007503
In [166]:
medcosts_2012 = pd.read_csv('../../data/Medical_costs_2012.csv', dtype = {'Hospital Care':'int',
                                                                          'Physician and Clinical Services':'int',
                                                                          'Prescription Drugs and Other Medical Nondurables':'int'})
medcosts_2012['Year'] = 2012
medcosts_2013 = pd.read_csv('../../data/Medical_costs_2013.csv', dtype = {'Hospital Care':'int',
                                                                          'Physician and Clinical Services':'int',
                                                                          'Prescription Drugs and Other Medical Nondurables':'int'})
medcosts_2013['Year'] = 2013
medcosts_2014 = pd.read_csv('../../data/Medical_costs_2014.csv', dtype = {'Hospital Care':'int',
                                                                          'Physician and Clinical Services':'int',
                                                                          'Prescription Drugs and Other Medical Nondurables':'int'})
medcosts_2014['Year'] = 2014
In [167]:
medcosts = pd.concat([medcosts_2012, medcosts_2013, medcosts_2014], ignore_index=True)
medcosts.columns = medcosts.columns.str.strip()
medcosts
Out[167]:
Location Hospital Care Physician and Clinical Services Other Professional Services Prescription Drugs and Other Medical Nondurables Nursing Home Care Dental Services Home Health Care Medical Durables Other Health, Residential, and Personal Care Year
0 United States 2875 1777 243 996 469 346 245 139 443 2012
1 Alabama 2440 1662 157 1272 385 270 199 133 303 2012
2 Alaska 4198 3069 406 640 179 507 222 131 779 2012
3 Arizona 2267 1693 225 759 284 332 141 168 313 2012
4 Arkansas 2530 1485 231 1036 439 299 172 97 515 2012
5 California 2561 1877 209 846 352 372 275 93 434 2012
6 Colorado 2439 1523 286 611 379 408 173 163 323 2012
7 Connecticut 3115 2093 298 1309 847 460 266 133 779 2012
8 Delaware 3680 2210 325 1287 592 409 244 189 496 2012
9 District of Columbia 5113 2181 213 1021 632 419 545 116 984 2012
10 Florida 2559 2105 284 1125 492 313 284 243 230 2012
11 Georgia 2198 1649 201 841 296 305 161 132 225 2012
12 Hawaii 2605 1829 218 1013 336 345 130 163 165 2012
13 Idaho 2748 1256 275 724 353 386 165 142 332 2012
14 Illinois 3034 1786 304 939 499 352 217 120 413 2012
15 Indiana 3375 1729 217 997 560 325 149 155 340 2012
16 Iowa 3064 1475 241 1049 626 341 156 133 563 2012
17 Kansas 2828 1740 254 1058 551 348 157 119 421 2012
18 Kentucky 2942 1636 215 1095 464 260 140 139 399 2012
19 Louisiana 2773 1525 224 1226 390 278 288 96 503 2012
20 Maine 3817 1619 320 996 587 376 191 130 965 2012
21 Maryland 3109 1987 269 1032 624 371 160 121 441 2012
22 Massachusetts 3817 2220 281 1077 727 471 551 163 764 2012
23 Michigan 3107 1652 247 975 463 369 271 253 300 2012
24 Minnesota 3057 1694 249 873 561 395 433 129 785 2012
25 Mississippi 3107 1563 173 1067 441 244 211 154 409 2012
26 Missouri 3369 1513 221 1147 514 292 215 100 386 2012
27 Montana 3390 1620 279 793 406 370 146 160 482 2012
28 Nebraska 3265 1733 311 1108 591 313 93 175 390 2012
29 Nevada 2032 1696 247 886 242 336 238 145 205 2012
... ... ... ... ... ... ... ... ... ... ... ...
126 Massachusetts 3933 2264 301 1250 720 480 667 170 774 2014
127 Michigan 3256 1738 265 1106 478 379 288 273 273 2014
128 Minnesota 3373 1759 270 955 585 404 524 135 867 2014
129 Mississippi 3228 1565 190 1132 450 253 212 165 452 2014
130 Missouri 3492 1542 228 1273 531 295 227 107 413 2014
131 Montana 3722 1658 303 865 422 387 151 165 546 2014
132 Nebraska 3492 1758 296 1242 605 326 97 185 412 2014
133 Nevada 2376 1872 273 966 250 349 233 150 244 2014
134 New Hampshire 3771 2227 263 1222 584 533 228 191 571 2014
135 New Jersey 2915 2369 361 1355 565 413 244 159 478 2014
136 New Mexico 3166 1591 268 748 283 348 250 207 354 2014
137 New York 3633 1873 299 1435 619 361 478 144 936 2014
138 North Carolina 2779 1612 221 1238 453 357 195 142 267 2014
139 North Dakota 4479 1713 260 1175 745 430 72 165 811 2014
140 Ohio 3809 1779 233 1023 605 317 259 144 542 2014
141 Oklahoma 3118 1651 260 1180 408 342 195 115 359 2014
142 Oregon 2940 1879 330 882 467 457 167 129 793 2014
143 Pennsylvania 3416 1967 308 1345 774 316 244 141 747 2014
144 Rhode Island 3617 1855 311 1411 742 335 252 138 888 2014
145 South Carolina 2933 1618 204 1193 383 319 219 154 288 2014
146 South Dakota 4415 1635 256 949 574 416 93 126 468 2014
147 Tennessee 2617 1880 219 1216 413 287 214 136 391 2014
148 Texas 2698 1695 208 1073 305 289 282 115 333 2014
149 Utah 2351 1319 212 784 230 376 188 161 361 2014
150 Vermont 4670 1750 321 1158 563 463 301 129 837 2014
151 Virginia 2912 1877 225 1015 438 383 152 135 420 2014
152 Washington 3090 2064 324 795 462 498 188 146 346 2014
153 West Virginia 4037 1807 238 1377 539 297 272 186 710 2014
154 Wisconsin 3502 2168 264 1103 553 396 230 125 363 2014
155 Wyoming 4001 1771 377 751 385 414 77 129 416 2014

156 rows × 11 columns

In [168]:
eda(medcosts)
dataframe index 
RangeIndex(start=0, stop=156, step=1)
dataframe shape 
(156, 11)
dataframe describe 
       Hospital Care  Physician and Clinical Services  \
count     156.000000                       156.000000   
mean     3201.211538                      1803.307692   
std       629.076928                       306.256262   
min      2032.000000                      1246.000000   
25%      2745.750000                      1626.500000   
50%      3112.000000                      1739.500000   
75%      3512.000000                      1879.250000   
max      5233.000000                      3368.000000   

       Other Professional Services  \
count                   156.000000   
mean                    262.089744   
std                      52.089176   
min                     157.000000   
25%                     221.750000   
50%                     255.500000   
75%                     299.000000   
max                     465.000000   

       Prescription Drugs and Other Medical Nondurables  Nursing Home Care  \
count                                        156.000000         156.000000   
mean                                        1036.429487         487.698718   
std                                          198.617483         142.940792   
min                                          584.000000         179.000000   
25%                                          895.750000         387.000000   
50%                                         1055.000000         474.000000   
75%                                         1169.500000         584.250000   
max                                         1525.000000         854.000000   

       Dental Services  Home Health Care  Medical Durables  \
count       156.000000        156.000000        156.000000   
mean        364.108974        225.250000        145.801282   
std          63.418790        106.682322         32.427073   
min         244.000000         67.000000         93.000000   
25%         318.500000        163.000000        126.000000   
50%         352.500000        210.500000        139.000000   
75%         402.250000        252.250000        163.000000   
max         542.000000        667.000000        273.000000   

       Other Health, Residential, and Personal Care         Year  
count                                    156.000000   156.000000  
mean                                     499.378205  2013.000000  
std                                      213.946069     0.819126  
min                                      162.000000  2012.000000  
25%                                      348.500000  2012.000000  
50%                                      439.000000  2013.000000  
75%                                      620.750000  2014.000000  
max                                     1100.000000  2014.000000  
Out[168]:
IsNull DType NUnique
index
Location 0 object 52
Hospital Care 0 int64 148
Physician and Clinical Services 0 int64 147
Other Professional Services 0 int64 104
Prescription Drugs and Other Medical Nondurables 0 int64 143
Nursing Home Care 0 int64 129
Dental Services 0 int64 109
Home Health Care 0 int64 118
Medical Durables 0 int64 81
Other Health, Residential, and Personal Care 0 int64 138
Year 0 int64 3
In [169]:
medcosts['Total'] = medcosts['Hospital Care'] + medcosts['Physician and Clinical Services'] + medcosts['Other Professional Services'] + medcosts['Prescription Drugs and Other Medical Nondurables'] + medcosts['Nursing Home Care'] + medcosts['Dental Services'] + medcosts['Home Health Care'] + medcosts['Medical Durables'] + medcosts['Other Health, Residential, and Personal Care']
medcosts[medcosts.Year == 2012]['Total'].head(3)
Out[169]:
0     7533
1     6821
2    10131
Name: Total, dtype: int64
In [170]:
print '\n Top 5 most expensive states for medical costs per capita 2012'
medcosts[medcosts.Year == 2012].sort_values('Total', ascending = False).head(5)
 Top 5 most expensive states for medical costs per capita 2012
Out[170]:
Location Hospital Care Physician and Clinical Services Other Professional Services Prescription Drugs and Other Medical Nondurables Nursing Home Care Dental Services Home Health Care Medical Durables Other Health, Residential, and Personal Care Year Total
9 District of Columbia 5113 2181 213 1021 632 419 545 116 984 2012 11224
2 Alaska 4198 3069 406 640 179 507 222 131 779 2012 10131
22 Massachusetts 3817 2220 281 1077 727 471 551 163 764 2012 10071
8 Delaware 3680 2210 325 1287 592 409 244 189 496 2012 9432
46 Vermont 4257 1642 296 1022 530 441 269 119 725 2012 9301
In [171]:
print '\n Top 5 most expensive states for medical costs per capita 2014'
medcosts[medcosts.Year == 2014].sort_values('Total', ascending = False).head(5)
 Top 5 most expensive states for medical costs per capita 2014
Out[171]:
Location Hospital Care Physician and Clinical Services Other Professional Services Prescription Drugs and Other Medical Nondurables Nursing Home Care Dental Services Home Health Care Medical Durables Other Health, Residential, and Personal Care Year Total
113 District of Columbia 5233 2457 223 1171 583 410 653 116 1098 2014 11944
106 Alaska 4715 3368 465 640 204 542 195 139 797 2014 11065
126 Massachusetts 3933 2264 301 1250 720 480 667 170 774 2014 10559
112 Delaware 4078 2259 348 1525 608 409 239 197 591 2014 10254
150 Vermont 4670 1750 321 1158 563 463 301 129 837 2014 10192
In [172]:
US_medcosts = medcosts[medcosts['Location'].str.contains('United')]
medcosts = medcosts.drop(medcosts['Location'].str.contains('United'))
medcosts.head(5)
Out[172]:
Location Hospital Care Physician and Clinical Services Other Professional Services Prescription Drugs and Other Medical Nondurables Nursing Home Care Dental Services Home Health Care Medical Durables Other Health, Residential, and Personal Care Year Total
2 Alaska 4198 3069 406 640 179 507 222 131 779 2012 10131
3 Arizona 2267 1693 225 759 284 332 141 168 313 2012 6182
4 Arkansas 2530 1485 231 1036 439 299 172 97 515 2012 6804
5 California 2561 1877 209 846 352 372 275 93 434 2012 7019
6 Colorado 2439 1523 286 611 379 408 173 163 323 2012 6305
In [173]:
medcosts.shape
Out[173]:
(154, 12)
In [174]:
US_medcosts
Out[174]:
Location Hospital Care Physician and Clinical Services Other Professional Services Prescription Drugs and Other Medical Nondurables Nursing Home Care Dental Services Home Health Care Medical Durables Other Health, Residential, and Personal Care Year Total
0 United States 2875 1777 243 996 469 346 245 139 443 2012 7533
52 United States 2966 1801 249 1014 472 348 253 143 456 2013 7702
104 United States 3079 1874 260 1114 479 354 262 146 475 2014 8043
In [175]:
for i in US_medcosts.columns[1:-2]:
    US_medcosts[i+'_pct'] = US_medcosts[i]/US_medcosts.Total
US_medcosts
/Users/baurjansafi/anaconda/lib/python2.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
Out[175]:
Location Hospital Care Physician and Clinical Services Other Professional Services Prescription Drugs and Other Medical Nondurables Nursing Home Care Dental Services Home Health Care Medical Durables Other Health, Residential, and Personal Care ... Total Hospital Care_pct Physician and Clinical Services_pct Other Professional Services_pct Prescription Drugs and Other Medical Nondurables_pct Nursing Home Care_pct Dental Services_pct Home Health Care_pct Medical Durables_pct Other Health, Residential, and Personal Care_pct
0 United States 2875 1777 243 996 469 346 245 139 443 ... 7533 0.381654 0.235895 0.032258 0.132218 0.062259 0.045931 0.032524 0.018452 0.058808
52 United States 2966 1801 249 1014 472 348 253 143 456 ... 7702 0.385095 0.233835 0.032329 0.131654 0.061283 0.045183 0.032849 0.018567 0.059205
104 United States 3079 1874 260 1114 479 354 262 146 475 ... 8043 0.382817 0.232998 0.032326 0.138506 0.059555 0.044013 0.032575 0.018152 0.059058

3 rows × 21 columns

In [203]:
# for i in range(US_medcosts.shape[0]):
#US_medcosts[0,2:12]#.plot(figsize = (16,8),kind = 'pie', subplots=True)
#plt.show()

US_medcosts.mean()[:9].plot.pie(fontsize = 20,figsize = (12,12), subplots = True, autopct = '%.1f' )
plt.show()
In [473]:
mean_rates = pd.DataFrame(rates_1.groupby('StateCode')['IndividualRate'].mean())
In [522]:
mean_rates.head()
Out[522]:
IndividualRate
StateCode
AK 631.788031
AL 250.278098
AR 212.122069
AZ 352.059498
DE 359.123401
In [534]:
State_Codes = pd.read_csv('../../data/US_codes.csv')
State_Codes.set_index('StateCode',inplace=True)
State_Codes.State.str.strip()
State_Codes.head()
Out[534]:
State
StateCode
AL Alabama
AK Alaska
AZ Arizona
AR Arkansas
CA California
In [630]:
ind_ann_rates = pd.concat([mean_rates, State_Codes], axis = 1, join='inner', ignore_index=True)
ind_ann_rates.columns = ['MeanIndPremium','State']
ind_ann_rates['MeanAnnualPremium'] = ind_ann_rates.MeanIndPremium*12
print ind_ann_rates.shape
ind_ann_rates.head()
(39, 3)
Out[630]:
MeanIndPremium State MeanAnnualPremium
StateCode
AK 631.788031 Alaska 7581.456368
AL 250.278098 Alabama 3003.337176
AR 212.122069 Arkansas 2545.464830
AZ 352.059498 Arizona 4224.713971
DE 359.123401 Delaware 4309.480811
In [631]:
ind_ann_rates.set_index('State', inplace = True)
ind_ann_rates.head()
Out[631]:
MeanIndPremium MeanAnnualPremium
State
Alaska 631.788031 7581.456368
Alabama 250.278098 3003.337176
Arkansas 212.122069 2545.464830
Arizona 352.059498 4224.713971
Delaware 359.123401 4309.480811
In [632]:
meanmedcosts = pd.DataFrame(medcosts.groupby('Location')['Total'].mean())
meanmedcosts.head()
Out[632]:
Total
Location
Alabama 7137.000000
Alaska 10541.666667
Arizona 6298.333333
Arkansas 7047.333333
California 7275.333333
In [633]:
ind_ann_rates = ind_ann_rates.merge(meanmedcosts, left_index=True,  right_index=True, how='inner')
ind_ann_rates.drop('MeanIndPremium', axis = 1, inplace = True)
ind_ann_rates.rename(columns={'Total':'HealthCosts'}, inplace = True)
In [676]:
plt.figure(figsize = (12,8))
plt.scatter(ind_ann_rates.HealthCosts, ind_ann_rates.MeanAnnualPremium, color = 'green')
plt.title('Mean Annual Premiums vs Mean Annual Health Costs Per Capita')
plt.xlabel('Mean Annual Health Costs Per Capita')
plt.ylabel('Mean Annual Premiums')
plt.show()
In [635]:
ind_ann_rates.corr('pearson')
Out[635]:
MeanAnnualPremium HealthCosts
MeanAnnualPremium 1.000000 0.555653
HealthCosts 0.555653 1.000000

Let's mix in other stats

In [667]:
stats = pd.read_csv('../../data/states.csv')
In [668]:
stats = stats[['State','MeanHHIncome_15']]
stats.State = stats.State.str.strip()
stats.set_index('State', inplace = True)
In [671]:
ind_ann_rates_1 = ind_ann_rates.merge(stats, left_index=True, right_index=True, how='left')
ind_ann_rates_1.head()
Out[671]:
MeanAnnualPremium HealthCosts MeanHHIncome_15
Alaska 7581.456368 10541.666667 75112.0
Alabama 3003.337176 7137.000000 44509.0
Arkansas 2545.464830 7047.333333 42798.0
Arizona 4224.713971 6298.333333 52248.0
Delaware 4309.480811 9817.000000 57756.0
In [672]:
ind_ann_rates_1.corr('pearson')
Out[672]:
MeanAnnualPremium HealthCosts MeanHHIncome_15
MeanAnnualPremium 1.000000 0.555653 0.386399
HealthCosts 0.555653 1.000000 0.379315
MeanHHIncome_15 0.386399 0.379315 1.000000
In [677]:
plt.figure(figsize = (12,8))
plt.scatter(ind_ann_rates_1.MeanHHIncome_15, ind_ann_rates_1.MeanAnnualPremium, color = 'green')
plt.title('Mean Annual Premiums vs Mean Annual Household Incomes')
plt.xlabel('Mean Annual Household Income')
plt.ylabel('Mean Annual Premium')
plt.show()