The West Nile Virus prediction competition was announced on kaggle.com (https://www.kaggle.com/c/predict-west-nile-virus) in 2015.
The task was to predict when and where in the city of Chicago an outbreak of West Nile Virus should be expected. The competition organizers provided three sets of data: the trap observations (train/test), weather measurements from two stations, and records of pesticide spraying.
The team built a dataset with over 1,500 features and achieved a Kaggle score of about 0.76. The main contributors to the project are:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing
import datetime as dt
from sklearn import model_selection, metrics
from matplotlib import pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import pickle
%matplotlib inline
# Load dataset
train = pd.read_csv('asset/train.csv')
test = pd.read_csv('asset/test.csv')
sample = pd.read_csv('asset/sampleSubmission.csv')
weather = pd.read_csv('asset/weather.csv')
spray = pd.read_csv('asset/spray.csv')
# Get labels
labels = train.WnvPresent.values
# Create one dummy column per weather code found in CodeSum
weather.CodeSum = weather.CodeSum.apply(str.split)
for i, codes in enumerate(weather.CodeSum):
    for item in codes:
        if item not in weather.columns:
            weather[item] = 0
        weather.at[i, item] = 1
# Not using codesum anymore
weather = weather.drop('CodeSum', axis=1)
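As an aside, the same encoding can be produced without the explicit loop. A minimal sketch using pandas' built-in `str.get_dummies`, assuming it runs before CodeSum is split and dropped above:

```python
# Alternative sketch: vectorized dummy encoding of the CodeSum codes.
# Assumes CodeSum is still the raw space-separated string column.
codesum_dummies = weather['CodeSum'].str.get_dummies(sep=' ')
weather = weather.join(codesum_dummies)
```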
# Split stations 1 and 2 and join horizontally; after the merge,
# station 1 columns carry the '_x' suffix and station 2 columns '_y'
weather_stn1 = weather[weather['Station']==1]
weather_stn2 = weather[weather['Station']==2]
weather_stn1 = weather_stn1.drop('Station', axis=1)
weather_stn2 = weather_stn2.drop('Station', axis=1)
weather = weather_stn1.merge(weather_stn2, on='Date')
train.Date = pd.to_datetime(train.Date)
test.Date = pd.to_datetime(test.Date)
weather.Date = pd.to_datetime(weather.Date)
spray.Date = pd.to_datetime(spray.Date)
# Replace missing values ('M', '-') and trace amounts ('T') with -1
weather = weather.replace('M', -1)
weather = weather.replace('-', -1)
weather = weather.replace('T', -1)
weather = weather.replace(' T', -1)
weather = weather.replace('  T', -1)
# Dew point over average temperature as a crude humidity proxy
weather['humidity'] = weather.DewPoint_x / weather.Tavg_x.astype(float)
#weather['yoni_interaction_term_y'] = (weather.DewPoint_y)/(weather.Tavg_y.apply(float))
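The ratio above is only a rough proxy. For reference (this is an addition, not part of the original pipeline), relative humidity can be approximated from temperature and dew point with the Magnus formula:

```python
# Sketch: approximate relative humidity (%) from dry-bulb temperature and
# dew point, both in Fahrenheit as in the raw weather data. The Magnus
# constants (17.625, 243.04) are standard approximations.
def relative_humidity(temp_f, dewpoint_f):
    t_c = (np.asarray(temp_f, dtype=float) - 32) * 5 / 9
    d_c = (np.asarray(dewpoint_f, dtype=float) - 32) * 5 / 9
    gamma_t = 17.625 * t_c / (243.04 + t_c)
    gamma_d = 17.625 * d_c / (243.04 + d_c)
    return 100 * np.exp(gamma_d - gamma_t)
```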
# First differences (day-over-day derivatives) of some weather columns
#weather['tmax_diff'] = weather.Tmax_x.diff()
weather['dewpoint_diff'] = weather.DewPoint_x.diff()
#weather['tmin_diff'] = weather.Tmin_x.diff()
#weather['wetbulb_diff'] = weather.WetBulb_x.apply(float).diff()
weather['precip_diff'] = weather.PrecipTotal_x.apply(float).diff()
weather['tavg_diff'] = weather.Tavg_x.apply(float).diff()
# Extract year, month, and day from the Date column
train['year'] = train['Date'].dt.year
train['month'] = train['Date'].dt.month
train['day'] = train['Date'].dt.day
test['year'] = test['Date'].dt.year
test['month'] = test['Date'].dt.month
test['day'] = test['Date'].dt.day
# Drop address columns; also drop the label and mosquito count from train
train = train.drop(['Address', 'AddressNumberAndStreet', 'WnvPresent', 'NumMosquitos'], axis=1)
test = test.drop(['Id', 'Address', 'AddressNumberAndStreet'], axis=1)
# Merge the weather dataframe onto df once per prior day, for the given
# number of days, so each row carries the weather of the preceding days
def merge_weather_previous_days(df, days, df_weather=weather):
    for i in range(1, days):
        date_col = 'Date_' + str(i)
        df[date_col] = df.Date - pd.Timedelta(days=i)
        df = df.merge(df_weather, left_on=date_col, right_on='Date',
                      suffixes=("", "_day_" + str(i)))
    return df
# Same idea, but stepping back by whole weeks
def merge_weather_previous_weeks(df, weeks, df_weather=weather):
    for i in range(1, weeks):
        date_col = 'Date_wk_' + str(i)
        df[date_col] = df.Date - pd.Timedelta(weeks=i)
        df = df.merge(df_weather, left_on=date_col, right_on='Date',
                      suffixes=("", "_week_" + str(i)))
    return df
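To make the suffix behaviour of these merges concrete, here is a tiny example with hypothetical data; only the overlapping Date column is renamed, while non-overlapping weather columns keep their names:

```python
# Hypothetical one-row illustration of the merge above.
left = pd.DataFrame({'Date': pd.to_datetime(['2013-07-01']), 'Trap': ['T001']})
wx = pd.DataFrame({'Date': pd.to_datetime(['2013-06-30', '2013-07-01']),
                   'Tavg_x': [75, 78]})
left['Date_1'] = left.Date - pd.Timedelta(days=1)
merged = left.merge(wx, left_on='Date_1', right_on='Date',
                    suffixes=("", "_day_1"))
print(merged.columns.tolist())
# ['Date', 'Trap', 'Date_1', 'Date_day_1', 'Tavg_x']
```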
# Append weekly-mean weather values to the dataframe: for each row, look up
# the weekly weather average for the week preceding the row's date
def agg_by_week(df, df_weather=weather):
    # Group the weather data by week and take the mean of each column
    agg_weather = df_weather.set_index('Date').groupby(pd.Grouper(freq='W')).mean()
    for i in range(0, len(df)):
        df_date = df.at[i, 'Date']
        for c in agg_weather.columns:
            col_name = c + "_agg"
            if c.startswith('codesum'):
                continue
            if col_name not in df.columns:
                df[col_name] = -1
            value = agg_weather[c][(agg_weather.index < df_date) &
                                   (agg_weather.index > df_date - pd.Timedelta(weeks=1))].iloc[0]
            if np.isnan(value):
                value = -1
            df.at[i, col_name] = value
# Putting historical weather data onto the data frame
train = merge_weather_previous_days(train, 4, weather)
test = merge_weather_previous_days(test, 4, weather)
train = merge_weather_previous_weeks(train, 3, weather)
test = merge_weather_previous_weeks(test, 3, weather)
# Create a 0/1 column in data_df for each unique spray date: the column is 1
# for rows whose trap lies near a sprayed location within `time_period` weeks
# after that spray
def create_sprayed_cols(data_df, spray_df, time_period=2):
    # Iterate over the unique dates on which sprays took place
    # (sprays took place over 10 days as trucks drove around Chicago)
    for date in set(spray_df.Date):
        # Only the spray records for this particular date
        spray_temp = spray_df[spray_df.Date == date].reset_index(drop=True)
        # One column per unique spray date, initialized to 0;
        # rows are set to 1 when a trap was sprayed
        col_name = 'spray_' + date.strftime('%Y-%m-%d') + "_" + str(time_period)
        data_df[col_name] = 0
        # Check each row: is the trap in the area of a spray, and did the
        # spray happen within the preceding time_period weeks?
        for r in range(0, len(data_df)):
            row_date = data_df.at[r, 'Date']
            if date < row_date < date + pd.Timedelta(weeks=time_period):
                # Truncate lat/long to two decimal places (multiply by 100,
                # cast to int), effectively binning pin points into squares
                cur_lat = int(data_df.at[r, 'Latitude'] * 100)
                cur_long = int(data_df.at[r, 'Longitude'] * 100)
                # Compare against every spray point for this date
                for i in range(0, len(spray_temp)):
                    spray_lat = int(spray_temp.at[i, 'Latitude'] * 100)
                    spray_long = int(spray_temp.at[i, 'Longitude'] * 100)
                    # Flag the trap if it falls inside the square around the
                    # spray point, +/- a threshold of 15 (about 0.15 degrees)
                    if (spray_lat - 15 < cur_lat < spray_lat + 15) and \
                       (spray_long - 15 < cur_long < spray_long + 15):
                        data_df.at[r, col_name] = 1
                        break
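The truncated-coordinate squares above are a fast approximation (0.01 degrees of latitude is roughly 1.1 km). If a true distance check were wanted, a standard haversine computation could replace the square test. This is a sketch, not part of the original pipeline, and the ~16 km threshold is a hypothetical value chosen to roughly match the 0.15-degree squares:

```python
# Sketch: great-circle distance in km between two points (haversine formula).
# 6371 km is the usual mean-Earth-radius approximation.
def haversine_km(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371 * np.arcsin(np.sqrt(a))

# Hypothetical usage inside the loop above (dividing by 100 undoes the truncation):
# if haversine_km(cur_lat / 100, cur_long / 100,
#                 spray_lat / 100, spray_long / 100) < 16:
#     data_df.at[r, col_name] = 1
```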
# Adding spray data
create_sprayed_cols(train, spray, time_period=2)
create_sprayed_cols(test, spray, time_period=2)
# Alternative (not used): convert categorical data to numbers with a label encoder
# lbl = preprocessing.LabelEncoder()
# lbl.fit(list(train['Species'].values) + list(test['Species'].values))
# train['Species'] = lbl.transform(train['Species'].values)
# test['Species'] = lbl.transform(test['Species'].values)
# lbl.fit(list(train['Street'].values) + list(test['Street'].values))
# train['Street'] = lbl.transform(train['Street'].values)
# test['Street'] = lbl.transform(test['Street'].values)
# lbl.fit(list(train['Trap'].values) + list(test['Trap'].values))
# train['Trap'] = lbl.transform(train['Trap'].values)
# test['Trap'] = lbl.transform(test['Trap'].values)
# Convert categorical columns to dummy variables instead of using the label encoder
train = pd.get_dummies(train, prefix = ['Trap', 'Species', 'Block', 'Street'],\
columns=['Trap','Species','Block','Street'])
test = pd.get_dummies(test, prefix = ['Trap', 'Species','Block','Street'],\
columns=['Trap','Species', 'Block', 'Street'])
# Drop columns whose every value is -1
train = train.loc[:,(train != -1).any(axis=0)]
test = test.loc[:,(test != -1).any(axis=0)]
# Make everything numerical if it is a string
# (convert_objects is deprecated; coerce object columns explicitly)
for col in train.columns[train.dtypes == object]:
    train[col] = pd.to_numeric(train[col], errors='coerce')
for col in test.columns[test.dtypes == object]:
    test[col] = pd.to_numeric(test[col], errors='coerce')
# Convert locations to polar coordinates centered on an epicenter.
# This may help location become a better predictor.
epicenter_lat = 41.903002
epicenter_long = -87.688267
train['rho'] = np.sqrt((train['Latitude'] - epicenter_lat)**2 + (train['Longitude'] - epicenter_long)**2)
train['phi'] = np.arctan2((train['Latitude']- epicenter_lat),train['Longitude']- epicenter_long)
test['rho'] = np.sqrt((test['Latitude'] - epicenter_lat)**2 + (test['Longitude'] - epicenter_long)**2)
test['phi'] = np.arctan2((test['Latitude']- epicenter_lat),test['Longitude']- epicenter_long)
train['phi_x_rho'] = train.rho * train.phi
test['phi_x_rho'] = test.rho * test.phi
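As a quick sanity check (with a hypothetical point, not one from the dataset): a location due east of the epicenter should come out with phi of 0 and rho equal to the longitude offset.

```python
# Hypothetical point 0.1 degrees east of the epicenter
lat, lon = epicenter_lat, epicenter_long + 0.1
print(np.sqrt((lat - epicenter_lat)**2 + (lon - epicenter_long)**2))  # ~0.1
print(np.arctan2(lat - epicenter_lat, lon - epicenter_long))          # 0.0
```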
# Drop all the helper Date columns
train = train.drop([c for c in train.columns if c.startswith('Date')], axis=1)
test = test.drop([c for c in test.columns if c.startswith('Date')], axis=1)
train = train.drop(['Latitude', 'Longitude'], axis=1)
test = test.drop(['Latitude', 'Longitude'], axis=1)
# Keep only the columns common to both train and test
train = train.drop(list(set(train.columns) - set(test.columns)), axis=1)
test = test.drop(list(set(test.columns) - set(train.columns)), axis=1)
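An equivalent, more idiomatic way to keep only the shared columns (assuming that is the sole goal of the two drops above) is `DataFrame.align`:

```python
# Keep only columns present in both frames; rows are untouched (axis=1)
train, test = train.align(test, join='inner', axis=1)
```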
train.to_csv('train_w_weather.csv', index=False)
test.to_csv('test_w_weather.csv', index=False)
# train = pd.read_csv('train_w_weather.csv')
# test = pd.read_csv('test_w_weather.csv')
# labels = train.WnvPresent.values
features = train.columns
#features = best_features
xgb1 = xgb.XGBClassifier()
parameters = {'nthread': [6],  # with hyperthreading, xgboost may become slower
              'objective': ['binary:logistic'],
              'learning_rate': [0.05],  # the so-called `eta` value
              'max_depth': [2],
              'min_child_weight': [1],
              'silent': [1],
              'subsample': [.91],
              'colsample_bytree': [.58],
              'n_estimators': [1500],  # number of trees
              #'missing': [-999],
              'seed': [1337]}
clf = model_selection.GridSearchCV(xgb1, parameters, n_jobs=6,
                                   cv=model_selection.StratifiedKFold(n_splits=3, shuffle=True),
                                   scoring='roc_auc',
                                   verbose=0, refit=True)
clf.fit(train[features], labels)
#model = clf.best_estimator_
# Trust your CV!
best_parameters, score = clf.best_params_, clf.best_score_
print('Raw AUC score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
# Predict probabilities for the test set and write the submission file
predictions = clf.predict_proba(test[features])[:, 1]
sample['WnvPresent'] = predictions
sample.to_csv('beat_the_benchmark.csv', index=False)
model = clf.best_estimator_
# Cross-validated AUC for the best estimator
np.mean(model_selection.cross_val_score(model, X=train, y=labels, scoring='roc_auc', n_jobs=5))
# Pair feature names with their importances and sort in descending order
importances = list(zip(train.columns, model.feature_importances_))
#importances = list(zip(best_features, model.feature_importances_))
importances.sort(key=lambda a: a[1], reverse=True)
# Print the feature ranking
print("Feature ranking:")
for name, importance in importances:
    print("{}: {}".format(name, importance))
# Plot the feature importances of the model
sns.barplot(x=train.columns, y=model.feature_importances_)
best_features = [x[0] for x in importances if x[1] > 0]
# Sum the importances of the dummy columns sharing each categorical prefix,
# so the grouped categorical features can be compared with the rest
temp = []
for prefix in ['Trap', 'Species', 'Block', 'Street']:
    importance = 0
    for i in importances:
        if i[0].startswith(prefix):
            importance += i[1]
    temp.append((prefix, importance))
temp
importances = importances + temp
importances.sort(key=lambda a: a[1], reverse=True)
names, values = zip(*importances)
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
bars = sns.barplot(x=list(names)[0:50], y=list(values)[0:50])
bars.set_ylabel('Feature importance')
_ = plt.setp(bars.get_xticklabels(), rotation=90)
# Collapse everything past the top 40 features into a single 'Remainder' bucket
t = list(values)[0:40]
t.append(sum(list(values)[40:]))
x = list(names)[0:40] + ['Remainder']
list(zip(x, t))