The West Nile Virus prediction competition was announced on kaggle.com (https://www.kaggle.com/c/predict-west-nile-virus) in 2015.
The task was to predict when and where in the city of Chicago an outbreak of West Nile Virus should be expected. The competition organizers provided three sets of data: the trap observations (train/test), weather measurements from two stations, and records of pesticide spraying.
The team built a dataset with over 1,500 features and achieved a Kaggle score of about 0.76. The main contributors to the project are:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing
import datetime as dt
from sklearn import model_selection, metrics
from matplotlib import pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import pickle
%matplotlib inline
# Load dataset
train = pd.read_csv('asset/train.csv')
test = pd.read_csv('asset/test.csv')
sample = pd.read_csv('asset/sampleSubmission.csv')
weather = pd.read_csv('asset/weather.csv')
spray = pd.read_csv('asset/spray.csv')
# Get labels
labels = train.WnvPresent.values
# Create one dummy column per weather code found in CodeSum
weather.CodeSum = weather.CodeSum.apply(str.split)
for i, codes in enumerate(weather.CodeSum):
    for item in codes:
        if item not in weather.columns:
            weather[item] = 0
        weather.at[i, item] = 1
# Not using codesum anymore
weather = weather.drop('CodeSum', axis=1)
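As an aside, the same encoding can be produced without the explicit loop. A minimal sketch using pandas' built-in `str.get_dummies`, assuming it runs before CodeSum is split and dropped above:

```python
# Alternative sketch: vectorized dummy encoding of the CodeSum codes.
# Assumes CodeSum is still the raw space-separated string column.
codesum_dummies = weather['CodeSum'].str.get_dummies(sep=' ')
weather = weather.join(codesum_dummies)
```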
# Split stations 1 and 2 and join horizontally; after the merge,
# station 1 columns carry the '_x' suffix and station 2 columns '_y'
weather_stn1 = weather[weather['Station']==1]
weather_stn2 = weather[weather['Station']==2]
weather_stn1 = weather_stn1.drop('Station', axis=1)
weather_stn2 = weather_stn2.drop('Station', axis=1)
weather = weather_stn1.merge(weather_stn2, on='Date')
train.Date = pd.to_datetime(train.Date)
test.Date = pd.to_datetime(test.Date)
weather.Date = pd.to_datetime(weather.Date)
spray.Date = pd.to_datetime(spray.Date)
# Replace missing values ('M', '-') and trace amounts ('T') with -1
weather = weather.replace('M', -1)
weather = weather.replace('-', -1)
weather = weather.replace('T', -1)
weather = weather.replace(' T', -1)
weather = weather.replace('  T', -1)
# Dew point over average temperature as a crude humidity proxy
weather['humidity'] = weather.DewPoint_x / weather.Tavg_x.astype(float)
#weather['yoni_interaction_term_y'] = (weather.DewPoint_y)/(weather.Tavg_y.apply(float))
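The ratio above is only a rough proxy. For reference (this is an addition, not part of the original pipeline), relative humidity can be approximated from temperature and dew point with the Magnus formula:

```python
# Sketch: approximate relative humidity (%) from dry-bulb temperature and
# dew point, both in Fahrenheit as in the raw weather data. The Magnus
# constants (17.625, 243.04) are standard approximations.
def relative_humidity(temp_f, dewpoint_f):
    t_c = (np.asarray(temp_f, dtype=float) - 32) * 5 / 9
    d_c = (np.asarray(dewpoint_f, dtype=float) - 32) * 5 / 9
    gamma_t = 17.625 * t_c / (243.04 + t_c)
    gamma_d = 17.625 * d_c / (243.04 + d_c)
    return 100 * np.exp(gamma_d - gamma_t)
```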
# First differences (day-over-day derivatives) of some weather columns
#weather['tmax_diff'] = weather.Tmax_x.diff()
weather['dewpoint_diff'] = weather.DewPoint_x.diff()
#weather['tmin_diff'] = weather.Tmin_x.diff()
#weather['wetbulb_diff'] = weather.WetBulb_x.apply(float).diff()
weather['precip_diff'] = weather.PrecipTotal_x.apply(float).diff()
weather['tavg_diff'] = weather.Tavg_x.apply(float).diff()
# Extract year, month, and day from the Date column
train['year'] = train['Date'].dt.year
train['month'] = train['Date'].dt.month
train['day'] = train['Date'].dt.day
test['year'] = test['Date'].dt.year
test['month'] = test['Date'].dt.month
test['day'] = test['Date'].dt.day
# Drop address columns; also drop the label and mosquito count from train
train = train.drop(['Address', 'AddressNumberAndStreet', 'WnvPresent', 'NumMosquitos'], axis=1)
test = test.drop(['Id', 'Address', 'AddressNumberAndStreet'], axis=1)
# Merge the weather dataframe onto df once per prior day, for the given
# number of days, so each row carries the weather of the preceding days
def merge_weather_previous_days(df, days, df_weather=weather):
    for i in range(1, days):
        date_col = 'Date_' + str(i)
        df[date_col] = df.Date - pd.Timedelta(days=i)
        df = df.merge(df_weather, left_on=date_col, right_on='Date',
                      suffixes=("", "_day_" + str(i)))
    return df
# Same idea, but stepping back by whole weeks
def merge_weather_previous_weeks(df, weeks, df_weather=weather):
    for i in range(1, weeks):
        date_col = 'Date_wk_' + str(i)
        df[date_col] = df.Date - pd.Timedelta(weeks=i)
        df = df.merge(df_weather, left_on=date_col, right_on='Date',
                      suffixes=("", "_week_" + str(i)))
    return df
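To make the suffix behaviour of these merges concrete, here is a tiny example with hypothetical data; only the overlapping Date column is renamed, while non-overlapping weather columns keep their names:

```python
# Hypothetical one-row illustration of the merge above.
left = pd.DataFrame({'Date': pd.to_datetime(['2013-07-01']), 'Trap': ['T001']})
wx = pd.DataFrame({'Date': pd.to_datetime(['2013-06-30', '2013-07-01']),
                   'Tavg_x': [75, 78]})
left['Date_1'] = left.Date - pd.Timedelta(days=1)
merged = left.merge(wx, left_on='Date_1', right_on='Date',
                    suffixes=("", "_day_1"))
print(merged.columns.tolist())
# ['Date', 'Trap', 'Date_1', 'Date_day_1', 'Tavg_x']
```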
# Append weekly-mean weather values to the dataframe: for each row, look up
# the weekly weather average for the week preceding the row's date
def agg_by_week(df, df_weather=weather):
    # Group the weather data by week and take the mean of each column
    agg_weather = df_weather.set_index('Date').groupby(pd.Grouper(freq='W')).mean()
    for i in range(0, len(df)):
        df_date = df.at[i, 'Date']
        for c in agg_weather.columns:
            col_name = c + "_agg"
            if c.startswith('codesum'):
                continue
            if col_name not in df.columns:
                df[col_name] = -1
            value = agg_weather[c][(agg_weather.index < df_date) &
                                   (agg_weather.index > df_date - pd.Timedelta(weeks=1))].iloc[0]
            if np.isnan(value):
                value = -1
            df.at[i, col_name] = value
# Putting historical weather data onto the data frame
train = merge_weather_previous_days(train, 4, weather)
test = merge_weather_previous_days(test, 4, weather)
train = merge_weather_previous_weeks(train, 3, weather)
test = merge_weather_previous_weeks(test, 3, weather)
# Create a 0/1 column in data_df for each unique spray date: the column is 1
# for rows whose trap lies near a sprayed location within `time_period` weeks
# after that spray
def create_sprayed_cols(data_df, spray_df, time_period=2):
    # Iterate over the unique dates on which sprays took place
    # (sprays took place over 10 days as trucks drove around Chicago)
    for date in set(spray_df.Date):
        # Only the spray records for this particular date
        spray_temp = spray_df[spray_df.Date == date].reset_index(drop=True)
        # One column per unique spray date, initialized to 0;
        # rows are set to 1 when a trap was sprayed
        col_name = 'spray_' + date.strftime('%Y-%m-%d') + "_" + str(time_period)
        data_df[col_name] = 0
        # Check each row: is the trap in the area of a spray, and did the
        # spray happen within the preceding time_period weeks?
        for r in range(0, len(data_df)):
            row_date = data_df.at[r, 'Date']
            if date < row_date < date + pd.Timedelta(weeks=time_period):
                # Truncate lat/long to two decimal places (multiply by 100,
                # cast to int), effectively binning pin points into squares
                cur_lat = int(data_df.at[r, 'Latitude'] * 100)
                cur_long = int(data_df.at[r, 'Longitude'] * 100)
                # Compare against every spray point for this date
                for i in range(0, len(spray_temp)):
                    spray_lat = int(spray_temp.at[i, 'Latitude'] * 100)
                    spray_long = int(spray_temp.at[i, 'Longitude'] * 100)
                    # Flag the trap if it falls inside the square around the
                    # spray point, +/- a threshold of 15 (about 0.15 degrees)
                    if (spray_lat - 15 < cur_lat < spray_lat + 15) and \
                       (spray_long - 15 < cur_long < spray_long + 15):
                        data_df.at[r, col_name] = 1
                        break
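The truncated-coordinate squares above are a fast approximation (0.01 degrees of latitude is roughly 1.1 km). If a true distance check were wanted, a standard haversine computation could replace the square test. This is a sketch, not part of the original pipeline, and the ~16 km threshold is a hypothetical value chosen to roughly match the 0.15-degree squares:

```python
# Sketch: great-circle distance in km between two points (haversine formula).
# 6371 km is the usual mean-Earth-radius approximation.
def haversine_km(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371 * np.arcsin(np.sqrt(a))

# Hypothetical usage inside the loop above (dividing by 100 undoes the truncation):
# if haversine_km(cur_lat / 100, cur_long / 100,
#                 spray_lat / 100, spray_long / 100) < 16:
#     data_df.at[r, col_name] = 1
```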
# Adding spray data
create_sprayed_cols(train, spray, time_period=2)
create_sprayed_cols(test, spray, time_period=2)
# Alternative (not used): convert categorical data to numbers with a label encoder
# lbl = preprocessing.LabelEncoder()
# lbl.fit(list(train['Species'].values) + list(test['Species'].values))
# train['Species'] = lbl.transform(train['Species'].values)
# test['Species'] = lbl.transform(test['Species'].values)
# lbl.fit(list(train['Street'].values) + list(test['Street'].values))
# train['Street'] = lbl.transform(train['Street'].values)
# test['Street'] = lbl.transform(test['Street'].values)
# lbl.fit(list(train['Trap'].values) + list(test['Trap'].values))
# train['Trap'] = lbl.transform(train['Trap'].values)
# test['Trap'] = lbl.transform(test['Trap'].values)
# Convert categorical columns to dummy variables instead of using the label encoder
train = pd.get_dummies(train, prefix = ['Trap', 'Species', 'Block', 'Street'],\
columns=['Trap','Species','Block','Street'])
test = pd.get_dummies(test, prefix = ['Trap', 'Species','Block','Street'],\
columns=['Trap','Species', 'Block', 'Street'])
# Drop columns whose every value is -1
train = train.loc[:,(train != -1).any(axis=0)]
test = test.loc[:,(test != -1).any(axis=0)]
# Make everything numerical if it is a string
# (convert_objects is deprecated; coerce object columns explicitly)
for col in train.columns[train.dtypes == object]:
    train[col] = pd.to_numeric(train[col], errors='coerce')
for col in test.columns[test.dtypes == object]:
    test[col] = pd.to_numeric(test[col], errors='coerce')
# Convert locations to polar coordinates centered on an epicenter.
# This may help location become a better predictor.
epicenter_lat = 41.903002
epicenter_long = -87.688267
train['rho'] = np.sqrt((train['Latitude'] - epicenter_lat)**2 + (train['Longitude'] - epicenter_long)**2)
train['phi'] = np.arctan2((train['Latitude']- epicenter_lat),train['Longitude']- epicenter_long)
test['rho'] = np.sqrt((test['Latitude'] - epicenter_lat)**2 + (test['Longitude'] - epicenter_long)**2)
test['phi'] = np.arctan2((test['Latitude']- epicenter_lat),test['Longitude']- epicenter_long)
train['phi_x_rho'] = train.rho * train.phi
test['phi_x_rho'] = test.rho * test.phi
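As a quick sanity check (with a hypothetical point, not one from the dataset): a location due east of the epicenter should come out with phi of 0 and rho equal to the longitude offset.

```python
# Hypothetical point 0.1 degrees east of the epicenter
lat, lon = epicenter_lat, epicenter_long + 0.1
print(np.sqrt((lat - epicenter_lat)**2 + (lon - epicenter_long)**2))  # ~0.1
print(np.arctan2(lat - epicenter_lat, lon - epicenter_long))          # 0.0
```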
# Drop all the helper Date columns
train = train.drop([c for c in train.columns if c.startswith('Date')], axis=1)
test = test.drop([c for c in test.columns if c.startswith('Date')], axis=1)
train = train.drop(['Latitude', 'Longitude'], axis=1)
test = test.drop(['Latitude', 'Longitude'], axis=1)
# Keep only the columns common to both train and test
train = train.drop(list(set(train.columns) - set(test.columns)), axis=1)
test = test.drop(list(set(test.columns) - set(train.columns)), axis=1)
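An equivalent, more idiomatic way to keep only the shared columns (assuming that is the sole goal of the two drops above) is `DataFrame.align`:

```python
# Keep only columns present in both frames; rows are untouched (axis=1)
train, test = train.align(test, join='inner', axis=1)
```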
train.to_csv('train_w_weather.csv', index=False)
test.to_csv('test_w_weather.csv', index=False)
# train = pd.read_csv('train_w_weather.csv')
# test = pd.read_csv('test_w_weather.csv')
# labels = train.WnvPresent.values
features = train.columns
#features = best_features
xgb1 = xgb.XGBClassifier()
parameters = {'nthread': [6],  # with hyperthreading, xgboost may become slower
              'objective': ['binary:logistic'],
              'learning_rate': [0.05],  # the so-called `eta` value
              'max_depth': [2],
              'min_child_weight': [1],
              'silent': [1],
              'subsample': [.91],
              'colsample_bytree': [.58],
              'n_estimators': [1500],  # number of trees
              #'missing': [-999],
              'seed': [1337]}
clf = model_selection.GridSearchCV(xgb1, parameters, n_jobs=6,
                                   cv=model_selection.StratifiedKFold(n_splits=3, shuffle=True),
                                   scoring='roc_auc',
                                   verbose=0, refit=True)
clf.fit(train[features], labels)
#model = clf.best_estimator_
# Trust your CV!
best_parameters, score = clf.best_params_, clf.best_score_
print('Raw AUC score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
# Predict probabilities for the test set and write the submission file
predictions = clf.predict_proba(test[features])[:, 1]
sample['WnvPresent'] = predictions
sample.to_csv('beat_the_benchmark.csv', index=False)
model = clf.best_estimator_
# Cross-validated AUC for the best estimator
np.mean(model_selection.cross_val_score(model, X=train, y=labels, scoring='roc_auc', n_jobs=5))
# Pair feature names with their importances and sort in descending order
importances = list(zip(train.columns, model.feature_importances_))
#importances = list(zip(best_features, model.feature_importances_))
importances.sort(key=lambda a: a[1], reverse=True)
# Print the feature ranking
print("Feature ranking:")
for name, importance in importances:
    print("{}: {}".format(name, importance))
# Plot the feature importances of the model
sns.barplot(x=train.columns, y=model.feature_importances_)
best_features = [x[0] for x in importances if x[1] > 0]
# Sum the importances of the dummy columns sharing each categorical prefix,
# so the grouped categorical features can be compared with the rest
temp = []
for prefix in ['Trap', 'Species', 'Block', 'Street']:
    importance = 0
    for i in importances:
        if i[0].startswith(prefix):
            importance += i[1]
    temp.append((prefix, importance))
temp
importances = importances + temp
importances.sort(key=lambda a: a[1], reverse=True)
names, values = zip(*importances)
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
bars = sns.barplot(x=list(names)[0:50], y=list(values)[0:50])
bars.set_ylabel('Feature importance')
_ = plt.setp(bars.get_xticklabels(), rotation=90)
# Collapse everything past the top 40 features into a single 'Remainder' bucket
t = list(values)[0:40]
t.append(sum(list(values)[40:]))
x = list(names)[0:40] + ['Remainder']
list(zip(x, t))