GA project term suggested to choose one of two options:
I decided to go with the first option.
The state of Iowa controls wholesales of alcoholic beverages with content of alcohol higher than 10%. That excludes wines and beers. Thus all stronger alcoholic beverages either imported or purchased by the state owned Iowa Alcoholic Beverages Division that adds universal markup of 50%, that increases the price for alcoholic beverages 1.5 times when sold to stores and wholesale distributors.
The dataset consists of about 2.7mln observations (records) of each individual purchase from the state by retailers or wholesalers like COSCO or Walmart. The data is stored at my Google Drive and you can read walking through this Jupyter Notebook. Each observation has 18 features describing the date, store number, store location town, zip code, county number, alcoholic beverage category, vendor number, item number, item name, volume of bottle, state cost per bottle, bottle cost, bottle retail price, number of bottles sold, revenue from yeach transaction and total volume sold.
For the purposes of this presentation, I used the extract of 10% of the initial data that has aboutu 270 thousand observations. You can try and re-run the project with the original set of data store here - https://drive.google.com/file/d/0B8xhteSGeVx6QWpXMUcxWGxJdk0/view?usp=sharing
The make takeaways are:
Don't hesitate to check the presentation (../data/2_project_Iowa_ABD.pdf)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 10, 10
import seaborn as sns
plt.style.use('ggplot')
% matplotlib inline
%load_ext giphy_magic
% matplotlib inline
from scipy import stats
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
## Load the data into a DataFrame
df = pd.read_csv('../data/Iowa_Liquor_sales_sample_10pct.csv',na_filter = True, low_memory=False)#, infer_datetime_format=True, dtype={‘County Number’:np.int64, ‘Category’: np.int64, 'State Bottle Cost' : np.float64, 'State Bottle Retail' : np.float64, 'Sale (Dollars)' : np.float64}, low_memory = False)
print df.shape
df.head()
df.info()
## Transform the dates if needed, e.g.
df.Date = pd.to_datetime(df["Date"], format = "%m/%d/%Y")
#I tried to convert all the dollar amounts into numbers. But was not succesfull to turn the County and Category numbers into integerts
#df['County Number'] = df['County Number'].astype(int)
#df['Category'] = df['Category'].astype(int)
df['State Bottle Cost'] = df['State Bottle Cost'].map(lambda x: str(x)[1:]).astype(float)
df['State Bottle Retail'] = df['State Bottle Retail'].map(lambda x: str(x)[1:]).astype(float)
df['Sale (Dollars)'] = df['Sale (Dollars)'].map(lambda x: str(x)[1:]).astype(float)
df['month'] = df.Date.dt.month
df['year'] = df.Date.dt.year
df.head()
def eda(dataframe):
print "missing values \n", dataframe.isnull().sum()
print "dataframe index \n", dataframe.index
print "dataframe types \n", dataframe.dtypes
print "dataframe shape \n", dataframe.shape
print "dataframe describe \n", dataframe.describe()
for item in dataframe:
print item
print dataframe[item].nunique()
eda(df)
print 'Total sum of sales with missing category is ---- $', df['Sale (Dollars)'][pd.isnull(df['Category']) == True].sum(), ', that is ', df['Sale (Dollars)'][pd.isnull(df['Category']) == True].sum()/(df['Sale (Dollars)'].sum()), 'of total sales'
print 'Total sum of sales with missing category name is $', df['Sale (Dollars)'][pd.isnull(df['Category Name']) == True].sum(),', that is ', df['Sale (Dollars)'][pd.isnull(df['Category Name']) == True].sum()/(df['Sale (Dollars)'].sum()), 'of total sales'
print 'Total sum of sales with missing County is ------ $', df['Sale (Dollars)'][pd.isnull(df['County']) == True].sum(),', that is ', df['Sale (Dollars)'][pd.isnull(df['Category Name']) == True].sum()/(df['Sale (Dollars)'].sum()), 'of total sales'
print 'While total sales are -------------------------- $', df['Sale (Dollars)'].sum()
dropped_sum = (df['Sale (Dollars)'][pd.isnull(df['Category']) == True].sum() + df['Sale (Dollars)'][pd.isnull(df['Category Name']) == True].sum() + df['Sale (Dollars)'][pd.isnull(df['County']) == True].sum())
print 'Total loss of sales numbers from missing data -- $',dropped_sum, 'That is ', dropped_sum/df['Sale (Dollars)'].sum()
# Here I gave another thought and decided to finally drop the missing lines and duplicates
df.dropna(inplace = True)
df.drop_duplicates(inplace = True)
df.shape
population = pd.read_csv('../data/Total_County_Population_by_Year.csv',index_col = 'County', na_filter = True)
population.head()
df = df.join(population, on = "County", how = "outer")
df.head()
dfpivot = pd.pivot_table(df, values = ['County','Sale (Dollars)','Bottles Sold', 'Volume Sold (Liters)'], index = ['Store Number'], aggfunc=np.sum)
dfpivot['av_price_unit'] = dfpivot['Sale (Dollars)']/dfpivot['Bottles Sold']
dfpivot['av_vol_unit'] = dfpivot['Volume Sold (Liters)'] / dfpivot['Bottles Sold']
dfpivot['Population'] = df['Population']
dfpivot['County'] = df['County']
dfpivot['liquor_capita'] = dfpivot['Volume Sold (Liters)'] / dfpivot['Population']
dfpivot = pd.DataFrame(dfpivot)
#dfpivot = dfpivot.merge(population, on = 'County', how = 'inner')
dfpivot.head()
df[(df.Date.dt.quarter == 1) & (df.Date.dt.year == 2015)].groupby('Store Number')
#print '5 counties with top average price per bottle bought \n'
#dfpivot.sort_values('av_price_unit', ascending = False).head()
# print '5 counties with least average price per bottle bought \n'
# dfpivot.sort_values('av_price_unit', ascending = False).tail()
# print '5 counties with top average volume per bottle bought \n'
# dfpivot.sort_values('av_vol_unit', ascending = False).head()
# print '5 counties with least average volume per bottle bought \n'
# dfpivot.sort_values('av_vol_unit', ascending = False).tail()
df.head()
I grouped the sales by each store for the year 2015
VolumeL = pd.DataFrame({'VolumeL':df.groupby(['Store Number','County','Population'])['Volume Sold (Liters)'].sum()})
Bottles = pd.DataFrame({'Bottles':df.groupby(['Store Number','County','Population'])['Bottles Sold'].sum()})
Sales_Y15Q1 = pd.DataFrame({'Q1Y15':df[(df.Date >= '2015-01-01') & (df.Date < '2015-04-01')].groupby(['Store Number','County', 'Population'])['Sale (Dollars)'].sum()})
Sales_Y15Q2 = pd.DataFrame({'Q2Y15':df[(df.Date >= '2015-04-01') & (df.Date < '2015-07-01')].groupby(['Store Number','County', 'Population'])['Sale (Dollars)'].sum()})
Sales_Y15Q3 = pd.DataFrame({'Q3Y15':df[(df.Date >= '2015-07-01') & (df.Date < '2015-10-01')].groupby(['Store Number','County', 'Population'])['Sale (Dollars)'].sum()})
Sales_Y15Q4 = pd.DataFrame({'Q4Y15':df[(df.Date >= '2015-10-01') & (df.Date < '2016-01-01')].groupby(['Store Number','County', 'Population'])['Sale (Dollars)'].sum()})
Sales_Y16Q1 = pd.DataFrame({'Q1Y16':df[(df.Date >= '2016-01-01') & (df.Date < '2016-04-01')].groupby(['Store Number','County', 'Population'])['Sale (Dollars)'].sum()})
Total_Sales = pd.concat([Bottles, VolumeL ,Sales_Y15Q1, Sales_Y15Q2, Sales_Y15Q3, Sales_Y15Q4, Sales_Y16Q1], axis =1, join = 'outer')
Total_Sales.fillna(0,inplace = True)
Total_Sales.head()
Total_Sales.tail()
# print Total_Sales.Q1Y16.sort_values(ascending = False).head()
# print 'Total Sales in Q1Y16 $', Total_Sales.Q1Y16.sort_values(ascending = False).sum()
I will:
VolumeL = pd.DataFrame({'VolumeL':df.groupby(['County','Population'])['Volume Sold (Liters)'].sum()})
Bottles = pd.DataFrame({'Bottles':df.groupby(['County','Population'])['Bottles Sold'].sum()})
Sales_Y15Q1 = pd.DataFrame({'Q1Y15':df[(df.Date >= '2015-01-01') & (df.Date < '2015-04-01')].groupby(['County', 'Population'])['Sale (Dollars)'].sum()})
Sales_Y15Q2 = pd.DataFrame({'Q2Y15':df[(df.Date >= '2015-04-01') & (df.Date < '2015-07-01')].groupby(['County', 'Population'])['Sale (Dollars)'].sum()})
Sales_Y15Q3 = pd.DataFrame({'Q3Y15':df[(df.Date >= '2015-07-01') & (df.Date < '2015-10-01')].groupby(['County', 'Population'])['Sale (Dollars)'].sum()})
Sales_Y15Q4 = pd.DataFrame({'Q4Y15':df[(df.Date >= '2015-10-01') & (df.Date < '2016-01-01')].groupby(['County', 'Population'])['Sale (Dollars)'].sum()})
Sales_Y16Q1 = pd.DataFrame({'Q1Y16':df[(df.Date >= '2016-01-01') & (df.Date < '2016-04-01')].groupby(['County', 'Population'])['Sale (Dollars)'].sum()})
County_Sales = pd.concat([Bottles, VolumeL ,Sales_Y15Q1, Sales_Y15Q2, Sales_Y15Q3, Sales_Y15Q4, Sales_Y16Q1], axis =1, join = 'outer')
County_Sales.reset_index(level = 1, inplace = True)
County_Sales['AvgBotVol'] = County_Sales.VolumeL / County_Sales.Bottles
County_Sales['AvgVolCap'] = County_Sales.VolumeL / County_Sales['Population']
County_Sales.head()
print Total_Sales.sum()
k = Total_Sales.Q1Y16.sum()/Total_Sales.Q1Y15.sum()
print 'YoY increment from 2015 to 2016:', k
Q2Y16 = Total_Sales.Q2Y15.sum()*Total_Sales.Q1Y16.sum()/Total_Sales.Q1Y15.sum()
print 'Forecast of sales for Q2 Y16: $', Q2Y16
FY16 = Total_Sales.Q3Y15.sum() + Total_Sales.Q4Y15.sum() + Total_Sales.Q1Y16.sum() + Q2Y16
print 'Thus the forecast fot the fiscal year 2016: $', FY16
Preliminary result: The forecast amount of sales in FY2016 - USD289,790,002 vs real USD288,908,790
Difference of $881,211, that's 1.2 percent error
For annual reports please click here - https://abd.iowa.gov/annual-reports
# For forecasting I will be using the per store sales
# Let's see the shape of the Working_Stores
print County_Sales.shape
County_Sales.head()
y = County_Sales.Q1Y16
X = County_Sales[['Q1Y15', 'AvgBotVol', 'AvgVolCap']]
# create training and testing vars
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print X_train.shape, y_train.shape
print X_test.shape, y_test.shape
# fit a model
lm = linear_model.LinearRegression()
print lm
model = lm.fit(X_train, y_train)
print model
predictions = lm.predict(X_test)
print predictions
print X_train.shape
print y_train.shape
#validate the model on the cv = k-1
## The line / model
plt.scatter(y_test, predictions)
plt.xlabel("True Values")
plt.ylabel("Predictions")
print "Score:", model.score(X_test, y_test)
print 'Model coefficient is:', lm.coef_
print lm.intercept_
print 'Based on multilinear regression, the forecast for fiscal year 2016 is $'
County_Sales['FY2016'] = (County_Sales.Q1Y15+County_Sales.Q2Y15+County_Sales.Q3Y15+County_Sales.Q4Y15)*lm.coef_[0]+County_Sales.AvgBotVol*lm.coef_[1]+County_Sales.AvgVolCap*lm.coef_[2]+lm.intercept_
County_Sales.head()
County_Sales.FY2016.sum()
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
# Perform 6-fold cross validation
print County_Sales.shape
scores = cross_val_score(lm, X_train, y_train, cv=6)
print "Cross-validated scores:", scores
print "Average: ", scores.mean()
# LASSO regression
# Let me play with alpha = 0.01
# try alpha=0.01 and examine coefficients
from sklearn.linear_model import Lasso
lassoreg = Lasso(alpha=0.01, normalize=True)
lassoreg.fit(X_train, y_train)
print lassoreg.coef_
# calculate MSE (for alpha=0.01)
y_pred = lassoreg.predict(X_test)
print np.sqrt(metrics.mean_squared_error(y_test, y_pred))
# Not bad. Now I am happier.