Hello and welcome to the Data Science Job Positions on Indeed project. This is a revisited project that gives you an idea of the data science jobs posted on Indeed in more than 21 cities across the United States, scraped in August 2017.
The dataset has over 619 advertised positions that mention an annual, monthly, daily, or hourly salary; the code and findings below are based on those observations. Our task was to predict whether a position's salary in a given city would fall above or below the overall median. The median was chosen as the reference point because the salary distribution is strongly skewed to the right, so the mean would not be representative of the data set.
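As a quick illustration of why the median is preferred here (toy numbers, not the actual salaries), a few very high values pull the mean far above a typical salary while the median barely moves:
import numpy as np
# Toy right-skewed sample (hypothetical values): a couple of large salaries
# inflate the mean but barely move the median.
toy_salaries = np.array([60000, 65000, 70000, 75000, 80000, 300000, 450000])
print np.mean(toy_salaries)    # ~157,143 -- pulled up by the long right tail
print np.median(toy_salaries)  # 75,000 -- closer to a "typical" salary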
I decided to add statistical data for each city and see whether that data helps predict if a position's salary falls below or above the overall median salary.
If you are mostly interested in the structure of the web-scraping code, please skip to the very bottom of the page.
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import time
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
df = pd.read_csv('../data/jobsdf.csv')
print df.shape
df.head()
Next steps:
- Drop the "Unnamed: 0" column
- Drop all records that don't mention a salary (this leaves 857 records)
- See what's left
# Dropping "Unnamed: 0" and all lines that don't contain salaries
df = df[df.paybase.isnull() == False]
df.drop("Unnamed: 0", axis = 1, inplace = True)
print df.shape
df.head()
Drop duplicates
df.drop_duplicates(inplace = True)
df.reset_index(inplace = True)
df.drop("index", axis = 1, inplace = True)
print df.shape
df.head()
def eda(dataframe):
print "missing values \n", dataframe.isnull().sum()
print "dataframe index \n", dataframe.index
print "dataframe types \n", dataframe.dtypes
print "dataframe shape \n", dataframe.shape
print "dataframe describe \n", dataframe.describe()
for item in dataframe:
print item
print dataframe[item].nunique()
eda(df)
df.describe()
import matplotlib.pyplot as plt
df.salarytxt.plot.hist(figsize = (12,8),bins=10)
plt.show()
df[df.salarytxt < 20000]
print df.groupby(df.city)['summary'].count()
df.groupby(df.city)['summary'].count().plot(figsize = (16,12),kind = 'bar', fontsize = 14)
_ = plt.title("Number of Vacancies with Salaries in City", fontsize = 16)
_ = plt.ylabel("Number of Vacancies", fontsize = 14)
_ = plt.xlabel("City", fontsize = 14)
plt.show()
median = df.salarytxt.median()
median
cities = pd.read_csv('../data/city_stats.csv')
cities.dropna(inplace = True)
cities.head()
jobs = pd.merge(df, cities, left_on = 'city', right_on = 'City', how = "inner")
jobs
jobs[jobs.City.isnull() == True].head()
jobs = jobs.drop('city', axis = 1)
print jobs.shape
jobs.head()
temp = pd.DataFrame()
for i in jobs.paybase.unique():
    mean_ = round(jobs[jobs.paybase == i].salarytxt.mean(), 2)
    median_ = round(jobs[jobs.paybase == i].salarytxt.median(), 2)
    n = jobs[jobs.paybase == i].salarytxt.count()
    temp = temp.append([[i, mean_, median_, n]], ignore_index=True)
temp.columns = ['Base', 'Mean', 'Median','Count']
temp
temp.set_index('Base', inplace = True)
temp.Count.plot.pie(autopct='%.2f', figsize = (8,8), fontsize = 14)
plt.title('Number of Job Announcements with Salaries per Payment Base', fontsize = 14)
plt.show()
temp['Mean'].sort_values(ascending = False).plot(figsize = (12,8),kind = 'bar')
plt.show()
jobs = jobs[jobs.paybase == 'annual']
median_ = jobs.salarytxt.median()
print median_
jobs.shape
# If the salary is higher than the overall median, the flag is True (one)
jobs['sal_to_med'] = (jobs.salarytxt > median_)
jobs.head()
jobs['MgrDummy'] = jobs.jobtitle.str.contains('supervisor|manager|director|senior|president')
jobs.head(2)
jobs_temp = jobs[['salarytxt', 'Population','Density','MgrDummy','sal_to_med']]
jobs_temp.shape
jobs_temp.head()
y = jobs_temp.sal_to_med
X = jobs_temp.drop(["sal_to_med",'salarytxt'], axis = 1)
print y.head(), X.head()
from sklearn import preprocessing
X_norm = preprocessing.normalize(X, norm = 'l1')
print X.shape
X_norm
X = pd.DataFrame(X_norm, columns = X.columns)
X.head()
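Note that preprocessing.normalize works row-wise by default: with norm = 'l1' each posting's feature vector (Population, Density, MgrDummy) is rescaled so that its values sum to 1. A minimal sketch with made-up numbers:
# Toy row (hypothetical values): after L1 normalization the entries sum to 1
toy_row = np.array([[8000000., 27000., 1.]])
print preprocessing.normalize(toy_row, norm='l1')  # ~[[0.9966, 0.0034, 0.00000012]]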
# Rank-based binning: each numeric column becomes 0 (below its median) or 1 (above it)
jobs_categories = np.floor(jobs_temp[jobs_temp.columns[:-2]].rank() / len(jobs_temp) / .5001).astype(int)
jobs_categories = jobs_categories.join([jobs_temp.MgrDummy, jobs_temp.sal_to_med])
print jobs_categories.shape
jobs_categories.head()
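To make the rank-based binning above more concrete, here is a minimal sketch on a toy series (made-up values, not the jobs data): ranks in the lower half map to 0 and ranks in the upper half map to 1, i.e. a below/above-median flag; the .5001 keeps the top-ranked value in bin 1 rather than bin 2.
# Toy illustration of the rank / len / .5001 binning used above
toy_ranks = pd.Series([10, 20, 30, 40, 50, 60])
print np.floor(toy_ranks.rank() / len(toy_ranks) / .5001).astype(int)  # -> 0, 0, 0, 1, 1, 1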
# Below I build a bag-of-words vocabulary from the job summaries, merge it with the
# city features, split the data into train and test sets, and run a Random Forest
from sklearn.feature_extraction.text import CountVectorizer
# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words='english',
                             max_features=50)
# fit_transform() does two things: first, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
train_data_features = vectorizer.fit_transform(jobs.summary)
# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()
print train_data_features.shape
train_data_features
vocab = vectorizer.get_feature_names()
len(vocab)
summary = pd.DataFrame(train_data_features, columns = vocab)
print summary.shape
summary.head()
print 'DataFrame with: supervisor, senior, manager, director or president in jobtitle: ', jobs[jobs.MgrDummy == True].shape
print 'Overall DataFrame ', jobs.shape
print jobs_categories.shape
jobs_categories.head(3)
X.head()
summary.head()
super_jobs = pd.concat([X,summary], axis = 1, ignore_index=True)
super_jobs.columns = list(X.columns) + list(summary.columns)
print 'Super Data Frame', super_jobs.shape
super_jobs.head()
X = super_jobs
X.head()
jobs_categories.head()
y = jobs_categories.sal_to_med
print X.shape
X.head()
#1. Split the data into training and testing parts
feat_labels = list(X.columns)
print len(feat_labels)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print X.shape
print X_train.shape, X_test.shape
print y_train.shape, y_test.shape
# 2. Create a random forest classifier
clf = RandomForestClassifier(n_estimators=70, random_state=0, n_jobs=-1)
# Train the classifier
clf.fit(X_train, y_train)
# Print the name and gini importance of each feature
for feature in zip(feat_labels, clf.feature_importances_):
    print(feature)
X_w = summary
feat_labels = list(X_w.columns)
print len(feat_labels)
X_train, X_test, y_train, y_test = train_test_split(X_w, y, test_size=0.3, random_state=0)
# 2. Create a random forest classifier
clf = RandomForestClassifier(n_estimators=70, random_state=0, n_jobs=-1)
# Train the classifier
clf.fit(X_train, y_train)
# Print the name and gini importance of each feature
for feature in zip(feat_labels, clf.feature_importances_):
    print(feature)
listofimportance = pd.DataFrame(zip(feat_labels,clf.feature_importances_), columns = ['words','importance'])
importantwords = listofimportance[listofimportance['importance']>0.003].sort_values('importance', ascending=False)
importantwords[:20]
# Let's see how exactly those words affect the salary prediction:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
m = lr.fit(X_w, y)
m = m.coef_.tolist()
m = pd.DataFrame(zip(X_w.columns, m[0]), columns = ['features','log'])
m['exp'] = np.exp(m.log)
print "Seven words most negatively affecting salary", m.sort_values('exp', ascending=True).head(7)
print "Seven words most positively affecting salary", m.sort_values('exp', ascending=False).head(7)
# Target for classification: is the salary above the overall median?
y = jobs_temp.sal_to_med
print X.shape
print y.head()
X.head()
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
skf = StratifiedKFold(n_splits=10)
skf.get_n_splits(X, y)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=41)
dt = RandomForestClassifier(class_weight='balanced')
s = cross_val_score(dt, X, y, cv=cv, n_jobs=30)
print "{} Score:\t{:0.3} ± {:0.3}".format("Decision Tree", s.mean().round(3), s.std().round(3))
y = jobs_categories.sal_to_med
#X = jobs_categories.drop(['sal_to_med','salarytxt'], axis = 1)
print X.shape
print y.unique()
X.head()
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=41)
dt = DecisionTreeClassifier(class_weight='balanced')
s = cross_val_score(dt, X, y, cv=cv, n_jobs=-1)
print "{} Score:\t{:0.3} ± {:0.3}".format("Decision Tree", s.mean().round(3), s.std().round(3))
dt = RandomForestClassifier(class_weight='balanced')
s = cross_val_score(dt, X, y, cv=cv, n_jobs=1)
print "{} Score:\t{:0.3} ± {:0.3}".format("Random Forest with Balanced Classes", s.mean().round(3), s.std().round(3))
# Random Forest classifier without balanced class weights
dt = RandomForestClassifier()
s = cross_val_score(dt, X, y, cv=cv, n_jobs=20)
print "{} Score:\t{:0.3} ± {:0.3}".format("Random Forest without Balanced Classes", s.mean().round(3), s.std().round(3))
dt = BaggingClassifier()
s = cross_val_score(dt, X, y, cv=cv, n_jobs=1)
print "{} Score:\t{:0.3} ± {:0.3}".format("Bagging", s.mean().round(3), s.std().round(3))
Below is the web-scraper code. It starts with two helper functions, salary() and base(), that you need to run before you start scraping:
def salary(i):
    # Parse the salary text in the "no-wrap" span and convert it to an approximate
    # annual figure: ranges are averaged, hourly rates are multiplied by 1600,
    # daily rates by 200, and monthly rates by 12.
    if i.find("span", {"class":"no-wrap"}) != None:
        js = str(i.find("span", {"class":"no-wrap"}).text.strip()).split()
        if js[-1] == 'year':
            if js[1] == '-':
                js1 = float(re.sub(",", "", (re.findall(r"\d+\,\d+|\d+\.\d+", js[0])[0])))
                js2 = float(re.sub(",", "", (re.findall(r"\d+\,\d+|\d+\.\d+", js[2])[0])))
                js = (js1 + js2) / 2
            else:
                js = float(re.sub(",", "", (re.findall(r"\d+\,\d+|\d+\.\d+", js[0])[0])))
        elif js[-1] == 'hour':
            if js[1] == '-':
                js1 = float(re.findall(r"\d+", js[0])[0])
                js2 = float(re.findall(r"\d+", js[2])[0])
                js = (js1 + js2) / 2 * 1600
            else:
                js = float(re.findall(r"\d+", js[0])[0]) * 1600
        elif js[-1] == 'day':
            if js[1] == '-':
                js1 = float(re.findall(r"\d+", js[0])[0])
                js2 = float(re.findall(r"\d+", js[2])[0])
                js = (js1 + js2) / 2 * 200
            else:
                js = float(re.findall(r"\d+", js[0])[0]) * 200
        elif js[-1] == 'month':
            if js[1] == '-':
                js1 = float(re.sub(",", "", (re.findall(r"\d+\,\d+|\d+\.\d+", js[0])[0])))
                js2 = float(re.sub(",", "", (re.findall(r"\d+\,\d+|\d+\.\d+", js[2])[0])))
                js = (js1 + js2) / 2 * 12
            else:
                js = float(re.sub(",", "", (re.findall(r"\d+\,\d+|\d+\.\d+", js[0])[0]))) * 12
        else:
            js = str('NaN')
        return js
def base(i):
    # Classify the payment base (annual / hourly / daily / monthly)
    # from the last word of the salary text in the "no-wrap" span.
    if i.find("span", {"class":"no-wrap"}) != None:
        pb = str(i.find("span", {"class":"no-wrap"}).text.strip()).split()
        if pb[-1] == 'year':
            pb = 'annual'
        elif pb[-1] == 'hour':
            pb = 'hourly'
        elif pb[-1] == 'day':
            pb = 'daily'
        elif pb[-1] == 'month':
            pb = 'monthly'
        else:
            pb = str("NaN")
        return pb
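As a quick sanity check of the two helpers on made-up markup (these snippets only mimic the "no-wrap" span the functions look for; they are not real Indeed result blocks):
# Hypothetical result blocks to exercise salary() and base()
fake_year = BeautifulSoup('<div><span class="no-wrap">$100,000 - $120,000 a year</span></div>', 'html.parser')
fake_hour = BeautifulSoup('<div><span class="no-wrap">$50 an hour</span></div>', 'html.parser')
print salary(fake_year), base(fake_year)   # 110000.0 annual -- midpoint of the yearly range
print salary(fake_hour), base(fake_hour)   # 80000.0 hourly  -- 50 * 1600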
#['New+York', 'Chicago', 'San+Francisco', 'Austin', 'Seattle','Los+Angeles', 'Philadelphia', 'Atlanta', 'Dallas', 'Pittsburgh',
#'Portland', 'Phoenix', 'Denver', 'Houston', 'Miami', MyCity]
MyCity = 'Washington+City%2CDC'
city_set = ['New+York%2CNY', 'Chicago', 'San+Francisco', 'Austin', 'Seattle',
'Los+Angeles', 'Philadelphia', 'Atlanta', 'Dallas', 'Pittsburgh',
'Portland%2COR', 'Phoenix', 'Denver', 'Houston', 'Miami', 'Boston', 'San+Diego',
'Baltimore', 'San+Jose', 'Minneapolis','San+Antonio%2CTX','Detroit','Columbus','Charlotte','Fort+Worth',
'Jacksonville+FL', 'Fresno', 'Kansas+City', 'Mesa%2CAZ','Raleigh', MyCity]
start = '10'
#max_results_per_city = min(10,n)
jobsdf = pd.DataFrame()
for city in city_set:
    page0 = requests.get('http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=' + str(city))
    soup = BeautifulSoup(page0.text, 'html.parser', from_encoding='utf-8')
    # total number of results reported for this search
    n = int((soup.find("div", {"id":"searchCount"}).text.strip().split()[-1]).replace(',', ''))
    # page through the results, 10 postings per page, capped at 2,000 per city
    for start in range(0, min(n, 2000), 10):
        page = requests.get('http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l='
                            + str(city) + '&start=' + str(start))
        time.sleep(1)  # politeness pause; this part is borrowed from M. Salmon's code
        soup = BeautifulSoup(page.text, 'html.parser', from_encoding='utf-8')
        # each job announcement is wrapped in a div with a "row result" class
        blocks = soup.find_all("div", {"class":[" row result", "lastRow row result"]})
        for i in blocks:
            jt = i.select_one("a")["title"].lower()
            if i.find("span", {"class":"company"}) != None:
                cn = i.find("span", {"class":"company"}).text.strip()
            else:
                cn = str('NaN')
            loc = str(i.find("span", {"class":"location"}).text.strip())
            js = salary(i)
            pb = base(i)
            jsum = i.find("span", {"class":"summary"}).text.strip().lower()
            row = pd.DataFrame([[jt, cn, city, loc, js, pb, jsum]],
                               columns=['jobtitle', 'company', 'city', 'location', 'salarytxt', 'paybase', 'summary'])
            jobsdf = pd.concat([jobsdf, row], ignore_index=True)
jobsdf.to_csv('../../project_3_data/jobsdf.csv', encoding='utf-8')