I also added data from other sources, global population and oil price for the same years.
I also compared the correlation between the global oil price and terrorist activity, and it becomes significant when the data are shifted by 2 - 5 years. That suggests a cumulative effect of the oil trade on criminal activity in the regions of Russia and globally.
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from dateutil.relativedelta import relativedelta
import seaborn as sns
import statsmodels.api as sm
from statsmodels.tsa.stattools import acf
from statsmodels.tsa.stattools import pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from datetime import datetime
from urllib import urlretrieve
from zipfile import ZipFile
import pysal
from pandas.core import datetools
# Load the Global Terrorism Database CSV export into `df`.
# low_memory=False is passed through to pandas unchanged; per the pandas docs
# it disables chunked parsing so column dtypes are inferred from whole columns.
df = pd.read_csv(
    '../../../../Google_Drive/GA_project_data/project_5/globalterrorismdb_0617dist.csv',
    low_memory=False,
)
def eda(dataframe):
    """Print a quick exploratory summary of ``dataframe``.

    Writes, in order: per-column missing-value counts, the index, the column
    dtypes, the shape, the ``describe()`` table, and finally every column name
    followed by its number of distinct values.

    Every report is emitted through a single-argument ``print(...)`` call,
    which produces the same text under the Python 2 print statement and the
    Python 3 print function.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    reports = (
        ("missing values", dataframe.isnull().sum()),
        ("dataframe index", dataframe.index),
        ("dataframe types", dataframe.dtypes),
        ("dataframe shape", dataframe.shape),
        ("dataframe describe", dataframe.describe()),
    )
    for heading, value in reports:
        # Format into one string first: passing two arguments to print would
        # show a tuple on Python 2 and insert an extra space on Python 3.
        print("%s \n%s" % (heading, value))

    # Iterating a DataFrame yields its column labels.
    for item in dataframe:
        print(item)
        print(dataframe[item].nunique())
# Get rid of the rows without geolocation details.
# The drop has to be done on the frame itself: calling dropna() on the
# `longitude` column only affects that Series and leaves every row of `df`
# in place.
df = df.dropna(subset=['longitude'], axis=0)
df_ = df[['iyear','imonth','iday', 'region', 'region_txt', 'country','country_txt','provstate',
'city', 'latitude', 'longitude', 'location', 'attacktype1',
'attacktype1_txt', 'target1', 'targtype1_txt', 'targsubtype1',
(170350, 21)
# Give the date parts the names pd.to_datetime expects.  rename() without
# inplace returns a new frame, so `df_` stops being a slice of `df` and the
# assignments below no longer raise SettingWithCopyWarning.  (The row index is
# not relabelled here because it is replaced by `Date` at the end anyway.)
df_ = df_.rename(columns={'iyear': 'year', 'imonth': 'month', 'iday': 'day'})
# Drop those lines where month is 0; .copy() makes the filtered frame
# independent so that new columns can be assigned to it safely.
df_ = df_[df_['month'] != 0].copy()
# Replace dates where day is 0 with 1
df_['day'] = df_['day'].replace(0, 1)
# Turn Year/Month/Day into a Date
df_['Date'] = pd.to_datetime(df_[['year', 'month', 'day']])
# Set Date as index
df_ = df_.set_index('Date')
# Let's check what we have
/Users/baurjansafi/anaconda/lib/python2.7/site-packages/pandas/core/frame.py:2844: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy **kwargs) /Users/baurjansafi/anaconda/lib/python2.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy after removing the cwd from sys.path.
year | month | day | region | region_txt | country | country_txt | provstate | city | latitude | ... | location | attacktype1 | attacktype1_txt | target1 | targtype1_txt | targsubtype1 | motive | nkill | nwound | weaptype1 | |
Date | |||||||||||||||||||||
1970-07-02 | 1970 | 7 | 2 | 2 | Central America & Caribbean | 58 | Dominican Republic | NaN | Santo Domingo | 18.456792 | ... | NaN | 1 | Assassination | Julio Guzman | Private Citizens & Property | 68.0 | NaN | 1.0 | 0.0 | 13 |
1970-01-01 | 1970 | 1 | 1 | 5 | Southeast Asia | 160 | Philippines | Tarlac | Unknown | 15.478598 | ... | NaN | 1 | Assassination | Employee | Journalists & Media | 54.0 | NaN | 1.0 | 0.0 | 13 |
1970-01-01 | 1970 | 1 | 1 | 8 | Western Europe | 78 | Greece | Attica | Athens | 37.983773 | ... | NaN | 3 | Bombing/Explosion | U.S. Embassy | Government (Diplomatic) | 46.0 | NaN | NaN | NaN | 6 |
1970-01-01 | 1970 | 1 | 1 | 4 | East Asia | 101 | Japan | NaN | Fukouka | 33.580412 | ... | NaN | 7 | Facility/Infrastructure Attack | U.S. Consulate | Government (Diplomatic) | 46.0 | NaN | NaN | NaN | 8 |
1970-01-01 | 1970 | 1 | 1 | 1 | North America | 217 | United States | Illinois | Cairo | 37.005105 | ... | NaN | 2 | Armed Assault | Cairo Police Headquarters | Police | 22.0 | To protest the Cairo Illinois Police Deparment | 0.0 | 0.0 | 5 |
5 rows × 21 columns
%matplotlib inline
title= 'Yearly Attacks',
plt.ylabel('Number of Attacks per Year')
<matplotlib.text.Text at 0x1209b87d0>
%matplotlib inline
title= 'Monthly Attacks',
month 1 14036 2 12975 3 14284 4 14213 5 15771 6 14258 7 15247 8 14802 9 13246 10 14728 11 14064 12 12706 Name: month, dtype: int64
%matplotlib inline
l = df_.attacktype1_txt.unique()
for i in l:
df_[df_.attacktype1_txt == i].groupby("year").attacktype1_txt.count().plot(figsize=(12,8),
title= "Yearly Attacks by Type of Attack",
_ = plt.legend(df_.attacktype1_txt.unique())
kind = 'bar',
title = "Number of Attacks by Region",
fontsize = 14
# Let's see how many terror attacks each region has had over the past 46 years.
<matplotlib.axes._subplots.AxesSubplot at 0x120a98ed0>
# Import urban population data for each country over the past 46 years.
# Source: World Bank.
df_pop = pd.read_csv('data/global_urban_population.csv')

# Convert the imported wide table into mergeable long data: one row per
# (country, column) pair, keyed as "<Country Name>_<column label>".
# Every column after the first is treated as a value column.
# The frame is built in one go rather than cell by cell: DataFrame.set_value
# is deprecated in pandas and growing an empty frame one cell at a time is
# very slow.
year_columns = df_pop.columns[1:]
z = pd.DataFrame(
    {
        'Country_Year': [
            country + "_" + str(year)
            for country in df_pop['Country Name']
            for year in year_columns
        ],
        # Row-major flattening gives the same country-then-column order as
        # the keys built above.
        'Urban_Pop': df_pop.iloc[:, 1:].values.ravel(),
    },
    columns=['Country_Year', 'Urban_Pop'],
)
Country_Year | Urban_Pop | |
0 | Aruba_1970 | 29900.0 |
1 | Aruba_1971 | 30082.0 |
2 | Aruba_1972 | 30275.0 |
3 | Aruba_1973 | 30470.0 |
4 | Aruba_1974 | 30605.0 |
# I found the data on global population and wanted to get an idea of whether global terrorism
# is also a sign of overpopulation. I needed to convert it into a mergeable form for the Tableau visualization.
# Read the oil price table, index it by its `year` column and draw it as a
# single black line with labelled axes.
oilprice = pd.read_csv('data/OilPrice.csv').set_index(['year'])
oilprice.plot(
    kind='line',
    color='black',
    figsize=(12, 8),
    fontsize=14,
)
plt.xlabel('Year 1970 - 2016', fontsize=14)
plt.ylabel('Price for Oil $/barrel', fontsize=14)
<matplotlib.text.Text at 0x1209e5550>
# List each region name next to the mean of its numeric `region` code.
# NOTE(review): presumably every name maps to exactly one code, so the mean
# simply recovers that code - confirm against the dataset codebook.
for region_name in df.region_txt.unique():
    region_code = df.region[df.region_txt == region_name].mean()
    # Single-argument print gives the same "name code" line on Python 2 and 3.
    print("%s %s" % (region_name, region_code))
Central America & Caribbean 2.0 North America 1.0 Southeast Asia 5.0 Western Europe 8.0 East Asia 4.0 South America 3.0 Eastern Europe 9.0 Sub-Saharan Africa 11.0 Middle East & North Africa 10.0 Australasia & Oceania 12.0 South Asia 6.0 Central Asia 7.0
Terror attacks are a ripe area of research for Bayesian inference. Given their infrequency, it is (thankfully) difficult for us to assume a high number of samples that approach some normal distribution. Because of this, we should construct a prior about the amount of terror a given area has seen and update that prior with new information (like a new year of attacks or a contrasting country from within the same region). You should compare two populations of your choosing using Bayesian inference. We want to know if the amount of terror one area has seen differs in a significant way from that of another area (or time period!). For example, if you are interested in knowing if one country in South America differs in a significant way from another area, you may make your prior assume that some country is a country in South America with μ average attacks and σ variation across South American countries. You would then update that prior with the information of a single country in South America as well as a separate country in South America. How significantly do the resulting posteriors differ? (An important assumption made here is that the time periods are being held constant, perhaps a single year.) You should structure your own test of populations rather than using the above example. If you're unable to set up a different test, brainstorm with your squad in the Slack chat. You must justify the prior you selected and interpret your results (use credible intervals). Remember, you can attempt to use different priors (but don't "prior hack" to affect your output!).
# I would go with Bombings/Explosions in Russia:
# attacktype1 == 3 is the "Bombing/Explosion" attack type, and country code
# 167 is taken to be Russia (TODO confirm against the dataset codebook).
dfba = df_.loc[(df_['attacktype1'] == 3) & (df_['country'] == 167)]
kind = 'bar',
title = "Number of Bomb Explosions in Russia",
fontsize = 14
# This is in line with the global trends and is fed by the price of the oil
# that they used to steal from pipelines.
<matplotlib.axes._subplots.AxesSubplot at 0x1191f7890>
import pymc3 as pm
import theano as thno
import theano.tensor as T
# Was terrorism in Dagestan and Chechnya of a different nature before
# Vladimir Putin became the president of Russia?
dfba_prior = dfba[dfba.year < 2000]
# ...and after Putin came to power.
dfba_post = dfba[dfba.year >= 2000]

# Yearly attack counts in `dfba` before 2000.  Years with no recorded attack
# do not appear in value_counts(), so they are not averaged in.
attacks_per_year_prior = dfba_prior.year.value_counts()
prior_mean = attacks_per_year_prior.mean()
prior_std = attacks_per_year_prior.std()
print(prior_mean)
print(prior_std)
19.7142857143 9.1417410033
def _yearly_attack_counts(frame, name_stem):
    """Return per-year attack counts for rows of ``frame`` matching ``name_stem``.

    A row matches when its ``provstate`` or its ``city`` contains
    ``name_stem``; missing names count as non-matches (``na=False``).  The
    result is the ``values`` array of ``year.value_counts()``, i.e. ordered
    by descending count rather than chronologically.
    """
    in_region = (
        frame.provstate.str.contains(name_stem, na=False)
        | frame.city.str.contains(name_stem, na=False)
    )
    return frame[in_region].year.value_counts().values


Dagestan = _yearly_attack_counts(dfba_post, 'Dages')
Chechnya = _yearly_attack_counts(dfba_post, 'Chech')
# Ingushetia must be matched on its own name; filtering on 'Chech' here would
# just duplicate the Chechnya counts.
Ingushet = _yearly_attack_counts(dfba_post, 'Ingush')
def _plot_attacks_per_year(name_stem):
    """Plot yearly attack counts for rows of ``df_`` whose province or city contains ``name_stem``.

    Returns whatever ``Series.plot()`` returns for the year-sorted counts.
    """
    matches = df_.provstate.str.contains(name_stem) | df_.city.str.contains(name_stem)
    yearly_counts = df_[matches].year.value_counts().sort_index()
    return yearly_counts.plot()


# One line per republic; the legend labels follow the plotting order.
D = _plot_attacks_per_year('Dages')
Ch = _plot_attacks_per_year('Chech')
In = _plot_attacks_per_year('Ingush')
plt.legend(['Dagestan', 'Chechnya', 'Ingushetia'], fontsize=14)
<matplotlib.legend.Legend at 0x132c243d0>
# Instantiating the model
# Bounds of the Uniform prior placed on each republic's standard deviation.
std_prior_lower = 0.01
std_prior_upper = 100.0

with pm.Model() as model:
    # Both republic means share the same Normal prior, parameterised by
    # prior_mean and prior_std.
    D_mean = pm.Normal('bombings_Dagestan_mean', mu=prior_mean, sd=prior_std)
    Ch_mean = pm.Normal('bombings_Chech_mean', mu=prior_mean, sd=prior_std)

    # Uniform priors on the standard deviations.
    D_std = pm.Uniform(
        'bombings_Dagestan_st',
        lower=std_prior_lower,
        upper=std_prior_upper,
    )
    Ch_std = pm.Uniform(
        'bombings_Chechnya_st',
        lower=std_prior_lower,
        upper=std_prior_upper,
    )

    # Normal likelihoods tied to the observed Dagestan / Chechnya arrays.
    grpD = pm.Normal('group_Dagestan', mu=D_mean, sd=D_std, observed=Dagestan)
    grpCh = pm.Normal('group_Chechnya', mu=Ch_mean, sd=Ch_std, observed=Chechnya)

    # Derived quantities tracked alongside the parameters (Dagestan minus Chechnya).
    diff_of_means = pm.Deterministic('difference of means', D_mean - Ch_mean)
    diff_of_stds = pm.Deterministic('difference of stds', D_std - Ch_std)
effect_size = pm.Deterministic(
'effect size',
# Sample from the model: 20,000 draws requested, with njobs=4.
# NOTE(review): njobs presumably means 4 parallel chains - confirm against
# the installed pymc3 version.
with model:
    trace = pm.sample(draws=20000, njobs=4)
Auto-assigning NUTS sampler... Initializing NUTS using ADVI... Average Loss = 156.93: 5%|▍ | 9088/200000 [00:00<00:14, 13039.87it/s] Convergence archived at 9900 Interrupted at 9,900 [4%]: Average Loss = 161.14 100%|██████████| 20500/20500 [00:59<00:00, 342.14it/s]
varnames = ['bombings_Dagestan_mean', 'bombings_Chech_mean',
'bombings_Dagestan_st', 'bombings_Chechnya_st'],
color = '#87ceeb')
array([<matplotlib.axes._subplots.AxesSubplot object at 0x14bcfd9d0>, <matplotlib.axes._subplots.AxesSubplot object at 0x14bfd8f10>, <matplotlib.axes._subplots.AxesSubplot object at 0x14bfa9210>, <matplotlib.axes._subplots.AxesSubplot object at 0x14cbc8a10>], dtype=object)
varnames=['difference of means', 'difference of stds', 'effect size'],
array([<matplotlib.axes._subplots.AxesSubplot object at 0x14e8cca90>, <matplotlib.axes._subplots.AxesSubplot object at 0x14c57dd90>, <matplotlib.axes._subplots.AxesSubplot object at 0x13fc91350>], dtype=object)
# Summarise the derived quantities, skipping the first 2,000 entries of the
# trace (presumably as burn-in).
pm.summary(
    trace[2000:],
    varnames=['difference of means', 'difference of stds', 'effect size'],
)
difference of means: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------- 1.665 6.430 0.032 [-10.943, 14.293] Posterior quantiles: 2.5 25 50 75 97.5 |--------------|==============|==============|--------------| -10.992 -2.635 1.685 5.971 14.256 difference of stds: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------- -1.997 5.947 0.022 [-14.170, 9.762] Posterior quantiles: 2.5 25 50 75 97.5 |--------------|==============|==============|--------------| -14.160 -5.608 -1.918 1.721 9.784 effect size: Mean SD MC Error 95% HPD interval ------------------------------------------------------------------- 0.056 0.210 0.001 [-0.351, 0.473] Posterior quantiles: 2.5 25 50 75 97.5 |--------------|==============|==============|--------------| -0.354 -0.086 0.056 0.197 0.470
from pymc3 import traceplot
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x14ccb8390>, <matplotlib.axes._subplots.AxesSubplot object at 0x14c096c10>], [<matplotlib.axes._subplots.AxesSubplot object at 0x14f829bd0>, <matplotlib.axes._subplots.AxesSubplot object at 0x13fab0b90>], [<matplotlib.axes._subplots.AxesSubplot object at 0x14a39b350>, <matplotlib.axes._subplots.AxesSubplot object at 0x1498e1b90>], [<matplotlib.axes._subplots.AxesSubplot object at 0x14ccf5610>, <matplotlib.axes._subplots.AxesSubplot object at 0x1494da5d0>], [<matplotlib.axes._subplots.AxesSubplot object at 0x144729290>, <matplotlib.axes._subplots.AxesSubplot object at 0x148d84750>], [<matplotlib.axes._subplots.AxesSubplot object at 0x14f7c4a90>, <matplotlib.axes._subplots.AxesSubplot object at 0x14f78db50>], [<matplotlib.axes._subplots.AxesSubplot object at 0x14f679c50>, <matplotlib.axes._subplots.AxesSubplot object at 0x14a6a7d10>]], dtype=object)