In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import datetime
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
sns.set_palette("tab10") # set color palette for seaborn
# stuff to make get request
import urllib.request
import json
def getResponse(url):
    """Fetch `url` and parse the response body as JSON.

    Parameters
    ----------
    url : str
        Full URL of the API endpoint to query.

    Returns
    -------
    The parsed JSON payload (dict or list).

    Raises
    ------
    RuntimeError
        If the server responds with a non-200 status code.
    """
    # Context manager guarantees the connection is closed even on error
    # (the original leaked the handle).
    with urllib.request.urlopen(url) as response:
        if response.getcode() == 200:
            return json.loads(response.read())
        # The original printed an error and then fell through to
        # `return jsonData`, raising UnboundLocalError; fail explicitly.
        raise RuntimeError("Error receiving data: HTTP %s" % response.getcode())
In [2]:
# make json request to get bike counts
# Eco-Counter public API, counter id 100117730, window 2014-12-17..2019-07-03.
# NOTE(review): step=4 presumably selects daily aggregation -- confirm against the API docs.
json_data = getResponse("http://www.eco-public.com/api/cw6Xk4jW4X4R/data/periode/100117730?begin=20141217&end=20190703&step=4")
In [3]:
# get bicycle counts into dataframe
counts = pd.DataFrame.from_dict(json_data, orient='columns')
counts['Date'] = pd.DatetimeIndex(counts['date'])
# .copy() makes `daily` an independent frame rather than a view of `counts`;
# without it, the later column assignments raise SettingWithCopyWarning
# (which the notebook then has to suppress).
daily = counts[['Date', 'comptage']].copy()
daily = daily.set_index('Date')  # avoid inplace= so the step is chain-friendly
daily.columns = ['Total']
In [4]:
# take a look at counts
import matplotlib.dates as mdates

# explicit figure/axes pair so the formatting calls have a clear target
fig, ax = plt.subplots()

# bar chart of the daily totals
ax.bar(daily.index.values, daily['Total'])

# one major tick per year, labelled with the 4-digit year
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

# set title and labels for axes
ax.set(xlabel="Date",
       ylabel="Bike Counts",
       title="Galloping Goose Bike Counts, 2015-2019");
In [5]:
# turn off warning because it's annoying
pd.set_option('mode.chained_assignment', None)

# add binary (0.0/1.0) indicator columns, one per day of the week
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
for day_num, day_name in enumerate(day_names):
    daily[day_name] = (daily.index.dayofweek == day_num).astype(float)
In [6]:
from datetime import date
import holidays
# Select country: Canadian statutory holidays over the study period
ca_holidays = holidays.Canada(years=[2015, 2016, 2017, 2018, 2019])
holiday_dates = pd.DatetimeIndex(ca_holidays.keys())
# create holiday indicator variable and join to daily bike data
daily = daily.join(pd.Series(1, index=holiday_dates, name='holiday')) # if holiday, code as 1
# Chained `fillna(..., inplace=True)` on a selected column is deprecated and
# becomes a silent no-op under pandas copy-on-write; assign back instead.
daily['holiday'] = daily['holiday'].fillna(0) # if NaN, code as 0
In [7]:
# add variable for hours of daylight
# this fn is taken from Python Data Science Handbook
def hours_of_daylight(date, axis=23.44, latitude=47.61):
    """Compute approximate hours of daylight for the given date.

    Parameters
    ----------
    date : datetime-like
        Day for which to compute daylight hours.
    axis : float, optional
        Earth's axial tilt in degrees.
    latitude : float, optional
        Latitude in degrees. NOTE(review): 47.61 is the handbook's Seattle
        value; the counter is on the Galloping Goose (Victoria, BC, ~48.4) --
        confirm whether this should be updated.

    Returns
    -------
    float
        Hours of daylight for `date`.
    """
    # pd.datetime was removed in pandas 1.0+; pd.Timestamp subtracts the
    # same way and yields a Timedelta with a .days attribute.
    days = (date - pd.Timestamp(2000, 12, 21)).days
    m = (1. - np.tan(np.radians(latitude)) *
         np.tan(np.radians(axis) * np.cos(days * 2 * np.pi / 365.25))
        )
    # clip keeps the arccos argument in [-1, 1] (polar day/night extremes)
    return 24. * np.degrees(np.arccos(1 - np.clip(m, 0, 2))) / 180
# evaluate the daylight function for every date in the index
daily['daylight_hrs'] = [hours_of_daylight(d) for d in daily.index]

# quick visual sanity check: should oscillate yearly between ~8 and ~16 hrs
daily[['daylight_hrs']].plot()
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15),
           fancybox=True, shadow=True, ncol=5);
In [8]:
# import weather data
def load_weather(year):
    """Read one year of daily weather observations into a DataFrame
    indexed by date (skiprows=24 skips the station-metadata preamble)."""
    return pd.read_csv('data/weather{}.csv'.format(year), skiprows=24,
                       index_col='Date/Time', parse_dates=True)

# same five module-level names as before, but one definition of the read
weather2015 = load_weather(2015)
weather2016 = load_weather(2016)
weather2017 = load_weather(2017)
weather2018 = load_weather(2018)
weather2019 = load_weather(2019)
In [23]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the yearly frames
# (default ignore_index=False preserves the date index, as before).
weather = pd.concat([weather2015, weather2016, weather2017,
                     weather2018, weather2019])
weather.index.names = ['Date']
weather.head()
Out[23]:
In [10]:
# inspect the available weather columns
list(weather.columns)
Out[10]:
In [11]:
# add weather data
# a day counts as "dry" only when recorded precipitation is exactly zero
# (missing precip values therefore code as not-dry)
is_dry = weather['Total Precip (mm)'] == 0
weather['dry_day'] = is_dry.astype(int)

# join weather to bike data
weather_cols = ['Mean Temp (°C)', 'Total Precip (mm)', 'dry_day']
daily = daily.join(weather[weather_cols])
In [13]:
# create variable to measure how many years have passed
# like a time trend
elapsed = daily.index - daily.index[0]
daily['annual'] = elapsed.days / 365
daily.tail()
Out[13]:
In [15]:
# IT'S MODEL TIME!
from sklearn.linear_model import LinearRegression

# drop any rows with null values (reassign rather than mutate in place)
daily = daily.dropna(axis=0, how='any')

# choose X variables
feature_cols = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'holiday',
                'daylight_hrs', 'Total Precip (mm)', 'dry_day',
                'Mean Temp (°C)', 'annual']
X = daily[feature_cols]
y = daily['Total']

# tutorial says don't need intercept since we have day of week vars
# but I find that if you don't then coefs are weird
model = LinearRegression(fit_intercept=True)
model.fit(X, y)
daily['predicted'] = model.predict(X)
In [16]:
# compare total and predicted bike traffic
# plot only 2018-2019
recent = daily[daily.index > '12-31-2017']
recent[['Total', 'predicted']].plot(alpha=0.5)
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.20),
           fancybox=True, shadow=True, ncol=5);
In [20]:
# look at coefficients
# label each coefficient by its feature name so the Series reads like a
# mini regression table (displayed as the cell's last expression)
params = pd.Series(model.coef_, index=X.columns)
params
Out[20]:
In [21]:
# get uncertainty of coefficients using bootstrap (resampling) method
from sklearn.utils import resample

np.random.seed(1991)  # make the resampling reproducible
# Re-fitting `model` in the loop (as the original did) leaves it trained on
# the *last* bootstrap resample, silently corrupting any later use of
# model.predict; fit a fresh estimator for each draw instead.
boot_coefs = [LinearRegression(fit_intercept=True).fit(*resample(X, y)).coef_
              for i in range(1000)]
err = np.std(boot_coefs, 0)

print(pd.DataFrame({'effect': params.round(0),
                    'error': err.round(0)}))