In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import datetime
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
sns.set_palette("tab10") # set color palette for seaborn
# stuff to make get request
import urllib.request
import json
def getResponse(url):
    """Fetch `url` and parse the response body as JSON.

    Parameters
    ----------
    url : str
        Full URL of the API endpoint to query.

    Returns
    -------
    The parsed JSON payload (dict or list).

    Raises
    ------
    RuntimeError
        If the server responds with a non-200 status code.
    """
    # Context manager guarantees the connection is closed even on error
    # (the original leaked the handle).
    with urllib.request.urlopen(url) as response:
        if response.getcode() == 200:
            return json.loads(response.read())
        # The original printed an error and then fell through to
        # `return jsonData`, raising UnboundLocalError; fail explicitly.
        raise RuntimeError("Error receiving data: HTTP %s" % response.getcode())
In [2]:
# make json request to get bike counts
# Eco-Counter public API, counter id 100117730, window 2014-12-17..2019-07-03.
# NOTE(review): step=4 presumably selects daily aggregation -- confirm against the API docs.
json_data = getResponse("http://www.eco-public.com/api/cw6Xk4jW4X4R/data/periode/100117730?begin=20141217&end=20190703&step=4")
In [3]:
# get bicycle counts into dataframe
counts = pd.DataFrame.from_dict(json_data, orient='columns')
counts['Date'] = pd.DatetimeIndex(counts['date'])
# .copy() makes `daily` an independent frame rather than a view of `counts`;
# without it, the later column assignments raise SettingWithCopyWarning
# (which the notebook then has to suppress).
daily = counts[['Date', 'comptage']].copy()
daily = daily.set_index('Date')  # avoid inplace= so the step is chain-friendly
daily.columns = ['Total']
In [4]:
# take a look at counts
import matplotlib.dates as mdates

# explicit figure/axes pair so the formatting calls have a clear target
fig, ax = plt.subplots()

# bar chart of the daily totals
ax.bar(daily.index.values, daily['Total'])

# one major tick per year, labelled with the 4-digit year
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

# set title and labels for axes
ax.set(xlabel="Date",
       ylabel="Bike Counts",
       title="Galloping Goose Bike Counts, 2015-2019");
In [5]:
# turn off warning because it's annoying
pd.set_option('mode.chained_assignment', None)

# add binary (0.0/1.0) indicator columns, one per day of the week
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
for day_num, day_name in enumerate(day_names):
    daily[day_name] = (daily.index.dayofweek == day_num).astype(float)
In [6]:
from datetime import date
import holidays
# Select country: Canadian statutory holidays over the study period
ca_holidays = holidays.Canada(years=[2015, 2016, 2017, 2018, 2019])
holiday_dates = pd.DatetimeIndex(ca_holidays.keys())
# create holiday indicator variable and join to daily bike data
daily = daily.join(pd.Series(1, index=holiday_dates, name='holiday')) # if holiday, code as 1
# Chained `fillna(..., inplace=True)` on a selected column is deprecated and
# becomes a silent no-op under pandas copy-on-write; assign back instead.
daily['holiday'] = daily['holiday'].fillna(0) # if NaN, code as 0
In [7]:
# add variable for hours of daylight
# this fn is taken from Python Data Science Handbook
def hours_of_daylight(date, axis=23.44, latitude=47.61):
    """Compute approximate hours of daylight for the given date.

    Parameters
    ----------
    date : datetime-like
        Day for which to compute daylight hours.
    axis : float, optional
        Earth's axial tilt in degrees.
    latitude : float, optional
        Latitude in degrees. NOTE(review): 47.61 is the handbook's Seattle
        value; the counter is on the Galloping Goose (Victoria, BC, ~48.4) --
        confirm whether this should be updated.

    Returns
    -------
    float
        Hours of daylight for `date`.
    """
    # pd.datetime was removed in pandas 1.0+; pd.Timestamp subtracts the
    # same way and yields a Timedelta with a .days attribute.
    days = (date - pd.Timestamp(2000, 12, 21)).days
    m = (1. - np.tan(np.radians(latitude)) *
         np.tan(np.radians(axis) * np.cos(days * 2 * np.pi / 365.25))
        )
    # clip keeps the arccos argument in [-1, 1] (polar day/night extremes)
    return 24. * np.degrees(np.arccos(1 - np.clip(m, 0, 2))) / 180
# evaluate the daylight function for every date in the index
daily['daylight_hrs'] = [hours_of_daylight(d) for d in daily.index]

# quick visual sanity check: should oscillate yearly between ~8 and ~16 hrs
daily[['daylight_hrs']].plot()
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15),
           fancybox=True, shadow=True, ncol=5);
In [8]:
# import weather data
def load_weather(year):
    """Read one year of daily weather observations into a DataFrame
    indexed by date (skiprows=24 skips the station-metadata preamble)."""
    return pd.read_csv('data/weather{}.csv'.format(year), skiprows=24,
                       index_col='Date/Time', parse_dates=True)

# same five module-level names as before, but one definition of the read
weather2015 = load_weather(2015)
weather2016 = load_weather(2016)
weather2017 = load_weather(2017)
weather2018 = load_weather(2018)
weather2019 = load_weather(2019)
In [23]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the yearly frames
# (default ignore_index=False preserves the date index, as before).
weather = pd.concat([weather2015, weather2016, weather2017,
                     weather2018, weather2019])
weather.index.names = ['Date']
weather.head()
Out[23]:
In [10]:
# inspect the available weather columns
list(weather.columns)
Out[10]:
In [11]:
# add weather data
# a day counts as "dry" only when recorded precipitation is exactly zero
# (missing precip values therefore code as not-dry)
is_dry = weather['Total Precip (mm)'] == 0
weather['dry_day'] = is_dry.astype(int)

# join weather to bike data
weather_cols = ['Mean Temp (°C)', 'Total Precip (mm)', 'dry_day']
daily = daily.join(weather[weather_cols])
In [13]:
# create variable to measure how many years have passed
# like a time trend
elapsed = daily.index - daily.index[0]
daily['annual'] = elapsed.days / 365
daily.tail()
Out[13]:
In [15]:
# IT'S MODEL TIME!
from sklearn.linear_model import LinearRegression

# drop any rows with null values (reassign rather than mutate in place)
daily = daily.dropna(axis=0, how='any')

# choose X variables
feature_cols = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'holiday',
                'daylight_hrs', 'Total Precip (mm)', 'dry_day',
                'Mean Temp (°C)', 'annual']
X = daily[feature_cols]
y = daily['Total']

# tutorial says don't need intercept since we have day of week vars
# but I find that if you don't then coefs are weird
model = LinearRegression(fit_intercept=True)
model.fit(X, y)
daily['predicted'] = model.predict(X)
In [16]:
# compare total and predicted bike traffic
# plot only 2018-2019
recent = daily[daily.index > '12-31-2017']
recent[['Total', 'predicted']].plot(alpha=0.5)
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.20),
           fancybox=True, shadow=True, ncol=5);
In [20]:
# look at coefficients
# label each coefficient by its feature name so the Series reads like a
# mini regression table (displayed as the cell's last expression)
params = pd.Series(model.coef_, index=X.columns)
params
Out[20]:
In [21]:
# get uncertainty of coefficients using bootstrap (resampling) method
from sklearn.utils import resample

np.random.seed(1991)  # make the resampling reproducible
# Re-fitting `model` in the loop (as the original did) leaves it trained on
# the *last* bootstrap resample, silently corrupting any later use of
# model.predict; fit a fresh estimator for each draw instead.
boot_coefs = [LinearRegression(fit_intercept=True).fit(*resample(X, y)).coef_
              for i in range(1000)]
err = np.std(boot_coefs, 0)

print(pd.DataFrame({'effect': params.round(0),
                    'error': err.round(0)}))