Assignment #9 (demo). Time series analysis. Solution#
Author: Mariya Mansurova, Analyst & developer in Yandex.Metrics team. Translated by Ivan Zakharov, ML enthusiast.
This material is subject to the terms and conditions of the Creative Commons CC BY-NC-SA 4.0 license. Free use is permitted for any non-commercial purpose.
Same assignment as a Kaggle Notebook + solution.
In this assignment, we are using Prophet and ARIMA to analyze the number of views for a Wikipedia page on Machine Learning.
Fill cells marked with “Your code here” and submit your answers to the questions through the web form.
import warnings
warnings.filterwarnings("ignore")
import os
import numpy as np
import pandas as pd
import requests
from plotly import __version__
from plotly import graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot, plot
from IPython.display import display, IFrame
print(__version__) # need 1.9.0 or greater
init_notebook_mode(connected=True)
5.23.0
def plotly_df(df, title="", width=800, height=500):
"""Visualize all the dataframe columns as line plots."""
common_kw = dict(x=df.index, mode="lines")
data = [go.Scatter(y=df[c], name=c, **common_kw) for c in df.columns]
layout = dict(title=title)
fig = dict(data=data, layout=layout)
# in a Jupyter Notebook, the following should work
#iplot(fig, show_link=False)
# in a Jupyter Book, we save a plot offline and then render it with IFrame
plot_path = f"../../_static/plotly_htmls/{title}.html".replace(" ", "_")
plot(fig, filename=plot_path, show_link=False, auto_open=False);
display(IFrame(plot_path, width=width, height=height))
Data preparation#
# for Jupyter-book, we copy data from GitHub, locally, to save Internet traffic,
# you can specify the data/ folder from the root of your cloned
# https://github.com/Yorko/mlcourse.ai repo, to save Internet traffic
DATA_PATH = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"
df = pd.read_csv(DATA_PATH + "wiki_machine_learning.csv", sep=" ")
df = df[df["count"] != 0]
df.head()
date | count | lang | page | rank | month | title | |
---|---|---|---|---|---|---|---|
81 | 2015-01-01 | 1414 | en | Machine_learning | 8708 | 201501 | Machine_learning |
80 | 2015-01-02 | 1920 | en | Machine_learning | 8708 | 201501 | Machine_learning |
79 | 2015-01-03 | 1338 | en | Machine_learning | 8708 | 201501 | Machine_learning |
78 | 2015-01-04 | 1404 | en | Machine_learning | 8708 | 201501 | Machine_learning |
77 | 2015-01-05 | 2264 | en | Machine_learning | 8708 | 201501 | Machine_learning |
df.shape
(383, 7)
Predicting with FB Prophet#
We will train at first 5 months and predict the number of trips for June.
df.date = pd.to_datetime(df.date)
plotly_df(df=df.set_index("date")[["count"]], title="assign9_plot")
from prophet import Prophet
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[8], line 1
----> 1 from prophet import Prophet
File ~/Library/Caches/pypoetry/virtualenvs/mlcourse-ai-czBTtoES-py3.12/lib/python3.12/site-packages/prophet/__init__.py:7
1 # Copyright (c) 2017-present, Facebook, Inc.
2 # All rights reserved.
3 #
4 # This source code is licensed under the BSD-style license found in the
5 # LICENSE file in the root directory of this source tree. An additional grant
6 # of patent rights can be found in the PATENTS file in the same directory.
----> 7 from prophet.forecaster import Prophet
9 from pathlib import Path
10 about = {}
File ~/Library/Caches/pypoetry/virtualenvs/mlcourse-ai-czBTtoES-py3.12/lib/python3.12/site-packages/prophet/forecaster.py:28
25 logger.setLevel(logging.INFO)
26 NANOSECONDS_TO_SECONDS = 1000 * 1000 * 1000
---> 28 class Prophet(object):
29 """Prophet forecaster.
30
31 Parameters
(...)
79 holidays_mode: 'additive' or 'multiplicative'. Defaults to seasonality_mode.
80 """
82 def __init__(
83 self,
84 growth='linear',
(...)
101 holidays_mode=None,
102 ):
File ~/Library/Caches/pypoetry/virtualenvs/mlcourse-ai-czBTtoES-py3.12/lib/python3.12/site-packages/prophet/forecaster.py:459, in Prophet()
451 else:
452 self.changepoints_t = np.array([0]) # dummy changepoint
454 @staticmethod
455 def fourier_series(
456 dates: pd.Series,
457 period: Union[int, float],
458 series_order: int,
--> 459 ) -> NDArray[np.float_]:
460 """Provides Fourier series components with the specified frequency
461 and order.
462
(...)
471 Matrix with seasonality features.
472 """
473 if not (series_order >= 1):
File ~/Library/Caches/pypoetry/virtualenvs/mlcourse-ai-czBTtoES-py3.12/lib/python3.12/site-packages/numpy/__init__.py:400, in __getattr__(attr)
397 raise AttributeError(__former_attrs__[attr], name=None)
399 if attr in __expired_attributes__:
--> 400 raise AttributeError(
401 f"`np.{attr}` was removed in the NumPy 2.0 release. "
402 f"{__expired_attributes__[attr]}",
403 name=None
404 )
406 if attr == "chararray":
407 warnings.warn(
408 "`np.chararray` is deprecated and will be removed from "
409 "the main namespace in the future. Use an array with a string "
410 "or bytes dtype instead.", DeprecationWarning, stacklevel=2)
AttributeError: `np.float_` was removed in the NumPy 2.0 release. Use `np.float64` instead.
predictions = 30
df = df[["date", "count"]]
df.columns = ["ds", "y"]
df.tail()
train_df = df[:-predictions].copy()
m = Prophet()
m.fit(train_df);
future = m.make_future_dataframe(periods=predictions)
future.tail()
forecast = m.predict(future)
forecast.tail()
Question 1: What is the prediction of the number of views of the wiki page on January 20? Round to the nearest integer.
4947
3426 [+]
5229
2744
m.plot(forecast)
m.plot_components(forecast)
cmp_df = forecast.set_index("ds")[["yhat", "yhat_lower", "yhat_upper"]].join(
df.set_index("ds")
)
cmp_df["e"] = cmp_df["y"] - cmp_df["yhat"]
cmp_df["p"] = 100 * cmp_df["e"] / cmp_df["y"]
print("MAPE = ", round(np.mean(abs(cmp_df[-predictions:]["p"])), 2))
print("MAE = ", round(np.mean(abs(cmp_df[-predictions:]["e"])), 2))
Estimate the quality of the prediction with the last 30 points.
Question 2: What is MAPE equal to?
34.5 [+]
42.42
5.39
65.91
Question 3: What is MAE equal to?
355
4007
600 [+]
903
Predicting with ARIMA#
%matplotlib inline
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
plt.rcParams["figure.figsize"] = (15, 10)
Question 4: Let’s verify the stationarity of the series using the Dickey-Fuller test. Is the series stationary? What is the p-value?
Series is stationary, p_value = 0.107
Series is not stationary, p_value = 0.107 [+]
Series is stationary, p_value = 0.001
Series is not stationary, p_value = 0.001
sm.tsa.seasonal_decompose(train_df["y"].values, period=7).plot()
print("Dickey-Fuller test: p=%f" % sm.tsa.stattools.adfuller(train_df["y"])[1])
But the seasonally differentiated series will already be stationary.
train_df.set_index("ds", inplace=True)
train_df["y_diff"] = train_df.y - train_df.y.shift(7)
sm.tsa.seasonal_decompose(train_df.y_diff[7:].values, period=7).plot()
print("Dickey-Fuller test: p=%f" % sm.tsa.stattools.adfuller(train_df.y_diff[8:])[1])
ax = plt.subplot(211)
sm.graphics.tsa.plot_acf(train_df.y_diff[13:].values.squeeze(), lags=48, ax=ax)
ax = plt.subplot(212)
sm.graphics.tsa.plot_pacf(train_df.y_diff[13:].values.squeeze(), lags=48, ax=ax)
Initial values:
Q = 1
q = 3
P = 3
p = 1
ps = range(0, 2)
ds = range(0, 2)
qs = range(0, 4)
Ps = range(0, 4)
Ds = range(0, 3)
Qs = range(0, 2)
from itertools import product
parameters = product(ps, ds, qs, Ps, Ds, Qs)
parameters_list = list(parameters)
len(parameters_list)
%%time
import warnings
from tqdm.notebook import tqdm
results1 = []
best_aic = float("inf")
warnings.filterwarnings("ignore")
for param in tqdm(parameters_list):
# try except is necessary, because on some sets of parameters the model can not be trained
try:
model = sm.tsa.statespace.SARIMAX(
train_df["y"],
order=(param[0], param[1], param[2]),
seasonal_order=(param[3], param[4], param[5], 7),
# train the model as is even if that would lead to a non-stationary / non-invertible model
# see https://github.com/statsmodels/statsmodels/issues/6225 for details
).fit(disp=-1)
except (ValueError, np.linalg.LinAlgError):
continue
aic = model.aic
# save the best model, aic, parameters
if aic < best_aic:
best_model = model
best_aic = aic
best_param = param
results1.append([param, model.aic])
result_table1 = pd.DataFrame(results1)
result_table1.columns = ["parameters", "aic"]
print(result_table1.sort_values(by="aic", ascending=True).head())
If we consider the variants proposed in the form:
result_table1[
result_table1["parameters"].isin(
[(1, 0, 2, 3, 1, 0), (1, 1, 2, 3, 2, 1), (1, 1, 2, 3, 1, 1), (1, 0, 2, 3, 0, 0)]
)
].sort_values(by="aic")
Now do the same, but for the series with Box-Cox transformation.
import scipy.stats
train_df["y_box"], lmbda = scipy.stats.boxcox(train_df["y"])
print("The optimal Box-Cox transformation parameter: %f" % lmbda)
results2 = []
best_aic = float("inf")
for param in tqdm(parameters_list):
# try except is necessary, because on some sets of parameters the model can not be trained
try:
model = sm.tsa.statespace.SARIMAX(
train_df["y_box"],
order=(param[0], param[1], param[2]),
seasonal_order=(param[3], param[4], param[5], 7),
# train the model as is even if that would lead to a non-stationary / non-invertible model
# see https://github.com/statsmodels/statsmodels/issues/6225 for details
enforce_stationary=False,
enforce_invertibility=False
).fit(disp=-1)
except (ValueError, np.linalg.LinAlgError):
continue
aic = model.aic
# save the best model, aic, parameters
if aic < best_aic:
best_model = model
best_aic = aic
best_param = param
results2.append([param, model.aic])
warnings.filterwarnings("default")
result_table2 = pd.DataFrame(results2)
result_table2.columns = ["parameters", "aic"]
print(result_table2.sort_values(by="aic", ascending=True).head())
If we consider the variants proposed in the form:
result_table2[
result_table2["parameters"].isin(
[(1, 0, 2, 3, 1, 0), (1, 1, 2, 3, 2, 1), (1, 1, 2, 3, 1, 1), (1, 0, 2, 3, 0, 0)]
)
].sort_values(by="aic")
Next, we turn to the construction of the SARIMAX model (sm.tsa.statespace.SARIMAX
).
Question 5: What parameters are the best for the model according to the AIC
criterion?
D = 1, d = 0, Q = 0, q = 2, P = 3, p = 1
D = 2, d = 1, Q = 1, q = 2, P = 3, p = 1 [+]
D = 1, d = 1, Q = 1, q = 2, P = 3, p = 1
D = 0, d = 0, Q = 0, q = 2, P = 3, p = 1
Let’s look at the forecast of the best AIC model.
Note: any AIC below 3000 is suspicious, probably caused by non-convergence with MLE optimization, we’ll pick the 3rd-best model in terms of AIC to visualize predictions.
best_model = sm.tsa.statespace.SARIMAX(
train_df["y_box"],
order=(1, 0, 2),
seasonal_order=(3, 2, 1, 7),
enforce_stationary=False,
enforce_invertibility=False
).fit(disp=-1)
print(best_model.summary())
plt.subplot(211)
best_model.resid[13:].plot()
plt.ylabel(u"Residuals")
ax = plt.subplot(212)
sm.graphics.tsa.plot_acf(best_model.resid[13:].values.squeeze(), lags=48, ax=ax)
print("Student's test: p=%f" % stats.ttest_1samp(best_model.resid[13:], 0)[1])
print("Dickey-Fuller test: p=%f" % sm.tsa.stattools.adfuller(best_model.resid[13:])[1])
def invboxcox(y, lmbda):
# reverse Box Cox transformation
if lmbda == 0:
return np.exp(y)
else:
return np.exp(np.log(lmbda * y + 1) / lmbda)
train_df["arima_model"] = invboxcox(best_model.fittedvalues, lmbda)
train_df.y.tail(200).plot()
train_df.arima_model[13:].tail(200).plot(color="r")
plt.ylabel("wiki pageviews");