diff --git a/fill_missing.ipynb b/fill_missing.ipynb
new file mode 100644
index 0000000..94151bf
--- /dev/null
+++ b/fill_missing.ipynb
@@ -0,0 +1,42091 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "using CSV, DataFrames, Statistics, LinearAlgebra, Dates, Gadfly, Colors, StatsBase\n",
+ "using Interpolations\n",
+ "using Random\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "rmse (generic function with 1 method)"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Root-mean-square error between a predicted series and its target.\n",
+ "# Both arguments are subtracted as-is, so they must have matching shapes.\n",
+ "function rmse(predicted, target)\n",
+ "    squared_errors = (predicted - target) .^ 2\n",
+ "    return sqrt(mean(squared_errors))\n",
+ "end"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Warning: `head(df::AbstractDataFrame)` is deprecated, use `first(df, 6)` instead.\n",
+ "│ caller = top-level scope at In[3]:2\n",
+ "└ @ Core In[3]:2\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
| quarter | stock | date | open | high | low | close | volume | change |
---|
| Int64 | String | Date | Float64 | Float64 | Float64 | Float64⍰ | Int64 | Float64⍰ |
---|
6 rows × 9 columns
1 | 1 | AA | 2011-01-07 | 15.82 | 16.72 | 15.78 | 16.42 | 239655616 | 3.79267 |
---|
2 | 1 | AA | 2011-01-14 | 16.71 | 16.71 | 15.64 | 15.97 | 242963398 | -4.42849 |
---|
3 | 1 | AA | 2011-01-21 | 16.19 | 16.38 | 15.6 | 15.79 | 138428495 | -2.47066 |
---|
4 | 1 | AA | 2011-01-28 | 15.87 | 16.63 | 15.82 | 16.13 | 151379173 | 1.63831 |
---|
5 | 1 | AA | 2011-02-04 | 16.18 | 17.39 | 16.18 | 17.14 | 154387761 | 5.93325 |
---|
6 | 1 | AA | 2011-02-11 | 17.33 | 17.48 | 16.97 | 17.37 | 114691279 | 0.230814 |
---|
"
+ ],
+ "text/latex": [
+ "\\begin{tabular}{r|ccccccccc}\n",
+ "\t& quarter & stock & date & open & high & low & close & volume & change\\\\\n",
+ "\t\\hline\n",
+ "\t& Int64 & String & Date & Float64 & Float64 & Float64 & Float64⍰ & Int64 & Float64⍰\\\\\n",
+ "\t\\hline\n",
+ "\t1 & 1 & AA & 2011-01-07 & 15.82 & 16.72 & 15.78 & 16.42 & 239655616 & 3.79267 \\\\\n",
+ "\t2 & 1 & AA & 2011-01-14 & 16.71 & 16.71 & 15.64 & 15.97 & 242963398 & -4.42849 \\\\\n",
+ "\t3 & 1 & AA & 2011-01-21 & 16.19 & 16.38 & 15.6 & 15.79 & 138428495 & -2.47066 \\\\\n",
+ "\t4 & 1 & AA & 2011-01-28 & 15.87 & 16.63 & 15.82 & 16.13 & 151379173 & 1.63831 \\\\\n",
+ "\t5 & 1 & AA & 2011-02-04 & 16.18 & 17.39 & 16.18 & 17.14 & 154387761 & 5.93325 \\\\\n",
+ "\t6 & 1 & AA & 2011-02-11 & 17.33 & 17.48 & 16.97 & 17.37 & 114691279 & 0.230814 \\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/plain": [
+ "6×9 DataFrame. Omitted printing of 2 columns\n",
+ "│ Row │ quarter │ stock │ date │ open │ high │ low │ close │\n",
+ "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mDate\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64⍰\u001b[39m │\n",
+ "├─────┼─────────┼────────┼────────────┼─────────┼─────────┼─────────┼──────────┤\n",
+ "│ 1 │ 1 │ AA │ 2011-01-07 │ 15.82 │ 16.72 │ 15.78 │ 16.42 │\n",
+ "│ 2 │ 1 │ AA │ 2011-01-14 │ 16.71 │ 16.71 │ 15.64 │ 15.97 │\n",
+ "│ 3 │ 1 │ AA │ 2011-01-21 │ 16.19 │ 16.38 │ 15.6 │ 15.79 │\n",
+ "│ 4 │ 1 │ AA │ 2011-01-28 │ 15.87 │ 16.63 │ 15.82 │ 16.13 │\n",
+ "│ 5 │ 1 │ AA │ 2011-02-04 │ 16.18 │ 17.39 │ 16.18 │ 17.14 │\n",
+ "│ 6 │ 1 │ AA │ 2011-02-11 │ 17.33 │ 17.48 │ 16.97 │ 17.37 │"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the stock table and preview its first six rows.\n",
+ "# `head(df)` is deprecated in DataFrames; `first(df, 6)` is the documented replacement.\n",
+ "train = CSV.read(\"stock_data.csv\")\n",
+ "first(train, 6)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ " | quarter | stock | date | open | high | low | close | volume | change |
---|
| Int64 | String | Date | Float64 | Float64 | Float64 | Float64⍰ | Int64 | Float64⍰ |
---|
25 rows × 9 columns
1 | 1 | DIS | 2011-01-07 | 37.74 | 40.0 | 37.62 | 39.45 | 72917621 | 4.531 |
---|
2 | 1 | DIS | 2011-01-14 | 39.01 | 39.81 | 38.92 | 39.29 | 31943413 | 0.717765 |
---|
3 | 1 | DIS | 2011-01-21 | 39.07 | 39.94 | 38.51 | missing | 36187032 | missing |
---|
4 | 1 | DIS | 2011-01-28 | 39.64 | 39.95 | 38.65 | 38.85 | 42131642 | -1.99294 |
---|
5 | 1 | DIS | 2011-02-04 | 39.04 | 40.77 | 38.64 | 40.71 | 53521486 | 4.27766 |
---|
6 | 1 | DIS | 2011-02-11 | 40.8 | 44.05 | 40.71 | 43.41 | 83975520 | 6.39706 |
---|
7 | 1 | DIS | 2011-02-18 | 43.19 | 43.9 | 42.98 | 43.56 | 33868302 | 0.85668 |
---|
8 | 1 | DIS | 2011-02-25 | 42.83 | 43.28 | 41.6 | missing | 56966763 | missing |
---|
9 | 1 | DIS | 2011-03-04 | 43.02 | 44.34 | 42.97 | 43.55 | 53096584 | 1.23199 |
---|
10 | 1 | DIS | 2011-03-11 | 43.53 | 43.61 | 42.16 | 42.93 | 41229388 | -1.37836 |
---|
11 | 1 | DIS | 2011-03-18 | 42.64 | 42.67 | 40.42 | 41.23 | 65060004 | -3.30675 |
---|
12 | 1 | DIS | 2011-03-25 | 41.46 | 43.24 | 40.87 | 42.97 | 40696371 | 3.64206 |
---|
13 | 2 | DIS | 2011-04-01 | 43.19 | 43.48 | 42.36 | 42.85 | 34130223 | -0.787219 |
---|
14 | 2 | DIS | 2011-04-08 | 42.87 | 43.05 | 41.4 | 41.76 | 39055258 | -2.58922 |
---|
15 | 2 | DIS | 2011-04-15 | 41.76 | 42.02 | 40.84 | missing | 40820710 | missing |
---|
16 | 2 | DIS | 2011-04-21 | 40.97 | 42.46 | 40.45 | missing | 24312043 | missing |
---|
17 | 2 | DIS | 2011-04-29 | 42.34 | 43.35 | 41.71 | 43.1 | 30122091 | 1.79499 |
---|
18 | 2 | DIS | 2011-05-06 | 43.47 | 43.79 | 42.3 | missing | 35369886 | missing |
---|
19 | 2 | DIS | 2011-05-13 | 43.32 | 44.12 | 40.94 | 41.52 | 78855110 | -4.15512 |
---|
20 | 2 | DIS | 2011-05-20 | 41.26 | 41.84 | 40.68 | 41.5 | 50988843 | 0.581677 |
---|
21 | 2 | DIS | 2011-05-27 | 41.15 | 41.52 | 40.55 | 41.52 | 34651349 | 0.899149 |
---|
22 | 2 | DIS | 2011-06-03 | 41.9 | 41.99 | 39.17 | 39.38 | 52169227 | -6.01432 |
---|
23 | 2 | DIS | 2011-06-10 | 39.18 | 39.88 | 38.42 | 38.5 | 49099516 | -1.73558 |
---|
24 | 2 | DIS | 2011-06-17 | 38.63 | 38.86 | 37.77 | 38.04 | 54513950 | -1.52731 |
---|
25 | 2 | DIS | 2011-06-24 | 37.85 | 38.89 | 37.19 | 37.58 | 50409504 | -0.713342 |
---|
"
+ ],
+ "text/latex": [
+ "\\begin{tabular}{r|ccccccccc}\n",
+ "\t& quarter & stock & date & open & high & low & close & volume & change\\\\\n",
+ "\t\\hline\n",
+ "\t& Int64 & String & Date & Float64 & Float64 & Float64 & Float64⍰ & Int64 & Float64⍰\\\\\n",
+ "\t\\hline\n",
+ "\t1 & 1 & DIS & 2011-01-07 & 37.74 & 40.0 & 37.62 & 39.45 & 72917621 & 4.531 \\\\\n",
+ "\t2 & 1 & DIS & 2011-01-14 & 39.01 & 39.81 & 38.92 & 39.29 & 31943413 & 0.717765 \\\\\n",
+ "\t3 & 1 & DIS & 2011-01-21 & 39.07 & 39.94 & 38.51 & & 36187032 & \\\\\n",
+ "\t4 & 1 & DIS & 2011-01-28 & 39.64 & 39.95 & 38.65 & 38.85 & 42131642 & -1.99294 \\\\\n",
+ "\t5 & 1 & DIS & 2011-02-04 & 39.04 & 40.77 & 38.64 & 40.71 & 53521486 & 4.27766 \\\\\n",
+ "\t6 & 1 & DIS & 2011-02-11 & 40.8 & 44.05 & 40.71 & 43.41 & 83975520 & 6.39706 \\\\\n",
+ "\t7 & 1 & DIS & 2011-02-18 & 43.19 & 43.9 & 42.98 & 43.56 & 33868302 & 0.85668 \\\\\n",
+ "\t8 & 1 & DIS & 2011-02-25 & 42.83 & 43.28 & 41.6 & & 56966763 & \\\\\n",
+ "\t9 & 1 & DIS & 2011-03-04 & 43.02 & 44.34 & 42.97 & 43.55 & 53096584 & 1.23199 \\\\\n",
+ "\t10 & 1 & DIS & 2011-03-11 & 43.53 & 43.61 & 42.16 & 42.93 & 41229388 & -1.37836 \\\\\n",
+ "\t11 & 1 & DIS & 2011-03-18 & 42.64 & 42.67 & 40.42 & 41.23 & 65060004 & -3.30675 \\\\\n",
+ "\t12 & 1 & DIS & 2011-03-25 & 41.46 & 43.24 & 40.87 & 42.97 & 40696371 & 3.64206 \\\\\n",
+ "\t13 & 2 & DIS & 2011-04-01 & 43.19 & 43.48 & 42.36 & 42.85 & 34130223 & -0.787219 \\\\\n",
+ "\t14 & 2 & DIS & 2011-04-08 & 42.87 & 43.05 & 41.4 & 41.76 & 39055258 & -2.58922 \\\\\n",
+ "\t15 & 2 & DIS & 2011-04-15 & 41.76 & 42.02 & 40.84 & & 40820710 & \\\\\n",
+ "\t16 & 2 & DIS & 2011-04-21 & 40.97 & 42.46 & 40.45 & & 24312043 & \\\\\n",
+ "\t17 & 2 & DIS & 2011-04-29 & 42.34 & 43.35 & 41.71 & 43.1 & 30122091 & 1.79499 \\\\\n",
+ "\t18 & 2 & DIS & 2011-05-06 & 43.47 & 43.79 & 42.3 & & 35369886 & \\\\\n",
+ "\t19 & 2 & DIS & 2011-05-13 & 43.32 & 44.12 & 40.94 & 41.52 & 78855110 & -4.15512 \\\\\n",
+ "\t20 & 2 & DIS & 2011-05-20 & 41.26 & 41.84 & 40.68 & 41.5 & 50988843 & 0.581677 \\\\\n",
+ "\t21 & 2 & DIS & 2011-05-27 & 41.15 & 41.52 & 40.55 & 41.52 & 34651349 & 0.899149 \\\\\n",
+ "\t22 & 2 & DIS & 2011-06-03 & 41.9 & 41.99 & 39.17 & 39.38 & 52169227 & -6.01432 \\\\\n",
+ "\t23 & 2 & DIS & 2011-06-10 & 39.18 & 39.88 & 38.42 & 38.5 & 49099516 & -1.73558 \\\\\n",
+ "\t24 & 2 & DIS & 2011-06-17 & 38.63 & 38.86 & 37.77 & 38.04 & 54513950 & -1.52731 \\\\\n",
+ "\t25 & 2 & DIS & 2011-06-24 & 37.85 & 38.89 & 37.19 & 37.58 & 50409504 & -0.713342 \\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/plain": [
+ "25×9 SubDataFrame. Omitted printing of 2 columns\n",
+ "│ Row │ quarter │ stock │ date │ open │ high │ low │ close │\n",
+ "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mDate\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64⍰\u001b[39m │\n",
+ "├─────┼─────────┼────────┼────────────┼─────────┼─────────┼─────────┼──────────┤\n",
+ "│ 1 │ 1 │ DIS │ 2011-01-07 │ 37.74 │ 40.0 │ 37.62 │ 39.45 │\n",
+ "│ 2 │ 1 │ DIS │ 2011-01-14 │ 39.01 │ 39.81 │ 38.92 │ 39.29 │\n",
+ "│ 3 │ 1 │ DIS │ 2011-01-21 │ 39.07 │ 39.94 │ 38.51 │ \u001b[90mmissing\u001b[39m │\n",
+ "│ 4 │ 1 │ DIS │ 2011-01-28 │ 39.64 │ 39.95 │ 38.65 │ 38.85 │\n",
+ "│ 5 │ 1 │ DIS │ 2011-02-04 │ 39.04 │ 40.77 │ 38.64 │ 40.71 │\n",
+ "│ 6 │ 1 │ DIS │ 2011-02-11 │ 40.8 │ 44.05 │ 40.71 │ 43.41 │\n",
+ "│ 7 │ 1 │ DIS │ 2011-02-18 │ 43.19 │ 43.9 │ 42.98 │ 43.56 │\n",
+ "│ 8 │ 1 │ DIS │ 2011-02-25 │ 42.83 │ 43.28 │ 41.6 │ \u001b[90mmissing\u001b[39m │\n",
+ "│ 9 │ 1 │ DIS │ 2011-03-04 │ 43.02 │ 44.34 │ 42.97 │ 43.55 │\n",
+ "│ 10 │ 1 │ DIS │ 2011-03-11 │ 43.53 │ 43.61 │ 42.16 │ 42.93 │\n",
+ "⋮\n",
+ "│ 15 │ 2 │ DIS │ 2011-04-15 │ 41.76 │ 42.02 │ 40.84 │ \u001b[90mmissing\u001b[39m │\n",
+ "│ 16 │ 2 │ DIS │ 2011-04-21 │ 40.97 │ 42.46 │ 40.45 │ \u001b[90mmissing\u001b[39m │\n",
+ "│ 17 │ 2 │ DIS │ 2011-04-29 │ 42.34 │ 43.35 │ 41.71 │ 43.1 │\n",
+ "│ 18 │ 2 │ DIS │ 2011-05-06 │ 43.47 │ 43.79 │ 42.3 │ \u001b[90mmissing\u001b[39m │\n",
+ "│ 19 │ 2 │ DIS │ 2011-05-13 │ 43.32 │ 44.12 │ 40.94 │ 41.52 │\n",
+ "│ 20 │ 2 │ DIS │ 2011-05-20 │ 41.26 │ 41.84 │ 40.68 │ 41.5 │\n",
+ "│ 21 │ 2 │ DIS │ 2011-05-27 │ 41.15 │ 41.52 │ 40.55 │ 41.52 │\n",
+ "│ 22 │ 2 │ DIS │ 2011-06-03 │ 41.9 │ 41.99 │ 39.17 │ 39.38 │\n",
+ "│ 23 │ 2 │ DIS │ 2011-06-10 │ 39.18 │ 39.88 │ 38.42 │ 38.5 │\n",
+ "│ 24 │ 2 │ DIS │ 2011-06-17 │ 38.63 │ 38.86 │ 37.77 │ 38.04 │\n",
+ "│ 25 │ 2 │ DIS │ 2011-06-24 │ 37.85 │ 38.89 │ 37.19 │ 37.58 │"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Group the table by ticker once, then pull out the two series examined below.\n",
+ "stock_groups = groupby(train, :stock)\n",
+ "mcd = stock_groups[(stock=\"MCD\",)]\n",
+ "dis = stock_groups[(stock=\"DIS\",)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ "\n",
+ "\n"
+ ],
+ "text/html": [
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ "Plot(...)"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Rows of MCD with no closing price; each one is drawn as a band covering the 7 days up to its date.\n",
+ "mcd_missing = filter(row -> ismissing(row[:close]), mcd)\n",
+ "mcd_missing[:,:start_date] .= mcd_missing[:,:date] - Dates.Day(7)\n",
+ "\n",
+ "plot(\n",
+ "    mcd, x=:date, y=:close, Geom.line,\n",
+ "    layer(mcd_missing, xmin=:start_date, xmax=:date, Geom.vband),\n",
+ "    Guide.title(\"Missing Values in MCD\"),\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ "\n",
+ "\n"
+ ],
+ "text/html": [
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ "Plot(...)"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Rows of DIS with no closing price; each one is drawn as a band covering the 7 days up to its date.\n",
+ "dis_missing = filter(row -> ismissing(row[:close]), dis)\n",
+ "dis_missing[:,:start_date] .= dis_missing[:,:date] - Dates.Day(7)\n",
+ "\n",
+ "plot(\n",
+ "    dis, x=:date, y=:close, Geom.line,\n",
+ "    layer(dis_missing, xmin=:start_date, xmax=:date, Geom.vband),\n",
+ "    Guide.title(\"Missing Values in DIS\"),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "mean: 41.084999999999994\n",
+ "variance: 3.94446842105263\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Mean and variance of the DIS closing prices, taken over rows with no missing fields.\n",
+ "y_dis = dropmissing(dis)[!, :close]\n",
+ "println(\"mean: $(mean(y_dis))\")\n",
+ "println(\"variance: $(var(y_dis))\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "mean: 77.092\n",
+ "variance: 9.911711578947369\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Mean and variance of the MCD closing prices, taken over rows with no missing fields.\n",
+ "y_mcd = dropmissing(mcd)[!, :close]\n",
+ "println(\"mean: $(mean(y_mcd))\")\n",
+ "println(\"variance: $(var(y_mcd))\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We will try to fill in the missing data using common time-series imputation methods. For each of MCD and DIS we take the stock that is most correlated with it, introduce missing values at the same locations as in the original series,\n",
+ "and compute the RMSE of each fill against the true values."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# DIS BA Pair"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ "\n",
+ "\n"
+ ],
+ "text/html": [
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ "Plot(...)"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Complete BA series (date and close only); it is the ground truth for the BA experiments below.\n",
+ "ba_actual = select(groupby(train, :stock)[(stock=\"BA\",)], [:date, :close])\n",
+ "plot(ba_actual, x=:date, y=:close, Geom.line, Guide.title(\"BA actual\"))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ "\n",
+ "\n"
+ ],
+ "text/html": [
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ "Plot(...)"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Blank out the BA closes at the rows where DIS has gaps (3, 8, 15, 16, 18); ba_actual stays intact.\n",
+ "ba_missing = copy(ba_actual);\n",
+ "allowmissing!(ba_missing)\n",
+ "for gap_row in (3, 8, 15, 16, 18)\n",
+ "    ba_missing.close[gap_row] = missing\n",
+ "end\n",
+ "\n",
+ "plot(ba_missing, x=:date, y=:close, Geom.line, Guide.title(\"BA Missing\"))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ "\n",
+ "\n"
+ ],
+ "text/html": [
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ "Plot(...)"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## mean fill\n",
+ "# Every gap receives the mean of the observed closes; the result is scored against ba_actual.\n",
+ "ba_mean = copy(ba_missing)\n",
+ "ba_fill_value = mean(skipmissing(ba_mean[!, :close]))\n",
+ "recode!(ba_mean[!, :close], missing => ba_fill_value);\n",
+ "rmse_error = rmse(ba_mean[!, :close], ba_actual[!, :close])\n",
+ "plot(ba_mean, x=:date, y=:close, Geom.line, Guide.title(\"BA mean fill with $rmse_error \"))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ "\n",
+ "\n"
+ ],
+ "text/html": [
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ "Plot(...)"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## random fill \n",
+ "# Each gap gets its own uniform draw from [min_ba, max_ba] of the observed closes.\n",
+ "# The previous `rand(min_ba:max_ba)` sampled a unit-step range (min, min+1, ...) and reused\n",
+ "# one draw for every gap.\n",
+ "Random.seed!(42) # fixed seed so the reported RMSE is reproducible\n",
+ "ba_random = copy(ba_missing)\n",
+ "max_ba = maximum(skipmissing(ba_missing[!, :close]))\n",
+ "min_ba = minimum(skipmissing(ba_missing[!, :close]))\n",
+ "\n",
+ "ba_gap_rows = findall(ismissing, ba_random[!, :close])\n",
+ "ba_random.close[ba_gap_rows] = min_ba .+ (max_ba - min_ba) .* rand(length(ba_gap_rows))\n",
+ "rmse_error = rmse(ba_random[!, :close], ba_actual[!, :close])\n",
+ "plot(ba_random, x=:date, y=:close, Geom.line, Guide.title(\"BA random fill with $rmse_error \"))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## linear interpolation \n",
+ "# Build a linear interpolant over the observed BA closes, keyed by row number.\n",
+ "\n",
+ "ba_interpolation = copy(ba_missing)\n",
+ "ba_interpolation_ = copy(ba_missing) # copy that keeps the gaps; a later cell fills it from itp\n",
+ "ba_interpolation = hcat(ba_interpolation, collect(1:size(ba_interpolation,1))); # row-number column (read back as :x1) serves as our x\n",
+ "# Drop the gap rows so only observed (row number, close) pairs remain as knots.\n",
+ "ba_interpolation= dropmissing(ba_interpolation)\n",
+ "# Flat() extrapolation is requested for queries outside the knot range.\n",
+ "itp = LinearInterpolation(ba_interpolation[!, :x1], ba_interpolation[!, :close],extrapolation_bc=Flat());\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fill each gap in ba_interpolation_ with the interpolant evaluated at that row number.\n",
+ "dates = filter(row -> ismissing(row[:close]), ba_interpolation_)[:, :date]\n",
+ "\n",
+ "for date in dates\n",
+ "    ind = findfirst(ba_interpolation_[:,:date] .== date) # row number of this gap\n",
+ "    ba_interpolation_[ind, :close] = itp(ind)\n",
+ "end\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ "\n",
+ "\n"
+ ],
+ "text/html": [
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ "Plot(...)"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Score the interpolated series against the true BA closes and plot it.\n",
+ "rmse_error = rmse(ba_interpolation_.close, ba_actual.close)\n",
+ "plot(ba_interpolation_, x=:date, y=:close, Geom.line, Guide.title(\"BA linear interpolation fill with $rmse_error \"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Warning: `range(start, stop)` (with neither `length` nor `step` given) is deprecated, use `range(start, stop=stop)` instead.\n",
+ "│ caller = top-level scope at In[25]:10\n",
+ "└ @ Core ./In[25]:10\n"
+ ]
+ },
+ {
+ "data": {
+ "image/svg+xml": [
+ "\n",
+ "\n"
+ ],
+ "text/html": [
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ "Plot(...)"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# rolling mean fill aka. Moving average\n",
+ "# Each gap is replaced by the mean of the observed closes within +- windowsize rows of it.\n",
+ "\n",
+ "ba_ma = copy(ba_missing)\n",
+ "windowsize = 4 # half-width of the window, in rows, on each side of the gap\n",
+ "n_rows_ba = size(ba_ma, 1)\n",
+ "\n",
+ "for ind in findall(ismissing, ba_missing[!, :close])\n",
+ "    # Clamp both ends so the window never leaves the table, and index the full series\n",
+ "    # (not a dropmissing copy, whose rows are shifted) so the window stays centred on `ind`.\n",
+ "    window = max(ind - windowsize, 1):min(ind + windowsize, n_rows_ba)\n",
+ "    # Only observed values contribute; a window with no observed value would make mean fail.\n",
+ "    ba_ma[ind, :close] = mean(skipmissing(ba_missing[window, :close]))\n",
+ "end\n",
+ "\n",
+ "\n",
+ "rmse_error = rmse(ba_ma[!, :close], ba_actual[!, :close])\n",
+ "plot(ba_ma, x=:date, y=:close, Geom.line, Guide.title(\"BA Moving average window size $windowsize with error: $rmse_error \"))\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# MCD KO Pair"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ "\n",
+ "\n"
+ ],
+ "text/html": [
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ "Plot(...)"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Complete KO series (date and close only); it is the ground truth for the KO experiments below.\n",
+ "ko_actual = select(groupby(train, :stock)[(stock=\"KO\",)], [:date, :close])\n",
+ "plot(ko_actual, x=:date, y=:close, Geom.line, Guide.title(\"KO actual\"))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ "\n",
+ "\n"
+ ],
+ "text/html": [
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ "Plot(...)"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Blank out the KO closes at rows 12, 14, 17, 18, 21 (presumably the MCD gap positions; verify against mcd).\n",
+ "# Start from the KO series itself: copying ba_actual here would score BA prices against KO.\n",
+ "ko_missing = copy(ko_actual);\n",
+ "allowmissing!(ko_missing) \n",
+ "ko_missing.close[12] = missing\n",
+ "ko_missing.close[14] = missing\n",
+ "ko_missing.close[17] = missing\n",
+ "ko_missing.close[18] = missing\n",
+ "ko_missing.close[21] = missing\n",
+ "\n",
+ "plot(ko_missing, x=:date, y=:close, Geom.line, Guide.title(\"KO Missing\"))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ "\n",
+ "\n"
+ ],
+ "text/html": [
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ "Plot(...)"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## mean fill\n",
+ "# Every gap receives the mean of the observed closes; the result is scored against ko_actual.\n",
+ "ko_mean = copy(ko_missing)\n",
+ "ko_fill_value = mean(skipmissing(ko_mean[!, :close]))\n",
+ "recode!(ko_mean[!, :close], missing => ko_fill_value);\n",
+ "rmse_error = rmse(ko_mean[!, :close], ko_actual[!, :close])\n",
+ "plot(ko_mean, x=:date, y=:close, Geom.line, Guide.title(\"KO mean fill with $rmse_error \"))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ "\n",
+ "\n"
+ ],
+ "text/html": [
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ "Plot(...)"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## random fill \n",
+ "# Each gap gets its own uniform draw from [min_ko, max_ko] of the observed closes.\n",
+ "# The previous `rand(min_ko:max_ko)` sampled a unit-step range (min, min+1, ...) and reused\n",
+ "# one draw for every gap.\n",
+ "Random.seed!(42) # fixed seed so the reported RMSE is reproducible\n",
+ "ko_random = copy(ko_missing)\n",
+ "max_ko = maximum(skipmissing(ko_missing[!, :close]))\n",
+ "min_ko = minimum(skipmissing(ko_missing[!, :close]))\n",
+ "\n",
+ "ko_gap_rows = findall(ismissing, ko_random[!, :close])\n",
+ "ko_random.close[ko_gap_rows] = min_ko .+ (max_ko - min_ko) .* rand(length(ko_gap_rows))\n",
+ "rmse_error = rmse(ko_random[!, :close], ko_actual[!, :close])\n",
+ "plot(ko_random, x=:date, y=:close, Geom.line, Guide.title(\"KO random fill with $rmse_error \"))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## linear interpolation \n",
+ "# Build a linear interpolant over the observed KO closes, keyed by row number.\n",
+ "\n",
+ "ko_interpolation = copy(ko_missing)\n",
+ "ko_interpolation_ = copy(ko_missing) # copy that keeps the gaps; a later cell fills it from itp\n",
+ "ko_interpolation = hcat(ko_interpolation, collect(1:size(ko_interpolation,1))); # row-number column (read back as :x1) serves as our x\n",
+ "# Keep only the observed KO rows as knots. This must drop from the KO table; dropping from\n",
+ "# ba_interpolation would build the interpolant out of BA's rows instead.\n",
+ "ko_interpolation = dropmissing(ko_interpolation)\n",
+ "itp = LinearInterpolation(ko_interpolation[!, :x1], ko_interpolation[!, :close],extrapolation_bc=Flat());\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fill each gap in ko_interpolation_ with the interpolant evaluated at that row number.\n",
+ "dates = filter(row -> ismissing(row[:close]), ko_interpolation_)[:, :date]\n",
+ "\n",
+ "for date in dates\n",
+ "    ind = findfirst(ko_interpolation_[:,:date] .== date) # row number of this gap\n",
+ "    ko_interpolation_[ind, :close] = itp(ind)\n",
+ "end\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/svg+xml": [
+ "\n",
+ "\n"
+ ],
+ "text/html": [
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ "Plot(...)"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Score the interpolated series against the true KO closes and plot it.\n",
+ "rmse_error = rmse(ko_interpolation_.close, ko_actual.close)\n",
+ "plot(ko_interpolation_, x=:date, y=:close, Geom.line, Guide.title(\"KO linear interpolation fill with $rmse_error \"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "┌ Warning: `range(start, stop)` (with neither `length` nor `step` given) is deprecated, use `range(start, stop=stop)` instead.\n",
+ "│ caller = top-level scope at In[48]:10\n",
+ "└ @ Core ./In[48]:10\n"
+ ]
+ },
+ {
+ "data": {
+ "image/svg+xml": [
+ "\n",
+ "\n"
+ ],
+ "text/html": [
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ "Plot(...)"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# rolling mean fill aka. Moving average\n",
+ "# Each gap is replaced by the mean of the observed closes within +- windowsize rows of it.\n",
+ "\n",
+ "ko_ma = copy(ko_missing)\n",
+ "windowsize = 4 # half-width of the window, in rows, on each side of the gap\n",
+ "n_rows_ko = size(ko_ma, 1)\n",
+ "\n",
+ "for ind in findall(ismissing, ko_missing[!, :close])\n",
+ "    # Average the KO series itself (this cell previously averaged the BA table, ba_ma).\n",
+ "    # Clamp both ends so the window never leaves the table, and index the full series\n",
+ "    # (not a dropmissing copy, whose rows are shifted) so the window stays centred on `ind`.\n",
+ "    window = max(ind - windowsize, 1):min(ind + windowsize, n_rows_ko)\n",
+ "    # Only observed values contribute; a window with no observed value would make mean fail.\n",
+ "    ko_ma[ind, :close] = mean(skipmissing(ko_missing[window, :close]))\n",
+ "end\n",
+ "\n",
+ "\n",
+ "rmse_error = rmse(ko_ma[!, :close], ko_actual[!, :close])\n",
+ "plot(ko_ma, x=:date, y=:close, Geom.line, Guide.title(\"KO Moving average window size $windowsize with error: $rmse_error \"))\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The MCD/KO pair has a high RMSE because the missing values fall in the same place, which increases the uncertainty about the direction of the time series. That is why these methods do not work well here, and we need to use our explanatory variable to gain more certainty."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Julia 1.3.1",
+ "language": "julia",
+ "name": "julia-1.3"
+ },
+ "language_info": {
+ "file_extension": ".jl",
+ "mimetype": "application/julia",
+ "name": "julia",
+ "version": "1.3.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}