[1]:
import numpy as np
from sklearn.gaussian_process.kernels import Matern, RBF
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
import plotly
import plotly.express as px
from docs.mse_estimator import ErrorComparer
from docs.data_generation import gen_rbf_X, gen_matern_X, gen_cov_mat
from spe.tree import Tree
from spe.estimators import new_y_est, cp_arbitrary, kfoldcv, kmeanscv
[2]:
np.random.seed(1)
Estimated vs Oracle Covariance#
Here we investigate the effect of estimating the covariance matrix on the quality of estimates from spe.estimators.cp_arbitrary
.
[3]:
## number of realizations to run
niter = 100
## data generation parameters
gsize=20
n=20**2
p=5
s=5
delta = 0.75
snr = 0.4
tr_frac = .25
noise_kernel = 'matern'
noise_length_scale = 1.
noise_nu = .5
X_kernel = 'matern'
X_length_scale = 5.
X_nu = 2.5
## ErrorComparer parameters
alpha = .05
nboot = 100
k = 5
max_depth = 2
lambd = .05
models = [
Tree(max_depth=max_depth),
RidgeCV(alphas=[.01,.1, 1.]),
RandomForestRegressor(n_estimators=100, max_depth=max_depth)
]
ests = [
new_y_est,
new_y_est,
cp_arbitrary,
kfoldcv,
kmeanscv
]
est_kwargs = [
{'alpha': None,
'full_refit': False},
{'alpha': alpha,
'full_refit': False},
{'use_trace_corr': False,
'alpha': alpha},
{'k': k},
{'k': k}
]
model_names = [
"Decision Tree",
"Ridge CV",
"Random Forest"
]
est_names = ["OGenCp", "EGenCp", "KFCV", "SPCV"]
[4]:
err_cmp = ErrorComparer()
[5]:
nx = ny = int(np.sqrt(n))
xs = np.linspace(0, gsize, nx)
ys = np.linspace(0, gsize, ny)
c_x, c_y = np.meshgrid(xs, ys)
c_x = c_x.flatten()
c_y = c_y.flatten()
coord = np.stack([c_x, c_y]).T
[6]:
if noise_kernel == 'rbf':
Sigma_t = gen_cov_mat(c_x, c_y, RBF(length_scale=noise_length_scale))
elif noise_kernel == 'matern':
Sigma_t = gen_cov_mat(c_x, c_y, Matern(length_scale=noise_length_scale, nu=noise_nu))
else:
Sigma_t = np.eye(n)
Cov_y_ystar = delta*Sigma_t
Sigma_t = delta*Sigma_t + (1-delta)*np.eye(n)
if noise_kernel == 'rbf' or noise_kernel == 'matern':
Chol_y = np.linalg.cholesky(Sigma_t)
else:
Chol_y = np.eye(n)
[7]:
if X_kernel == 'rbf':
X = gen_rbf_X(c_x, c_y, p)
elif X_kernel == 'matern':
X = gen_matern_X(c_x, c_y, p, length_scale=X_length_scale, nu=X_nu)
else:
X = np.random.randn(n,p)
beta = np.zeros(p)
idx = np.random.choice(p,size=s,replace=False)
beta[idx] = np.random.uniform(-1,1,size=s)
[8]:
tr_idx = np.ones(n, dtype=bool)
Simulate \(Y, Y^* \overset{iid}{\sim} \mathcal{N}(\mu, \Sigma_Y)\)#
Estimate with True Covs#
[9]:
true_model_errs = []
for model in models:
errs = err_cmp.compare(
model,
ests[:2] + ests[3:],
est_kwargs[:2] + est_kwargs[3:],
niter=niter,
n=n,
p=p,
s=s,
snr=snr,
X=X,
beta=beta,
coord=coord,
Chol_y=Chol_y,
Chol_ystar=None,
Cov_y_ystar=None,
tr_idx=tr_idx,
fair=False,
piecewise_const_mu=True,
)
true_model_errs.append(errs)
0%| | 0/100 [00:00<?, ?it/s]
100%|██████████| 100/100 [00:03<00:00, 27.67it/s]
100%|██████████| 100/100 [00:04<00:00, 24.70it/s]
100%|██████████| 100/100 [01:31<00:00, 1.09it/s]
Estimate with Matching Model#
[10]:
from sklearn.base import clone
[11]:
model_errs = []
for model in models:
errs = err_cmp.compare(
model,
ests[2:3],
est_kwargs[2:3],
niter=niter,
n=n,
p=p,
s=s,
snr=snr,
X=X,
beta=beta,
coord=coord,
Chol_y=Chol_y,
Chol_ystar=None,
Cov_y_ystar=None,
tr_idx=tr_idx,
fair=False,
piecewise_const_mu=True,
est_sigma=True,
est_sigma_model=clone(model),
)
model_errs.append(errs)
100%|██████████| 100/100 [02:13<00:00, 1.34s/it]
100%|██████████| 100/100 [02:14<00:00, 1.35s/it]
100%|██████████| 100/100 [18:43<00:00, 11.23s/it]
[12]:
comb_errs = []
for true_err, model_err in zip(true_model_errs, model_errs):
comb_errs.append(true_err[:2] + model_err + true_err[2:])
[13]:
from importlib import reload
from docs import plotting_utils
reload(plotting_utils)
from docs.plotting_utils import gen_model_barplots
[14]:
plotly.offline.init_notebook_mode()
fig = gen_model_barplots(
comb_errs,
model_names,
est_names,
title="Comparing Performance: Estimated vs Oracle Covariance NSN",
err_bars=True,
color_discrete_sequence=[px.colors.qualitative.Bold[i] for i in [0,5,1,2]],
fig_name="est_cov_ind",
)
fig.show()
Simulate \(\begin{pmatrix} Y \\ Y^* \end{pmatrix} \sim \mathcal{N}\left(\begin{pmatrix} \mu \\ \mu \end{pmatrix}, \begin{pmatrix}\Sigma_Y & \Sigma_{Y, Y^*} \\ \Sigma_{Y^*, Y} & \Sigma_{Y} \end{pmatrix}\right)\)#
Estimate with True Covs#
[15]:
true_corr_model_errs = []
for model in models:
errs = err_cmp.compare(
model,
ests[:2] + ests[3:],
est_kwargs[:2] + est_kwargs[3:],
niter=niter,
n=n,
p=p,
s=s,
snr=snr,
X=X,
beta=beta,
coord=coord,
Chol_y=Chol_y,
Chol_ystar=None,
Cov_y_ystar=Cov_y_ystar,
tr_idx=tr_idx,
fair=False,
piecewise_const_mu=True,
)
true_corr_model_errs.append(errs)
100%|██████████| 100/100 [00:07<00:00, 13.71it/s]
100%|██████████| 100/100 [00:04<00:00, 20.02it/s]
100%|██████████| 100/100 [01:45<00:00, 1.06s/it]
Estimate with Matching Model#
[16]:
corr_model_errs = []
for model in models:
errs = err_cmp.compare(
model,
ests[2:3],
est_kwargs[2:3],
niter=niter,
n=n,
p=p,
s=s,
snr=snr,
X=X,
beta=beta,
coord=coord,
Chol_y=Chol_y,
Chol_ystar=None,
Cov_y_ystar=Cov_y_ystar,
tr_idx=tr_idx,
fair=False,
piecewise_const_mu=True,
est_sigma="corr_resp",
est_sigma_model=clone(model),
)
corr_model_errs.append(errs)
100%|██████████| 100/100 [02:40<00:00, 1.61s/it]
100%|██████████| 100/100 [02:21<00:00, 1.42s/it]
100%|██████████| 100/100 [15:44<00:00, 9.45s/it]
[17]:
comb_errs_corr = []
for true_err, model_err in zip(true_corr_model_errs, corr_model_errs):
comb_errs_corr.append(true_err[:2] + model_err + true_err[2:])
[18]:
corr_fig = gen_model_barplots(
comb_errs_corr,
model_names,
est_names,
title="Comparing Performance: Estimated vs Oracle Covariance SSN",
err_bars=True,
color_discrete_sequence=[px.colors.qualitative.Bold[i] for i in [0,5,1,2]],
fig_name="est_cov_corr",
)
corr_fig.show()
[ ]: