[1]:
import numpy as np
import pandas as pd
from sklearn.gaussian_process.kernels import Matern, RBF
from sklearn.cluster import KMeans
from docs.data_generation import gen_rbf_X, gen_matern_X, create_clus_split, gen_cov_mat
from docs.plotting_utils import gen_model_barplots
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm
/var/folders/06/s3csl9g94gx2ptsgwz8cpfvh0000gn/T/ipykernel_72302/1955058804.py:2: DeprecationWarning:
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
import pandas as pd
CV Corrections#
Simulations demonstrating sampling settings in which CV methods properly estimate MSE even when the data are correlated.
Setup#
[2]:
# Fix the global NumPy RNG seed so every realization below is reproducible.
np.random.seed(1)
[3]:
## number of realizations to run
niter = 100
## data generation parameters
gsize = 10  # side length of the square spatial domain [0, gsize]^2
n=60**2  # number of spatial locations (60 x 60 grid)
p=50  # number of covariates
s=50  # number of nonzero entries in beta (here fully dense since s == p)
delta = 0.75  # mixing weight between spatial noise and iid nugget
snr = 0.4  # target signal-to-noise ratio used to rescale the noise
tr_frac = .2  # fraction of locations drawn into the train/test pool
bloo_radius = gsize // 2  # exclusion radius for buffered leave-one-out CV
k = 10  # number of kmeans clusters / CV folds
noise_kernel = 'matern'  # 'rbf', 'matern'; anything else -> iid noise
noise_length_scale = 1.
noise_nu = .5  # Matern smoothness for the noise kernel
X_kernel = 'matern'  # spatial kernel used to generate covariates
X_length_scale = 5.
X_nu = 2.5
est_names = ["KFCV", "SPCV", "BLOOCV"]  # estimator labels for the barplots
[4]:
# Lay out an nx-by-ny regular grid of locations covering [0, gsize]^2 and
# keep both the flattened coordinate vectors and the stacked (n, 2) array.
nx = ny = int(np.sqrt(n))
xs = np.linspace(0, gsize, nx)
ys = np.linspace(0, gsize, ny)
mesh_x, mesh_y = np.meshgrid(xs, ys)
c_x = mesh_x.ravel()
c_y = mesh_y.ravel()
coord = np.column_stack([c_x, c_y])
[5]:
# Build the noise covariance from the configured spatial kernel.
if noise_kernel == 'rbf':
    Sigma_y = gen_cov_mat(c_x, c_y, RBF(length_scale=noise_length_scale))
elif noise_kernel == 'matern':
    Sigma_y = gen_cov_mat(c_x, c_y, Matern(length_scale=noise_length_scale, nu=noise_nu))
else:
    # fall back to independent noise
    Sigma_y = np.eye(n)
# covariance between observed Y and fresh Y* at the same sites (spatial part only)
Cov_y_ystar = delta*Sigma_y
# total noise covariance: delta * spatial kernel + (1 - delta) * iid nugget
Sigma_y = delta*Sigma_y + (1-delta)*np.eye(n)
if noise_kernel == 'rbf' or noise_kernel == 'matern':
    # Cholesky factor used below to draw correlated noise
    Chol_y = np.linalg.cholesky(Sigma_y)
else:
    # iid case: Sigma_y is the identity, so its factor is too
    Chol_y = np.eye(n)
Generate Gaussian X, Y#
[6]:
def gen_X():
    """Draw a fresh (n, p) covariate matrix according to the X_kernel setting."""
    if X_kernel == 'rbf':
        return gen_rbf_X(c_x, c_y, p)
    if X_kernel == 'matern':
        return gen_matern_X(c_x, c_y, p, length_scale=X_length_scale, nu=X_nu)
    # default: spatially independent Gaussian features
    return np.random.randn(n, p)
# One covariate realization plus a sparse coefficient vector.
X = gen_X()
beta = np.zeros(p)
# choose s active coordinates with Uniform(-1, 1) effect sizes
idx = np.random.choice(p,size=s,replace=False)
beta[idx] = np.random.uniform(-1,1,size=s)
[7]:
# Rescale the noise factor so sd(X beta) / sd(noise) matches the target snr.
Chol_y *= np.std(X@beta) / np.sqrt(snr)
# keep Sigma_y consistent with the rescaled Cholesky factor
Sigma_y = Chol_y @ Chol_y.T
[8]:
# One response realization: linear signal plus spatially correlated Gaussian noise.
Y = X @ beta + Chol_y @ np.random.randn(n)
Compute expected correction for one fold of CV vs expected correction for random sample#
[9]:
## assuming linear model, and bias approx 0
def computeCorrection(
    S,
    Sigma,
    tr_idx,
    ts_idx=None,
):
    """Expected MSE correction term for an OLS-style smoother.

    S : smoother matrix mapping training responses to test predictions,
        i.e. X_ts @ pinv(X_tr).
    Sigma : full n x n noise covariance.
    tr_idx, ts_idx : boolean masks over the n locations; ts_idx defaults
        to the complement of tr_idx.

    Returns (sum of test-point variances - 2 * tr(S @ Sigma_{tr,ts})),
    averaged over the number of test points.
    """
    if ts_idx is None:
        ts_idx = ~tr_idx
    # diagonal variances at the test locations
    test_var = Sigma[ts_idx, ts_idx].sum()
    cross_cov = Sigma[tr_idx, :][:, ts_idx]
    # np.diag(...).sum() also handles the 1-D S case (single test point)
    cross_term = np.diag(S @ cross_cov).sum()
    return (test_var - 2 * cross_term) / ts_idx.sum()
[10]:
def getDistance(c_x, c_y):
    """Pairwise Euclidean distance matrix for locations (c_x, c_y)."""
    pts = np.stack([c_x, c_y]).T
    sq_norms = np.sum(pts**2, axis=1)
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, expanded via broadcasting
    D2 = (-2 * pts.dot(pts.T) + sq_norms).T + sq_norms
    # enforce exact symmetry against floating-point asymmetry
    D2 = 0.5 * (D2 + D2.T)
    # roundoff sometimes yields tiny negatives like -1e-9; clamp before sqrt
    D2 = np.clip(D2, 0, None)
    return np.sqrt(D2)
def getBufferTrain(D, tr_bool, ts_idx, radius=None):
    """Restrict a training mask to points farther than `radius` from the test point.

    D : precomputed n x n pairwise distance matrix.
    tr_bool : boolean training mask over all n locations.
    ts_idx : integer index of the (single) held-out test location.
    radius : buffer distance; defaults to the notebook-level `bloo_radius`
        (parameterized here instead of hard-coding the global).

    Returns a boolean mask of training points strictly farther than `radius`
    from the test point; the test point itself is always excluded because
    D[ts_idx, ts_idx] == 0.
    """
    if radius is None:
        radius = bloo_radius  # fall back to the global default
    buffer_tr_bool = tr_bool & (D[ts_idx, :] > radius)
    return buffer_tr_bool
[11]:
# Precompute all pairwise distances once; reused by the buffered-LOO splits below.
D = getDistance(c_x, c_y)
[12]:
# Monte Carlo loop: correction terms under uniformly random train/test sampling.
kfcv_corr = np.zeros(niter)
spcv_corr = np.zeros(niter)
bloocv_corr = np.zeros(niter)
ns_corr = np.zeros(niter)  # correction for a fresh train/test split ("new sample")
sp_corr = np.zeros(niter)  # correction for a single random test point
for i in tqdm(range(niter)):
    # uniformly sample the working pool of tr_frac*n locations
    idxs = np.random.choice(n, size=int(tr_frac*n), replace=False)
    # KFCV-style split: first tr_frac fraction of the pool trains, the rest tests
    cv_tr_idx = idxs[:int(tr_frac*tr_frac*n)]
    cv_tr_bool = np.zeros(n, dtype=bool)
    cv_tr_bool[cv_tr_idx] = True
    cv_ts_idx = idxs[int(tr_frac*tr_frac*n):int(tr_frac*n)]
    cv_ts_bool = np.zeros(n, dtype=bool)
    cv_ts_bool[cv_ts_idx] = True
    # NOTE(review): tr_idx/ts_idx duplicate cv_tr_idx/cv_ts_idx exactly in this
    # cell; the duplication appears intentional to mirror the clustered cell below
    tr_idx = idxs[:int(tr_frac*tr_frac*n)]
    tr_bool = np.zeros(n, dtype=bool)
    tr_bool[tr_idx] = True
    ts_idx = idxs[int(tr_frac*tr_frac*n):int(tr_frac*n)]
    ts_bool = np.zeros(n, dtype=bool)
    ts_bool[ts_idx] = True
    ## SPCV: split the sampled pool into k spatial folds by kmeans; cluster k-1 tests
    groups = KMeans(n_init=10, n_clusters=k).fit(coord[idxs[:int(tr_frac*n)]]).labels_
    # groups = KMeans(n_init=10, n_clusters=5).fit(coord[idxs[:int(tr_frac*tr_frac*n)]]).labels_
    spcv_tr_idx = idxs[np.where(groups < k-1)[0]]
    spcv_tr_bool = np.zeros(n, dtype=bool)
    spcv_tr_bool[spcv_tr_idx] = True
    spcv_ts_idx = idxs[np.where(groups == k-1)[0]]
    spcv_ts_bool = np.zeros(n, dtype=bool)
    spcv_ts_bool[spcv_ts_idx] = True
    ## BLOOCV: pick one point for ts, tr is all far enough away
    bloocv_tr_idx = idxs[np.random.choice(len(idxs),size=len(idxs),replace=False)]
    bloocv_ts_idx = bloocv_tr_idx[0]  # scalar index of the held-out point
    bloocv_tr_bool = np.zeros(n, dtype=bool)
    bloocv_tr_bool[bloocv_tr_idx] = True
    # drop training points within bloo_radius of the test point
    bloocv_tr_bool = getBufferTrain(D, bloocv_tr_bool, bloocv_ts_idx)
    bloocv_ts_bool = np.zeros(n, dtype=bool)
    bloocv_ts_bool[bloocv_ts_idx] = True
    # X = gen_rbf_X(c_x, c_y, p)
    # X = gen_matern_X(c_x, c_y, p, length_scale=X_length_scale, nu=X_nu)
    # Y = X@beta + Chol_y @ np.random.randn(n)
    # fresh covariate draw each realization
    X = gen_X()
    # OLS smoother matrices S = X_ts @ pinv(X_tr) for each splitting scheme
    X_cv = X[cv_tr_idx,:]
    S_cv = X[cv_ts_idx,:] @ np.linalg.pinv(X_cv)
    X_spcv = X[spcv_tr_idx,:]
    S_spcv = X[spcv_ts_idx,:] @ np.linalg.pinv(X_spcv)
    X_bloocv = X[bloocv_tr_bool,:]
    S_bloocv = X[bloocv_ts_idx,:] @ np.linalg.pinv(X_bloocv)
    X_tr = X[tr_idx,:]
    S_tr = X[ts_idx,:] @ np.linalg.pinv(X_tr)
    # single random test point drawn from the test set
    sp_idx = np.random.choice(ts_idx,size=1)
    sp_bool = np.zeros(n, dtype=bool)
    sp_bool[sp_idx] = True
    S_sp = X[sp_idx,:] @ np.linalg.pinv(X_tr)
    kfcv_corr[i] = computeCorrection(S_cv, Sigma_y, cv_tr_bool, cv_ts_bool)
    spcv_corr[i] = computeCorrection(S_spcv, Sigma_y, spcv_tr_bool, spcv_ts_bool)
    bloocv_corr[i] = computeCorrection(S_bloocv, Sigma_y, bloocv_tr_bool, bloocv_ts_bool)
    ns_corr[i] = computeCorrection(S_tr, Sigma_y, tr_bool, ts_bool)
    sp_corr[i] = computeCorrection(S_sp, Sigma_y, tr_bool, sp_bool)
# collect the CV-style corrections for display
corrs = pd.DataFrame({
    'KFCV': kfcv_corr,
    'SPCV': spcv_corr,
    'BLOOCV': bloocv_corr,
})
100%|██████████| 100/100 [00:15<00:00, 6.29it/s]
[13]:
# Bar plot of the correction terms under uniform random sampling.
# NOTE(review): four series are passed but est_names holds three labels --
# presumably gen_model_barplots treats the first series (ns_corr) as the
# reference/baseline; confirm against its implementation.
fig = gen_model_barplots(
    [[ns_corr, kfcv_corr, spcv_corr, bloocv_corr]],
    [""],
    est_names,
    title="Train/Test Split: OLS Correction Term Comparisons",
    has_elev_err=False,
    err_bars=True,
    color_discrete_sequence=[px.colors.qualitative.Bold[i] for i in [1,2,3]],
)
fig.show()
Data type cannot be displayed: application/vnd.plotly.v1+json
Compute expected correction for one fold of CV vs expected correction for clustered sample#
[19]:
# Storage for the clustered-sampling replications (one entry per realization).
clus_kfcv_corr, clus_spcv_corr, clus_bloocv_corr, clus_ns_corr, clus_sp_corr = (
    np.zeros(niter) for _ in range(5)
)
# tr_frac /= 2
[20]:
# Monte Carlo loop: correction terms under spatially clustered train/test sampling.
for i in tqdm(range(niter)):
    # clustered sample: `idxs` is the sampled pool, `ts_idx` a clustered test set
    idxs, ts_idx = create_clus_split(
        nx,
        ny,
        tr_frac,
        ngrid=5,
        ts_frac=tr_frac-tr_frac*tr_frac,
        sort_grids=False,
    )
    ## randomize order for CV
    idxs = np.random.choice(idxs, size=len(idxs), replace=False)
    # KFCV-style split of the randomized pool
    cv_tr_idx = idxs[:int(tr_frac*len(idxs))]
    cv_tr_bool = np.zeros(n, dtype=bool)
    cv_tr_bool[cv_tr_idx] = True
    cv_ts_idx = idxs[int(tr_frac*len(idxs)):]
    cv_ts_bool = np.zeros(n, dtype=bool)
    cv_ts_bool[cv_ts_idx] = True
    tr_idx = idxs[:int(tr_frac*len(idxs))]
    tr_bool = np.zeros(n, dtype=bool)
    tr_bool[tr_idx] = True
    # BUGFIX: ts_bool must be re-zeroed every iteration. Previously the reset
    # was commented out, so ts_bool carried over from the uniform-sampling cell
    # and accumulated test indices across iterations, corrupting clus_ns_corr
    # and clus_sp_corr.
    ts_bool = np.zeros(n, dtype=bool)
    ts_bool[ts_idx] = True
    ## SPCV: split the sampled pool into k spatial folds by kmeans; cluster k-1 tests
    groups = KMeans(n_init=10, n_clusters=k).fit(coord[idxs]).labels_
    spcv_tr_idx = idxs[np.where(groups < k-1)[0]]
    spcv_tr_bool = np.zeros(n, dtype=bool)
    spcv_tr_bool[spcv_tr_idx] = True
    spcv_ts_idx = idxs[np.where(groups == k-1)[0]]
    spcv_ts_bool = np.zeros(n, dtype=bool)
    spcv_ts_bool[spcv_ts_idx] = True
    ## BLOOCV: pick one point for ts, tr is all far enough away
    bloocv_tr_idx = idxs[np.random.choice(len(idxs),size=len(idxs),replace=False)]
    bloocv_ts_idx = bloocv_tr_idx[0]  # scalar index of the held-out point
    bloocv_tr_bool = np.zeros(n, dtype=bool)
    bloocv_tr_bool[bloocv_tr_idx] = True
    # drop training points within bloo_radius of the test point
    bloocv_tr_bool = getBufferTrain(D, bloocv_tr_bool, bloocv_ts_idx)
    bloocv_ts_bool = np.zeros(n, dtype=bool)
    bloocv_ts_bool[bloocv_ts_idx] = True
    # fresh covariate draw each realization
    X = gen_X()
    # OLS smoother matrices S = X_ts @ pinv(X_tr) for each splitting scheme
    X_cv = X[cv_tr_idx,:]
    S_cv = X[cv_ts_idx,:] @ np.linalg.pinv(X_cv)
    X_spcv = X[spcv_tr_idx,:]
    S_spcv = X[spcv_ts_idx,:] @ np.linalg.pinv(X_spcv)
    X_bloocv = X[bloocv_tr_bool,:]
    S_bloocv = X[bloocv_ts_idx,:] @ np.linalg.pinv(X_bloocv)
    X_tr = X[tr_idx,:]
    S_tr = X[ts_idx,:] @ np.linalg.pinv(X_tr)
    # single random test point drawn from the clustered test set
    sp_idx = np.random.choice(ts_idx,size=1)
    sp_bool = np.zeros(n, dtype=bool)
    sp_bool[sp_idx] = True
    S_sp = X[sp_idx,:] @ np.linalg.pinv(X_tr)
    clus_kfcv_corr[i] = computeCorrection(S_cv, Sigma_y, cv_tr_bool, cv_ts_bool)
    clus_spcv_corr[i] = computeCorrection(S_spcv, Sigma_y, spcv_tr_bool, spcv_ts_bool)
    clus_bloocv_corr[i] = computeCorrection(S_bloocv, Sigma_y, bloocv_tr_bool, bloocv_ts_bool)
    clus_ns_corr[i] = computeCorrection(S_tr, Sigma_y, tr_bool, ts_bool)
    clus_sp_corr[i] = computeCorrection(S_sp, Sigma_y, tr_bool, sp_bool)
# collect the CV-style corrections for display
clus_corrs = pd.DataFrame({
    'KFCV': clus_kfcv_corr,
    'SPCV': clus_spcv_corr,
    'BLOOCV': clus_bloocv_corr,
})
100%|██████████| 100/100 [00:15<00:00, 6.48it/s]
[21]:
# Bar plot of the correction terms under clustered spatial sampling.
# NOTE(review): as in the uniform-sampling plot above, four series are passed
# but est_names holds three labels -- confirm gen_model_barplots' handling of
# the leading (clus_ns_corr) series.
fig = gen_model_barplots(
    [[clus_ns_corr, clus_kfcv_corr, clus_spcv_corr, clus_bloocv_corr]],
    [""],
    est_names,
    title="Spatial Train/Test Split: OLS Correction Term Comparisons",
    has_elev_err=False,
    err_bars=True,
    color_discrete_sequence=[px.colors.qualitative.Bold[i] for i in [1,2,3]],
)
fig.show()
Data type cannot be displayed: application/vnd.plotly.v1+json
[ ]: