[1]:
import numpy as np
import pandas as pd

from sklearn.gaussian_process.kernels import Matern, RBF
from sklearn.cluster import KMeans

from docs.data_generation import gen_rbf_X, gen_matern_X, create_clus_split, gen_cov_mat
from docs.plotting_utils import gen_model_barplots

import matplotlib.pyplot as plt
import plotly.express as px

from tqdm import tqdm
/var/folders/06/s3csl9g94gx2ptsgwz8cpfvh0000gn/T/ipykernel_72302/1955058804.py:2: DeprecationWarning:
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466

  import pandas as pd

CV Corrections#

Simulations demonstrating sampling settings where CV methods properly estimate MSE even when the data are correlated.

Setup#

[2]:
# Fix the global NumPy RNG so every realization below is reproducible.
np.random.seed(1)
[3]:
## number of realizations to run
niter = 100

## data generation parameters
gsize = 10                 # side length of the square spatial domain (coords span [0, gsize])
n=60**2                    # sample size: one observation per node of a 60 x 60 grid
p=50                       # number of covariates
s=50                       # number of nonzero coefficients (here s == p, i.e. fully dense)
delta = 0.75               # mixing weight: spatially correlated vs iid noise (see Sigma_y cell)
snr = 0.4                  # target signal-to-noise ratio used to rescale Chol_y below
tr_frac = .2               # fraction of the n grid points sampled for each realization
bloo_radius = gsize // 2   # buffer radius used by getBufferTrain for buffered-LOO CV
k = 10                     # number of KMeans clusters (spatial CV folds)

## kernel family + hyperparameters for the spatially correlated noise
noise_kernel = 'matern'
noise_length_scale = 1.
noise_nu = .5

## kernel family + hyperparameters for the covariate field X
X_kernel = 'matern'
X_length_scale = 5.
X_nu = 2.5

## estimator labels used by the bar plots below
est_names = ["KFCV", "SPCV", "BLOOCV"]
[4]:
## Lay out the n observation locations on an evenly spaced nx-by-ny grid
## covering [0, gsize] x [0, gsize]; flatten to per-point coordinate arrays.
nx = ny = int(np.sqrt(n))
xs = np.linspace(0, gsize, nx)
ys = np.linspace(0, gsize, ny)
grid_x, grid_y = np.meshgrid(xs, ys)
c_x = grid_x.flatten()
c_y = grid_y.flatten()
coord = np.column_stack([c_x, c_y])   # (n, 2) array of point locations
[5]:
## Covariance matrix of the spatially correlated noise over all grid points.
if noise_kernel == 'rbf':
    Sigma_y = gen_cov_mat(c_x, c_y, RBF(length_scale=noise_length_scale))
elif noise_kernel == 'matern':
    Sigma_y = gen_cov_mat(c_x, c_y, Matern(length_scale=noise_length_scale, nu=noise_nu))
else:
    ## any other kernel name falls back to independent (white) noise
    Sigma_y = np.eye(n)

## NOTE(review): per the name, presumably the cross-covariance between observed
## Y and fresh Y* — computed from the pure kernel matrix BEFORE the iid nugget
## is mixed in below, so the ordering of these two statements matters.
Cov_y_ystar = delta*Sigma_y
## Mix correlated and iid noise: delta is the spatially correlated share.
Sigma_y = delta*Sigma_y + (1-delta)*np.eye(n)

## Cholesky factor used to draw correlated noise (identity in the iid case).
if noise_kernel == 'rbf' or noise_kernel == 'matern':
    Chol_y = np.linalg.cholesky(Sigma_y)
else:
    Chol_y = np.eye(n)

Generate Gaussian X, Y#

[6]:
def gen_X():
    """Draw a fresh n-by-p covariate matrix according to the configured X_kernel.

    Uses the module-level grid coordinates (c_x, c_y) and the X kernel
    hyperparameters; any unrecognized kernel name yields iid Gaussian columns.
    """
    if X_kernel == 'rbf':
        return gen_rbf_X(c_x, c_y, p)
    if X_kernel == 'matern':
        return gen_matern_X(c_x, c_y, p, length_scale=X_length_scale, nu=X_nu)
    return np.random.randn(n,p)

X = gen_X()

## Sparse linear coefficients: s of the p entries are nonzero, drawn U(-1, 1).
beta = np.zeros(p)
support = np.random.choice(p,size=s,replace=False)
beta[support] = np.random.uniform(-1,1,size=s)
[7]:
## Rescale the noise so the signal-to-noise ratio matches the target snr.
## NOTE(review): this mutates Chol_y in place — re-running this cell without
## re-running the cell that builds Chol_y applies the scaling a second time.
Chol_y *= np.std(X@beta) / np.sqrt(snr)
Sigma_y = Chol_y @ Chol_y.T  # keep Sigma_y consistent with the rescaled factor
[8]:
## One realization of the response: linear signal plus correlated Gaussian noise.
Y = X @ beta + Chol_y @ np.random.randn(n)

Compute expected correction for one fold of CV vs expected correction for a random sample#

[9]:
## assuming linear model, and bias approx 0
def computeCorrection(
    S,
    Sigma,
    tr_idx,
    ts_idx=None,
):
    """Average MSE correction term for an OLS-style predictor.

    S      : (n_ts, n_tr) matrix mapping training responses to test predictions.
    Sigma  : (n, n) noise covariance over all points.
    tr_idx : boolean mask of training points.
    ts_idx : boolean mask of test points; defaults to the complement of tr_idx.
    """
    if ts_idx is None:
        ts_idx = ~tr_idx

    ## test-point noise variances, summed
    ts_var = Sigma[ts_idx,ts_idx].sum()
    ## train/test noise covariance seen through the prediction operator
    cross = np.trace(S @ Sigma[tr_idx,:][:,ts_idx])
    return (ts_var - 2*cross) / ts_idx.sum()
[10]:
def getDistance(c_x, c_y):
    """Pairwise Euclidean distance matrix between points (c_x[i], c_y[i]).

    Uses the identity ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, then cleans up
    floating-point artifacts before taking the square root.
    """
    Loc = np.stack([c_x, c_y]).T
    sq_norms = np.sum(Loc**2, axis=1)
    D = sq_norms[:, None] + sq_norms[None, :] - 2 * (Loc @ Loc.T)
    D = 0.5 * (D + D.T)      # enforce exact symmetry
    D = np.maximum(D, 0)     # clip tiny negatives like -1e-9 from round-off
    return np.sqrt(D)

def getBufferTrain(D, tr_bool, ts_idx, radius=None):
    """Restrict a training mask to points farther than `radius` from the test point.

    D       : (n, n) pairwise distance matrix.
    tr_bool : boolean training mask of length n.
    ts_idx  : integer index of the held-out test point.
    radius  : exclusion radius; defaults to the notebook-level `bloo_radius`.
              (Generalized from the original hard-coded global so the buffer
              size can be varied per call; default preserves old behavior.)
    """
    if radius is None:
        radius = bloo_radius
    buffer_tr_bool = tr_bool & (D[ts_idx,:] > radius)
    return buffer_tr_bool
[11]:
## Pairwise distances between all n grid points (used for the BLOO buffer).
D = getDistance(c_x, c_y)
[12]:
## Per-iteration correction terms for each CV scheme, plus two baselines:
##   ns_corr = correction on the full train/test split,
##   sp_corr = correction for a single randomly chosen test point.
kfcv_corr = np.zeros(niter)
spcv_corr = np.zeros(niter)
bloocv_corr = np.zeros(niter)
ns_corr = np.zeros(niter)
sp_corr = np.zeros(niter)

for i in tqdm(range(niter)):
    ## uniformly random sample of tr_frac*n grid points for this realization
    idxs = np.random.choice(n, size=int(tr_frac*n), replace=False)

    ## K-fold-style split: first tr_frac of the sample trains, the rest tests
    cv_tr_idx = idxs[:int(tr_frac*tr_frac*n)]
    cv_tr_bool = np.zeros(n, dtype=bool)
    cv_tr_bool[cv_tr_idx] = True

    cv_ts_idx = idxs[int(tr_frac*tr_frac*n):int(tr_frac*n)]
    cv_ts_bool = np.zeros(n, dtype=bool)
    cv_ts_bool[cv_ts_idx] = True

    ## baseline train/test split — same indices as the CV split above
    tr_idx = idxs[:int(tr_frac*tr_frac*n)]
    tr_bool = np.zeros(n, dtype=bool)
    tr_bool[tr_idx] = True

    ts_idx = idxs[int(tr_frac*tr_frac*n):int(tr_frac*n)]
    ts_bool = np.zeros(n, dtype=bool)
    ts_bool[ts_idx] = True

    ## split into k spatial folds by kmeans; fold k-1 is held out as test
    groups = KMeans(n_init=10, n_clusters=k).fit(coord[idxs[:int(tr_frac*n)]]).labels_
    # groups = KMeans(n_init=10, n_clusters=5).fit(coord[idxs[:int(tr_frac*tr_frac*n)]]).labels_
    spcv_tr_idx = idxs[np.where(groups < k-1)[0]]
    spcv_tr_bool = np.zeros(n, dtype=bool)
    spcv_tr_bool[spcv_tr_idx] = True
    spcv_ts_idx = idxs[np.where(groups == k-1)[0]]
    spcv_ts_bool = np.zeros(n, dtype=bool)
    spcv_ts_bool[spcv_ts_idx] = True

    ## pick one point for ts, tr is all far enough away
    bloocv_tr_idx = idxs[np.random.choice(len(idxs),size=len(idxs),replace=False)]
    bloocv_ts_idx = bloocv_tr_idx[0]

    bloocv_tr_bool = np.zeros(n, dtype=bool)
    bloocv_tr_bool[bloocv_tr_idx] = True
    ## drop training points within bloo_radius of the held-out point
    bloocv_tr_bool = getBufferTrain(D, bloocv_tr_bool, bloocv_ts_idx)

    bloocv_ts_bool = np.zeros(n, dtype=bool)
    bloocv_ts_bool[bloocv_ts_idx] = True

    # X = gen_rbf_X(c_x, c_y, p)
    # X = gen_matern_X(c_x, c_y, p, length_scale=X_length_scale, nu=X_nu)
    # Y = X@beta + Chol_y @ np.random.randn(n)
    ## fresh covariate draw each realization; Y is not used in this loop —
    ## computeCorrection only needs the prediction operator S and Sigma_y
    X = gen_X()

    ## OLS prediction operator from each training set onto its test set
    X_cv = X[cv_tr_idx,:]
    S_cv = X[cv_ts_idx,:] @ np.linalg.pinv(X_cv)

    X_spcv = X[spcv_tr_idx,:]
    S_spcv = X[spcv_ts_idx,:] @ np.linalg.pinv(X_spcv)

    X_bloocv = X[bloocv_tr_bool,:]
    S_bloocv = X[bloocv_ts_idx,:] @ np.linalg.pinv(X_bloocv)


    X_tr = X[tr_idx,:]
    S_tr = X[ts_idx,:] @ np.linalg.pinv(X_tr)

    ## single random test point drawn from the test set
    sp_idx = np.random.choice(ts_idx,size=1)
    sp_bool = np.zeros(n, dtype=bool)
    sp_bool[sp_idx] = True
    S_sp = X[sp_idx,:] @ np.linalg.pinv(X_tr)
    kfcv_corr[i] = computeCorrection(S_cv, Sigma_y, cv_tr_bool, cv_ts_bool)
    spcv_corr[i] = computeCorrection(S_spcv, Sigma_y, spcv_tr_bool, spcv_ts_bool)
    bloocv_corr[i] = computeCorrection(S_bloocv, Sigma_y, bloocv_tr_bool, bloocv_ts_bool)
    ns_corr[i] = computeCorrection(S_tr, Sigma_y, tr_bool, ts_bool)
    sp_corr[i] = computeCorrection(S_sp, Sigma_y, tr_bool, sp_bool)

## collect the CV-scheme corrections (baselines kept as separate arrays)
corrs = pd.DataFrame({
    'KFCV': kfcv_corr,
    'SPCV': spcv_corr,
    'BLOOCV': bloocv_corr,
})

100%|██████████| 100/100 [00:15<00:00,  6.29it/s]
[13]:
## Bar plot comparing correction terms under uniform random sampling.
## NOTE(review): est_names has 3 labels but 4 series are passed (ns_corr first);
## presumably gen_model_barplots treats the first series as a baseline — confirm.
fig = gen_model_barplots(
    [[ns_corr, kfcv_corr, spcv_corr, bloocv_corr]],
    [""],
    est_names,
    title="Train/Test Split: OLS Correction Term Comparisons",
    has_elev_err=False,
    err_bars=True,
    color_discrete_sequence=[px.colors.qualitative.Bold[i] for i in [1,2,3]],
)
fig.show()

Data type cannot be displayed: application/vnd.plotly.v1+json

Compute expected correction for one fold of CV vs expected correction for a clustered sample#

[19]:
## Accumulators for the clustered-sampling analogue of the experiment above.
clus_kfcv_corr, clus_spcv_corr, clus_bloocv_corr, clus_ns_corr, clus_sp_corr = (
    np.zeros(niter) for _ in range(5)
)

# tr_frac /= 2
[20]:
## Clustered-sampling analogue of the loop above: sample locations come from
## create_clus_split instead of a uniform random draw over the grid.
for i in tqdm(range(niter)):
    ## spatially clustered sample: idxs = train pool, ts_idx = test indices
    idxs, ts_idx = create_clus_split(
        nx, ny, tr_frac,
        ngrid=5,
        ts_frac=tr_frac - tr_frac*tr_frac,
        sort_grids=False,
    )

    ## randomize order for CV
    idxs = np.random.choice(idxs, size=len(idxs), replace=False)

    ## K-fold-style split within the clustered sample
    cv_tr_idx = idxs[:int(tr_frac*len(idxs))]
    cv_tr_bool = np.zeros(n, dtype=bool)
    cv_tr_bool[cv_tr_idx] = True

    cv_ts_idx = idxs[int(tr_frac*len(idxs)):]
    cv_ts_bool = np.zeros(n, dtype=bool)
    cv_ts_bool[cv_ts_idx] = True

    ## baseline train split — same indices as the CV train split
    tr_idx = idxs[:int(tr_frac*len(idxs))]
    tr_bool = np.zeros(n, dtype=bool)
    tr_bool[tr_idx] = True

    ## BUG FIX: the re-initialization of ts_bool was commented out in the
    ## original, so ts_bool kept the stale mask from the previous cell's last
    ## iteration and accumulated extra test indices across iterations.
    ts_bool = np.zeros(n, dtype=bool)
    ts_bool[ts_idx] = True

    ## split into k spatial folds by kmeans; fold k-1 is held out as test
    groups = KMeans(n_init=10, n_clusters=k).fit(coord[idxs]).labels_
    spcv_tr_idx = idxs[np.where(groups < k-1)[0]]
    spcv_tr_bool = np.zeros(n, dtype=bool)
    spcv_tr_bool[spcv_tr_idx] = True
    spcv_ts_idx = idxs[np.where(groups == k-1)[0]]
    spcv_ts_bool = np.zeros(n, dtype=bool)
    spcv_ts_bool[spcv_ts_idx] = True

    ## pick one point for ts, tr is all far enough away (buffered LOO)
    bloocv_tr_idx = idxs[np.random.choice(len(idxs),size=len(idxs),replace=False)]
    bloocv_ts_idx = bloocv_tr_idx[0]

    bloocv_tr_bool = np.zeros(n, dtype=bool)
    bloocv_tr_bool[bloocv_tr_idx] = True
    bloocv_tr_bool = getBufferTrain(D, bloocv_tr_bool, bloocv_ts_idx)

    bloocv_ts_bool = np.zeros(n, dtype=bool)
    bloocv_ts_bool[bloocv_ts_idx] = True

    ## fresh covariate draw each realization (Y itself is not used in this loop)
    X = gen_X()

    ## OLS prediction operator from each training set onto its test set
    X_cv = X[cv_tr_idx,:]
    S_cv = X[cv_ts_idx,:] @ np.linalg.pinv(X_cv)

    X_spcv = X[spcv_tr_idx,:]
    S_spcv = X[spcv_ts_idx,:] @ np.linalg.pinv(X_spcv)

    X_bloocv = X[bloocv_tr_bool,:]
    S_bloocv = X[bloocv_ts_idx,:] @ np.linalg.pinv(X_bloocv)

    X_tr = X[tr_idx,:]
    S_tr = X[ts_idx,:] @ np.linalg.pinv(X_tr)

    ## single random test point drawn from the test set
    sp_idx = np.random.choice(ts_idx,size=1)
    sp_bool = np.zeros(n, dtype=bool)
    sp_bool[sp_idx] = True
    S_sp = X[sp_idx,:] @ np.linalg.pinv(X_tr)

    clus_kfcv_corr[i] = computeCorrection(S_cv, Sigma_y, cv_tr_bool, cv_ts_bool)
    clus_spcv_corr[i] = computeCorrection(S_spcv, Sigma_y, spcv_tr_bool, spcv_ts_bool)
    clus_bloocv_corr[i] = computeCorrection(S_bloocv, Sigma_y, bloocv_tr_bool, bloocv_ts_bool)
    clus_ns_corr[i] = computeCorrection(S_tr, Sigma_y, tr_bool, ts_bool)
    clus_sp_corr[i] = computeCorrection(S_sp, Sigma_y, tr_bool, sp_bool)

## collect the CV-scheme corrections (baselines kept as separate arrays)
clus_corrs = pd.DataFrame({
    'KFCV': clus_kfcv_corr,
    'SPCV': clus_spcv_corr,
    'BLOOCV': clus_bloocv_corr,
})
100%|██████████| 100/100 [00:15<00:00,  6.48it/s]
[21]:
## Bar plot comparing correction terms under spatially clustered sampling.
## NOTE(review): est_names has 3 labels but 4 series are passed (clus_ns_corr
## first); presumably the first series is treated as a baseline — confirm.
fig = gen_model_barplots(
    [[clus_ns_corr, clus_kfcv_corr, clus_spcv_corr, clus_bloocv_corr]],
    [""],
    est_names,
    title="Spatial Train/Test Split: OLS Correction Term Comparisons",
    has_elev_err=False,
    err_bars=True,
    color_discrete_sequence=[px.colors.qualitative.Bold[i] for i in [1,2,3]],
)
fig.show()

Data type cannot be displayed: application/vnd.plotly.v1+json

[ ]: