import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import math
Setup
In [1]:
In [2]:
def design_matrix(model_name, x):
if model_name == "POLY-4":
= np.column_stack([np.ones(len(x)), x, x**2, x**3, x**4])
X elif model_name == "CUBIC":
= np.column_stack([np.ones(len(x)), x, x**2, x**3])
X elif model_name == "PAR":
= np.column_stack([np.ones(len(x)), x, x**2])
X elif model_name == "LIN":
= np.column_stack([np.ones(len(x)), x])
X elif model_name == "LIN0":
= x.reshape(-1, 1) # Only x as a feature (no intercept)
X else:
raise NotImplementedError(model_name)
return X
def fit_model(model_name, x, y):
"""Fits the specified model using statsmodels (OLS)."""
= sm.OLS(y, design_matrix(model_name, x))
model = model.fit()
results return results
def predict(model_name, x, y, x_to_predict=None):
= fit_model(model_name, x, y)
fit_results if x_to_predict is None:
= x
x_to_predict return fit_results.predict(design_matrix(model_name, x_to_predict))
In [3]:
def generate_y(x, noise_std=0.0):
"""Generates data based on the specified model."""
= 0.5 + 0.5 * np.tanh(x - 2)
y if noise_std:
+= np.random.normal(0, noise_std, size=len(x))
y return y
def generate_x(x_range, step=0.1):
return np.round(np.arange(x_range[0], x_range[1] + step, step), decimals=1)
In [4]:
# Define x ranges
= {
x_ranges "X0": (0, 3.5),
"Xtarg": (3.5, 5),
"Xall": (0, 5),
"Xcal": (0, 2.5),
"Xgen": (2.5, 3.5),
}
= generate_x(x_ranges["Xall"])
x_all = generate_y(x_all)
y_true = {x: y for x, y in zip(x_all, y_true)} # ensure exact same y's for all ranges f_dict
Replicate Fig. 5
In [5]:
In [6]:
Replicate Table 2
In [7]:
def score(y_pred, y):
return np.mean((y_pred - y)**2) * 100000
In [8]:
def _set_ranges(range_name_item):
= x_ranges[range_name_item[0]]
x_range = generate_x(x_range)
x = np.array([f_dict[x_i] for x_i in x])
y = x_ranges[range_name_item[1]]
x_out_range = generate_x(x_out_range)
x_out = np.array([f_dict[x_i] for x_i in x_out])
y_out return x, y, x_out, y_out
def _display_table(models, names, results_dict):
= [[results_dict[range_name][model]["score"] for range_name in names] for model in models]
table_data = (
df =models)
pd.DataFrame(table_data, indexint)
.astype(None, axis=1).rename(columns={i: name for i, name in enumerate(names)})
.rename_axis(
)
display(df)
def run_gen_simulation(range_name_items, models):
"""Runs the simulation and returns the results."""
= {}
results_dict = []
names for item in range_name_items:
= f"{item[0]}->{item[1]}"
name = {}
results_dict[name]
names.append(name)= _set_ranges(item)
x, y, x_out, y_out for model_name in models:
= predict(model_name, x, y, x_to_predict=x_out)
y_pred = {
results_dict[name][model_name] "score": round(score(y_pred, y_out), 0),
}
_display_table(models, names, results_dict)
= list(reversed(["POLY-4", "CUBIC", "PAR", "LIN", "LIN0"]))
sim_models "Xcal", "Xgen"), ("X0", "Xtarg"), ("X0", "Xall")], sim_models) run_gen_simulation([(
Xcal->Xgen | X0->Xtarg | X0->Xall | |
---|---|---|---|
LIN0 | 2779 | 3520 | 1677 |
LIN | 818 | 8220 | 2851 |
PAR | 8683 | 24830 | 7954 |
CUBIC | 5260 | 12133 | 3822 |
POLY-4 | 1486 | 84795 | 26608 |