Building Approximate GP Models from Scratch¶
When working with large datasets, exact GP models can become computationally expensive due to their O(n³) complexity. Approximate GP models provide a practical alternative by using sparse approximations or reduced-rank methods to scale to larger datasets while maintaining good predictive performance.
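The reduced-rank idea can be illustrated with a Nyström approximation of the kernel matrix, which replaces the full n×n kernel with a rank-m factorization built from m inducing points. This is a minimal NumPy sketch of the general technique, not Xopt's internal implementation; the RBF kernel and random inducing-point selection here are illustrative assumptions:

```python
import numpy as np

rng = np.random.default_rng(0)

# n training points, m << n inducing points
n, m = 500, 50
X = rng.uniform(-2, 2, size=(n, 1))
Z = X[rng.choice(n, size=m, replace=False)]  # inducing inputs

def rbf(A, B, lengthscale=1.0):
    # squared-exponential kernel matrix between point sets A and B
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-0.5 * d2 / lengthscale**2)

K_nm = rbf(X, Z)                      # (n, m) cross-covariance
K_mm = rbf(Z, Z) + 1e-6 * np.eye(m)  # (m, m), jittered for stability

# Nystrom approximation: K ~ K_nm K_mm^{-1} K_mn, rank m instead of n.
# Linear algebra on this factorization costs O(n m^2) rather than O(n^3).
K_approx = K_nm @ np.linalg.solve(K_mm, K_nm.T)
K_exact = rbf(X, X)

rel_err = np.linalg.norm(K_exact - K_approx) / np.linalg.norm(K_exact)
print(f"rank-{m} relative error: {rel_err:.2e}")
```

For a smooth kernel, a modest number of inducing points captures the kernel matrix almost exactly, which is what makes the O(nm²) cost of approximate GP inference attractive.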
In this example, we demonstrate how to build and compare approximate GP models with analytical (exact) GP models using the 3D Rosenbrock function. We'll examine the trade-offs between model construction time and data size, showing how approximate models enable efficient modeling for larger datasets.
# set values if testing
import os
from xopt import Xopt, Evaluator
from xopt.generators import RandomGenerator
from xopt.resources.test_functions.rosenbrock import (
    evaluate_rosenbrock,
    make_rosenbrock_vocs,
)
from xopt.generators.bayesian.models.approximate import ApproximateModelConstructor
from xopt.generators.bayesian.models.standard import StandardModelConstructor
# Ignore all warnings
import warnings
warnings.filterwarnings("ignore")
SMOKE_TEST = os.environ.get("SMOKE_TEST")
MAX_TRAINING_DATA = 100 if SMOKE_TEST else 2500
MAX_STEPS = 1 if SMOKE_TEST else 10
# make rosenbrock function vocs in 3D
vocs = make_rosenbrock_vocs(3)
# collect some data using random sampling
evaluator = Evaluator(function=evaluate_rosenbrock)
generator = RandomGenerator(vocs=vocs)
X = Xopt(generator=generator, evaluator=evaluator)
X.random_evaluate(MAX_TRAINING_DATA)
| | x0 | x1 | x2 | y | xopt_runtime | xopt_error |
|---|---|---|---|---|---|---|
| 0 | 1.210999 | -0.446264 | 0.844177 | 409.615133 | 0.000015 | False |
| 1 | -1.018198 | -0.192430 | -1.631532 | 434.987289 | 0.000011 | False |
| 2 | 1.472977 | -1.509822 | 1.116408 | 1495.676508 | 0.000004 | False |
| 3 | 0.035048 | 1.997925 | -1.945463 | 3925.600321 | 0.000003 | False |
| 4 | -0.872183 | -1.479701 | -1.874708 | 2163.387283 | 0.000002 | False |
| ... | ... | ... | ... | ... | ... | ... |
| 2495 | 0.552718 | -0.988394 | -1.554204 | 812.229001 | 0.000002 | False |
| 2496 | 0.237074 | -1.259359 | 1.891637 | 188.099732 | 0.000002 | False |
| 2497 | -1.768123 | 0.912996 | 0.766333 | 497.975311 | 0.000002 | False |
| 2498 | 1.801021 | -1.172698 | -1.589519 | 2834.766615 | 0.000002 | False |
| 2499 | -0.415521 | -1.480450 | 1.084406 | 404.050287 | 0.000002 | False |
2500 rows × 6 columns
Benchmark: Standard vs Approximate Model Construction¶
To understand the practical benefits of approximate models, we'll benchmark the construction time of both approaches across varying dataset sizes. This comparison demonstrates how approximate models provide computational advantages while maintaining model quality. Empirical test runs show that approximate GP models begin to outperform standard models at around 2000 points.
Notes¶
- This benchmark is computationally expensive: it may take a long time to execute, and you may run out of memory at large dataset sizes.
- This benchmark measures the time needed to fit the model hyperparameters to the dataset, which depends on both the per-step execution time of the model and the number of optimization steps required. As a result, the training time will not show perfect O(n³) scaling, but should show rough qualitative agreement.
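Since single-run wall-clock timings are noisy, a best-of-repeats helper gives more stable numbers at the cost of extra runtime. This is a sketch of a generic timing utility, not part of Xopt:

```python
import time

def time_call(fn, *args, repeats=3, **kwargs):
    """Return the best-of-`repeats` wall time for fn(*args, **kwargs)."""
    best = float("inf")
    for _ in range(repeats):
        start = time.perf_counter()
        fn(*args, **kwargs)
        best = min(best, time.perf_counter() - start)
    return best
```

It could wrap the model builds below, e.g. `time_call(standard_model_constructor.build_model_from_vocs, vocs, subset)`; `time.perf_counter()` is preferred over `time.time()` for interval measurement since it is monotonic and higher resolution. We use single runs below to keep the benchmark tractable.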
data = X.data
import time
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
data_sizes = np.linspace(50, MAX_TRAINING_DATA, MAX_STEPS, dtype=int)
standard_times = []
approximate_times = []
standard_model_constructor = StandardModelConstructor()
approximate_model_constructor = ApproximateModelConstructor()
for n in data_sizes:
    subset = data.iloc[:n]

    # Time StandardModelConstructor
    start = time.time()
    standard_model_constructor.build_model_from_vocs(vocs, subset)
    standard_times.append(time.time() - start)

    # Time ApproximateModelConstructor
    start = time.time()
    approximate_model_constructor.build_model_from_vocs(vocs, subset)
    approximate_times.append(time.time() - start)

    print(
        f"n={n}: Standard={standard_times[-1]:.3f}s, Approximate={approximate_times[-1]:.3f}s"
    )
# fit n^3 scaling to the standard model times
def cubic(x, a):
    return a * x**3


params, _ = curve_fit(cubic, data_sizes, standard_times)
fitted_standard_times = cubic(data_sizes, *params)
# Plot results
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(data_sizes, standard_times, marker="o", label="StandardModelConstructor")
ax.plot(data_sizes, approximate_times, marker="s", label="ApproximateModelConstructor")
ax.plot(data_sizes, fitted_standard_times, linestyle="--", label="Cubic Fit (Standard)")
ax.set_xlabel("Number of data points")
ax.set_ylabel("Build time (s)")
ax.set_title("GP Model Construction Time Benchmark")
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()
n=50: Standard=0.341s, Approximate=2.029s
n=322: Standard=0.813s, Approximate=3.051s
n=594: Standard=3.092s, Approximate=6.305s
n=866: Standard=7.937s, Approximate=10.627s
n=1138: Standard=10.735s, Approximate=22.038s
n=1411: Standard=15.024s, Approximate=23.421s
n=1683: Standard=24.902s, Approximate=25.499s
n=1955: Standard=42.781s, Approximate=29.646s
n=2227: Standard=70.831s, Approximate=31.699s
n=2500: Standard=60.717s, Approximate=34.331s
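From the timings printed above, the crossover point can be estimated by linearly interpolating between the last size where the standard model is faster and the first size where the approximate model wins. This is a rough estimate from a single run; exact numbers will vary by machine:

```python
import numpy as np

# timings copied from the benchmark output above
sizes = np.array([50, 322, 594, 866, 1138, 1411, 1683, 1955, 2227, 2500])
standard = np.array(
    [0.341, 0.813, 3.092, 7.937, 10.735, 15.024, 24.902, 42.781, 70.831, 60.717]
)
approximate = np.array(
    [2.029, 3.051, 6.305, 10.627, 22.038, 23.421, 25.499, 29.646, 31.699, 34.331]
)

diff = standard - approximate
i = np.where(diff > 0)[0][0]  # first size where the approximate model is faster

# linear interpolation of the zero crossing between sizes[i-1] and sizes[i]
frac = -diff[i - 1] / (diff[i] - diff[i - 1])
crossover = sizes[i - 1] + frac * (sizes[i] - sizes[i - 1])
print(f"estimated crossover: n ~ {crossover:.0f}")  # ~1700 for the run above
```

This lands slightly below the ~2000-point figure quoted earlier, which is consistent given the run-to-run noise visible in the timings (e.g. the standard model's dip from n=2227 to n=2500).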