8000 ENH: Faster Eigen Decomposition For Isomap & KernelPCA by yaichm · Pull Request #31247 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

ENH: Faster Eigen Decomposition For Isomap & KernelPCA #31247

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
b286daf
add randomized_eigh(selection='value')
Apr 16, 2025
66573f0
add changelog
Apr 16, 2025
ce755eb
Add randomized_eigsh(selection='value') for fast eigendecomposition w…
Apr 20, 2025
5bccfbc
Merge branch 'scikit-learn:main' into feature/randomized_eigsh_value
yaichm Apr 20, 2025
94cc727
Merge remote-tracking branch 'upstream/main' into feature/randomized_…
Apr 22, 2025
13db528
Merge remote-tracking branch 'origin/feature/randomized_eigsh_value' …
Apr 22, 2025
cbdd2ad
Merge branch 'scikit-learn:main' into feature/randomized_eigsh_value
yaichm Apr 23, 2025
64da5d2
Merge branch 'scikit-learn:main' into feature/randomized_eigsh_value
yaichm Apr 24, 2025
d204484
Merge branch 'main' into feature/randomized_eigsh_value
yaichm Apr 25, 2025
53e0102
Fix linting errors and rename the changelog to match the PR
Apr 27, 2025
80f8ee7
Fix docstring of randomized_eigen_decomposition
Apr 27, 2025
161658d
Merge branch 'main' into feature/randomized_eigsh_value
yaichm Apr 28, 2025
91654c8
Added test for array API compliance
Apr 28, 2025
ca7f378
Merge branch 'main' into feature/randomized_eigsh_value
yaichm Apr 28, 2025
4829984
Merge branch 'main' into feature/randomized_eigsh_value
yaichm Apr 29, 2025
4939734
Merge branch 'main' into feature/randomized_eigsh_value
yaichm May 1, 2025
e3628d1
Merge branch 'main' into feature/randomized_eigsh_value
yaichm May 5, 2025
94e1961
Merge branch 'main' into feature/randomized_eigsh_value
yaichm May 7, 2025
7593464
Merge branch 'main' into feature/randomized_eigsh_value
yaichm May 9, 2025
e8656f1
Merge branch 'main' into feature/randomized_eigsh_value
yaichm May 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10000 106 changes: 106 additions & 0 deletions benchmarks/bench_isomap_auto_vs_randomized.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""
======================================================================
Benchmark: Comparing Isomap Solvers - Execution Time vs. Representation
======================================================================

This benchmark demonstrates how different eigenvalue solvers in Isomap
can affect execution time and embedding quality.

Description:
------------
We use a subset of handwritten digits (`load_digits` with 6 classes).
Each data point is projected into a lower-dimensional space (2D) using
two different solvers (`auto` and `randomized`).

What you can observe:
----------------------
- The `auto` solver provides a reference solution.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The 'auto' behaviour should rather be improved to rely on some heuristic in order to solve the problem faster automatically. See what has been done in #27491

- The `randomized` solver is tested for comparison in terms of
representation quality and execution time.

Further exploration:
---------------------
You can modify the number of neighbors (`n_neighbors`) or experiment with
other Isomap solvers.
"""

import time

import matplotlib.pyplot as plt
import numpy as np
from matplotlib import offsetbox

from sklearn.datasets import load_digits
from sklearn.manifold import Isomap
from sklearn.preprocessing import MinMaxScaler

# 1- Data Loading
# ---------------
digits = load_digits(n_class=6)
X, y = digits.data, digits.target
n_neighbors = 30 # Number of neighbors for Isomap


# 2- Visualization Function
# -------------------------
def plot_embedding(ax, X, title):
"""Displays projected points with image annotations."""
X = MinMaxScaler().fit_transform(X)

for digit in digits.target_names:
ax.scatter(
*X[y == digit].T,
marker=f"${digit}$",
s=60,
color=plt.cm.Dark2(digit),
alpha=0.425,
zorder=2,
)

# Add digit images in the projected space
shown_images = np.array([[1.0, 1.0]])
for i in range(X.shape[0]):
dist = np.sum((X[i] - shown_images) ** 2, 1)
if np.min(dist) < 4e-3:
continue
shown_images = np.concatenate([shown_images, [X[i]]], axis=0)
imagebox = offsetbox.AnnotationBbox(
offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r), X[i]
)
imagebox.set(zorder=1)
ax.add_artist(imagebox)

ax.set_title(title)
ax.axis("off")


# 3- Define Embeddings and Benchmark
# ----------------------------------
embeddings = {
"Isomap (auto solver)": Isomap(
n_neighbors=n_neighbors, n_components=2, eigen_solver="auto"
),
"Isomap (randomized solver)": Isomap(
n_neighbors=n_neighbors, n_components=2, eigen_solver="randomized_value"
),
}

projections, timing = {}, {}

# Compute embeddings
for name, transformer in embeddings.items():
print(f"Computing {name}...")
start_time = time.time()
projections[name] = transformer.fit_transform(X, y)
timing[name] = time.time() - start_time

# 4- Display Results
# ------------------
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for ax, (name, proj) in zip(axes, projections.items()):
title = f"{name} (time: {timing[name]:.3f}s)"
plot_embedding(ax, proj, title)

plt.tight_layout()
plt.show()
118 changes: 118 additions & 0 deletions benchmarks/bench_isomap_execution_time_full_vs_randomized.py
8000
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
"""
======================================================================
Isomap Solvers Benchmark: Execution Time vs Number of Samples
======================================================================

This benchmark demonstrates how the choice of eigen_solver in Isomap
can significantly affect computation time, especially as the dataset
size increases.

Description:
------------
Synthetic datasets are generated using `make_classification` with a
fixed number of features. The number of samples is
varied from 1000 to 4000.

For each setting, Isomap is applied using two different solvers:
- 'auto' (full eigendecomposition)
- 'randomized_value'

The execution time of each solver is measured for each number of
samples, and the average time over multiple runs (default: 3) is
plotted.

What you can observe:
---------------------
If n_components < 10, the randomized and auto solvers produce similar
results (in this case, the arpack solver is selected).
However, when n_components > 10, the randomized solver becomes significantly
faster, especially as the number of samples increases.

"""

import time

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import make_classification
from sklearn.manifold import Isomap

# 1 - Experiment Setup
# -- -- -- -- -- -- -- -- -- -
n_samples_list = [1000, 2000, 3000, 4000]
n_neighbors = 30
n_components_list = [2, 10]
n_features = 100
n_iter = 3 # Number of repetitions for averaging execution time

# Store timings for each value of n_components
timing_all = {}

for n_components in n_components_list:
# Create containers for timing results
timing = {
"auto": np.zeros((len(n_samples_list), n_iter)),
"randomized_value": np.zeros((len(n_samples_list), n_iter)),
}

for j, n in enumerate(n_samples_list):
# Generate synthetic classification dataset
X, _ = make_classification(
n_samples=n,
n_features=n_features,
n_redundant=0,
n_clusters_per_class=1,
n_classes=1,
random_state=42,
)

# Evaluate both solvers for multiple repetitions
for solver in ["auto", "randomized_value"]:
for i in range(n_iter):
model = Isomap(
n_neighbors=n_neighbors,
n_components=n_components,
eigen_solver=solver,
)
start = time.perf_counter()
model.fit(X)
elapsed = time.perf_counter() - start
timing[solver][j, i] = elapsed

timing_all[n_components] = timing

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for idx, n_components in enumerate(n_components_list):
ax = axes[idx]
timing = timing_all[n_components]
avg_full = timing["auto"].mean(axis=1)
std_full = timing["auto"].std(axis=1)
avg_rand = timing["randomized_value"].mean(axis=1)
std_rand = timing["randomized_value"].std(axis=1)

ax.errorbar(
n_samples_list,
avg_full,
yerr=std_full,
label="Isomap (full)",
marker="o",
linestyle="-",
)
ax.errorbar(
n_samples_list,
avg_rand,
yerr=std_rand,
label="Isomap (randomized)",
marker="x",
linestyle="--",
)
ax.set_xlabel("Number of Samples")
ax.set_ylabel("Execution Time (seconds)")
ax.set_title(f"Isomap Execution Time (n_components = {n_components})")
ax.legend()
ax.grid(True)

plt.tight_layout()
plt.show()
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""
========================================================================
Benchmark: Isomap Reconstruction Error - Standard vs. Randomized Solver
========================================================================

This benchmark illustrates how the number of samples impacts the quality
of the Isomap embedding, using reconstruction error as a metric.

Description:
------------
We generate synthetic 2D non-linear data (two concentric circles) with
varying numbers of samples. For each subset, we compare the reconstruction
error of two Isomap solvers:

- The `auto` solver (standard dense or arpack, selected automatically).
- The `randomized_value` solver .

What you can observe:
---------------------
-The randomized and auto solvers yield the same errors for different samples,
which means that randomized provides the same projection quality as auto.

Further exploration:
---------------------
- Modify the number of neighbors or iterations.
"""

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import make_circles
from sklearn.manifold import Isomap

# 1- Experiment Configuration
# ---------------------------
min_n_samples, max_n_samples = 100, 4000
n_samples_grid_size = 4 # Number of sample sizes to test

n_samples_range = [
int(
min_n_samples
+ np.floor((x / (n_samples_grid_size - 1)) * (max_n_samples - min_n_samples))
)
for x in range(0, n_samples_grid_size)
]

n_components = 2
n_iter = 3 # Number of repetitions per sample size

# 2- Data Generation
# ------------------
n_features = 2
X_full, y_full = make_circles(
n_samples=max_n_samples, factor=0.3, noise=0.05, random_state=0
)

# 3- Benchmark Execution
# ----------------------
errors_randomized = []
errors_full = []

for n_samples in n_samples_range:
X, y = X_full[:n_samples], y_full[:n_samples]
print(f"Computing for n_samples = {n_samples}")

# Instantiate Isomap solvers
isomap_randomized = Isomap(
n_neighbors=50, n_components=n_components, eigen_solver="randomized_value"
)
isomap_auto = Isomap(n_neighbors=50, n_components=n_components, eigen_solver="auto")

# Fit and record reconstruction error
isomap_randomized.fit(X)
err_rand = isomap_randomized.reconstruction_error()
errors_randomized.append(err_rand)

isomap_auto.fit(X)
err_auto = isomap_auto.reconstruction_error()
errors_full.append(err_auto)

# 4- Results Visualization
# ------------------------
plt.figure(figsize=(10, 6))
plt.scatter(n_samples_range, errors_full, label="Isomap (auto)", color="b", marker="*")
plt.scatter(
n_samples_range,
errors_randomized,
label="Isomap (randomized)",
color="r",
marker="x",
)

plt.title("Isomap Reconstruction Error vs. Number of Samples")
plt.xlabel("Number of Samples")
plt.ylabel("Reconstruction Error")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
25 changes: 25 additions & 0 deletions benchmarks/bench_kernel_pca_solvers_time_vs_n_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
ref_time = np.empty((len(n_compo_range), n_iter)) * np.nan
a_time = np.empty((len(n_compo_range), n_iter)) * np.nan
r_time = np.empty((len(n_compo_range), n_iter)) * np.nan
rv_time = np.empty((len(n_compo_range), n_iter)) * np.nan
# loop
for j, n_components in enumerate(n_compo_range):
n_components = int(n_components)
Expand Down Expand Up @@ -119,13 +120,28 @@
# check that the result is still correct despite the approximation
assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))

# D- randomized_value
print(" - randomized_value solver")
for i in range(n_iter):
start_time = time.perf_counter()
rv_pred = (
KernelPCA(n_components, eigen_solver="randomized_value")
.fit(X_train)
.transform(X_test)
)
rv_time[j, i] = time.perf_counter() - start_time
# check that the result is still correct despite the approximation
assert_array_almost_equal(np.abs(rv_pred), np.abs(ref_pred))

# Compute statistics for the 3 methods
avg_ref_time = ref_time.mean(axis=1)
std_ref_time = ref_time.std(axis=1)
avg_a_time = a_time.mean(axis=1)
std_a_time = a_time.std(axis=1)
avg_r_time = r_time.mean(axis=1)
std_r_time = r_time.std(axis=1)
avg_rv_time = rv_time.mean(axis=1)
std_rv_time = rv_time.std(axis=1)


# 4- Plots
Expand Down Expand Up @@ -160,6 +176,15 @@
color="b",
label="randomized",
)
ax.errorbar(
n_compo_range,
avg_rv_time,
yerr=std_rv_time,
marker="x",
linestyle="",
color="purple",
label="randomized_value",
)
ax.legend(loc="upper left")

# customize axes
Expand Down
Loading
Loading
0