[go: up one dir, main page]

Skip to content

Instantly share code, notes, and snippets.

@ogrisel
Last active July 3, 2023 18:30
Show Gist options
  • Save ogrisel/44bae6f8988abacae358c047d3ecc147 to your computer and use it in GitHub Desktop.
PCA GPU
# %%
# Enable scikit-learn's Array API dispatch so estimators such as PCA can
# operate directly on non-NumPy inputs (torch tensors, cupy arrays).
from sklearn import set_config
from sklearn.base import clone

set_config(array_api_dispatch=True)
# %%
# Optional dependency: PyTorch. Bind the name to None when it is not
# installed so later cells can skip the torch benchmarks.
try:
    import torch
except ModuleNotFoundError:
    torch = None
# %%
# Optional dependency: CuPy (GPU arrays). None when not installed.
try:
    import cupy
except ModuleNotFoundError:
    cupy = None
# %%
# Optional dependency: RAPIDS cuML (native GPU PCA). None when not installed.
try:
    import cuml
except ModuleNotFoundError:
    cuml = None
# %%
# Print the detected BLAS/OpenMP thread pools (library, version, number of
# threads) to document the CPU environment the benchmark runs in.
from threadpoolctl import threadpool_info
from pprint import pprint

pprint(threadpool_info())
# %%
import numpy as np
import joblib

# Disk cache so the (large) synthetic dataset is generated only once and
# reloaded on subsequent runs.
m = joblib.Memory(location="./joblib")
@m.cache
def make_data(n_samples=int(5e5), n_features=1000, rank=30, dtype=np.float32, seed=0):
    """Generate a synthetic low-rank dataset of shape (n_samples, n_features).

    The result is ``A @ s @ B`` where ``s`` is a diagonal of weights
    decaying linearly from 100 to 0, so the matrix has (approximately)
    the requested intrinsic rank.

    Parameters
    ----------
    n_samples, n_features : int
        Shape of the returned matrix.
    rank : int
        Intrinsic rank of the generated data.
    dtype : numpy dtype
        Floating-point type of the result (float32 by default to halve
        memory and match typical GPU workloads).
    seed : int
        Seed for NumPy's random Generator.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_features)
    """
    # BUG FIX: the generator was previously seeded with the hard-coded
    # constant 0, silently ignoring the ``seed`` parameter (and making
    # joblib's cache key on ``seed`` meaningless). Use the parameter.
    rng = np.random.default_rng(seed)
    s = np.diag(np.linspace(100, 0, rank)).astype(dtype)
    A = rng.standard_normal(size=(n_samples, rank)).astype(dtype)
    B = rng.standard_normal(size=(rank, n_features)).astype(dtype)
    return A @ s @ B
# Materialize the benchmark dataset once (served from the joblib disk
# cache on reruns).
X_np = make_data()
size_mb = X_np.nbytes / 1e6
print(f"Data shape {X_np.shape} and dtype {X_np.dtype} of size {size_mb:.1f} MB")
# %%
from threadpoolctl import threadpool_limits
from sklearn.decomposition import PCA
from time import perf_counter

# Number of principal components to extract, and which SVD solver to
# benchmark across all backends below.
k = 5
svd_solver = "full"

# The randomized solver needs extra knobs (power-iteration normalizer and
# an RNG seed); the exact solver only needs the component count.
if svd_solver == "randomized":
    pca_params = {
        "n_components": k,
        "svd_solver": "randomized",
        "power_iteration_normalizer": "QR",
        "random_state": 0,
    }
else:
    pca_params = {
        "n_components": k,
        "svd_solver": "full",
    }

pca_np = PCA(**pca_params)
print("Running benchmarks for:")
print(pca_np)

# Reference run: plain NumPy arrays on CPU.
t0 = perf_counter()
pca_np.fit(X_np)
t1 = perf_counter()
print(
    f"Fitting PCA(n_components={k}) with numpy took {t1 - t0:.3f}s"
)
# print(pca_np.explained_variance_ratio_)

# Optional: repeat the fit under different BLAS/OpenMP thread budgets.
# for n_threads in [1, 4]:
#     with threadpool_limits(limits=n_threads):
#         tic = perf_counter()
#         pca_np.fit(X_np)
#         toc = perf_counter()
#         print(
#             f"Fitting PCA(n_components={k}) with numpy and {n_threads=} took {toc - tic:.3f}s"
#         )
#         print(pca_np.explained_variance_ratio_)
# %%
# Same estimator on PyTorch CPU tensors, exercised through scikit-learn's
# Array API dispatch.
if torch is not None:
    X_torch_cpu = torch.tensor(X_np)
    pca_torch_cpu = clone(pca_np)
    t0 = perf_counter()
    pca_torch_cpu.fit(X_torch_cpu)
    t1 = perf_counter()
    print(
        f"Fitting PCA(n_components={k}) with torch on CPU took {t1 - t0:.3f}s"
    )
    # print(pca_torch_cpu.explained_variance_ratio_)
# Same benchmark on CUDA. torch launches kernels asynchronously, so
# synchronize before reading the clock to account for all GPU work.
if torch is not None and torch.cuda.is_available():
    X_torch_gpu = X_torch_cpu.to("cuda")
    pca_torch_gpu = clone(pca_np)
    t0 = perf_counter()
    pca_torch_gpu.fit(X_torch_gpu)
    torch.cuda.synchronize()
    t1 = perf_counter()
    print(
        f"Fitting PCA(n_components={k}) with torch on GPU took {t1 - t0:.3f}s"
    )
    # print(pca_torch_gpu.explained_variance_ratio_.cpu())
# %%
# Same estimator on CuPy GPU arrays via Array API dispatch.
if cupy is not None:
    X_cupy = cupy.asarray(X_np)
    pca_cupy = clone(pca_np)
    t0 = perf_counter()
    pca_cupy.fit(X_cupy)
    # Drain the current CUDA stream before stopping the clock.
    cupy.cuda.stream.get_current_stream().synchronize()
    t1 = perf_counter()
    print(
        f"Fitting PCA(n_components={k}) with cupy on GPU took {t1 - t0:.3f}s"
    )
    # print(pca_cupy.explained_variance_ratio_)
# %%
# Baseline comparison: RAPIDS cuML's native GPU PCA on the same CuPy
# array (only meaningful for the exact solver).
if svd_solver == "full" and cuml is not None and cupy is not None:
    from cuml.decomposition import PCA as PCA_cuml

    # cuML's PCA does not support randomized SVD
    pca_cuml = PCA_cuml(**pca_params)
    t0 = perf_counter()
    pca_cuml.fit(X_cupy)
    cupy.cuda.stream.get_current_stream().synchronize()
    t1 = perf_counter()
    print(
        f"Fitting PCA(n_components={k}) with cupy with cuML on GPU took {t1 - t0:.3f}s"
    )
    # print(pca_cuml.explained_variance_ratio_)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment