This is related to this issue in BERTopic and this issue in UMAP, and probably to this issue and this issue as well.
I am currently using UMAP to reduce the dimensionality of my embeddings and then clustering them with HDBSCAN. However, I am running into the following error. Any idea why?
My data is 10 million rows by 5 dimensions (reduced with UMAP from 384 dimensions). I have 1 TB of RAM and 32 cores, and I am running this in a Jupyter notebook.
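For reference, here is a minimal sketch of the pipeline (random data stands in for my real embeddings, and the parameter values are illustrative rather than my exact settings):

```python
import numpy as np
import umap
import hdbscan

# Stand-in for the real data: mine is 10M x 384 sentence embeddings.
embeddings = np.random.rand(100_000, 384).astype(np.float32)

# Reduce to 5 dimensions with UMAP.
umap_model = umap.UMAP(n_components=5)
embedding_test = umap_model.fit_transform(embeddings)

# Cluster the reduced embeddings; this is the call that fails.
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=100)
test1 = hdbscan_model.fit(embedding_test)
```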
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "/rds/user/jw983/hpc-work/pytorch-env2/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 404, in _process_worker
call_item = call_queue.get(block=True, timeout=timeout)
File "/usr/local/software/spack/spack-git/opt/spack/linux-rhel7-broadwell/gcc-5.4.0/python-3.9.6-sbr552hsx3zanhgi3ekdjp4rsn6o6ejq/lib/python3.9/multiprocessing/queues.py", line 122, in get
return _ForkingPickler.loads(res)
File "sklearn/neighbors/_binary_tree.pxi", line 1057, in sklearn.neighbors._kd_tree.BinaryTree.__setstate__
File "sklearn/neighbors/_binary_tree.pxi", line 999, in sklearn.neighbors._kd_tree.BinaryTree._update_memviews
File "stringsource", line 658, in View.MemoryView.memoryview_cwrapper
File "stringsource", line 349, in View.MemoryView.memoryview.__cinit__
ValueError: buffer source array is read-only
"""
The above exception was the direct cause of the following exception:
BrokenProcessPool Traceback (most recent call last)
/tmp/ipykernel_778601/1248467627.py in <module>
----> 1 test1=hdbscan_model.fit(embedding_test)
/rds/user/jw983/hpc-work/pytorch-env2/lib/python3.9/site-packages/hdbscan/hdbscan_.py in fit(self, X, y)
917 self._condensed_tree,
918 self._single_linkage_tree,
--> 919 self._min_spanning_tree) = hdbscan(X, **kwargs)
920
921 if self.prediction_data:
/rds/user/jw983/hpc-work/pytorch-env2/lib/python3.9/site-packages/hdbscan/hdbscan_.py in hdbscan(X, min_cluster_size, min_samples, alpha, cluster_selection_epsilon, metric, p, leaf_size, algorithm, memory, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, cluster_selection_method, allow_single_cluster, match_reference_implementation, **kwargs)
608 gen_min_span_tree, **kwargs)
609 else:
--> 610 (single_linkage_tree, result_min_span_tree) = memory.cache(
611 _hdbscan_boruvka_kdtree)(X, min_samples, alpha,
612 metric, p, leaf_size,
/rds/user/jw983/hpc-work/pytorch-env2/lib/python3.9/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
/rds/user/jw983/hpc-work/pytorch-env2/lib/python3.9/site-packages/hdbscan/hdbscan_.py in _hdbscan_boruvka_kdtree(X, min_samples, alpha, metric, p, leaf_size, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, **kwargs)
273
274 tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
--> 275 alg = KDTreeBoruvkaAlgorithm(tree, min_samples, metric=metric,
276 leaf_size=leaf_size // 3,
277 approx_min_span_tree=approx_min_span_tree,
hdbscan/_hdbscan_boruvka.pyx in hdbscan._hdbscan_boruvka.KDTreeBoruvkaAlgorithm.__init__()
hdbscan/_hdbscan_boruvka.pyx in hdbscan._hdbscan_boruvka.KDTreeBoruvkaAlgorithm._compute_bounds()
/rds/user/jw983/hpc-work/pytorch-env2/lib/python3.9/site-packages/joblib/parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
/rds/user/jw983/hpc-work/pytorch-env2/lib/python3.9/site-packages/joblib/parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
/rds/user/jw983/hpc-work/pytorch-env2/lib/python3.9/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
/usr/local/software/spack/spack-git/opt/spack/linux-rhel7-broadwell/gcc-5.4.0/python-3.9.6-sbr552hsx3zanhgi3ekdjp4rsn6o6ejq/lib/python3.9/concurrent/futures/_base.py in result(self, timeout)
443 raise CancelledError()
444 elif self._state == FINISHED:
--> 445 return self.__get_result()
446 else:
447 raise TimeoutError()
/usr/local/software/spack/spack-git/opt/spack/linux-rhel7-broadwell/gcc-5.4.0/python-3.9.6-sbr552hsx3zanhgi3ekdjp4rsn6o6ejq/lib/python3.9/concurrent/futures/_base.py in __get_result(self)
388 if self._exception:
389 try:
--> 390 raise self._exception
391 finally:
392 # Break a reference cycle with the exception in self._exception
BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
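Based on the linked issues, the inner `ValueError: buffer source array is read-only` seems to come from joblib memory-mapping large arrays read-only when it ships work to the loky worker processes, which the Cython memoryviews in the KD-tree then refuse to wrap. For completeness, a sketch of two workarounds I plan to try (hedged guesses on my part; `core_dist_n_jobs` appears in the `hdbscan` signature in the traceback above, but I have not confirmed that either change fixes this at scale):

```python
import numpy as np
import hdbscan

# 1. Make sure the input is a writable, C-contiguous float64 array,
#    in case the UMAP output is a read-only or non-contiguous view.
embedding_test = np.ascontiguousarray(embedding_test, dtype=np.float64)

# 2. Disable parallel core-distance computation so joblib never has to
#    pickle the KD-tree into worker processes (slower, but it sidesteps
#    the BrokenProcessPool path shown in the traceback).
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=100, core_dist_n_jobs=1)
test1 = hdbscan_model.fit(embedding_test)
```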