From 72a395e3807720548d6404fe0c162ce1b8500ecb Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Tue, 12 Apr 2022 15:31:31 -0400 Subject: [PATCH 1/4] Use pickle.HIGHEST_PROTOCOL Closes #6 --- multiscale_phate/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multiscale_phate/utils.py b/multiscale_phate/utils.py index 63b013e..d46f73b 100644 --- a/multiscale_phate/utils.py +++ b/multiscale_phate/utils.py @@ -15,4 +15,4 @@ def hash_object(X): Description of returned object. """ - return hash(pickle.dumps(X)) + return hash(pickle.dumps(X, protocol=pickle.HIGHEST_PROTOCOL)) From 8aa4a5bc0e7086555f79f1a8d43282d868011d5e Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 13 Apr 2022 13:01:18 -0400 Subject: [PATCH 2/4] Avoid protocol 5 in python 3.7 --- multiscale_phate/utils.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/multiscale_phate/utils.py b/multiscale_phate/utils.py index d46f73b..3ceaea2 100644 --- a/multiscale_phate/utils.py +++ b/multiscale_phate/utils.py @@ -1,4 +1,15 @@ import pickle +import sys + + +def _pickle_protocol(): + """Get the highest working pickle protocol + + Pickle protocol 5 is supported on Python 3.7 but doesn't work with loky + """ + if tuple(sys.version.split(".")[:2]) < ('3', '8'): + return min(pickle.HIGHEST_PROTOCOL, 4) + return pickle.HIGHEST_PROTOCOL def hash_object(X): @@ -15,4 +26,4 @@ def hash_object(X): Description of returned object. 
""" - return hash(pickle.dumps(X, protocol=pickle.HIGHEST_PROTOCOL)) + return hash(pickle.dumps(X, protocol=_pickle_protocol())) From 796e36ed7763002ea5f729981d9ea2ca69da455d Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 13 Apr 2022 15:25:34 -0400 Subject: [PATCH 3/4] Revert --- multiscale_phate/utils.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/multiscale_phate/utils.py b/multiscale_phate/utils.py index 3ceaea2..d46f73b 100644 --- a/multiscale_phate/utils.py +++ b/multiscale_phate/utils.py @@ -1,15 +1,4 @@ import pickle -import sys - - -def _pickle_protocol(): - """Get the highest working pickle protocol - - Pickle protocol 5 is supported on Python 3.7 but doesn't work with loky - """ - if tuple(sys.version.split(".")[:2]) < ('3', '8'): - return min(pickle.HIGHEST_PROTOCOL, 4) - return pickle.HIGHEST_PROTOCOL def hash_object(X): @@ -26,4 +15,4 @@ def hash_object(X): Description of returned object. """ - return hash(pickle.dumps(X, protocol=_pickle_protocol())) + return hash(pickle.dumps(X, protocol=pickle.HIGHEST_PROTOCOL)) From e9a5a29db863fbaefc8c287e1aa9cd434e313191 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Wed, 13 Apr 2022 15:29:19 -0400 Subject: [PATCH 4/4] Set custom loky pickler --- multiscale_phate/compress.py | 45 +++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/multiscale_phate/compress.py b/multiscale_phate/compress.py index 8f4d718..f24a319 100644 --- a/multiscale_phate/compress.py +++ b/multiscale_phate/compress.py @@ -1,9 +1,21 @@ +import contextlib import numpy as np import joblib import tasklogger import sklearn.cluster import sklearn.neighbors import scipy.spatial.distance +from joblib.externals.loky import set_loky_pickler + + +@contextlib.contextmanager +def custom_loky_pickler(pickler): + try: + set_loky_pickler(pickler) + yield + finally: + # revert to default + set_loky_pickler() def get_compression_features(N, features, n_pca, 
partitions, landmarks): @@ -132,25 +144,26 @@ def subset_data(data, desired_num_clusters, n_jobs, num_cluster=100, random_stat clusters_unique, cluster_counts = np.unique(clusters, return_counts=True) clusters_next_iter = clusters.copy() - while np.max(cluster_counts) > np.ceil(N / desired_num_clusters): - min_val = 0 - partitions_id_uni = joblib.Parallel(n_jobs=n_jobs)( - joblib.delayed(cluster_components)( - data[np.where(clusters == clusters_unique[i])[0], :], - num_cluster, - size, - random_state=random_state, + with custom_loky_pickler('pickle'): + while np.max(cluster_counts) > np.ceil(N / desired_num_clusters): + min_val = 0 + partitions_id_uni = joblib.Parallel(n_jobs=n_jobs)( + joblib.delayed(cluster_components)( + data[np.where(clusters == clusters_unique[i])[0], :], + num_cluster, + size, + random_state=random_state, + ) + for i in range(len(clusters_unique)) ) - for i in range(len(clusters_unique)) - ) - for i in range(len(clusters_unique)): - loc = np.where(clusters == clusters_unique[i])[0] - clusters_next_iter[loc] = np.array(partitions_id_uni[i]) + min_val - min_val = min_val + np.max(np.array(partitions_id_uni[i])) + 1 + for i in range(len(clusters_unique)): + loc = np.where(clusters == clusters_unique[i])[0] + clusters_next_iter[loc] = np.array(partitions_id_uni[i]) + min_val + min_val = min_val + np.max(np.array(partitions_id_uni[i])) + 1 - clusters = clusters_next_iter.copy() - clusters_unique, cluster_counts = np.unique(clusters, return_counts=True) + clusters = clusters_next_iter.copy() + clusters_unique, cluster_counts = np.unique(clusters, return_counts=True) return clusters