From 19e62b74385fdcdb0f589f87f0acd7dfb305278f Mon Sep 17 00:00:00 2001 From: jason Date: Sat, 23 Jan 2016 22:19:09 -0600 Subject: [PATCH 1/3] Added functionality to allow X to be a precomputed distance matrix. This is useful if the distance metric is expensive to calculate and the user wants to speed up multiple runs of DeBaCl by inputting precomputed distance values. In some odd instances, data may not be easily represented in N dimensional space but distance values for the data may be readily available. This code provides functionality for those less common use cases. --- debacl/level_set_tree.py | 68 ++++++++++++++++++++++++++++++++++++++++ debacl/utils.py | 23 ++++++++++++-- 2 files changed, 88 insertions(+), 3 deletions(-) diff --git a/debacl/level_set_tree.py b/debacl/level_set_tree.py index cdeef9d..e7ae3c3 100644 --- a/debacl/level_set_tree.py +++ b/debacl/level_set_tree.py @@ -1260,6 +1260,74 @@ def construct_tree(X, k, prune_threshold=None, num_levels=None, verbose=False): return tree +def construct_tree_from_precomputed_matrix(X, p, k, prune_threshold=None, num_levels=None, verbose=False): + """ + Construct a level set tree from tabular data. + + Parameters + ---------- + X : 2-dimensional numpy array + A square distance matrix + + p: int + Underlying dimensionality of the data that was used to create X, the distance matrix + + k : int + Number of observations to consider as neighbors to a given point. + + prune_threshold : int, optional + Leaf nodes with fewer than this number of members are recursively + merged into larger nodes. If 'None' (the default), then no pruning + is performed. + + num_levels : int, optional + Number of density levels in the constructed tree. If None (default), + `num_levels` is internally set to be the number of rows in `X`. + + verbose : bool, optional + If True, a progress indicator is printed at every 100th level of tree + construction. + + Returns + ------- + T : LevelSetTree + A pruned level set tree. + + See Also + -------- + construct_tree, construct_tree_from_graph, LevelSetTree + + Examples + -------- + >>> X = numpy.random.rand(100, 2) + >>> @numpy.vectorize + >>> def euclidean(i,j): + >>> return scipy.spatial.distance.euclidean(X[i], X[j]) + >>> X = numpy.fromfunction(euclidean, (100,100)) + >>> tree = debacl.construct_tree_from_precomputed_matrix(X, p=2, k=8, prune_threshold=5) + >>> print(tree) + +----+-------------+-----------+------------+----------+------+--------+----------+ + | id | start_level | end_level | start_mass | end_mass | size | parent | children | + +----+-------------+-----------+------------+----------+------+--------+----------+ + | 0 | 0.000 | 0.870 | 0.000 | 0.450 | 100 | None | [3, 4] | + | 3 | 0.870 | 3.364 | 0.450 | 0.990 | 17 | 0 | [] | + | 4 | 0.870 | 1.027 | 0.450 | 0.520 | 35 | 0 | [7, 8] | + | 7 | 1.027 | 1.755 | 0.520 | 0.870 | 8 | 4 | [] | + | 8 | 1.027 | 3.392 | 0.520 | 1.000 | 23 | 4 | [] | + +----+-------------+-----------+------------+----------+------+--------+----------+ + """ + + sim_graph, radii = _utl.knn_graph(X, k, method='precomputed') + + n, _ = X.shape + density = _utl.knn_density(radii, n, p, k) + + tree = construct_tree_from_graph(adjacency_list=sim_graph, density=density, + prune_threshold=prune_threshold, + num_levels=num_levels, verbose=verbose) + return tree + + def construct_tree_from_graph(adjacency_list, density, prune_threshold=None, num_levels=None, verbose=False): """ diff --git a/debacl/utils.py b/debacl/utils.py index 7552c9b..6776206 100644 --- a/debacl/utils.py +++ b/debacl/utils.py @@ -38,12 +38,13 @@ def knn_graph(X, k, method='brute_force', leaf_size=30): Parameters ---------- X : numpy array | list [numpy arrays] - Data points, with each row as an observation. + Data points, with each row as an observation + OR a distance matrix if method='precomputed' k : int The number of points to consider as neighbors of any given observation. - method : {'brute-force', 'kd-tree', 'ball-tree'}, optional + method : {'brute-force', 'kd-tree', 'ball-tree', 'precomputed'}, optional Computing method. - 'brute-force': computes the (Euclidean) distance between all O(n^2) @@ -61,11 +62,14 @@ def knn_graph(X, k, method='brute_force', leaf_size=30): distances. Typically much faster than 'brute-force', and works with up to a few hundred dimensions. Requires the scikit-learn library. + - 'precomputed': used when the matrix X is a precomputed distance + matrix. + leaf_size : int, optional For the 'kd-tree' and 'ball-tree' methods, the number of observations in the leaf nodes. Leaves are not split further, so distance computations within leaf nodes are done by brute force. 'leaf_size' is - ignored for the 'brute-force' method. + ignored for the 'brute-force' and 'precomputed' methods. Returns ------- @@ -109,6 +113,19 @@ def knn_graph(X, k, method='brute_force', leaf_size=30): raise ImportError("The scikit-learn library could not be loaded." + " It is required for the 'ball-tree' method.") + if method == 'precomputed': + # Pseudo-sort each row of the distance matrix so that the indices of the closest k elements are listed first + # The entries [...,:k] are not in sorted order by distance + neighbors = _np.argpartition(X, k)[:, 0:k] + + # Get the distance values for the first k elements, and then sort 'neighbors' by those distance values + radii = X[_np.arange(n)[:, None], neighbors[:]] + order = radii.argsort(axis=1) + neighbors = neighbors[_np.arange(n)[:, None], order] + + radii = radii[_np.arange(n)[:, None], order] + radii = radii[:, -1] + else: # assume brute-force if not _HAS_SCIPY: raise ImportError("The 'scipy' module could not be loaded. " + From 23fb21ad754c2666e2e8da4dd5fa4b128fcc10a9 Mon Sep 17 00:00:00 2001 From: jason Date: Sat, 23 Jan 2016 22:33:16 -0600 Subject: [PATCH 2/3] modified example code to use scipy rather than a custom defined function --- debacl/level_set_tree.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/debacl/level_set_tree.py b/debacl/level_set_tree.py index e7ae3c3..06a0889 100644 --- a/debacl/level_set_tree.py +++ b/debacl/level_set_tree.py @@ -1300,10 +1300,8 @@ def construct_tree_from_precomputed_matrix(X, p, k, prune_threshold=None, num_le Examples -------- >>> X = numpy.random.rand(100, 2) - >>> @numpy.vectorize - >>> def euclidean(i,j): - >>> return scipy.spatial.distance.euclidean(X[i], X[j]) - >>> X = numpy.fromfunction(euclidean, (100,100)) + >>> X = scipy.spatial.distance.pdist(data) + >>> X = scipy.spatial.distance.squareform(matrix) >>> tree = debacl.construct_tree_from_precomputed_matrix(X, p=2, k=8, prune_threshold=5) >>> print(tree) +----+-------------+-----------+------------+----------+------+--------+----------+ From 79cf95e66787afb532cc76a74b2ca60df5138cad Mon Sep 17 00:00:00 2001 From: jason Date: Mon, 7 Mar 2016 12:10:25 -0600 Subject: [PATCH 3/3] added test functionality, cleanup and pep8 --- debacl/__init__.py | 1 + debacl/level_set_tree.py | 21 +++++++++++++-------- debacl/test/test_lst.py | 13 +++++++++++++ debacl/test/test_utils.py | 10 ++++++++++ debacl/utils.py | 20 +++++++++----------- 5 files changed, 46 insertions(+), 19 deletions(-) diff --git a/debacl/__init__.py b/debacl/__init__.py index 5e13b62..4b65494 100644 --- a/debacl/__init__.py +++ b/debacl/__init__.py @@ -19,6 +19,7 @@ from debacl.level_set_tree import construct_tree from debacl.level_set_tree import construct_tree_from_graph +from debacl.level_set_tree import construct_tree_from_distance_matrix from debacl.level_set_tree import load_tree from debacl.level_set_tree import LevelSetTree diff --git a/debacl/level_set_tree.py b/debacl/level_set_tree.py index 06a0889..9e92903 100644 --- a/debacl/level_set_tree.py +++ b/debacl/level_set_tree.py @@ -66,8 +66,9 @@ class LevelSetTree(object): LevelSetTree objects should not generally be instantiated directly, because they will contain empty node hierarchies. Use the tree - constructors :func:`construct_tree` or - :func:`construct_tree_from_graph` to instantiate a LevelSetTree model. + constructors :func:`construct_tree`, :func:`construct_tree_from_graph`, + or :func: `construct_tree_from_distance_matrix` to instantiate a + LevelSetTree model. Parameters ---------- @@ -1260,9 +1261,10 @@ def construct_tree(X, k, prune_threshold=None, num_levels=None, verbose=False): return tree -def construct_tree_from_precomputed_matrix(X, p, k, prune_threshold=None, num_levels=None, verbose=False): +def construct_tree_from_distance_matrix(X, p, k, prune_threshold=None, + num_levels=None, verbose=False): """ - Construct a level set tree from tabular data. + Construct a level set tree from a precomputed distance matrix. Parameters ---------- @@ -1270,7 +1272,8 @@ def construct_tree_from_precomputed_matrix(X, p, k, prune_threshold=None, num_le A square distance matrix p: int - Underlying dimensionality of the data that was used to create X, the distance matrix + Underlying dimensionality of the data that was used to create X, the + distance matrix k : int Number of observations to consider as neighbors to a given point. @@ -1300,9 +1303,11 @@ def construct_tree_from_precomputed_matrix(X, p, k, prune_threshold=None, num_le Examples -------- >>> X = numpy.random.rand(100, 2) - >>> X = scipy.spatial.distance.pdist(data) - >>> X = scipy.spatial.distance.squareform(matrix) - >>> tree = debacl.construct_tree_from_precomputed_matrix(X, p=2, k=8, prune_threshold=5) + >>> distances = scipy.spatial.distance.pdist(X) + >>> distance_matrix = scipy.spatial.distance.squareform(distances) + >>> tree = debacl.construct_tree_from_distance_matrix(distance_matrix, p=2, + ... k=8, + ... prune_threshold=5) >>> print(tree) +----+-------------+-----------+------------+----------+------+--------+----------+ | id | start_level | end_level | start_mass | end_mass | size | parent | children | diff --git a/debacl/test/test_lst.py b/debacl/test/test_lst.py index a950ed6..0cb76e4 100644 --- a/debacl/test/test_lst.py +++ b/debacl/test/test_lst.py @@ -183,6 +183,19 @@ def test_construct_from_data(self): self._check_tree_viability(tree) self._check_tree_correctness(tree) + def test_construct_from_distance_matrix(self): + """ + Check viability and correctness of an LST constructed from a distance + matrix. + """ + distances = scipy.spatial.distance.pdist(self.dataset) + distance_matrix = scipy.spatial.distance.squareform(distances) + tree = dcl.construct_tree_from_distance_matrix(distance_matrix, self.k, + prune_threshold=self.gamma) + + self._check_tree_viability(tree) + self._check_tree_correctness(tree) + def test_load(self): """ Check viability and correctness of an LST saved then loaded from file. diff --git a/debacl/test/test_utils.py b/debacl/test/test_utils.py index c2c43d8..5d7c421 100644 --- a/debacl/test/test_utils.py +++ b/debacl/test/test_utils.py @@ -75,6 +75,16 @@ def test_knn_graph(self): for neighbors, ans_neighbors in zip(knn, ans_graph): self.assertItemsEqual(neighbors, ans_neighbors) + distances = scipy.spatial.distance.pdist(self.X) + distance_matrix = scipy.spatial.distance.squareform(distances) + knn, radii = utl.knn_graph(distance_matrix, k, method='precomputed') + + ## Test precomputed method + assert_array_equal(radii, ans_radii) + + for neighbors, ans_neighbors in zip(knn, ans_graph): + self.assertItemsEqual(neighbors, ans_neighbors) + def test_epsilon_graph(self): """ Test construction of the epsilon-nearest neighbor graph. diff --git a/debacl/utils.py b/debacl/utils.py index 6776206..b4dd742 100644 --- a/debacl/utils.py +++ b/debacl/utils.py @@ -114,17 +114,15 @@ def knn_graph(X, k, method='brute_force', leaf_size=30): " It is required for the 'ball-tree' method.") if method == 'precomputed': - # Pseudo-sort each row of the distance matrix so that the indices of the closest k elements are listed first - # The entries [...,:k] are not in sorted order by distance - neighbors = _np.argpartition(X, k)[:, 0:k] - - # Get the distance values for the first k elements, and then sort 'neighbors' by those distance values - radii = X[_np.arange(n)[:, None], neighbors[:]] - order = radii.argsort(axis=1) - neighbors = neighbors[_np.arange(n)[:, None], order] - - radii = radii[_np.arange(n)[:, None], order] - radii = radii[:, -1] + if X.ndim != 2 or X.shape[0] != X.shape[1]: + raise ValueError("array 'X' must be square") + + # Pseudo-sort each row of the distance matrix so that the indices of + # the closest k elements are listed first # The entries [...,:k-1] are + # not in sorted order by distance, but the kth entry is guaranteed to + # be in the correct spot + neighbors = _np.argpartition(X, k - 1)[:, :k] + radii = X[_np.arange(n), neighbors[:, -1]] else: # assume brute-force if not _HAS_SCIPY: