def get_dendrogram_edges(df: pd.DataFrame, as_graph: bool = True) -> Union[pd.DataFrame, Any]:
    """Build the edges of a Ward-linkage dendrogram from a feature matrix.

    Runs ``scipy.cluster.hierarchy.linkage(df, 'ward')`` and turns every merge
    in the resulting linkage matrix into two edges, one from the newly created
    cluster node to each of the two children it joins.

    Node ids follow the linkage-matrix numbering: ids ``0 .. len(df) - 1`` are
    the input rows, and the cluster created by merge ``i`` gets id
    ``len(df) + i``.

    Args:
        :df: numeric dataframe of feature embeddings, one row per sample
            (e.g. the matrix produced by umap or featurize)
        :as_graph: whether to return a graphistry graph or a dataframe of edges

    Returns:
        If ``as_graph`` is True, the result of
        ``graphistry.edges(edges, 'node1', 'node2')``; otherwise the edge
        dataframe itself, with columns ``node1`` (merged cluster id),
        ``node2`` (child id) and ``dist`` (merge distance), ``2 * (len(df) - 1)``
        rows and a fresh ``0..n-1`` index.

    Raises:
        ValueError: if ``df`` has fewer than two rows, since there is nothing
            to merge.

    Usage:
        ::
            g = graphistry.edges(edf, 'src', 'dst').nodes(ndf, 'node')
            g2 = g.umap().dbscan()  # or g2 = g.featurize()
            g3 = get_dendrogram_edges(g2.get_matrix(), as_graph=True)
    """
    from scipy.cluster.hierarchy import linkage

    num_samples = len(df)
    if num_samples < 2:
        raise ValueError(
            f"get_dendrogram_edges needs at least 2 rows to cluster, got {num_samples}"
        )

    # One linkage row per merge: the two children, the merge distance, and the
    # size of the resulting cluster.
    merges = pd.DataFrame(linkage(df, 'ward'), columns=['src', 'dst', 'dist', 'size'])

    # The cluster created by merge i is numbered after all original samples.
    merges['new_node'] = merges.index + num_samples

    # Two edges per merge: new cluster -> first child, new cluster -> second child.
    per_child = [
        pd.DataFrame({
            'node1': merges['new_node'],
            'node2': merges[child],
            'dist': merges['dist'],
        })
        for child in ('src', 'dst')
    ]
    # ignore_index avoids every index label appearing twice in the result.
    edges = pd.concat(per_child, ignore_index=True)

    # linkage returns floats; node ids must be integers.
    edges['node1'] = edges['node1'].astype(int)
    edges['node2'] = edges['node2'].astype(int)

    if not as_graph:
        return edges

    # Imported only when a graph is requested, so the dataframe path does not
    # need the top-level package.
    import graphistry
    return graphistry.edges(edges, 'node1', 'node2')
# Class-level mark: skips every test in the class when the AI dependencies are
# missing. NOTE(review): the original also decorated setUp, which is not a
# collected test, so that mark presumably had no effect - confirm.
@pytest.mark.skipif(not has_dbscan, reason="requires ai dependencies")
class TestDendrogram(unittest.TestCase):
    """Checks that get_dendrogram_edges yields a graph with node1/node2 edge columns."""

    def setUp(self) -> None:
        # One clustered graph per featurization kind, in the same order the
        # test iterates over them.
        g = graphistry.nodes(ndf).edges(edf, 'src', 'dst')
        self.gs = [
            g.umap(kind=kind, n_topics=2, dbscan=False).dbscan(kind=kind, verbose=True)
            for kind in ['nodes', 'edges']
        ]

    def testDendrogramToGraph(self):
        for kind, g2 in zip(['nodes', 'edges'], self.gs):
            g3 = get_dendrogram_edges(g2.get_matrix(kind=kind))
            self.assertIn('node1', g3._edges, 'dendrogram graph has no `node1` column')
            self.assertIn('node2', g3._edges, 'dendrogram graph has no `node2` column')