-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclassifier.py
More file actions
190 lines (155 loc) · 7.39 KB
/
classifier.py
File metadata and controls
190 lines (155 loc) · 7.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
"""This script performs a multi-label classification on node embeddings using the
same experimental settings as in the node2vec paper."""
# pylint: disable=line-too-long,import-error, ungrouped-imports
# Copyright (c) 2025 Emanuele Petriglia <inbox@emanuelepetriglia.com>
# All rights reserved. This file is licensed under the MIT license.
import argparse
from pathlib import Path
import numpy as np
from sklearn.linear_model import LogisticRegression
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
import scipy.io
from gensim.models import KeyedVectors
from tqdm import tqdm
def load_node2vec_emb(file_path):
    """Read node embeddings stored in word2vec text format.

    Args:
        file_path (Path): The path to the embedding file.

    Returns:
        node_map (dict): A dictionary mapping node IDs to their embeddings.
        num_nodes (int): The total number of nodes.
        emb_dim (int): The dimensionality of the embeddings.
    """
    # gensim parses the word2vec text header and vectors for us.
    model = KeyedVectors.load_word2vec_format(str(file_path), binary=False)

    # Keys in the file are node IDs serialized as strings; index the
    # embeddings by their integer ID instead.
    node_map = {}
    for key in model.index_to_key:
        node_map[int(key)] = model[key]

    return node_map, len(node_map), model.vector_size
# pylint: disable=too-many-locals, too-many-statements
def _tune_thresholds(y_true, y_pred_proba):
    """Pick a per-label decision threshold on validation data.

    For each label, scan thresholds from 0.05 to 0.95 in steps of 0.05 and
    keep the one that maximizes the binary F1 score for that label; 0.5 is
    the starting default.

    Args:
        y_true (np.ndarray): Binary validation labels, shape (n_samples, n_labels).
        y_pred_proba (np.ndarray): Predicted probabilities, same shape.

    Returns:
        np.ndarray: Best threshold per label, shape (n_labels,).
    """
    thresholds = np.arange(0.05, 1.0, 0.05)
    best_thresholds = np.zeros(y_true.shape[1])
    for j in range(y_true.shape[1]):
        best_f1 = -1
        best_thresh = 0.5
        for threshold in thresholds:
            y_pred = (y_pred_proba[:, j] >= threshold).astype(int)
            f1 = f1_score(
                y_true[:, j], y_pred, average="binary", zero_division=0
            )
            if f1 > best_f1:
                best_f1 = f1
                best_thresh = threshold
        best_thresholds[j] = best_thresh
    return best_thresholds


def main(emb, mat, train_size, cpus):
    """Trains and evaluates a multi-label classifier on node embeddings.

    Runs 10 random train/test splits (as in the node2vec paper), trains a
    one-vs-rest L2-regularized logistic regression per split, tunes per-label
    decision thresholds on a validation set, and reports the average Macro-F1
    and Micro-F1 on the held-out test sets.

    Args:
        emb (Path): Path to the input graph embedding file (.emb).
        mat (Path): Path to the input original labels file (.mat).
        train_size (float): Percentage of data to use for training.
        cpus (int): Number of CPUs to use for parallel processing.
    """
    # Load node2vec embeddings.
    node_map, num_nodes, num_features = load_node2vec_emb(emb)
    print(f"Loaded embeddings: {num_nodes} nodes with {num_features} dimensions")

    # Load multi-label ground truth; "group" is the label matrix key in the
    # .mat file, stored sparse and densified here.
    mat_data = scipy.io.loadmat(mat)
    y = mat_data["group"].toarray()
    num_labels = y.shape[1]
    print(f"Number of nodes in label matrix: {y.shape[0]}")
    print(f"Number of labels: {num_labels}")

    # Align node IDs with the labels: row i of the feature matrix is node i.
    # NOTE(review): assumes every ID 0..(num_rows-1) is present in the
    # embedding file — a missing ID raises KeyError here. Confirm upstream.
    x_aligned = np.array([node_map[i] for i in range(y.shape[0])])
    if np.isnan(x_aligned).any():
        print("NaN values found in aligned feature matrix. Replacing with zeros...")
        # Replace NaN values with zeros. This anyway should not happen.
        x_aligned = np.nan_to_num(x_aligned, nan=0.0)

    # Print some statistics about labels.
    label_counts = y.sum(axis=0)
    print("\nLabel distribution summary:")
    print(f"Min labels per class: {min(label_counts)}")
    print(f"Max labels per class: {max(label_counts)}")
    print(f"Avg labels per class: {np.mean(label_counts):.2f}")

    # num_random_instances is from node2vec original paper.
    num_random_instances = 10
    macro_f1_scores = np.zeros(num_random_instances)
    micro_f1_scores = np.zeros_like(macro_f1_scores)

    train_percent = int(train_size * 100)
    test_percent = int((1 - train_size) * 100)
    print(
        f"\nRunning multi-label classification for {num_random_instances} random instances ({train_percent}-{test_percent} split)..."
    )

    with tqdm(total=num_random_instances) as pbar:
        for i in range(num_random_instances):
            # First split: create a held-out test set. This part of the data
            # is not used for training or hyperparameter tuning, so it gives
            # an unbiased evaluation of the final model.
            x_train_val, y_train_val, x_test, y_test = iterative_train_test_split(
                x_aligned, y, test_size=1 - train_size
            )

            # Second split: half of the remaining data trains the classifier,
            # the other half tunes the per-label thresholds, so no information
            # leaks from the test set.
            # NOTE(review): with the default train_size=0.5 this means only
            # 25% of all nodes actually train the classifier — confirm this
            # matches the intended experimental protocol.
            x_train, y_train, x_val, y_val = iterative_train_test_split(
                x_train_val, y_train_val, test_size=0.5
            )

            # L2-regularized logistic regression with a one-vs-rest strategy:
            # the only classifier details the original paper specifies.
            base_classifier = LogisticRegression(
                penalty="l2", C=100, solver="liblinear", max_iter=1000
            )
            classifier = OneVsRestClassifier(base_classifier, n_jobs=cpus)
            classifier.fit(x_train, y_train)

            # Tune thresholds on the validation set, then apply them to the
            # test-set probabilities (thresholds broadcast over the label axis).
            y_val_pred_proba = classifier.predict_proba(x_val)
            best_thresholds = _tune_thresholds(y_val, y_val_pred_proba)
            y_test_pred_proba = classifier.predict_proba(x_test)
            y_pred = (y_test_pred_proba >= best_thresholds).astype(int)

            # Calculate F1 scores, save them and update the progress bar.
            macro_f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
            micro_f1 = f1_score(y_test, y_pred, average="micro", zero_division=0)
            macro_f1_scores[i] = macro_f1
            micro_f1_scores[i] = micro_f1
            pbar.set_postfix(macro_f1=f"{macro_f1:.2f}", micro_f1=f"{micro_f1:.2f}")
            pbar.update(1)

    average_macro_f1 = np.mean(macro_f1_scores)
    std_macro_f1 = np.std(macro_f1_scores)
    average_micro_f1 = np.mean(micro_f1_scores)
    std_micro_f1 = np.std(micro_f1_scores)
    print(f"Embedding: {emb.as_posix()!r}")
    print(f"Avg Macro-F1 score: {average_macro_f1:.4f} (std {std_macro_f1:.4f})")
    print(f"Avg Micro-F1 score: {average_micro_f1:.4f} (std {std_micro_f1:.4f})")
if __name__ == "__main__":
    # CLI entry point: collect the four options and hand them to main().
    cli = argparse.ArgumentParser(description="Run multi-label classification.")
    cli.add_argument("--emb", required=True, type=Path,
                     help="Input graph embedding (.emb file)")
    cli.add_argument("--mat", required=True, type=Path,
                     help="Input original labels (.mat file)")
    cli.add_argument("--train-size", type=float, default=0.5,
                     help="Percentage of data to use for training (0.0 to 1.0). Default is 0.5.")
    cli.add_argument("--cpus", type=int, default=6,
                     help="Number of CPUs to use for parallel processing (default: 6)")
    opts = cli.parse_args()
    main(opts.emb, opts.mat, opts.train_size, opts.cpus)