Skip to content

Commit fef8a95

Browse files
committed
ENH: initial uncertainty sampling commit
- add prediction tools (may still need to be updated with more recent changes)
- update colvar and uncertainty files to work with the newer MACE version and other changes
1 parent 7b63605 commit fef8a95

3 files changed

Lines changed: 637 additions & 40 deletions

File tree

nff/md/colvars.py

Lines changed: 91 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import itertools
88
from itertools import repeat
9-
from typing import TYPE_CHECKING
9+
from typing import TYPE_CHECKING, Dict, Optional, Tuple
1010

1111
import numpy as np
1212
import torch
@@ -16,14 +16,15 @@
1616

1717
from nff.io.ase import AtomsBatch
1818
from nff.train import load_model
19+
from nff.train.evaluate import evaluate
1920
from nff.train.uncertainty import (
2021
EnsembleUncertainty,
2122
EvidentialUncertainty,
2223
GMMUncertainty,
2324
MVEUncertainty,
2425
)
2526
from nff.utils.cuda import batch_to
26-
from nff.utils.prediction import get_prediction, get_residual
27+
from nff.utils.prediction import evaluate_mace, get_prediction, get_residual
2728
from nff.utils.scatter import compute_grad
2829

2930
if TYPE_CHECKING:
@@ -152,21 +153,44 @@ def _init_uncertainty(self):
152153

153154
if self.info_dict.get("uncertainty_type") == "gmm" and self.unc_class.is_fitted() is False:
154155
print("COLVAR: Doing train prediction")
155-
_, train_predicted = get_prediction(
156-
model=self.model,
157-
dset=self.info_dict["train_dset"],
158-
batch_size=self.info_dict["batch_size"],
159-
device=self.device,
160-
requires_grad=False,
161-
)
156+
if any(c in self.model.__repr__() for c in ["Painn", "SchNet"]):
157+
train_predicted, _train_targs, _loss = evaluate(
158+
model=self.model,
159+
loader=self.info_dict["train_dset"],
160+
loss_fn=self.info_dict["loss_fn"],
161+
device=self.device,
162+
requires_embedding=True,
163+
)
162164

163-
train_embedding = train_predicted["embedding"][0].detach().cpu().squeeze()
164-
train_atomic_numbers = torch.cat(
165-
[torch.LongTensor(at.get_atomic_numbers()) for at in self.info_dict["train_dset"]]
166-
)
165+
# GMM requires a 2D tensor for the embeddings, with the per-sample embeddings concatenated along the first dimension
166+
train_embedding = torch.concat(train_predicted["embedding"])
167+
168+
elif "MACE" in self.model.__repr__():
169+
_, train_predicted = evaluate_mace(
170+
model=self.model,
171+
dset=self.info_dict["train_dset"],
172+
batch_size=self.info_dict["batch_size"],
173+
device=self.device,
174+
embedding_kwargs=self.info_dict["uncertainty_params"]["embedding_kwargs"],
175+
)
176+
177+
train_embedding = train_predicted["embeddings"].detach().cpu().squeeze()
178+
# print("COLVAR: Doing train prediction")
179+
# _, train_predicted = get_prediction(
180+
# model=self.model,
181+
# dset=self.info_dict["train_dset"],
182+
# batch_size=self.info_dict["batch_size"],
183+
# device=self.device,
184+
# requires_grad=False,
185+
# )
186+
187+
# train_embedding = train_predicted["embedding"][0].detach().cpu().squeeze()
188+
# train_atomic_numbers = torch.cat(
189+
# [torch.LongTensor(at.get_atomic_numbers()) for at in self.info_dict["train_dset"]]
190+
# )
167191

168192
print("COLVAR: Fitting GMM")
169-
self.unc_class.fit_gmm(train_embedding, train_atomic_numbers)
193+
self.unc_class.fit_gmm(train_embedding)
170194

171195
self.calibrate = self.info_dict["uncertainty_params"].get("calibrate", False)
172196
if self.calibrate:
@@ -667,7 +691,56 @@ def energy_gap(self, enkey1: str, enkey2: str):
667691

668692
return cv, cv_grad
669693

670-
def forward(self, atoms: Atoms) -> tuple[np.ndarray, np.ndarray]:
694+
def uncertainty(self, atoms: Atoms, pred=None, return_grad: bool = True) -> Tuple[torch.Tensor, torch.Tensor]:
695+
if pred is None:
696+
_, pred = get_prediction(
697+
self.model,
698+
dset=[atoms],
699+
batch_size=self.info_dict["batch_size"],
700+
device=self.device,
701+
get_target=False,
702+
requires_grad=True,
703+
pool_embedding=True,
704+
)
705+
706+
# get neighbor list
707+
atoms.update_nbr_list()
708+
pred["nbr_list"] = torch.LongTensor(atoms.nbr_list).to(self.device)
709+
710+
# get atomic numbers
711+
pred["test_atomic_numbers"] = torch.LongTensor(atoms.get_atomic_numbers())
712+
713+
uncertainty = self.unc_class(
714+
results=pred,
715+
num_atoms=pred["num_atoms"],
716+
reset_min_uncertainty=False,
717+
device=self.device,
718+
)
719+
720+
if return_grad is False:
721+
return uncertainty, None
722+
723+
if not uncertainty.requires_grad:
724+
uncertainty.requires_grad = True
725+
726+
uncertainty_grad = compute_grad(
727+
inputs=pred["xyz"],
728+
output=uncertainty,
729+
allow_unused=True,
730+
)
731+
if uncertainty_grad is None:
732+
uncertainty_grad = torch.zeros_like(pred["xyz"])
733+
734+
# make sure uncertainty is a scalar
735+
uncertainty = uncertainty.sum()
736+
737+
return uncertainty, uncertainty_grad
738+
739+
def forward(
740+
self,
741+
atoms: Atoms,
742+
pred: Optional[Dict] = None, # noqa
743+
) -> tuple[np.ndarray, np.ndarray]:
671744
"""Switch function to call the right CV-func
672745
673746
Args:
@@ -732,6 +805,9 @@ def forward(self, atoms: Atoms) -> tuple[np.ndarray, np.ndarray]:
732805
elif self.info_dict["name"] == "energy_gap":
733806
cv, cv_grad = self.energy_gap(self.info_dict["enkey_1"], self.info_dict["enkey_2"])
734807

808+
elif self.info_dict["name"] == "uncertainty":
809+
cv, cv_grad = self.uncertainty(atoms, pred)
810+
735811
return cv.detach().cpu().numpy(), cv_grad.detach().cpu().numpy()
736812

737813

nff/train/uncertainty.py

Lines changed: 50 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020

2121
from nff.io.gmm import GaussianMixture
2222
from nff.train.evaluate import evaluate
23-
from nff.utils.cuda import batch_detach
2423
from nff.utils.prediction import get_residual
2524

2625
__all__ = [
@@ -42,7 +41,7 @@
4241
class Uncertainty:
4342
"""Base class for uncertainty predictions."""
4443

45-
def __init__( # noqa: D107
44+
def __init__(
4645
self,
4746
order: str,
4847
calibrate: bool,
@@ -69,7 +68,7 @@ def __init__( # noqa: D107
6968

7069
self.CP = ConformalPrediction(alpha=cp_alpha)
7170

72-
def __call__(self, *args, **kwargs): # noqa: D102
71+
def __call__(self, *args, **kwargs):
7372
return self.get_uncertainty(*args, **kwargs)
7473

7574
def set_min_uncertainty(self, min_uncertainty: float, force: bool = False) -> None:
@@ -175,7 +174,7 @@ class ConformalPrediction:
175174
on calibration data and apply to test data during prediction.
176175
"""
177176

178-
def __init__(self, alpha: float): # noqa: D107
177+
def __init__(self, alpha: float):
179178
self.alpha = alpha
180179

181180
def fit(
@@ -225,7 +224,7 @@ class EnsembleUncertainty(Uncertainty):
225224
targ_unit (Union[str, None], optional): Target unit of the quantity. Defaults to None.
226225
"""
227226

228-
def __init__( # noqa: D107
227+
def __init__(
229228
self,
230229
quantity: str,
231230
order: str,
@@ -342,7 +341,7 @@ class for the possible options.
342341
min_uncertainty (Union[float, None], optional): Minimum uncertainty value. Defaults to None.
343342
"""
344343

345-
def __init__( # noqa: D107
344+
def __init__(
346345
self,
347346
order: str = "atomic",
348347
shared_v: bool = False,
@@ -429,7 +428,7 @@ class for the possible options.
429428
min_uncertainty (Union[float, None], optional): Minimum uncertainty value. Defaults to None.
430429
"""
431430

432-
def __init__( # noqa: D107
431+
def __init__(
433432
self,
434433
variance_key: str = "var",
435434
quantity: str = "forces",
@@ -480,7 +479,7 @@ class for the possible options.
480479
gmm_path (Union[str, None], optional): Path to the saved GMM model. Defaults to None.
481480
"""
482481

483-
def __init__( # noqa: D107
482+
def __init__(
484483
self,
485484
train_embed_key: str = "train_embedding",
486485
test_embed_key: str = "embedding",
@@ -701,30 +700,56 @@ def get_unc_class(model: torch.nn.Module, info_dict: dict) -> Uncertainty:
701700
# to refit it
702701
if info_dict.get("uncertainty_type") == "gmm" and unc_class.is_fitted() is False:
703702
print("GMM: Doing train prediction")
704-
train_predicted, _train_targs, _loss = evaluate(
705-
model=model,
706-
loader=info_dict["train_dset"],
707-
loss_fn=info_dict["loss_fn"],
708-
device=device,
709-
requires_embedding=True,
710-
)
703+
if any(c in model.__repr__() for c in ["Painn", "SchNet"]):
704+
train_predicted, _train_targs, _loss = evaluate(
705+
model=model,
706+
loader=info_dict["train_dset"],
707+
loss_fn=info_dict["loss_fn"],
708+
device=device,
709+
requires_embedding=True,
710+
)
711+
712+
# GMM requires a 2D tensor for the embeddings, with the per-sample embeddings concatenated along the first dimension
713+
train_embedding = torch.concat(train_predicted["embedding"])
714+
715+
elif "MACE" in model.__repr__():
716+
_, train_predicted = evaluate(
717+
model=model,
718+
dset=info_dict["train_dset"],
719+
batch_size=info_dict["batch_size"],
720+
device=device,
721+
embedding_kwargs=info_dict["uncertainty_params"]["embedding_kwargs"],
722+
)
711723

712-
# GMM requires a 2D tensor for the embeddings, with the
713-
train_embedding = torch.stack([t.flatten() for t in train_predicted["embedding"]], dim=0)
724+
train_embedding = train_predicted["embeddings"].detach().cpu().squeeze()
714725

715726
print("COLVAR: Fitting GMM")
716727
unc_class.fit_gmm(train_embedding)
717728
calibrate = info_dict["uncertainty_params"].get("calibrate", False)
718-
if calibrate and unc_class.CP.qhat is None:
729+
if calibrate and (not hasattr(unc_class.CP, "qhat") or unc_class.CP.qhat is None):
719730
print("COLVAR: Fitting ConformalPrediction")
720-
calib_target, calib_predicted = evaluate(
721-
model=model,
722-
dset=info_dict["calib_dset"],
723-
batch_size=info_dict["batch_size"],
724-
device=device,
725-
embedding_kwargs=info_dict["uncertainty_params"]["embedding_kwargs"],
726-
)
731+
if any(c in model.__repr__() for c in ["Painn", "SchNet"]):
732+
calib_predicted, calib_target, _loss = evaluate(
733+
model=model,
734+
loader=info_dict["calib_dset"],
735+
loss_fn=info_dict["loss_fn"],
736+
device=device,
737+
requires_embedding=True,
738+
)
739+
740+
elif "MACE" in model.__repr__():
741+
calib_target, calib_predicted = evaluate(
742+
model=model,
743+
dset=info_dict["calib_dset"],
744+
batch_size=info_dict["batch_size"],
745+
device=device,
746+
embedding_kwargs=info_dict["uncertainty_params"]["embedding_kwargs"],
747+
)
748+
727749
# calib_predicted["embeddings"] = calib_predicted["embeddings"][0]
750+
print(calib_predicted.keys())
751+
print(len(calib_predicted[unc_class.test_key]))
752+
print(calib_predicted[unc_class.test_key][0].shape)
728753
calib_uncertainty = (
729754
unc_class(
730755
results=calib_predicted,

0 commit comments

Comments
 (0)