From 034c1348a02a4ea6b86cc45d40b8c3b139881708 Mon Sep 17 00:00:00 2001 From: motiwari Date: Fri, 15 Aug 2025 18:12:55 -0700 Subject: [PATCH 01/29] Removing dynamic database docstrings --- dynamic_database.py | 159 -------------------------------------------- 1 file changed, 159 deletions(-) diff --git a/dynamic_database.py b/dynamic_database.py index ed6e229..4bb2862 100644 --- a/dynamic_database.py +++ b/dynamic_database.py @@ -37,20 +37,6 @@ def parse_pos(pos_str): raise ValueError(f"Unexpected format for Pos: {pos_str}") @dataclass -""" -Annotation class represents a code annotation with its full name, definition path, -and position details. -Attributes: - full_name (str): The full name of the annotation. - def_path (str): The file path where the annotation is defined. - def_pos (Pos): The starting position of the annotation definition. - def_end_pos (Pos): The ending position of the annotation definition. -Methods: - from_dict(data: Dict) -> Annotation: - Creates an Annotation instance from a dictionary. - to_dict() -> Dict: - Converts the Annotation instance to a dictionary. -""" class Annotation: full_name: str def_path: str @@ -77,27 +63,6 @@ def to_dict(self) -> Dict: } @dataclass -""" -AnnotatedTactic is a data class that represents a tactic with its annotations and states before and after its application. -Attributes: - tactic (str): The tactic applied. - annotated_tactic (Tuple[str, List[Annotation]]): A tuple containing the tactic and a list of annotations. - state_before (str): The state before the tactic is applied. - state_after (str): The state after the tactic is applied. -Methods: - from_dict(cls, data: Dict) -> AnnotatedTactic: - Creates an AnnotatedTactic instance from a dictionary. - Args: - data (Dict): A dictionary containing the keys "tactic", "annotated_tactic", "state_before", and "state_after". - Returns: - AnnotatedTactic: An instance of AnnotatedTactic. - Raises: - ValueError: If the dictionary does not contain the required keys. - to_dict(self) -> Dict: - Converts the AnnotatedTactic instance to a dictionary. - Returns: - Dict: A dictionary representation of the AnnotatedTactic instance. -""" class AnnotatedTactic: tactic: str annotated_tactic: Tuple[str, List[Annotation]] @@ -130,24 +95,6 @@ def to_dict(self) -> Dict: } @dataclass -""" -A class to represent a theorem with its associated metadata. -Attributes: - full_name (str): The full name of the theorem. - file_path (Path): The file path where the theorem is located. - start (Pos): The starting position of the theorem in the file. - end (Pos): The ending position of the theorem in the file. - url (str): The URL associated with the theorem. - commit (str): The commit hash associated with the theorem. - theorem_statement (str, optional): The statement of the theorem. - traced_tactics (Optional[List[AnnotatedTactic]], optional): A list of traced tactics. - difficulty_rating (Optional[float], optional): The difficulty rating of the theorem. -Methods: - __eq__(self, other): Checks if two Theorem instances are equal. - is_same_theorem(self, other: Theorem) -> bool: Checks if two Theorem instances represent the same theorem. - from_dict(cls, data: Dict, url: str, commit: str) -> Theorem: Creates a Theorem instance from a dictionary. - to_dict(self) -> Dict: Converts the Theorem instance to a dictionary. -""" class Theorem: full_name: str file_path: Path @@ -202,28 +149,6 @@ def to_dict(self) -> Dict: } @dataclass -""" -A class representing a Premise with various attributes. 
-Attributes: - full_name (str): The full name of the premise. - code (str): The code associated with the premise. - start (Pos): The starting position of the premise. - end (Pos): The ending position of the premise. - kind (str): The kind or type of the premise. -Methods: - from_dict(cls, data: Dict) -> Premise: - Creates an instance of Premise from a dictionary. - Args: - data (Dict): A dictionary containing the premise data. - Returns: - Premise: An instance of the Premise class. - Raises: - ValueError: If the dictionary does not contain the required keys. - to_dict(self) -> Dict: - Converts the Premise instance to a dictionary. - Returns: - Dict: A dictionary representation of the Premise instance. -""" class Premise: full_name: str code: str @@ -253,26 +178,6 @@ def to_dict(self) -> Dict: } @dataclass -""" -Represents a file containing premises and their associated imports. -Attributes: - path (Path): The file path. - imports (List[str]): A list of import statements. - premises (List[Premise]): A list of premises. -Methods: - from_dict(cls, data: Dict) -> PremiseFile: - Creates an instance of PremiseFile from a dictionary. - Args: - data (Dict): A dictionary containing the keys "path", "imports", and "premises". - Returns: - PremiseFile: An instance of PremiseFile. - Raises: - ValueError: If the dictionary does not contain the required keys. - to_dict(self) -> Dict: - Converts the PremiseFile instance to a dictionary. - Returns: - Dict: A dictionary representation of the PremiseFile instance. -""" class PremiseFile: path: Path imports: List[str] @@ -296,41 +201,6 @@ def to_dict(self) -> Dict: } @dataclass -""" -Repository class represents a repository with various attributes and methods to manage theorems and premise files. -Attributes: - url (str): URL of the repository. - name (str): Name of the repository. - commit (str): Commit hash of the repository. - lean_version (str): Version of Lean used in the repository. - lean_dojo_version (str): Version of Lean Dojo used in the repository. - metadata (Dict[str, str]): Metadata associated with the repository. - proven_theorems (List[Theorem]): List of proven theorems. - sorry_theorems_proved (List[Theorem]): List of sorry theorems that have been proved. - sorry_theorems_unproved (List[Theorem]): List of sorry theorems that are unproved. - premise_files (List[PremiseFile]): List of premise files. - files_traced (List[Path]): List of traced files. - pr_url (Optional[str]): URL of the pull request. -Methods: - __eq__(self, other): Checks equality between two Repository instances. - __hash__(self): Returns the hash value of the Repository instance. - total_theorems(self) -> int: Returns the total number of theorems. - num_proven_theorems(self) -> int: Returns the number of proven theorems. - num_sorry_theorems_proved(self) -> int: Returns the number of sorry theorems that have been proved. - num_sorry_theorems_unproved(self) -> int: Returns the number of sorry theorems that are unproved. - num_sorry_theorems(self) -> int: Returns the total number of sorry theorems. - num_premise_files(self) -> int: Returns the number of premise files. - num_premises(self) -> int: Returns the total number of premises. - num_files_traced(self) -> int: Returns the number of traced files. - get_all_theorems(self) -> List[Theorem]: Returns a list of all theorems. - get_theorem(self, full_name: str, file_path: str) -> Optional[Theorem]: Retrieves a theorem by its full name and file path. - update_theorem(self, theorem: Theorem) -> None: Updates an existing theorem. 
- get_premise_file(self, path: str) -> Optional[PremiseFile]: Retrieves a premise file by its path. - get_file_traced(self, path: str) -> Optional[Path]: Retrieves a traced file by its path. - from_dict(cls, data: Dict) -> Repository: Creates a Repository instance from a dictionary. - to_dict(self) -> Dict: Converts the Repository instance to a dictionary. - change_sorry_to_proven(self, theorem: Theorem, log_file: str) -> None: Changes a sorry theorem to a proven theorem and logs the change. -""" class Repository: url: str name: str @@ -544,35 +414,6 @@ def safe_remove_dir_path(dir_path): raise @dataclass -""" -A class that manages a collection of repositories containing Lean theorem proofs. -The DynamicDatabase class provides functionality for: -1. Managing repositories (adding, retrieving, updating, deleting) -2. Generating merged datasets from multiple repositories -3. Splitting theorem data for training/validation/testing -4. Exporting proofs, corpus data, and metadata -Attributes: - repositories: List of Repository objects managed by the database -Methods: - generate_merged_dataset: Creates a merged dataset from multiple repositories - _merge_corpus: Merges premise files from multiple repositories - _split_data: Splits theorem data using different strategies - _split_randomly: Splits theorems randomly into train/val/test sets - _split_by_premise: Splits theorems based on premises to ensure premise novelty - _export_proofs: Exports theorem proofs in JSON format - _export_traced_files: Exports information about traced files - _export_metadata: Exports metadata about repositories and statistics - add_repository: Adds a new repository to the database - get_repository: Retrieves a repository by URL and commit - update_repository: Updates an existing repository - print_database_contents: Logs the current database contents - delete_repository: Removes a repository from the database - to_dict: Converts the database to a dictionary representation - from_dict: Creates a database instance from a dictionary - to_json: Serializes the database to a JSON file - from_json: Deserializes a database from a JSON file - update_json: Updates an existing JSON file with current database state -""" class DynamicDatabase: repositories: List[Repository] = field(default_factory=list) From a475cfbcf9876aaff1240024a1da44b36b673976 Mon Sep 17 00:00:00 2001 From: motiwari Date: Mon, 25 Aug 2025 09:23:09 -0700 Subject: [PATCH 02/29] Running formatter --- .gitignore | 3 + common.py | 15 +- compute_fisher.py | 22 +- custom_progress.py | 26 +- custom_traced_data.py | 11 +- custom_utils.py | 8 +- dynamic_database.py | 376 ++++-- generate_benchmark_lean4.py | 123 +- generator/datamodule.py | 3 +- generator/model.py | 20 +- leanagent.py | 692 +++++++---- leanagent_utils.py | 3 +- prover/evaluate.py | 7 +- prover/proof_search.py | 30 +- prover/search_tree.py | 3 +- replace_files.sh | 4 +- retrieval/datamodule.py | 25 +- retrieval/evaluate.py | 2 +- retrieval/evaluate_multiple.py | 43 +- retrieval/fisher_computation_module.py | 9 +- retrieval/index.py | 5 +- retrieval/main.py | 43 +- retrieval/model.py | 31 +- run_leanagent.sh | 8 +- tests/test_common.py | 2 +- unittest_dynamic_database.py | 1586 ++++++++++++++++-------- 26 files changed, 2063 insertions(+), 1037 deletions(-) diff --git a/.gitignore b/.gitignore index 99e2eca..757bfd1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ *.pkl retrieval/bm25 +.idea/ +.DS_Store +RAID/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/common.py b/common.py 
index 04a119d..ac50437 100644 --- a/common.py +++ b/common.py @@ -38,10 +38,12 @@ def __post_init__(self) -> None: assert isinstance(self.theorem_full_name, str) assert isinstance(self.theorem_pos, Pos) if self.state is not None: - if not (isinstance(self.state, str) + if not ( + isinstance(self.state, str) and "⊢" in self.state and MARK_START_SYMBOL not in self.state - and MARK_END_SYMBOL not in self.state): + and MARK_END_SYMBOL not in self.state + ): logger.warning(f"Invalid state: {self.state}") assert ( isinstance(self.state, str) @@ -56,9 +58,11 @@ def serialize(self) -> str: return "" return self.state + def escape_regex_special_chars(text): return re.escape(text) + @dataclass(unsafe_hash=True) class Premise: """Premises are "documents" in our retrieval setup.""" @@ -202,7 +206,6 @@ def __init__(self, jsonl_path: str) -> None: dep_graph = nx.DiGraph() self.all_premises = [] - for line in open(jsonl_path): file_data = json.loads(line) path = file_data["path"] @@ -222,7 +225,7 @@ def __init__(self, jsonl_path: str) -> None: self.imported_premises_cache = {} self.fill_cache() - def _get_file(self, path: str) -> File: + def _get_file(self, path: str) -> File: # for some reason, the `path` in the parameter starts with ./ # but the paths in the corpus don't # so we need to remove the ./ @@ -471,7 +474,9 @@ def _is_deepspeed_checkpoint(path: str): def load_checkpoint(model_cls, ckpt_path: str, device, freeze: bool, config: dict): """Handle DeepSpeed checkpoints in model loading.""" if not _is_deepspeed_checkpoint(ckpt_path): - model = model_cls.load_from_checkpoint(ckpt_path, strict=False, **config).to(device) + model = model_cls.load_from_checkpoint(ckpt_path, strict=False, **config).to( + device + ) else: with tempfile.TemporaryDirectory() as dirname: path = os.path.join(dirname, "lightning.cpkt") diff --git a/compute_fisher.py b/compute_fisher.py index 352d080..56b32fe 100644 --- a/compute_fisher.py +++ b/compute_fisher.py @@ -3,6 +3,7 @@ new_data_path = "/" + def main(): """ The main function that drives LeanAgent. 
@@ -23,7 +24,7 @@ def main(): try: logger.info("Calculating Fisher Information Matrix for EWC") ### FISHER INFORMATION MATRIX FOR NEXT EWC - + if not torch.cuda.is_available(): logger.warning("Indexing the corpus using CPU can be very slow.") device = torch.device("cpu") @@ -41,7 +42,9 @@ def main(): try: best_model_path = find_latest_checkpoint() logger.info(f"Found latest checkpoint: {best_model_path}") - best_model = PremiseRetriever.load(best_model_path, device, freeze=False, config=config) + best_model = PremiseRetriever.load( + best_model_path, device, freeze=False, config=config + ) except FileNotFoundError as e: logger.error(f"No checkpoint found: {str(e)}") logger.warning("Using the current model state.") @@ -51,8 +54,8 @@ def main(): fisher_module = FisherComputationModule(best_model) VERY_LONG_TIMEOUT = 7 * 24 * 60 * 60 * 52 # 1 year - os.environ['TORCH_NCCL_ASYNC_ERROR_HANDLING'] = '1' - os.environ['NCCL_TIMEOUT'] = str(VERY_LONG_TIMEOUT * 1000) + os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1" + os.environ["NCCL_TIMEOUT"] = str(VERY_LONG_TIMEOUT * 1000) ddp_strategy = DDPStrategy(timeout=timedelta(seconds=VERY_LONG_TIMEOUT)) # Setup trainer for Fisher computation @@ -79,9 +82,9 @@ def main(): batch_size=BATCH_SIZE, eval_batch_size=64, max_seq_len=1024, - num_workers=4 + num_workers=4, ) - data_module.setup(stage='fit') + data_module.setup(stage="fit") try: logger.info("right before barrier fisher") @@ -92,7 +95,11 @@ def main(): # Save the FIM if needed if fisher_trainer.is_global_zero: - fisher_file_path = os.path.join(RAID_DIR, FISHER_DIR, f"fisher_info_{new_data_path.split('/')[-1]}_distributed.pkl") + fisher_file_path = os.path.join( + RAID_DIR, + FISHER_DIR, + f"fisher_info_{new_data_path.split('/')[-1]}_distributed.pkl", + ) fisher_module.save_fisher_info(fisher_file_path) logger.info(f"Fisher Information Matrix saved at {fisher_file_path}") except Exception as e: @@ -103,5 +110,6 @@ def main(): logger.info(f"An error occurred: {e}", file=sys.stderr) traceback.print_exc() + if __name__ == "__main__": main() diff --git a/custom_progress.py b/custom_progress.py index 6d647c9..7c02ce8 100644 --- a/custom_progress.py +++ b/custom_progress.py @@ -139,7 +139,9 @@ class _Progress(_BaseProgress): def __post_init__(self) -> None: if self.total.__class__ is not self.current.__class__: - raise ValueError("The `total` and `current` instances should be of the same class") + raise ValueError( + "The `total` and `current` instances should be of the same class" + ) def increment_ready(self) -> None: self.total.ready += 1 @@ -147,13 +149,17 @@ def increment_ready(self) -> None: def increment_started(self) -> None: if not isinstance(self.total, _StartedTracker): - raise TypeError(f"`{self.total.__class__.__name__}` doesn't have a `started` attribute") + raise TypeError( + f"`{self.total.__class__.__name__}` doesn't have a `started` attribute" + ) self.total.started += 1 self.current.started += 1 def increment_processed(self) -> None: if not isinstance(self.total, _ProcessedTracker): - raise TypeError(f"`{self.total.__class__.__name__}` doesn't have a `processed` attribute") + raise TypeError( + f"`{self.total.__class__.__name__}` doesn't have a `processed` attribute" + ) self.total.processed += 1 self.current.processed += 1 @@ -162,7 +168,9 @@ def increment_completed(self) -> None: self.current.completed += 1 @classmethod - def from_defaults(cls, tracker_cls: Type[_ReadyCompletedTracker], **kwargs: int) -> "_Progress": + def from_defaults( + cls, tracker_cls: 
Type[_ReadyCompletedTracker], **kwargs: int + ) -> "_Progress": """Utility function to easily create an instance from keyword arguments to both ``Tracker``s.""" return cls(total=tracker_cls(**kwargs), current=tracker_cls(**kwargs)) @@ -244,8 +252,12 @@ class _OptimizerProgress(_BaseProgress): """ - step: _Progress = field(default_factory=lambda: _Progress.from_defaults(_ReadyCompletedTracker)) - zero_grad: _Progress = field(default_factory=lambda: _Progress.from_defaults(_StartedTracker)) + step: _Progress = field( + default_factory=lambda: _Progress.from_defaults(_ReadyCompletedTracker) + ) + zero_grad: _Progress = field( + default_factory=lambda: _Progress.from_defaults(_StartedTracker) + ) @override def reset(self) -> None: @@ -297,4 +309,4 @@ def reset_on_restart(self) -> None: def load_state_dict(self, state_dict: dict) -> None: if state_dict["optimizer"]["step"]["total"]["completed"] == None: state_dict["optimizer"]["step"]["total"]["completed"] = 0 - self.optimizer.load_state_dict(state_dict["optimizer"]) \ No newline at end of file + self.optimizer.load_state_dict(state_dict["optimizer"]) diff --git a/custom_traced_data.py b/custom_traced_data.py index 26e7117..ae569e4 100644 --- a/custom_traced_data.py +++ b/custom_traced_data.py @@ -1,5 +1,4 @@ -"""This module defines traced repos/files/theorems. -""" +"""This module defines traced repos/files/theorems.""" import re import os @@ -1080,7 +1079,7 @@ def from_traced_files( TracedFile.from_traced_file(root_dir, path, repo) for path in tqdm(json_paths) ] - + dependencies = repo.get_dependencies(root_dir) if build_deps: traced_files_graph = _build_dependency_graph(traced_files, root_dir, repo) @@ -1110,7 +1109,7 @@ def save_to_disk(self) -> None: for tf in tqdm(self.traced_files, total=num_traced_files): _save_xml_to_disk(tf) - + @classmethod def load_from_disk( cls, root_dir: Union[str, Path], build_deps: bool = True @@ -1138,7 +1137,7 @@ def load_from_disk( traced_files = [ TracedFile.from_xml(root_dir, path, repo) for path in tqdm(xml_paths) ] - + dependencies = repo.get_dependencies(root_dir) if build_deps: traced_files_graph = _build_dependency_graph(traced_files, root_dir, repo) @@ -1166,4 +1165,4 @@ def get_traced_theorem(self, thm: Theorem) -> Optional[TracedTheorem]: else: assert thm.repo in self.dependencies.values() path = Path(self.name) / LEAN4_PACKAGES_DIR / thm.repo.name / thm.file_path - return self.get_traced_file(path).get_traced_theorem(thm.full_name) \ No newline at end of file + return self.get_traced_file(path).get_traced_theorem(thm.full_name) diff --git a/custom_utils.py b/custom_utils.py index 8f8ea79..dd587a4 100644 --- a/custom_utils.py +++ b/custom_utils.py @@ -1,5 +1,4 @@ -"""Utility functions used internally by LeanDojo. -""" +"""Utility functions used internally by LeanDojo.""" import re import os @@ -20,7 +19,7 @@ @contextmanager def working_directory( - path: Optional[Union[str, Path]] = None + path: Optional[Union[str, Path]] = None, ) -> Generator[Path, None, None]: """Context manager setting the current working directory (CWD) to ``path`` (or a temporary directory if ``path`` is None). @@ -52,6 +51,7 @@ def working_directory( if is_temporary: tmp_dir.__exit__(None, None, None) + @contextmanager def report_critical_failure(msg: str) -> Generator[None, None, None]: """Context manager logging ``msg`` in case of any exception. 
@@ -285,4 +285,4 @@ def to_lean_path(root_dir: Path, path: Path, repo) -> bool: else: # E.g., ".lake/build/ir/Mathlib/LinearAlgebra/Basics.lean" or "build/ir/Mathlib/LinearAlgebra/Basics.lean" assert path.is_relative_to(LEAN4_BUILD_DIR / "ir"), path - return path.relative_to(LEAN4_BUILD_DIR / "ir") \ No newline at end of file + return path.relative_to(LEAN4_BUILD_DIR / "ir") diff --git a/dynamic_database.py b/dynamic_database.py index 4bb2862..d819000 100644 --- a/dynamic_database.py +++ b/dynamic_database.py @@ -12,12 +12,13 @@ from loguru import logger import shutil + def parse_pos(pos_str): """ Parses a position string or list into a Pos object. Args: - pos_str (str or list): The position data, either as a string in the format 'Pos(x, y)' + pos_str (str or list): The position data, either as a string in the format 'Pos(x, y)' or as a list [x, y]. Returns: @@ -28,7 +29,9 @@ def parse_pos(pos_str): """ if isinstance(pos_str, str): # pos_str came from a JSON file - pos_parts = pos_str.replace('Pos', '').replace('(', '').replace(')', '').split(',') + pos_parts = ( + pos_str.replace("Pos", "").replace("(", "").replace(")", "").split(",") + ) return Pos(int(pos_parts[0]), int(pos_parts[1])) elif isinstance(pos_str, list): # pos_str came from a dictionary initialization @@ -36,6 +39,7 @@ def parse_pos(pos_str): else: raise ValueError(f"Unexpected format for Pos: {pos_str}") + @dataclass class Annotation: full_name: str @@ -45,23 +49,26 @@ class Annotation: @classmethod def from_dict(cls, data: Dict) -> Annotation: - if not all(key in data for key in ["full_name", "def_path", "def_pos", "def_end_pos"]): + if not all( + key in data for key in ["full_name", "def_path", "def_pos", "def_end_pos"] + ): raise ValueError("Invalid Annotation data format") return cls( full_name=data["full_name"], def_path=data["def_path"], def_pos=parse_pos(data["def_pos"]), - def_end_pos=parse_pos(data["def_end_pos"]) + def_end_pos=parse_pos(data["def_end_pos"]), ) - + def to_dict(self) -> Dict: return { "full_name": self.full_name, "def_path": self.def_path, "def_pos": repr(self.def_pos), - "def_end_pos": repr(self.def_end_pos) + "def_end_pos": repr(self.def_end_pos), } + @dataclass class AnnotatedTactic: tactic: str @@ -71,29 +78,33 @@ class AnnotatedTactic: @classmethod def from_dict(cls, data: Dict) -> AnnotatedTactic: - if not all(key in data for key in ["tactic", "annotated_tactic", "state_before", "state_after"]): + if not all( + key in data + for key in ["tactic", "annotated_tactic", "state_before", "state_after"] + ): raise ValueError("Invalid AnnotatedTactic data format") return cls( tactic=data["tactic"], annotated_tactic=( data["annotated_tactic"][0], - [Annotation.from_dict(a) for a in data["annotated_tactic"][1]] + [Annotation.from_dict(a) for a in data["annotated_tactic"][1]], ), state_before=data["state_before"], - state_after=data["state_after"] + state_after=data["state_after"], ) - + def to_dict(self) -> Dict: return { "tactic": self.tactic, "annotated_tactic": [ self.annotated_tactic[0], - [a.to_dict() for a in self.annotated_tactic[1]] + [a.to_dict() for a in self.annotated_tactic[1]], ], "state_before": self.state_before, - "state_after": self.state_after + "state_after": self.state_after, } + @dataclass class Theorem: full_name: str @@ -112,10 +123,12 @@ def __eq__(self, other): return self.is_same_theorem(other) def is_same_theorem(self, other: Theorem) -> bool: - return (self.full_name == other.full_name and - self.file_path == other.file_path and - self.start == other.start and - self.end == 
other.end) + return ( + self.full_name == other.full_name + and self.file_path == other.file_path + and self.start == other.start + and self.end == other.end + ) @classmethod def from_dict(cls, data: Dict, url: str, commit: str) -> Theorem: @@ -132,9 +145,9 @@ def from_dict(cls, data: Dict, url: str, commit: str) -> Theorem: traced_tactics=[ AnnotatedTactic.from_dict(t) for t in data.get("traced_tactics", []) ], - difficulty_rating=data.get("difficulty_rating") + difficulty_rating=data.get("difficulty_rating"), ) - + def to_dict(self) -> Dict: return { "full_name": self.full_name, @@ -145,9 +158,10 @@ def to_dict(self) -> Dict: "url": self.url, "commit": self.commit, "traced_tactics": [t.to_dict() for t in (self.traced_tactics or [])], - "difficulty_rating": self.difficulty_rating + "difficulty_rating": self.difficulty_rating, } + @dataclass class Premise: full_name: str @@ -158,25 +172,28 @@ class Premise: @classmethod def from_dict(cls, data: Dict) -> Premise: - if not all(key in data for key in ["full_name", "code", "start", "end", "kind"]): + if not all( + key in data for key in ["full_name", "code", "start", "end", "kind"] + ): raise ValueError("Invalid Premise data format") return cls( full_name=data["full_name"], code=data["code"], start=parse_pos(data["start"]), end=parse_pos(data["end"]), - kind=data["kind"] + kind=data["kind"], ) - + def to_dict(self) -> Dict: return { "full_name": self.full_name, "code": self.code, "start": repr(self.start), "end": repr(self.end), - "kind": self.kind + "kind": self.kind, } + @dataclass class PremiseFile: path: Path @@ -190,16 +207,17 @@ def from_dict(cls, data: Dict) -> PremiseFile: return cls( path=Path(data["path"]), imports=data["imports"], - premises=[Premise.from_dict(p) for p in data["premises"]] + premises=[Premise.from_dict(p) for p in data["premises"]], ) - + def to_dict(self) -> Dict: return { "path": str(self.path), "imports": self.imports, - "premises": [p.to_dict() for p in self.premises] + "premises": [p.to_dict() for p in self.premises], } + @dataclass class Repository: url: str @@ -218,14 +236,24 @@ class Repository: def __eq__(self, other): if not isinstance(other, Repository): return NotImplemented - return (self.url == other.url and - self.name == other.name and - self.commit == other.commit and - self.lean_version == other.lean_version and - self.lean_dojo_version == other.lean_dojo_version) + return ( + self.url == other.url + and self.name == other.name + and self.commit == other.commit + and self.lean_version == other.lean_version + and self.lean_dojo_version == other.lean_dojo_version + ) def __hash__(self): - return hash((self.url, self.name, self.commit, self.lean_version, self.lean_dojo_version)) + return hash( + ( + self.url, + self.name, + self.commit, + self.lean_version, + self.lean_dojo_version, + ) + ) @property def total_theorems(self) -> int: @@ -246,7 +274,7 @@ def num_sorry_theorems_unproved(self) -> int: @property def num_sorry_theorems(self) -> int: return self.num_sorry_theorems_proved + self.num_sorry_theorems_unproved - + @property def num_premise_files(self) -> int: return len(self.premise_files) @@ -258,26 +286,41 @@ def num_premises(self) -> int: @property def num_files_traced(self) -> int: return len(self.files_traced) - + @property def get_all_theorems(self) -> List[Theorem]: - return self.proven_theorems + self.sorry_theorems_proved + self.sorry_theorems_unproved - + return ( + self.proven_theorems + + self.sorry_theorems_proved + + self.sorry_theorems_unproved + ) + def get_theorem(self, 
full_name: str, file_path: str) -> Optional[Theorem]: - for thm_list in [self.proven_theorems, self.sorry_theorems_proved, self.sorry_theorems_unproved]: + for thm_list in [ + self.proven_theorems, + self.sorry_theorems_proved, + self.sorry_theorems_unproved, + ]: for thm in thm_list: - if thm.full_name == full_name and (str(thm.file_path) == file_path or (file_path == "" and str(thm.file_path) == ".")): + if thm.full_name == full_name and ( + str(thm.file_path) == file_path + or (file_path == "" and str(thm.file_path) == ".") + ): return thm return None - + def update_theorem(self, theorem: Theorem) -> None: - for thm_list in [self.proven_theorems, self.sorry_theorems_proved, self.sorry_theorems_unproved]: + for thm_list in [ + self.proven_theorems, + self.sorry_theorems_proved, + self.sorry_theorems_unproved, + ]: for i, thm in enumerate(thm_list): if thm.is_same_theorem(theorem): thm_list[i] = theorem return raise ValueError(f"Theorem '{theorem.full_name}' not found.") - + def get_premise_file(self, path: str) -> Optional[PremiseFile]: return next((pf for pf in self.premise_files if str(pf.path) == path), None) @@ -286,15 +329,27 @@ def get_file_traced(self, path: str) -> Optional[Path]: @classmethod def from_dict(cls, data: Dict) -> Repository: - if not all(key in data for key in ["url", "name", "commit", "lean_version", "lean_dojo_version", "metadata"]): + if not all( + key in data + for key in [ + "url", + "name", + "commit", + "lean_version", + "lean_dojo_version", + "metadata", + ] + ): raise ValueError("Invalid Repository data format") if "date_processed" not in data["metadata"]: raise ValueError("Metadata must contain the 'date_processed' key") metadata = data["metadata"].copy() if isinstance(metadata["date_processed"], str): - metadata["date_processed"] = datetime.datetime.fromisoformat(metadata["date_processed"]) - + metadata["date_processed"] = datetime.datetime.fromisoformat( + metadata["date_processed"] + ) + repo = cls( url=data["url"], name=data["name"], @@ -303,48 +358,72 @@ def from_dict(cls, data: Dict) -> Repository: lean_dojo_version=data["lean_dojo_version"], metadata=metadata, files_traced=[], - pr_url=data.get("pr_url") + pr_url=data.get("pr_url"), ) - if all(key in data for key in ["theorems_folder", "premise_files_corpus", "files_traced"]): - if not all(os.path.exists(data[key]) for key in ["theorems_folder", "premise_files_corpus", "files_traced"]): - raise ValueError("Paths to data cannot be empty when creating repo from dataset") + if all( + key in data + for key in ["theorems_folder", "premise_files_corpus", "files_traced"] + ): + if not all( + os.path.exists(data[key]) + for key in ["theorems_folder", "premise_files_corpus", "files_traced"] + ): + raise ValueError( + "Paths to data cannot be empty when creating repo from dataset" + ) theorems_folder = Path(data["theorems_folder"]) for file in theorems_folder.glob("*.json"): - with open(file, 'r') as f: + with open(file, "r") as f: theorem_data = json.load(f) for t_data in tqdm(theorem_data): theorem = Theorem.from_dict(t_data, repo.url, repo.commit) - if any('sorry' in step.tactic for step in (theorem.traced_tactics or [])): + if any( + "sorry" in step.tactic + for step in (theorem.traced_tactics or []) + ): repo.sorry_theorems_unproved.append(theorem) else: repo.proven_theorems.append(theorem) - with open(data["premise_files_corpus"], 'r') as f: + with open(data["premise_files_corpus"], "r") as f: for line in f: premise_file_data = json.loads(line) premise_file = PremiseFile.from_dict(premise_file_data) 
repo.premise_files.append(premise_file) - with open(data["files_traced"], 'r') as f: + with open(data["files_traced"], "r") as f: for line in f: traced_file_data = json.loads(line) repo.files_traced.append(Path(traced_file_data["traced_file_path"])) else: # Process theorems and premises from the existing data structure - repo.proven_theorems = [Theorem.from_dict(t, repo.url, repo.commit) for t in data.get("proven_theorems", [])] - repo.sorry_theorems_proved = [Theorem.from_dict(t, repo.url, repo.commit) for t in data.get("sorry_theorems_proved", [])] - repo.sorry_theorems_unproved = [Theorem.from_dict(t, repo.url, repo.commit) for t in data.get("sorry_theorems_unproved", [])] - repo.premise_files = [PremiseFile.from_dict(pf) for pf in data.get("premise_files", [])] + repo.proven_theorems = [ + Theorem.from_dict(t, repo.url, repo.commit) + for t in data.get("proven_theorems", []) + ] + repo.sorry_theorems_proved = [ + Theorem.from_dict(t, repo.url, repo.commit) + for t in data.get("sorry_theorems_proved", []) + ] + repo.sorry_theorems_unproved = [ + Theorem.from_dict(t, repo.url, repo.commit) + for t in data.get("sorry_theorems_unproved", []) + ] + repo.premise_files = [ + PremiseFile.from_dict(pf) for pf in data.get("premise_files", []) + ] repo.files_traced = [Path(file) for file in data.get("files_traced", [])] return repo - + def to_dict(self) -> Dict: metadata_copy = self.metadata.copy() if isinstance(metadata_copy["date_processed"], datetime.datetime): - metadata_copy["date_processed"] = metadata_copy["date_processed"].isoformat() + metadata_copy["date_processed"] = metadata_copy[ + "date_processed" + ].isoformat() return { "url": self.url, "name": self.name, @@ -362,10 +441,12 @@ def to_dict(self) -> Dict: "num_files_traced": self.num_files_traced, "proven_theorems": [t.to_dict() for t in self.proven_theorems], "sorry_theorems_proved": [t.to_dict() for t in self.sorry_theorems_proved], - "sorry_theorems_unproved": [t.to_dict() for t in self.sorry_theorems_unproved], + "sorry_theorems_unproved": [ + t.to_dict() for t in self.sorry_theorems_unproved + ], "premise_files": [pf.to_dict() for pf in self.premise_files], "files_traced": [str(file) for file in self.files_traced], - "pr_url": self.pr_url + "pr_url": self.pr_url, } def change_sorry_to_proven(self, theorem: Theorem, log_file: str) -> None: @@ -374,28 +455,31 @@ def change_sorry_to_proven(self, theorem: Theorem, log_file: str) -> None: self.sorry_theorems_proved.append(theorem) message = f"Theorem proved: {theorem.full_name} in {theorem.file_path} for repo {self.name} (commit: {self.commit})" - timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") log_entry = f"{timestamp} - {message}\n" - + os.makedirs(os.path.dirname(log_file), exist_ok=True) - - with open(log_file, 'a') as f: + + with open(log_file, "a") as f: f.write(log_entry) else: - raise ValueError("The theorem is not in the list of unproved sorry theorems.") + raise ValueError( + "The theorem is not in the list of unproved sorry theorems." + ) + def safe_remove_dir_path(dir_path): """ Safely removes a directory if it exists. - + Attempts to remove the directory multiple times in case of permission errors. 
- + Args: dir_path (Path): Path object representing the directory to remove - + Raises: PermissionError: If the directory cannot be removed after multiple attempts - + Returns: None """ @@ -410,30 +494,43 @@ def safe_remove_dir_path(dir_path): if attempt < max_retries - 1: time.sleep(0.1) # Wait a bit before retrying else: - logger.error(f"Failed to remove {dir_path} after {max_retries} attempts: {e}") + logger.error( + f"Failed to remove {dir_path} after {max_retries} attempts: {e}" + ) raise + @dataclass class DynamicDatabase: repositories: List[Repository] = field(default_factory=list) SPLIT = Dict[str, List[Theorem]] - def generate_merged_dataset(self, output_path: Path, repos_to_include: Optional[List[Tuple[str, str]]] = None) -> None: + def generate_merged_dataset( + self, + output_path: Path, + repos_to_include: Optional[List[Tuple[str, str]]] = None, + ) -> None: """ Generate a merged dataset from multiple repositories in the database. - + :param output_path: Path where the merged dataset will be saved - :param repos_to_include: List of tuples (url, commit) of repositories to include in the dataset. + :param repos_to_include: List of tuples (url, commit) of repositories to include in the dataset. If None, all repos are included. """ random.seed(3407) - + output_path.mkdir(parents=True, exist_ok=True) - repos_to_process = self.repositories if repos_to_include is None else [ - repo for repo in self.repositories if (repo.url, repo.commit) in repos_to_include - ] + repos_to_process = ( + self.repositories + if repos_to_include is None + else [ + repo + for repo in self.repositories + if (repo.url, repo.commit) in repos_to_include + ] + ) if repos_to_include is None: logger.info("Merging all repositories in the database.") @@ -447,7 +544,14 @@ def generate_merged_dataset(self, output_path: Path, repos_to_include: Optional[ for repo in repos_to_process: for theorem in repo.get_all_theorems: - key = (theorem.file_path, theorem.full_name, list(theorem.start)[0], list(theorem.start)[1], list(theorem.end)[0], list(theorem.end)[1]) + key = ( + theorem.file_path, + theorem.full_name, + list(theorem.start)[0], + list(theorem.start)[1], + list(theorem.end)[0], + list(theorem.end)[1], + ) date_processed = repo.metadata["date_processed"] if isinstance(date_processed, str): date_processed = datetime.datetime.fromisoformat(date_processed) @@ -486,19 +590,25 @@ def _merge_corpus(self, repos: List[Repository], output_path: Path) -> None: "code": premise.code, "start": list(premise.start), "end": list(premise.end), - "kind": premise.kind - } for premise in premise_file.premises - ] + "kind": premise.kind, + } + for premise in premise_file.premises + ], } - path = file_data['path'] + path = file_data["path"] if path not in merged_corpus: merged_corpus[path] = json.dumps(file_data) - with open(output_path / "corpus.jsonl", 'w') as f: + with open(output_path / "corpus.jsonl", "w") as f: for line in merged_corpus.values(): f.write(line + "\n") - def _split_data(self, theorems: List[Theorem], num_val_pct: float = 0.02, num_test_pct: float = 0.02) -> Dict[str, SPLIT]: + def _split_data( + self, + theorems: List[Theorem], + num_val_pct: float = 0.02, + num_test_pct: float = 0.02, + ) -> Dict[str, SPLIT]: num_theorems = len(theorems) num_val = int(num_theorems * num_val_pct) num_test = int(num_theorems * num_test_pct) @@ -508,7 +618,9 @@ def _split_data(self, theorems: List[Theorem], num_val_pct: float = 0.02, num_te "novel_premises": self._split_by_premise(theorems, num_val, num_test), } - def 
_split_randomly(self, theorems: List[Theorem], num_val: int, num_test: int) -> SPLIT: + def _split_randomly( + self, theorems: List[Theorem], num_val: int, num_test: int + ) -> SPLIT: random.shuffle(theorems) num_train = len(theorems) - num_val - num_test return { @@ -517,7 +629,9 @@ def _split_randomly(self, theorems: List[Theorem], num_val: int, num_test: int) "test": theorems[num_train + num_val :], } - def _split_by_premise(self, theorems: List[Theorem], num_val: int, num_test: int) -> SPLIT: + def _split_by_premise( + self, theorems: List[Theorem], num_val: int, num_test: int + ) -> SPLIT: num_val_test = num_val + num_test theorems_val_test = [] @@ -528,11 +642,15 @@ def _split_by_premise(self, theorems: List[Theorem], num_val: int, num_test: int for annotation in tactic.annotated_tactic[1]: theorems_by_premises[annotation.full_name].append(t) - theorems_by_premises = sorted(theorems_by_premises.items(), key=lambda x: len(x[1])) + theorems_by_premises = sorted( + theorems_by_premises.items(), key=lambda x: len(x[1]) + ) for _, thms in theorems_by_premises: if len(theorems_val_test) < num_val_test: - theorems_val_test.extend([t for t in thms if t not in theorems_val_test]) + theorems_val_test.extend( + [t for t in thms if t not in theorems_val_test] + ) else: break @@ -563,9 +681,10 @@ def _export_proofs(self, splits: Dict[str, SPLIT], output_path: Path) -> None: "full_name": a.full_name, "def_path": str(a.def_path), "def_pos": list(a.def_pos), - "def_end_pos": list(a.def_end_pos) - } for a in t.annotated_tactic[1] - ] + "def_end_pos": list(a.def_end_pos), + } + for a in t.annotated_tactic[1] + ], ], "state_before": t.state_before, "state_after": t.state_after, @@ -573,27 +692,29 @@ def _export_proofs(self, splits: Dict[str, SPLIT], output_path: Path) -> None: for t in thm.traced_tactics if t.state_before != "no goals" and "·" not in t.tactic ] - data.append({ - "url": thm.url, - "commit": thm.commit, - "file_path": str(thm.file_path), - "full_name": thm.full_name, - "theorem_statement": thm.theorem_statement, - "start": list(thm.start), - "end": list(thm.end), - "traced_tactics": tactics, - }) + data.append( + { + "url": thm.url, + "commit": thm.commit, + "file_path": str(thm.file_path), + "full_name": thm.full_name, + "theorem_statement": thm.theorem_statement, + "start": list(thm.start), + "end": list(thm.end), + "traced_tactics": tactics, + } + ) output_file = strategy_dir / f"{name}.json" - with open(output_file, 'w') as f: + with open(output_file, "w") as f: json.dump(data, f, indent=2) - def _export_traced_files(self, all_traced_files: Set[Path], output_path: Path) -> None: - with open(output_path / "traced_files.jsonl", 'w') as f: + def _export_traced_files( + self, all_traced_files: Set[Path], output_path: Path + ) -> None: + with open(output_path / "traced_files.jsonl", "w") as f: for file in all_traced_files: - f.write(json.dumps({ - "traced_file_path": str(file) - }) + "\n") + f.write(json.dumps({"traced_file_path": str(file)}) + "\n") def _export_metadata(self, repos: List[Repository], output_path: Path) -> None: metadata = { @@ -605,7 +726,8 @@ def _export_metadata(self, repos: List[Repository], output_path: Path) -> None: "lean_version": repo.lean_version, "lean_dojo_version": repo.lean_dojo_version, "metadata": repo.metadata, - } for repo in repos + } + for repo in repos ], "total_theorems": sum(repo.total_theorems for repo in repos), "num_proven_theorems": sum(repo.num_proven_theorems for repo in repos), @@ -617,9 +739,11 @@ def _export_metadata(self, repos: 
List[Repository], output_path: Path) -> None: for repo_data in metadata["repositories"]: if isinstance(repo_data["metadata"]["date_processed"], datetime.datetime): - repo_data["metadata"]["date_processed"] = repo_data["metadata"]["date_processed"].isoformat() - - with open(output_path / "metadata.json", 'w') as f: + repo_data["metadata"]["date_processed"] = repo_data["metadata"][ + "date_processed" + ].isoformat() + + with open(output_path / "metadata.json", "w") as f: json.dump(metadata, f, indent=2) def add_repository(self, repo: Repository) -> None: @@ -628,7 +752,9 @@ def add_repository(self, repo: Repository) -> None: self.repositories.append(repo) logger.info(f"Added new repository: {repo.url} (commit: {repo.commit})") else: - logger.info(f"Repository '{repo.url}' with commit '{repo.commit}' already exists in the database.") + logger.info( + f"Repository '{repo.url}' with commit '{repo.commit}' already exists in the database." + ) def get_repository(self, url: str, commit: str) -> Optional[Repository]: for repo in self.repositories: @@ -637,14 +763,22 @@ def get_repository(self, url: str, commit: str) -> Optional[Repository]: return None def update_repository(self, updated_repo: Repository) -> None: - logger.info(f"Attempting to update repository: {updated_repo.url} (commit: {updated_repo.commit})") + logger.info( + f"Attempting to update repository: {updated_repo.url} (commit: {updated_repo.commit})" + ) for i, repo in enumerate(self.repositories): if repo == updated_repo: self.repositories[i] = updated_repo - logger.info(f"Updated repository: {updated_repo.url} (commit: {updated_repo.commit})") + logger.info( + f"Updated repository: {updated_repo.url} (commit: {updated_repo.commit})" + ) return - logger.error(f"Repository '{updated_repo.url}' with commit '{updated_repo.commit}' not found for update.") - raise ValueError(f"Repository '{updated_repo.url}' with commit '{updated_repo.commit}' not found.") + logger.error( + f"Repository '{updated_repo.url}' with commit '{updated_repo.commit}' not found for update." + ) + raise ValueError( + f"Repository '{updated_repo.url}' with commit '{updated_repo.commit}' not found." 
+        )
 
     def print_database_contents(self):
         logger.info("Current database contents:")
@@ -659,9 +793,7 @@ def delete_repository(self, url: str, commit: str) -> None:
         raise ValueError(f"Repository '{url}' with commit '{commit}' not found.")
 
     def to_dict(self) -> Dict:
-        return {
-            "repositories": [repo.to_dict() for repo in self.repositories]
-        }
+        return {"repositories": [repo.to_dict() for repo in self.repositories]}
 
     @classmethod
     def from_dict(cls, data: Dict) -> DynamicDatabase:
@@ -675,13 +807,13 @@ def from_dict(cls, data: Dict) -> DynamicDatabase:
 
     def to_json(self, file_path: str) -> None:
         """Serialize the database to a JSON file."""
-        with open(file_path, 'w', encoding='utf-8') as f:
+        with open(file_path, "w", encoding="utf-8") as f:
             json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
 
     @classmethod
     def from_json(cls, file_path: str) -> DynamicDatabase:
         """Deserialize the database from a JSON file."""
-        with open(file_path, 'r') as f:
+        with open(file_path, "r") as f:
             data = json.load(f)
         return cls.from_dict(data)
 
@@ -695,4 +827,4 @@ def update_json(self, file_path: str) -> None:
 
         for repo in self.repositories:
             existing_db.update_repository(repo)
-        existing_db.to_json(file_path)
\ No newline at end of file
+        existing_db.to_json(file_path)
diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py
index deefbc6..0c72377 100644
--- a/generate_benchmark_lean4.py
+++ b/generate_benchmark_lean4.py
@@ -18,18 +18,20 @@
 
 random.seed(3407)  # https://arxiv.org/abs/2109.08203
 
-RAID_DIR = os.environ.get('RAID_DIR')
+RAID_DIR = os.environ.get("RAID_DIR")
 SPLIT_NAME = str  # train/val/test
 SPLIT = Dict[SPLIT_NAME, List[TracedTheorem]]
 SPLIT_STRATEGY = str
 _LEAN4_VERSION_REGEX = re.compile(r"leanprover/lean4:(?P<version>.+?)")
 
+
 def get_lean4_version_from_config(toolchain: str) -> str:
     """Return the required Lean version given a ``lean-toolchain`` config."""
     m = _LEAN4_VERSION_REGEX.fullmatch(toolchain.strip())
     assert m is not None, "Invalid config."
     return m["version"]
 
+
 def is_supported_version(v) -> bool:
     """
     Check if ``v`` is at least `v4.3.0-rc2` and at most `v4.8.0-rc1`. 
@@ -42,7 +44,12 @@ def is_supported_version(v) -> bool: return False v = v[1:] major, minor, patch = [int(_) for _ in v.split("-")[0].split(".")] - if major < 4 or (major == 4 and minor < 3) or (major == 4 and minor > 8) or (major == 4 and minor == 8 and patch > 1): + if ( + major < 4 + or (major == 4 and minor < 3) + or (major == 4 and minor > 8) + or (major == 4 and minor == 8 and patch > 1) + ): return False if ( major > 4 @@ -57,10 +64,9 @@ def is_supported_version(v) -> bool: else: return True + def _split_sequentially( - traced_theorems: List[TracedTheorem], - num_val: int, - num_test: int + traced_theorems: List[TracedTheorem], num_val: int, num_test: int ) -> SPLIT: """Split ``traced_theorems`` sequentially into train/val/test.""" num_theorems = len(traced_theorems) @@ -73,9 +79,7 @@ def _split_sequentially( def split_randomly( - traced_theorems: List[TracedTheorem], - num_val: int, - num_test: int + traced_theorems: List[TracedTheorem], num_val: int, num_test: int ) -> SPLIT: """Split ``traced_theorems`` randomly into train/val/test.""" logger.info("Splitting the theorems randomly") @@ -83,10 +87,9 @@ def split_randomly( random.shuffle(traced_theorems) return _split_sequentially(traced_theorems, num_val, num_test) + def split_by_premise( - traced_theorems: List[TracedTheorem], - num_val: int, - num_test: int + traced_theorems: List[TracedTheorem], num_val: int, num_test: int ) -> SPLIT: """ Split theorems into train/val/test so that proofs in val/test rely on at @@ -125,12 +128,15 @@ def split_by_premise( "test": theorems_val_test[num_val:], } -def split_data(traced_repo: TracedRepo, num_val_pct: float = 0.02, num_test_pct: float = 0.02) -> Dict[SPLIT_STRATEGY, SPLIT]: + +def split_data( + traced_repo: TracedRepo, num_val_pct: float = 0.02, num_test_pct: float = 0.02 +) -> Dict[SPLIT_STRATEGY, SPLIT]: """ Split the traced theorems into training, validation, and test sets. This function extracts theorems from the provided TracedRepo object, excluding - theorems from the Lean 4 repository itself. The theorems are then split using + theorems from the Lean 4 repository itself. The theorems are then split using multiple strategies, including random splitting and splitting by novel premises. Args: @@ -159,13 +165,16 @@ def split_data(traced_repo: TracedRepo, num_val_pct: float = 0.02, num_test_pct: num_val = int(num_theorems * num_val_pct) num_test = int(num_theorems * num_test_pct) - logger.info(f"{num_theorems} theorems in total, with {num_val} for validation and {num_test} for testing") + logger.info( + f"{num_theorems} theorems in total, with {num_val} for validation and {num_test} for testing" + ) return { "random": split_randomly(traced_theorems, num_val, num_test), "novel_premises": split_by_premise(traced_theorems, num_val, num_test), } + def _get_file_path(traced_repo: TracedRepo, thm: TracedTheorem) -> str: """ Get the file path for a given theorem in a traced repository. @@ -201,8 +210,8 @@ def export_proofs( ) -> None: """ Export proofs from a traced repository to the specified destination path. - This function processes the given splits (organized by strategy) and writes the theorem proofs - to JSON files in the destination directory. Each theorem is exported with its metadata, + This function processes the given splits (organized by strategy) and writes the theorem proofs + to JSON files in the destination directory. Each theorem is exported with its metadata, including URL, commit, file path, theorem statement, and traced tactics. 
Args: splits: Dictionary mapping split strategies to actual splits. Each split maps dataset @@ -246,7 +255,7 @@ def export_proofs( theorem_statement = None if thm.has_tactic_proof() and thm.get_tactic_proof() is not None: theorem_statement = thm.get_theorem_statement() - + data.append( { "url": traced_repo.repo.url, @@ -303,18 +312,13 @@ def export_premises(traced_repo: TracedRepo, dst_path: Path) -> None: logger.info( f"{num_premises} theorems/definitions from {len(traced_repo.traced_files)} files saved to {oup_path}" ) - + oup_path = dst_path / "traced_files.jsonl" with oup_path.open("wt") as oup: for traced_file in traced_repo.traced_files: source_file = traced_file.lean_file source_file_path = source_file.path - oup.write( - json.dumps( - {"traced_file_path": str(source_file_path)} - ) - + "\n" - ) + oup.write(json.dumps({"traced_file_path": str(source_file_path)}) + "\n") return num_premises, len(traced_repo.traced_files) @@ -358,17 +362,17 @@ def export_metadata(traced_repo: TracedRepo, dst_path: Path, **kwargs) -> None: def safe_remove_dir(dir_path): """ Safely removes a directory if it exists. - + This function attempts to remove the specified directory, with multiple retries in case of permission errors. A warning is logged if the directory already exists. - + Args: dir_path (str): Path to the directory to be removed. - + Raises: PermissionError: If the directory cannot be removed after multiple attempts due to permission issues. - + Note: The function will retry up to 5 times with a 0.1 second delay between attempts if a PermissionError occurs. @@ -384,23 +388,25 @@ def safe_remove_dir(dir_path): if attempt < max_retries - 1: time.sleep(0.1) # Wait a bit before retrying else: - logger.error(f"Failed to remove {dir_path} after {max_retries} attempts: {e}") + logger.error( + f"Failed to remove {dir_path} after {max_retries} attempts: {e}" + ) raise def safe_remove_dir_path(dir_path): """ Safely removes a directory and all its contents if it exists. - + Uses multiple attempts with a small delay between them to handle potential permission errors that might occur on some systems when removing directories. - + Args: dir_path (Path): Path object representing the directory to remove - + Raises: PermissionError: If the directory cannot be removed after multiple attempts - + Returns: None """ @@ -415,9 +421,12 @@ def safe_remove_dir_path(dir_path): if attempt < max_retries - 1: time.sleep(0.1) # Wait a bit before retrying else: - logger.error(f"Failed to remove {dir_path} after {max_retries} attempts: {e}") + logger.error( + f"Failed to remove {dir_path} after {max_retries} attempts: {e}" + ) raise + def export_data( traced_repo: TracedRepo, splits: Dict[SPLIT_STRATEGY, SPLIT], @@ -425,20 +434,20 @@ def export_data( **kwargs, ) -> None: """Export a traced repository's content to a specified destination path. - - This function exports proofs, premises, licenses, and metadata from a traced - repository to a specified destination path. The repository's theorems should have + + This function exports proofs, premises, licenses, and metadata from a traced + repository to a specified destination path. The repository's theorems should have been split using a strategy defined in `splits`. - + Args: traced_repo: The traced repository containing the data to export. splits: Dictionary mapping split strategies to their corresponding splits. dst_path: Destination path where the data will be exported. Can be a string or Path object. **kwargs: Additional keyword arguments to pass to export_metadata. 
- + Returns: tuple: A tuple containing (number of premises, number of files traced, total theorems exported). - + Note: Any existing content at the destination path will be removed. """ @@ -460,17 +469,18 @@ def export_data( return num_premises, num_files_traced, total_theorems + def configure_leandojo(): """ Configure the LeanDojo environment for benchmarking. - + This function sets up the logger configuration for LeanDojo and displays important environment variables including the current working directory and various constants related to process management. - + It removes any existing logger handlers and adds a new handler for stderr with DEBUG level logging. - + No parameters are required, and the function does not return any values. """ constants.logger.remove() @@ -482,11 +492,12 @@ def configure_leandojo(): logger.info(f"Current working directory: {os.getcwd()}") + def main(url, commit, dst_dir): """ Generates a benchmark dataset for Lean 4 proofs from a specified repository. This function clones a Lean 4 repository, configures the appropriate Lean toolchain - version, traces the repository using LeanDojo, and exports the trace data to a + version, traces the repository using LeanDojo, and exports the trace data to a designated directory. Args: url (str): The URL of the Lean 4 Git repository to clone @@ -511,23 +522,27 @@ def main(url, commit, dst_dir): v = get_lean4_version_from_config(config["content"]) logger.info(f"lean version v: {v}") logger.info(f"is supported: {is_supported_version(v)}") - if not is_supported_version(v): # Won't get here since we checked for a compatible commit, but sanity check in case + if not is_supported_version( + v + ): # Won't get here since we checked for a compatible commit, but sanity check in case logger.info("Unsupported version") - v = v[1:] # ignore "v" at beginning - - lean_dir2 = f"/.elan/toolchains/leanprover--lean4---{v}" - lean_dir3 = f"~/.elan/toolchains/leanprover--lean4---{v}" + v = v[1:] # ignore "v" at beginning + + lean_dir2 = f"/Users/motiwari/.elan/toolchains/leanprover--lean4---{v}" + lean_dir3 = f"/Users/motiwari/.elan/toolchains/leanprover--lean4---{v}" logger.info(f"lean path2 {lean_dir2}") logger.info(f"lean path3 {lean_dir3}") if not os.path.exists(lean_dir2): logger.info(f"Lean toolchain path 2 does not exist: {lean_dir2}") if not os.path.exists(lean_dir3): logger.info(f"Lean toolchain path 3 does not exist: {lean_dir3}") - os.environ['LEAN4_PATH'] = lean_dir2 - os.environ['PATH'] = f"{lean_dir2}/bin:{os.environ.get('PATH', '')}" + os.environ["LEAN4_PATH"] = lean_dir2 + os.environ["PATH"] = f"{lean_dir2}/bin:{os.environ.get('PATH', '')}" logger.info(f"Switched to Lean toolchain at: {lean_dir2}") - logger.info(f"lean --version: {subprocess.run(['lean', '--version'], capture_output=True).stdout.decode('utf-8')}") + logger.info( + f"lean --version: {subprocess.run(['lean', '--version'], capture_output=True).stdout.decode('utf-8')}" + ) logger.info(f"repo: {repo}") logger.info("Configuring LeanDojo again...") @@ -544,6 +559,8 @@ def main(url, commit, dst_dir): safe_remove_dir(dst_dir) splits = split_data(traced_repo) logger.info("Successfully split the data") - num_premises, num_files_traced, total_theorems = export_data(traced_repo, splits, dst_dir) + num_premises, num_files_traced, total_theorems = export_data( + traced_repo, splits, dst_dir + ) logger.info("Successfully exported the data") return traced_repo, num_premises, num_files_traced, total_theorems diff --git a/generator/datamodule.py b/generator/datamodule.py index 
95a4af9..882e64a 100644 --- a/generator/datamodule.py +++ b/generator/datamodule.py @@ -25,7 +25,7 @@ class GeneratorDataset(Dataset): """ A PyTorch Dataset for loading and processing data for a generator model that produces tactics given proof states. - This dataset handles loading examples from a JSON file, formatting states and tactics, + This dataset handles loading examples from a JSON file, formatting states and tactics, and optionally augmenting states with retrieved premises. Attributes: @@ -39,6 +39,7 @@ class GeneratorDataset(Dataset): is_train (bool): Whether this dataset is used for training. data (List[Example]): The loaded and processed examples. """ + def __init__( self, data_path: str, diff --git a/generator/model.py b/generator/model.py index d5f5c83..f07e95c 100644 --- a/generator/model.py +++ b/generator/model.py @@ -26,6 +26,7 @@ torch.set_float32_matmul_precision("medium") + def safe_remove_dir(dir_path): """ Safely removes a directory path if it exists, with retries. @@ -57,7 +58,9 @@ def safe_remove_dir(dir_path): if attempt < max_retries - 1: time.sleep(0.1) # Wait a bit before retrying else: - logger.error(f"Failed to remove {dir_path} after {max_retries} attempts: {e}") + logger.error( + f"Failed to remove {dir_path} after {max_retries} attempts: {e}" + ) raise @@ -80,6 +83,7 @@ class TopkAccuracy(Metric): update(batch_preds, batch_gt): Updates the state with batch statistics. compute(): Computes the accuracy based on collected state. """ + is_differentiable: Optional[bool] = False higher_is_better: Optional[bool] = True full_state_update: bool = True @@ -295,11 +299,11 @@ def on_fit_start(self) -> None: def validation_step(self, batch: Dict[str, Any], _) -> None: """ Performs a validation step on a batch of data. - - The method computes the loss on the validation data, logs the loss, and generates - tactic candidates using Beam Search. It also logs example inputs/outputs and + + The method computes the loss on the validation data, logs the loss, and generates + tactic candidates using Beam Search. It also logs example inputs/outputs and calculates top-k accuracy metrics for the generated tactics. - + Args: batch: A dictionary containing batch data with the following keys: - state_ids: Tensor of input state token IDs @@ -307,10 +311,10 @@ def validation_step(self, batch: Dict[str, Any], _) -> None: - tactic_ids: Tensor of target tactic token IDs - tactic: List of reference tactic strings _: Batch index (unused) - + Returns: None - + Side effects: - Logs validation loss - Logs example inputs/outputs as text @@ -442,7 +446,7 @@ def batch_generate( Returns: List[List[Tuple[str, float]]]: A list of lists where each inner list contains tuples of (tactic_text, score) for each state. Duplicate tactics are removed. - + Note: If a retriever is configured, it will be used to augment states with relevant premises before generation. 
diff --git a/leanagent.py b/leanagent.py index 969203b..ea7d062 100644 --- a/leanagent.py +++ b/leanagent.py @@ -42,23 +42,28 @@ from retrieval.datamodule import RetrievalDataModule from retrieval.main import run_cli import torch -from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor, Callback +from pytorch_lightning.callbacks import ( + ModelCheckpoint, + EarlyStopping, + LearningRateMonitor, + Callback, +) from pytorch_lightning import seed_everything # Set the seed for reproducibility random.seed(3407) # https://arxiv.org/abs/2109.08203 BATCH_SIZE = 4 -RAID_DIR = os.environ.get('RAID_DIR') -os.environ['RAY_TMPDIR'] = f"{RAID_DIR}/tmp" +RAID_DIR = os.environ.get("RAID_DIR") +os.environ["RAY_TMPDIR"] = f"{RAID_DIR}/tmp" repo_dir = f"{RAID_DIR}/repos_new" -DATA_DIR = "" -CHECKPOINT_DIR = "" -EVAL_RESULTS_FILE_PATH = f"{RAID_DIR}/LeanAgent/" -DB_FILE_NAME = "" -PROOF_LOG_FILE_NAME = "proof_logs/" -ENCOUNTERED_THEOREMS_FILE = "" -FISHER_DIR = "" # Optional +DATA_DIR = f"{RAID_DIR}/data" +CHECKPOINT_DIR = f"{RAID_DIR}/checkpoints" +EVAL_RESULTS_FILE_PATH = f"{RAID_DIR}/eval_results.txt" +DB_FILE_NAME = "db_file.txt" +PROOF_LOG_FILE_NAME = f"{RAID_DIR}/proof_log.txt" +ENCOUNTERED_THEOREMS_FILE = f"{RAID_DIR}/encountered_theorems.pkl" +FISHER_DIR = f"{RAID_DIR}/fisher" # Optional repos_for_merged_dataset = [] repos_for_proving = [] @@ -67,10 +72,10 @@ # Feel free to remove any repos from this list if you would like to test on them known_repositories = [ "leanprover-community/mathlib4", # ReProver is trained on this - "leanprover-community/batteries", # functional programming instead of math + "leanprover-community/batteries", # functional programming instead of math "leanprover-community/aesop", "leanprover/lean4", - "leanprover-community/mathlib", # Mathlib3 version + "leanprover-community/mathlib", # Mathlib3 version "leanprover-community/mathlib3", "leanprover/std4", # moved to batteries "leanprover-community/duper", # functional programming instead of math @@ -86,7 +91,7 @@ "ufmg-smite/lean-smt", # fails to trace due to windows-style line endings "teorth/symmetric_project", # no compatible commit "cmu-l3/llmlean", # irrelevant + only 4 theorems - "PatrickMassot/GlimpseOfLean", # strange trace problems with _parse_deps + "PatrickMassot/GlimpseOfLean", # strange trace problems with _parse_deps "avigad/lamr", # trace problems "leanprover-community/quote4", # no theorems "leanprover-community/iris-lean", # trace problems @@ -101,25 +106,25 @@ "risc0/risc0-lean4", "PatrickMassot/verbose-lean4", # no theorems "tydeu/lean4-alloy", # no theorems - "leanprover/leansat", # deprecated - "BoltonBailey/formal-snarks-project", # two theorems - "dwrensha/lean4-maze", # two theorems - "leanprover-community/mathport", # irrelevant + "leanprover/leansat", # deprecated + "BoltonBailey/formal-snarks-project", # two theorems + "dwrensha/lean4-maze", # two theorems + "leanprover-community/mathport", # irrelevant "argumentcomputer/LSpec", # one theorem - "reaslab/jixia", # no theorems - "riccardobrasca/flt3", # no theorems - "dwrensha/animate-lean-proofs", # irrelevant - "lean-ja/lean-by-example", # irrelevant - "NethermindEth/Clear", # no theorems - "fgdorais/lean4-parser", # irrelevant - "semorrison/lean-training-data", # irrelevant - "verse-lab/lean-ssr", # irrelevant - "GaloisInc/lean-llvm", # irrelevant - "argumentcomputer/Wasm.lean", # irrelevant - "NethermindEth/EVMYulLean", # irrelevant - "rwbarton/advent-of-lean-4", # irrelevant - "leanprover-community/tutorials4", # 
irrelevant - "haruhisa-enomoto/mathlib4-all-tactics", # irrelevant + "reaslab/jixia", # no theorems + "riccardobrasca/flt3", # no theorems + "dwrensha/animate-lean-proofs", # irrelevant + "lean-ja/lean-by-example", # irrelevant + "NethermindEth/Clear", # no theorems + "fgdorais/lean4-parser", # irrelevant + "semorrison/lean-training-data", # irrelevant + "verse-lab/lean-ssr", # irrelevant + "GaloisInc/lean-llvm", # irrelevant + "argumentcomputer/Wasm.lean", # irrelevant + "NethermindEth/EVMYulLean", # irrelevant + "rwbarton/advent-of-lean-4", # irrelevant + "leanprover-community/tutorials4", # irrelevant + "haruhisa-enomoto/mathlib4-all-tactics", # irrelevant "leanprover/LNSym", "leanprover-community/flt-regular", "opencompl/lean-mlir-old", @@ -160,7 +165,7 @@ "digama0/mm-lean4", "KislyjKisel/Raylib.lean", "algebraic-dev/melp", - "hhu-adam/Robo", # same as other tutorials but has lots of sorries + "hhu-adam/Robo", # same as other tutorials but has lots of sorries "hargoniX/socket.lean", "kovach/etch", "damek/gd-lean", @@ -169,7 +174,7 @@ "katydid/proofs", "alexjbest/leaff", "sinhp/Poly", - "lftcm2023/lftcm2023", # same as other tutorials but has lots of sorries + "lftcm2023/lftcm2023", # same as other tutorials but has lots of sorries "lean-ja/lean99", "leanprover/SHerLOC", "Seasawher/mdgen", @@ -183,7 +188,7 @@ "madvorak/fecssk", "david-christiansen/bob24", "awodey/joyal", - "BrownCS1951x/fpv2023", # same as other tutorials but has lots of sorries + "BrownCS1951x/fpv2023", # same as other tutorials but has lots of sorries "paulch42/lean-spec", "siddhartha-gadgil/MetaExamples", "dannypsnl/violet", @@ -194,7 +199,7 @@ "kmill/LeanTeX", "leanprover/lean4export", "leanprover-community/mathlib3port", - "brown-cs22/CS22-Lean-2024", # same as other tutorials but has lots of sorries + "brown-cs22/CS22-Lean-2024", # same as other tutorials but has lots of sorries "T-Brick/lean-wasm", "crabbo-rave/Soup", "argumentcomputer/RustFFI.lean", @@ -237,14 +242,14 @@ "arthurpaulino/LeanMusic", "argumentcomputer/Ipld.lean", "Odomontois/advent2022-lean", - "kbuzzard/IISc-experiments", # same as other tutorials but has lots of sorries + "kbuzzard/IISc-experiments", # same as other tutorials but has lots of sorries "ykonstant1/InfinitePrimes", "alexkassil/natural_number_game_lean4", "seewoo5/lean-poly-abc", "rah4927/lean-dojo-mew", "siddhartha-gadgil/proofs-and-programs-2023", "PatrickMassot/lean4-game-server", - "knowsys/Formale-Systeme-in-LEAN", # same as other tutorials but has lots of sorries + "knowsys/Formale-Systeme-in-LEAN", # same as other tutorials but has lots of sorries "katydid/symbolic-automatic-derivatives", "girving/interval", "ImperialCollegeLondon/group-theory-experiments", @@ -253,14 +258,14 @@ "vasnesterov/HadwigerNelson", "FWuermse/lean-postgres", "leanprover-community/import-graph", - "Human-Oriented-ATP/lean-tactics", # more about tactics than premises + "Human-Oriented-ATP/lean-tactics", # more about tactics than premises "paulcadman/lean4-leetcode", "argumentcomputer/Lurk.lean", "AlexDuchnowski/rubiks-cube", "SchrodingerZhu/lean-gccjit", "JamesGallicchio/http", "jtristan/UnicodeSkipListTableExample", - "adomani/MA4N1_2023", # same as other tutorials but has lots of sorries + "adomani/MA4N1_2023", # same as other tutorials but has lots of sorries "remimimimimi/leansec", "hhu-adam/lean-i18n", "RemyDegenne/testing-lower-bounds", @@ -298,9 +303,10 @@ COMMIT_MESSAGE = "[LeanAgent] Proofs" + def clone_repo(repo_url): """Clone a git repository and return the path to the repository and its 
sha.""" - repo_name = "/".join(repo_url.split('/')[-2:]).replace('.git', '') + repo_name = "/".join(repo_url.split("/")[-2:]).replace(".git", "") logger.info(f"Cloning {repo_url}") logger.info(f"Repo name: {repo_name}") repo_name = repo_dir + "/" + repo_name @@ -310,28 +316,53 @@ def clone_repo(repo_url): subprocess.run(["git", "clone", repo_url, repo_name]) process = subprocess.Popen(["git", "ls-remote", repo_url], stdout=subprocess.PIPE) stdout, stderr = process.communicate() - sha = re.split(r'\t+', stdout.decode('utf-8'))[0] + sha = re.split(r"\t+", stdout.decode("utf-8"))[0] return repo_name, sha + def branch_exists(repo_name, branch_name): """Check if a branch exists in a git repository.""" - proc = subprocess.run(["git", "-C", repo_name, "branch", "-a"], capture_output=True, text=True) - branches = proc.stdout.split('\n') + proc = subprocess.run( + ["git", "-C", repo_name, "branch", "-a"], capture_output=True, text=True + ) + branches = proc.stdout.split("\n") local_branch = branch_name - remote_branch = f'remote/{branch_name}' - return any(branch.strip().endswith(local_branch) or branch.strip().endswith(remote_branch) for branch in branches) + remote_branch = f"remote/{branch_name}" + return any( + branch.strip().endswith(local_branch) or branch.strip().endswith(remote_branch) + for branch in branches + ) + def create_or_switch_branch(repo_name, branch_name, base_branch): """Create a branch in a git repository if it doesn't exist, or switch to it if it does.""" if not branch_exists(repo_name, branch_name): - subprocess.run(["git", "-C", repo_name, "checkout", "-b", branch_name], check=True) + subprocess.run( + ["git", "-C", repo_name, "checkout", "-b", branch_name], check=True + ) else: subprocess.run(["git", "-C", repo_name, "checkout", branch_name], check=True) - subprocess.run(["git", "-C", repo_name, "merge", base_branch, "-m", f"Merging {branch_name} into {base_branch}"], check=True) + subprocess.run( + [ + "git", + "-C", + repo_name, + "merge", + base_branch, + "-m", + f"Merging {branch_name} into {base_branch}", + ], + check=True, + ) + def commit_changes(repo_name, commit_message): """Commit changes to a git repository.""" - status = subprocess.run(["git", "-C", repo_name, "status", "--porcelain"], capture_output=True, text=True).stdout.strip() + status = subprocess.run( + ["git", "-C", repo_name, "status", "--porcelain"], + capture_output=True, + text=True, + ).stdout.strip() if status == "": print("No changes to commit.") return False @@ -339,55 +370,56 @@ def commit_changes(repo_name, commit_message): subprocess.run(["git", "-C", repo_name, "commit", "-m", commit_message], check=True) return True + def push_changes(repo_name, branch_name): """Push changes to a git repository.""" - subprocess.run(["git", "-C", repo_name, "push", "-u", "origin", branch_name], check=True) + subprocess.run( + ["git", "-C", repo_name, "push", "-u", "origin", branch_name], check=True + ) + def get_default_branch(repo_full_name): """Get the default branch of a repository (default `main`).""" url = f"https://api.github.com/repos/{repo_full_name}" headers = { "Authorization": f"token {personal_access_token}", - "Accept": "application/vnd.github.v3+json" + "Accept": "application/vnd.github.v3+json", } response = requests.get(url, headers=headers) if response.status_code == 200: - return response.json()['default_branch'] + return response.json()["default_branch"] else: logger.info(f"Failed to get default branch for {repo_full_name}") return "main" + def create_pull_request(repo_full_name, title, 
body, head_branch): """Create a pull request in a repository.""" base_branch = get_default_branch(repo_full_name) url = f"https://api.github.com/repos/{repo_full_name}/pulls" headers = { "Authorization": f"token {personal_access_token}", - "Accept": "application/vnd.github.v3+json" - } - data = { - "title": title, - "body": body, - "head": head_branch, - "base": base_branch + "Accept": "application/vnd.github.v3+json", } + data = {"title": title, "body": body, "head": head_branch, "base": base_branch} response = requests.post(url, headers=headers, json=data) if response.status_code == 201: - print("Pull request created successfully: " + response.json()['html_url']) - return response.json()['html_url'] + print("Pull request created successfully: " + response.json()["html_url"]) + return response.json()["html_url"] else: print("Failed to create pull request", response.text) return "" + def get_compatible_commit(url): """Find the most recent commit with a Lean version that LeanAgent supports.""" try: process = subprocess.Popen(["git", "ls-remote", url], stdout=subprocess.PIPE) stdout, stderr = process.communicate() - latest_commit = re.split(r'\t+', stdout.decode('utf-8'))[0] + latest_commit = re.split(r"\t+", stdout.decode("utf-8"))[0] logger.info(f"Latest commit: {latest_commit}") - new_url = url.replace('.git', '') + new_url = url.replace(".git", "") logger.info(f"Creating LeanGitRepo for {new_url}") repo = LeanGitRepo(new_url, latest_commit) logger.info(f"Getting config for {url}") @@ -399,19 +431,21 @@ def get_compatible_commit(url): logger.info(f"Searching for compatible commit for {url}") try: - subprocess.run(["git", "rev-parse", "--is-inside-work-tree"], - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL) + subprocess.run( + ["git", "rev-parse", "--is-inside-work-tree"], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) logger.info("Already in a Git repository") except subprocess.CalledProcessError: logger.info("Not in a Git repository. 
Initializing one.") subprocess.run(["git", "init"], check=True) - + process = subprocess.Popen( ["git", "fetch", "--depth=1000000", url], # Fetch commits stdout=subprocess.PIPE, - stderr=subprocess.PIPE + stderr=subprocess.PIPE, ) logger.info(f"Fetching commits for {url}") _, stderr = process.communicate() @@ -421,19 +455,21 @@ def get_compatible_commit(url): process = subprocess.Popen( ["git", "log", "--format=%H", "FETCH_HEAD"], # Get list of commits stdout=subprocess.PIPE, - stderr=subprocess.PIPE + stderr=subprocess.PIPE, ) logger.info(f"Getting list of commits for {url}") stdout, stderr = process.communicate() if process.returncode != 0: raise Exception(f"Git log command failed: {stderr.decode('utf-8')}") - commits = stdout.decode('utf-8').strip().split('\n') + commits = stdout.decode("utf-8").strip().split("\n") logger.info(f"Found {len(commits)} commits for {url}") for commit in commits: - new_url = url.replace('.git', '') + new_url = url.replace(".git", "") repo = LeanGitRepo(new_url, commit) config = repo.get_config("lean-toolchain") - v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) + v = generate_benchmark_lean4.get_lean4_version_from_config( + config["content"] + ) if generate_benchmark_lean4.is_supported_version(v): logger.info(f"Found compatible commit {commit} for {url}") return commit, v @@ -444,14 +480,15 @@ def get_compatible_commit(url): logger.info(f"Error in get_compatible_commit: {str(e)}") return None, None + def find_and_save_compatible_commits(repo_info_file, lean_git_repos): """Finds compatible commits for various repositories""" updated_repos = [] for repo in lean_git_repos: url = repo.url - if not url.endswith('.git'): - url = url + '.git' - + if not url.endswith(".git"): + url = url + ".git" + sha = None v = None if "mathlib4" in url: @@ -468,45 +505,52 @@ def find_and_save_compatible_commits(repo_info_file, lean_git_repos): if not sha: logger.info(f"Failed to find a compatible commit for {url}") continue - - updated_repos.append({"url": url.replace('.git', ''), "commit": sha, "version": v}) - - with open(repo_info_file, 'w') as f: + + updated_repos.append( + {"url": url.replace(".git", ""), "commit": sha, "version": v} + ) + + with open(repo_info_file, "w") as f: json.dump(updated_repos, f) - + return updated_repos + def search_github_repositories(language="Lean", num_repos=10): """Search for the given number of repositories on GitHub that have the given language.""" - headers = {'Authorization': personal_access_token} + headers = {"Authorization": personal_access_token} query_params = { - 'q': f'language:{language}', - 'sort': 'stars', - 'order': 'desc', - 'per_page': 100, + "q": f"language:{language}", + "sort": "stars", + "order": "desc", + "per_page": 100, } - + cloned_count = 0 page = 1 while cloned_count < num_repos: - query_params['page'] = page - response = requests.get('https://api.github.com/search/repositories', headers=headers, params=query_params) - + query_params["page"] = page + response = requests.get( + "https://api.github.com/search/repositories", + headers=headers, + params=query_params, + ) + if response.status_code == 200: - repositories = response.json()['items'] + repositories = response.json()["items"] for repo in repositories: if cloned_count >= num_repos: break - repo_full_name = repo['full_name'] + repo_full_name = repo["full_name"] logger.info(f"Processing {repo_full_name}") if repo_full_name not in known_repositories: name = None try: - clone_url = repo['clone_url'] + clone_url = repo["clone_url"] 
repo_name, sha = clone_repo(clone_url) name = repo_name - url = clone_url.replace('.git', '') + url = clone_url.replace(".git", "") lean_git_repo = LeanGitRepo(url, sha) lean_git_repos.append(lean_git_repo) repos.append(repo_full_name) @@ -516,7 +560,9 @@ def search_github_repositories(language="Lean", num_repos=10): shutil.rmtree(name) logger.info(f"Failed to clone {repo_full_name} because of {e}") else: - logger.info(f"Skipping {repo_full_name} since it is a known repository") + logger.info( + f"Skipping {repo_full_name} since it is a known repository" + ) page += 1 else: logger.info("Failed to search GitHub", response.status_code) @@ -525,7 +571,7 @@ def search_github_repositories(language="Lean", num_repos=10): # Check if we've reached the end of the search results if len(repositories) < 100: break - + logger.info(f"Total repositories processed: {cloned_count}") @@ -569,7 +615,7 @@ def _eval(data, preds_map) -> Tuple[float, float, float]: def load_fisher_information(file_path): """Loads the Fisher Information Matrix.""" try: - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: fisher_info = pickle.load(f) logger.info("Fisher Information successfully loaded.") return fisher_info @@ -577,38 +623,61 @@ def load_fisher_information(file_path): logger.error(f"No Fisher Information file found at {file_path}.") return None + def find_latest_checkpoint(): """Finds the most recent checkpoint.""" checkpoint_dir = RAID_DIR + "/" + CHECKPOINT_DIR - all_checkpoints = [os.path.join(checkpoint_dir, f) for f in os.listdir(checkpoint_dir) if f.endswith(".ckpt")] + all_checkpoints = [ + os.path.join(checkpoint_dir, f) + for f in os.listdir(checkpoint_dir) + if f.endswith(".ckpt") + ] if not all_checkpoints: raise FileNotFoundError("No checkpoints found.") latest_checkpoint = max(all_checkpoints, key=os.path.getmtime) logger.info(f"Using the latest checkpoint: {latest_checkpoint}") return latest_checkpoint + def find_latest_fisher(): """Finds the most recent Fisher Information Matrix.""" fisher_dir = RAID_DIR + "/" + FISHER_DIR - all_fisher = [os.path.join(fisher_dir, f) for f in os.listdir(fisher_dir) if f.endswith(".pkl")] + all_fisher = [ + os.path.join(fisher_dir, f) + for f in os.listdir(fisher_dir) + if f.endswith(".pkl") + ] if not all_fisher: raise FileNotFoundError("No Fisher Information Matrices found.") latest_fisher = max(all_fisher, key=os.path.getmtime) logger.info(f"Using the latest Fisher Information Matrix: {latest_fisher}") return latest_fisher -def theorem_identifier(theorem: Theorem) -> Tuple[str, str, Tuple[int, int], Tuple[int, int]]: + +def theorem_identifier( + theorem: Theorem, +) -> Tuple[str, str, Tuple[int, int], Tuple[int, int]]: """Returns a unique identifier for a theorem.""" - return (theorem.full_name, str(theorem.file_path), tuple(theorem.start), tuple(theorem.end)) + return ( + theorem.full_name, + str(theorem.file_path), + tuple(theorem.start), + tuple(theorem.end), + ) + -def process_theorem_batch(theorem_batch, positions_batch, repo, db, prover, dynamic_database_json_path): +def process_theorem_batch( + theorem_batch, positions_batch, repo, db, prover, dynamic_database_json_path +): """Processes a batch of theorems.""" lean_dojo_theorems = [t[1] for t in theorem_batch] - results = prover.search_unordered(LeanGitRepo(repo.url, repo.commit), lean_dojo_theorems, positions_batch) - + results = prover.search_unordered( + LeanGitRepo(repo.url, repo.commit), lean_dojo_theorems, positions_batch + ) + # Create a mapping from LeanDojoTheorem to our Theorem 
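        # The prover's results are keyed by LeanDojo Theorem objects, so this dict lets each
        # SearchResult be mapped back to the database Theorem whose traced tactics are filled in below.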
theorem_map = {ldj_thm: thm for thm, ldj_thm in theorem_batch} - + for result in results: if isinstance(result, SearchResult): if result.theorem in theorem_map: @@ -620,8 +689,9 @@ def process_theorem_batch(theorem_batch, positions_batch, repo, db, prover, dyna tactic=tactic, annotated_tactic=(tactic, []), state_before="", - state_after="" - ) for tactic in result.proof + state_after="", + ) + for tactic in result.proof ] theorem.traced_tactics = traced_tactics repo.change_sorry_to_proven(theorem, PROOF_LOG_FILE_NAME) @@ -633,47 +703,70 @@ def process_theorem_batch(theorem_batch, positions_batch, repo, db, prover, dyna logger.warning(f"Theorem not found in theorem_map: {result.theorem}") else: logger.warning(f"Unexpected result type") - + db.to_json(dynamic_database_json_path) + def save_progress(all_encountered_theorems): """Saves the set of encountered theorems.""" logger.info("Saving encountered theorems...") - with open(ENCOUNTERED_THEOREMS_FILE, 'wb') as f: + with open(ENCOUNTERED_THEOREMS_FILE, "wb") as f: pickle.dump(all_encountered_theorems, f) + def load_encountered_theorems(file_path): """Loads the theorems that have been encountered.""" all_encountered_theorems = set() if os.path.exists(file_path): try: - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: file_content = f.read() if file_content: # Check if the file is not empty all_encountered_theorems = pickle.loads(file_content) else: - logger.warning(f"The file {file_path} is empty. Starting with an empty set.") + logger.warning( + f"The file {file_path} is empty. Starting with an empty set." + ) except (EOFError, pickle.UnpicklingError) as e: - logger.warning(f"Error reading {file_path}: {e}. Starting with an empty set.") + logger.warning( + f"Error reading {file_path}: {e}. Starting with an empty set." + ) except Exception as e: - logger.error(f"Unexpected error when reading {file_path}: {e}. Starting with an empty set.") + logger.error( + f"Unexpected error when reading {file_path}: {e}. Starting with an empty set." + ) else: logger.info(f"The file {file_path} does not exist. 
Starting with an empty set.") - + return all_encountered_theorems -def prove_sorry_theorems(db: DynamicDatabase, prover: DistributedProver, dynamic_database_json_path, repos_to_include: Optional[List[Tuple[str, str]]] = None, batch_size: int = 12): + +def prove_sorry_theorems( + db: DynamicDatabase, + prover: DistributedProver, + dynamic_database_json_path, + repos_to_include: Optional[List[Tuple[str, str]]] = None, + batch_size: int = 12, +): """Proves sorry theorems.""" - repos_to_process = db.repositories if repos_to_include is None else [ - repo for repo in db.repositories if (repo.url, repo.commit) in repos_to_include - ] + repos_to_process = ( + db.repositories + if repos_to_include is None + else [ + repo + for repo in db.repositories + if (repo.url, repo.commit) in repos_to_include + ] + ) # To avoid proving the same theorem multiple times, potentially from different versions of the # same repo, we sort the repositories - repos_to_process.sort(key=lambda r: r.metadata['date_processed'], reverse=True) + repos_to_process.sort(key=lambda r: r.metadata["date_processed"], reverse=True) processed_theorems: Set[Tuple[str, str, Tuple[int, int], Tuple[int, int]]] = set() - all_encountered_theorems: Set[Tuple[str, str, Tuple[int, int], Tuple[int, int]]] = set() + all_encountered_theorems: Set[Tuple[str, str, Tuple[int, int], Tuple[int, int]]] = ( + set() + ) last_save_time = datetime.datetime.now() save_interval = timedelta(minutes=30) @@ -689,8 +782,10 @@ def prove_sorry_theorems(db: DynamicDatabase, prover: DistributedProver, dynamic theorem_batch = [] positions_batch = [] - - for theorem in tqdm(sorry_theorems, desc=f"Processing theorems from {repo.name}", unit="theorem"): + + for theorem in tqdm( + sorry_theorems, desc=f"Processing theorems from {repo.name}", unit="theorem" + ): # Ignore sorry theorems from the repo's dependencies if theorem.url != repo_url or theorem.commit != repo_commit: continue @@ -698,7 +793,9 @@ def prove_sorry_theorems(db: DynamicDatabase, prover: DistributedProver, dynamic theorem_id = theorem_identifier(theorem) if theorem_id in all_encountered_theorems: - logger.info(f"Skipping already encountered theorem: {theorem.full_name}") + logger.info( + f"Skipping already encountered theorem: {theorem.full_name}" + ) continue all_encountered_theorems.add(theorem_id) @@ -707,7 +804,7 @@ def prove_sorry_theorems(db: DynamicDatabase, prover: DistributedProver, dynamic continue processed_theorems.add(theorem_id) - + logger.info(f"Searching for proof for {theorem.full_name}") logger.info(f"Position: {theorem.start}") @@ -715,14 +812,21 @@ def prove_sorry_theorems(db: DynamicDatabase, prover: DistributedProver, dynamic lean_dojo_theorem = LeanDojoTheorem( repo=LeanGitRepo(repo_url, repo_commit), file_path=theorem.file_path, - full_name=theorem.full_name + full_name=theorem.full_name, ) theorem_batch.append((theorem, lean_dojo_theorem)) positions_batch.append(Pos(*theorem.start)) if len(theorem_batch) == batch_size: - process_theorem_batch(theorem_batch, positions_batch, repo, db, prover, dynamic_database_json_path) + process_theorem_batch( + theorem_batch, + positions_batch, + repo, + db, + prover, + dynamic_database_json_path, + ) theorem_batch = [] positions_batch = [] @@ -730,22 +834,30 @@ def prove_sorry_theorems(db: DynamicDatabase, prover: DistributedProver, dynamic if current_time - last_save_time >= save_interval: save_progress(all_encountered_theorems) last_save_time = current_time - + # Process any remaining theorems in the last batch if theorem_batch: - 
process_theorem_batch(theorem_batch, positions_batch, repo, db, prover, dynamic_database_json_path) + process_theorem_batch( + theorem_batch, + positions_batch, + repo, + db, + prover, + dynamic_database_json_path, + ) save_progress(all_encountered_theorems) logger.info("Finished attempting to prove sorry theorems") + def add_repo_to_database(dynamic_database_json_path, repo, db): """Adds a repository to the dynamic database.""" # Prepare the data necessary to add this repo to the dynamic database url = repo.url - if not url.endswith('.git'): - url = url + '.git' + if not url.endswith(".git"): + url = url + ".git" logger.info(f"Processing {url}") - + if "mathlib4" in url: sha = "2b29e73438e240a427bcecc7c0fe19306beb1310" v = "v4.8.0" @@ -757,19 +869,21 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): v = "v4.8.0-rc1" else: sha, v = get_compatible_commit(url) - + if not sha: logger.info(f"Failed to find a compatible commit for {url}") return None - + logger.info(f"Found compatible commit {sha} for {url}") logger.info(f"Lean version: {v}") - url = url.replace('.git', '') + url = url.replace(".git", "") repo = LeanGitRepo(url, sha) dir_name = repo.url.split("/")[-1] + "_" + sha dst_dir = RAID_DIR + "/" + DATA_DIR + "/" + dir_name logger.info(f"Generating benchmark at {dst_dir}") - traced_repo, _, _, total_theorems = generate_benchmark_lean4.main(repo.url, sha, dst_dir) + traced_repo, _, _, total_theorems = generate_benchmark_lean4.main( + repo.url, sha, dst_dir + ) if not traced_repo: logger.info(f"Failed to trace {url}") return None @@ -797,9 +911,9 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): "theorems_folder": theorems_folder, "premise_files_corpus": premise_files_corpus, "files_traced": files_traced, - "pr_url": pr_url + "pr_url": pr_url, } - + repo = Repository.from_dict(data) logger.info("Before adding new repo:") db.print_database_contents() @@ -809,6 +923,7 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): db.to_json(dynamic_database_json_path) return "Done" + def replace_sorry_with_proof(proofs): """Replace the `sorry` with the proof text in the Lean files.""" logger.info(f"Replacing sorries with {len(proofs)} proofs!") @@ -819,40 +934,44 @@ def replace_sorry_with_proof(proofs): if file_path not in proofs_by_file: proofs_by_file[file_path] = [] proofs_by_file[file_path].append((start, end, proof_text)) - + for file_path, proofs in proofs_by_file.items(): - with open(file_path, 'r') as file: + with open(file_path, "r") as file: lines = file.readlines() # sort proof by starting line and column number (working bottom up retains positions) proofs.sort(key=lambda x: (x[0].line_nb, x[0].column_nb), reverse=True) - + for start, end, proof_text in proofs: start_line, start_col = start.line_nb - 1, start.column_nb - 1 end_line, end_col = end.line_nb - 1, end.column_nb - 1 - original_text = ''.join(lines[start_line:end_line + 1]) - new_text = original_text.replace('sorry', proof_text, 1) - lines[start_line:end_line + 1] = new_text - - with open(file_path, 'w') as file: + original_text = "".join(lines[start_line : end_line + 1]) + new_text = original_text.replace("sorry", proof_text, 1) + lines[start_line : end_line + 1] = new_text + + with open(file_path, "w") as file: file.writelines(lines) logger.info("Finished replacing sorries with proofs!") + def calculate_difficulty(theorem: Theorem) -> Union[float, None]: """Calculates the difficulty of a theorem.""" proof_steps = theorem.traced_tactics - if any('sorry' in step.tactic for step in 
proof_steps): - return float('inf') # Hard (no proof) + if any("sorry" in step.tactic for step in proof_steps): + return float("inf") # Hard (no proof) if len(proof_steps) == 0: return None # To be distributed later return math.exp(len(proof_steps)) -def categorize_difficulty(difficulty: Union[float, None], percentiles: List[float]) -> str: + +def categorize_difficulty( + difficulty: Union[float, None], percentiles: List[float] +) -> str: """Categorizes the difficulty of a theorem.""" if difficulty is None: return "To_Distribute" - if difficulty == float('inf'): + if difficulty == float("inf"): return "Hard (No proof)" elif difficulty <= percentiles[0]: return "Easy" @@ -861,6 +980,7 @@ def categorize_difficulty(difficulty: Union[float, None], percentiles: List[floa else: return "Hard" + def sort_repositories_by_difficulty(db: DynamicDatabase) -> List[Repository]: """Sorts repositories by the difficulty of their theorems.""" difficulties_by_repo = defaultdict(list) @@ -872,10 +992,18 @@ def sort_repositories_by_difficulty(db: DynamicDatabase) -> List[Repository]: for theorem in repo.get_all_theorems: difficulty = calculate_difficulty(theorem) theorem.difficulty_rating = difficulty - difficulties_by_repo[repo].append((theorem.full_name, str(theorem.file_path), tuple(theorem.start), tuple(theorem.end), difficulty)) + difficulties_by_repo[repo].append( + ( + theorem.full_name, + str(theorem.file_path), + tuple(theorem.start), + tuple(theorem.end), + difficulty, + ) + ) if difficulty is not None: all_difficulties.append(difficulty) - + db.update_repository(repo) print(f"Finished {repo.name}") @@ -888,7 +1016,9 @@ def sort_repositories_by_difficulty(db: DynamicDatabase) -> List[Repository]: print(f"Starting {repo.name}") for theorem_name, file_path, start, end, difficulty in theorems: category = categorize_difficulty(difficulty, percentiles) - categorized_theorems[repo][category].append((theorem_name, file_path, start, end, difficulty)) + categorized_theorems[repo][category].append( + (theorem_name, file_path, start, end, difficulty) + ) print(f"Finished {repo.name}") print("Distributed theorems with no proofs") @@ -904,43 +1034,49 @@ def sort_repositories_by_difficulty(db: DynamicDatabase) -> List[Repository]: print(f"Finished {repo.name}") # Sort repositories based on the number of easy theorems - sorted_repos = sorted(categorized_theorems.keys(), key=lambda r: len(categorized_theorems[r]["Easy"]), reverse=True) + sorted_repos = sorted( + categorized_theorems.keys(), + key=lambda r: len(categorized_theorems[r]["Easy"]), + reverse=True, + ) return sorted_repos, categorized_theorems, percentiles + def save_sorted_repos(sorted_repos: List[Repository], file_path: str): """Saves the sorted repositories to a file.""" sorted_repo_data = [ - { - "url": repo.url, - "commit": repo.commit, - "name": repo.name - } for repo in sorted_repos + {"url": repo.url, "commit": repo.commit, "name": repo.name} + for repo in sorted_repos ] - with open(file_path, 'w') as f: + with open(file_path, "w") as f: json.dump(sorted_repo_data, f, indent=2) + def load_sorted_repos(file_path: str) -> List[Tuple[str, str, str]]: """Loads the sorted repositories from a file.""" - with open(file_path, 'r') as f: + with open(file_path, "r") as f: sorted_repo_data = json.load(f) return [(repo["url"], repo["commit"], repo["name"]) for repo in sorted_repo_data] + def write_skip_file(repo_url): """Writes a repository URL to a file to skip it.""" skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") - with 
open(skip_file_path, 'w') as f: + with open(skip_file_path, "w") as f: f.write(repo_url) + def should_skip_repo(): """Checks if a repository should be skipped.""" skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") if os.path.exists(skip_file_path): - with open(skip_file_path, 'r') as f: + with open(skip_file_path, "r") as f: repo_url = f.read().strip() return True, repo_url return False, None + def main(): """ Main function to run LeanAgent. @@ -955,9 +1091,9 @@ def main(): use_fisher = False single_repo = True curriculum_learning = True - num_repos = 15 + num_repos = 1 dynamic_database_json_path = RAID_DIR + "/" + DB_FILE_NAME - + lambdas = None if run_progressive_training: logger.info("Running progressive training") @@ -972,14 +1108,19 @@ def main(): logger.info("LeanDojo configured") # Check if the current process is the main one - is_main_process = int(os.environ.get('LOCAL_RANK', '0')) == 0 + is_main_process = int(os.environ.get("LOCAL_RANK", "0")) == 0 # Initialize the database if it doesn't exist or is empty if is_main_process: logger.info("Starting the main process") - if not os.path.exists(dynamic_database_json_path) or os.path.getsize(dynamic_database_json_path) == 0: + if ( + not os.path.exists(dynamic_database_json_path) + or os.path.getsize(dynamic_database_json_path) == 0 + ): # File doesn't exist or is empty, initialize it - logger.info(f"Initializing new database at {dynamic_database_json_path}") + logger.info( + f"Initializing new database at {dynamic_database_json_path}" + ) db = DynamicDatabase() db.to_json(dynamic_database_json_path) else: @@ -989,7 +1130,9 @@ def main(): logger.info(f"Loaded database from {dynamic_database_json_path}") except json.JSONDecodeError: # If there's an error decoding the JSON, initialize a new database - logger.warning(f"Error decoding JSON from {dynamic_database_json_path}. Initializing new database.") + logger.warning( + f"Error decoding JSON from {dynamic_database_json_path}. Initializing new database." + ) db = DynamicDatabase() db.to_json(dynamic_database_json_path) @@ -1006,10 +1149,14 @@ def main(): logger.info(f"Processing {repo.url}") result = add_repo_to_database(dynamic_database_json_path, repo, db) if result is not None: - logger.info(f"Successfully added repo {repo.url}") - logger.info(f"Successfully added {num_repos} repositories to the database") - - sorted_repos, categorized_theorems, percentiles = sort_repositories_by_difficulty(db) + logger.info(f"Successfully added repo {repo.url}") + logger.info( + f"Successfully added {num_repos} repositories to the database" + ) + + sorted_repos, categorized_theorems, percentiles = ( + sort_repositories_by_difficulty(db) + ) print("Sorted repositories. 
Saving now...") db.to_json(dynamic_database_json_path) save_sorted_repos(sorted_repos, "sorted_repos.json") @@ -1020,23 +1167,44 @@ def main(): theorems = categorized_theorems[repo][category] print(f" {category}: {len(theorems)} theorems") if theorems: - sorted_theorems = sorted(theorems, key=lambda x: x[2] if x[2] is not None else -float('inf'), reverse=True)[:3] + sorted_theorems = sorted( + theorems, + key=lambda x: ( + x[2] if x[2] is not None else -float("inf") + ), + reverse=True, + )[:3] for name, path, start, end, diff in sorted_theorems: diff_str = f"{diff:.2f}" if diff is not None else "N/A" - print(f" - {name} (File: {path}, Difficulty: {diff_str})") + print( + f" - {name} (File: {path}, Difficulty: {diff_str})" + ) print("\nOverall Statistics:") - total_theorems = sum(len(theorems) for categories in categorized_theorems.values() for theorems in categories.values()) + total_theorems = sum( + len(theorems) + for categories in categorized_theorems.values() + for theorems in categories.values() + ) for category in ["Easy", "Medium", "Hard", "Hard (No proof)"]: - count = sum(len(categories[category]) for categories in categorized_theorems.values()) + count = sum( + len(categories[category]) + for categories in categorized_theorems.values() + ) percentage = (count / total_theorems) * 100 print(f"{category}: {count} theorems ({percentage:.2f}%)") - print(f"\nPercentile thresholds: Easy <= {percentiles[0]:.2f}, Medium <= {percentiles[1]:.2f}, Hard > {percentiles[1]:.2f}") - + print( + f"\nPercentile thresholds: Easy <= {percentiles[0]:.2f}, Medium <= {percentiles[1]:.2f}, Hard > {percentiles[1]:.2f}" + ) + logger.info("Finding compatible repositories...") - updated_repos = find_and_save_compatible_commits(repo_info_file, sorted_repos) - lean_git_repos = [LeanGitRepo(repo['url'], repo['commit']) for repo in updated_repos] + updated_repos = find_and_save_compatible_commits( + repo_info_file, sorted_repos + ) + lean_git_repos = [ + LeanGitRepo(repo["url"], repo["commit"]) for repo in updated_repos + ] logger.info("Finished finding compatible repositories") else: logger.info("Starting without curriculum learning") @@ -1049,28 +1217,39 @@ def main(): logger.info(f"Processing {repo.url}") result = add_repo_to_database(dynamic_database_json_path, repo, db) if result is not None: - logger.info(f"Successfully added repo {repo.url}") - logger.info(f"Successfully added {num_repos} repositories to the database") + logger.info(f"Successfully added repo {repo.url}") + logger.info( + f"Successfully added {num_repos} repositories to the database" + ) logger.info("Finding compatible repositories...") - updated_repos = find_and_save_compatible_commits(repo_info_file, lean_git_repos) - lean_git_repos = [LeanGitRepo(repo['url'], repo['commit']) for repo in updated_repos] + updated_repos = find_and_save_compatible_commits( + repo_info_file, lean_git_repos + ) + lean_git_repos = [ + LeanGitRepo(repo["url"], repo["commit"]) for repo in updated_repos + ] logger.info("Finished finding compatible repositories") # All processes wait for the file to be created and then read from it max_attempts = 30 for attempt in range(max_attempts): try: - with open(repo_info_file, 'r') as f: + with open(repo_info_file, "r") as f: repo_info = json.load(f) break except (json.JSONDecodeError, FileNotFoundError): if attempt == max_attempts - 1: - raise Exception("Failed to read repository information after multiple attempts") + raise Exception( + "Failed to read repository information after multiple attempts" + ) time.sleep(1) - + 
# Load compatible repositories - lean_git_repos = [LeanGitRepo(info['url'].replace('.git', ''), info['commit']) for info in repo_info] + lean_git_repos = [ + LeanGitRepo(info["url"].replace(".git", ""), info["commit"]) + for info in repo_info + ] # Iterate over each repository and lambda value for i in range(num_repos): @@ -1101,8 +1280,10 @@ def main(): logger.info("Repo already in repos_for_merged_dataset") db.generate_merged_dataset(dst_dir, repos_for_merged_dataset) - - dst_dir = RAID_DIR + "/" + DATA_DIR + "/" + f"merged_with_new_{dir_name}" + + dst_dir = ( + RAID_DIR + "/" + DATA_DIR + "/" + f"merged_with_new_{dir_name}" + ) new_data_path = dst_dir logger.info("All GPUs") @@ -1116,16 +1297,18 @@ def main(): except FileNotFoundError as e: logger.error(str(e)) model_checkpoint_path = f"{RAID_DIR}/checkpoints/mathlib4_29dcec074de168ac2bf835a77ef68bbe069194c5.ckpt" - + # Train the model on the new dataset that we generated from the dynamic database. logger.info("Inside train_test_fisher") logger.info(f"Starting training at epoch {current_epoch}") seed_everything(3407) # Progessive Training - + if not torch.cuda.is_available(): - logger.warning("Indexing the corpus using CPU can be very slow.") + logger.warning( + "Indexing the corpus using CPU can be very slow." + ) device = torch.device("cpu") else: device = torch.device("cuda") @@ -1156,34 +1339,39 @@ def main(): filename_suffix = f"_lambda_{lambda_value}" checkpoint_callback = ModelCheckpoint( dirpath=RAID_DIR + "/" + CHECKPOINT_DIR, - filename=dir_name + filename_suffix + "_{epoch}-{Recall@10_val:.2f}", + filename=dir_name + + filename_suffix + + "_{epoch}-{Recall@10_val:.2f}", verbose=True, save_top_k=-1, # Save all checkpoints every_n_epochs=1, # Save every epoch (which is just once in this case) monitor="Recall@10_val", - mode="max" + mode="max", ) - + early_stop_callback = EarlyStopping( - monitor="Recall@10_val", - patience=5, - mode="max", - verbose=True + monitor="Recall@10_val", patience=5, mode="max", verbose=True ) - lr_monitor = LearningRateMonitor(logging_interval='step') + lr_monitor = LearningRateMonitor(logging_interval="step") # Set up environment variables for NCCL VERY_LONG_TIMEOUT = 7 * 24 * 60 * 60 * 52 # 1 year - os.environ['TORCH_NCCL_ASYNC_ERROR_HANDLING'] = '1' - os.environ['NCCL_TIMEOUT'] = str(VERY_LONG_TIMEOUT * 1000) + os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1" + os.environ["NCCL_TIMEOUT"] = str(VERY_LONG_TIMEOUT * 1000) # Create a custom log directory for Lightning - custom_log_dir = os.path.join(RAID_DIR, "lightning_logs", f"{dir_name}_{use_fisher}_lambda_{lambda_value}") + custom_log_dir = os.path.join( + RAID_DIR, + "lightning_logs", + f"{dir_name}_{use_fisher}_lambda_{lambda_value}", + ) os.makedirs(custom_log_dir, exist_ok=True) # Initialize DDP strategy - ddp_strategy = DDPStrategy(timeout=timedelta(seconds=VERY_LONG_TIMEOUT)) + ddp_strategy = DDPStrategy( + timeout=timedelta(seconds=VERY_LONG_TIMEOUT) + ) trainer = pl.Trainer( accelerator="gpu", gradient_clip_val=1.0, @@ -1191,7 +1379,11 @@ def main(): strategy=ddp_strategy, devices=4, accumulate_grad_batches=4, - callbacks=[lr_monitor, checkpoint_callback, early_stop_callback], + callbacks=[ + lr_monitor, + checkpoint_callback, + early_stop_callback, + ], max_epochs=current_epoch + epochs_per_repo, log_every_n_steps=1, num_sanity_val_steps=0, @@ -1203,11 +1395,15 @@ def main(): trainer.strategy.barrier() should_skip, skip_repo_url = should_skip_repo() if should_skip: - logger.info(f"Skipping repository {skip_repo_url} due to preprocessing 
issues") + logger.info( + f"Skipping repository {skip_repo_url} due to preprocessing issues" + ) trainer.strategy.barrier() if is_main_process: logger.info("Removing skip file") - skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") + skip_file_path = os.path.join( + RAID_DIR, DATA_DIR, "skip_repo.txt" + ) os.remove(skip_file_path) continue @@ -1225,35 +1421,51 @@ def main(): batch_size=BATCH_SIZE, eval_batch_size=64, max_seq_len=1024, - num_workers=4 + num_workers=4, ) - data_module.setup(stage='fit') + data_module.setup(stage="fit") - logger.info(f"Training dataset size after load: {len(data_module.ds_train)}") - logger.info(f"Validation dataset size after load: {len(data_module.ds_val)}") - logger.info(f"Testing dataset size after load: {len(data_module.ds_pred)}") + logger.info( + f"Training dataset size after load: {len(data_module.ds_train)}" + ) + logger.info( + f"Validation dataset size after load: {len(data_module.ds_val)}" + ) + logger.info( + f"Testing dataset size after load: {len(data_module.ds_pred)}" + ) - logger.info(f"Starting progressive training from epoch {current_epoch} to {current_epoch + epochs_per_repo}") + logger.info( + f"Starting progressive training from epoch {current_epoch} to {current_epoch + epochs_per_repo}" + ) # Train the model try: logger.info("hit the barrier before training") trainer.strategy.barrier() - trainer.fit(model, datamodule=data_module, ckpt_path=model_checkpoint_path) + trainer.fit( + model, + datamodule=data_module, + ckpt_path=model_checkpoint_path, + ) logger.info("hit the barrier after training") trainer.strategy.barrier() except Exception as e: print(f"An error occurred during training: {str(e)}") print(traceback.format_exc()) - logger.info(f"Finished progressive training at epoch {trainer.current_epoch}") + logger.info( + f"Finished progressive training at epoch {trainer.current_epoch}" + ) # Testing for Average Recall try: best_model_path = find_latest_checkpoint() logger.info(f"Found latest checkpoint: {best_model_path}") - best_model = PremiseRetriever.load(best_model_path, device, freeze=False, config=config) + best_model = PremiseRetriever.load( + best_model_path, device, freeze=False, config=config + ) except FileNotFoundError as e: logger.error(f"No checkpoint found: {str(e)}") logger.warning("Using the current model state.") @@ -1264,15 +1476,19 @@ def main(): logger.info("Testing...") total_R1, total_R10, total_MRR = [], [], [] dataset_path = RAID_DIR + "/" + DATA_DIR - testing_paths = [os.path.join(dataset_path, d) for d in os.listdir(dataset_path)] + testing_paths = [ + os.path.join(dataset_path, d) for d in os.listdir(dataset_path) + ] if is_main_process: with open(EVAL_RESULTS_FILE_PATH, "a") as f: f.write("\n\n\n") - f.write(f"Results for {dir_name} with lambda = {lambda_value}") + f.write( + f"Results for {dir_name} with lambda = {lambda_value}" + ) for data_path in testing_paths: if "merged" not in data_path: continue - + run_cli(best_model_path, data_path) if is_main_process: num_gpus = 4 @@ -1302,18 +1518,24 @@ def main(): avg_R10 = np.mean(total_R10) avg_MRR = np.mean(total_MRR) - logger.info(f"Average R@1 = {avg_R1} %, R@10 = {avg_R10} %, MRR = {avg_MRR}") + logger.info( + f"Average R@1 = {avg_R1} %, R@10 = {avg_R10} %, MRR = {avg_MRR}" + ) if not os.path.exists(EVAL_RESULTS_FILE_PATH): - open(EVAL_RESULTS_FILE_PATH, 'w').close() + open(EVAL_RESULTS_FILE_PATH, "w").close() with open(EVAL_RESULTS_FILE_PATH, "a") as f: f.write("\n\n\n") - f.write(f"Average R@1 = {avg_R1} %, R@10 = {avg_R10} %, MRR = 
{avg_MRR}") + f.write( + f"Average R@1 = {avg_R1} %, R@10 = {avg_R10} %, MRR = {avg_MRR}" + ) else: model_checkpoint_path = f"{RAID_DIR}/checkpoints/mathlib4_29dcec074de168ac2bf835a77ef68bbe069194c5.ckpt" if result is None: - logger.info(f"Skipping repository {repo.url} due to preprocessing issues") + logger.info( + f"Skipping repository {repo.url} due to preprocessing issues" + ) continue if is_main_process: @@ -1326,8 +1548,12 @@ def main(): # Set up the prover use_vllm = False corpus_path = dst_dir + "/corpus.jsonl" - tactic = None # `None` since we are not using a fixed tactic generator - module = None # `None` since we are not using a fixed tactic generator + tactic = ( + None # `None` since we are not using a fixed tactic generator + ) + module = ( + None # `None` since we are not using a fixed tactic generator + ) num_workers = 4 num_gpus = 4 timeout = 600 @@ -1349,14 +1575,21 @@ def main(): raid_dir=RAID_DIR, checkpoint_dir=CHECKPOINT_DIR, debug=debug, - run_progressive_training=run_progressive_training + run_progressive_training=run_progressive_training, ) # Prove sorry theorems if single_repo: - prove_sorry_theorems(db, prover, dynamic_database_json_path, repos_for_merged_dataset) + prove_sorry_theorems( + db, + prover, + dynamic_database_json_path, + repos_for_merged_dataset, + ) else: - prove_sorry_theorems(db, prover, dynamic_database_json_path, repos_for_proving) + prove_sorry_theorems( + db, prover, dynamic_database_json_path, repos_for_proving + ) db.to_json(dynamic_database_json_path) logger.info("Finished searching for proofs of sorry theorems") @@ -1379,7 +1612,7 @@ def main(): # push_changes(repo, TMP_BRANCH) # url = str(create_pull_request(repo_no_dir, PR_TITLE, PR_BODY, TMP_BRANCH)) # shutil.rmtree(repo) - + logger.info("Finished processing the repository") current_epoch += epochs_per_repo logger.info(f"current epoch: {current_epoch}") @@ -1391,5 +1624,6 @@ def main(): logger.info(f"An error occurred: {e}", file=sys.stderr) traceback.print_exc() + if __name__ == "__main__": main() diff --git a/leanagent_utils.py b/leanagent_utils.py index 381065e..684b390 100644 --- a/leanagent_utils.py +++ b/leanagent_utils.py @@ -1,6 +1,7 @@ MARK_START_SYMBOL = "" MARK_END_SYMBOL = "" + def remove_marks(s: str) -> str: """Remove all :code:`` and :code:`` from ``s``.""" - return s.replace(MARK_START_SYMBOL, "").replace(MARK_END_SYMBOL, "") + return s.replace(MARK_START_SYMBOL, "").replace(MARK_END_SYMBOL, "") diff --git a/prover/evaluate.py b/prover/evaluate.py index e5ebcf3..44eec39 100644 --- a/prover/evaluate.py +++ b/prover/evaluate.py @@ -1,5 +1,4 @@ -"""Script for evaluating the prover on theorems extracted by LeanDojo. -""" +"""Script for evaluating the prover on theorems extracted by LeanDojo.""" import os import uuid @@ -27,7 +26,7 @@ def _get_theorems( """ Retrieves a list of Lean theorems from specified files based on given filters. - This function fetches theorems from Lean files using internal helper functions and + This function fetches theorems from Lean files using internal helper functions and validates that all repositories containing the theorems have been traced with LeanDojo. Parameters: @@ -194,7 +193,7 @@ def evaluate( num_sampled_tactics=num_sampled_tactics, debug=verbose, ) - + results = prover.search_unordered(repo, theorems, positions) # Calculate the result statistics. 
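The result-statistics step referenced above reduces, under simple assumptions, to a pass-rate computation over the prover's outputs. The sketch below uses a hypothetical `DummyResult` stand-in and a made-up `pass_rate` helper; it only illustrates the bookkeeping, not the repository's actual SearchResult class or evaluation code.

# Illustrative only: summarize a list of prover outputs, where None marks a
# search that failed to initialize and a non-empty `proof` marks success.
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class DummyResult:
    proof: Optional[List[str]]  # list of tactics if proved, otherwise None

def pass_rate(results: List[Optional[DummyResult]]) -> float:
    """Fraction of attempted searches that ended with a proof."""
    attempted = [r for r in results if r is not None]
    proved = [r for r in attempted if r.proof]
    return len(proved) / len(attempted) if attempted else 0.0

if __name__ == "__main__":
    runs = [DummyResult(["simp"]), DummyResult(None), None]
    print(f"pass rate: {pass_rate(runs):.2%}")  # pass rate: 50.00%

A search that hits an initialization error is reported as None, which is why the sketch filters those entries out before computing the rate.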
diff --git a/prover/proof_search.py b/prover/proof_search.py index 6324add..e927630 100644 --- a/prover/proof_search.py +++ b/prover/proof_search.py @@ -1,5 +1,4 @@ -"""Proof search using best-first search. -""" +"""Proof search using best-first search.""" import os import sys @@ -33,8 +32,9 @@ from prover.search_tree import * from generator.model import RetrievalAugmentedGenerator, FixedTacticGenerator -tolerance = 1 # second -RAID_DIR = os.environ.get('RAID_DIR') +tolerance = 1 # second +RAID_DIR = os.environ.get("RAID_DIR") + @dataclass(frozen=True) class SearchResult: @@ -79,21 +79,21 @@ def search( ) -> Optional[SearchResult]: """ Performs a best-first search to find a proof for the given theorem. - + The search uses a tactic generator to propose tactics and expands the search tree until either a proof is found, the timeout is reached, or the search space is exhausted. - + Args: repo (LeanGitRepo): The Lean Git repository containing the theorem. thm (Theorem): The theorem to be proved. pos (Pos): The position information for the theorem in the source code. - + Returns: Optional[SearchResult]: A SearchResult object containing information about the proof search, including the proof if one was found, or None if there was an initialization error. - + Raises: No explicit exceptions are raised from this method, though internal exceptions are caught and handled. @@ -387,16 +387,22 @@ async def generate(self, prompt: str, num_samples: int) -> RequestOutput: final_output = oup return final_output + def find_latest_checkpoint(raid_dir, checkpoint_dir): """Finds the most recent checkpoint.""" checkpoint_dir = raid_dir + "/" + checkpoint_dir - all_checkpoints = [os.path.join(checkpoint_dir, f) for f in os.listdir(checkpoint_dir) if f.endswith(".ckpt")] + all_checkpoints = [ + os.path.join(checkpoint_dir, f) + for f in os.listdir(checkpoint_dir) + if f.endswith(".ckpt") + ] if not all_checkpoints: raise FileNotFoundError("No checkpoints found.") latest_checkpoint = max(all_checkpoints, key=os.path.getmtime) logger.info(f"Using the latest checkpoint: {latest_checkpoint}") return latest_checkpoint + class DistributedProver: """A distributed prover that uses Ray to parallelize the proof search. @@ -447,7 +453,7 @@ def __init__( model_checkpoint_path = find_latest_checkpoint(raid_dir, checkpoint_dir) else: model_checkpoint_path = f"{RAID_DIR}/checkpoints/mathlib4_29dcec074de168ac2bf835a77ef68bbe069194c5.ckpt" - + config = { "model_name": "kaiyuy/leandojo-lean4-retriever-tacgen-byt5-small", "lr": 1e-3, @@ -473,7 +479,7 @@ def __init__( logger.info(f"Loaded indexed corpus from {indexed_corpus_path}") tac_gen.retriever.reindex_corpus(batch_size=32) logger.info("Finished reindexing!") - + self.distributed = num_workers > 1 if not self.distributed: assert num_gpus <= 1 @@ -537,4 +543,4 @@ def search_unordered( logger.error(ex) sys.exit(1) - return results \ No newline at end of file + return results diff --git a/prover/search_tree.py b/prover/search_tree.py index 3174e67..222d030 100644 --- a/prover/search_tree.py +++ b/prover/search_tree.py @@ -1,5 +1,4 @@ -"""Definitions of the search tree used by the prover. -""" +"""Definitions of the search tree used by the prover.""" import math from enum import Enum diff --git a/replace_files.sh b/replace_files.sh index 87c7805..45a3215 100644 --- a/replace_files.sh +++ b/replace_files.sh @@ -26,7 +26,7 @@ # Note: This script modifies installed Python packages. Use with caution as it may # affect the behavior of any code using these packages. 
# ----------------------------------------------------------------------------- -export RAID_DIR="" +export RAID_DIR="~/Desktop/LeanAgent/RAID/" # Replace PyTorch Lightning progress.py python -c "import pytorch_lightning as pl; print(pl.__file__)" > pl_path.txt @@ -59,4 +59,4 @@ cat $UTILS_PY_PATH echo "Replacing $UTILS_PY_PATH" cp ${RAID_DIR}/LeanAgent/custom_utils.py $UTILS_PY_PATH echo "Contents after replacement:" -cat $UTILS_PY_PATH \ No newline at end of file +cat $UTILS_PY_PATH diff --git a/retrieval/datamodule.py b/retrieval/datamodule.py index 3070c05..400ca88 100644 --- a/retrieval/datamodule.py +++ b/retrieval/datamodule.py @@ -53,6 +53,7 @@ class RetrievalDataset(Dataset): 'label': tensor, # Additional metadata fields """ + def __init__( self, data_paths: List[str], @@ -76,14 +77,18 @@ def __init__( def load_or_cache_data(self, data_paths: List[str]) -> List[Example]: cache_file = os.path.join(self.cache_path, "cached_data.pkl") - + # Check if cached data exists if os.path.exists(cache_file): - with open(cache_file, 'rb') as file: + with open(cache_file, "rb") as file: data = pickle.load(file) logger.info(f"Loaded data from cache {cache_file}") else: - data = list(itertools.chain.from_iterable(self._load_data(path) for path in data_paths)) + data = list( + itertools.chain.from_iterable( + self._load_data(path) for path in data_paths + ) + ) # Cache the data # create file if it does not already exist try: @@ -92,7 +97,7 @@ def load_or_cache_data(self, data_paths: List[str]) -> List[Example]: if exc.errno != errno.EEXIST: raise pass - with open(cache_file, 'wb') as file: + with open(cache_file, "wb") as file: pickle.dump(data, file) logger.info(f"Saved loaded data to cache {cache_file}") return data @@ -107,7 +112,10 @@ def _load_data(self, data_path: str) -> List[Example]: state = format_state(tac["state_before"]) # Some states are empty because they are from sorry theorems that have been proven. 
context = Context( - file_path, thm["full_name"], Pos(*thm["start"]), state if state else None + file_path, + thm["full_name"], + Pos(*thm["start"]), + state if state else None, ) all_pos_premises = get_all_pos_premises( tac["annotated_tactic"], self.corpus @@ -295,6 +303,7 @@ class RetrievalDataModule(pl.LightningDataModule): ds_pred : RetrievalDataset Test dataset for prediction """ + def __init__( self, data_path: str, @@ -332,7 +341,7 @@ def setup(self, stage: Optional[str] = None) -> None: self.max_seq_len, self.tokenizer, is_train=True, - cache_path=os.path.join(self.data_path, "cache_train") + cache_path=os.path.join(self.data_path, "cache_train"), ) print(f"Training dataset size: {len(self.ds_train)}") @@ -345,7 +354,7 @@ def setup(self, stage: Optional[str] = None) -> None: self.max_seq_len, self.tokenizer, is_train=False, - cache_path=os.path.join(self.data_path, "cache_val") + cache_path=os.path.join(self.data_path, "cache_val"), ) print(f"Validation dataset size: {len(self.ds_val)}") @@ -358,7 +367,7 @@ def setup(self, stage: Optional[str] = None) -> None: self.max_seq_len, self.tokenizer, is_train=False, - cache_path=os.path.join(self.data_path, "cache_pred") + cache_path=os.path.join(self.data_path, "cache_pred"), ) print(f"Testing dataset size: {len(self.ds_pred)}") diff --git a/retrieval/evaluate.py b/retrieval/evaluate.py index d22df03..c18ddb6 100644 --- a/retrieval/evaluate.py +++ b/retrieval/evaluate.py @@ -117,4 +117,4 @@ def main() -> None: if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/retrieval/evaluate_multiple.py b/retrieval/evaluate_multiple.py index e8a6b95..6c9a4d0 100644 --- a/retrieval/evaluate_multiple.py +++ b/retrieval/evaluate_multiple.py @@ -7,10 +7,11 @@ from typing import List, Tuple from loguru import logger + def _eval(data, preds_map) -> Tuple[float, float, float]: """ Evaluates the performance of premise retrieval against ground truth. - + Parameters: ----------- data : list @@ -19,7 +20,7 @@ def _eval(data, preds_map) -> Tuple[float, float, float]: preds_map : dict Dictionary mapping theorem identifiers to prediction results. Each prediction contains 'all_pos_premises' (ground truth) and 'retrieved_premises' (model predictions). - + Returns: -------- Tuple[float, float, float] @@ -27,7 +28,7 @@ def _eval(data, preds_map) -> Tuple[float, float, float]: - R1: Top-1 Recall (percentage of times the top retrieved premise is a correct premise) - R10: Recall@10 (percentage of correct premises found in the top 10 retrievals) - MRR: Mean Reciprocal Rank (average of 1/rank where rank is the position of the first correct premise) - + Notes: ------ - For each tactic in each theorem, the function evaluates if the retrieved premises match the ground truth. @@ -37,7 +38,9 @@ def _eval(data, preds_map) -> Tuple[float, float, float]: R1, R10, MRR = [], [], [] for thm in tqdm(data): for i, _ in enumerate(thm["traced_tactics"]): - pred = preds_map.get((thm["file_path"], thm["full_name"], tuple(thm["start"]), i)) + pred = preds_map.get( + (thm["file_path"], thm["full_name"], tuple(thm["start"]), i) + ) if pred is None: continue all_pos_premises = set(pred["all_pos_premises"]) @@ -62,12 +65,13 @@ def _eval(data, preds_map) -> Tuple[float, float, float]: MRR = np.mean(MRR) if MRR else 0 return R1, R10, MRR + def main(): """ Main function for evaluating the premise retriever on multiple data splits. 
The function loads the predictions from a file, evaluates them against each provided data split, - and calculates average metrics across all splits. Metrics include Recall@1, Recall@10, and Mean + and calculates average metrics across all splits. Metrics include Recall@1, Recall@10, and Mean Reciprocal Rank (MRR). Command Line Arguments: @@ -77,14 +81,30 @@ def main(): Returns: None: Results are logged to the console. """ - parser = argparse.ArgumentParser(description="Script for evaluating the premise retriever.") - parser.add_argument("--preds-file", type=str, required=True, help="Path to the retriever's predictions file.") - parser.add_argument("--data-paths", type=str, nargs='+', required=True, help="Paths to the directories containing the data splits.") + parser = argparse.ArgumentParser( + description="Script for evaluating the premise retriever." + ) + parser.add_argument( + "--preds-file", + type=str, + required=True, + help="Path to the retriever's predictions file.", + ) + parser.add_argument( + "--data-paths", + type=str, + nargs="+", + required=True, + help="Paths to the directories containing the data splits.", + ) args = parser.parse_args() logger.info(f"Loading predictions from {args.preds_file}") preds = pickle.load(open(args.preds_file, "rb")) - preds_map = {(p["file_path"], p["full_name"], tuple(p["start"]), p["tactic_idx"]): p for p in preds} + preds_map = { + (p["file_path"], p["full_name"], tuple(p["start"]), p["tactic_idx"]): p + for p in preds + } total_R1, total_R10, total_MRR = [], [], [] for data_path in args.data_paths: @@ -99,7 +119,10 @@ def main(): avg_R1 = np.mean(total_R1) avg_R10 = np.mean(total_R10) avg_MRR = np.mean(total_MRR) - logger.info(f"Average R@1 = {avg_R1} %, Average R@10 = {avg_R10} %, Average MRR = {avg_MRR}") + logger.info( + f"Average R@1 = {avg_R1} %, Average R@10 = {avg_R10} %, Average MRR = {avg_MRR}" + ) + if __name__ == "__main__": main() diff --git a/retrieval/fisher_computation_module.py b/retrieval/fisher_computation_module.py index d08d314..a333575 100644 --- a/retrieval/fisher_computation_module.py +++ b/retrieval/fisher_computation_module.py @@ -4,6 +4,7 @@ import torch.distributed as dist import pickle + class FisherComputationModule(pl.LightningModule): def __init__(self, model): super().__init__() @@ -79,7 +80,7 @@ def on_train_epoch_end(self): across the entire distributed training process. """ logger.info("Synchronizing and normalizing Fisher Information") - + # Synchronize Fisher Information across GPUs # Each GPU now has the sum of the Fisher Information from all GPUs for each parameter for name in self.fisher_info: @@ -97,10 +98,12 @@ def on_train_epoch_end(self): self.fisher_info[name] /= total_samples def configure_optimizers(self): - return torch.optim.SGD(self.model.parameters(), lr=0) # We don't actually want to update the model + return torch.optim.SGD( + self.model.parameters(), lr=0 + ) # We don't actually want to update the model def save_fisher_info(self, fisher_file_path): if self.trainer.is_global_zero: logger.info(f"Saving Fisher Information Matrix to {fisher_file_path}") with open(fisher_file_path, "wb") as f: - pickle.dump(self.fisher_info, f) \ No newline at end of file + pickle.dump(self.fisher_info, f) diff --git a/retrieval/index.py b/retrieval/index.py index 6e0fbf2..1fb2f05 100644 --- a/retrieval/index.py +++ b/retrieval/index.py @@ -1,5 +1,4 @@ -"""Script for indexing the corpus using the retriever. 
-""" +"""Script for indexing the corpus using the retriever.""" import torch import pickle @@ -60,4 +59,4 @@ def main() -> None: main() # python retrieval/index.py --ckpt_path leandojo-lean4-retriever-byt5-small --corpus-path /data/yingzi_ma/lean_project/datasets/mathlib4_29dcec074de168ac2bf835a77ef68bbe069194c5/corpus.jsonl --output-path indexed_corpus.pkl -# python retrieval/index.py --ckpt_path leandojo-lean4-retriever-byt5-small --corpus-path mathlib4_29dcec074de168ac2bf835a77ef68bbe069194c5/corpus.jsonl --output-path indexed_corpus.pkl \ No newline at end of file +# python retrieval/index.py --ckpt_path leandojo-lean4-retriever-byt5-small --corpus-path mathlib4_29dcec074de168ac2bf835a77ef68bbe069194c5/corpus.jsonl --output-path indexed_corpus.pkl diff --git a/retrieval/main.py b/retrieval/main.py index ed4094f..331846d 100644 --- a/retrieval/main.py +++ b/retrieval/main.py @@ -1,5 +1,4 @@ -"""Script for training the premise retriever. -""" +"""Script for training the premise retriever.""" import os from typing import Tuple @@ -14,31 +13,51 @@ from retrieval.model import PremiseRetriever from retrieval.datamodule import RetrievalDataModule + class CLI(LightningCLI): def __init__(self, *args, **kwargs): - super().__init__(*args, - save_config_kwargs={"overwrite": True}, - **kwargs) + super().__init__(*args, save_config_kwargs={"overwrite": True}, **kwargs) def add_arguments_to_parser(self, parser) -> None: parser.link_arguments("model.model_name", "data.model_name") parser.link_arguments("data.max_seq_len", "model.max_seq_len") - parser.add_argument('--data-path', type=str, required=True, help='Path to the dataset.') - + parser.add_argument( + "--data-path", type=str, required=True, help="Path to the dataset." + ) + def before_instantiate_classes(self): cur_data_path = vars(vars(self.config)["predict"])["data_path"] # Modify the --config YAML file to include the current data_path - vars(vars(vars(self.config)["predict"])["data"])["data_path"] = cur_data_path + "/random" - vars(vars(vars(self.config)["predict"])["data"])["corpus_path"] = cur_data_path + "/corpus.jsonl" - logger.info(f"Data path: {vars(vars(vars(self.config)['predict'])['data'])['data_path']}") - logger.info(f"Corpus path: {vars(vars(vars(self.config)['predict'])['data'])['corpus_path']}") + vars(vars(vars(self.config)["predict"])["data"])["data_path"] = ( + cur_data_path + "/random" + ) + vars(vars(vars(self.config)["predict"])["data"])["corpus_path"] = ( + cur_data_path + "/corpus.jsonl" + ) + logger.info( + f"Data path: {vars(vars(vars(self.config)['predict'])['data'])['data_path']}" + ) + logger.info( + f"Corpus path: {vars(vars(vars(self.config)['predict'])['data'])['corpus_path']}" + ) + def run_cli(model_path, data_path): logger.info(f"PID: {os.getpid()}") # Mimic command line argument passing - sys.argv = ['main.py', 'predict', '--config', 'retrieval/confs/cli_lean4_random.yaml', '--ckpt_path', model_path, '--data-path', data_path] + sys.argv = [ + "main.py", + "predict", + "--config", + "retrieval/confs/cli_lean4_random.yaml", + "--ckpt_path", + model_path, + "--data-path", + data_path, + ] cli = CLI(PremiseRetriever, RetrievalDataModule) + def main() -> None: logger.info(f"PID: {os.getpid()}") cli = CLI(PremiseRetriever, RetrievalDataModule) diff --git a/retrieval/model.py b/retrieval/model.py index 5ccf2d3..6832674 100644 --- a/retrieval/model.py +++ b/retrieval/model.py @@ -29,6 +29,7 @@ torch.set_float32_matmul_precision("medium") + class PremiseRetriever(pl.LightningModule): """ A PyTorch Lightning module 
implementing a premise retriever for theorem proving. @@ -58,6 +59,7 @@ class PremiseRetriever(pl.LightningModule): max_seq_len (int): Maximum sequence length for tokenization num_retrieved (int, optional): Number of premises to retrieve. Defaults to 100. """ + def __init__( self, model_name: str, @@ -91,22 +93,24 @@ def set_lambda(self, lambda_value): self.lamda = lambda_value def set_previous_params(self): - self.previous_params = {name: param.clone().detach() for name, param in self.named_parameters()} + self.previous_params = { + name: param.clone().detach() for name, param in self.named_parameters() + } def ewc_loss(self): """ Calculate the Elastic Weight Consolidation (EWC) loss. - EWC loss is used to prevent catastrophic forgetting in neural networks by - penalizing changes to important parameters. The penalty is based on the - Fisher Information matrix and the difference between current and previous + EWC loss is used to prevent catastrophic forgetting in neural networks by + penalizing changes to important parameters. The penalty is based on the + Fisher Information matrix and the difference between current and previous parameter values. Returns: - float: The calculated EWC loss. If Fisher information is not available + float: The calculated EWC loss. If Fisher information is not available or lambda is zero, returns 0.0. """ if not self.fisher_info or self.lamda == 0: return 0.0 - + ewc_loss = 0 for name, param in self.named_parameters(): if name in self.fisher_info and name in self.previous_params: @@ -121,7 +125,9 @@ def ewc_loss(self): return total_loss @classmethod - def load(cls, ckpt_path: str, device, freeze: bool, config: dict) -> "PremiseRetriever": + def load( + cls, ckpt_path: str, device, freeze: bool, config: dict + ) -> "PremiseRetriever": return load_checkpoint(cls, ckpt_path, device, freeze, config) @classmethod @@ -158,7 +164,9 @@ def load_corpus(self, path_or_corpus: Union[str, Corpus]) -> None: self.corpus = indexed_corpus.corpus self.corpus_embeddings = indexed_corpus.embeddings self.embeddings_staled = False - logger.info(f"Embeddings staled load corpus pickle: {self.embeddings_staled}") + logger.info( + f"Embeddings staled load corpus pickle: {self.embeddings_staled}" + ) @property def embedding_size(self) -> int: @@ -250,7 +258,6 @@ def on_train_batch_end(self, outputs, batch, _) -> None: """Mark the embeddings as staled after a training batch.""" self.embeddings_staled = True - def configure_optimizers(self) -> Dict[str, Any]: return get_optimizers( self.parameters(), self.trainer, self.lr, self.warmup_steps @@ -274,7 +281,7 @@ def reindex_corpus(self, batch_size: int) -> None: Returns: None - """ + """ if not self.embeddings_staled: return logger.info("Re-indexing the retrieval corpus") @@ -450,12 +457,12 @@ def predict_step(self, batch: Dict[str, Any], _): "scores": s, } ) - + def on_predict_epoch_end(self) -> None: if self.trainer.log_dir is not None: logger.info("About to construct predictions map") gpu_id = self.trainer.local_rank - + preds_map = { (p["file_path"], p["full_name"], tuple(p["start"]), p["tactic_idx"]): p for p in self.predict_step_outputs diff --git a/run_leanagent.sh b/run_leanagent.sh index 66b4870..180ff70 100644 --- a/run_leanagent.sh +++ b/run_leanagent.sh @@ -24,13 +24,13 @@ # # Usage: bash run_leanagent.sh #!/bin/bash -export RAID_DIR="" -cd ${RAID_DIR}/LeanAgent +export RAID_DIR="~/Desktop/LeanAgent/RAID/" +export LEAN_AGENT_DIR="~/Desktop/LeanAgent" +cd ${LEAN_AGENT_DIR} echo "Script executed from: ${PWD}" -source 
/etc/profile.d/conda.sh +source /Users/motiwari/miniforge3/etc/profile.d/conda.sh conda activate LeanAgent export PYTHONPATH="${PYTHONPATH}:${RAID_DIR}/LeanAgent" -export GITHUB_ACCESS_TOKEN="" export CACHE_DIR="${RAID_DIR}/.cache/lean_dojo" echo "Removing old cache files" rm -rf /tmp/ray diff --git a/tests/test_common.py b/tests/test_common.py index 9bf00cd..6855286 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -17,4 +17,4 @@ def test_remove_marks(input_string, expected_output): """ Tests that remove_marks correctly strips and tags from a string. """ - assert remove_marks(input_string) == expected_output + assert remove_marks(input_string) == expected_output diff --git a/unittest_dynamic_database.py b/unittest_dynamic_database.py index dfbcc93..25c9b03 100644 --- a/unittest_dynamic_database.py +++ b/unittest_dynamic_database.py @@ -4,7 +4,15 @@ import unittest import datetime from pathlib import Path -from dynamic_database import DynamicDatabase, Repository, Theorem, AnnotatedTactic, Annotation, PremiseFile, Premise +from dynamic_database import ( + DynamicDatabase, + Repository, + Theorem, + AnnotatedTactic, + Annotation, + PremiseFile, + Premise, +) from lean_dojo.data_extraction.lean import Pos, LeanGitRepo import generate_benchmark_lean4 import lean_dojo @@ -20,11 +28,12 @@ import os from unittest.mock import patch, MagicMock -RAID_DIR = os.environ.get('RAID_DIR') +RAID_DIR = os.environ.get("RAID_DIR") DATA_DIR = "datasets_new_unittest" MERGED_DATA_DIR = "datasets_merged_unittest" PROOF_LOG_FILE_NAME = "proof_logs_unittest/proof_log_unittest.log" + class TestDynamicDatabaseCore(unittest.TestCase): """ Unit tests for the DynamicDatabase class and related functionality. @@ -42,6 +51,7 @@ class TestDynamicDatabaseCore(unittest.TestCase): The tests use a combination of simple and complex test cases to verify that all aspects of the database function correctly, including edge cases. 
""" + def setUp(self): self.db = DynamicDatabase() self.repo = Repository( @@ -72,7 +82,7 @@ def test_get_update_theorem_in_repo(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) self.repo.proven_theorems.append(theorem) self.db.add_repository(self.repo) @@ -87,7 +97,7 @@ def test_get_update_theorem_in_repo(self): end=Pos(2, 1), url="https://github.com/test/repo", commit="abc123", - theorem_statement="Updated statement" + theorem_statement="Updated statement", ) self.repo.update_theorem(updated_theorem) retrieved_theorem = self.repo.get_theorem("test_theorem", "test.lean") @@ -99,17 +109,13 @@ def test_get_update_theorem_in_repo(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) with self.assertRaises(ValueError): self.repo.update_theorem(non_existent_theorem) def test_get_premise_file_in_repo(self): - premise_file = PremiseFile( - path=Path("test.lean"), - imports=[], - premises=[] - ) + premise_file = PremiseFile(path=Path("test.lean"), imports=[], premises=[]) self.repo.premise_files.append(premise_file) self.db.add_repository(self.repo) @@ -141,7 +147,7 @@ def test_difficulty_rating_in_theorem(self): end=Pos(2, 1), url="https://github.com/test/repo", commit="abc123", - difficulty_rating=0.7 + difficulty_rating=0.7, ) self.repo.proven_theorems.append(theorem) self.db.add_repository(self.repo) @@ -152,7 +158,7 @@ def test_difficulty_rating_in_theorem(self): def test_validation_in_from_dict(self): with self.assertRaises(ValueError): DynamicDatabase.from_dict({}) - + with self.assertRaises(ValueError): Repository.from_dict({}) @@ -181,7 +187,7 @@ def test_empty_path_to_data_in_from_dict_repository(self): "metadata": {"date_processed": datetime.datetime.now().isoformat()}, "theorems_folder": "", "premise_files_corpus": "", - "files_traced": "" + "files_traced": "", } with self.assertRaises(ValueError): Repository.from_dict(data) @@ -193,17 +199,13 @@ def test_to_dict_for_all(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) theorem_dict = theorem.to_dict() self.assertIsInstance(theorem_dict, dict) self.assertEqual(theorem_dict["full_name"], "test_theorem") - premise_file = PremiseFile( - path=Path("test.lean"), - imports=[], - premises=[] - ) + premise_file = PremiseFile(path=Path("test.lean"), imports=[], premises=[]) premise_file_dict = premise_file.to_dict() self.assertIsInstance(premise_file_dict, dict) self.assertEqual(premise_file_dict["path"], "test.lean") @@ -216,7 +218,7 @@ def test_to_dict_for_all(self): full_name="test_annotation", def_path="test/path.lean", def_pos=Pos(1, 1), - def_end_pos=Pos(2, 1) + def_end_pos=Pos(2, 1), ) annotation_dict = annotation.to_dict() self.assertIsInstance(annotation_dict, dict) @@ -229,7 +231,7 @@ def test_to_dict_for_all(self): tactic="test_tactic", annotated_tactic=("test_tactic", [annotation]), state_before="test_state_before", - state_after="test_state_after" + state_after="test_state_after", ) annotated_tactic_dict = annotated_tactic.to_dict() self.assertIsInstance(annotated_tactic_dict, dict) @@ -242,7 +244,7 @@ def test_to_dict_for_all(self): code="test_code", start=Pos(1, 1), end=Pos(2, 1), - kind="theorem" + kind="theorem", ) premise_dict = premise.to_dict() self.assertIsInstance(premise_dict, dict) @@ -251,7 +253,7 @@ def test_to_dict_for_all(self): self.assertEqual(premise_dict["start"], "(1, 1)") self.assertEqual(premise_dict["end"], "(2, 1)") 
self.assertEqual(premise_dict["kind"], "theorem") - + def test_empty_string_and_none_json_serialization(self): empty_theorem = Theorem( full_name="", @@ -261,25 +263,25 @@ def test_empty_string_and_none_json_serialization(self): url="https://github.com/test/repo", commit="abc123", theorem_statement=None, - difficulty_rating=None + difficulty_rating=None, ) self.repo.proven_theorems.append(empty_theorem) self.db.add_repository(self.repo) - + json_file = "empty_none_test.json" self.db.to_json(json_file) - + loaded_db = DynamicDatabase.from_json(json_file) loaded_repo = loaded_db.get_repository(self.repo.url, self.repo.commit) loaded_theorem = loaded_repo.proven_theorems[-1] - + self.assertEqual(loaded_theorem.full_name, "") self.assertEqual(str(loaded_theorem.file_path), ".") self.assertEqual(loaded_theorem.url, "https://github.com/test/repo") self.assertEqual(loaded_theorem.commit, "abc123") self.assertIsNone(loaded_theorem.theorem_statement) self.assertIsNone(loaded_theorem.difficulty_rating) - + def test_complex_json_serialization(self): theorem1 = Theorem( full_name="theorem1", @@ -292,21 +294,24 @@ def test_complex_json_serialization(self): traced_tactics=[ AnnotatedTactic( tactic="rw [add_comm]", - annotated_tactic=("rw [add_comm]", [ - Annotation( - full_name="add_comm", - def_path="src/add_comm.lean", - def_pos=Pos(5, 1), - def_end_pos=Pos(7, 1) - ) - ]), + annotated_tactic=( + "rw [add_comm]", + [ + Annotation( + full_name="add_comm", + def_path="src/add_comm.lean", + def_pos=Pos(5, 1), + def_end_pos=Pos(7, 1), + ) + ], + ), state_before="⊢ 2 + 2 = 4", - state_after="⊢ 2 + 2 = 4" + state_after="⊢ 2 + 2 = 4", ) ], - difficulty_rating=0.7 + difficulty_rating=0.7, ) - + theorem2 = Theorem( full_name="theorem2", file_path=Path("test2.lean"), @@ -316,9 +321,9 @@ def test_complex_json_serialization(self): commit="abc123", theorem_statement="theorem2 : ∀ x y : ℕ, x + y = y + x", traced_tactics=[], - difficulty_rating=None + difficulty_rating=None, ) - + premise_file = PremiseFile( path=Path("premise.lean"), imports=["import data.nat.basic"], @@ -328,11 +333,11 @@ def test_complex_json_serialization(self): code="theorem nat_add_comm : ∀ a b : ℕ, a + b = b + a := sorry", start=Pos(1, 1), end=Pos(1, 60), - kind="theorem" + kind="theorem", ) - ] + ], ) - + complex_repo = Repository( url="https://github.com/test/complex-repo", name="Complex Test Repo", @@ -341,55 +346,59 @@ def test_complex_json_serialization(self): lean_dojo_version="1.0.0", metadata={ "date_processed": datetime.datetime.now(), - "extra_info": {"key1": "value1", "key2": 2} + "extra_info": {"key1": "value1", "key2": 2}, }, proven_theorems=[theorem1], sorry_theorems_unproved=[theorem2], premise_files=[premise_file], files_traced=[Path("test1.lean"), Path("test2.lean")], - pr_url="https://github.com/test/complex-repo/pull/1" + pr_url="https://github.com/test/complex-repo/pull/1", ) - + self.db.add_repository(complex_repo) - + json_file = "complex_test_database.json" self.db.to_json(json_file) - + loaded_db = DynamicDatabase.from_json(json_file) - + self.assertEqual(len(loaded_db.repositories), len(self.db.repositories)) - loaded_repo = loaded_db.get_repository("https://github.com/test/complex-repo", "complex123") + loaded_repo = loaded_db.get_repository( + "https://github.com/test/complex-repo", "complex123" + ) self.assertIsNotNone(loaded_repo) - + self.assertEqual(loaded_repo.name, "Complex Test Repo") self.assertEqual(loaded_repo.lean_version, "4.0.0") - self.assertEqual(loaded_repo.pr_url, 
"https://github.com/test/complex-repo/pull/1") - + self.assertEqual( + loaded_repo.pr_url, "https://github.com/test/complex-repo/pull/1" + ) + # Check theorems self.assertEqual(len(loaded_repo.proven_theorems), 1) self.assertEqual(len(loaded_repo.sorry_theorems_unproved), 1) - + loaded_theorem1 = loaded_repo.proven_theorems[0] self.assertEqual(loaded_theorem1.full_name, "theorem1") self.assertEqual(loaded_theorem1.theorem_statement, "theorem1 : 2 + 2 = 4") self.assertEqual(len(loaded_theorem1.traced_tactics), 1) self.assertEqual(loaded_theorem1.difficulty_rating, 0.7) - + loaded_theorem2 = loaded_repo.sorry_theorems_unproved[0] self.assertEqual(loaded_theorem2.full_name, "theorem2") self.assertIsNone(loaded_theorem2.difficulty_rating) - + # Check premise files self.assertEqual(len(loaded_repo.premise_files), 1) loaded_premise_file = loaded_repo.premise_files[0] self.assertEqual(str(loaded_premise_file.path), "premise.lean") self.assertEqual(len(loaded_premise_file.premises), 1) - + # Check metadata self.assertIn("extra_info", loaded_repo.metadata) self.assertEqual(loaded_repo.metadata["extra_info"]["key1"], "value1") self.assertEqual(loaded_repo.metadata["extra_info"]["key2"], 2) - + # Check files traced self.assertEqual(len(loaded_repo.files_traced), 2) self.assertIn(Path("test1.lean"), loaded_repo.files_traced) @@ -402,7 +411,7 @@ def test_is_same_theorem(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) theorem2 = Theorem( full_name="test_theorem", @@ -410,7 +419,7 @@ def test_is_same_theorem(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) theorem3 = Theorem( full_name="other_theorem", @@ -418,7 +427,7 @@ def test_is_same_theorem(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) self.assertTrue(theorem1.is_same_theorem(theorem2)) self.assertFalse(theorem1.is_same_theorem(theorem3)) @@ -430,7 +439,7 @@ def test_repository_properties(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) theorem2 = Theorem( full_name="test_theorem2", @@ -438,7 +447,7 @@ def test_repository_properties(self): start=Pos(3, 1), end=Pos(4, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) self.repo.proven_theorems.append(theorem1) self.repo.sorry_theorems_unproved.append(theorem2) @@ -456,7 +465,7 @@ def test_get_all_theorems(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) theorem2 = Theorem( full_name="test_theorem2", @@ -464,7 +473,7 @@ def test_get_all_theorems(self): start=Pos(3, 1), end=Pos(4, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) self.repo.proven_theorems.append(theorem1) self.repo.sorry_theorems_unproved.append(theorem2) @@ -481,7 +490,7 @@ def test_empty_repository(self): commit="empty123", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now()} + metadata={"date_processed": datetime.datetime.now()}, ) self.db.add_repository(empty_repo) @@ -499,7 +508,7 @@ def test_theorem_with_empty_traced_tactics(self): end=Pos(2, 1), url="https://github.com/test/repo", commit="abc123", - traced_tactics=[] + traced_tactics=[], ) self.repo.proven_theorems.append(theorem) self.db.add_repository(self.repo) @@ -516,7 +525,7 @@ def test_none_values(self): url="https://github.com/test/repo", 
commit="abc123", theorem_statement=None, - difficulty_rating=None + difficulty_rating=None, ) self.repo.proven_theorems.append(theorem) self.repo.pr_url = None @@ -534,7 +543,7 @@ def test_empty_strings(self): start=Pos(1, 1), end=Pos(2, 1), url="", - commit="" + commit="", ) self.repo.proven_theorems.append(theorem) self.db.add_repository(self.repo) @@ -551,7 +560,7 @@ def test_empty_strings(self): start=Pos(1, 1), end=Pos(2, 1), url="new_url", - commit="" + commit="", ) self.repo.update_theorem(theorem2) retrieved_theorem = self.repo.get_theorem("", "") # Should be theorem2 @@ -580,7 +589,7 @@ def test_duplicate_url_different_commit(self): commit="abc123", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now()} + metadata={"date_processed": datetime.datetime.now()}, ) repo2 = Repository( url="https://github.com/test/repo", @@ -588,7 +597,9 @@ def test_duplicate_url_different_commit(self): commit="def456", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now() + datetime.timedelta(days=1)} + metadata={ + "date_processed": datetime.datetime.now() + datetime.timedelta(days=1) + }, ) # Add a theorem to both repositories @@ -599,7 +610,7 @@ def test_duplicate_url_different_commit(self): end=Pos(2, 1), url="https://github.com/test/repo", commit="abc123", - theorem_statement="Old version" + theorem_statement="Old version", ) repo1.proven_theorems.append(common_theorem) @@ -610,27 +621,31 @@ def test_duplicate_url_different_commit(self): end=Pos(2, 1), url="https://github.com/test/repo", commit="def456", - theorem_statement="New version" + theorem_statement="New version", ) repo2.proven_theorems.append(updated_common_theorem) # Add unique theorems to each repository - repo1.proven_theorems.append(Theorem( - full_name="unique_to_repo1", - file_path=Path("repo1.lean"), - start=Pos(1, 1), - end=Pos(2, 1), - url="https://github.com/test/repo", - commit="abc123" - )) - repo2.proven_theorems.append(Theorem( - full_name="unique_to_repo2", - file_path=Path("repo2.lean"), - start=Pos(1, 1), - end=Pos(2, 1), - url="https://github.com/test/repo", - commit="def456" - )) + repo1.proven_theorems.append( + Theorem( + full_name="unique_to_repo1", + file_path=Path("repo1.lean"), + start=Pos(1, 1), + end=Pos(2, 1), + url="https://github.com/test/repo", + commit="abc123", + ) + ) + repo2.proven_theorems.append( + Theorem( + full_name="unique_to_repo2", + file_path=Path("repo2.lean"), + start=Pos(1, 1), + end=Pos(2, 1), + url="https://github.com/test/repo", + commit="def456", + ) + ) self.db.add_repository(repo1) self.db.add_repository(repo2) @@ -639,14 +654,16 @@ def test_duplicate_url_different_commit(self): dst_dir = Path(RAID_DIR) / DATA_DIR / "test_duplicate_url" self.db.generate_merged_dataset(dst_dir) - with open(dst_dir / "random" / "train.json", 'r') as f: + with open(dst_dir / "random" / "train.json", "r") as f: data = json.load(f) # Check that both repositories are represented self.assertEqual(len(data), 3) # Check that the common theorem is from the most recent repository - common_theorem_in_dataset = next(t for t in data if t["full_name"] == "common_theorem") + common_theorem_in_dataset = next( + t for t in data if t["full_name"] == "common_theorem" + ) self.assertEqual(common_theorem_in_dataset["theorem_statement"], "New version") self.assertEqual(common_theorem_in_dataset["commit"], "def456") @@ -654,12 +671,16 @@ def test_duplicate_url_different_commit(self): self.assertTrue(any(t["full_name"] == 
"unique_to_repo1" for t in data)) self.assertTrue(any(t["full_name"] == "unique_to_repo2" for t in data)) - with open(dst_dir / "metadata.json", 'r') as f: + with open(dst_dir / "metadata.json", "r") as f: metadata = json.load(f) - + self.assertEqual(len(metadata["repositories"]), 2) - self.assertTrue(any(repo["commit"] == "abc123" for repo in metadata["repositories"])) - self.assertTrue(any(repo["commit"] == "def456" for repo in metadata["repositories"])) + self.assertTrue( + any(repo["commit"] == "abc123" for repo in metadata["repositories"]) + ) + self.assertTrue( + any(repo["commit"] == "def456" for repo in metadata["repositories"]) + ) def test_change_sorry_to_proven(self): theorem = Theorem( @@ -668,7 +689,7 @@ def test_change_sorry_to_proven(self): start=Pos(1, 1), end=Pos(2, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) self.repo.sorry_theorems_unproved.append(theorem) self.db.add_repository(self.repo) @@ -684,14 +705,14 @@ def test_change_sorry_to_proven(self): start=Pos(3, 1), end=Pos(4, 1), url="https://github.com/test/repo", - commit="abc123" + commit="abc123", ) with self.assertRaises(ValueError): self.repo.change_sorry_to_proven(not_found_theorem, PROOF_LOG_FILE_NAME) with self.assertRaises(ValueError): self.repo.change_sorry_to_proven(theorem, PROOF_LOG_FILE_NAME) - + def test_add_repository_duplicate(self): repo = Repository( url="https://github.com/test/repo", @@ -708,7 +729,9 @@ def test_add_repository_duplicate(self): # Try to add the same repository again self.db.add_repository(repo) - self.assertEqual(len(self.db.repositories), 1, "Repository should not be added twice") + self.assertEqual( + len(self.db.repositories), 1, "Repository should not be added twice" + ) # Verify that the repository details are unchanged added_repo = self.db.get_repository("https://github.com/test/repo", "abc123") @@ -723,7 +746,7 @@ def test_repository_equality(self): commit="abc123", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now()} + metadata={"date_processed": datetime.datetime.now()}, ) repo2 = Repository( url="https://github.com/test/repo", @@ -731,7 +754,7 @@ def test_repository_equality(self): commit="abc123", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now()} + metadata={"date_processed": datetime.datetime.now()}, ) repo3 = Repository( url="https://github.com/test/repo", @@ -739,7 +762,7 @@ def test_repository_equality(self): commit="def456", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now()} + metadata={"date_processed": datetime.datetime.now()}, ) self.assertEqual(repo1, repo2) self.assertNotEqual(repo1, repo3) @@ -751,13 +774,13 @@ def test_add_repository_duplicate(self): commit="abc123", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now()} + metadata={"date_processed": datetime.datetime.now()}, ) self.db.add_repository(repo) self.assertEqual(len(self.db.repositories), 1) self.db.add_repository(repo) self.assertEqual(len(self.db.repositories), 1) - + def test_update_repository_duplicate(self): repo = Repository( url="https://github.com/test/repo", @@ -765,7 +788,7 @@ def test_update_repository_duplicate(self): commit="abc123", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now()} + metadata={"date_processed": datetime.datetime.now()}, ) self.db.add_repository(repo) 
self.assertEqual(len(self.db.repositories), 1) @@ -789,7 +812,7 @@ def test_update_repository_duplicate(self): self.assertEqual(updated_repo.name, "Updated Repo") self.assertEqual(updated_repo.commit, "abc123") self.assertEqual(added_repo.lean_version, "3.50.4") - + def test_update_theorem_difficulty(self): theorem = Theorem( full_name="test_theorem", @@ -798,7 +821,7 @@ def test_update_theorem_difficulty(self): end=Pos(2, 1), url="https://github.com/test/repo", commit="abc123", - difficulty_rating=None + difficulty_rating=None, ) self.repo.proven_theorems.append(theorem) self.db.add_repository(self.repo) @@ -826,7 +849,7 @@ def test_update_theorem_difficulty(self): loaded_db = DynamicDatabase.from_json(json_file) loaded_repo = loaded_db.get_repository("https://github.com/test/repo", "abc123") self.assertIsNotNone(loaded_repo) - + loaded_theorem = loaded_repo.get_theorem("test_theorem", "test.lean") self.assertIsNotNone(loaded_theorem) @@ -838,22 +861,30 @@ def create_theorem(self, name, tactics): end=Pos(2, 1), url="https://github.com/test/repo", commit="abc123", - traced_tactics=tactics + traced_tactics=tactics, ) - + def _calculate_difficulty(self, theorem: Theorem) -> Union[float, None]: proof_steps = theorem.traced_tactics - if any('sorry' in step.tactic for step in proof_steps): - return float('inf') # Hard (no proof) + if any("sorry" in step.tactic for step in proof_steps): + return float("inf") # Hard (no proof) if len(proof_steps) == 0: return None # To be distributed later return math.exp(len(proof_steps)) def test_calculate_and_update_difficulty(self): # Test case 1: Theorem with 'sorry' - sorry_theorem = self.create_theorem("sorry_theorem", [ - AnnotatedTactic(tactic="sorry", annotated_tactic=("sorry", []), state_before="", state_after="") - ]) + sorry_theorem = self.create_theorem( + "sorry_theorem", + [ + AnnotatedTactic( + tactic="sorry", + annotated_tactic=("sorry", []), + state_before="", + state_after="", + ) + ], + ) self.repo.sorry_theorems_unproved.append(sorry_theorem) # Test case 2: Theorem with no tactics @@ -861,17 +892,43 @@ def test_calculate_and_update_difficulty(self): self.repo.proven_theorems.append(empty_theorem) # Test case 3: Theorem with proven sorry - normal_theorem = self.create_theorem("proven_sorry_theorem", [ - AnnotatedTactic(tactic="tactic1", annotated_tactic=("tactic1", []), state_before="", state_after=""), - AnnotatedTactic(tactic="tactic2", annotated_tactic=("tactic2", []), state_before="", state_after="") - ]) + normal_theorem = self.create_theorem( + "proven_sorry_theorem", + [ + AnnotatedTactic( + tactic="tactic1", + annotated_tactic=("tactic1", []), + state_before="", + state_after="", + ), + AnnotatedTactic( + tactic="tactic2", + annotated_tactic=("tactic2", []), + state_before="", + state_after="", + ), + ], + ) self.repo.proven_theorems.append(normal_theorem) # Test case 4: Theorem with normal teactics - normal_theorem = self.create_theorem("normal_theorem", [ - AnnotatedTactic(tactic="tactic1", annotated_tactic=("tactic1", []), state_before="before", state_after="no goals"), - AnnotatedTactic(tactic="tactic2", annotated_tactic=("tactic2", []), state_before="before2", state_after="no goals") - ]) + normal_theorem = self.create_theorem( + "normal_theorem", + [ + AnnotatedTactic( + tactic="tactic1", + annotated_tactic=("tactic1", []), + state_before="before", + state_after="no goals", + ), + AnnotatedTactic( + tactic="tactic2", + annotated_tactic=("tactic2", []), + state_before="before2", + state_after="no goals", + ), + ], + ) 
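As a quick cross-check of the difficulty rule mirrored in _calculate_difficulty above, the four theorems constructed in this test should come out as follows; this is only an evaluation of that rule (math.exp is from the standard library), not additional test logic.

import math

# Expected ratings under the rule in _calculate_difficulty:
#   sorry_theorem        -> float("inf")  (a tactic contains 'sorry')
#   empty_theorem        -> None          (no tactics; to be distributed later)
#   proven_sorry_theorem -> math.exp(2)   (two traced tactics)
#   normal_theorem       -> math.exp(2)   (two traced tactics)
print(math.exp(2))  # ~7.389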
self.repo.proven_theorems.append(normal_theorem) self.db.add_repository(self.repo) @@ -885,7 +942,7 @@ def test_calculate_and_update_difficulty(self): self.db.update_repository(self.repo) sorry_theorem = self.repo.get_theorem("sorry_theorem", "test.lean") - self.assertEqual(sorry_theorem.difficulty_rating, float('inf')) + self.assertEqual(sorry_theorem.difficulty_rating, float("inf")) empty_theorem = self.repo.get_theorem("empty_theorem", "test.lean") self.assertIsNone(empty_theorem.difficulty_rating) @@ -901,7 +958,7 @@ def test_calculate_and_update_difficulty(self): self.assertIsNotNone(loaded_repo) loaded_sorry_theorem = loaded_repo.get_theorem("sorry_theorem", "test.lean") - self.assertEqual(loaded_sorry_theorem.difficulty_rating, float('inf')) + self.assertEqual(loaded_sorry_theorem.difficulty_rating, float("inf")) loaded_empty_theorem = loaded_repo.get_theorem("empty_theorem", "test.lean") self.assertIsNone(loaded_empty_theorem.difficulty_rating) @@ -909,6 +966,7 @@ def test_calculate_and_update_difficulty(self): loaded_normal_theorem = loaded_repo.get_theorem("normal_theorem", "test.lean") self.assertEqual(loaded_normal_theorem.difficulty_rating, math.exp(2)) + class TestDynamicDatabaseSimpleLean(unittest.TestCase): def setUp(self): self.db = DynamicDatabase() @@ -919,7 +977,7 @@ def create_simple_lean_repo(self): url = "https://github.com/Adarsh321123/SimpleLean" commit = "99a5078e1614e61f0d9cc234ca246c8744a4e660" lean_git_repo = LeanGitRepo(url, commit) - dir_name = url.split("/")[-1].replace('.git', '') + "_" + commit + dir_name = url.split("/")[-1].replace(".git", "") + "_" + commit dst_dir = RAID_DIR + "/" + DATA_DIR + "/" + dir_name + "_updated" config = lean_git_repo.get_config("lean-toolchain") v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) @@ -951,14 +1009,15 @@ def test_generate_dataset_with_empty_repo(self): self.assertTrue(dst_dir.exists()) self.assertTrue((dst_dir / "random").exists()) self.assertTrue((dst_dir / "novel_premises").exists()) - - for split in ['train', 'val', 'test']: - with open(dst_dir / "random" / f"{split}.json", 'r') as f: + + for split in ["train", "val", "test"]: + with open(dst_dir / "random" / f"{split}.json", "r") as f: data = json.load(f) self.assertEqual(len(data), 0) + class TestDynamicDatabaseUnicode(unittest.TestCase): - """" + """ " Unit test class for testing Unicode handling in the DynamicDatabase class. This test suite focuses on verifying that the DynamicDatabase correctly handles Unicode characters during serialization and deserialization operations, and @@ -973,6 +1032,7 @@ class TestDynamicDatabaseUnicode(unittest.TestCase): This ensures the database can correctly handle international character sets and mathematical notation when saving to and loading from JSON files. """ + def setUp(self): self.db = DynamicDatabase() self.unicode_repo = self.create_unicode_sample_repo() @@ -985,7 +1045,7 @@ def assertDatetimeEqual(self, dt1, dt2): may lose microsecond precision. 
""" self.assertEqual(dt1.replace(microsecond=0), dt2.replace(microsecond=0)) - + def create_unicode_sample_repo(self): repo = Repository( url="https://github.com/example/repo", @@ -1007,19 +1067,22 @@ def create_unicode_sample_repo(self): traced_tactics=[ AnnotatedTactic( tactic="induction x with n ih", - annotated_tactic=("induction x with n ih", [ - Annotation( - full_name="induction", - def_path="src/tactic/induction.lean", - def_pos=Pos(100, 1), - def_end_pos=Pos(100, 10) - ) - ]), + annotated_tactic=( + "induction x with n ih", + [ + Annotation( + full_name="induction", + def_path="src/tactic/induction.lean", + def_pos=Pos(100, 1), + def_end_pos=Pos(100, 10), + ) + ], + ), state_before="⊢ ∀ x y : ℕ, x + y = y + x", - state_after="2 goals\ncase zero\n⊢ ∀ y : ℕ, 0 + y = y + 0\ncase succ\nn : ℕ\nih : ∀ y : ℕ, n + y = y + n\n⊢ ∀ y : ℕ, succ n + y = y + succ n" + state_after="2 goals\ncase zero\n⊢ ∀ y : ℕ, 0 + y = y + 0\ncase succ\nn : ℕ\nih : ∀ y : ℕ, n + y = y + n\n⊢ ∀ y : ℕ, succ n + y = y + succ n", ) ], - difficulty_rating=0.7 + difficulty_rating=0.7, ) theorem2 = Theorem( @@ -1031,7 +1094,7 @@ def create_unicode_sample_repo(self): url="https://github.com/example/repo", commit="abc123", traced_tactics=[], - difficulty_rating=0.9 + difficulty_rating=0.9, ) repo.proven_theorems.append(theorem1) @@ -1046,87 +1109,107 @@ def create_unicode_sample_repo(self): code="theorem sqrt_squared (x : ℝ) (h : x ≥ 0) : √(x^2) = x := sorry", start=Pos(1, 1), end=Pos(1, 70), - kind="theorem" + kind="theorem", ) - ] + ], ) repo.premise_files.append(premise_file) repo.files_traced.append(Path("src/example.lean")) return repo - + def test_unicode_serialization_deserialization(self): json_file = "test_unicode_database.json" self.db.to_json(json_file) - + deserialized_db = DynamicDatabase.from_json(json_file) - + assert len(self.db.repositories) == len(deserialized_db.repositories) - + original_repo = self.db.repositories[0] deserialized_repo = deserialized_db.repositories[0] - + assert original_repo.name == deserialized_repo.name - self.assertDatetimeEqual(original_repo.metadata["date_processed"], deserialized_repo.metadata["date_processed"]) - + self.assertDatetimeEqual( + original_repo.metadata["date_processed"], + deserialized_repo.metadata["date_processed"], + ) + original_theorem1 = original_repo.proven_theorems[0] deserialized_theorem1 = deserialized_repo.proven_theorems[0] - - assert original_theorem1.theorem_statement == deserialized_theorem1.theorem_statement - assert original_theorem1.traced_tactics[0].state_before == deserialized_theorem1.traced_tactics[0].state_before - assert original_theorem1.traced_tactics[0].state_after == deserialized_theorem1.traced_tactics[0].state_after - + + assert ( + original_theorem1.theorem_statement + == deserialized_theorem1.theorem_statement + ) + assert ( + original_theorem1.traced_tactics[0].state_before + == deserialized_theorem1.traced_tactics[0].state_before + ) + assert ( + original_theorem1.traced_tactics[0].state_after + == deserialized_theorem1.traced_tactics[0].state_after + ) + original_theorem2 = original_repo.sorry_theorems_unproved[0] deserialized_theorem2 = deserialized_repo.sorry_theorems_unproved[0] - - assert original_theorem2.theorem_statement == deserialized_theorem2.theorem_statement - + + assert ( + original_theorem2.theorem_statement + == deserialized_theorem2.theorem_statement + ) + original_premise = original_repo.premise_files[0].premises[0] deserialized_premise = deserialized_repo.premise_files[0].premises[0] - + assert 
original_premise.code == deserialized_premise.code - + def test_unicode_modification(self): json_file = "test_unicode_database.json" self.db.to_json(json_file) - + deserialized_db = DynamicDatabase.from_json(json_file) - - repo = deserialized_db.get_repository("https://github.com/example/repo", "abc123") + + repo = deserialized_db.get_repository( + "https://github.com/example/repo", "abc123" + ) assert repo is not None - + sorry_theorem = repo.sorry_theorems_unproved[0] - + sorry_theorem.traced_tactics = [ AnnotatedTactic( tactic="intros a b c x h_a_nonzero", annotated_tactic=("intros a b c x h_a_nonzero", []), state_before="⊢ ∀ a b c x : ℝ, a ≠ 0 → (a * x² + b * x + c = 0 ↔ x = (-b + √(b² - 4*a*c)) / (2*a) ∨ x = (-b - √(b² - 4*a*c)) / (2*a))", - state_after="a b c x : ℝ\nh_a_nonzero : a ≠ 0\n⊢ a * x² + b * x + c = 0 ↔ x = (-b + √(b² - 4*a*c)) / (2*a) ∨ x = (-b - √(b² - 4*a*c)) / (2*a)" + state_after="a b c x : ℝ\nh_a_nonzero : a ≠ 0\n⊢ a * x² + b * x + c = 0 ↔ x = (-b + √(b² - 4*a*c)) / (2*a) ∨ x = (-b - √(b² - 4*a*c)) / (2*a)", ), AnnotatedTactic( tactic="apply iff.intro", annotated_tactic=("apply iff.intro", []), state_before="a b c x : ℝ\nh_a_nonzero : a ≠ 0\n⊢ a * x² + b * x + c = 0 ↔ x = (-b + √(b² - 4*a*c)) / (2*a) ∨ x = (-b - √(b² - 4*a*c)) / (2*a)", - state_after="no goals" - ) + state_after="no goals", + ), ] - + repo.change_sorry_to_proven(sorry_theorem, PROOF_LOG_FILE_NAME) deserialized_db.update_json(json_file) updated_db = DynamicDatabase.from_json(json_file) - updated_repo = updated_db.get_repository("https://github.com/example/repo", "abc123") + updated_repo = updated_db.get_repository( + "https://github.com/example/repo", "abc123" + ) assert updated_repo is not None - + assert len(updated_repo.sorry_theorems_unproved) == 0 assert len(updated_repo.sorry_theorems_proved) == 1 - + updated_theorem = updated_repo.sorry_theorems_proved[0] assert updated_theorem.full_name == "example.quadratic_formula" assert len(updated_theorem.traced_tactics) == 2 assert "√(b² - 4*a*c)" in updated_theorem.traced_tactics[0].state_before assert "↔" in updated_theorem.traced_tactics[1].state_before + class TestDynamicDatabase(unittest.TestCase): """ Test suite for the DynamicDatabase class. @@ -1146,6 +1229,7 @@ class TestDynamicDatabase(unittest.TestCase): Each test method uses a fresh DynamicDatabase instance and a test Repository object created in the setUp method. 
""" + def setUp(self): self.db = DynamicDatabase() self.repo = Repository( @@ -1172,7 +1256,9 @@ def test_add_repository(self): def test_get_repository(self): self.db.add_repository(self.repo) - retrieved_repo = self.db.get_repository("https://github.com/test/repo", "abc123") + retrieved_repo = self.db.get_repository( + "https://github.com/test/repo", "abc123" + ) self.assertEqual(retrieved_repo, self.repo) def test_update_repository(self): @@ -1183,12 +1269,19 @@ def test_update_repository(self): commit="abc123", lean_version="3.50.3", lean_dojo_version="1.8.4", - metadata={"date_processed": datetime.datetime.now() + datetime.timedelta(days=1)}, + metadata={ + "date_processed": datetime.datetime.now() + datetime.timedelta(days=1) + }, ) self.db.update_repository(updated_repo) - retrieved_repo = self.db.get_repository("https://github.com/test/repo", "abc123") + retrieved_repo = self.db.get_repository( + "https://github.com/test/repo", "abc123" + ) self.assertEqual(retrieved_repo.name, "Test Repo") - self.assertNotEqual(retrieved_repo.metadata["date_processed"].replace(microsecond=0), self.current_datetime.replace(microsecond=0)) + self.assertNotEqual( + retrieved_repo.metadata["date_processed"].replace(microsecond=0), + self.current_datetime.replace(microsecond=0), + ) def test_delete_repository(self): self.db.add_repository(self.repo) @@ -1203,7 +1296,10 @@ def test_to_json_and_from_json(self): self.assertEqual(len(loaded_db.repositories), 1) loaded_repo = loaded_db.get_repository("https://github.com/test/repo", "abc123") self.assertEqual(loaded_repo.name, "Test Repo") - self.assertDatetimeEqual(loaded_repo.metadata["date_processed"], self.current_datetime) + self.assertDatetimeEqual( + loaded_repo.metadata["date_processed"], self.current_datetime + ) + class TestDynamicDatabasePFR(unittest.TestCase): def setUp(self): @@ -1237,7 +1333,7 @@ def create_sample_repo(self): "theorems_folder": theorems_folder, "premise_files_corpus": premise_files_corpus, "files_traced": files_traced, - "pr_url": pr_url + "pr_url": pr_url, } repo = Repository.from_dict(data) return repo @@ -1245,30 +1341,54 @@ def create_sample_repo(self): def test_repository_creation(self): self.assertIsNotNone(self.sample_repo) self.assertEqual(self.sample_repo.url, "https://github.com/teorth/pfr") - self.assertEqual(self.sample_repo.commit, "6a5082ee465f9e44cea479c7b741b3163162bb7e") + self.assertEqual( + self.sample_repo.commit, "6a5082ee465f9e44cea479c7b741b3163162bb7e" + ) def test_theorem_loading(self): self.assertGreater(len(self.sample_repo.proven_theorems), 0) self.assertGreater(len(self.sample_repo.sorry_theorems_unproved), 0) - theorem = next(t for t in self.sample_repo.proven_theorems if t.full_name == "ContinuousLinearMap.opNorm_lsmul") + theorem = next( + t + for t in self.sample_repo.proven_theorems + if t.full_name == "ContinuousLinearMap.opNorm_lsmul" + ) self.assertIsNotNone(theorem) - self.assertEqual(theorem.file_path, Path(".lake/packages/mathlib/Mathlib/Analysis/NormedSpace/OperatorNorm/Mul.lean")) + self.assertEqual( + theorem.file_path, + Path( + ".lake/packages/mathlib/Mathlib/Analysis/NormedSpace/OperatorNorm/Mul.lean" + ), + ) self.assertEqual(theorem.start, Pos(281, 1)) self.assertEqual(theorem.end, Pos(290, 26)) def test_traced_tactics(self): - theorem = next(t for t in self.sample_repo.proven_theorems if t.full_name == "ContinuousLinearMap.opNorm_lsmul") + theorem = next( + t + for t in self.sample_repo.proven_theorems + if t.full_name == "ContinuousLinearMap.opNorm_lsmul" + ) 
self.assertGreater(len(theorem.traced_tactics), 0) first_tactic = theorem.traced_tactics[0] - self.assertEqual(first_tactic.tactic, "refine' ContinuousLinearMap.opNorm_eq_of_bounds zero_le_one (fun x => _) fun N _ h => _") - self.assertIn("ContinuousLinearMap.opNorm_eq_of_bounds", first_tactic.annotated_tactic[0]) + self.assertEqual( + first_tactic.tactic, + "refine' ContinuousLinearMap.opNorm_eq_of_bounds zero_le_one (fun x => _) fun N _ h => _", + ) + self.assertIn( + "ContinuousLinearMap.opNorm_eq_of_bounds", first_tactic.annotated_tactic[0] + ) def test_premise_loading(self): self.assertGreater(len(self.sample_repo.premise_files), 0) - premise_file = next(pf for pf in self.sample_repo.premise_files if pf.path == Path(".lake/packages/lean4/src/lean/Init/Prelude.lean")) + premise_file = next( + pf + for pf in self.sample_repo.premise_files + if pf.path == Path(".lake/packages/lean4/src/lean/Init/Prelude.lean") + ) self.assertIsNotNone(premise_file) self.assertGreater(len(premise_file.premises), 0) @@ -1289,8 +1409,12 @@ def test_serialization_deserialization(self): self.assertEqual(original_repo.name, deserialized_repo.name) self.assertEqual(original_repo.commit, deserialized_repo.commit) - self.assertEqual(len(original_repo.proven_theorems), len(deserialized_repo.proven_theorems)) - self.assertEqual(len(original_repo.premise_files), len(deserialized_repo.premise_files)) + self.assertEqual( + len(original_repo.proven_theorems), len(deserialized_repo.proven_theorems) + ) + self.assertEqual( + len(original_repo.premise_files), len(deserialized_repo.premise_files) + ) def test_generate_dataset_structure(self): url = "https://github.com/teorth/pfr" @@ -1319,7 +1443,7 @@ def test_generated_dataset_content(self): dst_dir = Path(RAID_DIR) / DATA_DIR / f"{dir_name}_generated" self.db.generate_merged_dataset(dst_dir) - with open(dst_dir / "random" / "train.json", 'r') as f: + with open(dst_dir / "random" / "train.json", "r") as f: train_data = json.load(f) self.assertIsInstance(train_data, list) self.assertGreater(len(train_data), 0) @@ -1333,19 +1457,19 @@ def test_generated_dataset_content(self): self.assertIn("end", first_theorem) self.assertIn("traced_tactics", first_theorem) - with open(dst_dir / "corpus.jsonl", 'r') as f: + with open(dst_dir / "corpus.jsonl", "r") as f: first_line = f.readline().strip() first_premise_file = json.loads(first_line) self.assertIn("path", first_premise_file) self.assertIn("imports", first_premise_file) self.assertIn("premises", first_premise_file) - with open(dst_dir / "traced_files.jsonl", 'r') as f: + with open(dst_dir / "traced_files.jsonl", "r") as f: first_line = f.readline().strip() first_traced_file = json.loads(first_line) self.assertIn("traced_file_path", first_traced_file) - with open(dst_dir / "metadata.json", 'r') as f: + with open(dst_dir / "metadata.json", "r") as f: metadata = json.load(f) self.assertIn("repositories", metadata) self.assertEqual(len(metadata["repositories"]), 1) @@ -1371,22 +1495,46 @@ def test_dataset_splitting(self): dst_dir = Path(RAID_DIR) / DATA_DIR / f"{dir_name}_generated" self.db.generate_merged_dataset(dst_dir) - for strategy in ['random', 'novel_premises']: + for strategy in ["random", "novel_premises"]: train_set = set() val_set = set() test_set = set() - with open(dst_dir / strategy / "train.json", 'r') as f: + with open(dst_dir / strategy / "train.json", "r") as f: train_data = json.load(f) - train_set = set((item['full_name'], item['file_path'], tuple(item['start']), tuple(item['end'])) for item in train_data) + 
train_set = set( + ( + item["full_name"], + item["file_path"], + tuple(item["start"]), + tuple(item["end"]), + ) + for item in train_data + ) - with open(dst_dir / strategy / "val.json", 'r') as f: + with open(dst_dir / strategy / "val.json", "r") as f: val_data = json.load(f) - val_set = set((item['full_name'], item['file_path'], tuple(item['start']), tuple(item['end'])) for item in val_data) + val_set = set( + ( + item["full_name"], + item["file_path"], + tuple(item["start"]), + tuple(item["end"]), + ) + for item in val_data + ) - with open(dst_dir / strategy / "test.json", 'r') as f: + with open(dst_dir / strategy / "test.json", "r") as f: test_data = json.load(f) - test_set = set((item['full_name'], item['file_path'], tuple(item['start']), tuple(item['end'])) for item in test_data) + test_set = set( + ( + item["full_name"], + item["file_path"], + tuple(item["start"]), + tuple(item["end"]), + ) + for item in test_data + ) self.assertGreater(len(train_set), 0) self.assertGreater(len(val_set), 0) @@ -1406,58 +1554,78 @@ def test_dataset_consistency(self): # Check that all theorems in the dataset are from the original repository all_theorems = set(thm.full_name for thm in self.sample_repo.get_all_theorems) - for strategy in ['random', 'novel_premises']: - for split in ['train', 'val', 'test']: - with open(dst_dir / strategy / f"{split}.json", 'r') as f: + for strategy in ["random", "novel_premises"]: + for split in ["train", "val", "test"]: + with open(dst_dir / strategy / f"{split}.json", "r") as f: data = json.load(f) for item in data: - self.assertIn(item['full_name'], all_theorems) + self.assertIn(item["full_name"], all_theorems) def test_compare_manual_and_dynamic_datasets(self): random.seed(3407) - manual_dataset_path = Path(RAID_DIR) / DATA_DIR / "pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_updated" - dynamic_dataset_path = Path(RAID_DIR) / DATA_DIR / "pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_generated" + manual_dataset_path = ( + Path(RAID_DIR) + / DATA_DIR + / "pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_updated" + ) + dynamic_dataset_path = ( + Path(RAID_DIR) + / DATA_DIR + / "pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_generated" + ) self.db.generate_merged_dataset(dynamic_dataset_path) - - for strategy in ['random', 'novel_premises']: + + for strategy in ["random", "novel_premises"]: logger.info(f"Comparing datasets for {strategy} strategy") manual_theorems = [] dynamic_theorems = [] - for split in ['train', 'val', 'test']: + for split in ["train", "val", "test"]: logger.info(f"Loading {split} split for {strategy} strategy") manual_file = manual_dataset_path / strategy / f"{split}.json" dynamic_file = dynamic_dataset_path / strategy / f"{split}.json" - - with open(manual_file, 'r') as f: + + with open(manual_file, "r") as f: manual_data = json.load(f) manual_theorems.extend(manual_data) - logger.info(f"Loaded {len(manual_data)} theorems from manual {split} split") - - with open(dynamic_file, 'r') as f: + logger.info( + f"Loaded {len(manual_data)} theorems from manual {split} split" + ) + + with open(dynamic_file, "r") as f: dynamic_data = json.load(f) dynamic_theorems.extend(dynamic_data) - logger.info(f"Loaded {len(dynamic_data)} theorems from dynamic {split} split") - - assert len(manual_theorems) == len(dynamic_theorems), "Manual and dynamic datasets have different number of theorems" - logger.info(f"Comparing {len(manual_theorems)} manual theorems with {len(dynamic_theorems)} dynamic theorems for {strategy} strategy") - 
self.assertTrue(self._fast_compare_theorems(manual_theorems, dynamic_theorems), - f"Theorem content for {strategy} strategy does not match") + logger.info( + f"Loaded {len(dynamic_data)} theorems from dynamic {split} split" + ) + + assert len(manual_theorems) == len( + dynamic_theorems + ), "Manual and dynamic datasets have different number of theorems" + logger.info( + f"Comparing {len(manual_theorems)} manual theorems with {len(dynamic_theorems)} dynamic theorems for {strategy} strategy" + ) + self.assertTrue( + self._fast_compare_theorems(manual_theorems, dynamic_theorems), + f"Theorem content for {strategy} strategy does not match", + ) logger.info(f"Theorem content for {strategy} strategy matches") self.maxDiff = None logger.info("Comparing corpus and traced files") - with open(manual_dataset_path / "corpus.jsonl", 'r') as f: + with open(manual_dataset_path / "corpus.jsonl", "r") as f: manual_corpus = [json.loads(line) for line in f] logger.info(f"Loaded {len(manual_corpus)} items from manual corpus") - with open(dynamic_dataset_path / "corpus.jsonl", 'r') as f: + with open(dynamic_dataset_path / "corpus.jsonl", "r") as f: dynamic_corpus = [json.loads(line) for line in f] logger.info(f"Loaded {len(dynamic_corpus)} items from dynamic corpus") - assert len(manual_corpus) == len(dynamic_corpus), "Manual and dynamic datasets have different number of premise files" + assert len(manual_corpus) == len( + dynamic_corpus + ), "Manual and dynamic datasets have different number of premise files" logger.info("Comparing corpus content") try: self.assertCountEqual(manual_corpus, dynamic_corpus) @@ -1467,15 +1635,17 @@ def test_compare_manual_and_dynamic_datasets(self): logger.info(str(e)) raise - with open(manual_dataset_path / "traced_files.jsonl", 'r') as f: + with open(manual_dataset_path / "traced_files.jsonl", "r") as f: manual_traced = [json.loads(line) for line in f] logger.info(f"Loaded {len(manual_traced)} items from manual traced files") - with open(dynamic_dataset_path / "traced_files.jsonl", 'r') as f: + with open(dynamic_dataset_path / "traced_files.jsonl", "r") as f: dynamic_traced = [json.loads(line) for line in f] logger.info(f"Loaded {len(dynamic_traced)} items from dynamic traced files") - assert len(manual_traced) == len(dynamic_traced), "Manual and dynamic datasets have different number of traced files" + assert len(manual_traced) == len( + dynamic_traced + ), "Manual and dynamic datasets have different number of traced files" logger.info("Comparing traced files content") try: self.assertCountEqual(manual_traced, dynamic_traced) @@ -1486,12 +1656,20 @@ def test_compare_manual_and_dynamic_datasets(self): raise def _fast_compare_theorems(self, manual_theorems, dynamic_theorems): - logger.info(f"Converting {len(manual_theorems)} manual theorems to hashable format") + logger.info( + f"Converting {len(manual_theorems)} manual theorems to hashable format" + ) manual_set = set(map(self._theorem_to_hashable, manual_theorems)) - assert len(manual_set) == len(manual_theorems), "Manual theorems contain duplicates" - logger.info(f"Converting {len(dynamic_theorems)} dynamic theorems to hashable format") + assert len(manual_set) == len( + manual_theorems + ), "Manual theorems contain duplicates" + logger.info( + f"Converting {len(dynamic_theorems)} dynamic theorems to hashable format" + ) dynamic_set = set(map(self._theorem_to_hashable, dynamic_theorems)) - assert len(dynamic_set) == len(dynamic_theorems), "Dynamic theorems contain duplicates" + assert len(dynamic_set) == len( + 
dynamic_theorems + ), "Dynamic theorems contain duplicates" logger.info("Comparing theorem sets") only_in_manual = manual_set - dynamic_set @@ -1520,20 +1698,27 @@ def _fast_compare_theorems(self, manual_theorems, dynamic_theorems): def _theorem_to_hashable(self, theorem): return ( - theorem['file_path'], - theorem['full_name'], - tuple(theorem['start']), - tuple(theorem['end']), + theorem["file_path"], + theorem["full_name"], + tuple(theorem["start"]), + tuple(theorem["end"]), ) def _tactic_to_hashable(self, tactic): return ( - tactic['tactic'], - tactic['annotated_tactic'][0], - tuple((a['full_name'], a['def_path'], tuple(a['def_pos']), tuple(a['def_end_pos'])) - for a in tactic['annotated_tactic'][1]), - tactic['state_before'], - tactic['state_after'] + tactic["tactic"], + tactic["annotated_tactic"][0], + tuple( + ( + a["full_name"], + a["def_path"], + tuple(a["def_pos"]), + tuple(a["def_end_pos"]), + ) + for a in tactic["annotated_tactic"][1] + ), + tactic["state_before"], + tactic["state_after"], ) def test_unicode_handling_in_dataset(self): @@ -1543,16 +1728,26 @@ def test_unicode_handling_in_dataset(self): dst_dir = Path(RAID_DIR) / DATA_DIR / f"{dir_name}_generated" self.db.generate_merged_dataset(dst_dir) - with open(dst_dir / "metadata.json", 'r', encoding='utf-8') as f: + with open(dst_dir / "metadata.json", "r", encoding="utf-8") as f: metadata = json.load(f) - self.assertIn('repositories', metadata, "No 'repositories' key in metadata") - self.assertGreater(len(metadata['repositories']), 0, "No repositories in metadata") - repo = metadata['repositories'][0] - self.assertIn('metadata', repo, "No 'metadata' key in repository") - repo_metadata = repo['metadata'] - self.assertIn('unicode', repo_metadata, "No 'unicode' key in repository metadata") - self.assertIn("ユニコード", repo_metadata['unicode'], "Unicode string not found in metadata") - self.assertIn("ユニコード", metadata['repositories'][0]['metadata']['unicode']) + self.assertIn("repositories", metadata, "No 'repositories' key in metadata") + self.assertGreater( + len(metadata["repositories"]), 0, "No repositories in metadata" + ) + repo = metadata["repositories"][0] + self.assertIn("metadata", repo, "No 'metadata' key in repository") + repo_metadata = repo["metadata"] + self.assertIn( + "unicode", repo_metadata, "No 'unicode' key in repository metadata" + ) + self.assertIn( + "ユニコード", + repo_metadata["unicode"], + "Unicode string not found in metadata", + ) + self.assertIn( + "ユニコード", metadata["repositories"][0]["metadata"]["unicode"] + ) def tearDown(self): # Clean up generated files after tests @@ -1564,7 +1759,11 @@ def tearDown(self): shutil.rmtree(dst_dir) def test_theorem_statement(self): - theorem = next(t for t in self.sample_repo.proven_theorems if t.full_name == "ContinuousLinearMap.opNorm_lsmul") + theorem = next( + t + for t in self.sample_repo.proven_theorems + if t.full_name == "ContinuousLinearMap.opNorm_lsmul" + ) self.assertIsNotNone(theorem.theorem_statement) self.assertIn("opNorm_lsmul", theorem.theorem_statement) @@ -1575,8 +1774,14 @@ def test_unicode_handling(self): def test_file_tracing(self): self.assertGreater(len(self.sample_repo.files_traced), 0) - self.assertIn(Path("PFR/Mathlib/GroupTheory/Torsion.lean"), self.sample_repo.files_traced) - self.assertIn(Path(".lake/packages/batteries/Batteries/Data/List/Lemmas.lean"), self.sample_repo.files_traced) + self.assertIn( + Path("PFR/Mathlib/GroupTheory/Torsion.lean"), self.sample_repo.files_traced + ) + self.assertIn( + 
Path(".lake/packages/batteries/Batteries/Data/List/Lemmas.lean"), + self.sample_repo.files_traced, + ) + class TestDynamicDatabasePFRNewVersion(unittest.TestCase): """ @@ -1598,10 +1803,16 @@ class TestDynamicDatabasePFRNewVersion(unittest.TestCase): It also compares dynamically generated datasets with manually created ones to ensure compatibility and correctness. """ + def setUp(self): self.db = DynamicDatabase() - self.sample_repo_PFR = self.create_sample_repo("https://github.com/teorth/pfr", "6a5082ee465f9e44cea479c7b741b3163162bb7e") - self.sample_repo_new_version = self.create_sample_repo("https://github.com/Adarsh321123/new-version-test", "f465306be03ced999caa157a85558a6c41b3e3f5") + self.sample_repo_PFR = self.create_sample_repo( + "https://github.com/teorth/pfr", "6a5082ee465f9e44cea479c7b741b3163162bb7e" + ) + self.sample_repo_new_version = self.create_sample_repo( + "https://github.com/Adarsh321123/new-version-test", + "f465306be03ced999caa157a85558a6c41b3e3f5", + ) self.db.add_repository(self.sample_repo_PFR) self.db.add_repository(self.sample_repo_new_version) @@ -1628,14 +1839,14 @@ def create_sample_repo(self, url, commit): } repo = Repository.from_dict(data) return repo - + def test_multiple_json_serialization_deserialization(self): json_file1 = "test_multiple_1.json" json_file2 = "test_multiple_2.json" - + # First serialization self.db.to_json(json_file1) - + # Deserialize and modify loaded_db1 = DynamicDatabase.from_json(json_file1) new_repo = Repository( @@ -1644,31 +1855,45 @@ def test_multiple_json_serialization_deserialization(self): commit="newcommit123", lean_version="4.0.0", lean_dojo_version="1.0.0", - metadata={"date_processed": datetime.datetime.now()} + metadata={"date_processed": datetime.datetime.now()}, ) loaded_db1.add_repository(new_repo) - + # Second serialization loaded_db1.to_json(json_file2) - + # Final deserialization loaded_db2 = DynamicDatabase.from_json(json_file2) - - self.assertEqual(len(loaded_db2.repositories), 3) # PFR, new-version-test, and new-repo - self.assertEqual(loaded_db2.repositories[2].url, "https://github.com/test/new-repo") + + self.assertEqual( + len(loaded_db2.repositories), 3 + ) # PFR, new-version-test, and new-repo + self.assertEqual( + loaded_db2.repositories[2].url, "https://github.com/test/new-repo" + ) self.assertEqual(loaded_db2.repositories[2].commit, "newcommit123") - + # Check that the original repositories are still intact - self.assertEqual(loaded_db2.repositories[0].url, "https://github.com/teorth/pfr") - self.assertEqual(loaded_db2.repositories[1].url, "https://github.com/Adarsh321123/new-version-test") - + self.assertEqual( + loaded_db2.repositories[0].url, "https://github.com/teorth/pfr" + ) + self.assertEqual( + loaded_db2.repositories[1].url, + "https://github.com/Adarsh321123/new-version-test", + ) + # Verify that the content of the repositories is preserved - pfr_repo = loaded_db2.get_repository("https://github.com/teorth/pfr", "6a5082ee465f9e44cea479c7b741b3163162bb7e") + pfr_repo = loaded_db2.get_repository( + "https://github.com/teorth/pfr", "6a5082ee465f9e44cea479c7b741b3163162bb7e" + ) self.assertIsNotNone(pfr_repo) self.assertGreater(len(pfr_repo.proven_theorems), 0) self.assertGreater(len(pfr_repo.sorry_theorems_unproved), 0) - - new_version_repo = loaded_db2.get_repository("https://github.com/Adarsh321123/new-version-test", "f465306be03ced999caa157a85558a6c41b3e3f5") + + new_version_repo = loaded_db2.get_repository( + "https://github.com/Adarsh321123/new-version-test", + 
"f465306be03ced999caa157a85558a6c41b3e3f5", + ) self.assertIsNotNone(new_version_repo) self.assertGreater(len(new_version_repo.proven_theorems), 0) self.assertGreater(len(new_version_repo.sorry_theorems_unproved), 0) @@ -1676,49 +1901,92 @@ def test_multiple_json_serialization_deserialization(self): def test_repository_creation(self): self.assertIsNotNone(self.sample_repo_PFR) self.assertEqual(self.sample_repo_PFR.url, "https://github.com/teorth/pfr") - self.assertEqual(self.sample_repo_PFR.commit, "6a5082ee465f9e44cea479c7b741b3163162bb7e") + self.assertEqual( + self.sample_repo_PFR.commit, "6a5082ee465f9e44cea479c7b741b3163162bb7e" + ) self.assertIsNotNone(self.sample_repo_new_version) - self.assertEqual(self.sample_repo_new_version.url, "https://github.com/Adarsh321123/new-version-test") - self.assertEqual(self.sample_repo_new_version.commit, "f465306be03ced999caa157a85558a6c41b3e3f5") + self.assertEqual( + self.sample_repo_new_version.url, + "https://github.com/Adarsh321123/new-version-test", + ) + self.assertEqual( + self.sample_repo_new_version.commit, + "f465306be03ced999caa157a85558a6c41b3e3f5", + ) def test_theorem_loading(self): self.assertGreater(len(self.sample_repo_PFR.proven_theorems), 0) self.assertGreater(len(self.sample_repo_PFR.sorry_theorems_unproved), 0) - theorem = next(t for t in self.sample_repo_PFR.proven_theorems if t.full_name == "ContinuousLinearMap.opNorm_lsmul") + theorem = next( + t + for t in self.sample_repo_PFR.proven_theorems + if t.full_name == "ContinuousLinearMap.opNorm_lsmul" + ) self.assertIsNotNone(theorem) - self.assertEqual(theorem.file_path, Path(".lake/packages/mathlib/Mathlib/Analysis/NormedSpace/OperatorNorm/Mul.lean")) + self.assertEqual( + theorem.file_path, + Path( + ".lake/packages/mathlib/Mathlib/Analysis/NormedSpace/OperatorNorm/Mul.lean" + ), + ) self.assertEqual(theorem.start, Pos(281, 1)) self.assertEqual(theorem.end, Pos(290, 26)) self.assertGreater(len(self.sample_repo_new_version.proven_theorems), 0) self.assertGreater(len(self.sample_repo_new_version.sorry_theorems_unproved), 0) - theorem = next(t for t in self.sample_repo_new_version.proven_theorems if t.full_name == "Ordinal.le_mul_right") + theorem = next( + t + for t in self.sample_repo_new_version.proven_theorems + if t.full_name == "Ordinal.le_mul_right" + ) self.assertIsNotNone(theorem) - self.assertEqual(theorem.file_path, Path(".lake/packages/mathlib/Mathlib/SetTheory/Ordinal/Arithmetic.lean")) + self.assertEqual( + theorem.file_path, + Path(".lake/packages/mathlib/Mathlib/SetTheory/Ordinal/Arithmetic.lean"), + ) self.assertEqual(theorem.start, Pos(742, 1)) self.assertEqual(theorem.end, Pos(744, 17)) def test_traced_tactics(self): - theorem = next(t for t in self.sample_repo_PFR.proven_theorems if t.full_name == "ContinuousLinearMap.opNorm_lsmul") + theorem = next( + t + for t in self.sample_repo_PFR.proven_theorems + if t.full_name == "ContinuousLinearMap.opNorm_lsmul" + ) self.assertGreater(len(theorem.traced_tactics), 0) first_tactic = theorem.traced_tactics[0] - self.assertEqual(first_tactic.tactic, "refine' ContinuousLinearMap.opNorm_eq_of_bounds zero_le_one (fun x => _) fun N _ h => _") - self.assertIn("ContinuousLinearMap.opNorm_eq_of_bounds", first_tactic.annotated_tactic[0]) + self.assertEqual( + first_tactic.tactic, + "refine' ContinuousLinearMap.opNorm_eq_of_bounds zero_le_one (fun x => _) fun N _ h => _", + ) + self.assertIn( + "ContinuousLinearMap.opNorm_eq_of_bounds", first_tactic.annotated_tactic[0] + ) - theorem = next(t for t in 
self.sample_repo_new_version.proven_theorems if t.full_name == "Ordinal.le_mul_right") + theorem = next( + t + for t in self.sample_repo_new_version.proven_theorems + if t.full_name == "Ordinal.le_mul_right" + ) self.assertGreater(len(theorem.traced_tactics), 0) first_tactic = theorem.traced_tactics[0] - self.assertEqual(first_tactic.tactic, "convert mul_le_mul_right' (one_le_iff_pos.2 hb) a") + self.assertEqual( + first_tactic.tactic, "convert mul_le_mul_right' (one_le_iff_pos.2 hb) a" + ) self.assertIn("mul_le_mul_right'", first_tactic.annotated_tactic[0]) def test_premise_loading(self): self.assertGreater(len(self.sample_repo_PFR.premise_files), 0) - premise_file = next(pf for pf in self.sample_repo_PFR.premise_files if pf.path == Path(".lake/packages/lean4/src/lean/Init/Prelude.lean")) + premise_file = next( + pf + for pf in self.sample_repo_PFR.premise_files + if pf.path == Path(".lake/packages/lean4/src/lean/Init/Prelude.lean") + ) self.assertIsNotNone(premise_file) self.assertGreater(len(premise_file.premises), 0) @@ -1728,7 +1996,11 @@ def test_premise_loading(self): self.assertGreater(len(self.sample_repo_new_version.premise_files), 0) - premise_file = next(pf for pf in self.sample_repo_new_version.premise_files if pf.path == Path(".lake/packages/lean4/src/lean/Init/Prelude.lean")) + premise_file = next( + pf + for pf in self.sample_repo_new_version.premise_files + if pf.path == Path(".lake/packages/lean4/src/lean/Init/Prelude.lean") + ) self.assertIsNotNone(premise_file) self.assertGreater(len(premise_file.premises), 0) @@ -1749,16 +2021,32 @@ def test_serialization_deserialization(self): self.assertEqual(original_repo_PFR.name, deserialized_repo_PFR.name) self.assertEqual(original_repo_PFR.commit, deserialized_repo_PFR.commit) - self.assertEqual(len(original_repo_PFR.proven_theorems), len(deserialized_repo_PFR.proven_theorems)) - self.assertEqual(len(original_repo_PFR.premise_files), len(deserialized_repo_PFR.premise_files)) + self.assertEqual( + len(original_repo_PFR.proven_theorems), + len(deserialized_repo_PFR.proven_theorems), + ) + self.assertEqual( + len(original_repo_PFR.premise_files), + len(deserialized_repo_PFR.premise_files), + ) original_repo_new_version = self.db.repositories[0] deserialized_repo_new_version = deserialized_db.repositories[0] - self.assertEqual(original_repo_new_version.name, deserialized_repo_new_version.name) - self.assertEqual(original_repo_new_version.commit, deserialized_repo_new_version.commit) - self.assertEqual(len(original_repo_new_version.proven_theorems), len(deserialized_repo_new_version.proven_theorems)) - self.assertEqual(len(original_repo_new_version.premise_files), len(deserialized_repo_new_version.premise_files)) + self.assertEqual( + original_repo_new_version.name, deserialized_repo_new_version.name + ) + self.assertEqual( + original_repo_new_version.commit, deserialized_repo_new_version.commit + ) + self.assertEqual( + len(original_repo_new_version.proven_theorems), + len(deserialized_repo_new_version.proven_theorems), + ) + self.assertEqual( + len(original_repo_new_version.premise_files), + len(deserialized_repo_new_version.premise_files), + ) def test_generate_dataset_structure(self): url_PFR = "https://github.com/teorth/pfr" @@ -1767,7 +2055,11 @@ def test_generate_dataset_structure(self): url_new_version = "https://github.com/Adarsh321123/new-version-test" commit_new_version = "f465306be03ced999caa157a85558a6c41b3e3f5" dir_name_new_version = url_new_version.split("/")[-1] + "_" + commit_new_version - dst_dir = Path(RAID_DIR) / 
DATA_DIR / f"{dir_name_PFR}_{dir_name_new_version}_generated" + dst_dir = ( + Path(RAID_DIR) + / DATA_DIR + / f"{dir_name_PFR}_{dir_name_new_version}_generated" + ) self.db.generate_merged_dataset(dst_dir) self.assertTrue(dst_dir.exists()) @@ -1790,30 +2082,42 @@ def test_generated_dataset_content(self): url_new_version = "https://github.com/Adarsh321123/new-version-test" commit_new_version = "f465306be03ced999caa157a85558a6c41b3e3f5" dir_name_new_version = url_new_version.split("/")[-1] + "_" + commit_new_version - dst_dir = Path(RAID_DIR) / DATA_DIR / f"{dir_name_PFR}_{dir_name_new_version}_generated" + dst_dir = ( + Path(RAID_DIR) + / DATA_DIR + / f"{dir_name_PFR}_{dir_name_new_version}_generated" + ) self.db.generate_merged_dataset(dst_dir) # Count sorry theorems in each repository repo_sorry_counts = { self.sample_repo_PFR.url: len(self.sample_repo_PFR.sorry_theorems_unproved), - self.sample_repo_new_version.url: len(self.sample_repo_new_version.sorry_theorems_unproved) + self.sample_repo_new_version.url: len( + self.sample_repo_new_version.sorry_theorems_unproved + ), } total_sorry_theorems = sum(repo_sorry_counts.values()) # Count sorry theorems in the generated dataset dataset_sorry_count = 0 - for split in ['train', 'val', 'test']: - with open(dst_dir / "random" / f"{split}.json", 'r') as f: + for split in ["train", "val", "test"]: + with open(dst_dir / "random" / f"{split}.json", "r") as f: data = json.load(f) for theorem in data: - if any(tactic.get('tactic') == 'sorry' for tactic in theorem.get('traced_tactics', [])): + if any( + tactic.get("tactic") == "sorry" + for tactic in theorem.get("traced_tactics", []) + ): dataset_sorry_count += 1 - self.assertEqual(dataset_sorry_count, total_sorry_theorems, - f"Number of sorry theorems in dataset ({dataset_sorry_count}) does not match " - f"the sum from individual repositories ({total_sorry_theorems})") + self.assertEqual( + dataset_sorry_count, + total_sorry_theorems, + f"Number of sorry theorems in dataset ({dataset_sorry_count}) does not match " + f"the sum from individual repositories ({total_sorry_theorems})", + ) - with open(dst_dir / "random" / "train.json", 'r') as f: + with open(dst_dir / "random" / "train.json", "r") as f: train_data = json.load(f) self.assertIsInstance(train_data, list) self.assertGreater(len(train_data), 0) @@ -1827,19 +2131,19 @@ def test_generated_dataset_content(self): self.assertIn("end", first_theorem) self.assertIn("traced_tactics", first_theorem) - with open(dst_dir / "corpus.jsonl", 'r') as f: + with open(dst_dir / "corpus.jsonl", "r") as f: first_line = f.readline().strip() first_premise_file = json.loads(first_line) self.assertIn("path", first_premise_file) self.assertIn("imports", first_premise_file) self.assertIn("premises", first_premise_file) - with open(dst_dir / "traced_files.jsonl", 'r') as f: + with open(dst_dir / "traced_files.jsonl", "r") as f: first_line = f.readline().strip() first_traced_file = json.loads(first_line) self.assertIn("traced_file_path", first_traced_file) - with open(dst_dir / "metadata.json", 'r') as f: + with open(dst_dir / "metadata.json", "r") as f: metadata = json.load(f) self.assertIn("repositories", metadata) self.assertEqual(len(metadata["repositories"]), 2) @@ -1851,9 +2155,12 @@ def test_generated_dataset_content(self): self.assertIn("num_files_traced", metadata) # Check if the total number of sorry theorems in metadata matches our count - self.assertEqual(metadata["num_sorry_theorems"], total_sorry_theorems, - f"Number of sorry theorems in metadata 
({metadata['num_sorry_theorems']}) " - f"does not match the sum from individual repositories ({total_sorry_theorems})") + self.assertEqual( + metadata["num_sorry_theorems"], + total_sorry_theorems, + f"Number of sorry theorems in metadata ({metadata['num_sorry_theorems']}) " + f"does not match the sum from individual repositories ({total_sorry_theorems})", + ) for repo in metadata["repositories"]: self.assertIn("url", repo) @@ -1870,25 +2177,53 @@ def test_dataset_splitting(self): url_new_version = "https://github.com/Adarsh321123/new-version-test" commit_new_version = "f465306be03ced999caa157a85558a6c41b3e3f5" dir_name_new_version = url_new_version.split("/")[-1] + "_" + commit_new_version - dst_dir = Path(RAID_DIR) / DATA_DIR / f"{dir_name_PFR}_{dir_name_new_version}_generated" + dst_dir = ( + Path(RAID_DIR) + / DATA_DIR + / f"{dir_name_PFR}_{dir_name_new_version}_generated" + ) self.db.generate_merged_dataset(dst_dir) - for strategy in ['random', 'novel_premises']: + for strategy in ["random", "novel_premises"]: train_set = set() val_set = set() test_set = set() - with open(dst_dir / strategy / "train.json", 'r') as f: + with open(dst_dir / strategy / "train.json", "r") as f: train_data = json.load(f) - train_set = set((item['full_name'], item['file_path'], tuple(item['start']), tuple(item['end'])) for item in train_data) + train_set = set( + ( + item["full_name"], + item["file_path"], + tuple(item["start"]), + tuple(item["end"]), + ) + for item in train_data + ) - with open(dst_dir / strategy / "val.json", 'r') as f: + with open(dst_dir / strategy / "val.json", "r") as f: val_data = json.load(f) - val_set = set((item['full_name'], item['file_path'], tuple(item['start']), tuple(item['end'])) for item in val_data) + val_set = set( + ( + item["full_name"], + item["file_path"], + tuple(item["start"]), + tuple(item["end"]), + ) + for item in val_data + ) - with open(dst_dir / strategy / "test.json", 'r') as f: + with open(dst_dir / strategy / "test.json", "r") as f: test_data = json.load(f) - test_set = set((item['full_name'], item['file_path'], tuple(item['start']), tuple(item['end'])) for item in test_data) + test_set = set( + ( + item["full_name"], + item["file_path"], + tuple(item["start"]), + tuple(item["end"]), + ) + for item in test_data + ) self.assertGreater(len(train_set), 0) self.assertGreater(len(val_set), 0) @@ -1905,73 +2240,105 @@ def test_dataset_consistency(self): url_new_version = "https://github.com/Adarsh321123/new-version-test" commit_new_version = "f465306be03ced999caa157a85558a6c41b3e3f5" dir_name_new_version = url_new_version.split("/")[-1] + "_" + commit_new_version - dst_dir = Path(RAID_DIR) / DATA_DIR / f"{dir_name_PFR}_{dir_name_new_version}_generated" + dst_dir = ( + Path(RAID_DIR) + / DATA_DIR + / f"{dir_name_PFR}_{dir_name_new_version}_generated" + ) self.db.generate_merged_dataset(dst_dir) # Check that all theorems in the dataset are from the original repositories - all_theorems_PFR = set(thm.full_name for thm in self.sample_repo_PFR.get_all_theorems) - all_theorems_new_version = set(thm.full_name for thm in self.sample_repo_new_version.get_all_theorems) + all_theorems_PFR = set( + thm.full_name for thm in self.sample_repo_PFR.get_all_theorems + ) + all_theorems_new_version = set( + thm.full_name for thm in self.sample_repo_new_version.get_all_theorems + ) - for strategy in ['random', 'novel_premises']: - for split in ['train', 'val', 'test']: - with open(dst_dir / strategy / f"{split}.json", 'r') as f: + for strategy in ["random", "novel_premises"]: + for 
split in ["train", "val", "test"]: + with open(dst_dir / strategy / f"{split}.json", "r") as f: data = json.load(f) for item in data: - self.assertIn(item['full_name'], all_theorems_PFR | all_theorems_new_version) - + self.assertIn( + item["full_name"], + all_theorems_PFR | all_theorems_new_version, + ) + def test_generate_dataset_with_specific_repo(self): dynamic_dataset_path = Path(RAID_DIR) / DATA_DIR / "pfr_only_generated" - self.db.generate_merged_dataset(dynamic_dataset_path, repos_to_include=[(self.sample_repo_PFR.url, self.sample_repo_PFR.commit)]) + self.db.generate_merged_dataset( + dynamic_dataset_path, + repos_to_include=[(self.sample_repo_PFR.url, self.sample_repo_PFR.commit)], + ) self.assertTrue(dynamic_dataset_path.exists()) self.assertTrue((dynamic_dataset_path / "random").exists()) self.assertTrue((dynamic_dataset_path / "novel_premises").exists()) - with open(dynamic_dataset_path / "metadata.json", 'r') as f: + with open(dynamic_dataset_path / "metadata.json", "r") as f: metadata = json.load(f) self.assertEqual(len(metadata["repositories"]), 1) - self.assertEqual(metadata["repositories"][0]["url"], self.sample_repo_PFR.url) + self.assertEqual( + metadata["repositories"][0]["url"], self.sample_repo_PFR.url + ) # Compare with the original PFR dataset - manual_dataset_path = Path(RAID_DIR) / DATA_DIR / "pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_updated" - - for strategy in ['random', 'novel_premises']: + manual_dataset_path = ( + Path(RAID_DIR) + / DATA_DIR + / "pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_updated" + ) + + for strategy in ["random", "novel_premises"]: logger.info(f"Comparing datasets for {strategy} strategy") manual_theorems = [] dynamic_theorems = [] - for split in ['train', 'val', 'test']: + for split in ["train", "val", "test"]: logger.info(f"Loading {split} split for {strategy} strategy") manual_file = manual_dataset_path / strategy / f"{split}.json" dynamic_file = dynamic_dataset_path / strategy / f"{split}.json" - - with open(manual_file, 'r') as f: + + with open(manual_file, "r") as f: manual_data = json.load(f) manual_theorems.extend(manual_data) - logger.info(f"Loaded {len(manual_data)} theorems from manual {split} split") - - with open(dynamic_file, 'r') as f: + logger.info( + f"Loaded {len(manual_data)} theorems from manual {split} split" + ) + + with open(dynamic_file, "r") as f: dynamic_data = json.load(f) dynamic_theorems.extend(dynamic_data) - logger.info(f"Loaded {len(dynamic_data)} theorems from dynamic {split} split") - - assert len(manual_theorems) == len(dynamic_theorems), "Manual and dynamic datasets have different number of theorems" - logger.info(f"Comparing {len(manual_theorems)} manual theorems with {len(dynamic_theorems)} dynamic theorems for {strategy} strategy") - self.assertTrue(self._fast_compare_theorems(manual_theorems, dynamic_theorems), - f"Theorem content for {strategy} strategy does not match") + logger.info( + f"Loaded {len(dynamic_data)} theorems from dynamic {split} split" + ) + + assert len(manual_theorems) == len( + dynamic_theorems + ), "Manual and dynamic datasets have different number of theorems" + logger.info( + f"Comparing {len(manual_theorems)} manual theorems with {len(dynamic_theorems)} dynamic theorems for {strategy} strategy" + ) + self.assertTrue( + self._fast_compare_theorems(manual_theorems, dynamic_theorems), + f"Theorem content for {strategy} strategy does not match", + ) logger.info(f"Theorem content for {strategy} strategy matches") self.maxDiff = None logger.info("Comparing corpus and traced 
files") - with open(manual_dataset_path / "corpus.jsonl", 'r') as f: + with open(manual_dataset_path / "corpus.jsonl", "r") as f: manual_corpus = [json.loads(line) for line in f] logger.info(f"Loaded {len(manual_corpus)} items from manual corpus") - with open(dynamic_dataset_path / "corpus.jsonl", 'r') as f: + with open(dynamic_dataset_path / "corpus.jsonl", "r") as f: dynamic_corpus = [json.loads(line) for line in f] logger.info(f"Loaded {len(dynamic_corpus)} items from dynamic corpus") - assert len(manual_corpus) == len(dynamic_corpus), "Manual and dynamic datasets have different number of premise files" + assert len(manual_corpus) == len( + dynamic_corpus + ), "Manual and dynamic datasets have different number of premise files" logger.info("Comparing corpus content") try: self.assertCountEqual(manual_corpus, dynamic_corpus) @@ -1981,15 +2348,17 @@ def test_generate_dataset_with_specific_repo(self): logger.info(str(e)) raise - with open(manual_dataset_path / "traced_files.jsonl", 'r') as f: + with open(manual_dataset_path / "traced_files.jsonl", "r") as f: manual_traced = [json.loads(line) for line in f] logger.info(f"Loaded {len(manual_traced)} items from manual traced files") - with open(dynamic_dataset_path / "traced_files.jsonl", 'r') as f: + with open(dynamic_dataset_path / "traced_files.jsonl", "r") as f: dynamic_traced = [json.loads(line) for line in f] logger.info(f"Loaded {len(dynamic_traced)} items from dynamic traced files") - assert len(manual_traced) == len(dynamic_traced), "Manual and dynamic datasets have different number of traced files" + assert len(manual_traced) == len( + dynamic_traced + ), "Manual and dynamic datasets have different number of traced files" logger.info("Comparing traced files content") try: self.assertCountEqual(manual_traced, dynamic_traced) @@ -2002,32 +2371,46 @@ def test_generate_dataset_with_specific_repo(self): def test_compare_manual_and_dynamic_datasets(self): random.seed(3407) - manual_dataset_path = Path(RAID_DIR) / MERGED_DATA_DIR / "merged_pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_new-version-test_f465306be03ced999caa157a85558a6c41b3e3f5_updated" - dynamic_dataset_path = Path(RAID_DIR) / MERGED_DATA_DIR / "merged_pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_new-version-test_f465306be03ced999caa157a85558a6c41b3e3f5_generated" + manual_dataset_path = ( + Path(RAID_DIR) + / MERGED_DATA_DIR + / "merged_pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_new-version-test_f465306be03ced999caa157a85558a6c41b3e3f5_updated" + ) + dynamic_dataset_path = ( + Path(RAID_DIR) + / MERGED_DATA_DIR + / "merged_pfr_6a5082ee465f9e44cea479c7b741b3163162bb7e_new-version-test_f465306be03ced999caa157a85558a6c41b3e3f5_generated" + ) self.db.generate_merged_dataset(dynamic_dataset_path) - - for strategy in ['random', 'novel_premises']: + + for strategy in ["random", "novel_premises"]: logger.info(f"Comparing datasets for {strategy} strategy") manual_theorems = [] dynamic_theorems = [] - for split in ['train', 'val', 'test']: + for split in ["train", "val", "test"]: logger.info(f"Loading {split} split for {strategy} strategy") manual_file = manual_dataset_path / strategy / f"{split}.json" dynamic_file = dynamic_dataset_path / strategy / f"{split}.json" - - with open(manual_file, 'r') as f: + + with open(manual_file, "r") as f: manual_data = json.load(f) manual_theorems.extend(manual_data) - logger.info(f"Loaded {len(manual_data)} theorems from manual {split} split") - - with open(dynamic_file, 'r') as f: + logger.info( + f"Loaded {len(manual_data)} theorems from manual 
{split} split" + ) + + with open(dynamic_file, "r") as f: dynamic_data = json.load(f) dynamic_theorems.extend(dynamic_data) - logger.info(f"Loaded {len(dynamic_data)} theorems from dynamic {split} split") - - logger.info(f"Comparing {len(manual_theorems)} manual theorems with {len(dynamic_theorems)} dynamic theorems for {strategy} strategy") + logger.info( + f"Loaded {len(dynamic_data)} theorems from dynamic {split} split" + ) + + logger.info( + f"Comparing {len(manual_theorems)} manual theorems with {len(dynamic_theorems)} dynamic theorems for {strategy} strategy" + ) # The manual code has a bug where it allows duplicate theorems as long as they exist in different repositories. # As such, we need to remove these duplicates. @@ -2040,89 +2423,145 @@ def test_compare_manual_and_dynamic_datasets(self): else: manual_dict[key] = thm deduplicated_manual_theorems = list(manual_dict.values()) - + dynamic_dict = {self._theorem_to_key(t): t for t in dynamic_theorems} - logger.info(f"After deduplication - Manual theorems: {len(deduplicated_manual_theorems)}, Dynamic theorems: {len(dynamic_theorems)}") - + logger.info( + f"After deduplication - Manual theorems: {len(deduplicated_manual_theorems)}, Dynamic theorems: {len(dynamic_theorems)}" + ) + only_in_manual = set(manual_dict.keys()) - set(dynamic_dict.keys()) only_in_dynamic = set(dynamic_dict.keys()) - set(manual_dict.keys()) - + if only_in_manual: - logger.error(f"{len(only_in_manual)} theorems only in manual dataset for {strategy}") + logger.error( + f"{len(only_in_manual)} theorems only in manual dataset for {strategy}" + ) for key in list(only_in_manual)[:1]: manual_thm = manual_dict[key] - logger.error(f"Manual only: {manual_thm['full_name']} in {manual_thm['file_path']}") - logger.error(f" URL: {manual_thm['url']}, Commit: {manual_thm['commit']}") - logger.error(f" Start: {manual_thm['start']}, End: {manual_thm['end']}") - logger.error(f" Theorem statement: {manual_thm['theorem_statement'][:100]}...") # First 100 chars - + logger.error( + f"Manual only: {manual_thm['full_name']} in {manual_thm['file_path']}" + ) + logger.error( + f" URL: {manual_thm['url']}, Commit: {manual_thm['commit']}" + ) + logger.error( + f" Start: {manual_thm['start']}, End: {manual_thm['end']}" + ) + logger.error( + f" Theorem statement: {manual_thm['theorem_statement'][:100]}..." 
+ ) # First 100 chars + if only_in_dynamic: - logger.error(f"{len(only_in_dynamic)} theorems only in dynamic dataset for {strategy}") + logger.error( + f"{len(only_in_dynamic)} theorems only in dynamic dataset for {strategy}" + ) for key in list(only_in_dynamic)[:1]: dynamic_thm = dynamic_dict[key] - logger.error(f"Dynamic only: {dynamic_thm['full_name']} in {dynamic_thm['file_path']}") - logger.error(f" URL: {dynamic_thm['url']}, Commit: {dynamic_thm['commit']}") - logger.error(f" Start: {dynamic_thm['start']}, End: {dynamic_thm['end']}") - logger.error(f" Theorem statement: {dynamic_thm['theorem_statement'][:100]}...") # First 100 chars - - self.assertEqual(len(only_in_manual), 0, f"Theorems found only in manual dataset for {strategy}") - self.assertEqual(len(only_in_dynamic), 0, f"Theorems found only in dynamic dataset for {strategy}") - - assert len(set(manual_dict.keys())) == len(set(dynamic_dict.keys())), "Manual and dynamic datasets have different number of theorems" - self.assertTrue(self._fast_compare_theorems(deduplicated_manual_theorems, dynamic_theorems), - f"Theorem content for {strategy} strategy does not match") - logger.info(f"Theorem content for {strategy} strategy matches after deduplication") + logger.error( + f"Dynamic only: {dynamic_thm['full_name']} in {dynamic_thm['file_path']}" + ) + logger.error( + f" URL: {dynamic_thm['url']}, Commit: {dynamic_thm['commit']}" + ) + logger.error( + f" Start: {dynamic_thm['start']}, End: {dynamic_thm['end']}" + ) + logger.error( + f" Theorem statement: {dynamic_thm['theorem_statement'][:100]}..." + ) # First 100 chars + + self.assertEqual( + len(only_in_manual), + 0, + f"Theorems found only in manual dataset for {strategy}", + ) + self.assertEqual( + len(only_in_dynamic), + 0, + f"Theorems found only in dynamic dataset for {strategy}", + ) + + assert len(set(manual_dict.keys())) == len( + set(dynamic_dict.keys()) + ), "Manual and dynamic datasets have different number of theorems" + self.assertTrue( + self._fast_compare_theorems( + deduplicated_manual_theorems, dynamic_theorems + ), + f"Theorem content for {strategy} strategy does not match", + ) + logger.info( + f"Theorem content for {strategy} strategy matches after deduplication" + ) self.maxDiff = None logger.info("Comparing corpus and traced files") - with open(manual_dataset_path / "corpus.jsonl", 'r') as f: + with open(manual_dataset_path / "corpus.jsonl", "r") as f: manual_corpus = [json.loads(line) for line in f] logger.info(f"Loaded {len(manual_corpus)} items from manual corpus") - with open(dynamic_dataset_path / "corpus.jsonl", 'r') as f: + with open(dynamic_dataset_path / "corpus.jsonl", "r") as f: dynamic_corpus = [json.loads(line) for line in f] logger.info(f"Loaded {len(dynamic_corpus)} items from dynamic corpus") - manual_corpus_dict = {item['path']: item for item in manual_corpus} + manual_corpus_dict = {item["path"]: item for item in manual_corpus} deduplicated_manual_corpus = list(manual_corpus_dict.values()) - dynamic_corpus_dict = {item['path']: item for item in dynamic_corpus} + dynamic_corpus_dict = {item["path"]: item for item in dynamic_corpus} - logger.info(f"Manual corpus: {len(manual_corpus)} items, {len(deduplicated_manual_corpus)} unique") + logger.info( + f"Manual corpus: {len(manual_corpus)} items, {len(deduplicated_manual_corpus)} unique" + ) logger.info(f"Dynamic corpus: {len(dynamic_corpus)} items") - only_in_manual_corpus = set(manual_corpus_dict.keys()) - set(dynamic_corpus_dict.keys()) - only_in_dynamic_corpus = set(dynamic_corpus_dict.keys()) 
- set(manual_corpus_dict.keys()) + only_in_manual_corpus = set(manual_corpus_dict.keys()) - set( + dynamic_corpus_dict.keys() + ) + only_in_dynamic_corpus = set(dynamic_corpus_dict.keys()) - set( + manual_corpus_dict.keys() + ) - self.assertEqual(len(only_in_manual_corpus), 0, "Corpus items found only in manual dataset") - self.assertEqual(len(only_in_dynamic_corpus), 0, "Corpus items found only in dynamic dataset") + self.assertEqual( + len(only_in_manual_corpus), 0, "Corpus items found only in manual dataset" + ) + self.assertEqual( + len(only_in_dynamic_corpus), 0, "Corpus items found only in dynamic dataset" + ) - assert len(set(dynamic_corpus_dict.keys())) == len(set(dynamic_corpus_dict.keys())), "Manual and dynamic datasets have different number of premise files" + assert len(set(dynamic_corpus_dict.keys())) == len( + set(dynamic_corpus_dict.keys()) + ), "Manual and dynamic datasets have different number of premise files" # Since we choose the first processed premise file in the case of duplicates, # we can't compare the lines directly logger.info("Comparing corpus content") try: # Check that the paths are the same in both datasets manual_paths = set(manual_corpus_dict.keys()) - dynamic_paths = set(item['path'] for item in dynamic_corpus) - self.assertEqual(manual_paths, dynamic_paths, "Paths in manual and dynamic corpus do not match") + dynamic_paths = set(item["path"] for item in dynamic_corpus) + self.assertEqual( + manual_paths, + dynamic_paths, + "Paths in manual and dynamic corpus do not match", + ) logger.info("Corpus content matches after deduplication") except AssertionError as e: logger.info("Corpus content mismatch:") logger.info(str(e)) raise - with open(manual_dataset_path / "traced_files.jsonl", 'r') as f: + with open(manual_dataset_path / "traced_files.jsonl", "r") as f: manual_traced = [json.loads(line) for line in f] logger.info(f"Loaded {len(manual_traced)} items from manual traced files") - with open(dynamic_dataset_path / "traced_files.jsonl", 'r') as f: + with open(dynamic_dataset_path / "traced_files.jsonl", "r") as f: dynamic_traced = [json.loads(line) for line in f] logger.info(f"Loaded {len(dynamic_traced)} items from dynamic traced files") - manual_traced_dict = {item['traced_file_path']: item for item in manual_traced} + manual_traced_dict = {item["traced_file_path"]: item for item in manual_traced} deduplicated_manual_traced = list(manual_traced_dict.values()) - logger.info(f"Manual traced files: {len(manual_traced)} items, {len(deduplicated_manual_traced)} unique") + logger.info( + f"Manual traced files: {len(manual_traced)} items, {len(deduplicated_manual_traced)} unique" + ) logger.info(f"Dynamic traced files: {len(dynamic_traced)} items") logger.info("Comparing traced files content") @@ -2136,19 +2575,27 @@ def test_compare_manual_and_dynamic_datasets(self): def _theorem_to_key(self, theorem): return ( - theorem['file_path'], - theorem['full_name'], - tuple(theorem['start']), - tuple(theorem['end']) + theorem["file_path"], + theorem["full_name"], + tuple(theorem["start"]), + tuple(theorem["end"]), ) def _fast_compare_theorems(self, manual_theorems, dynamic_theorems): - logger.info(f"Converting {len(manual_theorems)} manual theorems to hashable format") + logger.info( + f"Converting {len(manual_theorems)} manual theorems to hashable format" + ) manual_set = set(map(self._theorem_to_hashable, manual_theorems)) - assert len(manual_set) == len(manual_theorems), "Manual theorems contain duplicates" - logger.info(f"Converting {len(dynamic_theorems)} dynamic 
theorems to hashable format") + assert len(manual_set) == len( + manual_theorems + ), "Manual theorems contain duplicates" + logger.info( + f"Converting {len(dynamic_theorems)} dynamic theorems to hashable format" + ) dynamic_set = set(map(self._theorem_to_hashable, dynamic_theorems)) - assert len(dynamic_set) == len(dynamic_theorems), "Dynamic theorems contain duplicates" + assert len(dynamic_set) == len( + dynamic_theorems + ), "Dynamic theorems contain duplicates" logger.info("Comparing theorem sets") only_in_manual = manual_set - dynamic_set @@ -2177,22 +2624,29 @@ def _fast_compare_theorems(self, manual_theorems, dynamic_theorems): def _theorem_to_hashable(self, theorem): return ( - theorem['file_path'], - theorem['full_name'], - tuple(theorem['start']), - tuple(theorem['end']), + theorem["file_path"], + theorem["full_name"], + tuple(theorem["start"]), + tuple(theorem["end"]), ) def _tactic_to_hashable(self, tactic): return ( - tactic['tactic'], - tactic['annotated_tactic'][0], - tuple((a['full_name'], a['def_path'], tuple(a['def_pos']), tuple(a['def_end_pos'])) - for a in tactic['annotated_tactic'][1]), - tactic['state_before'], - tactic['state_after'] - ) - + tactic["tactic"], + tactic["annotated_tactic"][0], + tuple( + ( + a["full_name"], + a["def_path"], + tuple(a["def_pos"]), + tuple(a["def_end_pos"]), + ) + for a in tactic["annotated_tactic"][1] + ), + tactic["state_before"], + tactic["state_after"], + ) + def test_unicode_handling_in_dataset(self): url_PFR = "https://github.com/teorth/pfr" commit_PFR = "6a5082ee465f9e44cea479c7b741b3163162bb7e" @@ -2200,27 +2654,55 @@ def test_unicode_handling_in_dataset(self): url_new_version = "https://github.com/Adarsh321123/new-version-test" commit_new_version = "f465306be03ced999caa157a85558a6c41b3e3f5" dir_name_new_version = url_new_version.split("/")[-1] + "_" + commit_new_version - dst_dir = Path(RAID_DIR) / DATA_DIR / f"{dir_name_PFR}_{dir_name_new_version}_generated" + dst_dir = ( + Path(RAID_DIR) + / DATA_DIR + / f"{dir_name_PFR}_{dir_name_new_version}_generated" + ) self.db.generate_merged_dataset(dst_dir) - with open(dst_dir / "metadata.json", 'r', encoding='utf-8') as f: + with open(dst_dir / "metadata.json", "r", encoding="utf-8") as f: metadata = json.load(f) - self.assertIn('repositories', metadata, "No 'repositories' key in metadata") - self.assertGreater(len(metadata['repositories']), 0, "No repositories in metadata") - repo_PFR = metadata['repositories'][0] - self.assertIn('metadata', repo_PFR, "No 'metadata' key in repository") - repo_PFR_metadata = repo_PFR['metadata'] - self.assertIn('unicode', repo_PFR_metadata, "No 'unicode' key in repository metadata") - self.assertIn("ユニコード", repo_PFR_metadata['unicode'], "Unicode string not found in metadata") - self.assertIn("ユニコード", metadata['repositories'][0]['metadata']['unicode']) - - self.assertGreater(len(metadata['repositories']), 1, "Only one repository in metadata") - repo_new_version = metadata['repositories'][1] - self.assertIn('metadata', repo_new_version, "No 'metadata' key in repository") - repo_new_version_metadata = repo_new_version['metadata'] - self.assertIn('unicode', repo_new_version_metadata, "No 'unicode' key in repository metadata") - self.assertIn("ユニコード", repo_new_version_metadata['unicode'], "Unicode string not found in metadata") - self.assertIn("ユニコード", metadata['repositories'][1]['metadata']['unicode']) + self.assertIn("repositories", metadata, "No 'repositories' key in metadata") + self.assertGreater( + len(metadata["repositories"]), 0, "No repositories 
in metadata" + ) + repo_PFR = metadata["repositories"][0] + self.assertIn("metadata", repo_PFR, "No 'metadata' key in repository") + repo_PFR_metadata = repo_PFR["metadata"] + self.assertIn( + "unicode", repo_PFR_metadata, "No 'unicode' key in repository metadata" + ) + self.assertIn( + "ユニコード", + repo_PFR_metadata["unicode"], + "Unicode string not found in metadata", + ) + self.assertIn( + "ユニコード", metadata["repositories"][0]["metadata"]["unicode"] + ) + + self.assertGreater( + len(metadata["repositories"]), 1, "Only one repository in metadata" + ) + repo_new_version = metadata["repositories"][1] + self.assertIn( + "metadata", repo_new_version, "No 'metadata' key in repository" + ) + repo_new_version_metadata = repo_new_version["metadata"] + self.assertIn( + "unicode", + repo_new_version_metadata, + "No 'unicode' key in repository metadata", + ) + self.assertIn( + "ユニコード", + repo_new_version_metadata["unicode"], + "Unicode string not found in metadata", + ) + self.assertIn( + "ユニコード", metadata["repositories"][1]["metadata"]["unicode"] + ) def tearDown(self): # Clean up generated files after tests @@ -2230,16 +2712,28 @@ def tearDown(self): url_new_version = "https://github.com/Adarsh321123/new-version-test" commit_new_version = "f465306be03ced999caa157a85558a6c41b3e3f5" dir_name_new_version = url_new_version.split("/")[-1] + "_" + commit_new_version - dst_dir = Path(RAID_DIR) / DATA_DIR / f"{dir_name_PFR}_{dir_name_new_version}_generated" + dst_dir = ( + Path(RAID_DIR) + / DATA_DIR + / f"{dir_name_PFR}_{dir_name_new_version}_generated" + ) if dst_dir.exists(): shutil.rmtree(dst_dir) def test_theorem_statement(self): - theorem = next(t for t in self.sample_repo_PFR.proven_theorems if t.full_name == "ContinuousLinearMap.opNorm_lsmul") + theorem = next( + t + for t in self.sample_repo_PFR.proven_theorems + if t.full_name == "ContinuousLinearMap.opNorm_lsmul" + ) self.assertIsNotNone(theorem.theorem_statement) self.assertIn("opNorm_lsmul", theorem.theorem_statement) - theorem = next(t for t in self.sample_repo_new_version.proven_theorems if t.full_name == "Ordinal.le_mul_right") + theorem = next( + t + for t in self.sample_repo_new_version.proven_theorems + if t.full_name == "Ordinal.le_mul_right" + ) self.assertIsNotNone(theorem.theorem_statement) self.assertIn("le_mul_right", theorem.theorem_statement) @@ -2254,12 +2748,25 @@ def test_unicode_handling(self): def test_file_tracing(self): self.assertGreater(len(self.sample_repo_PFR.files_traced), 0) - self.assertIn(Path("PFR/Mathlib/GroupTheory/Torsion.lean"), self.sample_repo_PFR.files_traced) - self.assertIn(Path(".lake/packages/batteries/Batteries/Data/List/Lemmas.lean"), self.sample_repo_PFR.files_traced) + self.assertIn( + Path("PFR/Mathlib/GroupTheory/Torsion.lean"), + self.sample_repo_PFR.files_traced, + ) + self.assertIn( + Path(".lake/packages/batteries/Batteries/Data/List/Lemmas.lean"), + self.sample_repo_PFR.files_traced, + ) self.assertGreater(len(self.sample_repo_new_version.files_traced), 0) - self.assertIn(Path("NewVersionTest/ExercisesOne.lean"), self.sample_repo_new_version.files_traced) - self.assertIn(Path(".lake/packages/batteries/Batteries/Data/List/Lemmas.lean"), self.sample_repo_new_version.files_traced) + self.assertIn( + Path("NewVersionTest/ExercisesOne.lean"), + self.sample_repo_new_version.files_traced, + ) + self.assertIn( + Path(".lake/packages/batteries/Batteries/Data/List/Lemmas.lean"), + self.sample_repo_new_version.files_traced, + ) + class TestDynamicDatabaseProver(unittest.TestCase): """ @@ -2281,6 +2788,7 
@@ class TestDynamicDatabaseProver(unittest.TestCase): a single unproved theorem, which can then be manipulated to test different aspects of the system's functionality. """ + def setUp(self): self.db = DynamicDatabase() self.repo = Repository( @@ -2298,9 +2806,9 @@ def setUp(self): end=Pos(10, 1), url="https://github.com/test/repo", commit="abcdef1234567890", - theorem_statement="theorem test_theorem : 2 + 2 = 4 := sorry" + theorem_statement="theorem test_theorem : 2 + 2 = 4 := sorry", ) - ] + ], ) self.db.add_repository(self.repo) @@ -2310,7 +2818,7 @@ def test_create_annotated_tactic(self): tactic=tactic, annotated_tactic=(tactic, []), state_before="", - state_after="" + state_after="", ) self.assertEqual(annotated_tactic.tactic, tactic) self.assertEqual(annotated_tactic.annotated_tactic, (tactic, [])) @@ -2324,14 +2832,14 @@ def test_update_theorem_with_proof(self): tactic="rw [add_comm]", annotated_tactic=("rw [add_comm]", []), state_before="⊢ 2 + 2 = 4", - state_after="⊢ 2 + 2 = 4" + state_after="⊢ 2 + 2 = 4", ), AnnotatedTactic( tactic="refl", annotated_tactic=("refl", []), state_before="⊢ 2 + 2 = 4", - state_after="no goals" - ) + state_after="no goals", + ), ] theorem.traced_tactics = traced_tactics self.repo.change_sorry_to_proven(theorem, PROOF_LOG_FILE_NAME) @@ -2340,7 +2848,9 @@ def test_update_theorem_with_proof(self): updated_repo = self.db.get_repository(self.repo.url, self.repo.commit) self.assertEqual(len(updated_repo.sorry_theorems_proved), 1) self.assertEqual(len(updated_repo.sorry_theorems_unproved), 0) - self.assertEqual(updated_repo.sorry_theorems_proved[0].traced_tactics, traced_tactics) + self.assertEqual( + updated_repo.sorry_theorems_proved[0].traced_tactics, traced_tactics + ) def test_json_serialization_with_proved_theorems(self): results = [ @@ -2352,42 +2862,45 @@ def test_json_serialization_with_proved_theorems(self): environment_time=2.0, total_time=3.0, num_total_nodes=10, - num_searched_nodes=5 + num_searched_nodes=5, ) ] result = results[0] - + traced_tactics = [ AnnotatedTactic( tactic=tactic, annotated_tactic=(tactic, []), state_before="", - state_after="" - ) for tactic in result.proof + state_after="", + ) + for tactic in result.proof ] self.repo.sorry_theorems_unproved[0].traced_tactics = traced_tactics - self.repo.change_sorry_to_proven(self.repo.sorry_theorems_unproved[0], PROOF_LOG_FILE_NAME) - + self.repo.change_sorry_to_proven( + self.repo.sorry_theorems_unproved[0], PROOF_LOG_FILE_NAME + ) + # Serialize to JSON json_file = "proved_theorems_test.json" self.db.to_json(json_file) - + # Deserialize from JSON loaded_db = DynamicDatabase.from_json(json_file) - + # Verify the loaded data loaded_repo = loaded_db.get_repository(self.repo.url, self.repo.commit) self.assertIsNotNone(loaded_repo) - + self.assertEqual(len(loaded_repo.sorry_theorems_unproved), 0) self.assertEqual(len(loaded_repo.sorry_theorems_proved), 1) - + proved_theorem = loaded_repo.sorry_theorems_proved[0] self.assertEqual(proved_theorem.full_name, "test_theorem") self.assertEqual(len(proved_theorem.traced_tactics), 2) self.assertEqual(proved_theorem.traced_tactics[0].tactic, "rw [add_comm]") self.assertEqual(proved_theorem.traced_tactics[1].tactic, "refl") - + # Test updating the loaded database new_theorem = Theorem( full_name="new_theorem", @@ -2396,18 +2909,18 @@ def test_json_serialization_with_proved_theorems(self): end=Pos(5, 1), url="https://github.com/test/repo", commit="abcdef1234567890", - theorem_statement="theorem new_theorem : 3 + 3 = 6 := sorry" + 
theorem_statement="theorem new_theorem : 3 + 3 = 6 := sorry", ) loaded_repo.sorry_theorems_unproved.append(new_theorem) - + # Serialize the updated database updated_json_file = "updated_proved_theorems_test.json" loaded_db.to_json(updated_json_file) - + # Deserialize and verify the update final_db = DynamicDatabase.from_json(updated_json_file) final_repo = final_db.get_repository(self.repo.url, self.repo.commit) - + self.assertEqual(len(final_repo.sorry_theorems_proved), 1) self.assertEqual(len(final_repo.sorry_theorems_unproved), 1) self.assertEqual(final_repo.sorry_theorems_unproved[0].full_name, "new_theorem") @@ -2415,16 +2928,16 @@ def test_json_serialization_with_proved_theorems(self): def test_update_theorem_with_proof_and_json(self): json_file = "temp_file.json" theorem = self.repo.sorry_theorems_unproved[0] - + traced_tactics = [ AnnotatedTactic( tactic="rw [add_comm]", annotated_tactic=("rw [add_comm]", []), state_before="", - state_after="" + state_after="", ) ] - + theorem.traced_tactics = traced_tactics self.repo.change_sorry_to_proven(theorem, PROOF_LOG_FILE_NAME) self.db.update_repository(self.repo) @@ -2440,7 +2953,7 @@ def test_update_theorem_with_proof_and_json(self): self.assertEqual(proved_theorem.file_path, theorem.file_path) self.assertEqual(proved_theorem.start, theorem.start) self.assertEqual(proved_theorem.end, theorem.end) - + self.assertEqual(len(proved_theorem.traced_tactics), 1) loaded_tactic = proved_theorem.traced_tactics[0] self.assertEqual(loaded_tactic.tactic, "rw [add_comm]") @@ -2458,7 +2971,7 @@ def test_prove_sorry_theorems(self): environment_time=2.0, total_time=3.0, num_total_nodes=10, - num_searched_nodes=5 + num_searched_nodes=5, ) ] result = results[0] if results else None @@ -2472,11 +2985,13 @@ def test_prove_sorry_theorems(self): tactic=tactic, annotated_tactic=(tactic, []), state_before="", - state_after="" + state_after="", ) ) self.repo.sorry_theorems_unproved[0].traced_tactics = traced_tactics - self.repo.change_sorry_to_proven(self.repo.sorry_theorems_unproved[0], PROOF_LOG_FILE_NAME) + self.repo.change_sorry_to_proven( + self.repo.sorry_theorems_unproved[0], PROOF_LOG_FILE_NAME + ) self.assertEqual(len(self.repo.sorry_theorems_unproved), 0) self.assertEqual(len(self.repo.sorry_theorems_proved), 1) @@ -2493,10 +3008,14 @@ def test_save_load_dynamic_database(self): self.assertEqual(len(self.db.repositories), len(loaded_db.repositories)) self.assertEqual(self.db.repositories[0].url, loaded_db.repositories[0].url) - self.assertEqual(self.db.repositories[0].commit, loaded_db.repositories[0].commit) - self.assertEqual(len(self.db.repositories[0].sorry_theorems_unproved), - len(loaded_db.repositories[0].sorry_theorems_unproved)) - + self.assertEqual( + self.db.repositories[0].commit, loaded_db.repositories[0].commit + ) + self.assertEqual( + len(self.db.repositories[0].sorry_theorems_unproved), + len(loaded_db.repositories[0].sorry_theorems_unproved), + ) + def test_add_repository_and_save(self): json_file = "temp_file.json" @@ -2517,9 +3036,9 @@ def test_add_repository_and_save(self): "end": [10, 1], "url": "https://github.com/test/new-repo", "commit": "1234567890abcdef", - "theorem_statement": "theorem new_test_theorem : 3 + 3 = 6 := sorry" + "theorem_statement": "theorem new_test_theorem : 3 + 3 = 6 := sorry", } - ] + ], } new_repo = Repository.from_dict(new_repo_data) @@ -2528,13 +3047,18 @@ def test_add_repository_and_save(self): loaded_db = DynamicDatabase.from_json(json_file) self.assertEqual(len(loaded_db.repositories), 2) - 
self.assertEqual(loaded_db.repositories[1].url, "https://github.com/test/new-repo") + self.assertEqual( + loaded_db.repositories[1].url, "https://github.com/test/new-repo" + ) self.assertEqual(len(loaded_db.repositories[1].sorry_theorems_unproved), 1) - self.assertEqual(loaded_db.repositories[1].sorry_theorems_unproved[0].full_name, "new_test_theorem") + self.assertEqual( + loaded_db.repositories[1].sorry_theorems_unproved[0].full_name, + "new_test_theorem", + ) def test_prove_sorry_theorems_and_save(self): json_file = "temp_file.json" - + self.db.to_json(json_file) results = [ @@ -2546,7 +3070,7 @@ def test_prove_sorry_theorems_and_save(self): environment_time=2.0, total_time=3.0, num_total_nodes=10, - num_searched_nodes=5 + num_searched_nodes=5, ) ] result = results[0] if results else None @@ -2560,11 +3084,13 @@ def test_prove_sorry_theorems_and_save(self): tactic=tactic, annotated_tactic=(tactic, []), state_before="", - state_after="" + state_after="", ) ) self.repo.sorry_theorems_unproved[0].traced_tactics = traced_tactics - self.repo.change_sorry_to_proven(self.repo.sorry_theorems_unproved[0], PROOF_LOG_FILE_NAME) + self.repo.change_sorry_to_proven( + self.repo.sorry_theorems_unproved[0], PROOF_LOG_FILE_NAME + ) self.db.to_json(json_file) loaded_db = DynamicDatabase.from_json(json_file) @@ -2577,8 +3103,15 @@ def test_prove_sorry_theorems_and_save(self): self.assertEqual(proved_theorem.traced_tactics[0].tactic, "rw [add_comm]") self.assertEqual(proved_theorem.traced_tactics[1].tactic, "refl") - def _theorem_identifier(self, theorem: Theorem) -> Tuple[str, str, Tuple[int, int], Tuple[int, int]]: - return (theorem.full_name, str(theorem.file_path), tuple(theorem.start), tuple(theorem.end)) + def _theorem_identifier( + self, theorem: Theorem + ) -> Tuple[str, str, Tuple[int, int], Tuple[int, int]]: + return ( + theorem.full_name, + str(theorem.file_path), + tuple(theorem.start), + tuple(theorem.end), + ) def test_prove_sorry_theorems_with_duplicates(self): # Create two repositories with the same theorem but different commits @@ -2597,9 +3130,9 @@ def test_prove_sorry_theorems_with_duplicates(self): end=Pos(10, 1), url="https://github.com/test/repo", commit="commit1", - theorem_statement="theorem duplicate_theorem : 2 + 2 = 4 := sorry" + theorem_statement="theorem duplicate_theorem : 2 + 2 = 4 := sorry", ) - ] + ], ) repo2 = Repository( @@ -2617,9 +3150,9 @@ def test_prove_sorry_theorems_with_duplicates(self): end=Pos(10, 1), url="https://github.com/test/repo", commit="commit2", - theorem_statement="theorem duplicate_theorem : 2 + 2 = 4 := sorry" + theorem_statement="theorem duplicate_theorem : 2 + 2 = 4 := sorry", ) - ] + ], ) # Create a test database with both repositories @@ -2647,29 +3180,36 @@ def test_prove_sorry_theorems_with_duplicates(self): environment_time=2.0, total_time=3.0, num_total_nodes=10, - num_searched_nodes=5 + num_searched_nodes=5, ) ] # Simulate the prove_sorry_theorems function processed_theorems = set() - for repo in sorted(loaded_db.repositories, key=lambda r: r.metadata['date_processed'], reverse=True): + for repo in sorted( + loaded_db.repositories, + key=lambda r: r.metadata["date_processed"], + reverse=True, + ): for theorem in repo.sorry_theorems_unproved: theorem_id = self._theorem_identifier(theorem) if theorem_id in processed_theorems: continue - + processed_theorems.add(theorem_id) - + # Apply the proof to the theorem - result = results[0] # In a real scenario, this would be the result of calling the prover + result = results[ + 0 + ] # In a real 
scenario, this would be the result of calling the prover traced_tactics = [ AnnotatedTactic( tactic=tactic, annotated_tactic=(tactic, []), state_before="", - state_after="" - ) for tactic in result.proof + state_after="", + ) + for tactic in result.proof ] theorem.traced_tactics = traced_tactics repo.change_sorry_to_proven(theorem, PROOF_LOG_FILE_NAME) @@ -2695,13 +3235,13 @@ def test_prove_sorry_theorems_with_duplicates(self): self.assertEqual(len(proved_theorem.traced_tactics), 2) self.assertEqual(proved_theorem.traced_tactics[0].tactic, "rw [add_comm]") self.assertEqual(proved_theorem.traced_tactics[1].tactic, "refl") - + def test_repeated_to_json_during_proving(self): json_file = "repeated_save_test.json" - + # Initial save self.db.to_json(json_file) - + results = [ SearchResult( theorem=self.repo.sorry_theorems_unproved[0], @@ -2711,10 +3251,10 @@ def test_repeated_to_json_during_proving(self): environment_time=2.0, total_time=3.0, num_total_nodes=10, - num_searched_nodes=5 + num_searched_nodes=5, ) ] - + # Simulate proving and saving after each theorem for result in results: if isinstance(result, SearchResult) and result.status == Status.PROVED: @@ -2723,22 +3263,26 @@ def test_repeated_to_json_during_proving(self): tactic=tactic, annotated_tactic=(tactic, []), state_before="", - state_after="" - ) for tactic in result.proof + state_after="", + ) + for tactic in result.proof ] self.repo.sorry_theorems_unproved[0].traced_tactics = traced_tactics - self.repo.change_sorry_to_proven(self.repo.sorry_theorems_unproved[0], PROOF_LOG_FILE_NAME) + self.repo.change_sorry_to_proven( + self.repo.sorry_theorems_unproved[0], PROOF_LOG_FILE_NAME + ) self.db.update_repository(self.repo) self.db.to_json(json_file) # Save after each theorem is proved - + # Final save self.db.to_json(json_file) - + # Load and verify loaded_db = DynamicDatabase.from_json(json_file) self.assertEqual(len(loaded_db.repositories[0].sorry_theorems_proved), 1) self.assertEqual(len(loaded_db.repositories[0].sorry_theorems_unproved), 0) + class TestDynamicDatabaseEmpty(unittest.TestCase): def setUp(self): self.empty_json_path = "empty_database.json" @@ -2749,38 +3293,38 @@ def tearDown(self): os.remove(self.empty_json_path) def test_from_json_empty_file(self): - with open(self.empty_json_path, 'w') as f: + with open(self.empty_json_path, "w") as f: json.dump({"repositories": []}, f) - + db = DynamicDatabase.from_json(self.empty_json_path) self.assertEqual(len(db.repositories), 0) def test_to_json_empty_database(self): self.db.to_json(self.empty_json_path) - + self.assertTrue(os.path.exists(self.empty_json_path)) - with open(self.empty_json_path, 'r') as f: + with open(self.empty_json_path, "r") as f: content = json.load(f) self.assertEqual(content, {"repositories": []}) def test_from_json_invalid_empty_object(self): - with open(self.empty_json_path, 'w') as f: + with open(self.empty_json_path, "w") as f: json.dump({}, f) - + with self.assertRaises(ValueError): DynamicDatabase.from_json(self.empty_json_path) def test_from_json_nonexistent_file(self): if os.path.exists(self.empty_json_path): os.remove(self.empty_json_path) - + with self.assertRaises(FileNotFoundError): DynamicDatabase.from_json(self.empty_json_path) def test_add_repository_to_empty_database(self): - with open(self.empty_json_path, 'w') as f: + with open(self.empty_json_path, "w") as f: json.dump({"repositories": []}, f) - + db = DynamicDatabase.from_json(self.empty_json_path) repo_data = { @@ -2789,19 +3333,21 @@ def test_add_repository_to_empty_database(self): 
"commit": "abc123", "lean_version": "3.50.3", "lean_dojo_version": "1.8.4", - "metadata": {"date_processed": datetime.datetime.now().isoformat()} + "metadata": {"date_processed": datetime.datetime.now().isoformat()}, } repo = Repository.from_dict(repo_data) db.add_repository(repo) - + db.to_json(self.empty_json_path) - + loaded_db = DynamicDatabase.from_json(self.empty_json_path) self.assertEqual(len(loaded_db.repositories), 1) self.assertEqual(loaded_db.repositories[0].url, "https://github.com/test/repo") + def main(): unittest.main() + if __name__ == "__main__": main() From a455001658465226bc4168b40da67e5a82fbf498 Mon Sep 17 00:00:00 2001 From: motiwari Date: Mon, 25 Aug 2025 09:26:21 -0700 Subject: [PATCH 03/29] Fixing imports --- CLAUDE.md | 91 ++++++++++++++++++++++++++ common.py | 26 ++++---- custom_traced_data.py | 34 +++++----- custom_utils.py | 20 +++--- dynamic_database.py | 14 ++-- generate_benchmark_lean4.py | 19 +++--- generator/datamodule.py | 20 ++---- generator/main.py | 1 + generator/model.py | 26 +++----- ld_path.txt | 1 + leanagent.py | 67 ++++++++----------- pl_path.txt | 1 + prover/evaluate.py | 16 ++--- prover/proof_search.py | 33 ++++------ prover/search_tree.py | 16 ++--- retrieval/bm25/main.py | 23 ++++--- retrieval/bm25/train_tokenizer.py | 5 +- retrieval/datamodule.py | 24 +++---- retrieval/evaluate.py | 9 +-- retrieval/evaluate_multiple.py | 9 +-- retrieval/fisher_computation_module.py | 5 +- retrieval/index.py | 5 +- retrieval/main.py | 11 ++-- retrieval/model.py | 29 ++++---- tests/test_common.py | 1 + unittest_dynamic_database.py | 38 +++++------ 26 files changed, 298 insertions(+), 246 deletions(-) create mode 100644 CLAUDE.md create mode 100644 ld_path.txt create mode 100644 pl_path.txt diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..5ef3e45 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,91 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Common Development Commands + +### Environment Setup +```bash +# Create and activate conda environment +conda create -n "LeanAgent" python=3.10 +conda activate LeanAgent +pip install -r requirements.txt +``` + +### Running Tests +```bash +# Run the test suite +conda activate LeanAgent +python -m pytest tests/ +``` + +### Running LeanAgent +```bash +# Main training/proving pipeline +bash run_leanagent.sh +``` + +### Fisher Information Matrix Computation (for EWC ablations) +```bash +# Compute Fisher Information Matrix +bash run_compute_fisher.sh +``` + +### Environment Configuration Required +Before running, update the following in shell scripts: +- `RAID_DIR`: Path to storage directory +- `PATH_TO_CONDA_ENV`: Path to conda installation +- `GITHUB_ACCESS_TOKEN`: GitHub personal access token + +## Architecture Overview + +LeanAgent is a lifelong learning framework for formal theorem proving that continuously learns from expanding mathematical repositories without forgetting previous knowledge. + +### Core Components + +1. **Dynamic Database (`dynamic_database.py`)**: Central JSON-based storage system that manages mathematical knowledge across repositories. Tracks theorems (proven, sorry-but-now-proven, unproven), premise files, and repository metadata with deduplication capabilities. + +2. 
**Repository Processing Pipeline**: + - Discovers and clones Lean repositories from GitHub + - Uses LeanDojo to trace/extract theorems, proofs, and premises + - Checks Lean version compatibility (4.3.0-rc2 to 4.8.0-rc1) + - Builds dependency graphs and exports structured datasets + +3. **Progressive Retriever Training (`retrieval/`)**: + - Trains ByT5-based retriever incrementally on new repositories + - Uses PyTorch Lightning with DDP for distributed training + - Saves checkpoints based on R@10 validation performance + - Measures both plasticity (new learning) and stability (retention) + +4. **Theorem Proving (`prover/`)**: + - Best-first tree search for sorry theorem proving + - Uses trained retriever to find relevant premises + - Generates tactic candidates with beam search + - 10-minute timeout per theorem, processes in batches of 12 + +5. **Curriculum Learning**: Exponential complexity scoring (e^S where S = proof steps) with Easy/Medium/Hard categorization based on 33rd/67th percentiles. + +### Key Configuration Points + +The main configuration happens in `leanagent.py` where you must set: +- `repo_dir`: Repository path +- `DATA_DIR`: Data storage directory +- `CHECKPOINT_DIR`: Model checkpoint directory +- `EVAL_RESULTS_FILE_PATH`: Evaluation results path +- `DB_FILE_NAME`: Database filename +- `PROOF_LOG_FILE_NAME`: Proof logging filename +- `ENCOUNTERED_THEOREMS_FILE`: Theorem tracking file +- `FISHER_DIR`: Fisher Information Matrix directory (optional) + +### Distributed Computing +- Uses Ray for distributed repository processing +- PyTorch Lightning DDP for multi-GPU training (typically 4 A100s) +- Custom timeout settings for lengthy operations +- Resource cleanup between phases to prevent memory leaks + +### Repository Integration +When theorems are successfully proven, LeanAgent can: +- Create temporary branches +- Replace `sorry` with generated proofs +- Submit pull requests to original repositories +- Use standardized commit messages and PR templates \ No newline at end of file diff --git a/common.py b/common.py index ac50437..b9eafef 100644 --- a/common.py +++ b/common.py @@ -1,24 +1,24 @@ +import json import os +import random import re import sys -import json -import random -import torch import tempfile +from dataclasses import dataclass, field +from typing import Any, Dict, Generator, List, Optional, Tuple + import networkx as nx -from loguru import logger -from lean_dojo import Pos import pytorch_lightning as pl -from dataclasses import dataclass, field -from leanagent_utils import remove_marks, MARK_START_SYMBOL, MARK_END_SYMBOL -from pytorch_lightning.utilities.deepspeed import ( - convert_zero_checkpoint_to_fp32_state_dict, -) -from transformers import get_cosine_schedule_with_warmup -from deepspeed.ops.adam import FusedAdam, DeepSpeedCPUAdam -from typing import Optional, List, Dict, Any, Tuple, Generator +import torch +from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam +from lean_dojo import Pos +from loguru import logger from pytorch_lightning.strategies.deepspeed import DeepSpeedStrategy +from pytorch_lightning.utilities.deepspeed import \ + convert_zero_checkpoint_to_fp32_state_dict +from transformers import get_cosine_schedule_with_warmup +from leanagent_utils import MARK_END_SYMBOL, MARK_START_SYMBOL, remove_marks Example = Dict[str, Any] Batch = Dict[str, Any] diff --git a/custom_traced_data.py b/custom_traced_data.py index ae569e4..ee61c83 100644 --- a/custom_traced_data.py +++ b/custom_traced_data.py @@ -1,30 +1,26 @@ """This module defines traced 
repos/files/theorems.""" -import re -import os +import itertools import json +import os import random -import itertools +import re import webbrowser -import networkx as nx -from tqdm import tqdm -from lxml import etree +from dataclasses import dataclass, field from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +import networkx as nx from loguru import logger -from dataclasses import dataclass, field -from typing import List, Optional, Dict, Any, Tuple, Union - -from ..utils import ( - is_git_repo, - compute_md5, - to_lean_path, - to_dep_path, - to_json_path, - to_xml_path, -) +from lxml import etree +from tqdm import tqdm + +from ..constants import (LEAN4_PACKAGES_DIR, LOAD_USED_PACKAGES_ONLY, + NUM_WORKERS) +from ..utils import (compute_md5, is_git_repo, to_dep_path, to_json_path, + to_lean_path, to_xml_path) from .ast import * -from .lean import LeanFile, LeanGitRepo, Theorem, Pos -from ..constants import NUM_WORKERS, LOAD_USED_PACKAGES_ONLY, LEAN4_PACKAGES_DIR +from .lean import LeanFile, LeanGitRepo, Pos, Theorem @dataclass(frozen=True) diff --git a/custom_utils.py b/custom_utils.py index dd587a4..181e9ac 100644 --- a/custom_utils.py +++ b/custom_utils.py @@ -1,20 +1,22 @@ """Utility functions used internally by LeanDojo.""" -import re +import hashlib import os +import re +import subprocess +import tempfile import time -import urllib import typing -import hashlib -import tempfile -import subprocess +import urllib +from contextlib import contextmanager +from functools import cache from pathlib import Path +from typing import Generator, List, Optional, Tuple, Union + from loguru import logger -from functools import cache -from contextlib import contextmanager -from typing import Tuple, Union, List, Generator, Optional -from .constants import NUM_WORKERS, TMP_DIR, LEAN4_PACKAGES_DIR, LEAN4_BUILD_DIR +from .constants import (LEAN4_BUILD_DIR, LEAN4_PACKAGES_DIR, NUM_WORKERS, + TMP_DIR) @contextmanager diff --git a/dynamic_database.py b/dynamic_database.py index d819000..d3fccc5 100644 --- a/dynamic_database.py +++ b/dynamic_database.py @@ -1,16 +1,18 @@ from __future__ import annotations + import datetime import json import os -from dataclasses import dataclass, field, asdict -from typing import List, Dict, Optional, Union, Tuple, Set -from pathlib import Path -from lean_dojo.data_extraction.lean import Pos -from tqdm import tqdm import random +import shutil from collections import defaultdict +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Dict, List, Optional, Set, Tuple, Union + +from lean_dojo.data_extraction.lean import Pos from loguru import logger -import shutil +from tqdm import tqdm def parse_pos(pos_str): diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py index 0c72377..d343931 100644 --- a/generate_benchmark_lean4.py +++ b/generate_benchmark_lean4.py @@ -1,20 +1,21 @@ import json -import shutil import random -import networkx as nx +import re +import shutil +import subprocess +import sys +import time +from collections import defaultdict from copy import copy -from pathlib import Path -from loguru import logger from datetime import datetime -from collections import defaultdict +from pathlib import Path from typing import Dict, List, Union -import time + import lean_dojo +import networkx as nx from lean_dojo import * from lean_dojo.constants import LEAN4_PACKAGES_DIR -import re -import subprocess -import sys +from loguru import logger random.seed(3407) # 
https://arxiv.org/abs/2109.08203 diff --git a/generator/datamodule.py b/generator/datamodule.py index 882e64a..0dde8da 100644 --- a/generator/datamodule.py +++ b/generator/datamodule.py @@ -1,24 +1,18 @@ """Data module for the tactic generator.""" -import os import json +import os import pickle -from tqdm import tqdm -from loguru import logger +from typing import Any, Dict, List, Optional + import pytorch_lightning as pl -from typing import Optional, List, Dict, Any +from loguru import logger from torch.utils.data import DataLoader, Dataset +from tqdm import tqdm from transformers import AutoTokenizer, ByT5Tokenizer -from common import ( - Batch, - Corpus, - Example, - format_state, - remove_marks, - format_tactic, - format_augmented_state, -) +from common import (Batch, Corpus, Example, format_augmented_state, + format_state, format_tactic, remove_marks) class GeneratorDataset(Dataset): diff --git a/generator/main.py b/generator/main.py index 9ae06c8..3d42239 100644 --- a/generator/main.py +++ b/generator/main.py @@ -1,6 +1,7 @@ """Script for training the tactic generator.""" import os + from loguru import logger from pytorch_lightning.cli import LightningCLI diff --git a/generator/model.py b/generator/model.py index f07e95c..66d3617 100644 --- a/generator/model.py +++ b/generator/model.py @@ -1,28 +1,22 @@ """Lightning module for the tactic generator.""" import os -import torch +import pickle import shutil +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Tuple + import openai -import pickle +import pytorch_lightning as pl +import torch from lean_dojo import Pos from loguru import logger -import pytorch_lightning as pl from torchmetrics import Metric -from abc import ABC, abstractmethod -from typing import List, Dict, Any, Optional, Tuple -from transformers import T5ForConditionalGeneration, AutoTokenizer - -from common import ( - zip_strict, - remove_marks, - IndexedCorpus, - get_optimizers, - load_checkpoint, - format_augmented_state, -) -from retrieval.model import PremiseRetriever +from transformers import AutoTokenizer, T5ForConditionalGeneration +from common import (IndexedCorpus, format_augmented_state, get_optimizers, + load_checkpoint, remove_marks, zip_strict) +from retrieval.model import PremiseRetriever torch.set_float32_matmul_precision("medium") diff --git a/ld_path.txt b/ld_path.txt new file mode 100644 index 0000000..c00d90a --- /dev/null +++ b/ld_path.txt @@ -0,0 +1 @@ +/Users/motiwari/miniforge3/envs/LeanAgent/lib/python3.10/site-packages/lean_dojo/__init__.py diff --git a/leanagent.py b/leanagent.py index ea7d062..59d2724 100644 --- a/leanagent.py +++ b/leanagent.py @@ -1,54 +1,45 @@ # import all the necessary libraries +import json import math -import ray -from collections import defaultdict import os -import requests -import subprocess +import pickle +import random import re import shutil -from lean_dojo import * -import os -import json -import pickle +import subprocess +import sys +import time +import traceback +from collections import defaultdict +from copy import copy +from datetime import datetime, timedelta +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Union + +import lean_dojo import numpy as np -from tqdm import tqdm -from loguru import logger +import pytorch_lightning as pl +import ray +import requests +import torch +from lean_dojo import * +from lean_dojo import LeanGitRepo, Pos from lean_dojo import Theorem -from typing import List, Tuple, Optional -from lean_dojo import LeanGitRepo, Pos, 
is_available_in_cache from lean_dojo import Theorem as LeanDojoTheorem -import json -import shutil -import random -from copy import copy -from pathlib import Path +from lean_dojo import is_available_in_cache from loguru import logger -from datetime import datetime, timedelta -from collections import defaultdict -from typing import Dict, List, Union -import generate_benchmark_lean4 -import traceback -import sys +from pytorch_lightning import seed_everything +from pytorch_lightning.callbacks import (Callback, EarlyStopping, + LearningRateMonitor, ModelCheckpoint) +from pytorch_lightning.strategies import DDPStrategy from tqdm import tqdm + +import generate_benchmark_lean4 from dynamic_database import * -import time -from pytorch_lightning.strategies import DDPStrategy -from prover.proof_search import Status, DistributedProver, SearchResult -import re -import lean_dojo -import pytorch_lightning as pl -from retrieval.model import PremiseRetriever +from prover.proof_search import DistributedProver, SearchResult, Status from retrieval.datamodule import RetrievalDataModule from retrieval.main import run_cli -import torch -from pytorch_lightning.callbacks import ( - ModelCheckpoint, - EarlyStopping, - LearningRateMonitor, - Callback, -) -from pytorch_lightning import seed_everything +from retrieval.model import PremiseRetriever # Set the seed for reproducibility random.seed(3407) # https://arxiv.org/abs/2109.08203 diff --git a/pl_path.txt b/pl_path.txt new file mode 100644 index 0000000..de1baf0 --- /dev/null +++ b/pl_path.txt @@ -0,0 +1 @@ +/Users/motiwari/miniforge3/envs/LeanAgent/lib/python3.10/site-packages/pytorch_lightning/__init__.py diff --git a/prover/evaluate.py b/prover/evaluate.py index 44eec39..4ad634f 100644 --- a/prover/evaluate.py +++ b/prover/evaluate.py @@ -1,18 +1,18 @@ """Script for evaluating the prover on theorems extracted by LeanDojo.""" -import os -import uuid +import argparse +import hashlib import json +import os import pickle -import hashlib -import argparse +import uuid +from typing import List, Optional, Tuple + +from lean_dojo import LeanGitRepo, Pos, Theorem, is_available_in_cache from loguru import logger -from lean_dojo import Theorem -from typing import List, Tuple, Optional -from lean_dojo import LeanGitRepo, Theorem, Pos, is_available_in_cache from common import set_logger -from prover.proof_search import Status, DistributedProver +from prover.proof_search import DistributedProver, Status def _get_theorems( diff --git a/prover/proof_search.py b/prover/proof_search.py index e927630..dd08fe6 100644 --- a/prover/proof_search.py +++ b/prover/proof_search.py @@ -1,36 +1,27 @@ """Proof search using best-first search.""" +import asyncio +import heapq import os import sys -import ray import time import uuid -import heapq -import asyncio -import torch -from lean_dojo import ( - Pos, - Dojo, - Theorem, - LeanGitRepo, - TacticState, - LeanError, - TimeoutError, - ProofFinished, - ProofGivenUp, - DojoInitError, - DojoCrashError, - DojoHardTimeoutError, -) -from loguru import logger from dataclasses import dataclass from typing import List, Optional, Tuple + +import ray +import torch +from lean_dojo import (Dojo, DojoCrashError, DojoHardTimeoutError, + DojoInitError, LeanError, LeanGitRepo, Pos, + ProofFinished, ProofGivenUp, TacticState, Theorem, + TimeoutError) +from loguru import logger from ray.util.actor_pool import ActorPool -from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams, RequestOutput +from vllm import AsyncEngineArgs, AsyncLLMEngine, 
RequestOutput, SamplingParams from common import zip_strict +from generator.model import FixedTacticGenerator, RetrievalAugmentedGenerator from prover.search_tree import * -from generator.model import RetrievalAugmentedGenerator, FixedTacticGenerator tolerance = 1 # second RAID_DIR = os.environ.get("RAID_DIR") diff --git a/prover/search_tree.py b/prover/search_tree.py index 222d030..b1df0a2 100644 --- a/prover/search_tree.py +++ b/prover/search_tree.py @@ -1,18 +1,14 @@ """Definitions of the search tree used by the prover.""" import math -from enum import Enum -from lean_dojo import ( - TacticState, - LeanError, - TimeoutError, - ProofGivenUp, - ProofFinished, -) from abc import ABC, abstractmethod -from functools import total_ordering from dataclasses import dataclass, field -from typing import Optional, List, Tuple, Iterable, Union +from enum import Enum +from functools import total_ordering +from typing import Iterable, List, Optional, Tuple, Union + +from lean_dojo import (LeanError, ProofFinished, ProofGivenUp, TacticState, + TimeoutError) class Status(Enum): diff --git a/retrieval/bm25/main.py b/retrieval/bm25/main.py index 0db0900..1d41415 100644 --- a/retrieval/bm25/main.py +++ b/retrieval/bm25/main.py @@ -1,24 +1,23 @@ """Script for training the BM25 premise retriever.""" -import os -import ray -import json -import pickle import argparse import itertools -import numpy as np -from tqdm import tqdm +import json import multiprocessing -from loguru import logger -from common import Corpus +import os +import pickle +from typing import Any, Dict, List + +import numpy as np +import ray from lean_dojo import Pos +from loguru import logger from rank_bm25 import BM25Okapi -from tokenizers import Tokenizer -from typing import List, Dict, Any from ray.util.actor_pool import ActorPool +from tokenizers import Tokenizer +from tqdm import tqdm - -from common import Context, format_state, get_all_pos_premises +from common import Context, Corpus, format_state, get_all_pos_premises def _process_theorem( diff --git a/retrieval/bm25/train_tokenizer.py b/retrieval/bm25/train_tokenizer.py index 28f1b3f..fc0514a 100644 --- a/retrieval/bm25/train_tokenizer.py +++ b/retrieval/bm25/train_tokenizer.py @@ -1,10 +1,11 @@ -import os import argparse +import os + from loguru import logger from tokenizers import Tokenizer from tokenizers.models import BPE -from tokenizers.trainers import BpeTrainer from tokenizers.pre_tokenizers import Whitespace +from tokenizers.trainers import BpeTrainer from common import Corpus from retrieval.datamodule import RetrievalDataset diff --git a/retrieval/datamodule.py b/retrieval/datamodule.py index 400ca88..39ee444 100644 --- a/retrieval/datamodule.py +++ b/retrieval/datamodule.py @@ -1,23 +1,23 @@ """Datamodule for the premise retrieval.""" -import os +import itertools import json -import torch +import os +import pickle import random -import itertools -from tqdm import tqdm -from loguru import logger from copy import deepcopy -from lean_dojo import Pos +from typing import List, Optional + import pytorch_lightning as pl -from lean_dojo import LeanGitRepo -from typing import Optional, List +import torch +from lean_dojo import LeanGitRepo, Pos +from loguru import logger +from torch.utils.data import DataLoader, Dataset +from tqdm import tqdm from transformers import AutoTokenizer -from torch.utils.data import Dataset, DataLoader -import pickle - -from common import Context, Corpus, Batch, Example, format_state, get_all_pos_premises +from common import (Batch, Context, Corpus, 
Example, format_state, + get_all_pos_premises) class RetrievalDataset(Dataset): diff --git a/retrieval/evaluate.py b/retrieval/evaluate.py index c18ddb6..5f31b95 100644 --- a/retrieval/evaluate.py +++ b/retrieval/evaluate.py @@ -1,13 +1,14 @@ """Script for evaluating the premise retriever.""" -import os +import argparse import json +import os import pickle -import argparse -import numpy as np -from tqdm import tqdm from typing import Tuple + +import numpy as np from loguru import logger +from tqdm import tqdm def _eval(data, preds_map) -> Tuple[float, float, float]: diff --git a/retrieval/evaluate_multiple.py b/retrieval/evaluate_multiple.py index 6c9a4d0..84ede56 100644 --- a/retrieval/evaluate_multiple.py +++ b/retrieval/evaluate_multiple.py @@ -1,11 +1,12 @@ -import os +import argparse import json +import os import pickle -import argparse -import numpy as np -from tqdm import tqdm from typing import List, Tuple + +import numpy as np from loguru import logger +from tqdm import tqdm def _eval(data, preds_map) -> Tuple[float, float, float]: diff --git a/retrieval/fisher_computation_module.py b/retrieval/fisher_computation_module.py index a333575..d257b3c 100644 --- a/retrieval/fisher_computation_module.py +++ b/retrieval/fisher_computation_module.py @@ -1,8 +1,9 @@ +import pickle + import pytorch_lightning as pl -from loguru import logger import torch import torch.distributed as dist -import pickle +from loguru import logger class FisherComputationModule(pl.LightningModule): diff --git a/retrieval/index.py b/retrieval/index.py index 1fb2f05..1a6f89b 100644 --- a/retrieval/index.py +++ b/retrieval/index.py @@ -1,8 +1,9 @@ """Script for indexing the corpus using the retriever.""" -import torch -import pickle import argparse +import pickle + +import torch from loguru import logger from common import IndexedCorpus diff --git a/retrieval/main.py b/retrieval/main.py index 331846d..ed71157 100644 --- a/retrieval/main.py +++ b/retrieval/main.py @@ -1,17 +1,18 @@ """Script for training the premise retriever.""" +import json import os +import pickle +import sys from typing import Tuple + import numpy as np -import pickle -import json -from tqdm import tqdm from loguru import logger from pytorch_lightning.cli import LightningCLI, SaveConfigCallback -import sys +from tqdm import tqdm -from retrieval.model import PremiseRetriever from retrieval.datamodule import RetrievalDataModule +from retrieval.model import PremiseRetriever class CLI(LightningCLI): diff --git a/retrieval/model.py b/retrieval/model.py index 6832674..a03946f 100644 --- a/retrieval/model.py +++ b/retrieval/model.py @@ -1,31 +1,24 @@ """Ligihtning module for the premise retriever.""" -import os import json import math -import torch +import os import pickle +from datetime import datetime, timedelta +from typing import Any, Dict, List, Tuple, Union + import numpy as np -from tqdm import tqdm -from lean_dojo import Pos -from loguru import logger import pytorch_lightning as pl +import torch import torch.nn.functional as F -from typing import List, Dict, Any, Tuple, Union -from transformers import T5EncoderModel, AutoTokenizer +from lean_dojo import Pos +from loguru import logger from torch.distributed import barrier -from datetime import datetime, timedelta - -from common import ( - Premise, - Context, - Corpus, - get_optimizers, - load_checkpoint, - zip_strict, - cpu_checkpointing_enabled, -) +from tqdm import tqdm +from transformers import AutoTokenizer, T5EncoderModel +from common import (Context, Corpus, Premise, 
cpu_checkpointing_enabled, + get_optimizers, load_checkpoint, zip_strict) torch.set_float32_matmul_precision("medium") diff --git a/tests/test_common.py b/tests/test_common.py index 6855286..735274d 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -1,4 +1,5 @@ import pytest + from leanagent_utils import remove_marks diff --git a/unittest_dynamic_database.py b/unittest_dynamic_database.py index 25c9b03..291e2ec 100644 --- a/unittest_dynamic_database.py +++ b/unittest_dynamic_database.py @@ -1,32 +1,24 @@ # import all the necessary modules +import datetime +import json import math -from typing import Union +import os +import random +import shutil import unittest -import datetime from pathlib import Path -from dynamic_database import ( - DynamicDatabase, - Repository, - Theorem, - AnnotatedTactic, - Annotation, - PremiseFile, - Premise, -) -from lean_dojo.data_extraction.lean import Pos, LeanGitRepo -import generate_benchmark_lean4 +from typing import Tuple, Union +from unittest.mock import MagicMock, Mock, patch + import lean_dojo -import json -import shutil -import random +from lean_dojo.data_extraction.lean import LeanGitRepo, Pos from loguru import logger -from unittest.mock import Mock, patch -from dynamic_database import DynamicDatabase, Repository, Theorem, AnnotatedTactic -from prover.proof_search import Status, SearchResult -from dynamic_database import parse_pos -from typing import Tuple -import os -from unittest.mock import patch, MagicMock + +import generate_benchmark_lean4 +from dynamic_database import (AnnotatedTactic, Annotation, DynamicDatabase, + Premise, PremiseFile, Repository, Theorem, + parse_pos) +from prover.proof_search import SearchResult, Status RAID_DIR = os.environ.get("RAID_DIR") DATA_DIR = "datasets_new_unittest" From 93118f45bcea37ff3c1b31508dc102e6a5b7ac06 Mon Sep 17 00:00:00 2001 From: motiwari Date: Mon, 25 Aug 2025 13:29:27 -0700 Subject: [PATCH 04/29] Refactor --- .gitignore | 1 + constants.py | 237 +++++++++ dynamic_database.py | 8 +- generate_benchmark_lean4.py | 2 +- git_utils.py | 547 ++++++++++++++++++++ leanagent.py | 972 ++++++------------------------------ run_leanagent.sh | 10 +- 7 files changed, 937 insertions(+), 840 deletions(-) create mode 100644 constants.py create mode 100644 git_utils.py mode change 100644 => 100755 run_leanagent.sh diff --git a/.gitignore b/.gitignore index 757bfd1..dce68d7 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ retrieval/bm25 .idea/ .DS_Store RAID/ +repos # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/constants.py b/constants.py new file mode 100644 index 0000000..01f0ad4 --- /dev/null +++ b/constants.py @@ -0,0 +1,237 @@ +PR_TITLE = "[LeanAgent] Proofs" + +PR_BODY = """ +[LeanAgent](https://arxiv.org/abs/2410.06209) discovers a proof for a theorem with the `sorry` keyword. 
+ +--- + +~LeanAgent - From the [LeanDojo](https://leandojo.org/) family +""" + +TMP_BRANCH = "_LeanAgent" + +COMMIT_MESSAGE = "[LeanAgent] Proofs" + +# List of known repositories to process or skip +# Feel free to remove any repos from this list if you would like to test on them + +known_repositories = [ + "leanprover-community/mathlib4", # ReProver is trained on this + "leanprover-community/batteries", # functional programming instead of math + "leanprover-community/aesop", + "leanprover/lean4", + "leanprover-community/mathlib", # Mathlib3 version + "leanprover-community/mathlib3", + "leanprover/std4", # moved to batteries + "leanprover-community/duper", # functional programming instead of math + "leanprover/lake", + "openai/lean-gym", + "leanprover-community/lean4-metaprogramming-book", + "kmill/lean4-raytracer", # no theorems + "argumentcomputer/yatima", # trace problems + "ImperialCollegeLondon/formalising-mathematics-2024", # trace problems + "leanprover-community/ProofWidgets4", # trace problems + "leanprover/verso", # trace problems + "leanprover-community/NNG4", # trace problems + "ufmg-smite/lean-smt", # fails to trace due to windows-style line endings + "teorth/symmetric_project", # no compatible commit + "cmu-l3/llmlean", # irrelevant + only 4 theorems + "PatrickMassot/GlimpseOfLean", # strange trace problems with _parse_deps + "avigad/lamr", # trace problems + "leanprover-community/quote4", # no theorems + "leanprover-community/iris-lean", # trace problems + "aripiprazole/rinha", # incompatible commit + "leanprover/lean4-cli", # no theorems + "leanprover/LeanInk", # no theorems + "leanprover-community/lean-auto", + "leanprover-community/repl", # no theorems + "leanprover/doc-gen4", # no theorems + "leanprover/SampCert", # trace problems + "nomeata/loogle", + "risc0/risc0-lean4", + "PatrickMassot/verbose-lean4", # no theorems + "tydeu/lean4-alloy", # no theorems + "leanprover/leansat", # deprecated + "BoltonBailey/formal-snarks-project", # two theorems + "dwrensha/lean4-maze", # two theorems + "leanprover-community/mathport", # irrelevant + "argumentcomputer/LSpec", # one theorem + "reaslab/jixia", # no theorems + "riccardobrasca/flt3", # no theorems + "dwrensha/animate-lean-proofs", # irrelevant + "lean-ja/lean-by-example", # irrelevant + "NethermindEth/Clear", # no theorems + "fgdorais/lean4-parser", # irrelevant + "semorrison/lean-training-data", # irrelevant + "verse-lab/lean-ssr", # irrelevant + "GaloisInc/lean-llvm", # irrelevant + "argumentcomputer/Wasm.lean", # irrelevant + "NethermindEth/EVMYulLean", # irrelevant + "rwbarton/advent-of-lean-4", # irrelevant + "leanprover-community/tutorials4", # irrelevant + "haruhisa-enomoto/mathlib4-all-tactics", # irrelevant + "leanprover/LNSym", + "leanprover-community/flt-regular", + "opencompl/lean-mlir-old", + "rami3l/plfl", + "HEPLean/HepLean", + "forked-from-1kasper/ground_zero", + "verified-optimization/CvxLean", + "leanprover-community/sphere-eversion", + "optsuite/optlib", + "YaelDillies/LeanCamCombi", + "JamesGallicchio/LeanColls", + "T-Brick/c0deine", + "jjdishere/EG", + "alexkeizer/QpfTypes", + "fpvandoorn/LeanCourse23", + "marcusrossel/lean-egg", + "reilabs/proven-zk", + "algebraic-dev/soda", + "leanprover-community/llm", + "dignissimus/Untangle", + "argumentcomputer/Megaparsec.lean", + "emilyriehl/infinity-cosmos", + "BartoszPiotrowski/lean-premise-selection", + "djvelleman/HTPILeanPackage", + "girving/ray", + "Anderssorby/SDL.lean", + "pandaman64/lean-regex", + "brown-cs22/CS22-Lean-2023", + "hhu-adam/GameSkeleton", + 
"FR-vdash-bot/Algorithm", + "PeterKementzey/graph-library-for-lean4", + "arthurpaulino/LeanMySQL", + "arthurpaulino/NumLean", + "FormalSAT/trestle", + "nomeata/lean-wf-induct", + "leanprover/lean4checker", + "IPDSnelting/tba-2022", + "digama0/mm-lean4", + "KislyjKisel/Raylib.lean", + "algebraic-dev/melp", + "hhu-adam/Robo", # same as other tutorials but has lots of sorries + "hargoniX/socket.lean", + "kovach/etch", + "damek/gd-lean", + "0art0/lean-slides", + "forked-from-1kasper/lean4-categories", + "katydid/proofs", + "alexjbest/leaff", + "sinhp/Poly", + "lftcm2023/lftcm2023", # same as other tutorials but has lots of sorries + "lean-ja/lean99", + "leanprover/SHerLOC", + "Seasawher/mdgen", + "opencompl/egg-tactic-code", + "david-christiansen/ssft24", + "T-Brick/lean2wasm", + "hargoniX/cpdt-lean", + "jsm28/AperiodicMonotilesLean", + "draperlaboratory/ELFSage", + "rookie-joe/automatic-lean4-compilation", + "madvorak/fecssk", + "david-christiansen/bob24", + "awodey/joyal", + "BrownCS1951x/fpv2023", # same as other tutorials but has lots of sorries + "paulch42/lean-spec", + "siddhartha-gadgil/MetaExamples", + "dannypsnl/violet", + "arthurpaulino/LeanREPL", + "Kha/do-supplement", + "joehendrix/lean-sat-checker", + "ammkrn/timelib", + "kmill/LeanTeX", + "leanprover/lean4export", + "leanprover-community/mathlib3port", + "brown-cs22/CS22-Lean-2024", # same as other tutorials but has lots of sorries + "T-Brick/lean-wasm", + "crabbo-rave/Soup", + "argumentcomputer/RustFFI.lean", + "suhr/tmath", + "leanprover/leanbv", + "arthurpaulino/FxyLang", + "SchrodingerZhu/LeanGccBackend", + "lecopivo/lean4-karray", + "ImperialCollegeLondon/M1F-explained", + "proost-assistant/ProostLean", + "DavePearce/LeanEVM", + "algebraic-dev/ash", + "FormalizedFormalLogic/Arithmetization", + "cmu-l3/ntp-toolkit", + "dwrensha/tryAtEachStep", + "yangky11/lean4-example", + "T-Brick/DateTime", + "model-checking/rust-lean-models", + "MichaelStollBayreuth/EulerProducts", + "hargoniX/Flame", + "argumentcomputer/Http.lean", + "madvorak/vcsp", + "teorth/newton", + "apnelson1/Matroid", + "smorel394/TS1", + "ianjauslin-rutgers/pythagoras4", + "mortarsanjaya/IMOSLLean4", + "dupuisf/BibtexQuery", + "nomeata/lean-calcify", + "argumentcomputer/FFaCiL.lean", + "javra/iit", + "arthurpaulino/viper", + "lindy-labs/aegis", + "PatrickMassot/NNG4", + "argumentcomputer/YatimaStdLib.lean", + "fgdorais/lean4-unicode-basic", + "mhuisi/Uniq", + "Kha/macro-supplement", + "chenjulang/rubikcubegroup", + "arthurpaulino/LeanMusic", + "argumentcomputer/Ipld.lean", + "Odomontois/advent2022-lean", + "kbuzzard/IISc-experiments", # same as other tutorials but has lots of sorries + "ykonstant1/InfinitePrimes", + "alexkassil/natural_number_game_lean4", + "seewoo5/lean-poly-abc", + "rah4927/lean-dojo-mew", + "siddhartha-gadgil/proofs-and-programs-2023", + "PatrickMassot/lean4-game-server", + "knowsys/Formale-Systeme-in-LEAN", # same as other tutorials but has lots of sorries + "katydid/symbolic-automatic-derivatives", + "girving/interval", + "ImperialCollegeLondon/group-theory-experiments", + "knowsys/CertifyingDatalog", + "bergmannjg/leanCurl", + "vasnesterov/HadwigerNelson", + "FWuermse/lean-postgres", + "leanprover-community/import-graph", + "Human-Oriented-ATP/lean-tactics", # more about tactics than premises + "paulcadman/lean4-leetcode", + "argumentcomputer/Lurk.lean", + "AlexDuchnowski/rubiks-cube", + "SchrodingerZhu/lean-gccjit", + "JamesGallicchio/http", + "jtristan/UnicodeSkipListTableExample", + "adomani/MA4N1_2023", # same as other tutorials but has 
lots of sorries + "remimimimimi/leansec", + "hhu-adam/lean-i18n", + "RemyDegenne/testing-lower-bounds", + "mariainesdff/LocalClassFieldTheory", + "AviCraimer/relational-calculus-library-lean4", + "JLimperg/regensburg-itp-school-2023", + "jaalonso/Calculemus2", + "mseri/BET", + "xubaiw/Reservoir.lean", + "hargoniX/nest-core", + "siddhartha-gadgil/Polylean", + "MichaelStollBayreuth/Weights", + "sanchace/FRACTRAN", + "argumentcomputer/Poseidon.lean", + "madvorak/chomsky", + "T-Brick/ControlFlow", + "pa-ba/guarded-lean", +] + +known_dead_repos = [ + "uwdb/Cosette", + "notepad-plus-plus/userDefinedLanguages", + "teorth/analysis", +] \ No newline at end of file diff --git a/dynamic_database.py b/dynamic_database.py index d3fccc5..4f1b297 100644 --- a/dynamic_database.py +++ b/dynamic_database.py @@ -1,14 +1,14 @@ from __future__ import annotations - -import datetime +import time +from datetime import datetime import json import os import random import shutil from collections import defaultdict -from dataclasses import asdict, dataclass, field +from dataclasses import dataclass, field from pathlib import Path -from typing import Dict, List, Optional, Set, Tuple, Union +from typing import Dict, List, Optional, Set, Tuple from lean_dojo.data_extraction.lean import Pos from loguru import logger diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py index d343931..67c182c 100644 --- a/generate_benchmark_lean4.py +++ b/generate_benchmark_lean4.py @@ -10,7 +10,7 @@ from datetime import datetime from pathlib import Path from typing import Dict, List, Union - +import os import lean_dojo import networkx as nx from lean_dojo import * diff --git a/git_utils.py b/git_utils.py new file mode 100644 index 0000000..6d56fb4 --- /dev/null +++ b/git_utils.py @@ -0,0 +1,547 @@ +import numpy as np +import json +import re +import shutil +import subprocess +import requests +import generate_benchmark_lean4 +from lean_dojo import LeanGitRepo +from datetime import datetime +import lean_dojo +from collections import defaultdict +from dynamic_database import Repository, DynamicDatabase, Theorem + +from loguru import logger +from typing import Union, List, Tuple +import math +import os + +from constants import known_repositories, known_dead_repos, PR_TITLE, PR_BODY, TMP_BRANCH, COMMIT_MESSAGE + +personal_access_token = os.environ.get("GITHUB_ACCESS_TOKEN") +BATCH_SIZE = 4 +RAID_DIR = os.environ.get("RAID_DIR") +os.environ["RAY_TMPDIR"] = f"{RAID_DIR}/tmp" +repo_dir = f"{RAID_DIR}/repos_new" + +DATA_DIR = f"{RAID_DIR}/data" +CHECKPOINT_DIR = f"{RAID_DIR}/checkpoints" +EVAL_RESULTS_FILE_PATH = f"{RAID_DIR}/eval_results.txt" +DB_FILE_NAME = "db_file.txt" +PROOF_LOG_FILE_NAME = f"{RAID_DIR}/proof_log.txt" +ENCOUNTERED_THEOREMS_FILE = f"{RAID_DIR}/encountered_theorems.pkl" +FISHER_DIR = f"{RAID_DIR}/fisher" # Optional + + +def clone_repo(repo_url): + """Clone a git repository and return the path to the repository and its sha.""" + # TODO: Fix + repo_name = "/".join(repo_url.split("/")[-2:]).replace(".git", "") + logger.info(f"Cloning {repo_url}") + logger.info(f"Repo name: {repo_name}") + repo_name = os.path.join(repo_dir, repo_name) + if os.path.exists(repo_name): + print(f"Deleting existing repository directory: {repo_name}") + shutil.rmtree(repo_name) + + subprocess.run(["git", "clone", repo_url, repo_name]) + process = subprocess.Popen(["git", "ls-remote", repo_url], stdout=subprocess.PIPE) + stdout, _stderr = process.communicate() + sha = re.split(r"\t+", stdout.decode("utf-8"))[0] + return repo_name, sha + + 
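+# Example usage (illustrative only): clone a repository and record the SHA of its
+# remote HEAD. The URL below is just a sample taken from known_repositories in
+# constants.py; RAID_DIR must be set so that repo_dir points at a writable path.
+#
+#     repo_path, sha = clone_repo("https://github.com/yangky11/lean4-example.git")
+#     logger.info(f"Cloned {repo_path} at remote HEAD {sha}")
+
+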
+def branch_exists(repo_name, branch_name): + """Check if a branch exists in a git repository.""" + proc = subprocess.run( + ["git", "-C", repo_name, "branch", "-a"], capture_output=True, text=True + ) + branches = proc.stdout.split("\n") + local_branch = branch_name + remote_branch = f"remote/{branch_name}" + return any( + branch.strip().endswith(local_branch) or branch.strip().endswith(remote_branch) + for branch in branches + ) + + +def create_or_switch_branch(repo_name, branch_name, base_branch): + """Create a branch in a git repository if it doesn't exist, or switch to it if it does.""" + if not branch_exists(repo_name, branch_name): + subprocess.run( + ["git", "-C", repo_name, "checkout", "-b", branch_name], check=True + ) + else: + subprocess.run(["git", "-C", repo_name, "checkout", branch_name], check=True) + subprocess.run( + [ + "git", + "-C", + repo_name, + "merge", + base_branch, + "-m", + f"Merging {branch_name} into {base_branch}", + ], + check=True, + ) + + +def commit_changes(repo_name, commit_message): + """Commit changes to a git repository.""" + status = subprocess.run( + ["git", "-C", repo_name, "status", "--porcelain"], + capture_output=True, + text=True, + ).stdout.strip() + if status == "": + print("No changes to commit.") + return False + subprocess.run(["git", "-C", repo_name, "add", "."], check=True) + subprocess.run(["git", "-C", repo_name, "commit", "-m", commit_message], check=True) + return True + + +def push_changes(repo_name, branch_name): + """Push changes to a git repository.""" + subprocess.run( + ["git", "-C", repo_name, "push", "-u", "origin", branch_name], check=True + ) + + +def get_default_branch(repo_full_name): + """Get the default branch of a repository (default `main`).""" + url = f"https://api.github.com/repos/{repo_full_name}" + headers = { + "Authorization": f"token {personal_access_token}", + "Accept": "application/vnd.github.v3+json", + } + response = requests.get(url, headers=headers) + if response.status_code == 200: + return response.json()["default_branch"] + else: + logger.info(f"Failed to get default branch for {repo_full_name}") + return "main" + + +def create_pull_request(repo_full_name, title, body, head_branch): + """Create a pull request in a repository.""" + base_branch = get_default_branch(repo_full_name) + url = f"https://api.github.com/repos/{repo_full_name}/pulls" + headers = { + "Authorization": f"token {personal_access_token}", + "Accept": "application/vnd.github.v3+json", + } + data = {"title": title, "body": body, "head": head_branch, "base": base_branch} + response = requests.post(url, headers=headers, json=data) + if response.status_code == 201: + print("Pull request created successfully: " + response.json()["html_url"]) + return response.json()["html_url"] + else: + print("Failed to create pull request", response.text) + return "" + +def ensure_inside_git(): + """Ensure that the current directory is inside a git repository.""" + try: + subprocess.run( + ["git", "rev-parse", "--is-inside-work-tree"], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + logger.info("Already in a Git repository") + except subprocess.CalledProcessError: + logger.info("Not in a Git repository. 
Initializing one.")
+        subprocess.run(["git", "init"], check=True)
+
+def get_compatible_commit(url):
+    """Find the most recent commit with a Lean version that LeanAgent supports."""
+    try:
+        process = subprocess.Popen(["git", "ls-remote", url], stdout=subprocess.PIPE)
+        stdout, stderr = process.communicate()
+        latest_commit = re.split(r"\t+", stdout.decode("utf-8"))[0]
+        logger.info(f"Latest commit: {latest_commit}")
+
+        new_url = url.replace(".git", "")
+        logger.info(f"Creating LeanGitRepo for {new_url}")
+
+        repo = LeanGitRepo(new_url, latest_commit)
+        logger.info(f"Getting config for {url}")
+
+        config = repo.get_config("lean-toolchain")
+        v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"])
+
+        if generate_benchmark_lean4.is_supported_version(v):
+            logger.info(f"Latest commit compatible for url {url}")
+            return latest_commit, v
+
+        logger.info(f"Searching for compatible commit for {url}")
+
+        ensure_inside_git()
+        process = subprocess.Popen(
+            ["git", "fetch", "--depth=1000000", url],  # Fetch commits
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        logger.info(f"Fetching commits for {url}")
+        _, stderr = process.communicate()
+
+        if process.returncode != 0:
+            raise Exception(f"Git fetch command failed: {stderr.decode('utf-8')}")
+
+        logger.info(f"Fetched commits for {url}")
+
+        process = subprocess.Popen(
+            ["git", "log", "--format=%H", "FETCH_HEAD"],  # Get list of commits
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+
+        logger.info(f"Getting list of commits for {url}")
+
+        stdout, stderr = process.communicate()
+        if process.returncode != 0:
+            raise Exception(f"Git log command failed: {stderr.decode('utf-8')}")
+
+        commits = stdout.decode("utf-8").strip().split("\n")
+        logger.info(f"Found {len(commits)} commits for {url}")
+
+        new_url = url.replace(".git", "")
+
+        repo_human_name = "/".join(new_url.split("/")[-2:])
+
+        # Delete repo if it exists, because it might be checked out to a different commit
+        if os.path.exists(os.path.join("repos", repo_human_name)):
+            shutil.rmtree(os.path.join("repos", repo_human_name))
+
+        subprocess.run(["git", "clone", url, os.path.join("repos", repo_human_name)], check=True)
+
+        for commit in commits:
+            logger.info(f"Checking commit {commit} for {url}")
+            # Check out the commit locally
+            subprocess.run(["git", "-C", os.path.join("repos", repo_human_name), "checkout", commit], check=True)
+            repo = LeanGitRepo.from_path(os.path.join(os.getcwd(), "repos", repo_human_name), commit)
+            config = repo.get_config("lean-toolchain")
+            v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"])
+            if generate_benchmark_lean4.is_supported_version(v):
+                logger.info(f"Found compatible commit {commit} for {url}")
+                return commit, v
+
+        raise Exception("No compatible commit found")
+
+    except Exception as e:
+        logger.info(f"Error in get_compatible_commit: {str(e)}")
+        return None, None
+
+
+def find_and_save_compatible_commits(repo_info_file, lean_git_repos):
+    """Finds compatible commits for various repositories"""
+    with open(repo_info_file, "r") as repo_compatibility_file:
+        updated_repos = json.load(repo_compatibility_file)
+
+    for repo in lean_git_repos:
+        url = repo.url
+        if not url.endswith(".git"):
+            url = url + ".git"
+
+        sha = None
+        v = None
+
+        # TODO: Check these
+        if "mathlib4" in url:
+            sha = "2b29e73438e240a427bcecc7c0fe19306beb1310"
+            v = "v4.8.0"
+        elif "SciLean" in url:
+            sha = "22d53b2f4e3db2a172e71da6eb9c916e62655744"
+            v = "v4.7.0"
+        elif "pfr" in 
url: + sha = "fa398a5b853c7e94e3294c45e50c6aee013a2687" + v = "v4.8.0-rc1" + else: + # Check if it's in any element + for elem in updated_repos: + if url.replace(".git", "") == elem["url"]: + continue + + sha, v = get_compatible_commit(url) + + + # Always write to json, even for null repos + updated_repos.append( + {"url": url.replace(".git", ""), "commit": sha if sha else None, "version": v if v else None} + ) + + if not sha: + logger.info(f"Failed to find a compatible commit for {url}") + + # Write per repo in case of interrupt + with open(repo_info_file, "w") as f: + json.dump(updated_repos, f) + + return updated_repos + + +def search_github_repositories(lean_git_repos, repos, language="Lean", num_repos=10): + """Search for the given number of repositories on GitHub that have the given language.""" + headers = {"Authorization": personal_access_token} + query_params = { + "q": f"language:{language}", + "sort": "stars", + "order": "desc", + "per_page": 100, + } + + cloned_count = 0 + page = 1 + + while cloned_count < num_repos: + query_params["page"] = page + response = requests.get( + "https://api.github.com/search/repositories", + headers=headers, + params=query_params, + ) + + if response.status_code == 200: + repositories = response.json()["items"] + for repo in repositories: + if cloned_count >= num_repos: + break + + repo_full_name = repo["full_name"] + print("\n\n") + logger.info(f"Processing {repo_full_name}") + + + # Skip repos that are already known + if repo_full_name not in known_repositories + known_dead_repos + repos: + print("\n\n") + logger.info(f"Processing new repo: {repo_full_name}") + name = None + try: + clone_url = repo["clone_url"] + repo_name, sha = clone_repo(clone_url) + name = repo_name + url = clone_url.replace(".git", "") + + # TODO: This constructor can be very slow + lean_git_repo = LeanGitRepo(url, sha) + + lean_git_repos.append(lean_git_repo) + repos.append(repo_full_name) + cloned_count += 1 + logger.info(f"Cloned {repo_full_name}") + except Exception as e: + shutil.rmtree(name) + logger.info(f"Failed to clone {repo_full_name} because of {e}") + else: + logger.info( + f"Skipping {repo_full_name} since it is a known repository" + ) + page += 1 + else: + logger.info("Failed to search GitHub", response.status_code) + break + + # Check if we've reached the end of the search results + if len(repositories) < 100: + break + + logger.info(f"Total repositories processed: {cloned_count}") + return lean_git_repos, repos + + +def add_repo_to_database(dynamic_database_json_path, repo, db): + """Adds a repository to the dynamic database.""" + # Prepare the data necessary to add this repo to the dynamic database + url = repo.url + if not url.endswith(".git"): + url = url + ".git" + logger.info(f"\n\nProcessing {url}") + + if "mathlib4" in url: + sha = "2b29e73438e240a427bcecc7c0fe19306beb1310" + v = "v4.8.0" + elif "SciLean" in url: + sha = "22d53b2f4e3db2a172e71da6eb9c916e62655744" + v = "v4.7.0" + elif "pfr" in url: + sha = "fa398a5b853c7e94e3294c45e50c6aee013a2687" + v = "v4.8.0-rc1" + else: + sha, v = get_compatible_commit(url) + + if not sha: + logger.info(f"Failed to find a compatible commit for {url}") + return None + + logger.info(f"Found compatible commit {sha} for {url} with lean version: {v}") + url = url.replace(".git", "") + repo = LeanGitRepo(url, sha) + dir_name = repo.url.split("/")[-1] + "_" + sha + dst_dir = RAID_DIR + "/" + DATA_DIR + "/" + dir_name + logger.info(f"Generating benchmark at {dst_dir}") + traced_repo, _, _, total_theorems = 
generate_benchmark_lean4.main(
+        repo.url, sha, dst_dir
+    )
+    if not traced_repo:
+        logger.info(f"Failed to trace {url}")
+        return None
+    if total_theorems < 3 * BATCH_SIZE:  # Should be enough theorems for train/val/test
+        logger.info(f"No theorems found in {url}")
+        return None
+    logger.info(f"Finished generating benchmark at {dst_dir}")
+
+    # Add the new repo to the dynamic database
+    config = repo.get_config("lean-toolchain")
+    v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"])
+    theorems_folder = dst_dir + "/random"
+    premise_files_corpus = dst_dir + "/corpus.jsonl"
+    files_traced = dst_dir + "/traced_files.jsonl"
+    pr_url = None
+    data = {
+        "url": repo.url,
+        "name": "/".join(repo.url.split("/")[-2:]),
+        "commit": repo.commit,
+        "lean_version": v,
+        "lean_dojo_version": lean_dojo.__version__,
+        "metadata": {
+            "date_processed": datetime.now(),
+        },
+        "theorems_folder": theorems_folder,
+        "premise_files_corpus": premise_files_corpus,
+        "files_traced": files_traced,
+        "pr_url": pr_url,
+    }
+
+    repo = Repository.from_dict(data)
+    logger.info("Before adding new repo:")
+    db.print_database_contents()
+    db.add_repository(repo)
+    logger.info("After adding new repo:")
+    db.print_database_contents()
+    db.to_json(dynamic_database_json_path)
+    return "Done"
+
+def calculate_difficulty(theorem: Theorem) -> Union[float, None]:
+    """Calculates the difficulty of a theorem."""
+    proof_steps = theorem.traced_tactics
+    if any("sorry" in step.tactic for step in proof_steps):
+        return float("inf")  # Hard (no proof)
+    if len(proof_steps) == 0:
+        return None  # To be distributed later
+    return math.exp(len(proof_steps))
+
+def categorize_difficulty(
+    difficulty: Union[float, None], percentiles: List[float]
+) -> str:
+    """Categorizes the difficulty of a theorem."""
+    if difficulty is None:
+        return "To_Distribute"
+    if difficulty == float("inf"):
+        return "Hard (No proof)"
+    elif difficulty <= percentiles[0]:
+        return "Easy"
+    elif difficulty <= percentiles[1]:
+        return "Medium"
+    else:
+        return "Hard"
+
+
+def sort_repositories_by_difficulty(db: DynamicDatabase) -> Tuple[List[Repository], dict, np.ndarray]:
+    """Sorts repositories by the difficulty of their theorems."""
+    difficulties_by_repo = defaultdict(list)
+    all_difficulties = []
+
+    print("Ready to calculate difficulties of all theorems")
+    for repo in db.repositories:
+        print(f"Starting {repo.name}")
+        for theorem in repo.get_all_theorems:
+            difficulty = calculate_difficulty(theorem)
+            theorem.difficulty_rating = difficulty
+            difficulties_by_repo[repo].append(
+                (
+                    theorem.full_name,
+                    str(theorem.file_path),
+                    tuple(theorem.start),
+                    tuple(theorem.end),
+                    difficulty,
+                )
+            )
+            if difficulty is not None:
+                all_difficulties.append(difficulty)
+
+        db.update_repository(repo)
+        print(f"Finished {repo.name}")
+
+    percentiles = np.percentile(all_difficulties, [33, 67])
+
+    categorized_theorems = defaultdict(lambda: defaultdict(list))
+
+    print("Ready to categorize theorems")
+    for repo, theorems in difficulties_by_repo.items():
+        print(f"Starting {repo.name}")
+        for theorem_name, file_path, start, end, difficulty in theorems:
+            category = categorize_difficulty(difficulty, percentiles)
+            categorized_theorems[repo][category].append(
+                (theorem_name, file_path, start, end, difficulty)
+            )
+        print(f"Finished {repo.name}")
+
+    print("Distributing theorems with no proofs")
+    for repo in categorized_theorems:
+        print(f"Starting {repo.name}")
+        to_distribute = categorized_theorems[repo]["To_Distribute"]
+        chunk_size = len(to_distribute) // 3
+        for 
i, category in enumerate(["Easy", "Medium", "Hard"]): + start = i * chunk_size + end = start + chunk_size if i < 2 else None + categorized_theorems[repo][category].extend(to_distribute[start:end]) + del categorized_theorems[repo]["To_Distribute"] + print(f"Finished {repo.name}") + + # Sort repositories based on the number of easy theorems + sorted_repos = sorted( + categorized_theorems.keys(), + key=lambda r: len(categorized_theorems[r]["Easy"]), + reverse=True, + ) + + return sorted_repos, categorized_theorems, percentiles + + +def save_sorted_repos(sorted_repos: List[Repository], file_path: str): + """Saves the sorted repositories to a file.""" + sorted_repo_data = [ + {"url": repo.url, "commit": repo.commit, "name": repo.name} + for repo in sorted_repos + ] + with open(file_path, "w") as f: + json.dump(sorted_repo_data, f, indent=2) + + +def load_sorted_repos(file_path: str) -> List[Tuple[str, str, str]]: + """Loads the sorted repositories from a file.""" + with open(file_path, "r") as f: + sorted_repo_data = json.load(f) + return [(repo["url"], repo["commit"], repo["name"]) for repo in sorted_repo_data] + + +def write_skip_file(repo_url): + """Writes a repository URL to a file to skip it.""" + skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") + with open(skip_file_path, "w") as f: + f.write(repo_url) + + +def should_skip_repo(): + """Checks if a repository should be skipped.""" + skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") + if os.path.exists(skip_file_path): + with open(skip_file_path, "r") as f: + repo_url = f.read().strip() + return True, repo_url + return False, None \ No newline at end of file diff --git a/leanagent.py b/leanagent.py index 59d2724..68a3e4d 100644 --- a/leanagent.py +++ b/leanagent.py @@ -1,47 +1,45 @@ -# import all the necessary libraries import json -import math import os import pickle import random -import re -import shutil -import subprocess + import sys import time import traceback -from collections import defaultdict -from copy import copy + from datetime import datetime, timedelta from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple + -import lean_dojo import numpy as np import pytorch_lightning as pl import ray -import requests + import torch from lean_dojo import * from lean_dojo import LeanGitRepo, Pos from lean_dojo import Theorem from lean_dojo import Theorem as LeanDojoTheorem -from lean_dojo import is_available_in_cache + from loguru import logger from pytorch_lightning import seed_everything -from pytorch_lightning.callbacks import (Callback, EarlyStopping, - LearningRateMonitor, ModelCheckpoint) +from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint from pytorch_lightning.strategies import DDPStrategy from tqdm import tqdm import generate_benchmark_lean4 -from dynamic_database import * +from dynamic_database import AnnotatedTactic, Theorem, DynamicDatabase from prover.proof_search import DistributedProver, SearchResult, Status from retrieval.datamodule import RetrievalDataModule from retrieval.main import run_cli from retrieval.model import PremiseRetriever +from git_utils import find_and_save_compatible_commits, search_github_repositories, should_skip_repo, add_repo_to_database, sort_repositories_by_difficulty, save_sorted_repos + # Set the seed for reproducibility +personal_access_token = os.environ.get("GITHUB_ACCESS_TOKEN") + random.seed(3407) # https://arxiv.org/abs/2109.08203 BATCH_SIZE = 4 RAID_DIR 
= os.environ.get("RAID_DIR") @@ -58,512 +56,8 @@ repos_for_merged_dataset = [] repos_for_proving = [] - -# List of known repositories to process or skip -# Feel free to remove any repos from this list if you would like to test on them -known_repositories = [ - "leanprover-community/mathlib4", # ReProver is trained on this - "leanprover-community/batteries", # functional programming instead of math - "leanprover-community/aesop", - "leanprover/lean4", - "leanprover-community/mathlib", # Mathlib3 version - "leanprover-community/mathlib3", - "leanprover/std4", # moved to batteries - "leanprover-community/duper", # functional programming instead of math - "leanprover/lake", - "openai/lean-gym", - "leanprover-community/lean4-metaprogramming-book", - "kmill/lean4-raytracer", # no theorems - "argumentcomputer/yatima", # trace problems - "ImperialCollegeLondon/formalising-mathematics-2024", # trace problems - "leanprover-community/ProofWidgets4", # trace problems - "leanprover/verso", # trace problems - "leanprover-community/NNG4", # trace problems - "ufmg-smite/lean-smt", # fails to trace due to windows-style line endings - "teorth/symmetric_project", # no compatible commit - "cmu-l3/llmlean", # irrelevant + only 4 theorems - "PatrickMassot/GlimpseOfLean", # strange trace problems with _parse_deps - "avigad/lamr", # trace problems - "leanprover-community/quote4", # no theorems - "leanprover-community/iris-lean", # trace problems - "aripiprazole/rinha", # incompatible commit - "leanprover/lean4-cli", # no theorems - "leanprover/LeanInk", # no theorems - "leanprover-community/lean-auto", - "leanprover-community/repl", # no theorems - "leanprover/doc-gen4", # no theorems - "leanprover/SampCert", # trace problems - "nomeata/loogle", - "risc0/risc0-lean4", - "PatrickMassot/verbose-lean4", # no theorems - "tydeu/lean4-alloy", # no theorems - "leanprover/leansat", # deprecated - "BoltonBailey/formal-snarks-project", # two theorems - "dwrensha/lean4-maze", # two theorems - "leanprover-community/mathport", # irrelevant - "argumentcomputer/LSpec", # one theorem - "reaslab/jixia", # no theorems - "riccardobrasca/flt3", # no theorems - "dwrensha/animate-lean-proofs", # irrelevant - "lean-ja/lean-by-example", # irrelevant - "NethermindEth/Clear", # no theorems - "fgdorais/lean4-parser", # irrelevant - "semorrison/lean-training-data", # irrelevant - "verse-lab/lean-ssr", # irrelevant - "GaloisInc/lean-llvm", # irrelevant - "argumentcomputer/Wasm.lean", # irrelevant - "NethermindEth/EVMYulLean", # irrelevant - "rwbarton/advent-of-lean-4", # irrelevant - "leanprover-community/tutorials4", # irrelevant - "haruhisa-enomoto/mathlib4-all-tactics", # irrelevant - "leanprover/LNSym", - "leanprover-community/flt-regular", - "opencompl/lean-mlir-old", - "rami3l/plfl", - "HEPLean/HepLean", - "forked-from-1kasper/ground_zero", - "verified-optimization/CvxLean", - "leanprover-community/sphere-eversion", - "optsuite/optlib", - "YaelDillies/LeanCamCombi", - "JamesGallicchio/LeanColls", - "T-Brick/c0deine", - "jjdishere/EG", - "alexkeizer/QpfTypes", - "fpvandoorn/LeanCourse23", - "marcusrossel/lean-egg", - "reilabs/proven-zk", - "algebraic-dev/soda", - "leanprover-community/llm", - "dignissimus/Untangle", - "argumentcomputer/Megaparsec.lean", - "emilyriehl/infinity-cosmos", - "BartoszPiotrowski/lean-premise-selection", - "djvelleman/HTPILeanPackage", - "girving/ray", - "Anderssorby/SDL.lean", - "pandaman64/lean-regex", - "brown-cs22/CS22-Lean-2023", - "hhu-adam/GameSkeleton", - "FR-vdash-bot/Algorithm", - 
"PeterKementzey/graph-library-for-lean4", - "arthurpaulino/LeanMySQL", - "arthurpaulino/NumLean", - "FormalSAT/trestle", - "nomeata/lean-wf-induct", - "leanprover/lean4checker", - "IPDSnelting/tba-2022", - "digama0/mm-lean4", - "KislyjKisel/Raylib.lean", - "algebraic-dev/melp", - "hhu-adam/Robo", # same as other tutorials but has lots of sorries - "hargoniX/socket.lean", - "kovach/etch", - "damek/gd-lean", - "0art0/lean-slides", - "forked-from-1kasper/lean4-categories", - "katydid/proofs", - "alexjbest/leaff", - "sinhp/Poly", - "lftcm2023/lftcm2023", # same as other tutorials but has lots of sorries - "lean-ja/lean99", - "leanprover/SHerLOC", - "Seasawher/mdgen", - "opencompl/egg-tactic-code", - "david-christiansen/ssft24", - "T-Brick/lean2wasm", - "hargoniX/cpdt-lean", - "jsm28/AperiodicMonotilesLean", - "draperlaboratory/ELFSage", - "rookie-joe/automatic-lean4-compilation", - "madvorak/fecssk", - "david-christiansen/bob24", - "awodey/joyal", - "BrownCS1951x/fpv2023", # same as other tutorials but has lots of sorries - "paulch42/lean-spec", - "siddhartha-gadgil/MetaExamples", - "dannypsnl/violet", - "arthurpaulino/LeanREPL", - "Kha/do-supplement", - "joehendrix/lean-sat-checker", - "ammkrn/timelib", - "kmill/LeanTeX", - "leanprover/lean4export", - "leanprover-community/mathlib3port", - "brown-cs22/CS22-Lean-2024", # same as other tutorials but has lots of sorries - "T-Brick/lean-wasm", - "crabbo-rave/Soup", - "argumentcomputer/RustFFI.lean", - "suhr/tmath", - "leanprover/leanbv", - "arthurpaulino/FxyLang", - "SchrodingerZhu/LeanGccBackend", - "lecopivo/lean4-karray", - "ImperialCollegeLondon/M1F-explained", - "proost-assistant/ProostLean", - "DavePearce/LeanEVM", - "algebraic-dev/ash", - "FormalizedFormalLogic/Arithmetization", - "cmu-l3/ntp-toolkit", - "dwrensha/tryAtEachStep", - "yangky11/lean4-example", - "T-Brick/DateTime", - "model-checking/rust-lean-models", - "MichaelStollBayreuth/EulerProducts", - "hargoniX/Flame", - "argumentcomputer/Http.lean", - "madvorak/vcsp", - "teorth/newton", - "apnelson1/Matroid", - "smorel394/TS1", - "ianjauslin-rutgers/pythagoras4", - "mortarsanjaya/IMOSLLean4", - "dupuisf/BibtexQuery", - "nomeata/lean-calcify", - "argumentcomputer/FFaCiL.lean", - "javra/iit", - "arthurpaulino/viper", - "lindy-labs/aegis", - "PatrickMassot/NNG4", - "argumentcomputer/YatimaStdLib.lean", - "fgdorais/lean4-unicode-basic", - "mhuisi/Uniq", - "Kha/macro-supplement", - "chenjulang/rubikcubegroup", - "arthurpaulino/LeanMusic", - "argumentcomputer/Ipld.lean", - "Odomontois/advent2022-lean", - "kbuzzard/IISc-experiments", # same as other tutorials but has lots of sorries - "ykonstant1/InfinitePrimes", - "alexkassil/natural_number_game_lean4", - "seewoo5/lean-poly-abc", - "rah4927/lean-dojo-mew", - "siddhartha-gadgil/proofs-and-programs-2023", - "PatrickMassot/lean4-game-server", - "knowsys/Formale-Systeme-in-LEAN", # same as other tutorials but has lots of sorries - "katydid/symbolic-automatic-derivatives", - "girving/interval", - "ImperialCollegeLondon/group-theory-experiments", - "knowsys/CertifyingDatalog", - "bergmannjg/leanCurl", - "vasnesterov/HadwigerNelson", - "FWuermse/lean-postgres", - "leanprover-community/import-graph", - "Human-Oriented-ATP/lean-tactics", # more about tactics than premises - "paulcadman/lean4-leetcode", - "argumentcomputer/Lurk.lean", - "AlexDuchnowski/rubiks-cube", - "SchrodingerZhu/lean-gccjit", - "JamesGallicchio/http", - "jtristan/UnicodeSkipListTableExample", - "adomani/MA4N1_2023", # same as other tutorials but has lots of sorries - 
"remimimimimi/leansec", - "hhu-adam/lean-i18n", - "RemyDegenne/testing-lower-bounds", - "mariainesdff/LocalClassFieldTheory", - "AviCraimer/relational-calculus-library-lean4", - "JLimperg/regensburg-itp-school-2023", - "jaalonso/Calculemus2", - "mseri/BET", - "xubaiw/Reservoir.lean", - "hargoniX/nest-core", - "siddhartha-gadgil/Polylean", - "MichaelStollBayreuth/Weights", - "sanchace/FRACTRAN", - "argumentcomputer/Poseidon.lean", - "madvorak/chomsky", - "T-Brick/ControlFlow", - "pa-ba/guarded-lean", -] - -repos = [] lean_git_repos = [] -personal_access_token = os.environ.get("GITHUB_ACCESS_TOKEN") - -PR_TITLE = "[LeanAgent] Proofs" - -PR_BODY = """ -[LeanAgent](https://arxiv.org/abs/2410.06209) discovers a proof for a theorem with the `sorry` keyword. - ---- - -~LeanAgent - From the [LeanDojo](https://leandojo.org/) family -""" - -TMP_BRANCH = "_LeanAgent" - -COMMIT_MESSAGE = "[LeanAgent] Proofs" - - -def clone_repo(repo_url): - """Clone a git repository and return the path to the repository and its sha.""" - repo_name = "/".join(repo_url.split("/")[-2:]).replace(".git", "") - logger.info(f"Cloning {repo_url}") - logger.info(f"Repo name: {repo_name}") - repo_name = repo_dir + "/" + repo_name - if os.path.exists(repo_name): - print(f"Deleting existing repository directory: {repo_name}") - shutil.rmtree(repo_name) - subprocess.run(["git", "clone", repo_url, repo_name]) - process = subprocess.Popen(["git", "ls-remote", repo_url], stdout=subprocess.PIPE) - stdout, stderr = process.communicate() - sha = re.split(r"\t+", stdout.decode("utf-8"))[0] - return repo_name, sha - - -def branch_exists(repo_name, branch_name): - """Check if a branch exists in a git repository.""" - proc = subprocess.run( - ["git", "-C", repo_name, "branch", "-a"], capture_output=True, text=True - ) - branches = proc.stdout.split("\n") - local_branch = branch_name - remote_branch = f"remote/{branch_name}" - return any( - branch.strip().endswith(local_branch) or branch.strip().endswith(remote_branch) - for branch in branches - ) - - -def create_or_switch_branch(repo_name, branch_name, base_branch): - """Create a branch in a git repository if it doesn't exist, or switch to it if it does.""" - if not branch_exists(repo_name, branch_name): - subprocess.run( - ["git", "-C", repo_name, "checkout", "-b", branch_name], check=True - ) - else: - subprocess.run(["git", "-C", repo_name, "checkout", branch_name], check=True) - subprocess.run( - [ - "git", - "-C", - repo_name, - "merge", - base_branch, - "-m", - f"Merging {branch_name} into {base_branch}", - ], - check=True, - ) - - -def commit_changes(repo_name, commit_message): - """Commit changes to a git repository.""" - status = subprocess.run( - ["git", "-C", repo_name, "status", "--porcelain"], - capture_output=True, - text=True, - ).stdout.strip() - if status == "": - print("No changes to commit.") - return False - subprocess.run(["git", "-C", repo_name, "add", "."], check=True) - subprocess.run(["git", "-C", repo_name, "commit", "-m", commit_message], check=True) - return True - - -def push_changes(repo_name, branch_name): - """Push changes to a git repository.""" - subprocess.run( - ["git", "-C", repo_name, "push", "-u", "origin", branch_name], check=True - ) - - -def get_default_branch(repo_full_name): - """Get the default branch of a repository (default `main`).""" - url = f"https://api.github.com/repos/{repo_full_name}" - headers = { - "Authorization": f"token {personal_access_token}", - "Accept": "application/vnd.github.v3+json", - } - response = requests.get(url, 
headers=headers) - if response.status_code == 200: - return response.json()["default_branch"] - else: - logger.info(f"Failed to get default branch for {repo_full_name}") - return "main" - - -def create_pull_request(repo_full_name, title, body, head_branch): - """Create a pull request in a repository.""" - base_branch = get_default_branch(repo_full_name) - url = f"https://api.github.com/repos/{repo_full_name}/pulls" - headers = { - "Authorization": f"token {personal_access_token}", - "Accept": "application/vnd.github.v3+json", - } - data = {"title": title, "body": body, "head": head_branch, "base": base_branch} - response = requests.post(url, headers=headers, json=data) - if response.status_code == 201: - print("Pull request created successfully: " + response.json()["html_url"]) - return response.json()["html_url"] - else: - print("Failed to create pull request", response.text) - return "" - - -def get_compatible_commit(url): - """Find the most recent commit with a Lean version that LeanAgent supports.""" - try: - process = subprocess.Popen(["git", "ls-remote", url], stdout=subprocess.PIPE) - stdout, stderr = process.communicate() - latest_commit = re.split(r"\t+", stdout.decode("utf-8"))[0] - logger.info(f"Latest commit: {latest_commit}") - - new_url = url.replace(".git", "") - logger.info(f"Creating LeanGitRepo for {new_url}") - repo = LeanGitRepo(new_url, latest_commit) - logger.info(f"Getting config for {url}") - config = repo.get_config("lean-toolchain") - v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) - if generate_benchmark_lean4.is_supported_version(v): - logger.info(f"Latest commit compatible for url {url}") - return latest_commit, v - - logger.info(f"Searching for compatible commit for {url}") - try: - subprocess.run( - ["git", "rev-parse", "--is-inside-work-tree"], - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - logger.info("Already in a Git repository") - except subprocess.CalledProcessError: - logger.info("Not in a Git repository. 
Initializing one.") - subprocess.run(["git", "init"], check=True) - - process = subprocess.Popen( - ["git", "fetch", "--depth=1000000", url], # Fetch commits - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - logger.info(f"Fetching commits for {url}") - _, stderr = process.communicate() - if process.returncode != 0: - raise Exception(f"Git fetch command failed: {stderr.decode('utf-8')}") - logger.info(f"Fetched commits for {url}") - process = subprocess.Popen( - ["git", "log", "--format=%H", "FETCH_HEAD"], # Get list of commits - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - logger.info(f"Getting list of commits for {url}") - stdout, stderr = process.communicate() - if process.returncode != 0: - raise Exception(f"Git log command failed: {stderr.decode('utf-8')}") - commits = stdout.decode("utf-8").strip().split("\n") - logger.info(f"Found {len(commits)} commits for {url}") - for commit in commits: - new_url = url.replace(".git", "") - repo = LeanGitRepo(new_url, commit) - config = repo.get_config("lean-toolchain") - v = generate_benchmark_lean4.get_lean4_version_from_config( - config["content"] - ) - if generate_benchmark_lean4.is_supported_version(v): - logger.info(f"Found compatible commit {commit} for {url}") - return commit, v - - raise Exception("No compatible commit found") - - except Exception as e: - logger.info(f"Error in get_compatible_commit: {str(e)}") - return None, None - - -def find_and_save_compatible_commits(repo_info_file, lean_git_repos): - """Finds compatible commits for various repositories""" - updated_repos = [] - for repo in lean_git_repos: - url = repo.url - if not url.endswith(".git"): - url = url + ".git" - - sha = None - v = None - if "mathlib4" in url: - sha = "2b29e73438e240a427bcecc7c0fe19306beb1310" - v = "v4.8.0" - elif "SciLean" in url: - sha = "22d53b2f4e3db2a172e71da6eb9c916e62655744" - v = "v4.7.0" - elif "pfr" in url: - sha = "fa398a5b853c7e94e3294c45e50c6aee013a2687" - v = "v4.8.0-rc1" - else: - sha, v = get_compatible_commit(url) - if not sha: - logger.info(f"Failed to find a compatible commit for {url}") - continue - - updated_repos.append( - {"url": url.replace(".git", ""), "commit": sha, "version": v} - ) - - with open(repo_info_file, "w") as f: - json.dump(updated_repos, f) - - return updated_repos - - -def search_github_repositories(language="Lean", num_repos=10): - """Search for the given number of repositories on GitHub that have the given language.""" - headers = {"Authorization": personal_access_token} - query_params = { - "q": f"language:{language}", - "sort": "stars", - "order": "desc", - "per_page": 100, - } - - cloned_count = 0 - page = 1 - - while cloned_count < num_repos: - query_params["page"] = page - response = requests.get( - "https://api.github.com/search/repositories", - headers=headers, - params=query_params, - ) - - if response.status_code == 200: - repositories = response.json()["items"] - for repo in repositories: - if cloned_count >= num_repos: - break - repo_full_name = repo["full_name"] - logger.info(f"Processing {repo_full_name}") - if repo_full_name not in known_repositories: - name = None - try: - clone_url = repo["clone_url"] - repo_name, sha = clone_repo(clone_url) - name = repo_name - url = clone_url.replace(".git", "") - lean_git_repo = LeanGitRepo(url, sha) - lean_git_repos.append(lean_git_repo) - repos.append(repo_full_name) - cloned_count += 1 - logger.info(f"Cloned {repo_full_name}") - except Exception as e: - shutil.rmtree(name) - logger.info(f"Failed to clone {repo_full_name} because of {e}") 
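The clone loop above sits inside a paginated GitHub search: 100 results per page, sorted by stars, stopping once enough new repositories have been cloned or the results run out. A condensed sketch of just the pagination, with the token handling and the skip-list check left out:

```python
import requests


def search_lean_repos(max_repos: int, language: str = "Lean", headers: dict = None):
    """Yield up to max_repos repository dicts from the GitHub search API, most-starred first."""
    page, found = 1, 0
    while found < max_repos:
        response = requests.get(
            "https://api.github.com/search/repositories",
            headers=headers or {},
            params={"q": f"language:{language}", "sort": "stars",
                    "order": "desc", "per_page": 100, "page": page},
        )
        if response.status_code != 200:
            break
        items = response.json()["items"]
        for item in items:
            if found >= max_repos:
                return
            yield item
            found += 1
        if len(items) < 100:  # reached the last page of search results
            return
        page += 1
```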
- else: - logger.info( - f"Skipping {repo_full_name} since it is a known repository" - ) - page += 1 - else: - logger.info("Failed to search GitHub", response.status_code) - break - - # Check if we've reached the end of the search results - if len(repositories) < 100: - break - - logger.info(f"Total repositories processed: {cloned_count}") +repos = [] def _eval(data, preds_map) -> Tuple[float, float, float]: @@ -839,82 +333,7 @@ def prove_sorry_theorems( save_progress(all_encountered_theorems) logger.info("Finished attempting to prove sorry theorems") - - -def add_repo_to_database(dynamic_database_json_path, repo, db): - """Adds a repository to the dynamic database.""" - # Prepare the data necessary to add this repo to the dynamic database - url = repo.url - if not url.endswith(".git"): - url = url + ".git" - logger.info(f"Processing {url}") - - if "mathlib4" in url: - sha = "2b29e73438e240a427bcecc7c0fe19306beb1310" - v = "v4.8.0" - elif "SciLean" in url: - sha = "22d53b2f4e3db2a172e71da6eb9c916e62655744" - v = "v4.7.0" - elif "pfr" in url: - sha = "fa398a5b853c7e94e3294c45e50c6aee013a2687" - v = "v4.8.0-rc1" - else: - sha, v = get_compatible_commit(url) - - if not sha: - logger.info(f"Failed to find a compatible commit for {url}") - return None - - logger.info(f"Found compatible commit {sha} for {url}") - logger.info(f"Lean version: {v}") - url = url.replace(".git", "") - repo = LeanGitRepo(url, sha) - dir_name = repo.url.split("/")[-1] + "_" + sha - dst_dir = RAID_DIR + "/" + DATA_DIR + "/" + dir_name - logger.info(f"Generating benchmark at {dst_dir}") - traced_repo, _, _, total_theorems = generate_benchmark_lean4.main( - repo.url, sha, dst_dir - ) - if not traced_repo: - logger.info(f"Failed to trace {url}") - return None - if total_theorems < 3 * BATCH_SIZE: # Should be enough theorems for train/val/test - logger.info(f"No theorems found in {url}") - return None - logger.info(f"Finished generating benchmark at {dst_dir}") - - # Add the new repo to the dynamic database - config = repo.get_config("lean-toolchain") - v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) - theorems_folder = dst_dir + "/random" - premise_files_corpus = dst_dir + "/corpus.jsonl" - files_traced = dst_dir + "/traced_files.jsonl" - pr_url = None - data = { - "url": repo.url, - "name": "/".join(repo.url.split("/")[-2:]), - "commit": repo.commit, - "lean_version": v, - "lean_dojo_version": lean_dojo.__version__, - "metadata": { - "date_processed": datetime.datetime.now(), - }, - "theorems_folder": theorems_folder, - "premise_files_corpus": premise_files_corpus, - "files_traced": files_traced, - "pr_url": pr_url, - } - - repo = Repository.from_dict(data) - logger.info("Before adding new repo:") - db.print_database_contents() - db.add_repository(repo) - logger.info("After adding new repo:") - db.print_database_contents() - db.to_json(dynamic_database_json_path) - return "Done" - - + def replace_sorry_with_proof(proofs): """Replace the `sorry` with the proof text in the Lean files.""" logger.info(f"Replacing sorries with {len(proofs)} proofs!") @@ -945,128 +364,133 @@ def replace_sorry_with_proof(proofs): logger.info("Finished replacing sorries with proofs!") - -def calculate_difficulty(theorem: Theorem) -> Union[float, None]: - """Calculates the difficulty of a theorem.""" - proof_steps = theorem.traced_tactics - if any("sorry" in step.tactic for step in proof_steps): - return float("inf") # Hard (no proof) - if len(proof_steps) == 0: - return None # To be distributed later - return 
math.exp(len(proof_steps)) - - -def categorize_difficulty( - difficulty: Union[float, None], percentiles: List[float] -) -> str: - """Categorizes the difficulty of a theorem.""" - if difficulty is None: - return "To_Distribute" - if difficulty == float("inf"): - return "Hard (No proof)" - elif difficulty <= percentiles[0]: - return "Easy" - elif difficulty <= percentiles[1]: - return "Medium" - else: - return "Hard" - - -def sort_repositories_by_difficulty(db: DynamicDatabase) -> List[Repository]: - """Sorts repositories by the difficulty of their theorems.""" - difficulties_by_repo = defaultdict(list) - all_difficulties = [] - - print("Ready to calculate difficulties of all theorems") - for repo in db.repositories: - print(f"Starting {repo.name}") - for theorem in repo.get_all_theorems: - difficulty = calculate_difficulty(theorem) - theorem.difficulty_rating = difficulty - difficulties_by_repo[repo].append( - ( - theorem.full_name, - str(theorem.file_path), - tuple(theorem.start), - tuple(theorem.end), - difficulty, - ) +def initialize_database(dynamic_database_json_path: str) -> DynamicDatabase: + """Initializes or loads the dynamic database.""" + # Check if the current process is the main one + is_main_process = int(os.environ.get("LOCAL_RANK", "0")) == 0 + + # Initialize the database if it doesn't exist or is empty + if is_main_process: + logger.info("Starting the main process") + if ( + not os.path.exists(dynamic_database_json_path) + or os.path.getsize(dynamic_database_json_path) == 0 + ): + # File doesn't exist or is empty, initialize it + logger.info( + f"\nInitializing new database at {dynamic_database_json_path}\n" ) - if difficulty is not None: - all_difficulties.append(difficulty) - - db.update_repository(repo) - print(f"Finished {repo.name}") - - percentiles = np.percentile(all_difficulties, [33, 67]) - - categorized_theorems = defaultdict(lambda: defaultdict(list)) + db = DynamicDatabase() + db.to_json(dynamic_database_json_path) + else: + try: + logger.info(f"Loading database from {dynamic_database_json_path}") + db = DynamicDatabase.from_json(dynamic_database_json_path) + logger.info(f"Loaded database from {dynamic_database_json_path}") + except json.JSONDecodeError: + # If there's an error decoding the JSON, initialize a new database + logger.warning( + f"Error decoding JSON from {dynamic_database_json_path}. Initializing new database." 
+ ) + db = DynamicDatabase() + db.to_json(dynamic_database_json_path) + + return db - print("Ready to categorize theorems") - for repo, theorems in difficulties_by_repo.items(): - print(f"Starting {repo.name}") - for theorem_name, file_path, start, end, difficulty in theorems: - category = categorize_difficulty(difficulty, percentiles) - categorized_theorems[repo][category].append( - (theorem_name, file_path, start, end, difficulty) +def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_path: str, db: DynamicDatabase): + global lean_git_repos + global repos + # If curriculum learning is enabled, initialize repositories and sort them by difficulty + repo_info_file = os.path.join(RAID_DIR, DATA_DIR, "repo_info_compatible.json") + # Check if the current process is the main one + is_main_process = int(os.environ.get("LOCAL_RANK", "0")) == 0 + if curriculum_learning: + logger.info("Starting curriculum learning") + if is_main_process: + lean_git_repos, repos = search_github_repositories(lean_git_repos, repos, "Lean", num_repos) + + for i in range(len(lean_git_repos)): + repo = lean_git_repos[i] + print("\n\n") + logger.info(f"Processing new repo: {repo.url}") + result = add_repo_to_database(dynamic_database_json_path, repo, db) + if result is not None: + logger.info(f"Successfully added repo {repo.url}") + + logger.info( + f"Successfully added {num_repos} repositories to the database" ) - print(f"Finished {repo.name}") - - print("Distributed theorems with no proofs") - for repo in categorized_theorems: - print(f"Starting {repo.name}") - to_distribute = categorized_theorems[repo]["To_Distribute"] - chunk_size = len(to_distribute) // 3 - for i, category in enumerate(["Easy", "Medium", "Hard"]): - start = i * chunk_size - end = start + chunk_size if i < 2 else None - categorized_theorems[repo][category].extend(to_distribute[start:end]) - del categorized_theorems[repo]["To_Distribute"] - print(f"Finished {repo.name}") - - # Sort repositories based on the number of easy theorems - sorted_repos = sorted( - categorized_theorems.keys(), - key=lambda r: len(categorized_theorems[r]["Easy"]), - reverse=True, - ) - - return sorted_repos, categorized_theorems, percentiles - - -def save_sorted_repos(sorted_repos: List[Repository], file_path: str): - """Saves the sorted repositories to a file.""" - sorted_repo_data = [ - {"url": repo.url, "commit": repo.commit, "name": repo.name} - for repo in sorted_repos - ] - with open(file_path, "w") as f: - json.dump(sorted_repo_data, f, indent=2) - - -def load_sorted_repos(file_path: str) -> List[Tuple[str, str, str]]: - """Loads the sorted repositories from a file.""" - with open(file_path, "r") as f: - sorted_repo_data = json.load(f) - return [(repo["url"], repo["commit"], repo["name"]) for repo in sorted_repo_data] + sorted_repos, categorized_theorems, percentiles = ( + sort_repositories_by_difficulty(db) + ) + + print("Sorted repositories. 
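The ordering being saved above rates each theorem as exp(number of traced tactic steps), marks proofs that still contain sorry as infinitely hard, leaves theorems with no proof unrated, and then buckets everything at the 33rd and 67th percentiles of the rated values. A compressed sketch of that scheme; the step counts at the bottom are illustrative, not real traced data:

```python
import math

import numpy as np


def difficulty(num_tactics: int, has_sorry: bool):
    """exp(#steps); infinite if the proof still contains sorry; None if there is no proof yet."""
    if has_sorry:
        return float("inf")
    if num_tactics == 0:
        return None
    return math.exp(num_tactics)


def bucket(d, easy_cut, medium_cut):
    if d is None:
        return "To_Distribute"   # split evenly across Easy/Medium/Hard afterwards
    if d == float("inf"):
        return "Hard (No proof)"
    if d <= easy_cut:
        return "Easy"
    if d <= medium_cut:
        return "Medium"
    return "Hard"


scores = [difficulty(n, False) for n in (1, 2, 3, 5, 8)]   # illustrative step counts
easy_cut, medium_cut = np.percentile(scores, [33, 67])
print(bucket(scores[0], easy_cut, medium_cut))             # -> "Easy"
```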
Saving now...") + db.to_json(dynamic_database_json_path) + save_sorted_repos(sorted_repos, "sorted_repos.json") + + print("Summary of theorem difficulties by URL:") + for repo in sorted_repos: + print(f"\nURL: {repo.url}") + for category in ["Easy", "Medium", "Hard", "Hard (No proof)"]: + theorems = categorized_theorems[repo][category] + print(f" {category}: {len(theorems)} theorems") + if theorems: + sorted_theorems = sorted( + theorems, + key=lambda x: ( + x[2] if x[2] is not None else -float("inf") + ), + reverse=True, + )[:3] + for name, path, _start, _end, diff in sorted_theorems: + diff_str = f"{diff:.2f}" if diff is not None else "N/A" + print( + f" - {name} (File: {path}, Difficulty: {diff_str})" + ) -def write_skip_file(repo_url): - """Writes a repository URL to a file to skip it.""" - skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") - with open(skip_file_path, "w") as f: - f.write(repo_url) + print("\nOverall Statistics:") + total_theorems = sum( + len(theorems) + for categories in categorized_theorems.values() + for theorems in categories.values() + ) + for category in ["Easy", "Medium", "Hard", "Hard (No proof)"]: + count = sum( + len(categories[category]) + for categories in categorized_theorems.values() + ) + percentage = (count / total_theorems) * 100 + print(f"{category}: {count} theorems ({percentage:.2f}%)") + print( + f"\nPercentile thresholds: Easy <= {percentiles[0]:.2f}, Medium <= {percentiles[1]:.2f}, Hard > {percentiles[1]:.2f}" + ) -def should_skip_repo(): - """Checks if a repository should be skipped.""" - skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") - if os.path.exists(skip_file_path): - with open(skip_file_path, "r") as f: - repo_url = f.read().strip() - return True, repo_url - return False, None + logger.info("Finding compatible repositories...") + updated_repos = find_and_save_compatible_commits(repo_info_file, sorted_repos) + lean_git_repos = [LeanGitRepo(repo["url"], repo["commit"]) for repo in updated_repos] + logger.info("Finished finding compatible repositories") + else: + logger.info("Starting without curriculum learning") + if is_main_process: + lean_git_repos, repos = search_github_repositories(lean_git_repos, repos, "lean", num_repos) + for i in range(len(lean_git_repos)): + repo = lean_git_repos[i] + logger.info(f"Processing {repo.url}") + result = add_repo_to_database(dynamic_database_json_path, repo, db) + if result is not None: + logger.info(f"Successfully added repo {repo.url}") + + logger.info(f"Successfully added {num_repos} repositories to the database") + + logger.info("Finding compatible repositories...") + updated_repos = find_and_save_compatible_commits(repo_info_file, lean_git_repos) + lean_git_repos = [LeanGitRepo(repo["url"], repo["commit"]) for repo in updated_repos] + logger.info("Finished finding compatible repositories") + + return lean_git_repos, repos, updated_repos def main(): """ @@ -1075,6 +499,7 @@ def main(): global repos_for_merged_dataset global repos_for_proving global lean_git_repos + global repos try: current_epoch = 0 epochs_per_repo = 1 @@ -1083,7 +508,7 @@ def main(): single_repo = True curriculum_learning = True num_repos = 1 - dynamic_database_json_path = RAID_DIR + "/" + DB_FILE_NAME + dynamic_database_json_path = os.path.join(RAID_DIR, DB_FILE_NAME) lambdas = None if run_progressive_training: @@ -1098,131 +523,14 @@ def main(): generate_benchmark_lean4.configure_leandojo() logger.info("LeanDojo configured") - # Check if the current process is the main one - is_main_process = 
int(os.environ.get("LOCAL_RANK", "0")) == 0 - - # Initialize the database if it doesn't exist or is empty - if is_main_process: - logger.info("Starting the main process") - if ( - not os.path.exists(dynamic_database_json_path) - or os.path.getsize(dynamic_database_json_path) == 0 - ): - # File doesn't exist or is empty, initialize it - logger.info( - f"Initializing new database at {dynamic_database_json_path}" - ) - db = DynamicDatabase() - db.to_json(dynamic_database_json_path) - else: - try: - logger.info(f"Loading database from {dynamic_database_json_path}") - db = DynamicDatabase.from_json(dynamic_database_json_path) - logger.info(f"Loaded database from {dynamic_database_json_path}") - except json.JSONDecodeError: - # If there's an error decoding the JSON, initialize a new database - logger.warning( - f"Error decoding JSON from {dynamic_database_json_path}. Initializing new database." - ) - db = DynamicDatabase() - db.to_json(dynamic_database_json_path) - + db = initialize_database(dynamic_database_json_path) logger.info(f"Found {num_repos} repositories") - # If curriculum learning is enabled, initialize repositories and sort them by difficulty - if curriculum_learning: - logger.info("Starting curriculum learning") - repo_info_file = f"{RAID_DIR}/{DATA_DIR}/repo_info_compatible.json" - if is_main_process: - search_github_repositories("Lean", num_repos) - for i in range(len(lean_git_repos)): - repo = lean_git_repos[i] - logger.info(f"Processing {repo.url}") - result = add_repo_to_database(dynamic_database_json_path, repo, db) - if result is not None: - logger.info(f"Successfully added repo {repo.url}") - logger.info( - f"Successfully added {num_repos} repositories to the database" - ) - - sorted_repos, categorized_theorems, percentiles = ( - sort_repositories_by_difficulty(db) - ) - print("Sorted repositories. 
Saving now...") - db.to_json(dynamic_database_json_path) - save_sorted_repos(sorted_repos, "sorted_repos.json") - print("Summary of theorem difficulties by URL:") - for repo in sorted_repos: - print(f"\nURL: {repo.url}") - for category in ["Easy", "Medium", "Hard", "Hard (No proof)"]: - theorems = categorized_theorems[repo][category] - print(f" {category}: {len(theorems)} theorems") - if theorems: - sorted_theorems = sorted( - theorems, - key=lambda x: ( - x[2] if x[2] is not None else -float("inf") - ), - reverse=True, - )[:3] - for name, path, start, end, diff in sorted_theorems: - diff_str = f"{diff:.2f}" if diff is not None else "N/A" - print( - f" - {name} (File: {path}, Difficulty: {diff_str})" - ) - - print("\nOverall Statistics:") - total_theorems = sum( - len(theorems) - for categories in categorized_theorems.values() - for theorems in categories.values() - ) - for category in ["Easy", "Medium", "Hard", "Hard (No proof)"]: - count = sum( - len(categories[category]) - for categories in categorized_theorems.values() - ) - percentage = (count / total_theorems) * 100 - print(f"{category}: {count} theorems ({percentage:.2f}%)") - - print( - f"\nPercentile thresholds: Easy <= {percentiles[0]:.2f}, Medium <= {percentiles[1]:.2f}, Hard > {percentiles[1]:.2f}" - ) - - logger.info("Finding compatible repositories...") - updated_repos = find_and_save_compatible_commits( - repo_info_file, sorted_repos - ) - lean_git_repos = [ - LeanGitRepo(repo["url"], repo["commit"]) for repo in updated_repos - ] - logger.info("Finished finding compatible repositories") - else: - logger.info("Starting without curriculum learning") - repo_info_file = f"{RAID_DIR}/{DATA_DIR}/repo_info_compatible.json" - if is_main_process: - search_github_repositories("Lean", num_repos) - - for i in range(len(lean_git_repos)): - repo = lean_git_repos[i] - logger.info(f"Processing {repo.url}") - result = add_repo_to_database(dynamic_database_json_path, repo, db) - if result is not None: - logger.info(f"Successfully added repo {repo.url}") - logger.info( - f"Successfully added {num_repos} repositories to the database" - ) - - logger.info("Finding compatible repositories...") - updated_repos = find_and_save_compatible_commits( - repo_info_file, lean_git_repos - ) - lean_git_repos = [ - LeanGitRepo(repo["url"], repo["commit"]) for repo in updated_repos - ] - logger.info("Finished finding compatible repositories") + lean_git_repos, repos, updated_repos = get_repos(curriculum_learning, num_repos, dynamic_database_json_path, db) + repo_info_file = os.path.join(RAID_DIR, DATA_DIR, "repo_info_compatible.json") # All processes wait for the file to be created and then read from it + # TODO: Fix with a semaphore or file lock max_attempts = 30 for attempt in range(max_attempts): try: @@ -1242,6 +550,8 @@ def main(): for info in repo_info ] + is_main_process = int(os.environ.get("LOCAL_RANK", "0")) == 0 + # Iterate over each repository and lambda value for i in range(num_repos): for lambda_value in lambdas: diff --git a/run_leanagent.sh b/run_leanagent.sh old mode 100644 new mode 100755 index 180ff70..e127b50 --- a/run_leanagent.sh +++ b/run_leanagent.sh @@ -26,15 +26,17 @@ #!/bin/bash export RAID_DIR="~/Desktop/LeanAgent/RAID/" export LEAN_AGENT_DIR="~/Desktop/LeanAgent" +export PYTHONPATH="${PYTHONPATH}:${RAID_DIR}/LeanAgent" +export CACHE_DIR="${RAID_DIR}/.cache/lean_dojo" +export RAY_TMPDIR="${RAID_DIR}/tmp" +export CONDA_SH="/Users/motiwari/miniforge3/etc/profile.d/conda.sh" +source ${CONDA_SH} + cd ${LEAN_AGENT_DIR} echo "Script 
executed from: ${PWD}" -source /Users/motiwari/miniforge3/etc/profile.d/conda.sh conda activate LeanAgent -export PYTHONPATH="${PYTHONPATH}:${RAID_DIR}/LeanAgent" -export CACHE_DIR="${RAID_DIR}/.cache/lean_dojo" echo "Removing old cache files" rm -rf /tmp/ray -export RAY_TMPDIR="${RAID_DIR}/tmp" rm -rf ${RAY_TMPDIR} mkdir "${RAY_TMPDIR}" echo "Stopping ray" From ef397bef8acfdf9086d8411e268c822beaebcca1 Mon Sep 17 00:00:00 2001 From: motiwari Date: Mon, 25 Aug 2025 14:07:53 -0700 Subject: [PATCH 05/29] Querying commit hashes --- git_utils.py | 56 +++++++++++++++++++--------------------------------- 1 file changed, 20 insertions(+), 36 deletions(-) diff --git a/git_utils.py b/git_utils.py index 6d56fb4..5cd4630 100644 --- a/git_utils.py +++ b/git_utils.py @@ -16,7 +16,7 @@ import math import os -from constants import known_repositories, known_dead_repos, PR_TITLE, PR_BODY, TMP_BRANCH, COMMIT_MESSAGE +from constants import known_repositories, PR_TITLE, PR_BODY, TMP_BRANCH, COMMIT_MESSAGE personal_access_token = os.environ.get("GITHUB_ACCESS_TOKEN") BATCH_SIZE = 4 @@ -141,22 +141,10 @@ def create_pull_request(repo_full_name, title, body, head_branch): print("Failed to create pull request", response.text) return "" -def ensure_inside_git(): - """Ensure that the current directory is inside a git repository.""" - try: - subprocess.run( - ["git", "rev-parse", "--is-inside-work-tree"], - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - logger.info("Already in a Git repository") - except subprocess.CalledProcessError: - logger.info("Not in a Git repository. Initializing one.") - subprocess.run(["git", "init"], check=True) - + def get_compatible_commit(url): """Find the most recent commit with a Lean version that LeanAgent supports.""" + import ipdb; ipdb.set_trace() try: process = subprocess.Popen(["git", "ls-remote", url], stdout=subprocess.PIPE) stdout, stderr = process.communicate() @@ -177,9 +165,18 @@ def get_compatible_commit(url): return latest_commit, v logger.info(f"Searching for compatible commit for {url}") - - ensure_inside_git() - ZZ + try: + subprocess.run( + ["git", "rev-parse", "--is-inside-work-tree"], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + logger.info("Already in a Git repository") + except subprocess.CalledProcessError: + logger.info("Not in a Git repository. 
Initializing one.") + subprocess.run(["git", "init"], check=True) + process = subprocess.Popen( ["git", "fetch", "--depth=1000000", url], # Fetch commits stdout=subprocess.PIPE, @@ -209,21 +206,10 @@ def get_compatible_commit(url): logger.info(f"Found {len(commits)} commits for {url}") new_url = url.replace(".git", "") - - repo_human_name = "/".join(new_url.split("/")[-2:]) - - # Delete repo if it exists, because it might be checked out to a different commit - if os.path.exists(os.path.join("repos", repo_human_name)): - shutil.rmtree(os.path.join("repos", repo_human_name)) - - subprocess.run(["git", "clone", url, os.path.join("repos", repo_human_name)], check=True) - for commit in commits: - logger.info(f"Checking commit {commit} for {url}") - # Check out the commit locally - subprocess.run(["git", "-C", os.path.join("repos", repo_human_name), "checkout", commit], check=True) - import ipdb; ipdb.set_trace() - repo = LeanGitRepo.from_path(os.path.join(os.getcwd(), "repos", repo_human_name), commit) + + + repo = LeanGitRepo(new_url, commit) config = repo.get_config("lean-toolchain") v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) if generate_benchmark_lean4.is_supported_version(v): @@ -239,6 +225,7 @@ def get_compatible_commit(url): def find_and_save_compatible_commits(repo_info_file, lean_git_repos): """Finds compatible commits for various repositories""" + import ipdb; ipdb.set_trace() with open(repo_info_file, "r") as repo_compatibility_file: updated_repos = json.loads(repo_compatibility_file) @@ -317,7 +304,7 @@ def search_github_repositories(lean_git_repos, repos, language="Lean", num_repos # Skip repos that are already known - if repo_full_name not in known_repositories + known_dead_repos + repos: + if repo_full_name not in known_repositories: print("\n\n") logger.info(f"Processing new repo: {repo_full_name}") name = None @@ -326,10 +313,7 @@ def search_github_repositories(lean_git_repos, repos, language="Lean", num_repos repo_name, sha = clone_repo(clone_url) name = repo_name url = clone_url.replace(".git", "") - - # TODO: This constructor can be very slow lean_git_repo = LeanGitRepo(url, sha) - lean_git_repos.append(lean_git_repo) repos.append(repo_full_name) cloned_count += 1 From b1d5280f77da3cedc03e4319ce255c83ff3c6070 Mon Sep 17 00:00:00 2001 From: motiwari Date: Mon, 25 Aug 2025 14:08:17 -0700 Subject: [PATCH 06/29] Revert --- git_utils.py | 56 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/git_utils.py b/git_utils.py index 5cd4630..6d56fb4 100644 --- a/git_utils.py +++ b/git_utils.py @@ -16,7 +16,7 @@ import math import os -from constants import known_repositories, PR_TITLE, PR_BODY, TMP_BRANCH, COMMIT_MESSAGE +from constants import known_repositories, known_dead_repos, PR_TITLE, PR_BODY, TMP_BRANCH, COMMIT_MESSAGE personal_access_token = os.environ.get("GITHUB_ACCESS_TOKEN") BATCH_SIZE = 4 @@ -141,10 +141,22 @@ def create_pull_request(repo_full_name, title, body, head_branch): print("Failed to create pull request", response.text) return "" - +def ensure_inside_git(): + """Ensure that the current directory is inside a git repository.""" + try: + subprocess.run( + ["git", "rev-parse", "--is-inside-work-tree"], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + logger.info("Already in a Git repository") + except subprocess.CalledProcessError: + logger.info("Not in a Git repository. 
Initializing one.") + subprocess.run(["git", "init"], check=True) + def get_compatible_commit(url): """Find the most recent commit with a Lean version that LeanAgent supports.""" - import ipdb; ipdb.set_trace() try: process = subprocess.Popen(["git", "ls-remote", url], stdout=subprocess.PIPE) stdout, stderr = process.communicate() @@ -165,18 +177,9 @@ def get_compatible_commit(url): return latest_commit, v logger.info(f"Searching for compatible commit for {url}") - try: - subprocess.run( - ["git", "rev-parse", "--is-inside-work-tree"], - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - logger.info("Already in a Git repository") - except subprocess.CalledProcessError: - logger.info("Not in a Git repository. Initializing one.") - subprocess.run(["git", "init"], check=True) - + + ensure_inside_git() + ZZ process = subprocess.Popen( ["git", "fetch", "--depth=1000000", url], # Fetch commits stdout=subprocess.PIPE, @@ -206,10 +209,21 @@ def get_compatible_commit(url): logger.info(f"Found {len(commits)} commits for {url}") new_url = url.replace(".git", "") + + repo_human_name = "/".join(new_url.split("/")[-2:]) + + # Delete repo if it exists, because it might be checked out to a different commit + if os.path.exists(os.path.join("repos", repo_human_name)): + shutil.rmtree(os.path.join("repos", repo_human_name)) + + subprocess.run(["git", "clone", url, os.path.join("repos", repo_human_name)], check=True) + for commit in commits: - - - repo = LeanGitRepo(new_url, commit) + logger.info(f"Checking commit {commit} for {url}") + # Check out the commit locally + subprocess.run(["git", "-C", os.path.join("repos", repo_human_name), "checkout", commit], check=True) + import ipdb; ipdb.set_trace() + repo = LeanGitRepo.from_path(os.path.join(os.getcwd(), "repos", repo_human_name), commit) config = repo.get_config("lean-toolchain") v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) if generate_benchmark_lean4.is_supported_version(v): @@ -225,7 +239,6 @@ def get_compatible_commit(url): def find_and_save_compatible_commits(repo_info_file, lean_git_repos): """Finds compatible commits for various repositories""" - import ipdb; ipdb.set_trace() with open(repo_info_file, "r") as repo_compatibility_file: updated_repos = json.loads(repo_compatibility_file) @@ -304,7 +317,7 @@ def search_github_repositories(lean_git_repos, repos, language="Lean", num_repos # Skip repos that are already known - if repo_full_name not in known_repositories: + if repo_full_name not in known_repositories + known_dead_repos + repos: print("\n\n") logger.info(f"Processing new repo: {repo_full_name}") name = None @@ -313,7 +326,10 @@ def search_github_repositories(lean_git_repos, repos, language="Lean", num_repos repo_name, sha = clone_repo(clone_url) name = repo_name url = clone_url.replace(".git", "") + + # TODO: This constructor can be very slow lean_git_repo = LeanGitRepo(url, sha) + lean_git_repos.append(lean_git_repo) repos.append(repo_full_name) cloned_count += 1 From 8bf2234a2633664769e0a65e33ac30c95a8c4bf2 Mon Sep 17 00:00:00 2001 From: motiwari Date: Mon, 25 Aug 2025 15:33:37 -0700 Subject: [PATCH 07/29] organization --- compute_fisher.py | 1 - constants.py | 7 +- custom_traced_data.py | 6 +- custom_utils.py | 3 +- filenames.py | 12 ++ git_utils.py | 277 ++++++++++++++++++++---------------------- leanagent.py | 38 +++--- leanagent_utils.py | 4 +- 8 files changed, 165 insertions(+), 183 deletions(-) create mode 100644 filenames.py diff --git a/compute_fisher.py b/compute_fisher.py 
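This patch also introduces filenames.py and moves path construction there, so call sites such as compute_fisher.py stop prepending RAID_DIR themselves. A condensed sketch of the pattern with only the Fisher paths shown; the "." fallback is an assumption, the real module reads RAID_DIR from the environment without a default:

```python
import os

# filenames.py-style module: every path is rooted at RAID_DIR exactly once.
RAID_DIR = os.environ.get("RAID_DIR", ".")   # assumption: fall back to the working directory
FISHER_DIR = os.path.join(RAID_DIR, "fisher")


def fisher_path(new_data_path: str) -> str:
    """Mirror of the path compute_fisher.py builds for one dataset's Fisher information."""
    name = os.path.basename(new_data_path.rstrip("/"))
    return os.path.join(FISHER_DIR, f"fisher_info_{name}_distributed.pkl")
```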
index 56b32fe..0ef43fa 100644 --- a/compute_fisher.py +++ b/compute_fisher.py @@ -96,7 +96,6 @@ def main(): # Save the FIM if needed if fisher_trainer.is_global_zero: fisher_file_path = os.path.join( - RAID_DIR, FISHER_DIR, f"fisher_info_{new_data_path.split('/')[-1]}_distributed.pkl", ) diff --git a/constants.py b/constants.py index 01f0ad4..b87e291 100644 --- a/constants.py +++ b/constants.py @@ -1,5 +1,4 @@ PR_TITLE = "[LeanAgent] Proofs" - PR_BODY = """ [LeanAgent](https://arxiv.org/abs/2410.06209) discovers a proof for a theorem with the `sorry` keyword. @@ -7,11 +6,13 @@ ~LeanAgent - From the [LeanDojo](https://leandojo.org/) family """ - TMP_BRANCH = "_LeanAgent" - COMMIT_MESSAGE = "[LeanAgent] Proofs" + +MARK_START_SYMBOL = "" +MARK_END_SYMBOL = "" + # List of known repositories to process or skip # Feel free to remove any repos from this list if you would like to test on them diff --git a/custom_traced_data.py b/custom_traced_data.py index ee61c83..18fa8c5 100644 --- a/custom_traced_data.py +++ b/custom_traced_data.py @@ -15,10 +15,8 @@ from lxml import etree from tqdm import tqdm -from ..constants import (LEAN4_PACKAGES_DIR, LOAD_USED_PACKAGES_ONLY, - NUM_WORKERS) -from ..utils import (compute_md5, is_git_repo, to_dep_path, to_json_path, - to_lean_path, to_xml_path) +from ..constants import (LEAN4_PACKAGES_DIR, LOAD_USED_PACKAGES_ONLY, NUM_WORKERS) +from ..utils import (compute_md5, is_git_repo, to_dep_path, to_json_path, to_lean_path, to_xml_path) from .ast import * from .lean import LeanFile, LeanGitRepo, Pos, Theorem diff --git a/custom_utils.py b/custom_utils.py index 181e9ac..33fd56d 100644 --- a/custom_utils.py +++ b/custom_utils.py @@ -15,8 +15,7 @@ from loguru import logger -from .constants import (LEAN4_BUILD_DIR, LEAN4_PACKAGES_DIR, NUM_WORKERS, - TMP_DIR) +from .constants import LEAN4_BUILD_DIR, LEAN4_PACKAGES_DIR, TMP_DIR @contextmanager diff --git a/filenames.py b/filenames.py new file mode 100644 index 0000000..8f7cfc4 --- /dev/null +++ b/filenames.py @@ -0,0 +1,12 @@ +import os + +RAID_DIR = os.environ.get("RAID_DIR") +os.environ["RAY_TMPDIR"] = os.path.join(RAID_DIR, "tmp") +REPO_DIR = os.path.join(RAID_DIR, "repos") +DATA_DIR = os.path.join(RAID_DIR, "data") +CHECKPOINT_DIR = os.path.join(RAID_DIR, "checkpoints") +EVAL_RESULTS_FILE_PATH = os.path.join(RAID_DIR, "eval_results.txt") +DB_FILE_NAME = "db_file.txt" +PROOF_LOG_FILE_NAME = os.path.join(RAID_DIR, "proof_log.txt") +ENCOUNTERED_THEOREMS_FILE = os.path.join(RAID_DIR, "encountered_theorems.pkl") +FISHER_DIR = os.path.join(RAID_DIR, "fisher") # Optional \ No newline at end of file diff --git a/git_utils.py b/git_utils.py index 6d56fb4..2c793ea 100644 --- a/git_utils.py +++ b/git_utils.py @@ -20,17 +20,7 @@ personal_access_token = os.environ.get("GITHUB_ACCESS_TOKEN") BATCH_SIZE = 4 -RAID_DIR = os.environ.get("RAID_DIR") -os.environ["RAY_TMPDIR"] = f"{RAID_DIR}/tmp" -repo_dir = f"{RAID_DIR}/repos_new" - -DATA_DIR = f"{RAID_DIR}/data" -CHECKPOINT_DIR = f"{RAID_DIR}/checkpoints" -EVAL_RESULTS_FILE_PATH = f"{RAID_DIR}/eval_results.txt" -DB_FILE_NAME = "db_file.txt" -PROOF_LOG_FILE_NAME = f"{RAID_DIR}/proof_log.txt" -ENCOUNTERED_THEOREMS_FILE = f"{RAID_DIR}/encountered_theorems.pkl" -FISHER_DIR = f"{RAID_DIR}/fisher" # Optional +from filenames import REPO_DIR, DATA_DIR def clone_repo(repo_url): @@ -39,7 +29,8 @@ def clone_repo(repo_url): repo_name = "/".join(repo_url.split("/")[-2:]).replace(".git", "") logger.info(f"Cloning {repo_url}") logger.info(f"Repo name: {repo_name}") - repo_name = os.path.join(repo_dir, 
repo_name) + repo_name = os.path.join(REPO_DIR, repo_name) + if os.path.exists(repo_name): print(f"Deleting existing repository directory: {repo_name}") shutil.rmtree(repo_name) @@ -54,7 +45,7 @@ def clone_repo(repo_url): def branch_exists(repo_name, branch_name): """Check if a branch exists in a git repository.""" proc = subprocess.run( - ["git", "-C", repo_name, "branch", "-a"], capture_output=True, text=True + ["git", "-C", repo_name, "branch", "-a"], stdout=subprocess.PIPE, text=True ) branches = proc.stdout.split("\n") local_branch = branch_name @@ -157,130 +148,125 @@ def ensure_inside_git(): def get_compatible_commit(url): """Find the most recent commit with a Lean version that LeanAgent supports.""" - try: - process = subprocess.Popen(["git", "ls-remote", url], stdout=subprocess.PIPE) - stdout, stderr = process.communicate() - latest_commit = re.split(r"\t+", stdout.decode("utf-8"))[0] - logger.info(f"Latest commit: {latest_commit}") - - new_url = url.replace(".git", "") - logger.info(f"Creating LeanGitRepo for {new_url}") - - repo = LeanGitRepo(new_url, latest_commit) - logger.info(f"Getting config for {url}") - - config = repo.get_config("lean-toolchain") - v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) - - if generate_benchmark_lean4.is_supported_version(v): - logger.info(f"Latest commit compatible for url {url}") - return latest_commit, v - - logger.info(f"Searching for compatible commit for {url}") - - ensure_inside_git() - ZZ - process = subprocess.Popen( - ["git", "fetch", "--depth=1000000", url], # Fetch commits - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - logger.info(f"Fetching commits for {url}") - _, stderr = process.communicate() - - if process.returncode != 0: - raise Exception(f"Git fetch command failed: {stderr.decode('utf-8')}") - - logger.info(f"Fetched commits for {url}") - - process = subprocess.Popen( - ["git", "log", "--format=%H", "FETCH_HEAD"], # Get list of commits - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - logger.info(f"Getting list of commits for {url}") - - stdout, stderr = process.communicate() - if process.returncode != 0: - raise Exception(f"Git log command failed: {stderr.decode('utf-8')}") - - commits = stdout.decode("utf-8").strip().split("\n") - logger.info(f"Found {len(commits)} commits for {url}") - - new_url = url.replace(".git", "") - - repo_human_name = "/".join(new_url.split("/")[-2:]) - - # Delete repo if it exists, because it might be checked out to a different commit - if os.path.exists(os.path.join("repos", repo_human_name)): - shutil.rmtree(os.path.join("repos", repo_human_name)) + if "mathlib4" in url or "SciLean" in url or "pfr" in url: + if "mathlib4" in url: + sha = "2b29e73438e240a427bcecc7c0fe19306beb1310" + v = "v4.8.0" + elif "SciLean" in url: + sha = "22d53b2f4e3db2a172e71da6eb9c916e62655744" + v = "v4.7.0" + elif "pfr" in url: + sha = "fa398a5b853c7e94e3294c45e50c6aee013a2687" + v = "v4.8.0-rc1" + return sha, v + else: + with open(os.path.join("RAID", "data", "repo_info_compatible.json"), "r") as f: + try: + repos_and_compatible_commits = json.load(f) + except json.JSONDecodeError: + repos_and_compatible_commits = [] - subprocess.run(["git", "clone", url, os.path.join("repos", repo_human_name)], check=True) - - for commit in commits: - logger.info(f"Checking commit {commit} for {url}") - # Check out the commit locally - subprocess.run(["git", "-C", os.path.join("repos", repo_human_name), "checkout", commit], check=True) - import ipdb; ipdb.set_trace() - repo = 
LeanGitRepo.from_path(os.path.join(os.getcwd(), "repos", repo_human_name), commit) + if url in [repo["url"] + ".git" for repo in repos_and_compatible_commits if repo["commit"]]: + logger.info(f"Repository {url} already has a compatible commit.") + return None, None + + try: + process = subprocess.Popen(["git", "ls-remote", url], stdout=subprocess.PIPE) + stdout, stderr = process.communicate() + latest_commit = re.split(r"\t+", stdout.decode("utf-8"))[0] + logger.info(f"Latest commit: {latest_commit}") + + new_url = url.replace(".git", "") + logger.info(f"Creating LeanGitRepo for {new_url}") + + repo = LeanGitRepo(new_url, latest_commit) + logger.info(f"Getting config for {url}") + config = repo.get_config("lean-toolchain") v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) + if generate_benchmark_lean4.is_supported_version(v): - logger.info(f"Found compatible commit {commit} for {url}") - return commit, v - - raise Exception("No compatible commit found") - - except Exception as e: - logger.info(f"Error in get_compatible_commit: {str(e)}") - return None, None - - -def find_and_save_compatible_commits(repo_info_file, lean_git_repos): - """Finds compatible commits for various repositories""" - with open(repo_info_file, "r") as repo_compatibility_file: - updated_repos = json.loads(repo_compatibility_file) - - for repo in lean_git_repos: - url = repo.url - if not url.endswith(".git"): - url = url + ".git" + logger.info(f"Latest commit compatible for url {url}") + return latest_commit, v - sha = None - v = None + logger.info(f"Searching for compatible commit for {url}") - # TODO: Check these - if "mathlib4" in url: - sha = "2b29e73438e240a427bcecc7c0fe19306beb1310" - v = "v4.8.0" - elif "SciLean" in url: - sha = "22d53b2f4e3db2a172e71da6eb9c916e62655744" - v = "v4.7.0" - elif "pfr" in url: - sha = "fa398a5b853c7e94e3294c45e50c6aee013a2687" - v = "v4.8.0-rc1" - else: - # Check if it's in any element - for elem in updated_repos: - if url.replace(".git", "") == elem["url"]: - continue - - sha, v = get_compatible_commit(url) + ensure_inside_git() + process = subprocess.Popen( + ["git", "fetch", "--depth=1000000", url], # Fetch commits + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) - - # Always write to json, even for null repos - updated_repos.append( - {"url": url.replace(".git", ""), "commit": sha if sha else None, "version": v if v else None} + logger.info(f"Fetching commits for {url}") + _, stderr = process.communicate() + + if process.returncode != 0: + raise Exception(f"Git fetch command failed: {stderr.decode('utf-8')}") + + logger.info(f"Fetched commits for {url}") + + process = subprocess.Popen( + ["git", "log", "--format=%H", "FETCH_HEAD"], # Get list of commits + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, ) - if not sha: - logger.info(f"Failed to find a compatible commit for {url}") + logger.info(f"Getting list of commits for {url}") + + stdout, stderr = process.communicate() + if process.returncode != 0: + raise Exception(f"Git log command failed: {stderr.decode('utf-8')}") + + commits = stdout.decode("utf-8").strip().split("\n") + logger.info(f"Found {len(commits)} commits for {url}") + + new_url = url.replace(".git", "") + + repo_human_name = "/".join(new_url.split("/")[-2:]) + + # Delete repo if it exists, because it might be checked out to a different commit + if os.path.exists(os.path.join("repos", repo_human_name)): + shutil.rmtree(os.path.join("repos", repo_human_name)) + + subprocess.run(["git", "clone", url, os.path.join("repos", 
repo_human_name)], check=True) + for commit in commits: + logger.info(f"Checking commit {commit} for {url}") + # Check out the commit locally + subprocess.run(["git", "-C", os.path.join("repos", repo_human_name), "checkout", commit], capture_output=False, check=True) + + # Check the lean-toolchain file manually, avoid calling LeanGitRepo because it makes a lot of web requests + with open(os.path.join("repos", repo_human_name, "lean-toolchain"), "r") as f: + config_content = f.read() + + v = generate_benchmark_lean4.get_lean4_version_from_config(config_content) + if generate_benchmark_lean4.is_supported_version(v): + logger.info(f"Found compatible commit {commit} for {url}") + repos_and_compatible_commits.append({"url": url.replace(".git", ""), "commit": commit, "version": v}) + with open(os.path.join(DATA_DIR, "repo_info_compatible.json"), "w") as f: + json.dump(repos_and_compatible_commits, f, indent=2) + f.flush() + + return commit, v + raise Exception("No compatible commit found") + except Exception as e: + logger.info(f"Error in get_compatible_commit: {str(e)}") + return None, None - # Write per repo in case of interrupt - with open(repo_info_file, "w") as f: - json.dump(updated_repos, f) +def find_and_save_compatible_commits(repo_info_file, lean_git_repos): + """Finds and saves compatible commits for various repositories""" + for repo in lean_git_repos: + url = repo.url + if not url.endswith(".git"): + url = url + ".git" + + # Saves the compatible commit in repo_info_file + _sha, _v = get_compatible_commit(url) + + with open(repo_info_file, "r") as repos_and_compatible_commits_f: + updated_repos = json.load(repos_and_compatible_commits_f) + return updated_repos @@ -289,7 +275,7 @@ def search_github_repositories(lean_git_repos, repos, language="Lean", num_repos headers = {"Authorization": personal_access_token} query_params = { "q": f"language:{language}", - "sort": "stars", + "sort": "stars", # What can this be? 
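The rewritten commit search above checks out each candidate commit in a local clone and reads lean-toolchain straight from disk, which avoids the web requests that constructing a LeanGitRepo per commit would trigger. A minimal sketch of that per-commit check; parse_version and is_supported stand in for the helpers in generate_benchmark_lean4:

```python
import subprocess
from pathlib import Path


def toolchain_at(repo_dir: str, commit: str) -> str:
    """Check out commit in an existing local clone and return its lean-toolchain contents."""
    subprocess.run(["git", "-C", repo_dir, "checkout", commit],
                   check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return (Path(repo_dir) / "lean-toolchain").read_text().strip()


def first_compatible(repo_dir, commits, parse_version, is_supported):
    """Walk commits newest-first and return the first (commit, version) with a supported toolchain."""
    for commit in commits:
        version = parse_version(toolchain_at(repo_dir, commit))
        if is_supported(version):
            return commit, version
    raise RuntimeError("No compatible commit found")
```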
"order": "desc", "per_page": 100, } @@ -362,17 +348,7 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): url = url + ".git" logger.info(f"\n\nProcessing {url}") - if "mathlib4" in url: - sha = "2b29e73438e240a427bcecc7c0fe19306beb1310" - v = "v4.8.0" - elif "SciLean" in url: - sha = "22d53b2f4e3db2a172e71da6eb9c916e62655744" - v = "v4.7.0" - elif "pfr" in url: - sha = "fa398a5b853c7e94e3294c45e50c6aee013a2687" - v = "v4.8.0-rc1" - else: - sha, v = get_compatible_commit(url) + sha, v = get_compatible_commit(url) if not sha: logger.info(f"Failed to find a compatible commit for {url}") @@ -382,25 +358,29 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): url = url.replace(".git", "") repo = LeanGitRepo(url, sha) dir_name = repo.url.split("/")[-1] + "_" + sha - dst_dir = RAID_DIR + "/" + DATA_DIR + "/" + dir_name + dst_dir = os.path.join(DATA_DIR, dir_name) logger.info(f"Generating benchmark at {dst_dir}") traced_repo, _, _, total_theorems = generate_benchmark_lean4.main( repo.url, sha, dst_dir ) + if not traced_repo: logger.info(f"Failed to trace {url}") return None - if total_theorems < 3 * BATCH_SIZE: # Should be enough theorems for train/val/test - logger.info(f"No theorems found in {url}") + + if total_theorems < 3 * BATCH_SIZE: # Require enough theorems for train/val/test + logger.info(f"Not enough theorems found in {url}") return None + logger.info(f"Finished generating benchmark at {dst_dir}") # Add the new repo to the dynamic database config = repo.get_config("lean-toolchain") v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) - theorems_folder = dst_dir + "/random" - premise_files_corpus = dst_dir + "/corpus.jsonl" - files_traced = dst_dir + "/traced_files.jsonl" + theorems_folder = os.path.join(dst_dir, "theorems") + premise_files_corpus = os.path.join(dst_dir, "corpus.jsonl") + files_traced = os.path.join(dst_dir, "traced_files.jsonl") + pr_url = None data = { "url": repo.url, @@ -420,9 +400,12 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): repo = Repository.from_dict(data) logger.info("Before adding new repo:") db.print_database_contents() - db.add_repository(repo) + + logger.info("After adding new repo:") + db.add_repository(repo) db.print_database_contents() + db.to_json(dynamic_database_json_path) return "Done" @@ -532,14 +515,14 @@ def load_sorted_repos(file_path: str) -> List[Tuple[str, str, str]]: def write_skip_file(repo_url): """Writes a repository URL to a file to skip it.""" - skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") + skip_file_path = os.path.join(DATA_DIR, "skip_repo.txt") with open(skip_file_path, "w") as f: f.write(repo_url) def should_skip_repo(): """Checks if a repository should be skipped.""" - skip_file_path = os.path.join(RAID_DIR, DATA_DIR, "skip_repo.txt") + skip_file_path = os.path.join(DATA_DIR, "skip_repo.txt") if os.path.exists(skip_file_path): with open(skip_file_path, "r") as f: repo_url = f.read().strip() diff --git a/leanagent.py b/leanagent.py index 68a3e4d..d29ad0b 100644 --- a/leanagent.py +++ b/leanagent.py @@ -9,7 +9,7 @@ from datetime import datetime, timedelta from pathlib import Path -from typing import List, Optional, Tuple +from typing import List, Optional, Set, Tuple import numpy as np @@ -36,23 +36,14 @@ from retrieval.model import PremiseRetriever from git_utils import find_and_save_compatible_commits, search_github_repositories, should_skip_repo, add_repo_to_database, sort_repositories_by_difficulty, save_sorted_repos +from filenames 
import DATA_DIR, RAID_DIR, CHECKPOINT_DIR, EVAL_RESULTS_FILE_PATH, DB_FILE_NAME, PROOF_LOG_FILE_NAME, ENCOUNTERED_THEOREMS_FILE, FISHER_DIR # Set the seed for reproducibility personal_access_token = os.environ.get("GITHUB_ACCESS_TOKEN") random.seed(3407) # https://arxiv.org/abs/2109.08203 BATCH_SIZE = 4 -RAID_DIR = os.environ.get("RAID_DIR") -os.environ["RAY_TMPDIR"] = f"{RAID_DIR}/tmp" -repo_dir = f"{RAID_DIR}/repos_new" - -DATA_DIR = f"{RAID_DIR}/data" -CHECKPOINT_DIR = f"{RAID_DIR}/checkpoints" -EVAL_RESULTS_FILE_PATH = f"{RAID_DIR}/eval_results.txt" -DB_FILE_NAME = "db_file.txt" -PROOF_LOG_FILE_NAME = f"{RAID_DIR}/proof_log.txt" -ENCOUNTERED_THEOREMS_FILE = f"{RAID_DIR}/encountered_theorems.pkl" -FISHER_DIR = f"{RAID_DIR}/fisher" # Optional + repos_for_merged_dataset = [] repos_for_proving = [] @@ -74,6 +65,7 @@ def _eval(data, preds_map) -> Tuple[float, float, float]: pred = preds_map[key] else: continue + all_pos_premises = set(pred["all_pos_premises"]) if len(all_pos_premises) == 0: continue @@ -111,14 +103,14 @@ def load_fisher_information(file_path): def find_latest_checkpoint(): """Finds the most recent checkpoint.""" - checkpoint_dir = RAID_DIR + "/" + CHECKPOINT_DIR all_checkpoints = [ - os.path.join(checkpoint_dir, f) - for f in os.listdir(checkpoint_dir) + os.path.join(CHECKPOINT_DIR, f) + for f in os.listdir(CHECKPOINT_DIR) if f.endswith(".ckpt") ] - if not all_checkpoints: + if len(all_checkpoints) == 0: raise FileNotFoundError("No checkpoints found.") + latest_checkpoint = max(all_checkpoints, key=os.path.getmtime) logger.info(f"Using the latest checkpoint: {latest_checkpoint}") return latest_checkpoint @@ -126,14 +118,14 @@ def find_latest_checkpoint(): def find_latest_fisher(): """Finds the most recent Fisher Information Matrix.""" - fisher_dir = RAID_DIR + "/" + FISHER_DIR all_fisher = [ - os.path.join(fisher_dir, f) - for f in os.listdir(fisher_dir) + os.path.join(FISHER_DIR, f) + for f in os.listdir(FISHER_DIR) if f.endswith(".pkl") ] - if not all_fisher: + if len(all_fisher) == 0: raise FileNotFoundError("No Fisher Information Matrices found.") + latest_fisher = max(all_fisher, key=os.path.getmtime) logger.info(f"Using the latest Fisher Information Matrix: {latest_fisher}") return latest_fisher @@ -401,7 +393,7 @@ def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_p global lean_git_repos global repos # If curriculum learning is enabled, initialize repositories and sort them by difficulty - repo_info_file = os.path.join(RAID_DIR, DATA_DIR, "repo_info_compatible.json") + repo_info_file = os.path.join(DATA_DIR, "repo_info_compatible.json") # Check if the current process is the main one is_main_process = int(os.environ.get("LOCAL_RANK", "0")) == 0 if curriculum_learning: @@ -528,7 +520,7 @@ def main(): lean_git_repos, repos, updated_repos = get_repos(curriculum_learning, num_repos, dynamic_database_json_path, db) - repo_info_file = os.path.join(RAID_DIR, DATA_DIR, "repo_info_compatible.json") + repo_info_file = os.path.join(DATA_DIR, "repo_info_compatible.json") # All processes wait for the file to be created and then read from it # TODO: Fix with a semaphore or file lock max_attempts = 30 @@ -703,7 +695,7 @@ def main(): if is_main_process: logger.info("Removing skip file") skip_file_path = os.path.join( - RAID_DIR, DATA_DIR, "skip_repo.txt" + DATA_DIR, "skip_repo.txt" ) os.remove(skip_file_path) continue diff --git a/leanagent_utils.py b/leanagent_utils.py index 684b390..381ab8b 100644 --- a/leanagent_utils.py +++ b/leanagent_utils.py @@ -1,6 
+1,4 @@ -MARK_START_SYMBOL = "" -MARK_END_SYMBOL = "" - +from constants import MARK_END_SYMBOL, MARK_START_SYMBOL def remove_marks(s: str) -> str: """Remove all :code:`` and :code:`` from ``s``.""" From 29523db5259111385a86688b5b7d4d16ef47dfc4 Mon Sep 17 00:00:00 2001 From: motiwari Date: Mon, 15 Sep 2025 10:58:52 -0700 Subject: [PATCH 08/29] Trying to update paths so they're created in RAID_DIR/reposM --- constants.py | 6 ++++++ generate_benchmark_lean4.py | 3 +++ git_utils.py | 8 ++++++-- leanagent.py | 8 +++----- run_leanagent.sh | 5 +++-- 5 files changed, 21 insertions(+), 9 deletions(-) diff --git a/constants.py b/constants.py index b87e291..d11b663 100644 --- a/constants.py +++ b/constants.py @@ -229,10 +229,16 @@ "madvorak/chomsky", "T-Brick/ControlFlow", "pa-ba/guarded-lean", + + + ] known_dead_repos = [ "uwdb/Cosette", "notepad-plus-plus/userDefinedLanguages", "teorth/analysis", + + # Added by Mo to find smaller repo to iterate on + ] \ No newline at end of file diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py index 67c182c..f1942f1 100644 --- a/generate_benchmark_lean4.py +++ b/generate_benchmark_lean4.py @@ -551,13 +551,16 @@ def main(url, commit, dst_dir): logger.info("LeanDojo configured") try: + import ipdb; ipdb.set_trace() logger.info("Tracing the repo...") traced_repo = trace(repo) logger.info("Successfully traced the repo") except Exception as e: logger.info(f"Failed to trace repo {repo} because of {e}") return None, 0, 0, 10 + safe_remove_dir(dst_dir) + splits = split_data(traced_repo) logger.info("Successfully split the data") num_premises, num_files_traced, total_theorems = export_data( diff --git a/git_utils.py b/git_utils.py index 2c793ea..cf4c459 100644 --- a/git_utils.py +++ b/git_utils.py @@ -8,9 +8,12 @@ from lean_dojo import LeanGitRepo from datetime import datetime import lean_dojo +from lean_dojo.data_extraction.cache import _split_git_url from collections import defaultdict from dynamic_database import Repository, DynamicDatabase, Theorem + + from loguru import logger from typing import Union, List, Tuple import math @@ -26,7 +29,7 @@ def clone_repo(repo_url): """Clone a git repository and return the path to the repository and its sha.""" # TODO: Fix - repo_name = "/".join(repo_url.split("/")[-2:]).replace(".git", "") + repo_name = os.path.join(*_split_git_url(repo_url)).replace(".git", "") logger.info(f"Cloning {repo_url}") logger.info(f"Repo name: {repo_name}") repo_name = os.path.join(REPO_DIR, repo_name) @@ -168,7 +171,8 @@ def get_compatible_commit(url): if url in [repo["url"] + ".git" for repo in repos_and_compatible_commits if repo["commit"]]: logger.info(f"Repository {url} already has a compatible commit.") - return None, None + repo = [repo for repo in repos_and_compatible_commits if repo["url"] + ".git" == url][0] + return repo["commit"], repo["version"] try: process = subprocess.Popen(["git", "ls-remote", url], stdout=subprocess.PIPE) diff --git a/leanagent.py b/leanagent.py index d29ad0b..509e9bf 100644 --- a/leanagent.py +++ b/leanagent.py @@ -574,9 +574,7 @@ def main(): db.generate_merged_dataset(dst_dir, repos_for_merged_dataset) - dst_dir = ( - RAID_DIR + "/" + DATA_DIR + "/" + f"merged_with_new_{dir_name}" - ) + dst_dir = os.path.join(DATA_DIR, f"merged_with_new_{dir_name}") new_data_path = dst_dir logger.info("All GPUs") @@ -631,7 +629,7 @@ def main(): dir_name = new_data_path.split("/")[-1] filename_suffix = f"_lambda_{lambda_value}" checkpoint_callback = ModelCheckpoint( - dirpath=RAID_DIR + "/" + CHECKPOINT_DIR, + 
dirpath=CHECKPOINT_DIR, filename=dir_name + filename_suffix + "_{epoch}-{Recall@10_val:.2f}", @@ -768,7 +766,7 @@ def main(): logger.info("Testing...") total_R1, total_R10, total_MRR = [], [], [] - dataset_path = RAID_DIR + "/" + DATA_DIR + dataset_path = DATA_DIR testing_paths = [ os.path.join(dataset_path, d) for d in os.listdir(dataset_path) ] diff --git a/run_leanagent.sh b/run_leanagent.sh index e127b50..9cb9aff 100755 --- a/run_leanagent.sh +++ b/run_leanagent.sh @@ -24,10 +24,11 @@ # # Usage: bash run_leanagent.sh #!/bin/bash -export RAID_DIR="~/Desktop/LeanAgent/RAID/" -export LEAN_AGENT_DIR="~/Desktop/LeanAgent" +export RAID_DIR="/Users/motiwari/Desktop/LeanAgent/RAID" +export LEAN_AGENT_DIR="/Users/motiwari/Desktop/LeanAgent" export PYTHONPATH="${PYTHONPATH}:${RAID_DIR}/LeanAgent" export CACHE_DIR="${RAID_DIR}/.cache/lean_dojo" +export REPO_DIR="${RAID_DIR}/repos" export RAY_TMPDIR="${RAID_DIR}/tmp" export CONDA_SH="/Users/motiwari/miniforge3/etc/profile.d/conda.sh" source ${CONDA_SH} From c2a9f0a04b8d75ac81be7763964880bca5dc4b01 Mon Sep 17 00:00:00 2001 From: motiwari Date: Thu, 18 Sep 2025 11:15:47 -0700 Subject: [PATCH 09/29] Updating changes --- .gitignore | 4 ++++ generate_benchmark_lean4.py | 2 -- git_utils.py | 34 +++++++++++++++++++++++++--------- leanagent.py | 7 +++++++ 4 files changed, 36 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index dce68d7..5c87a7b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +workspace +workspace-backup + + *.pkl retrieval/bm25 .idea/ diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py index f1942f1..27056b5 100644 --- a/generate_benchmark_lean4.py +++ b/generate_benchmark_lean4.py @@ -549,9 +549,7 @@ def main(url, commit, dst_dir): logger.info("Configuring LeanDojo again...") configure_leandojo() logger.info("LeanDojo configured") - try: - import ipdb; ipdb.set_trace() logger.info("Tracing the repo...") traced_repo = trace(repo) logger.info("Successfully traced the repo") diff --git a/git_utils.py b/git_utils.py index cf4c459..2be50c3 100644 --- a/git_utils.py +++ b/git_utils.py @@ -28,23 +28,30 @@ def clone_repo(repo_url): """Clone a git repository and return the path to the repository and its sha.""" - # TODO: Fix repo_name = os.path.join(*_split_git_url(repo_url)).replace(".git", "") - logger.info(f"Cloning {repo_url}") + logger.info(f"Repo name: {repo_name}") - repo_name = os.path.join(REPO_DIR, repo_name) + repo_name = os.path.join(REPO_DIR, repo_name) if os.path.exists(repo_name): - print(f"Deleting existing repository directory: {repo_name}") - shutil.rmtree(repo_name) + print(f"Repository already exists in directory: {repo_name}") + process = subprocess.Popen( + ["git", "-C", repo_name, "rev-parse", "HEAD"], stdout=subprocess.PIPE + ) + stdout, _stderr = process.communicate() + else: + logger.info(f"Cloning {repo_url} from scratch") + subprocess.run(["git", "clone", repo_url, repo_name]) + process = subprocess.Popen(["git", "ls-remote", repo_url], stdout=subprocess.PIPE) + stdout, _stderr = process.communicate() - subprocess.run(["git", "clone", repo_url, repo_name]) - process = subprocess.Popen(["git", "ls-remote", repo_url], stdout=subprocess.PIPE) - stdout, _stderr = process.communicate() sha = re.split(r"\t+", stdout.decode("utf-8"))[0] + sha = sha.strip() + print("Sha is " + sha) return repo_name, sha + def branch_exists(repo_name, branch_name): """Check if a branch exists in a git repository.""" proc = subprocess.run( @@ -231,6 +238,7 @@ def get_compatible_commit(url): #
Delete repo if it exists, because it might be checked out to a different commit if os.path.exists(os.path.join("repos", repo_human_name)): + logger.info(f"CAREFUL: Deleting existing repo at {os.path.join('repos', repo_human_name)}") shutil.rmtree(os.path.join("repos", repo_human_name)) subprocess.run(["git", "clone", url, os.path.join("repos", repo_human_name)], check=True) @@ -316,7 +324,6 @@ def search_github_repositories(lean_git_repos, repos, language="Lean", num_repos repo_name, sha = clone_repo(clone_url) name = repo_name url = clone_url.replace(".git", "") - # TODO: This constructor can be very slow lean_git_repo = LeanGitRepo(url, sha) @@ -325,6 +332,7 @@ def search_github_repositories(lean_git_repos, repos, language="Lean", num_repos cloned_count += 1 logger.info(f"Cloned {repo_full_name}") except Exception as e: + logger.info(f"CAREFUL: Deleting existing repo at {os.path.join('repos', repo_full_name)}") shutil.rmtree(name) logger.info(f"Failed to clone {repo_full_name} because of {e}") else: @@ -359,11 +367,19 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): return None logger.info(f"Found compatible commit {sha} for {url} with lean version: {v}") + + # Ensure that the repo is checked out to the compatible commit + repo_name, _ = clone_repo(url) + subprocess.run(["git", "-C", repo_name, "checkout", sha], check=True) + logger.info(f"Checked out {url} to commit {sha}") + + url = url.replace(".git", "") repo = LeanGitRepo(url, sha) dir_name = repo.url.split("/")[-1] + "_" + sha dst_dir = os.path.join(DATA_DIR, dir_name) logger.info(f"Generating benchmark at {dst_dir}") + traced_repo, _, _, total_theorems = generate_benchmark_lean4.main( repo.url, sha, dst_dir ) diff --git a/leanagent.py b/leanagent.py index 509e9bf..de84f97 100644 --- a/leanagent.py +++ b/leanagent.py @@ -399,6 +399,10 @@ def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_p if curriculum_learning: logger.info("Starting curriculum learning") if is_main_process: + if num_repos < 3: + logger.warning("num_repos should be at least 3 for curriculum learning") + + lean_git_repos, repos = search_github_repositories(lean_git_repos, repos, "Lean", num_repos) for i in range(len(lean_git_repos)): @@ -413,6 +417,9 @@ def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_p f"Successfully added {num_repos} repositories to the database" ) + if len(db.repositories) < 3: + raise ValueError("The database should contain at least 3 repositories for curriculum learning") + sorted_repos, categorized_theorems, percentiles = ( sort_repositories_by_difficulty(db) ) From 97dc30308f5aa7272ed61cc803e61b6133badbbc Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 23 Sep 2025 09:37:12 -0400 Subject: [PATCH 10/29] fix: run on CPU-only dev envs; handle empty difficulty list --- .gitignore | 7 +++++++ common.py | 11 +++++++++-- git_utils.py | 4 ++++ prover/proof_search.py | 7 ++++++- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 5c87a7b..4f25a37 100644 --- a/.gitignore +++ b/.gitignore @@ -138,3 +138,10 @@ dmypy.json # Pyre type checker .pyre/ /.lake + +# local artifacts +RAID/ +lean_dojo/ +*.log +.lake/ +**/.wt-lean48* diff --git a/common.py b/common.py index b9eafef..1509193 100644 --- a/common.py +++ b/common.py @@ -10,10 +10,17 @@ import networkx as nx import pytorch_lightning as pl import torch -from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam +try: + from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam +except 
Exception: + DeepSpeedCPUAdam = None + FusedAdam = None from lean_dojo import Pos from loguru import logger -from pytorch_lightning.strategies.deepspeed import DeepSpeedStrategy +try: + from pytorch_lightning.strategies.deepspeed import DeepSpeedStrategy +except Exception: + class DeepSpeedStrategy: ... # placeholder so isinstance checks won't explode from pytorch_lightning.utilities.deepspeed import \ convert_zero_checkpoint_to_fp32_state_dict from transformers import get_cosine_schedule_with_warmup diff --git a/git_utils.py b/git_utils.py index 2be50c3..28a2d32 100644 --- a/git_utils.py +++ b/git_utils.py @@ -480,6 +480,10 @@ def sort_repositories_by_difficulty(db: DynamicDatabase) -> List[Repository]: db.update_repository(repo) print(f"Finished {repo.name}") + if len(all_difficulties) == 0: + from loguru import logger + logger.warning("No theorem difficulties found; skipping difficulty bucketing.") + return [] percentiles = np.percentile(all_difficulties, [33, 67]) categorized_theorems = defaultdict(lambda: defaultdict(list)) diff --git a/prover/proof_search.py b/prover/proof_search.py index dd08fe6..0c80122 100644 --- a/prover/proof_search.py +++ b/prover/proof_search.py @@ -17,7 +17,12 @@ TimeoutError) from loguru import logger from ray.util.actor_pool import ActorPool -from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams +try: + from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams + VLLM_AVAILABLE = True +except Exception: + AsyncEngineArgs = AsyncLLMEngine = RequestOutput = SamplingParams = None + VLLM_AVAILABLE = False from common import zip_strict from generator.model import FixedTacticGenerator, RetrievalAugmentedGenerator From 6066b3acf19e3958b223a156ce3bf75dbf720ee1 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Thu, 23 Oct 2025 23:26:07 -0400 Subject: [PATCH 11/29] Guard repo_info reads with a file lock --- leanagent.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/leanagent.py b/leanagent.py index de84f97..c87e84e 100644 --- a/leanagent.py +++ b/leanagent.py @@ -1,3 +1,4 @@ +import fcntl import json import os import pickle @@ -506,7 +507,7 @@ def main(): use_fisher = False single_repo = True curriculum_learning = True - num_repos = 1 + num_repos = 4 dynamic_database_json_path = os.path.join(RAID_DIR, DB_FILE_NAME) lambdas = None @@ -528,14 +529,18 @@ def main(): lean_git_repos, repos, updated_repos = get_repos(curriculum_learning, num_repos, dynamic_database_json_path, db) repo_info_file = os.path.join(DATA_DIR, "repo_info_compatible.json") - # All processes wait for the file to be created and then read from it - # TODO: Fix with a semaphore or file lock + lock_path = f"{repo_info_file}.lock" max_attempts = 30 for attempt in range(max_attempts): try: - with open(repo_info_file, "r") as f: - repo_info = json.load(f) - break + with open(lock_path, "a") as lock_handle: + fcntl.flock(lock_handle.fileno(), fcntl.LOCK_EX) + try: + with open(repo_info_file, "r") as f: + repo_info = json.load(f) + break + finally: + fcntl.flock(lock_handle.fileno(), fcntl.LOCK_UN) except (json.JSONDecodeError, FileNotFoundError): if attempt == max_attempts - 1: raise Exception( From 8a94e5b036b8a8009b327f2c44abe14641dd89a6 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 28 Oct 2025 19:12:18 -0400 Subject: [PATCH 12/29] Guard JSON access with file lock and adjust dataset handling --- .mplcache/fontlist-v390.json | 4664 ++++++++++++++++++++++++++++++++++ filenames.py | 6 +- git_utils.py | 6 +- leanagent.py | 38 
+- requirements-local.txt | 20 + requirements.cpu.txt | 17 + retrieval/bm25/main.py | 37 +- scripts/manual_trace.py | 76 + testfile_root | 2 + 9 files changed, 4828 insertions(+), 38 deletions(-) create mode 100644 .mplcache/fontlist-v390.json create mode 100644 requirements-local.txt create mode 100644 requirements.cpu.txt create mode 100644 scripts/manual_trace.py create mode 100644 testfile_root diff --git a/.mplcache/fontlist-v390.json b/.mplcache/fontlist-v390.json new file mode 100644 index 0000000..5910b87 --- /dev/null +++ b/.mplcache/fontlist-v390.json @@ -0,0 +1,4664 @@ +{ + "_version": 390, + "_FontManager__default_weight": "normal", + "default_size": null, + "defaultFamily": { + "ttf": "DejaVu Sans", + "afm": "Helvetica" + }, + "afmlist": [ + { + "fname": "fonts/afm/pbkli8a.afm", + "name": "ITC Bookman", + "style": "italic", + "variant": "normal", + "weight": "light", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Courier-Bold.afm", + "name": "Courier", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/cmex10.afm", + "name": "Computer Modern", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvb8a.afm", + "name": "Helvetica", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pbkdi8a.afm", + "name": "ITC Bookman", + "style": "italic", + "variant": "normal", + "weight": "demi", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/ptmri8a.afm", + "name": "Times", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/cmtt10.afm", + "name": "Computer Modern", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvlo8a.afm", + "name": "Helvetica", + "style": "italic", + "variant": "normal", + "weight": "light", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pzcmi8a.afm", + "name": "ITC Zapf Chancery", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/putri8a.afm", + "name": "Utopia", + "style": "italic", + "variant": "normal", + "weight": "regular", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Symbol.afm", + "name": "Symbol", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/ptmbi8a.afm", + "name": "Times", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pagko8a.afm", + "name": "ITC Avant Garde Gothic", + "style": "italic", + "variant": "normal", + "weight": "book", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvro8an.afm", + "name": "Helvetica", + "style": "italic", 
+ "variant": "normal", + "weight": "medium", + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvr8an.afm", + "name": "Helvetica", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pzdr.afm", + "name": "ITC Zapf Dingbats", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pplri8a.afm", + "name": "Palatino", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/cmmi10.afm", + "name": "Computer Modern", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Times-Italic.afm", + "name": "Times", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Helvetica-Oblique.afm", + "name": "Helvetica", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Helvetica-BoldOblique.afm", + "name": "Helvetica", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/ptmr8a.afm", + "name": "Times", + "style": "normal", + "variant": "normal", + "weight": "roman", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Times-Bold.afm", + "name": "Times", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pagdo8a.afm", + "name": "ITC Avant Garde Gothic", + "style": "italic", + "variant": "normal", + "weight": "demi", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvro8a.afm", + "name": "Helvetica", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Times-BoldItalic.afm", + "name": "Times", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Helvetica.afm", + "name": "Helvetica", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvl8a.afm", + "name": "Helvetica", + "style": "normal", + "variant": "normal", + "weight": "light", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvbo8an.afm", + "name": "Helvetica", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/putbi8a.afm", + "name": "Utopia", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pcrb8a.afm", + 
"name": "Courier", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pplbi8a.afm", + "name": "Palatino", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pcrbo8a.afm", + "name": "Courier", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/ZapfDingbats.afm", + "name": "ZapfDingbats", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Courier.afm", + "name": "Courier", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pcrr8a.afm", + "name": "Courier", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/cmsy10.afm", + "name": "Computer Modern", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/ptmb8a.afm", + "name": "Times", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvb8an.afm", + "name": "Helvetica", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pncri8a.afm", + "name": "New Century Schoolbook", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/putb8a.afm", + "name": "Utopia", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pncb8a.afm", + "name": "New Century Schoolbook", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pagk8a.afm", + "name": "ITC Avant Garde Gothic", + "style": "normal", + "variant": "normal", + "weight": "book", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pplb8a.afm", + "name": "Palatino", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Courier-BoldOblique.afm", + "name": "Courier", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pbkl8a.afm", + "name": "ITC Bookman", + "style": "normal", + "variant": "normal", + "weight": "light", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvr8a.afm", + "name": "Helvetica", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/phvbo8a.afm", + 
"name": "Helvetica", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Courier-Oblique.afm", + "name": "Courier", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/putr8a.afm", + "name": "Utopia", + "style": "normal", + "variant": "normal", + "weight": "regular", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/psyr.afm", + "name": "Symbol", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pplr8a.afm", + "name": "Palatino", + "style": "normal", + "variant": "normal", + "weight": "roman", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Times-Roman.afm", + "name": "Times", + "style": "normal", + "variant": "normal", + "weight": "roman", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pagd8a.afm", + "name": "ITC Avant Garde Gothic", + "style": "normal", + "variant": "normal", + "weight": "demi", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pbkd8a.afm", + "name": "ITC Bookman", + "style": "normal", + "variant": "normal", + "weight": "demi", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pcrro8a.afm", + "name": "Courier", + "style": "italic", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/pdfcorefonts/Helvetica-Bold.afm", + "name": "Helvetica", + "style": "normal", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pncr8a.afm", + "name": "New Century Schoolbook", + "style": "normal", + "variant": "normal", + "weight": "roman", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/pncbi8a.afm", + "name": "New Century Schoolbook", + "style": "italic", + "variant": "normal", + "weight": "bold", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/afm/cmr10.afm", + "name": "Computer Modern", + "style": "normal", + "variant": "normal", + "weight": "medium", + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + } + ], + "ttflist": [ + { + "fname": "fonts/ttf/DejaVuSansMono-Oblique.ttf", + "name": "DejaVu Sans Mono", + "style": "oblique", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizOneSymBol.ttf", + "name": "STIXSizeOneSym", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSans.ttf", + "name": "DejaVu Sans", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXGeneral.ttf", + "name": "STIXGeneral", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": 
"FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSansMono-Bold.ttf", + "name": "DejaVu Sans Mono", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXGeneralBolIta.ttf", + "name": "STIXGeneral", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSerif.ttf", + "name": "DejaVu Serif", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizFourSymBol.ttf", + "name": "STIXSizeFourSym", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/cmsy10.ttf", + "name": "cmsy10", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/cmex10.ttf", + "name": "cmex10", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizThreeSymBol.ttf", + "name": "STIXSizeThreeSym", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/cmss10.ttf", + "name": "cmss10", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/cmmi10.ttf", + "name": "cmmi10", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSerif-Italic.ttf", + "name": "DejaVu Serif", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/cmtt10.ttf", + "name": "cmtt10", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXGeneralItalic.ttf", + "name": "STIXGeneral", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXNonUniIta.ttf", + "name": "STIXNonUnicode", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSans-BoldOblique.ttf", + "name": "DejaVu Sans", + "style": "oblique", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSansMono.ttf", + "name": "DejaVu Sans Mono", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSerifDisplay.ttf", + "name": "DejaVu Serif Display", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSerif-Bold.ttf", + "name": "DejaVu Serif", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": 
"scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSansDisplay.ttf", + "name": "DejaVu Sans Display", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizOneSymReg.ttf", + "name": "STIXSizeOneSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXNonUniBolIta.ttf", + "name": "STIXNonUnicode", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXGeneralBol.ttf", + "name": "STIXGeneral", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizFiveSymReg.ttf", + "name": "STIXSizeFiveSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/cmr10.ttf", + "name": "cmr10", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/cmb10.ttf", + "name": "cmb10", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSansMono-BoldOblique.ttf", + "name": "DejaVu Sans Mono", + "style": "oblique", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizTwoSymReg.ttf", + "name": "STIXSizeTwoSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSerif-BoldItalic.ttf", + "name": "DejaVu Serif", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXNonUni.ttf", + "name": "STIXNonUnicode", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSans-Oblique.ttf", + "name": "DejaVu Sans", + "style": "oblique", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/DejaVuSans-Bold.ttf", + "name": "DejaVu Sans", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXNonUniBol.ttf", + "name": "STIXNonUnicode", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizThreeSymReg.ttf", + "name": "STIXSizeThreeSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizTwoSymBol.ttf", + "name": "STIXSizeTwoSym", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "fonts/ttf/STIXSizFourSymReg.ttf", + "name": "STIXSizeFourSym", + "style": "normal", 
+ "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Herculanum.ttf", + "name": "Herculanum", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansBuhid-Regular.ttf", + "name": "Noto Sans Buhid", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/InaiMathi-MN.ttc", + "name": "InaiMathi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Verdana.ttf", + "name": "Verdana", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/SnellRoundhand.ttc", + "name": "Snell Roundhand", + "style": "normal", + "variant": "normal", + "weight": 500, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansTaiViet-Regular.ttf", + "name": "Noto Sans Tai Viet", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMeeteiMayek-Regular.ttf", + "name": "Noto Sans Meetei Mayek", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansCaucasianAlbanian-Regular.ttf", + "name": "Noto Sans Caucasian Albanian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansHanunoo-Regular.ttf", + "name": "Noto Sans Hanunoo", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMahajani-Regular.ttf", + "name": "Noto Sans Mahajani", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Apple Braille.ttf", + "name": "Apple Braille", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXTwoText-Italic.ttf", + "name": "STIX Two Text", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/AlBayan.ttc", + "name": "Al Bayan", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Luminari.ttf", + "name": "Luminari", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": 
"/System/Library/Fonts/Supplemental/NotoSansEgyptianHieroglyphs-Regular.ttf", + "name": "Noto Sans Egyptian Hieroglyphs", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Ayuthaya.ttf", + "name": "Ayuthaya", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/PartyLET-plain.ttf", + "name": "Party LET", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Cochin.ttc", + "name": "Cochin", + "style": "normal", + "variant": "normal", + "weight": 500, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W8.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 800, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Gurmukhi.ttf", + "name": "Gurmukhi MT", + "style": "normal", + "variant": "normal", + "weight": 500, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansUgaritic-Regular.ttf", + "name": "Noto Sans Ugaritic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/ITFDevanagari.ttc", + "name": "ITF Devanagari", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Georgia Bold Italic.ttf", + "name": "Georgia", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Khmer Sangam MN.ttf", + "name": "Khmer Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/ThonburiUI.ttc", + "name": ".ThonburiUI", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntDReg.otf", + "name": "STIXIntegralsD", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSerifNyiakengPuachueHmong-Regular.ttf", + "name": "Noto Serif Hmong Nyiakeng", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansBrahmi-Regular.ttf", + "name": "Noto Sans Brahmi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Hoefler Text Ornaments.ttf", + "name": "Hoefler Text", + "style": "normal", + "variant": "normal", + "weight": 400, 
+ "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansBhaiksuki-Regular.ttf", + "name": "Noto Sans Bhaiksuki", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Georgia Italic.ttf", + "name": "Georgia", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Sathu.ttf", + "name": "Sathu", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Kailasa.ttc", + "name": "Kailasa", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFNSMonoItalic.ttf", + "name": ".SF NS Mono", + "style": "italic", + "variant": "normal", + "weight": 295, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Unicode.ttf", + "name": "Arial Unicode MS", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansSharada-Regular.ttf", + "name": "Noto Sans Sharada", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Mishafi.ttf", + "name": "Mishafi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Diwan Thuluth.ttf", + "name": "Diwan Thuluth", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXVarBol.otf", + "name": "STIXVariants", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Futura.ttc", + "name": "Futura", + "style": "normal", + "variant": "normal", + "weight": 500, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansThaana-Regular.ttf", + "name": "Noto Sans Thaana", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Gurmukhi Sangam MN.ttc", + "name": "Gurmukhi Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFArmenianRounded.ttf", + "name": ".SF Armenian Rounded", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Diwan Kufi.ttc", + "name": "Diwan Kufi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": 
"scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Webdings.ttf", + "name": "Webdings", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Damascus.ttc", + "name": "Damascus", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/KohinoorGujarati.ttc", + "name": "Kohinoor Gujarati", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansTaiTham-Regular.ttf", + "name": "Noto Sans Tai Tham", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Narrow Bold Italic.ttf", + "name": "Arial Narrow", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansElbasan-Regular.ttf", + "name": "Noto Sans Elbasan", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizFiveSymReg.otf", + "name": "STIXSizeFiveSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/PTSans.ttc", + "name": "PT Sans", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansCypriot-Regular.ttf", + "name": "Noto Sans Cypriot", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W6.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 600, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Comic Sans MS.ttf", + "name": "Comic Sans MS", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOldSouthArabian-Regular.ttf", + "name": "Noto Sans Old South Arabian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMiao-Regular.ttf", + "name": "Noto Sans Miao", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansKharoshthi-Regular.ttf", + "name": "Noto Sans Kharoshthi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/DIN Alternate Bold.ttf", + 
"name": "DIN Alternate", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXNonUniIta.otf", + "name": "STIXNonUnicode", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Trebuchet MS Italic.ttf", + "name": "Trebuchet MS", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOsmanya-Regular.ttf", + "name": "Noto Sans Osmanya", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Kannada Sangam MN.ttc", + "name": "Kannada Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Verdana Italic.ttf", + "name": "Verdana", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W1.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 200, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFHebrewRounded.ttf", + "name": ".SF Hebrew Rounded", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/CJKSymbolsFallback.ttc", + "name": ".CJK Symbols Fallback HK", + "style": "normal", + "variant": "normal", + "weight": 542, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizThreeSymBol.otf", + "name": "STIXSizeThreeSym", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/GillSans.ttc", + "name": "Gill Sans", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansGunjalaGondi-Regular.otf", + "name": "Noto Sans Gunjala Gondi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMultani-Regular.ttf", + "name": "Noto Sans Multani", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansTagbanwa-Regular.ttf", + "name": "Noto Sans Tagbanwa", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Black.ttf", + "name": "Arial Black", + "style": "normal", + "variant": "normal", + "weight": 900, + "stretch": "normal", + "size": "scalable", + 
"__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansBatak-Regular.ttf", + "name": "Noto Sans Batak", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Optima.ttc", + "name": "Optima", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFCompactRounded.ttf", + "name": ".SF Compact Rounded", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansSaurashtra-Regular.ttf", + "name": "Noto Sans Saurashtra", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Chalkboard.ttc", + "name": "Chalkboard", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Marion.ttc", + "name": "Marion", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/HelveticaNeue.ttc", + "name": "Helvetica Neue", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Wingdings 2.ttf", + "name": "Wingdings 2", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansChakma-Regular.ttf", + "name": "Noto Sans Chakma", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Sinhala MN.ttc", + "name": "Sinhala MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXGeneral.otf", + "name": "STIXGeneral", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansSoraSompeng-Regular.ttf", + "name": "Noto Sans Sora Sompeng", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansSiddham-Regular.otf", + "name": "Noto Sans Siddham", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansImperialAramaic-Regular.ttf", + "name": "Noto Sans Imperial Aramaic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/Library/Fonts/Arial Unicode.ttf", + "name": "Arial Unicode MS", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", 
+ "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansSyriac-Regular.ttf", + "name": "Noto Sans Syriac", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansKaithi-Regular.ttf", + "name": "Noto Sans Kaithi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Courier New Bold.ttf", + "name": "Courier New", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXGeneralItalic.otf", + "name": "STIXGeneral", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Lao Sangam MN.ttf", + "name": "Lao Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Krungthep.ttf", + "name": "Krungthep", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSerifYezidi-Regular.otf", + "name": "Noto Serif Yezidi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/NotoSansOriya.ttc", + "name": "Noto Sans Oriya", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Apple Symbols.ttf", + "name": "Apple Symbols", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Impact.ttf", + "name": "Impact", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Noteworthy.ttc", + "name": "Noteworthy", + "style": "normal", + "variant": "normal", + "weight": 300, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/ArialHB.ttc", + "name": "Arial Hebrew", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/GujaratiMT.ttc", + "name": "Gujarati MT", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansSylotiNagri-Regular.ttf", + "name": "Noto Sans Syloti Nagri", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMeroitic-Regular.ttf", + "name": "Noto Sans Meroitic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": 
"FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Corsiva.ttc", + "name": "Corsiva Hebrew", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansCanadianAboriginal-Regular.otf", + "name": "Noto Sans Canadian Aboriginal", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Galvji.ttc", + "name": "Galvji", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansTirhuta-Regular.ttf", + "name": "Noto Sans Tirhuta", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntSmBol.otf", + "name": "STIXIntegralsSm", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W4.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXGeneralBolIta.otf", + "name": "STIXGeneral", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntDBol.otf", + "name": "STIXIntegralsD", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMandaic-Regular.ttf", + "name": "Noto Sans Mandaic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Rounded Bold.ttf", + "name": "Arial Rounded MT Bold", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Verdana Bold Italic.ttf", + "name": "Verdana", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/PTMono.ttc", + "name": "PT Mono", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Courier New Bold Italic.ttf", + "name": "Courier New", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Iowan Old Style.ttc", + "name": "Iowan Old Style", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Nadeem.ttc", + "name": "Nadeem", + "style": "normal", + "variant": "normal", + 
"weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Bodoni 72.ttc", + "name": "Bodoni 72", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/PTSerif.ttc", + "name": "PT Serif", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Mshtakan.ttc", + "name": "Mshtakan", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansJavanese-Regular.otf", + "name": "Noto Sans Javanese", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Andale Mono.ttf", + "name": "Andale Mono", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Phosphate.ttc", + "name": "Phosphate", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/BigCaslon.ttf", + "name": "Big Caslon", + "style": "normal", + "variant": "normal", + "weight": 500, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansHanifiRohingya-Regular.ttf", + "name": "Noto Sans Hanifi Rohingya", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansCarian-Regular.ttf", + "name": "Noto Sans Carian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Academy Engraved LET Fonts.ttf", + "name": "Academy Engraved LET", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Courier New.ttf", + "name": "Courier New", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Times.ttc", + "name": "Times", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOldPersian-Regular.ttf", + "name": "Noto Sans Old Persian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Seravek.ttc", + "name": "Seravek", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Microsoft Sans Serif.ttf", + "name": "Microsoft Sans Serif", + "style": "normal", + "variant": 
"normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Keyboard.ttf", + "name": ".Keyboard", + "style": "normal", + "variant": "normal", + "weight": 100, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/NotoSansKannada.ttc", + "name": "Noto Sans Kannada", + "style": "normal", + "variant": "normal", + "weight": 100, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Courier.ttc", + "name": "Courier", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Bold Italic.ttf", + "name": "Arial", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizTwoSymBol.otf", + "name": "STIXSizeTwoSym", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansPhagsPa-Regular.ttf", + "name": "Noto Sans PhagsPa", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Helvetica.ttc", + "name": "Helvetica", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Athelas.ttc", + "name": "Athelas", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizFourSymBol.otf", + "name": "STIXSizeFourSym", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansInscriptionalPahlavi-Regular.ttf", + "name": "Noto Sans Inscriptional Pahlavi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Apple Braille Pinpoint 6 Dot.ttf", + "name": "Apple Braille", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansLisu-Regular.ttf", + "name": "Noto Sans Lisu", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFNSRounded.ttf", + "name": ".SF NS Rounded", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansKayahLi-Regular.ttf", + "name": "Noto Sans Kayah Li", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/AquaKana.ttc", + "name": ".Aqua Kana", + "style": "normal", + "variant": "normal", + "weight": 300, + "stretch": "normal", 
+ "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Trebuchet MS.ttf", + "name": "Trebuchet MS", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansNKo-Regular.ttf", + "name": "Noto Sans NKo", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansNewTaiLue-Regular.ttf", + "name": "Noto Sans New Tai Lue", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Apple Braille Outline 8 Dot.ttf", + "name": "Apple Braille", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Trattatello.ttf", + "name": "Trattatello", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Sinhala Sangam MN.ttc", + "name": "Sinhala Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W2.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 250, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/NewYorkItalic.ttf", + "name": ".New York", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/SukhumvitSet.ttc", + "name": "Sukhumvit Set", + "style": "normal", + "variant": "normal", + "weight": 250, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Gurmukhi MN.ttc", + "name": "Gurmukhi MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Hiragino Sans GB.ttc", + "name": "Hiragino Sans GB", + "style": "normal", + "variant": "normal", + "weight": 300, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOldPermic-Regular.ttf", + "name": "Noto Sans Old Permic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFNSItalic.ttf", + "name": "System Font", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansGlagolitic-Regular.ttf", + "name": "Noto Sans Glagolitic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansGothic-Regular.ttf", + "name": "Noto Sans Gothic", + "style": "normal", + 
"variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Farisi.ttf", + "name": "Farisi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Silom.ttf", + "name": "Silom", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Wingdings.ttf", + "name": "Wingdings", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntUpReg.otf", + "name": "STIXIntegralsUp", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansAdlam-Regular.ttf", + "name": "Noto Sans Adlam", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansCuneiform-Regular.ttf", + "name": "Noto Sans Cuneiform", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Bradley Hand Bold.ttf", + "name": "Bradley Hand", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOlChiki-Regular.ttf", + "name": "Noto Sans Ol Chiki", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/KohinoorBangla.ttc", + "name": "Kohinoor Bangla", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansLycian-Regular.ttf", + "name": "Noto Sans Lycian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/MarkerFelt.ttc", + "name": "Marker Felt", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMarchen-Regular.ttf", + "name": "Noto Sans Marchen", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Bold.ttf", + "name": "Arial", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u4e38\u30b3\u3099 ProN W4.ttc", + "name": "Hiragino Maru Gothic Pro", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansPhoenician-Regular.ttf", + "name": 
"Noto Sans Phoenician", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Avenir Next.ttc", + "name": "Avenir Next", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Apple Chancery.ttf", + "name": "Apple Chancery", + "style": "normal", + "variant": "normal", + "weight": 0, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXTwoMath.otf", + "name": "STIX Two Math", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Comic Sans MS Bold.ttf", + "name": "Comic Sans MS", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizTwoSymReg.otf", + "name": "STIXSizeTwoSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Chalkduster.ttf", + "name": "Chalkduster", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Sana.ttc", + "name": "Sana", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFNSMono.ttf", + "name": ".SF NS Mono", + "style": "normal", + "variant": "normal", + "weight": 295, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXTwoText.ttf", + "name": "STIX Two Text", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/PlantagenetCherokee.ttf", + "name": "Plantagenet Cherokee", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOldItalic-Regular.ttf", + "name": "Noto Sans Old Italic", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/KufiStandardGK.ttc", + "name": "KufiStandardGK", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Malayalam Sangam MN.ttc", + "name": "Malayalam Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Times New Roman Bold.ttf", + "name": "Times New Roman", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/NotoNastaliq.ttc", + "name": "Noto Nastaliq 
Urdu", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizFourSymReg.otf", + "name": "STIXSizeFourSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansHatran-Regular.ttf", + "name": "Noto Sans Hatran", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOsage-Regular.ttf", + "name": "Noto Sans Osage", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansVai-Regular.ttf", + "name": "Noto Sans Vai", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/KohinoorTelugu.ttc", + "name": "Kohinoor Telugu", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/DecoTypeNaskh.ttc", + "name": "DecoType Naskh", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFNS.ttf", + "name": "System Font", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFCamera.ttf", + "name": ".SF Camera", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Bangla MN.ttc", + "name": "Bangla MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOldHungarian-Regular.ttf", + "name": "Noto Sans Old Hungarian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Menlo.ttc", + "name": "Menlo", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Apple Braille Outline 6 Dot.ttf", + "name": "Apple Braille", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansLydian-Regular.ttf", + "name": "Noto Sans Lydian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W7.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFCompactItalic.ttf", + "name": ".SF 
Compact", + "style": "italic", + "variant": "normal", + "weight": 1000, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansTagalog-Regular.ttf", + "name": "Noto Sans Tagalog", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Hoefler Text.ttc", + "name": "Hoefler Text", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/SuperClarendon.ttc", + "name": "Superclarendon", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/NotoSansArmenian.ttc", + "name": "Noto Sans Armenian", + "style": "normal", + "variant": "normal", + "weight": 900, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Bodoni 72 Smallcaps Book.ttf", + "name": "Bodoni 72 Smallcaps", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Wingdings 3.ttf", + "name": "Wingdings 3", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizThreeSymReg.otf", + "name": "STIXSizeThreeSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansWancho-Regular.ttf", + "name": "Noto Sans Wancho", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Symbol.ttf", + "name": "Symbol", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NewPeninimMT.ttc", + "name": "New Peninim MT", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/DecoTypeNastaleeqUrdu.ttc", + "name": ".DecoType Nastaleeq Urdu UI", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Muna.ttc", + "name": "Muna", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Rockwell.ttc", + "name": "Rockwell", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Charter.ttc", + "name": "Charter", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/AppleMyungjo.ttf", + "name": "AppleMyungjo", + "style": "normal", 
+ "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Devanagari Sangam MN.ttc", + "name": "Devanagari Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansBuginese-Regular.ttf", + "name": "Noto Sans Buginese", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Verdana Bold.ttf", + "name": "Verdana", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/NotoSansMyanmar.ttc", + "name": "Noto Sans Myanmar", + "style": "normal", + "variant": "normal", + "weight": 900, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Malayalam MN.ttc", + "name": "Malayalam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntUpSmBol.otf", + "name": "STIXIntegralsUpSm", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/STHeiti Light.ttc", + "name": "Heiti TC", + "style": "normal", + "variant": "normal", + "weight": 300, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMro-Regular.ttf", + "name": "Noto Sans Mro", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Bodoni 72 OS.ttc", + "name": "Bodoni 72 Oldstyle", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Skia.ttf", + "name": "Skia", + "style": "normal", + "variant": "normal", + "weight": 5, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u660e\u671d ProN.ttc", + "name": "Hiragino Mincho ProN", + "style": "normal", + "variant": "normal", + "weight": 300, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/ADTNumeric.ttc", + "name": ".SF Numeric", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial.ttf", + "name": "Arial", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Thonburi.ttc", + "name": "Thonburi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/MuktaMahee.ttc", + "name": "Mukta Mahee", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": 
"normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/PTSerifCaption.ttc", + "name": "PT Serif Caption", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansModi-Regular.ttf", + "name": "Noto Sans Modi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/DevanagariMT.ttc", + "name": "Devanagari MT", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXNonUniBolIta.otf", + "name": "STIXNonUnicode", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansLinearA-Regular.ttf", + "name": "Noto Sans Linear A", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMendeKikakui-Regular.ttf", + "name": "Noto Sans Mende Kikakui", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Khmer MN.ttc", + "name": "Khmer MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W0.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 100, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Oriya Sangam MN.ttc", + "name": "Oriya Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Oriya MN.ttc", + "name": "Oriya MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansInscriptionalParthian-Regular.ttf", + "name": "Noto Sans Inscriptional Parthian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansSamaritan-Regular.ttf", + "name": "Noto Sans Samaritan", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Narrow Bold.ttf", + "name": "Arial Narrow", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Trebuchet MS Bold.ttf", + "name": "Trebuchet MS", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": 
"/System/Library/Fonts/NotoSerifMyanmar.ttc", + "name": "Noto Serif Myanmar", + "style": "normal", + "variant": "normal", + "weight": 900, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W3.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 300, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Trebuchet MS Bold Italic.ttf", + "name": "Trebuchet MS", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Waseem.ttc", + "name": "Waseem", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFArmenian.ttf", + "name": ".SF Armenian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/NewYork.ttf", + "name": ".New York", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansBamum-Regular.ttf", + "name": "Noto Sans Bamum", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansTaiLe-Regular.ttf", + "name": "Noto Sans Tai Le", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Telugu Sangam MN.ttc", + "name": "Telugu Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXNonUniBol.otf", + "name": "STIXNonUnicode", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Avenir Next Condensed.ttc", + "name": "Avenir Next Condensed", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Al Nile.ttc", + "name": "Al Nile", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntSmReg.otf", + "name": "STIXIntegralsSm", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Zapfino.ttf", + "name": "Zapfino", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Narrow Italic.ttf", + "name": "Arial Narrow", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": 
"/System/Library/Fonts/Supplemental/NotoSansTifinagh-Regular.otf", + "name": "Noto Sans Tifinagh", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/STHeiti Medium.ttc", + "name": "Heiti TC", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansLepcha-Regular.ttf", + "name": "Noto Sans Lepcha", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansDuployan-Regular.ttf", + "name": "Noto Sans Duployan", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/ZapfDingbats.ttf", + "name": "Zapf Dingbats", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizOneSymReg.otf", + "name": "STIXSizeOneSym", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Beirut.ttc", + "name": "Beirut", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Palatino.ttc", + "name": "Palatino", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXGeneralBol.otf", + "name": "STIXGeneral", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Georgia.ttf", + "name": "Georgia", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/SignPainter.ttc", + "name": "SignPainter", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Copperplate.ttc", + "name": "Copperplate", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFHebrew.ttf", + "name": ".SF Hebrew", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansCham-Regular.ttf", + "name": "Noto Sans Cham", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Geneva.ttf", + "name": "Geneva", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansManichaean-Regular.ttf", + "name": "Noto Sans 
Manichaean", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Raanana.ttc", + "name": "Raanana", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntUpDBol.otf", + "name": "STIXIntegralsUpD", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Italic.ttf", + "name": "Arial", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntUpSmReg.otf", + "name": "STIXIntegralsUpSm", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFArabic.ttf", + "name": ".SF Arabic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Apple Braille Pinpoint 8 Dot.ttf", + "name": "Apple Braille", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMongolian-Regular.ttf", + "name": "Noto Sans Mongolian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Savoye LET.ttc", + "name": "Savoye LET", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Bangla Sangam MN.ttc", + "name": "Bangla Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/AppleGothic.ttf", + "name": "AppleGothic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansNewa-Regular.ttf", + "name": "Noto Sans Newa", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansLinearB-Regular.ttf", + "name": "Noto Sans Linear B", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Papyrus.ttc", + "name": "Papyrus", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansAvestan-Regular.ttf", + "name": "Noto Sans Avestan", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Courier New Italic.ttf", + "name": 
"Courier New", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Songti.ttc", + "name": "Songti SC", + "style": "normal", + "variant": "normal", + "weight": 900, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSerifAhom-Regular.ttf", + "name": "Noto Serif Ahom", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Lao MN.ttc", + "name": "Lao MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansPahawhHmong-Regular.ttf", + "name": "Noto Sans Pahawh Hmong", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/LucidaGrande.ttc", + "name": "Lucida Grande", + "style": "normal", + "variant": "normal", + "weight": 500, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansBassaVah-Regular.ttf", + "name": "Noto Sans Bassa Vah", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Didot.ttc", + "name": "Didot", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Baghdad.ttc", + "name": "Baghdad", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansPsalterPahlavi-Regular.ttf", + "name": "Noto Sans Psalter Pahlavi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Georgia Bold.ttf", + "name": "Georgia", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Kohinoor.ttc", + "name": "Kohinoor Devanagari", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSerifBalinese-Regular.ttf", + "name": "Noto Serif Balinese", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/KefaIII.ttf", + "name": "Kefa III", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Baskerville.ttc", + "name": "Baskerville", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOldTurkic-Regular.ttf", + "name": "Noto 
Sans Old Turkic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Kannada MN.ttc", + "name": "Kannada MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFGeorgianRounded.ttf", + "name": ".SF Georgian Rounded", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Monaco.ttf", + "name": "Monaco", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFGeorgian.ttf", + "name": ".SF Georgian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/AmericanTypewriter.ttc", + "name": "American Typewriter", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Al Tarikh.ttc", + "name": "Al Tarikh", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Tamil MN.ttc", + "name": "Tamil MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFCompact.ttf", + "name": ".SF Compact", + "style": "normal", + "variant": "normal", + "weight": 1000, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansRejang-Regular.ttf", + "name": "Noto Sans Rejang", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansLimbu-Regular.ttf", + "name": "Noto Sans Limbu", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Shree714.ttc", + "name": "Shree Devanagari 714", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansMasaramGondi-Regular.otf", + "name": "Noto Sans Masaram Gondi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Times New Roman.ttf", + "name": "Times New Roman", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Times New Roman Bold Italic.ttf", + "name": "Times New Roman", + "style": "italic", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Gujarati Sangam MN.ttc", + "name": "Gujarati Sangam 
MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Mishafi Gold.ttf", + "name": "Mishafi Gold", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/DIN Condensed Bold.ttf", + "name": "DIN Condensed", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Avenir.ttc", + "name": "Avenir", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansWarangCiti-Regular.ttf", + "name": "Noto Sans Warang Citi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W9.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 900, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Farah.ttc", + "name": "Farah", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXNonUni.otf", + "name": "STIXNonUnicode", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Times New Roman Italic.ttf", + "name": "Times New Roman", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntUpBol.otf", + "name": "STIXIntegralsUp", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/ChalkboardSE.ttc", + "name": "Chalkboard SE", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansOldNorthArabian-Regular.ttf", + "name": "Noto Sans Old North Arabian", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXVar.otf", + "name": "STIXVariants", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansTakri-Regular.ttf", + "name": "Noto Sans Takri", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Tahoma Bold.ttf", + "name": "Tahoma", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": 
"/System/Library/Fonts/Supplemental/Telugu MN.ttc", + "name": "Telugu MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Tahoma.ttf", + "name": "Tahoma", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansNagMundari-Regular.ttf", + "name": "Noto Sans Nag Mundari", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/EuphemiaCAS.ttc", + "name": "Euphemia UCAS", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansYi-Regular.ttf", + "name": "Noto Sans Yi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/SFArabicRounded.ttf", + "name": ".SF Arabic Rounded", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXIntUpDReg.otf", + "name": "STIXIntegralsUpD", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/STIXSizOneSymBol.otf", + "name": "STIXSizeOneSym", + "style": "normal", + "variant": "normal", + "weight": 700, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Tamil Sangam MN.ttc", + "name": "Tamil Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansKhudawadi-Regular.ttf", + "name": "Noto Sans Khudawadi", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansPalmyrene-Regular.ttf", + "name": "Noto Sans Palmyrene", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Myanmar MN.ttc", + "name": "Myanmar MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/AppleSDGothicNeo.ttc", + "name": "Apple SD Gothic Neo", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Myanmar Sangam MN.ttc", + "name": "Myanmar Sangam MN", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansKhojki-Regular.ttf", + "name": "Noto Sans Khojki", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + 
"__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansPauCinHau-Regular.ttf", + "name": "Noto Sans Pau Cin Hau", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansSundanese-Regular.ttf", + "name": "Noto Sans Sundanese", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Brush Script.ttf", + "name": "Brush Script MT", + "style": "italic", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/GeezaPro.ttc", + "name": "Geeza Pro", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Bodoni Ornaments.ttf", + "name": "Bodoni Ornaments", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/\u30d2\u30e9\u30ad\u3099\u30ce\u89d2\u30b3\u3099\u30b7\u30c3\u30af W5.ttc", + "name": "Hiragino Sans", + "style": "normal", + "variant": "normal", + "weight": 500, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Kokonor.ttf", + "name": "Kokonor", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansNabataean-Regular.ttf", + "name": "Noto Sans Nabataean", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/NotoSansCoptic-Regular.ttf", + "name": "Noto Sans Coptic", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "normal", + "size": "scalable", + "__class__": "FontEntry" + }, + { + "fname": "/System/Library/Fonts/Supplemental/Arial Narrow.ttf", + "name": "Arial Narrow", + "style": "normal", + "variant": "normal", + "weight": 400, + "stretch": "condensed", + "size": "scalable", + "__class__": "FontEntry" + } + ], + "__class__": "FontManager" +} \ No newline at end of file diff --git a/filenames.py b/filenames.py index 8f7cfc4..ea7828c 100644 --- a/filenames.py +++ b/filenames.py @@ -1,7 +1,9 @@ import os RAID_DIR = os.environ.get("RAID_DIR") -os.environ["RAY_TMPDIR"] = os.path.join(RAID_DIR, "tmp") +ray_tmp = "/tmp/ray" +os.makedirs(ray_tmp, exist_ok=True) +os.environ["RAY_TMPDIR"] = ray_tmp REPO_DIR = os.path.join(RAID_DIR, "repos") DATA_DIR = os.path.join(RAID_DIR, "data") CHECKPOINT_DIR = os.path.join(RAID_DIR, "checkpoints") @@ -9,4 +11,4 @@ DB_FILE_NAME = "db_file.txt" PROOF_LOG_FILE_NAME = os.path.join(RAID_DIR, "proof_log.txt") ENCOUNTERED_THEOREMS_FILE = os.path.join(RAID_DIR, "encountered_theorems.pkl") -FISHER_DIR = os.path.join(RAID_DIR, "fisher") # Optional \ No newline at end of file +FISHER_DIR = os.path.join(RAID_DIR, "fisher") # Optional diff --git a/git_utils.py b/git_utils.py index 28a2d32..4883df1 100644 --- a/git_utils.py +++ b/git_utils.py @@ -397,7 +397,7 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): # Add the new repo to the 
dynamic database config = repo.get_config("lean-toolchain") v = generate_benchmark_lean4.get_lean4_version_from_config(config["content"]) - theorems_folder = os.path.join(dst_dir, "theorems") + theorems_folder = os.path.join(dst_dir, "random") premise_files_corpus = os.path.join(dst_dir, "corpus.jsonl") files_traced = os.path.join(dst_dir, "traced_files.jsonl") @@ -409,7 +409,7 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): "lean_version": v, "lean_dojo_version": lean_dojo.__version__, "metadata": { - "date_processed": datetime.datetime.now(), + "date_processed": datetime.now(), }, "theorems_folder": theorems_folder, "premise_files_corpus": premise_files_corpus, @@ -551,4 +551,4 @@ def should_skip_repo(): with open(skip_file_path, "r") as f: repo_url = f.read().strip() return True, repo_url - return False, None \ No newline at end of file + return False, None diff --git a/leanagent.py b/leanagent.py index c87e84e..7f9801c 100644 --- a/leanagent.py +++ b/leanagent.py @@ -8,6 +8,7 @@ import time import traceback +from contextlib import contextmanager from datetime import datetime, timedelta from pathlib import Path from typing import List, Optional, Set, Tuple @@ -52,6 +53,30 @@ repos = [] +@contextmanager +def _locked(path: str, mode: str): + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, mode) as handle: + fcntl.flock(handle.fileno(), fcntl.LOCK_EX) + try: + yield handle + finally: + if any(flag in mode for flag in ("w", "a", "+")): + handle.flush() + os.fsync(handle.fileno()) + fcntl.flock(handle.fileno(), fcntl.LOCK_UN) + + +def read_json_locked(path: str): + with _locked(path, "r") as handle: + return json.load(handle) + + +def write_json_locked(path: str, obj) -> None: + with _locked(path, "w") as handle: + json.dump(obj, handle, indent=2, sort_keys=True) + + def _eval(data, preds_map) -> Tuple[float, float, float]: """Evaluates the retrieval model.""" R1 = [] @@ -507,7 +532,7 @@ def main(): use_fisher = False single_repo = True curriculum_learning = True - num_repos = 4 + num_repos = 3 dynamic_database_json_path = os.path.join(RAID_DIR, DB_FILE_NAME) lambdas = None @@ -529,18 +554,11 @@ def main(): lean_git_repos, repos, updated_repos = get_repos(curriculum_learning, num_repos, dynamic_database_json_path, db) repo_info_file = os.path.join(DATA_DIR, "repo_info_compatible.json") - lock_path = f"{repo_info_file}.lock" max_attempts = 30 for attempt in range(max_attempts): try: - with open(lock_path, "a") as lock_handle: - fcntl.flock(lock_handle.fileno(), fcntl.LOCK_EX) - try: - with open(repo_info_file, "r") as f: - repo_info = json.load(f) - break - finally: - fcntl.flock(lock_handle.fileno(), fcntl.LOCK_UN) + repo_info = read_json_locked(repo_info_file) + break except (json.JSONDecodeError, FileNotFoundError): if attempt == max_attempts - 1: raise Exception( diff --git a/requirements-local.txt b/requirements-local.txt new file mode 100644 index 0000000..2a77334 --- /dev/null +++ b/requirements-local.txt @@ -0,0 +1,20 @@ +numpy +loguru +rich +pydantic +gitpython +requests +tqdm +docker +filelock +typing_extensions +ray +python-dotenv +toml +PyGithub +networkx +rank_bm25 +lxml +regex +packaging +psutil diff --git a/requirements.cpu.txt b/requirements.cpu.txt new file mode 100644 index 0000000..f41b96f --- /dev/null +++ b/requirements.cpu.txt @@ -0,0 +1,17 @@ +pytorch-lightning[extra] +numpy +deepspeed +lean_dojo==1.9.0 +loguru +networkx +ray +requests +torch +tqdm +transformers +openai +python-dotenv +rank_bm25 +torchmetrics +pytest==8.4.0 
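The repository metadata written by add_repo_to_database stores "date_processed" as a datetime value, and the later commits in this series ("Fix dynamic database datetime serialization", "Use datetime.now helper correctly") are about how that value reaches JSON. A minimal standalone sketch of one way to round-trip such a field; this is only an illustration, not the repository's actual serializer:

import json
from datetime import datetime

def encode_datetimes(obj):
    # json.dump raises TypeError on datetime values, so convert them to
    # ISO-8601 strings before they hit the dynamic-database JSON file.
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f"{type(obj).__name__} is not JSON serializable")

metadata = {"date_processed": datetime.now()}
serialized = json.dumps(metadata, default=encode_datetimes)
restored = datetime.fromisoformat(json.loads(serialized)["date_processed"])
print(serialized)
print(restored)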
+pytest-cov==6.2.1 diff --git a/retrieval/bm25/main.py b/retrieval/bm25/main.py index 1d41415..48eb9b3 100644 --- a/retrieval/bm25/main.py +++ b/retrieval/bm25/main.py @@ -92,30 +92,21 @@ def _process_theorem( @ray.remote(num_cpus=1) -""" -A Ray remote class for processing theorems with BM25 retrieval. - -This class handles the initialization of necessary components for theorem processing, -including loading the tokenizer, corpus, and setting up the BM25 retrieval model. -It provides a method to process individual theorems by retrieving relevant premises. - -Parameters ----------- -tokenizer_path : str - Path to the tokenizer file -data_path : str - Path to the data directory containing corpus files -num_retrieved : int - Number of premises to retrieve for each theorem -use_all_premises : bool - Whether to use all available premises or just retrieved ones - -Methods -------- -process_theorem(thm: Dict[str, Any]) - Process a single theorem, retrieving relevant premises using BM25 -""" class TheoremProcessor: + """ + Ray remote class that processes theorems with BM25 retrieval. + + Parameters + ---------- + tokenizer_path : str + Path to the tokenizer file. + data_path : str + Path to the data directory containing corpus files. + num_retrieved : int + Number of premises to retrieve for each theorem. + use_all_premises : bool + Whether to use all available premises or just retrieved ones. + """ def __init__( self, tokenizer_path: str, diff --git a/scripts/manual_trace.py b/scripts/manual_trace.py new file mode 100644 index 0000000..3fe35b1 --- /dev/null +++ b/scripts/manual_trace.py @@ -0,0 +1,76 @@ +import argparse, json, os, sys +from pathlib import Path + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--repo", required=True, help="Path to repo root (has .lake/)") + ap.add_argument("--url", required=True, help="Repo URL (e.g. https://github.com/owner/name)") + ap.add_argument("--commit", default="", help="Commit SHA (defaults to git rev-parse HEAD)") + ap.add_argument("--out_root", required=True, help="Datasets root (e.g. 
RAID/data)") + ap.add_argument("--zip", action="store_true", help="Also create a zip bundle with IR + corpus.jsonl") + args = ap.parse_args() + + repo = Path(args.repo).resolve() + if not repo.exists(): + print(f"[ERR] Repo not found: {repo}", file=sys.stderr) + sys.exit(2) + + # detect commit if not given + commit = args.commit.strip() + if not commit: + import subprocess + try: + commit = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=str(repo)).decode().strip() + except Exception as e: + print(f"[ERR] Could not detect commit via git: {e}", file=sys.stderr) + sys.exit(2) + + ir_root = repo / ".lake" / "build" / "ir" + if not ir_root.is_dir(): + print(f"[ERR] IR dir not found: {ir_root}\nRun `lake build` first.", file=sys.stderr) + sys.exit(2) + + asts = sorted(ir_root.rglob("*.ast.json")) + if not asts: + print(f"[ERR] No *.ast.json files under {ir_root}", file=sys.stderr) + sys.exit(2) + + # dataset folder name: owner_repo_commit + owner_repo = "/".join(args.url.rstrip("/").split("/")[-2:]) + owner_repo_flat = owner_repo.replace("/", "_") + out_dir = Path(args.out_root) / f"{owner_repo_flat}_{commit}" + out_dir.mkdir(parents=True, exist_ok=True) + + corpus_path = out_dir / "corpus.jsonl" + with corpus_path.open("w") as f: + for p in asts: + rec = { + "repo_url": args.url, + "commit": commit, + "ast_path": str(p.relative_to(repo)), + } + f.write(json.dumps(rec) + "\n") + + print(f"[OK] Wrote {corpus_path} records: {len(asts)}") + + if args.zip: + # bundle: corpus.jsonl + IR tree + exports = Path(os.environ.get("RAID_DIR", repo.parent.parent)) / "exports" + exports.mkdir(parents=True, exist_ok=True) + zip_name = f"{owner_repo_flat}_{commit}_bundle.zip" + zip_path = exports / zip_name + + # use system zip via subprocess to preserve paths + import subprocess + cmd = [ + "zip","-r", str(zip_path), + str(corpus_path), + str(ir_root), + "-x","*.DS_Store" + ] + print("[ZIP] ", " ".join(cmd)) + subprocess.check_call(cmd, cwd=str(Path(os.environ.get("RAID_DIR", repo.parent.parent)))) + print(f"[OK] Bundle ready: {zip_path}") + +if __name__ == "__main__": + main() diff --git a/testfile_root b/testfile_root new file mode 100644 index 0000000..6824beb --- /dev/null +++ b/testfile_root @@ -0,0 +1,2 @@ +hello +EOF && ls /Users/aum/Desktop/leanagent-work/LeanAgent/testfile_root From 17d166f6ba1f6e83bc7b37f1b9cc026032108f10 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 28 Oct 2025 22:21:12 -0400 Subject: [PATCH 13/29] Fix dynamic database datetime serialization --- dynamic_database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dynamic_database.py b/dynamic_database.py index 4f1b297..6ab7352 100644 --- a/dynamic_database.py +++ b/dynamic_database.py @@ -1,6 +1,6 @@ from __future__ import annotations import time -from datetime import datetime +import datetime import json import os import random From 458684d9c5ae4af4246ee7b6a983ca91a74032e7 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 28 Oct 2025 22:28:00 -0400 Subject: [PATCH 14/29] Use datetime.now helper correctly --- leanagent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/leanagent.py b/leanagent.py index 7f9801c..c4e1116 100644 --- a/leanagent.py +++ b/leanagent.py @@ -270,7 +270,7 @@ def prove_sorry_theorems( all_encountered_theorems: Set[Tuple[str, str, Tuple[int, int], Tuple[int, int]]] = ( set() ) - last_save_time = datetime.datetime.now() + last_save_time = datetime.now() save_interval = timedelta(minutes=30) # Load previously encountered theorems @@ -333,7 +333,7 @@ def 
prove_sorry_theorems( theorem_batch = [] positions_batch = [] - current_time = datetime.datetime.now() + current_time = datetime.now() if current_time - last_save_time >= save_interval: save_progress(all_encountered_theorems) last_save_time = current_time From b9f4f8573abf07256b12f3d04b054d065f9335db Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Wed, 29 Oct 2025 19:28:52 -0400 Subject: [PATCH 15/29] Retry curriculum repo discovery until minimum satisfied --- leanagent.py | 59 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/leanagent.py b/leanagent.py index c4e1116..e911522 100644 --- a/leanagent.py +++ b/leanagent.py @@ -429,23 +429,54 @@ def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_p logger.warning("num_repos should be at least 3 for curriculum learning") - lean_git_repos, repos = search_github_repositories(lean_git_repos, repos, "Lean", num_repos) - - for i in range(len(lean_git_repos)): - repo = lean_git_repos[i] - print("\n\n") - logger.info(f"Processing new repo: {repo.url}") - result = add_repo_to_database(dynamic_database_json_path, repo, db) - if result is not None: - logger.info(f"Successfully added repo {repo.url}") - + existing_repo_count = len(db.repositories) + target_repo_count = max(3, num_repos) + + lean_git_repos, repos = search_github_repositories( + lean_git_repos, repos, "Lean", target_repo_count + ) + + processed_idx = 0 + extra_searches = 0 + max_extra_searches = 10 + + while len(db.repositories) < target_repo_count: + while ( + processed_idx < len(lean_git_repos) + and len(db.repositories) < target_repo_count + ): + repo = lean_git_repos[processed_idx] + processed_idx += 1 + print("\n\n") + logger.info(f"Processing new repo: {repo.url}") + result = add_repo_to_database( + dynamic_database_json_path, repo, db + ) + if result is not None: + logger.info(f"Successfully added repo {repo.url}") + + if len(db.repositories) >= target_repo_count: + break + + if extra_searches >= max_extra_searches: + raise ValueError( + "Unable to find enough compatible repositories for curriculum learning" + ) + + extra_searches += 1 + needed = max(1, target_repo_count - len(db.repositories)) + logger.info( + f"Searching for {needed} additional repositories to meet the curriculum requirement" + ) + lean_git_repos, repos = search_github_repositories( + lean_git_repos, repos, "Lean", needed + ) + + newly_added = len(db.repositories) - existing_repo_count logger.info( - f"Successfully added {num_repos} repositories to the database" + f"Successfully added {newly_added} repositories to the database (total: {len(db.repositories)})" ) - if len(db.repositories) < 3: - raise ValueError("The database should contain at least 3 repositories for curriculum learning") - sorted_repos, categorized_theorems, percentiles = ( sort_repositories_by_difficulty(db) ) From c84672e647f8be6c1e86dce664dcab91fe4e611a Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Wed, 29 Oct 2025 19:38:49 -0400 Subject: [PATCH 16/29] Cache dataset exports when artifacts already present --- generate_benchmark_lean4.py | 68 ++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py index 27056b5..d9ac591 100644 --- a/generate_benchmark_lean4.py +++ b/generate_benchmark_lean4.py @@ -9,7 +9,7 @@ from copy import copy from datetime import datetime from pathlib import Path -from typing import Dict, List, Union +from typing import Dict, List, 
Union, Optional, Tuple import os import lean_dojo import networkx as nx @@ -24,6 +24,49 @@ SPLIT = Dict[SPLIT_NAME, List[TracedTheorem]] SPLIT_STRATEGY = str _LEAN4_VERSION_REGEX = re.compile(r"leanprover/lean4:(?P.+?)") +_REQUIRED_EXPORT_FILES = [ + ("metadata.json",), + ("corpus.jsonl",), + ("traced_files.jsonl",), + ("random", "train.json"), + ("random", "val.json"), + ("random", "test.json"), + ("novel_premises", "train.json"), + ("novel_premises", "val.json"), + ("novel_premises", "test.json"), +] + + +def _existing_export_stats(dst_path: Union[str, Path]) -> Optional[Tuple[int, int, int]]: + """Return cached export statistics if the dataset artifacts already exist.""" + dst_path = Path(dst_path) + required_paths = [dst_path.joinpath(*parts) for parts in _REQUIRED_EXPORT_FILES] + + if not all(path.is_file() and path.stat().st_size > 0 for path in required_paths): + return None + + metadata_path = dst_path / "metadata.json" + try: + metadata = json.load(metadata_path.open("rt")) + except (OSError, json.JSONDecodeError): + return None + + required_keys = {"total_theorems", "num_premises", "num_files_traced"} + if not required_keys.issubset(metadata.keys()): + return None + + try: + total_theorems = int(metadata["total_theorems"]) + num_premises = int(metadata["num_premises"]) + num_files_traced = int(metadata["num_files_traced"]) + except (TypeError, ValueError): + return None + + if total_theorems <= 0 or num_premises < 0 or num_files_traced <= 0: + return None + + logger.info(f"Reusing cached export at {dst_path}") + return num_premises, num_files_traced, total_theorems def get_lean4_version_from_config(toolchain: str) -> str: @@ -465,7 +508,19 @@ def export_data( logger.info("Successfully exported the premises") # Export metadata. - export_metadata(traced_repo, dst_path, **kwargs) + split_summary = { + strategy: {name: len(theorems) for name, theorems in split.items()} + for strategy, split in splits.items() + } + export_metadata( + traced_repo, + dst_path, + total_theorems=total_theorems, + num_premises=num_premises, + num_files_traced=num_files_traced, + split_counts=split_summary, + **kwargs, + ) logger.info("Successfully exported the metadata") return num_premises, num_files_traced, total_theorems @@ -556,9 +611,14 @@ def main(url, commit, dst_dir): except Exception as e: logger.info(f"Failed to trace repo {repo} because of {e}") return None, 0, 0, 10 - + + cached_stats = _existing_export_stats(dst_dir) + if cached_stats is not None: + num_premises, num_files_traced, total_theorems = cached_stats + return traced_repo, num_premises, num_files_traced, total_theorems + safe_remove_dir(dst_dir) - + splits = split_data(traced_repo) logger.info("Successfully split the data") num_premises, num_files_traced, total_theorems = export_data( From f7b729e807215600da71b8bab51fa821aa91cfa7 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Wed, 29 Oct 2025 20:46:18 -0400 Subject: [PATCH 17/29] Revert export caching guard --- generate_benchmark_lean4.py | 68 +++---------------------------------- 1 file changed, 4 insertions(+), 64 deletions(-) diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py index d9ac591..27056b5 100644 --- a/generate_benchmark_lean4.py +++ b/generate_benchmark_lean4.py @@ -9,7 +9,7 @@ from copy import copy from datetime import datetime from pathlib import Path -from typing import Dict, List, Union, Optional, Tuple +from typing import Dict, List, Union import os import lean_dojo import networkx as nx @@ -24,49 +24,6 @@ SPLIT = Dict[SPLIT_NAME, 
List[TracedTheorem]] SPLIT_STRATEGY = str _LEAN4_VERSION_REGEX = re.compile(r"leanprover/lean4:(?P.+?)") -_REQUIRED_EXPORT_FILES = [ - ("metadata.json",), - ("corpus.jsonl",), - ("traced_files.jsonl",), - ("random", "train.json"), - ("random", "val.json"), - ("random", "test.json"), - ("novel_premises", "train.json"), - ("novel_premises", "val.json"), - ("novel_premises", "test.json"), -] - - -def _existing_export_stats(dst_path: Union[str, Path]) -> Optional[Tuple[int, int, int]]: - """Return cached export statistics if the dataset artifacts already exist.""" - dst_path = Path(dst_path) - required_paths = [dst_path.joinpath(*parts) for parts in _REQUIRED_EXPORT_FILES] - - if not all(path.is_file() and path.stat().st_size > 0 for path in required_paths): - return None - - metadata_path = dst_path / "metadata.json" - try: - metadata = json.load(metadata_path.open("rt")) - except (OSError, json.JSONDecodeError): - return None - - required_keys = {"total_theorems", "num_premises", "num_files_traced"} - if not required_keys.issubset(metadata.keys()): - return None - - try: - total_theorems = int(metadata["total_theorems"]) - num_premises = int(metadata["num_premises"]) - num_files_traced = int(metadata["num_files_traced"]) - except (TypeError, ValueError): - return None - - if total_theorems <= 0 or num_premises < 0 or num_files_traced <= 0: - return None - - logger.info(f"Reusing cached export at {dst_path}") - return num_premises, num_files_traced, total_theorems def get_lean4_version_from_config(toolchain: str) -> str: @@ -508,19 +465,7 @@ def export_data( logger.info("Successfully exported the premises") # Export metadata. - split_summary = { - strategy: {name: len(theorems) for name, theorems in split.items()} - for strategy, split in splits.items() - } - export_metadata( - traced_repo, - dst_path, - total_theorems=total_theorems, - num_premises=num_premises, - num_files_traced=num_files_traced, - split_counts=split_summary, - **kwargs, - ) + export_metadata(traced_repo, dst_path, **kwargs) logger.info("Successfully exported the metadata") return num_premises, num_files_traced, total_theorems @@ -611,14 +556,9 @@ def main(url, commit, dst_dir): except Exception as e: logger.info(f"Failed to trace repo {repo} because of {e}") return None, 0, 0, 10 - - cached_stats = _existing_export_stats(dst_dir) - if cached_stats is not None: - num_premises, num_files_traced, total_theorems = cached_stats - return traced_repo, num_premises, num_files_traced, total_theorems - + safe_remove_dir(dst_dir) - + splits = split_data(traced_repo) logger.info("Successfully split the data") num_premises, num_files_traced, total_theorems = export_data( From 2f243a69c832d30af6f7af769425b183ce4f161f Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 4 Nov 2025 08:22:40 -0500 Subject: [PATCH 18/29] Instrument tracing workflow and add skip prompts --- constants.py | 3 +- generate_benchmark_lean4.py | 44 ++++++++++----- git_utils.py | 103 ++++++++++++++++++++++++++++++++---- leanagent.py | 46 ++++++++++++++-- trace_only.py | 22 ++++++++ 5 files changed, 193 insertions(+), 25 deletions(-) create mode 100644 trace_only.py diff --git a/constants.py b/constants.py index d11b663..9313943 100644 --- a/constants.py +++ b/constants.py @@ -158,6 +158,7 @@ "proost-assistant/ProostLean", "DavePearce/LeanEVM", "algebraic-dev/ash", + "google-deepmind/formal-conjectures", "FormalizedFormalLogic/Arithmetization", "cmu-l3/ntp-toolkit", "dwrensha/tryAtEachStep", @@ -241,4 +242,4 @@ # Added by Mo to find smaller repo to iterate on -] \ No 
newline at end of file +] diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py index 27056b5..7fed1d2 100644 --- a/generate_benchmark_lean4.py +++ b/generate_benchmark_lean4.py @@ -465,7 +465,19 @@ def export_data( logger.info("Successfully exported the premises") # Export metadata. - export_metadata(traced_repo, dst_path, **kwargs) + split_summary = { + strategy: {name: len(theorems) for name, theorems in split.items()} + for strategy, split in splits.items() + } + export_metadata( + traced_repo, + dst_path, + total_theorems=total_theorems, + num_premises=num_premises, + num_files_traced=num_files_traced, + split_counts=split_summary, + **kwargs, + ) logger.info("Successfully exported the metadata") return num_premises, num_files_traced, total_theorems @@ -529,17 +541,15 @@ def main(url, commit, dst_dir): logger.info("Unsupported version") v = v[1:] # ignore "v" at beginning - lean_dir2 = f"/Users/motiwari/.elan/toolchains/leanprover--lean4---{v}" - lean_dir3 = f"/Users/motiwari/.elan/toolchains/leanprover--lean4---{v}" - logger.info(f"lean path2 {lean_dir2}") - logger.info(f"lean path3 {lean_dir3}") - if not os.path.exists(lean_dir2): - logger.info(f"Lean toolchain path 2 does not exist: {lean_dir2}") - if not os.path.exists(lean_dir3): - logger.info(f"Lean toolchain path 3 does not exist: {lean_dir3}") - os.environ["LEAN4_PATH"] = lean_dir2 - os.environ["PATH"] = f"{lean_dir2}/bin:{os.environ.get('PATH', '')}" - logger.info(f"Switched to Lean toolchain at: {lean_dir2}") + elan_toolchains = Path( + os.environ.get("ELAN_TOOLCHAINS", Path.home() / ".elan" / "toolchains") + ) + lean_dir = elan_toolchains / f"leanprover--lean4---{v}" + if not lean_dir.exists(): + logger.warning(f"Lean toolchain path does not exist locally: {lean_dir}") + os.environ["LEAN4_PATH"] = str(lean_dir) + os.environ["PATH"] = f"{lean_dir}/bin:{os.environ.get('PATH', '')}" + logger.info(f"Switched to Lean toolchain at: {lean_dir}") logger.info( f"lean --version: {subprocess.run(['lean', '--version'], capture_output=True).stdout.decode('utf-8')}" @@ -553,6 +563,12 @@ def main(url, commit, dst_dir): logger.info("Tracing the repo...") traced_repo = trace(repo) logger.info("Successfully traced the repo") + traced_files_count = len(traced_repo.traced_files) + deps_count = sum(len(tf.get_premise_definitions()) for tf in traced_repo.traced_files) + logger.info( + f"Trace summary for {url}@{commit}: " + f"{traced_files_count} traced files, {deps_count} premise definitions discovered" + ) except Exception as e: logger.info(f"Failed to trace repo {repo} because of {e}") return None, 0, 0, 10 @@ -565,4 +581,8 @@ def main(url, commit, dst_dir): traced_repo, splits, dst_dir ) logger.info("Successfully exported the data") + logger.info( + f"Export summary for {url}@{commit}: " + f"{total_theorems} theorems, {num_premises} premises, {num_files_traced} traced files" + ) return traced_repo, num_premises, num_files_traced, total_theorems diff --git a/git_utils.py b/git_utils.py index 4883df1..b0e3672 100644 --- a/git_utils.py +++ b/git_utils.py @@ -15,7 +15,7 @@ from loguru import logger -from typing import Union, List, Tuple +from typing import Union, List, Tuple, Optional import math import os @@ -25,6 +25,46 @@ BATCH_SIZE = 4 from filenames import REPO_DIR, DATA_DIR +MIN_SUPPORTED_LEAN_VERSION = (4, 6, 0) +MIN_SUPPORTED_LEAN_VERSION_STR = "v4.6.0" +PAUSE_AFTER_TRACE = os.environ.get("PAUSE_AFTER_TRACE", "0") == "1" + + +def _parse_lean_version(version: str) -> Optional[Tuple[int, int, int]]: + version = 
version.lower().lstrip("v") + if not version: + return None + base = version.split("-")[0] + parts = base.split(".") + if len(parts) < 2: + return None + while len(parts) < 3: + parts.append("0") + try: + major, minor, patch = (int(parts[0]), int(parts[1]), int(parts[2])) + except ValueError: + return None + return major, minor, patch + + +def _is_supported_lean_version(version: str) -> bool: + parsed = _parse_lean_version(version) + if parsed is None: + return False + return parsed >= MIN_SUPPORTED_LEAN_VERSION + + +def _pause_after_trace(repo_url: str, status: str) -> None: + if not PAUSE_AFTER_TRACE: + return + try: + input( + f"[TRACE] {repo_url} finished with status '{status}'. " + "Press Enter to continue..." + ) + except KeyboardInterrupt: + raise + def clone_repo(repo_url): """Clone a git repository and return the path to the repository and its sha.""" @@ -360,38 +400,81 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): url = url + ".git" logger.info(f"\n\nProcessing {url}") + normalized_url = url.replace(".git", "") + sha, v = get_compatible_commit(url) if not sha: logger.info(f"Failed to find a compatible commit for {url}") - return None + status = "no_compatible_commit" + _pause_after_trace(normalized_url, status) + return status logger.info(f"Found compatible commit {sha} for {url} with lean version: {v}") + if db.get_repository(normalized_url, sha) is not None: + logger.info( + f"Repository {normalized_url}@{sha} already present in dynamic database. Skipping." + ) + status = "already_present" + _pause_after_trace(normalized_url, status) + return status + + parsed_version = _parse_lean_version(v) + if parsed_version is None or not _is_supported_lean_version(v): + logger.info( + f"Skipping {normalized_url} due to unsupported Lean toolchain {v}. 
" + f"Minimum required {MIN_SUPPORTED_LEAN_VERSION_STR}" + ) + status = "unsupported_toolchain" + _pause_after_trace(normalized_url, status) + return status + # Ensure that the repo is checked out to the compatible commit repo_name, _ = clone_repo(url) subprocess.run(["git", "-C", repo_name, "checkout", sha], check=True) logger.info(f"Checked out {url} to commit {sha}") - url = url.replace(".git", "") - repo = LeanGitRepo(url, sha) + repo = LeanGitRepo(normalized_url, sha) dir_name = repo.url.split("/")[-1] + "_" + sha dst_dir = os.path.join(DATA_DIR, dir_name) logger.info(f"Generating benchmark at {dst_dir}") - traced_repo, _, _, total_theorems = generate_benchmark_lean4.main( + traced_repo, num_premises, num_files_traced, total_theorems = generate_benchmark_lean4.main( repo.url, sha, dst_dir ) if not traced_repo: logger.info(f"Failed to trace {url}") - return None + shutil.rmtree(dst_dir, ignore_errors=True) + status = "trace_failed" + _pause_after_trace(normalized_url, status) + return status + if total_theorems is None: + logger.info(f"Trace produced no theorem count for {url}") + shutil.rmtree(dst_dir, ignore_errors=True) + status = "missing_theorem_count" + _pause_after_trace(normalized_url, status) + return status + + logger.info( + f"Trace produced {total_theorems} theorems for {url} " + f"(minimum required {3 * BATCH_SIZE})" + ) + if total_theorems < 3 * BATCH_SIZE: # Require enough theorems for train/val/test logger.info(f"Not enough theorems found in {url}") - return None - + shutil.rmtree(dst_dir, ignore_errors=True) + status = "insufficient_theorems" + _pause_after_trace(normalized_url, status) + return status + + logger.info( + f"Export includes {num_premises} premises across {num_files_traced} traced files" + ) + logger.info(f"Finished generating benchmark at {dst_dir}") # Add the new repo to the dynamic database @@ -427,7 +510,9 @@ def add_repo_to_database(dynamic_database_json_path, repo, db): db.print_database_contents() db.to_json(dynamic_database_json_path) - return "Done" + status = "success" + _pause_after_trace(normalized_url, status) + return status def calculate_difficulty(theorem: Theorem) -> Union[float, None]: """Calculates the difficulty of a theorem.""" diff --git a/leanagent.py b/leanagent.py index e911522..edaa5a1 100644 --- a/leanagent.py +++ b/leanagent.py @@ -52,6 +52,21 @@ lean_git_repos = [] repos = [] +SEED_REPOS = [ + LeanGitRepo( + "https://github.com/ImperialCollegeLondon/FLT", + "b208a302cdcbfadce33d8165f0b054bfa17e2147", + ), + LeanGitRepo( + "https://github.com/HEPLean/PhysLean", + "60f1ebc3eb015f78a3719ee4085344a600d0af50", + ), + LeanGitRepo( + "https://github.com/verse-lab/veil", + "a9fe7205c57f7b6ee8b350bfc87b9b4b28c57781", + ), +] + @contextmanager def _locked(path: str, mode: str): @@ -427,8 +442,26 @@ def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_p if is_main_process: if num_repos < 3: logger.warning("num_repos should be at least 3 for curriculum learning") - - + + failure_records: List[Tuple[str, str]] = [] + + for seed_repo in SEED_REPOS: + if db.get_repository(seed_repo.url, seed_repo.commit) is None: + logger.info( + f"Seeding database with {seed_repo.url}@{seed_repo.commit}" + ) + result = add_repo_to_database( + dynamic_database_json_path, seed_repo, db + ) + if result in ("success", "already_present"): + logger.info(f"Seeded repo {seed_repo.url}") + else: + failure_records.append((seed_repo.url, result)) + else: + logger.info( + f"Seed repository {seed_repo.url}@{seed_repo.commit} already present" + 
) + existing_repo_count = len(db.repositories) target_repo_count = max(3, num_repos) @@ -452,8 +485,10 @@ def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_p result = add_repo_to_database( dynamic_database_json_path, repo, db ) - if result is not None: + if result in ("success", "already_present"): logger.info(f"Successfully added repo {repo.url}") + else: + failure_records.append((repo.url, result)) if len(db.repositories) >= target_repo_count: break @@ -472,6 +507,11 @@ def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_p lean_git_repos, repos, "Lean", needed ) + if failure_records: + logger.warning("Tracing failures/skip summary:") + for repo_url, reason in failure_records: + logger.warning(f" {repo_url} -> {reason}") + newly_added = len(db.repositories) - existing_repo_count logger.info( f"Successfully added {newly_added} repositories to the database (total: {len(db.repositories)})" diff --git a/trace_only.py b/trace_only.py new file mode 100644 index 0000000..17c0d1b --- /dev/null +++ b/trace_only.py @@ -0,0 +1,22 @@ +from lean_dojo import LeanGitRepo +from dynamic_database import DynamicDatabase +from git_utils import add_repo_to_database +from filenames import RAID_DIR, DB_FILE_NAME +from pathlib import Path + +SEED_REPOS = [ + ("https://github.com/ImperialCollegeLondon/FLT", "b208a302cdcbfadce33d8165f0b054bfa17e2147"), + ("https://github.com/HEPLean/PhysLean", "60f1ebc3eb015f78a3719ee4085344a600d0af50"), + ("https://github.com/verse-lab/veil", "a9fe7205c57f7b6ee8b350bfc87b9b4b28c57781"), +] + +db_path = Path(RAID_DIR) / DB_FILE_NAME +if db_path.exists(): + db = DynamicDatabase.from_json(db_path) +else: + db = DynamicDatabase() + +for url, commit in SEED_REPOS: + repo = LeanGitRepo(url, commit) + print(f"Tracing {url}@{commit}") + add_repo_to_database(str(db_path), repo, db) From f8d29d6fc952566e61697933d8090cbc4834e8b4 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Thu, 6 Nov 2025 10:06:34 -0500 Subject: [PATCH 19/29] Add paper tracing scripts and enable merged dataset build --- constants.py | 73 ++++++++++++ leanagent.py | 36 ++++-- scripts/build_merged_dataset.py | 124 ++++++++++++++++++++ scripts/trace_paper_repos.py | 202 ++++++++++++++++++++++++++++++++ 4 files changed, 427 insertions(+), 8 deletions(-) create mode 100644 scripts/build_merged_dataset.py create mode 100755 scripts/trace_paper_repos.py diff --git a/constants.py b/constants.py index 9313943..a801d9b 100644 --- a/constants.py +++ b/constants.py @@ -243,3 +243,76 @@ # Added by Mo to find smaller repo to iterate on ] + +# Repos that appear in the paper – trace these first, in this order. 
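The toolchain gate added to git_utils.py above turns strings such as "v4.9.0-rc1" into integer tuples and compares them against MIN_SUPPORTED_LEAN_VERSION. A self-contained illustration of that comparison, re-implemented here as a sketch rather than importing the repository helper:

from typing import Optional, Tuple

MIN_SUPPORTED = (4, 6, 0)  # mirrors MIN_SUPPORTED_LEAN_VERSION above

def parse_lean_version(version: str) -> Optional[Tuple[int, int, int]]:
    # Accept forms like "v4.9.0" or "4.6.0-rc1": drop the leading "v" and
    # any pre-release suffix, then pad to (major, minor, patch).
    base = version.lower().lstrip("v").split("-")[0]
    parts = base.split(".")
    if len(parts) < 2:
        return None
    parts += ["0"] * (3 - len(parts))
    try:
        major, minor, patch = (int(parts[0]), int(parts[1]), int(parts[2]))
    except ValueError:
        return None
    return major, minor, patch

# Tuple comparison is lexicographic, so "v4.10.0" correctly sorts above "v4.6.0".
for candidate in ["v4.9.0-rc1", "v4.5.0", "4.10.0", "nightly"]:
    parsed = parse_lean_version(candidate)
    print(candidate, parsed, parsed is not None and parsed >= MIN_SUPPORTED)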
+PAPER_REPOS = [ + {"url": "https://github.com/leanprover-community/PFR", + "commit": "fa398a5b853c7e94e3294c45e50c6aee013a2687"}, + + {"url": "https://github.com/leanprover-community/hairy-ball-theorem", + "commit": "a778826d19c8a7ddf1d26beeea628c45450612e6"}, + + {"url": "https://github.com/leanprover-community/coxeter", + "commit": "96af8aee7943ca8685ed1b00cc83a559ea389a97"}, + + {"url": "https://github.com/avigad/mathematics_in_lean_source", + "commit": "5297e0fb051367c48c0a084411853a576389ecf5"}, + + {"url": "https://github.com/leanprover-community/formal-book", + "commit": "6fbe8c2985008c0bfb30050750a71b90388ad3a3"}, + + {"url": "https://github.com/yangky11/miniF2F-lean4", + "commit": "9e445f5435407f014b88b44a98436d50dd7abd00"}, + + {"url": "https://github.com/lecopivo/SciLean", + "commit": "22d53b2f4e3db2a172e71da6eb9c916e62655744"}, + + {"url": "https://github.com/leanprover-community/carleson", + "commit": "bec7808b907190882fa1fa54ce749af297c6cf37"}, + + {"url": "https://github.com/leanprover-community/lean4-pdl", + "commit": "c7f649fe3c4891cf1a01c120e82ebc5f6199856e"}, + + {"url": "https://github.com/AlexKontorovich/PrimeNumberTheoremAnd", + "commit": "29baddd685660b5fedd7bd67f9916ae24253d566"}, + + {"url": "https://github.com/dwrensha/compfiles", + "commit": "f99bf6f2928d47dd1a445b414b3a723c2665f091"}, + + {"url": "https://github.com/ImperialCollegeLondon/FLT", + "commit": "b208a302cdcbfadce33d8165f0b054bfa17e2147"}, + + {"url": "https://github.com/TODO/debate", + "commit": "7fb39251b705797ee54e08c96177fabd29a5b5a3"}, + + {"url": "https://github.com/TODO/lean4lean", + "commit": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f"}, + + {"url": "https://github.com/eric-wieser/lean-matrix-cookbook", + "commit": "f15a149d321ac99ff9b9c024b58e7882f564669f"}, + + {"url": "https://github.com/TODO/math-workshop", + "commit": "5acd4b933d47fd6c1032798a6046c1baf261445d"}, + + {"url": "https://github.com/TODO/LeanEuclid", + "commit": "f1912c3090eb82820575758efc31e40b9db86bb8"}, + + {"url": "https://github.com/FormalizedFormalLogic/Foundation", + "commit": "d5fe5d057a90a0703a745cdc318a1b6621490c21"}, + + {"url": "https://github.com/TODO/Con-nf", + "commit": "00bdc85ba7d486a9e544a0806a1018dd06fa3856"}, + + {"url": "https://github.com/TODO/Saturn", + "commit": "3811a9dd46cdfd5fa0c0c1896720c28d2ec4a42a"}, + + {"url": "https://github.com/ahhwuhu/zeta_3_irrational", + "commit": "914712200e463cfc97fe37e929d518dd58806a38"}, + + {"url": "https://github.com/TODO/Formalization-of-Constructable-Numbers", + "commit": "01ef1f22a04f2ba8081c5fb29413f515a0e52878"}, + + {"url": "https://github.com/LeanAPAP/LeanAPAP", + "commit": "951c660a8d7ba8e39f906fdf657674a984effa8b"}, +] + diff --git a/leanagent.py b/leanagent.py index edaa5a1..8fb50a0 100644 --- a/leanagent.py +++ b/leanagent.py @@ -70,7 +70,9 @@ @contextmanager def _locked(path: str, mode: str): - os.makedirs(os.path.dirname(path), exist_ok=True) + directory = os.path.dirname(path) + if directory: + os.makedirs(directory, exist_ok=True) with open(path, mode) as handle: fcntl.flock(handle.fileno(), fcntl.LOCK_EX) try: @@ -87,9 +89,27 @@ def read_json_locked(path: str): return json.load(handle) -def write_json_locked(path: str, obj) -> None: +def write_json_locked( + path: str, + obj, + *, + indent: int = 2, + ensure_ascii: bool = False, + sort_keys: bool = False, +) -> None: with _locked(path, "w") as handle: - json.dump(obj, handle, indent=2, sort_keys=True) + json.dump( + obj, + handle, + indent=indent, + ensure_ascii=ensure_ascii, + sort_keys=sort_keys, + ) + 
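The _locked / read_json_locked / write_json_locked helpers above guard shared JSON files with an exclusive fcntl.flock so concurrent workers do not interleave reads and writes. A minimal standalone sketch of the same pattern, using a hypothetical progress file rather than any of the repository's real paths:

import fcntl
import json
import os
from contextlib import contextmanager

@contextmanager
def locked(path: str, mode: str):
    # Hold an exclusive lock for the lifetime of the handle; flush and fsync
    # before releasing when the file was opened for writing.
    with open(path, mode) as handle:
        fcntl.flock(handle.fileno(), fcntl.LOCK_EX)
        try:
            yield handle
        finally:
            if any(flag in mode for flag in ("w", "a", "+")):
                handle.flush()
                os.fsync(handle.fileno())
            fcntl.flock(handle.fileno(), fcntl.LOCK_UN)

progress_file = "progress.json"  # hypothetical file, for illustration only
if not os.path.exists(progress_file):
    with locked(progress_file, "w") as f:
        json.dump({"attempts": 0}, f)

# Read-modify-write under a single lock so no other process sees a half-update.
with locked(progress_file, "r+") as f:
    state = json.load(f)
    state["attempts"] += 1
    f.seek(0)
    f.truncate()
    json.dump(state, f, indent=2)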
+ +def save_database_locked(db: DynamicDatabase, path: str) -> None: + """Persist the dynamic database safely across processes.""" + write_json_locked(path, db.to_dict(), ensure_ascii=False) def _eval(data, preds_map) -> Tuple[float, float, float]: @@ -222,7 +242,7 @@ def process_theorem_batch( else: logger.warning(f"Unexpected result type") - db.to_json(dynamic_database_json_path) + save_database_locked(db, dynamic_database_json_path) def save_progress(all_encountered_theorems): @@ -414,7 +434,7 @@ def initialize_database(dynamic_database_json_path: str) -> DynamicDatabase: f"\nInitializing new database at {dynamic_database_json_path}\n" ) db = DynamicDatabase() - db.to_json(dynamic_database_json_path) + save_database_locked(db, dynamic_database_json_path) else: try: logger.info(f"Loading database from {dynamic_database_json_path}") @@ -426,7 +446,7 @@ def initialize_database(dynamic_database_json_path: str) -> DynamicDatabase: f"Error decoding JSON from {dynamic_database_json_path}. Initializing new database." ) db = DynamicDatabase() - db.to_json(dynamic_database_json_path) + save_database_locked(db, dynamic_database_json_path) return db @@ -522,7 +542,7 @@ def get_repos(curriculum_learning: bool, num_repos: int, dynamic_database_json_p ) print("Sorted repositories. Saving now...") - db.to_json(dynamic_database_json_path) + save_database_locked(db, dynamic_database_json_path) save_sorted_repos(sorted_repos, "sorted_repos.json") print("Summary of theorem difficulties by URL:") @@ -982,7 +1002,7 @@ def main(): prove_sorry_theorems( db, prover, dynamic_database_json_path, repos_for_proving ) - db.to_json(dynamic_database_json_path) + save_database_locked(db, dynamic_database_json_path) logger.info("Finished searching for proofs of sorry theorems") diff --git a/scripts/build_merged_dataset.py b/scripts/build_merged_dataset.py new file mode 100644 index 0000000..c1879f4 --- /dev/null +++ b/scripts/build_merged_dataset.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +Build a merged dataset from already-traced repos. + +This script: + - Loads/creates the dynamic database at RAID/db_file.txt + - For each non-empty corpus under RAID/data/_/corpus.jsonl, + it infers (repo_url, commit) from the first line and adds the repo to the DB + using git_utils.add_repo_to_database (reuses LeanDojo caches if present). + - Exports a merged dataset to RAID/data/merged_paper_subset. 
+ +Run from repo root: + export RAID_DIR="$PWD/RAID" + export REPO_DIR="$RAID_DIR/repos" + python LeanAgent/scripts/build_merged_dataset.py +""" + +from __future__ import annotations + +import json +import os +from pathlib import Path + +from loguru import logger +import sys + +# Ensure repo modules are importable when running as a script +HERE = Path(__file__).resolve() +REPO_ROOT = HERE.parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from lean_dojo import LeanGitRepo # noqa: E402 +from dynamic_database import DynamicDatabase # noqa: E402 +from filenames import RAID_DIR, DB_FILE_NAME # noqa: E402 +from git_utils import add_repo_to_database # noqa: E402 +from scripts.trace_paper_repos import PAPER_REPOS # noqa: E402 + + +def iter_nonempty_corpora(data_root: Path): + for d in sorted(data_root.iterdir()): + cj = d / "corpus.jsonl" + if not cj.exists() or cj.stat().st_size == 0: + continue + yield d, cj + + +_SLUG_TO_URL = { + item["name"]: f"https://github.com/{item['owner']}/{item['name']}" + for item in PAPER_REPOS +} + + +def _infer_repo_from_dir(dir_path: Path) -> tuple[str, str]: + name = dir_path.name + if "_" not in name: + raise ValueError(f"Directory name {name} does not contain commit suffix") + slug, commit = name.rsplit("_", 1) + if len(commit) != 40: + raise ValueError(f"Directory {name} missing 40-char commit suffix") + url = _SLUG_TO_URL.get(slug) + if not url: + raise ValueError(f"Unknown repo slug '{slug}'. Please add it to PAPER_REPOS.") + return url, commit + + +def load_repo_from_corpus(corpus_path: Path) -> tuple[str, str]: + with corpus_path.open() as f: + first = f.readline() + url = commit = "" + if first: + try: + meta = json.loads(first) + url = meta.get("repo_url") or "" + commit = meta.get("commit") or "" + except Exception: + pass + if url and commit: + return url, commit + return _infer_repo_from_dir(corpus_path.parent) + + +def main() -> None: + raid_dir = Path(RAID_DIR) + db_path = raid_dir / DB_FILE_NAME + + db: DynamicDatabase + if not db_path.exists() or db_path.stat().st_size == 0: + logger.info(f"Initializing new database at {db_path}") + db = DynamicDatabase() + db.to_json(str(db_path)) + else: + logger.info(f"Loading database from {db_path}") + db = DynamicDatabase.from_json(str(db_path)) + + # Add repos discovered from existing corpora + data_root = raid_dir / "data" + targets = [] + for d, cj in iter_nonempty_corpora(data_root): + try: + url, commit = load_repo_from_corpus(cj) + targets.append((url, commit)) + except Exception as e: + logger.warning(f"Skipping {d} due to: {e}") + + logger.info(f"Found {len(targets)} repos with non-empty corpora to ingest") + + for url, commit in targets: + repo = LeanGitRepo(url, commit) + logger.info(f"Ingesting {url}@{commit}") + status = add_repo_to_database(str(db_path), repo, db) + logger.info(f"Status for {url}: {status}") + + # Export merged dataset + out_dir = raid_dir / "data" / "merged_paper_subset" + logger.info(f"Generating merged dataset at {out_dir}") + db.generate_merged_dataset(out_dir) + logger.info("DONE.") + + +if __name__ == "__main__": + if not os.environ.get("RAID_DIR"): + raise SystemExit("Please set RAID_DIR and REPO_DIR before running.") + main() diff --git a/scripts/trace_paper_repos.py b/scripts/trace_paper_repos.py new file mode 100755 index 0000000..8d33d0a --- /dev/null +++ b/scripts/trace_paper_repos.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python3 +""" +Trace the fixed set of paper repos and materialize corpus.jsonl +next to RAID/data/_/, mirroring the 
manual veil flow. + +Run from repo root: + export RAID_DIR="$PWD/RAID" + export REPO_DIR="$RAID_DIR/repos" + python scripts/trace_paper_repos.py +""" + +import os +import json +import pathlib + +from lean_dojo import LeanGitRepo +from lean_dojo.data_extraction.trace import get_traced_repo_path + + +# hardcoded list reconstructed from the paper / convo +PAPER_REPOS = [ + # 1. teorth/pfr + # { + # "owner": "teorth", + # "name": "pfr", + # "sha": "fa398a5b853c7e94e3294c45e50c6aee013a2687", + # }, + # 2. avigad/mathematics_in_lean_source + { + "owner": "avigad", + "name": "mathematics_in_lean_source", + "sha": "5297e0fb051367c48c0a084411853a576389ecf5", + }, + { + "owner": "verse-lab", + "name": "veil", + "sha": "a9fe7205c57f7b6ee8b350bfc87b9b4b28c57781", + }, + # 3. miniF2F + { + "owner": "yangky11", + "name": "miniF2F-lean4", + "sha": "9e445f5435407f014b88b44a98436d50dd7abd00", + }, + # 4. SciLean (in paper → we must make it work eventually) + # { + # "owner": "lecopivo", + # "name": "SciLean", + # "sha": "22d53b2f4e3db2a172e71da6eb9c916e62655744", + # }, + # 5. teorth/lean4-pdl + { + "owner": "teorth", + "name": "lean4-pdl", + "sha": "c7f649fe3c4891cf1a01c120e82ebc5f6199856e", + }, + # 6. prime number theorem notes + { + "owner": "AlexKontorovich", + "name": "PrimeNumberTheoremAnd", + "sha": "29baddd685660b5fedd7bd67f9916ae24253d566", + }, + # 7. compfiles + { + "owner": "dwrensha", + "name": "compfiles", + "sha": "f99bf6f2928d47dd1a445b414b3a723c2665f091", + }, + # 8. FLT + { + "owner": "ImperialCollegeLondon", + "name": "FLT", + "sha": "b208a302cdcbfadce33d8165f0b054bfa17e2147", + }, + { + "owner": "verse-lab", + "name": "veil", + "sha": "a9fe7205c57f7b6ee8b350bfc87b9b4b28c57781", + }, + # 9. lean4-cli (paper mentions tooling repos; we saw this in your crawl) + { + "owner": "leanprover-community", + "name": "lean4-cli", + "sha": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f", + }, + # 10. matrix cookbook + { + "owner": "eric-wieser", + "name": "lean-matrix-cookbook", + "sha": "f15a149d321ac99ff9b9c024b58e7882f564669f", + }, + # 11. LeanEuclid + { + "owner": "loganrjmurphy", + "name": "LeanEuclid", + "sha": "f1912c3090eb82820575758efc31e40b9db86bb8", + }, + # 12. formalized logic foundation + { + "owner": "FormalizedFormalLogic", + "name": "Foundation", + "sha": "d5fe5d057a90a0703a745cdc318a1b6621490c21", + }, + # 13. con-nf + { + "owner": "pengbaolin", + "name": "con-nf", + "sha": "00bdc85ba7d486a9e544a0806a1018dd06fa3856", + }, + # 14. zeta_3_irrational + { + "owner": "ahhwuhu", + "name": "zeta_3_irrational", + "sha": "914712200e463cfc97fe37e929d518dd58806a38", + }, + # 15. LeanAPAP + { + "owner": "judicael-pvt", + "name": "LeanAPAP", + "sha": "951c660a8d7ba8e39f906fdf657674a984effa8b", + }, + # paper had a few that we couldn't map to GH — keep extensible +] + + +def make_corpus_from_repo(source_root: pathlib.Path, out_dir: pathlib.Path, url: str, commit: str) -> int: + """Scan .lake/build/ir for *.ast.json and write corpus.jsonl.""" + ir_root = source_root / ".lake" / "build" / "ir" + if not ir_root.exists(): + print(f" !! 
no .lake/build/ir in {source_root}, skipping corpus.jsonl") + return 0 + + recs = [] + for p in ir_root.rglob("*.ast.json"): + recs.append( + { + "repo_url": url, + "commit": commit, + "ast_path": str(p.relative_to(source_root)), + } + ) + + out_dir.mkdir(parents=True, exist_ok=True) + out_file = out_dir / "corpus.jsonl" + with out_file.open("w") as f: + for r in recs: + f.write(json.dumps(r) + "\n") + print(f" wrote {len(recs)} records to {out_file}") + return len(recs) + + +def main() -> None: + raid_dir = os.environ.get("RAID_DIR") + repo_dir = os.environ.get("REPO_DIR") + + if not raid_dir or not repo_dir: + raise SystemExit("Please set RAID_DIR and REPO_DIR before running.") + + raid_dir = pathlib.Path(raid_dir) + repo_dir = pathlib.Path(repo_dir) + + for item in PAPER_REPOS: + url = f"https://github.com/{item['owner']}/{item['name']}" + commit = item["sha"] + + print(f"\n=== tracing {url}@{commit} ===") + try: + repo = LeanGitRepo(url, commit) + traced_path = get_traced_repo_path(repo, build_deps=False) + traced_path = pathlib.Path(traced_path) + print(f" lean_dojo traced into cache: {traced_path}") + except Exception as e: + print(f" !! lean_dojo failed for {url}@{commit}: {e}") + continue + + # repo as checked out by the earlier crawl + repo_root = repo_dir / item["owner"] / item["name"] + out_dir = raid_dir / "data" / f"{item['name']}_{commit}" + + if not repo_root.exists(): + print(f" !! repo root {repo_root} not found — was it cloned under RAID/repos/?") + + sources = [traced_path] + if repo_root.exists(): + sources.append(repo_root) + + exported = 0 + for src in sources: + exported = make_corpus_from_repo(src, out_dir, url, commit) + if exported > 0: + break + + if exported > 0: + print(f" ✅ exported corpus for {item['name']} ({exported} files)") + else: + print(f" ⚠ traced but no IR — likely a build/env issue for this repo") + + print("\nDONE.") + + +if __name__ == "__main__": + main() From a9d441c2cbd50099ad1ab960103b47aa7ac5ef53 Mon Sep 17 00:00:00 2001 From: motiwari Date: Thu, 18 Sep 2025 11:13:37 -0700 Subject: [PATCH 20/29] Initial commit of changes --- __init__.py | 36 + constants.py | 102 ++ container.py | 369 +++++++ data_extraction/ExtractData.lean | 530 +++++++++ data_extraction/ast.py | 1576 +++++++++++++++++++++++++++ data_extraction/build_lean4_repo.py | 214 ++++ data_extraction/cache.py | 107 ++ data_extraction/lean.py | 702 ++++++++++++ data_extraction/trace.py | 135 +++ data_extraction/traced_data.py | 1224 +++++++++++++++++++++ interaction/Lean4Repl.lean | 357 ++++++ interaction/dojo.py | 549 ++++++++++ interaction/parse_goals.py | 69 ++ utils.py | 314 ++++++ 14 files changed, 6284 insertions(+) create mode 100644 __init__.py create mode 100644 container.py create mode 100644 data_extraction/ExtractData.lean create mode 100644 data_extraction/ast.py create mode 100644 data_extraction/build_lean4_repo.py create mode 100644 data_extraction/cache.py create mode 100644 data_extraction/lean.py create mode 100644 data_extraction/trace.py create mode 100644 data_extraction/traced_data.py create mode 100644 interaction/Lean4Repl.lean create mode 100644 interaction/dojo.py create mode 100644 interaction/parse_goals.py create mode 100644 utils.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..8e8f7b3 --- /dev/null +++ b/__init__.py @@ -0,0 +1,36 @@ +import os +from loguru import logger + +from .data_extraction.trace import ( + trace, + get_traced_repo_path, + is_available_in_cache, +) + +from .data_extraction.traced_data import ( + 
TracedRepo, + TracedFile, + TracedTheorem, + TracedTactic, +) +from .interaction.dojo import ( + CommandState, + TacticState, + LeanError, + TimeoutError, + TacticResult, + DojoCrashError, + DojoHardTimeoutError, + DojoInitError, + Dojo, + ProofFinished, + ProofGivenUp, +) +from .interaction.parse_goals import Declaration, Goal, parse_goals +from .data_extraction.lean import get_latest_commit, LeanGitRepo, LeanFile, Theorem, Pos +from .constants import __version__ + +if os.geteuid() == 0: + logger.warning( + "Running LeanDojo as the root user may cause unexpected issues. Proceed with caution." + ) diff --git a/constants.py b/constants.py index a801d9b..01061a3 100644 --- a/constants.py +++ b/constants.py @@ -316,3 +316,105 @@ "commit": "951c660a8d7ba8e39f906fdf657674a984effa8b"}, ] +"""Constants controlling LeanDojo's behaviors. +Many of them are configurable via :ref:`environment-variables`. +""" + +import os +import re +import sys +import subprocess +import multiprocessing +from pathlib import Path +from typing import Tuple +from loguru import logger +from dotenv import load_dotenv + +load_dotenv() + +__version__ = "1.9.0" + +logger.remove() +if "VERBOSE" in os.environ or "DEBUG" in os.environ: + logger.add(sys.stderr, level="DEBUG") +else: + logger.add(sys.stderr, level="INFO") + +CACHE_DIR = ( + Path(os.environ["CACHE_DIR"]) + if "CACHE_DIR" in os.environ + else Path.home() / ".cache/lean_dojo" +).absolute() +"""Cache directory for storing traced repos (see :ref:`caching`). +""" + +REMOTE_CACHE_URL = "https://lean-dojo.s3.amazonaws.com" +"""URL of the remote cache (see :ref:`caching`).""" + +DISABLE_REMOTE_CACHE = "DISABLE_REMOTE_CACHE" in os.environ +"""Whether to disable remote caching (see :ref:`caching`) and build all repos locally. +""" + +TMP_DIR = Path(os.environ["TMP_DIR"]).absolute() if "TMP_DIR" in os.environ else None +"""Temporary directory used by LeanDojo for storing intermediate files +""" + +MAX_NUM_PROCS = 32 + +NUM_PROCS = int(os.getenv("NUM_PROCS", min(multiprocessing.cpu_count(), MAX_NUM_PROCS))) +"""Number of threads to use +""" + +NUM_WORKERS = NUM_PROCS - 1 + +LEAN4_URL = "https://github.com/leanprover/lean4" +"""The URL of the Lean 4 repo.""" + +LEAN4_PACKAGES_DIR = Path(".lake/packages") +"""The directory where Lean 4 dependencies are stored (since v4.3.0-rc2).""" + +LOAD_USED_PACKAGES_ONLY = "LOAD_USED_PACKAGES_ONLY" in os.environ +"""Only load depdendency files that are actually used by the target repo.""" + +LEAN4_BUILD_DIR = Path(".lake/build") + +TACTIC_CPU_LIMIT = int(os.getenv("TACTIC_CPU_LIMIT", 1)) +"""Number of CPUs for executing tactics when interacting with Lean (only useful when running within Docker). +""" + +TACTIC_MEMORY_LIMIT = os.getenv("TACTIC_MEMORY_LIMIT", "32g") +"""Maximum memory when interacting with Lean (only useful when running within Docker). +""" + +CONTAINER = os.getenv("CONTAINER", "native") +"""Container to use for running LeanDojo. Default to ``native`` but also support ``docker``. Using ``docker`` is recommended for Lean 3. +""" + +DOCKER_AVAILABLE = os.system("docker version 1>/dev/null 2>/dev/null") == 0 + +DOCKER_TAG = "yangky11/lean-dojo" + +if CONTAINER == "docker": + assert ( + DOCKER_AVAILABLE + ), "Failed to access Docker. Please make sure Docker is running and you have access. Alternatively, you can try to run without Docker by setting the `CONTAINER` environment variable to `native` (see https://leandojo.readthedocs.io/en/latest/user-guide.html#advanced-running-within-docker)." 
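The vendored __init__.py above re-exports the tracing entry points (LeanGitRepo, trace, is_available_in_cache, ...) that the rest of the pipeline calls. A brief usage sketch of that surface; the repository coordinates are copied from the paper list above, and tracing still requires the matching Lean toolchain to be available locally:

from lean_dojo import LeanGitRepo, trace, is_available_in_cache

repo = LeanGitRepo(
    "https://github.com/yangky11/miniF2F-lean4",
    "9e445f5435407f014b88b44a98436d50dd7abd00",
)
if is_available_in_cache(repo):
    print("already traced; the cached copy will be reused")
traced_repo = trace(repo)
print(len(traced_repo.traced_files), "files traced")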
+ os.system(f"docker pull {DOCKER_TAG} 1>/dev/null 2>/dev/null") + + +def check_git_version(min_version: Tuple[int, int, int]) -> Tuple[int, int, int]: + """Check the version of Git installed on the system.""" + res = subprocess.run("git --version", shell=True, capture_output=True, check=True) + output = res.stdout.decode() + error = res.stderr.decode() + assert error == "", error + m = re.match(r"git version (?P[0-9.]+)", output) + version = tuple(int(_) for _ in m["version"].split(".")) + + version_str = ".".join(str(_) for _ in version) + min_version_str = ".".join(str(_) for _ in min_version) + assert ( + version >= min_version + ), f"Git version {version_str} is too old. Please upgrade to at least {min_version_str}." + + +check_git_version((2, 25, 0)) diff --git a/container.py b/container.py new file mode 100644 index 0000000..af9f25a --- /dev/null +++ b/container.py @@ -0,0 +1,369 @@ +"""Containers provide runtime environment for running LeanDojo. +Currently, LeanDojo supports two types of containers: ``docker`` and ``native``. +The former is the default and recommended option, while the latter is experimental. +""" + +import os +import shlex +import signal +import shutil +import tempfile +import subprocess +from pathlib import Path +from loguru import logger +from dataclasses import dataclass +from abc import ABC, abstractmethod +from typing import List, Dict, Union, Tuple, Optional + +from .constants import CONTAINER, DOCKER_TAG +from .utils import execute, report_critical_failure, working_directory + + +@dataclass(frozen=True) +class Mount: + """A mount is a pair of source and destination paths.""" + + src: Path + dst: Path + + def __post_init__(self): + object.__setattr__(self, "src", Path(self.src)) + object.__setattr__(self, "dst", Path(self.dst)) + + def __iter__(self): + yield self.src + yield self.dst + + +def create_mounts(mts: Dict[Union[str, Path], Union[str, Path]]) -> List[Mount]: + """Create a list of mounts from a dictionary.""" + return [Mount(Path(k), Path(v)) for k, v in mts.items()] + + +class Container(ABC): + """Abstract base class for containers.""" + + @abstractmethod + def run( + self, + command: str, + mounts: List[Mount], + envs: Dict[str, str], + as_current_user: bool, + capture_output: bool, + cpu_limit: Optional[int], + memory_limit: Optional[str], + work_dir: Optional[str], + ) -> None: + """Run a command in the container. + + Args: + command (str): _description_ + mounts (List[Mount]): _description_ + envs (Dict[str, str]): _description_ + as_current_user (bool): _description_ + capture_output (bool): _description_ + cpu_limit (Optional[int]): _description_ + memory_limit (Optional[str]): _description_ + work_dir (Optional[str]): _description_ + """ + raise NotImplementedError + + @abstractmethod + def run_interactive( + self, + command: str, + mounts: List[Mount], + envs: Dict[str, str], + as_current_user: bool, + cpu_limit: Optional[int], + memory_limit: Optional[str], + work_dir: Optional[str], + ) -> subprocess.Popen: + """Run a command in the container interactively. 
+ + Args: + command (str): _description_ + mounts (List[Mount]): _description_ + envs (Dict[str, str]): _description_ + as_current_user (bool): _description_ + cpu_limit (Optional[int]): _description_ + memory_limit (Optional[str]): _description_ + work_dir (Optional[str]): _description_ + + Returns: + subprocess.Popen: _description_ + """ + raise NotImplementedError + + +def _copy_file_or_dir(src: Path, dst: Path, delete_existing: bool = False) -> None: + if src.is_file(): + shutil.copy(src, dst) + else: + assert src.is_dir() and not src.is_relative_to(dst) + + # Modified by motiwari so as not to delete existing repos while mounting + if not delete_existing: + if not dst.exists(): + shutil.copytree(src, dst, symlinks=True) + return + if dst.exists() and delete_existing: + shutil.rmtree(dst) + shutil.copytree(src, dst, symlinks=True) + + +class NativeContainer(Container): + """A container that runs commands natively.""" + + def _mount_files(self, mounts: List[Mount]) -> None: + cwd = Path.cwd() + import ipdb; ipdb.set_trace() + for src, dst in mounts: + if dst.is_absolute(): + dst = cwd / dst.relative_to(dst.root) + if src == cwd: + for path in src.glob("*"): + p = dst / path.relative_to(src) + p.parent.mkdir(parents=True, exist_ok=True) + _copy_file_or_dir(path, p) + continue + assert not cwd.is_relative_to(src) + dst.parent.mkdir(parents=True, exist_ok=True) + _copy_file_or_dir(src, dst) + + def _unmount_files(self, mounts: List[Mount]) -> None: + cwd = Path.cwd() + + for src, dst in mounts: + if dst.is_absolute(): + dst = cwd / dst.relative_to(dst.root) + + if dst.exists(): + if src.is_file(): + shutil.move(dst, src) + elif dst.is_relative_to(src): + for path in dst.glob("*"): + p = src / path.relative_to(dst) + p.parent.mkdir(parents=True, exist_ok=True) + _copy_file_or_dir(path, p) + shutil.rmtree(dst) + else: + with report_critical_failure( + f"Failed to override the directory {src}" + ): + shutil.rmtree(src) + shutil.move(dst, src) + + for path in dst.parents: + if ( + path.exists() + and path.is_relative_to(cwd) + and len(list(path.glob("**/*"))) == 0 + ): + path.rmdir() + + def _build_native_command(self, command: str, envs: Dict[str, str]) -> str: + if len(envs) == 0: + return command + else: + return " ".join(f"{k}={v}" for k, v in envs.items()) + " " + command + + def run( + self, + command: str, + mounts: List[Mount] = [], + envs: Dict[str, str] = {}, + as_current_user: bool = True, + capture_output: bool = False, + cpu_limit: Optional[int] = None, + memory_limit: Optional[str] = None, + work_dir: Union[Path, str, None] = None, + ) -> None: + assert as_current_user, "NativeContainer can only run as the current user." + assert memory_limit is None, "NativeContainer does not support memory limit." + assert cpu_limit is None, "NativeContainer does not support CPU limit." 
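NativeContainer above emulates bind mounts by copying files into the working directory and turns environment variables into a "K=V" prefix on the command line. A small usage sketch of create_mounts and _build_native_command, assuming this container module is importable; the paths and environment variable are illustrative only:

# Assuming: from container import create_mounts, NativeContainer
mounts = create_mounts({"/data/traced_repo": "/workspace/traced_repo"})
src, dst = mounts[0]  # Mount is iterable and yields (src, dst) as Path objects
print(src, "->", dst)

container = NativeContainer()
cmd = container._build_native_command("lake build", {"ELAN_HOME": "/tmp/elan"})
print(cmd)  # -> "ELAN_HOME=/tmp/elan lake build"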
+ + import ipdb; ipdb.set_trace() + self._mount_files(mounts) + + cmd = self._build_native_command(command, envs) + logger.debug(cmd) + + if work_dir is None: + work_dir = Path.cwd() + else: + work_dir = Path(work_dir) + if work_dir.is_absolute(): + work_dir = Path.cwd() / work_dir.relative_to(work_dir.root) + + with working_directory(work_dir): + execute(cmd, capture_output=capture_output) + + self._unmount_files(mounts) + + def run_interactive( + self, + command: str, + mounts: List[Mount] = [], + envs: Dict[str, str] = {}, + as_current_user: bool = True, + cpu_limit: Optional[int] = None, + memory_limit: Optional[str] = None, + work_dir: Optional[str] = None, + ) -> subprocess.Popen: + assert as_current_user, "NativeContainer can only run as the current user." + + self._mount_files(mounts) + self.mounts = mounts + + cmd = self._build_native_command(command, envs) + logger.debug(cmd) + + if work_dir is None: + work_dir = Path.cwd() + else: + work_dir = Path(work_dir) + if work_dir.is_absolute(): + work_dir = Path.cwd() / work_dir.relative_to(work_dir.root) + + with working_directory(work_dir): + proc = subprocess.Popen( + shlex.split(cmd), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + encoding="utf-8", + bufsize=1, + ) + + return proc + + def cleanup(self) -> None: + self._unmount_files(self.mounts) + + +class DockerContainer(Container): + """A container that runs commands in a Docker container.""" + + def __init__(self, image: str) -> None: + self.image = image + self.cid_file = None + + def _build_docker_command( + self, + command: str, + mounts: List[Mount], + envs: Dict[str, str], + as_current_user: bool, + cpu_limit: Optional[int] = None, + memory_limit: Optional[str] = None, + work_dir: Optional[str] = None, + interactive: bool = False, + ) -> Tuple[str, Path]: + cid_file = Path(next(tempfile._get_candidate_names()) + ".cid") + cmd = f"docker run --cidfile {cid_file} --rm" + if as_current_user: + cmd += f" -u {os.getuid()}" + for src, dst in mounts: + cmd += f' --mount type=bind,src="{src}",target="{dst}"' + for k, v in envs.items(): + cmd += f" --env {k}={v}" + if cpu_limit: + cmd += f" --cpus {cpu_limit}" + if memory_limit: + cmd += f" --memory {memory_limit}" + if work_dir: + cmd += f" --workdir {work_dir}" + if interactive: + cmd += " -i" + cmd += f" {self.image} {command}" + return cmd, cid_file + + def run( + self, + command: str, + mounts: List[Mount] = [], + envs: Dict[str, str] = {}, + as_current_user: bool = True, + capture_output: bool = False, + cpu_limit: Optional[int] = None, + memory_limit: Optional[str] = None, + work_dir: Optional[str] = None, + ) -> None: + cmd, cid_file = self._build_docker_command( + command, + mounts, + envs, + as_current_user, + cpu_limit, + memory_limit, + work_dir, + interactive=False, + ) + logger.debug(cmd) + + def _exit_gracefully(signum, frame): + cid = open(cid_file).read().strip() + execute(f"docker stop -t 1 {cid}", capture_output=True) + raise RuntimeError(f"Failed to execute {cmd}") + + old_sigint = signal.signal(signal.SIGINT, _exit_gracefully) + old_sigterm = signal.signal(signal.SIGTERM, _exit_gracefully) + + execute(cmd, capture_output=capture_output) + + signal.signal(signal.SIGINT, old_sigint) + signal.signal(signal.SIGTERM, old_sigterm) + if cid_file.exists(): + cid_file.unlink() + + def run_interactive( + self, + command: str, + mounts: List[Mount] = [], + envs: Dict[str, str] = {}, + as_current_user: bool = False, + cpu_limit: Optional[int] = None, + 
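# Roughly, the command assembled by `_build_docker_command` above looks like the
# following for mounts={"/host/repo": "/workspace"}, envs={"GIT_LFS_SKIP_SMUDGE": "1"},
# cpu_limit=4, memory_limit="16g", work_dir="/workspace" (cid file name is random,
# all values illustrative; "-i" is appended only for interactive runs):
#
#   docker run --cidfile <tmp>.cid --rm -u <uid> \
#       --mount type=bind,src="/host/repo",target="/workspace" \
#       --env GIT_LFS_SKIP_SMUDGE=1 --cpus 4 --memory 16g --workdir /workspace \
#       -i <image> <command>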
memory_limit: Optional[str] = None, + work_dir: Optional[str] = None, + ) -> subprocess.Popen: + cmd, self.cid_file = self._build_docker_command( + command, + mounts, + envs, + as_current_user, + cpu_limit, + memory_limit, + work_dir, + interactive=True, + ) + logger.debug(cmd) + proc = subprocess.Popen( + shlex.split(cmd), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + encoding="utf-8", + bufsize=1, + ) + return proc + + def cleanup(self) -> None: + # Cannot use `self.proc.terminate()` to stop Docker since it may be running as root. + if self.cid_file is None or not self.cid_file.exists(): + return + cid = self.cid_file.open().read().strip() + os.system(f"docker stop -t 1 {cid} 1>/dev/null 2>/dev/null") + + +def get_container() -> Container: + if CONTAINER == "docker": + return DockerContainer(DOCKER_TAG) + else: + assert ( + CONTAINER == "native" + ), "Currently only `docker` and `native` are supported." + return NativeContainer() diff --git a/data_extraction/ExtractData.lean b/data_extraction/ExtractData.lean new file mode 100644 index 0000000..d161e79 --- /dev/null +++ b/data_extraction/ExtractData.lean @@ -0,0 +1,530 @@ +import Lean +import Lake + + +open Lean Elab System + +set_option maxHeartbeats 2000000 -- 10x the default maxHeartbeats. + + +instance : ToJson Substring where + toJson s := toJson s.toString + +instance : ToJson String.Pos where + toJson n := toJson n.1 + +deriving instance ToJson for SourceInfo +deriving instance ToJson for Syntax.Preresolved +deriving instance ToJson for Syntax +deriving instance ToJson for Position + + +namespace LeanDojo + + +/-- +The trace of a tactic. +-/ +structure TacticTrace where + stateBefore: String + stateAfter: String + pos: String.Pos -- Start position of the tactic. + endPos: String.Pos -- End position of the tactic. +deriving ToJson + + +/-- +The trace of a premise. +-/ +structure PremiseTrace where + fullName: String -- Fully-qualified name of the premise. + defPos: Option Position -- Where the premise is defined. + defEndPos: Option Position + modName: String -- In which module the premise is defined. + defPath: String -- The path of the file where the premise is defined. + pos: Option Position -- Where the premise is used. + endPos: Option Position +deriving ToJson + + +/-- +The trace of a Lean file. +-/ +structure Trace where + commandASTs : Array Syntax -- The ASTs of the commands in the file. + tactics: Array TacticTrace -- All tactics in the file. + premises: Array PremiseTrace -- All premises in the file. +deriving ToJson + + +abbrev TraceM := StateT Trace MetaM + + +namespace Pp + + +private def addLine (s : String) : String := + if s.isEmpty then s else s ++ "\n" + + +-- Similar to `Meta.ppGoal` but uses String instead of Format to make sure local declarations are separated by "\n". +private def ppGoal (mvarId : MVarId) : MetaM String := do + match (← getMCtx).findDecl? mvarId with + | none => return "unknown goal" + | some mvarDecl => + let indent := 2 + let lctx := mvarDecl.lctx + let lctx := lctx.sanitizeNames.run' { options := (← getOptions) } + Meta.withLCtx lctx mvarDecl.localInstances do + -- The followint two `let rec`s are being used to control the generated code size. + -- Then should be remove after we rewrite the compiler in Lean + let rec pushPending (ids : List Name) (type? : Option Expr) (s : String) : MetaM String := do + if ids.isEmpty then + return s + else + let s := addLine s + match type? 
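-- For orientation, the `ToJson` instances above serialize a traced file into
-- roughly this shape (field values are illustrative only):
--
--   { "commandASTs": [ ... ],
--     "tactics":  [ { "stateBefore": "⊢ 1 + 1 = 2", "stateAfter": "no goals",
--                     "pos": 104, "endPos": 118 } ],
--     "premises": [ { "fullName": "Nat.succ_le_succ", "modName": "...",
--                     "defPath": "...", "defPos": {...}, "defEndPos": {...},
--                     "pos": {...}, "endPos": {...} } ] }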
with + | none => return s + | some type => + let typeFmt ← Meta.ppExpr type + return (s ++ (Format.joinSep ids.reverse (format " ") ++ " :" ++ Format.nest indent (Format.line ++ typeFmt)).group).pretty + let rec ppVars (varNames : List Name) (prevType? : Option Expr) (s : String) (localDecl : LocalDecl) : MetaM (List Name × Option Expr × String) := do + match localDecl with + | .cdecl _ _ varName type _ _ => + let varName := varName.simpMacroScopes + let type ← instantiateMVars type + if prevType? == none || prevType? == some type then + return (varName :: varNames, some type, s) + else do + let s ← pushPending varNames prevType? s + return ([varName], some type, s) + | .ldecl _ _ varName type val _ _ => do + let varName := varName.simpMacroScopes + let s ← pushPending varNames prevType? s + let s := addLine s + let type ← instantiateMVars type + let typeFmt ← Meta.ppExpr type + let mut fmtElem := format varName ++ " : " ++ typeFmt + let val ← instantiateMVars val + let valFmt ← Meta.ppExpr val + fmtElem := fmtElem ++ " :=" ++ Format.nest indent (Format.line ++ valFmt) + let s := s ++ fmtElem.group.pretty + return ([], none, s) + let (varNames, type?, s) ← lctx.foldlM (init := ([], none, "")) fun (varNames, prevType?, s) (localDecl : LocalDecl) => + if localDecl.isAuxDecl || localDecl.isImplementationDetail then + -- Ignore auxiliary declarations and implementation details. + return (varNames, prevType?, s) + else + ppVars varNames prevType? s localDecl + let s ← pushPending varNames type? s + let goalTypeFmt ← Meta.ppExpr (← instantiateMVars mvarDecl.type) + let goalFmt := Meta.getGoalPrefix mvarDecl ++ Format.nest indent goalTypeFmt + let s := s ++ "\n" ++ goalFmt.pretty + match mvarDecl.userName with + | Name.anonymous => return s + | name => return "case " ++ name.eraseMacroScopes.toString ++ "\n" ++ s + + +def ppGoals (ctx : ContextInfo) (goals : List MVarId) : IO String := + if goals.isEmpty then + return "no goals" + else + let fmt := ctx.runMetaM {} (return Std.Format.prefixJoin "\n\n" (← goals.mapM (ppGoal ·))) + return (← fmt).pretty.trim + + +end Pp + + +namespace Path + +/-- +Return the path of `path` relative to `parent`. +-/ +def relativeTo (path parent : FilePath) : Option FilePath := + let rec componentsRelativeTo (pathComps parentComps : List String) : Option FilePath := + match pathComps, parentComps with + | _, [] => mkFilePath pathComps + | [], _ => none + | (h₁ :: t₁), (h₂ :: t₂) => + if h₁ == h₂ then + componentsRelativeTo t₁ t₂ + else + none + + componentsRelativeTo path.components parent.components + + +/-- +Return if the path `path` is relative to `parent`. +-/ +def isRelativeTo (path parent : FilePath) : Bool := + match relativeTo path parent with + | some _ => true + | none => false + + +/-- +Convert the path `path` to an absolute path. +-/ +def toAbsolute (path : FilePath) : IO FilePath := do + if path.isAbsolute then + pure path + else + let cwd ← IO.currentDir + pure $ cwd / path + + +private def trim (path : FilePath) : FilePath := + assert! 
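-- For example (illustrative paths):
--   relativeTo "Mathlib/Data/Nat/Basic.lean" "Mathlib"        = some "Data/Nat/Basic.lean"
--   relativeTo "Mathlib/Data/Nat/Basic.lean" ".lake/packages" = none
-- `isRelativeTo` just tests whether the result is `some _`, and `toAbsolute`
-- prefixes the current working directory onto relative paths.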
path.isRelative + mkFilePath $ path.components.filter (· != ".") + + +def packagesDir : FilePath := + if Lake.defaultPackagesDir == "packages" then + ".lake" / Lake.defaultPackagesDir + else + Lake.defaultPackagesDir + + +def buildDir : FilePath := + if Lake.defaultPackagesDir.fileName == "packages" then -- Lean >= v4.3.0-rc2 + ".lake/build" + else -- Lean < v4.3.0-rc2 + "build" + + +def libDir : FilePath := buildDir / "lib" + + +/-- +Convert the path of a *.lean file to its corresponding file (e.g., *.olean) in the "build" directory. +-/ +def toBuildDir (subDir : FilePath) (path : FilePath) (ext : String) : Option FilePath := + let path' := (trim path).withExtension ext + match relativeTo path' $ packagesDir / "lean4/src" with + | some p => + match relativeTo p "lean/lake" with + | some p' => packagesDir / "lean4/lib/lean" / p' + | none => packagesDir / "lean4/lib" / p + | none => match relativeTo path' packagesDir with + | some p => + match p.components with + | [] => none + | hd :: tl => packagesDir / hd / buildDir / subDir / (mkFilePath tl) + | none => buildDir / subDir / path' + + +/-- +The reverse of `toBuildDir`. +-/ +-- proofwidgets/build/lib/ProofWidgets/Compat.lean +-- proofwidgets/.lake/build/lib +def toSrcDir! (path : FilePath) (ext : String) : FilePath := + let path' := (trim path).withExtension ext + match relativeTo path' $ packagesDir / "lean4/lib" with + | some p => -- E.g., `.lake/packages/lean4/lib/lean/Init/Prelude.olean` -> `.lake/packages/lean4/src/lean/Init/Prelude.lean` + packagesDir / "lean4/src" / p + | none => + match relativeTo path' packagesDir with + | some p => -- E.g., `.lake/packages/aesop/.lake/build/lib/Aesop.olean`-> `.lake/packages/aesop/Aesop.lean` + let pkgName := p.components.head! + let sep := "build/lib/" + packagesDir / pkgName / (p.toString.splitOn sep |>.tail!.head!) + | none => + -- E.g., `.lake/build/lib/Mathlib/LinearAlgebra/Basic.olean` -> `Mathlib/LinearAlgebra/Basic.lean` + relativeTo path' libDir |>.get! + + +/-- +Create all parent directories of `p` if they don't exist. +-/ +def makeParentDirs (p : FilePath) : IO Unit := do + let some parent := p.parent | throw $ IO.userError s!"Unable to get the parent of {p}" + IO.FS.createDirAll parent + + +/-- +Return the *.lean file corresponding to a module name. +-/ +def findLean (mod : Name) : IO FilePath := do + let modStr := mod.toString + if modStr.startsWith "«lake-packages»." then + return FilePath.mk (modStr.replace "«lake-packages»" "lake-packages" |>.replace "." "/") |>.withExtension "lean" + if modStr.startsWith "«.lake»." then + return FilePath.mk (modStr.replace "«.lake»" ".lake" |>.replace "." "/") |>.withExtension "lean" + let olean ← findOLean mod + -- Remove a "build/lib/" substring from the path. + let lean := olean.toString.replace ".lake/build/lib/" "" + |>.replace "build/lib/" "" |>.replace "lib/lean/Lake/" "lib/lean/lake/Lake/" + let mut path := FilePath.mk lean |>.withExtension "lean" + let leanLib ← getLibDir (← getBuildDir) + if let some p := relativeTo path leanLib then + path := packagesDir / "lean4/src/lean" / p + assert! ← path.pathExists + return path + +end Path + + +namespace Traversal + + +/-- +Extract tactic information from `TacticInfo` in `InfoTree`. +-/ +private def visitTacticInfo (ctx : ContextInfo) (ti : TacticInfo) (parent : InfoTree) : TraceM Unit := do + match ti.stx.getKind with + | ``Lean.Parser.Term.byTactic => + match ti.stx with + | .node _ _ #[.atom _ "by", .node _ ``Lean.Parser.Tactic.tacticSeq _] => pure () + | _ => assert! 
false + + | ``Lean.Parser.Tactic.tacticSeq => + match ti.stx with + | .node _ _ #[.node _ ``Lean.Parser.Tactic.tacticSeq1Indented _] => pure () + | .node _ _ #[.node _ ``Lean.Parser.Tactic.tacticSeqBracketed _] => pure () + | _ => assert! false + + | _ => pure () + + match parent with + | .node (Info.ofTacticInfo i) _ => + match i.stx.getKind with + | ``Lean.Parser.Tactic.tacticSeq1Indented | ``Lean.Parser.Tactic.tacticSeqBracketed | ``Lean.Parser.Tactic.rewriteSeq => + let ctxBefore := { ctx with mctx := ti.mctxBefore } + let ctxAfter := { ctx with mctx := ti.mctxAfter } + let stateBefore ← Pp.ppGoals ctxBefore ti.goalsBefore + let stateAfter ← Pp.ppGoals ctxAfter ti.goalsAfter + if stateBefore == "no goals" || stateBefore == stateAfter then + pure () + else + let some posBefore := ti.stx.getPos? true | pure () + let some posAfter := ti.stx.getTailPos? true | pure () + match ti.stx with + | .node _ _ _ => + modify fun trace => { + trace with tactics := trace.tactics.push { + stateBefore := stateBefore, + stateAfter := stateAfter, + pos := posBefore, + endPos := posAfter, + } + } + | _ => pure () + | _ => pure () + | _ => pure () + + +/-- +Extract premise information from `TermInfo` in `InfoTree`. +-/ +private def visitTermInfo (ti : TermInfo) (env : Environment) : TraceM Unit := do + let some fullName := ti.expr.constName? | return () + let fileMap ← getFileMap + + let posBefore := match ti.toElabInfo.stx.getPos? with + | some posInfo => fileMap.toPosition posInfo + | none => none + + let posAfter := match ti.toElabInfo.stx.getTailPos? with + | some posInfo => fileMap.toPosition posInfo + | none => none + + let decRanges ← withEnv env $ findDeclarationRanges? fullName + let defPos := decRanges >>= fun (decR : DeclarationRanges) => decR.selectionRange.pos + let defEndPos := decRanges >>= fun (decR : DeclarationRanges) => decR.selectionRange.endPos + + let modName := + if let some modIdx := env.const2ModIdx.find? fullName then + env.header.moduleNames[modIdx.toNat]! + else + env.header.mainModule + + let mut defPath := toString $ ← Path.findLean modName + if defPath.startsWith "./" then + defPath := defPath.drop 2 + if defPath.startsWith "/lake/" then + defPath := ".lake/" ++ (defPath.drop 6) + + if defPos != posBefore ∧ defEndPos != posAfter then -- Don't include defintions as premises. + modify fun trace => { + trace with premises := trace.premises.push { + fullName := toString fullName, + defPos := defPos, + defEndPos := defEndPos, + defPath := defPath, + modName := toString modName, + pos := posBefore, + endPos := posAfter, + } + } + + +private def visitInfo (ctx : ContextInfo) (i : Info) (parent : InfoTree) (env : Environment) : TraceM Unit := do + match i with + | .ofTacticInfo ti => visitTacticInfo ctx ti parent + | .ofTermInfo ti => visitTermInfo ti env + | _ => pure () + + +private partial def traverseTree (ctx: ContextInfo) (tree : InfoTree) +(parent : InfoTree) (env : Environment) : TraceM Unit := do + match tree with + | .context ctx' t => + match ctx'.mergeIntoOuter? ctx with + | some ctx' => traverseTree ctx' t tree env + | none => panic! "fail to synthesis contextInfo when traversing infoTree" + | .node i children => + visitInfo ctx i parent env + for x in children do + traverseTree ctx x tree env + | _ => pure () + + +private def traverseTopLevelTree (tree : InfoTree) (env : Environment) : TraceM Unit := do + match tree with + | .context ctx t => + match ctx.mergeIntoOuter? none with + | some ctx => traverseTree ctx t tree env + | none => panic! 
"fail to synthesis contextInfo for top-level infoTree" + | _ => pure () + + +/-- +Process an array of `InfoTree` (one for each top-level command in the file). +-/ +def traverseForest (trees : Array InfoTree) (env : Environment) : TraceM Trace := do + for t in trees do + traverseTopLevelTree t env + get + + +end Traversal + + +open Traversal + + +def getImports (header: Syntax) : IO String := do + -- Similar to `lean --deps` in Lean 3. + let mut s := "" + + for dep in headerToImports header do + let oleanPath ← findOLean dep.module + if oleanPath.isRelative then + let leanPath := Path.toSrcDir! oleanPath "lean" + assert! ← leanPath.pathExists + s := s ++ "\n" ++ leanPath.toString + else if ¬(oleanPath.toString.endsWith "/lib/lean/Init.olean") then + let mut p := (Path.packagesDir / "lean4").toString ++ FilePath.pathSeparator.toString + let mut found := false + for c in (oleanPath.withExtension "lean").components do + if c == "lib" then + found := true + p := p ++ "src" + continue + if found then + p := p ++ FilePath.pathSeparator.toString ++ c + p := p.replace "/lean4/src/lean/Lake" "/lean4/src/lean/lake/Lake" + assert! ← FilePath.mk p |>.pathExists + s := s ++ "\n" ++ p + + return s.trim + + +/-- +Trace a *.lean file. +-/ +unsafe def processFile (path : FilePath) : IO Unit := do + println! path + let input ← IO.FS.readFile path + enableInitializersExecution + let inputCtx := Parser.mkInputContext input path.toString + let (header, parserState, messages) ← Parser.parseHeader inputCtx + let (env, messages) ← processHeader header {} messages inputCtx + + if messages.hasErrors then + for msg in messages.toList do + if msg.severity == .error then + println! "ERROR: {← msg.toString}" + throw $ IO.userError "Errors during import; aborting" + + let env := env.setMainModule (← moduleNameOfFileName path none) + let commandState := { Command.mkState env messages {} with infoState.enabled := true } + let s ← IO.processCommands inputCtx parserState commandState + let env' := s.commandState.env + let commands := s.commands.pop -- Remove EOI command. + let trees := s.commandState.infoState.trees.toArray + + let traceM := (traverseForest trees env').run' ⟨#[header] ++ commands, #[], #[]⟩ + let (trace, _) ← traceM.run'.toIO {fileName := s!"{path}", fileMap := FileMap.ofString input} {env := env} + + let cwd ← IO.currentDir + assert! cwd.fileName != "lean4" + + let some relativePath := Path.relativeTo path cwd | throw $ IO.userError s!"Invalid path: {path}" + let json_path := Path.toBuildDir "ir" relativePath "ast.json" |>.get! + Path.makeParentDirs json_path + IO.FS.writeFile json_path (toJson trace).pretty + + let dep_path := Path.toBuildDir "ir" relativePath "dep_paths" |>.get! + Path.makeParentDirs dep_path + IO.FS.writeFile dep_path (← getImports header) + + +end LeanDojo + + +open LeanDojo + +/-- +Whether a *.lean file should be traced. +-/ +def shouldProcess (path : FilePath) (noDeps : Bool) : IO Bool := do + if (← path.isDir) ∨ path.extension != "lean" then + return false + + let cwd ← IO.currentDir + let some relativePath := Path.relativeTo path cwd | + throw $ IO.userError s!"Invalid path: {path}" + + if noDeps ∧ Path.isRelativeTo relativePath Path.packagesDir then + return false + + let some oleanPath := Path.toBuildDir "lib" relativePath "olean" | + throw $ IO.userError s!"Invalid path: {path}" + return ← oleanPath.pathExists + + +/-- +Trace all *.lean files in the current directory whose corresponding *.olean file exists. 
+-/ +def processAllFiles (noDeps : Bool) : IO Unit := do + let cwd ← IO.currentDir + assert! cwd.fileName != "lean4" + println! "Extracting data at {cwd}" + + let mut tasks := #[] + for path in ← System.FilePath.walkDir cwd do + if ← shouldProcess path noDeps then + println! path + let t ← IO.asTask $ IO.Process.run + {cmd := "lake", args := #["env", "lean", "--run", "ExtractData.lean", path.toString]} + tasks := tasks.push (t, path) + + for (t, path) in tasks do + match ← IO.wait t with + | Except.error _ => + println! s!"WARNING: Failed to process {path}" + pure () + -- throw e + | Except.ok _ => pure () + + +unsafe def main (args : List String) : IO Unit := do + match args with + | ["noDeps"] => processAllFiles (noDeps := true) + | [path] => processFile (← Path.toAbsolute ⟨path⟩) + | [] => processAllFiles (noDeps := false) + | _ => throw $ IO.userError "Invalid arguments" diff --git a/data_extraction/ast.py b/data_extraction/ast.py new file mode 100644 index 0000000..ff716ac --- /dev/null +++ b/data_extraction/ast.py @@ -0,0 +1,1576 @@ +from lxml import etree +from pathlib import Path +from dataclasses import dataclass, field +from xml.sax.saxutils import escape, unescape +from typing import List, Dict, Any, Optional, Callable, Tuple, Generator + +from ..utils import ( + camel_case, + is_optional_type, + remove_optional_type, + parse_int_list, + parse_str_list, +) +from .lean import Pos, LeanFile + + +@dataclass(frozen=True) +class Node: + lean_file: LeanFile + start: Optional[Pos] + end: Optional[Pos] + children: List["Node"] = field(repr=False) + + @classmethod + def from_data(cls, node_data: Dict[str, Any], lean_file: LeanFile) -> "Node": + subcls = cls._kind_to_node_type(node_data["kind"]) + return subcls.from_data(node_data, lean_file) + + @classmethod + def _kind_to_node_type(cls, kind: str) -> type: + prefix = "Lean.Parser." 
+ if kind.startswith(prefix): + kind = kind[len(prefix) :] + cls_name = camel_case(kind.replace(".", "_")) + "Node" + gbs = globals() + if cls_name in gbs: + return gbs[cls_name] # type: ignore + else: + # logger.warning(kind) + return OtherNode + + @classmethod + def kind(cls: type) -> str: + return cls.__name__[:-4].lower() + + def traverse_preorder( + self, + callback: Callable[["Node", List["Node"]], Any], + node_cls: Optional[type], + parents: List["Node"] = [], + ) -> None: + if node_cls is None or isinstance(self, node_cls): + if callback(self, parents): + return + for child in self.children: + child.traverse_preorder(callback, node_cls, parents + [self]) + + def traverse_postorder( + self, + callback: Callable[["Node", List[Any]], Any], + ) -> Any: + return callback( + self, [child.traverse_postorder(callback) for child in self.children] + ) + + def to_xml(self, parent: etree.Element) -> None: + tree = etree.SubElement(parent, self.__class__.__name__) + + for k in self.__dataclass_fields__: + if k in ("lean_file", "children"): + continue + v = getattr(self, k) + if v is not None: + v = escape(str(v), entities={'"': """}) + tree.set(k, v) + + for child in self.children: + child.to_xml(tree) + + @classmethod + def from_xml(cls, tree: etree.Element, lean_file: LeanFile) -> "Node": + subcls = globals()[tree.tag] + start = Pos.from_str(tree.attrib["start"]) if "start" in tree.attrib else None + end = Pos.from_str(tree.attrib["end"]) if "end" in tree.attrib else None + children = [Node.from_xml(subtree, lean_file) for subtree in tree] + kwargs = {} + + for field in subcls.__dataclass_fields__.values(): + if field.name in ("lean_file", "start", "end", "children"): + continue + v = tree.attrib.get(field.name, None) + if v is None: + kwargs[field.name] = None + continue + + assert isinstance(v, str) + v = unescape(v, entities={""": '"'}) + tp = ( + remove_optional_type(field.type) + if is_optional_type(field.type) + else field.type + ) + if tp is Pos: + kwargs[field.name] = Pos.from_str(v) + elif tp is Path: + kwargs[field.name] = Path(v) + elif tp is List[int]: + kwargs[field.name] = parse_int_list(v) + elif tp is List[str]: + kwargs[field.name] = parse_str_list(v) + else: + kwargs[field.name] = v # type: ignore + + return subcls(lean_file, start, end, children, **kwargs) # type: ignore + + def get_closure(self) -> Tuple[Pos, Pos]: + return self.start, self.end + + +def _parse_pos(info: Dict[str, Any], lean_file: LeanFile) -> Pos: + if "synthetic" in info and not info["synthetic"]["canonical"]: + return None + + if ( + "original" in info + ): # | original (leading : Substring) (pos : String.Pos) (trailing : Substring) (endPos : String.Pos) + start, end = info["original"]["pos"], info["original"]["endPos"] + else: + assert ( + "synthetic" in info + ) # | synthetic (pos : String.Pos) (endPos : String.Pos) (canonical := false) + start, end = info["synthetic"]["pos"], info["synthetic"]["endPos"] + + start = lean_file.convert_pos(start) + end = lean_file.convert_pos(end) + + return start, end + + +@dataclass(frozen=True) +class AtomNode(Node): + leading: str + trailing: str + val: str + + @classmethod + def from_data( + cls, atom_data: Dict[str, Any], lean_file: LeanFile + ) -> Optional["AtomNode"]: + info = atom_data["info"] + start, end = _parse_pos(info, lean_file) + + if "original" in info: + leading = info["original"]["leading"] + trailing = info["original"]["trailing"] + else: + assert "synthetic" in info + leading = info["synthetic"]["leading"] + trailing = info["synthetic"]["trailing"] 
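# For example, assuming `camel_case` capitalizes each "_"-separated piece,
# `_kind_to_node_type` above maps syntax kinds to the dataclasses defined below:
#
#   "Lean.Parser.Command.declaration" -> "Command_declaration" -> CommandDeclarationNode
#   "Lean.Parser.Term.byTactic"       -> "Term_byTactic"       -> TermBytacticNode
#   unknown kinds fall back to OtherNode (which stores the raw kind string).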
+ + return cls(lean_file, start, end, [], leading, trailing, atom_data["val"]) + + +@dataclass(frozen=True) +class IdentNode(Node): + leading: str + trailing: str + raw_val: str + val: str + + full_name: Optional[str] = None + mod_name: Optional[str] = None + def_path: Optional[str] = None + def_start: Optional[Pos] = None + def_end: Optional[Pos] = None + + @classmethod + def from_data( + cls, ident_data: Dict[str, Any], lean_file: LeanFile + ) -> Optional["IdentNode"]: + info = ident_data["info"] + start, end = _parse_pos(info, lean_file) + assert ident_data["preresolved"] == [] + + if "original" in info: + leading = info["original"]["leading"] + trailing = info["original"]["trailing"] + else: + assert "synthetic" in info + leading = info["synthetic"]["leading"] + trailing = info["synthetic"]["trailing"] + + return cls( + lean_file, + start, + end, + [], + leading, + trailing, + ident_data["rawVal"], + ident_data["val"], + ) + + @property + def is_mutual(self) -> bool: + return not isinstance(self.full_name, str) + + +def is_leaf(node: Node) -> bool: + return isinstance(node, AtomNode) or isinstance(node, IdentNode) + + +@dataclass(frozen=True) +class FileNode(Node): + @classmethod + def from_data(cls, data: Dict[str, Any], lean_file: LeanFile) -> "FileNode": + children = [] + + def _get_closure(node: Node, child_spans: List[Tuple[Pos, Pos]]): + if len(child_spans) == 0: + return node.start, node.end + + child_starts = [s for s, _ in child_spans if s is not None] + if len(child_starts) == 0: + start = None + else: + start = min(child_starts) + + child_ends = [e for _, e in child_spans if e is not None] + if len(child_ends) == 0: + end = None + else: + end = max(child_ends) + + if node.start is None: + object.__setattr__(node, "start", start) + else: + assert node.start == start + + if node.end is None: + object.__setattr__(node, "end", end) + else: + assert node.end == end + + return start, end + + for i, d in enumerate(data["commandASTs"]): + node_data = d["node"] + if i == 0: + assert node_data["kind"] == "Lean.Parser.Module.header" + node = Node.from_data(node_data, lean_file) + node.traverse_postorder(_get_closure) + children.append(node) + + return cls(lean_file, lean_file.start_pos, lean_file.end_pos, children) + + +def _parse_children(node_data: Dict[str, Any], lean_file: LeanFile) -> List[Node]: + children = [] + + for d in node_data["args"]: + if ( + "node" in d + ): # | node (info : SourceInfo) (kind : SyntaxNodeKind) (args : Array Syntax) : Syntax + node = Node.from_data(d["node"], lean_file) + elif "atom" in d: # | atom (info : SourceInfo) (val : String) : Syntax + node = AtomNode.from_data(d["atom"], lean_file) + elif ( + "ident" in d + ): # | ident (info : SourceInfo) (rawVal : Substring) (val : Name) (preresolved : List Syntax.Preresolved) : Syntax + node = IdentNode.from_data(d["ident"], lean_file) + else: + raise ValueError(d) + + if node is not None: + children.append(node) + + return children + + +@dataclass(frozen=True) +class TermAttrkindNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TermAttrkindNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class TermAttrkindAntiquotNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TermAttrkindAntiquotNode": + assert node_data["info"] == "none" + start, end = None, None + 
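# A small usage sketch of the traversal API above (variable names illustrative):
#
#   idents = []
#   def _collect(node, parents):
#       idents.append(node.val)   # node is an IdentNode here
#       return False              # False = keep descending into children
#   file_node.traverse_preorder(_collect, node_cls=IdentNode)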
children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class IdentAntiquotNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "IdentAntiquotNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + def get_ident(self) -> str: + return "".join(gc.val for gc in self.children if is_leaf(gc)) + + +@dataclass(frozen=True) +class LeanElabCommandCommandIrreducibleDefNode(Node): + name: Optional[str] + full_name: Optional[str] = None + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "LeanElabCommandCommandIrreducibleDefNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + if isinstance(children[0], CommandDeclmodifiersAntiquotNode): + name = None + else: + assert isinstance(children[0], CommandDeclmodifiersNode) + assert ( + isinstance(children[1], AtomNode) + and children[1].val == "irreducible_def" + ) + declid_node = children[2] + assert isinstance(declid_node, CommandDeclidNode) + ident_node = declid_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class GroupNode(Node): + @classmethod + def from_data(cls, node_data: Dict[str, Any], lean_file: LeanFile) -> "GroupNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class MathlibTacticLemmaNode(Node): + name: str + full_name: Optional[str] = None + _is_private_decl: Optional[bool] = ( + False # `_is_private` doesn't play well with lxml. + ) + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "MathlibTacticLemmaNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], CommandDeclmodifiersNode) + assert isinstance(children[1], GroupNode) + assert ( + isinstance(children[1].children[0], AtomNode) + and children[1].children[0].val == "lemma" + ) + declid_node = children[1].children[1] + assert isinstance(declid_node, CommandDeclidNode) + ident_node = declid_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + def is_private(self) -> bool: + return self._is_private_decl + + def get_proof_node(self) -> Node: + decl_val_node = self.children[1].children[3] + if isinstance( + decl_val_node, (CommandDeclvalsimpleNode, CommandWherestructinstNode) + ): + return decl_val_node.children[1] + else: + return decl_val_node + + def has_tactic_proof(self) -> bool: + node = self.get_proof_node() + return isinstance(node, TermBytacticNode) + + @property + def is_mutual(self) -> bool: + return not isinstance(self.name, str) + + +@dataclass(frozen=True) +class LemmaNode(Node): + name: str + full_name: Optional[str] = None + _is_private_decl: Optional[bool] = ( + False # `_is_private` doesn't play well with lxml. 
+ ) + + @classmethod + def from_data(cls, node_data: Dict[str, Any], lean_file: LeanFile) -> "LemmaNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], CommandDeclmodifiersNode) + assert isinstance(children[1], GroupNode) + assert ( + isinstance(children[1].children[0], AtomNode) + and children[1].children[0].val == "lemma" + ) + declid_node = children[1].children[1] + assert isinstance(declid_node, CommandDeclidNode) + ident_node = declid_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + def is_private(self) -> bool: + return self._is_private_decl + + def get_proof_node(self) -> Node: + decl_val_node = self.children[1].children[3] + if isinstance( + decl_val_node, + ( + CommandDeclvalsimpleNode, + CommandWherestructinstNode, + ), + ): + return decl_val_node.children[1] + else: + return decl_val_node + + def has_tactic_proof(self) -> bool: + node = self.get_proof_node() + return isinstance(node, TermBytacticNode) + + @property + def is_mutual(self) -> bool: + return not isinstance(self.name, str) + + +@dataclass(frozen=True) +class CommandDeclarationNode(Node): + name: str + full_name: Optional[str] = None + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDeclarationNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + if isinstance(children[0], CommandDeclmodifiersAntiquotNode): + name = None + else: + assert isinstance(children[0], CommandDeclmodifiersNode) + assert isinstance( + children[1], + ( + CommandDefNode, + CommandDefinitionNode, + CommandTheoremNode, + CommandInductiveNode, + CommandClassinductiveNode, + CommandStructureNode, + CommandInstanceNode, + CommandAbbrevNode, + CommandOpaqueNode, + CommandAxiomNode, + CommandExampleNode, + ), + ) + name = children[1].name + + if children[0].is_private(): + for child in children: + if isinstance(child, CommandTheoremNode): + object.__setattr__(child, "_is_private_decl", True) + + return cls(lean_file, start, end, children, name) + + @property + def is_theorem(self) -> bool: + return isinstance(self.children[1], CommandTheoremNode) + + def get_theorem_node(self) -> "CommandTheoremNode": + assert self.is_theorem + return self.children[1] + + @property + def is_example(self) -> bool: + return isinstance(self.children[1], CommandExampleNode) + + +@dataclass(frozen=True) +class CommandDeclmodifiersAntiquotNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDeclmodifiersAntiquotNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandDeclmodifiersNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDeclmodifiersNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + def is_private(self) -> bool: + result = False + + def _callback(node: CommandPrivateNode, _) -> bool: + nonlocal result + result = True + return True + + self.traverse_preorder(_callback, CommandPrivateNode) + return result + + +@dataclass(frozen=True) +class CommandPrivateNode(Node): 
+ @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandPrivateNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandOpenNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandOpenNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandOpenonlyNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandOpenonlyNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class NullNode(Node): + @classmethod + def from_data(cls, node_data: Dict[str, Any], lean_file: LeanFile) -> "NullNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandStructuretkNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandStructuretkNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert isinstance(children[0], AtomNode) and children[0].val == "structure" + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandClasstkNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandClasstkNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert isinstance(children[0], AtomNode) and children[0].val == "class" + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandStructureNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandStructureNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], (CommandStructuretkNode, CommandClasstkNode)) + if isinstance(children[1], CommandDeclidAntiquotNode): + name = None + else: + assert isinstance(children[1], CommandDeclidNode) + decl_id_node = children[1] + ident_node = decl_id_node.children[0] + + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandInductiveNode(Node): + name: Optional[str] + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandInductiveNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], AtomNode) and children[0].val == "inductive" + + if isinstance(children[1], CommandDeclidAntiquotNode): + name = None + else: + assert isinstance(children[1], CommandDeclidNode) + decl_id_node = children[1] + ident_node = decl_id_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) 
+class CommandClassinductiveNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandClassinductiveNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert ( + isinstance(children[0].children[0], AtomNode) + and children[0].children[0].val == "class" + ) + assert ( + isinstance(children[0].children[1], AtomNode) + and children[0].children[1].val == "inductive" + ) + + if isinstance(children[1], CommandDeclidAntiquotNode): + name = None + else: + assert isinstance(children[1], CommandDeclidNode) + decl_id_node = children[1] + ident_node = decl_id_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class TermHoleNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TermHoleNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert len(children) == 1 and isinstance( + children[0], + ( + AtomNode, + TokenAntiquotNode, + ), + ) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class LeanBinderidentNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "LeanBinderidentNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert len(children) == 1 and isinstance( + children[0], + ( + TermHoleNode, + IdentNode, + IdentAntiquotNode, + ), + ) + return cls(lean_file, start, end, children) + + def get_ident(self) -> Optional[str]: + if isinstance(self.children[0], TermHoleNode): + return None + else: + assert isinstance(self.children[0], IdentNode) + return self.children[0].val + + +@dataclass(frozen=True) +class LeanBinderidentAntiquotNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "LeanBinderidentAntiquotNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + def get_ident(self) -> Optional[str]: + return None + + +@dataclass(frozen=True) +class StdTacticAliasAliasNode(Node): + name: str + full_name: Optional[str] = None + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "StdTacticAliasAliasNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], CommandDeclmodifiersNode) + assert isinstance(children[1], AtomNode) and children[1].val == "alias" + if isinstance(children[2], IdentAntiquotNode): + name = None + else: + ident_node = children[2] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class StdTacticAliasAliaslrNode(Node): + name: List[str] + full_name: Optional[List[str]] = None + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "StdTacticAliasAliaslrNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], CommandDeclmodifiersNode) + assert isinstance(children[1], AtomNode) and children[1].val == "alias" + assert 
isinstance(children[2], AtomNode) and children[2].val == "⟨" + assert isinstance(children[4], AtomNode) and children[4].val == "," + assert isinstance(children[6], AtomNode) and children[6].val == "⟩" + + name = [] + assert isinstance( + children[3], (LeanBinderidentNode, LeanBinderidentAntiquotNode) + ) + name.append(children[3].get_ident()) + assert isinstance( + children[5], (LeanBinderidentNode, LeanBinderidentAntiquotNode) + ) + name.append(children[5].get_ident()) + name = [n for n in name if n is not None] + + return cls(lean_file, start, end, children, name) + + @property + def is_mutual(self) -> bool: + return True + + +@dataclass(frozen=True) +class CommandAbbrevNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandAbbrevNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], AtomNode) and children[0].val == "abbrev" + declid_node = children[1] + if isinstance(declid_node, CommandDeclidAntiquotNode): + name = None + else: + assert isinstance(declid_node, CommandDeclidNode) + ident_node = declid_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandOpaqueNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandOpaqueNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], AtomNode) and children[0].val == "opaque" + declid_node = children[1] + if isinstance(declid_node, CommandDeclidAntiquotNode): + name = None + else: + assert isinstance(declid_node, CommandDeclidNode) + ident_node = declid_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandAxiomNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandAxiomNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], AtomNode) and children[0].val == "axiom" + declid_node = children[1] + if isinstance(declid_node, CommandDeclidAntiquotNode): + name = None + else: + assert isinstance(declid_node, CommandDeclidNode) + ident_node = declid_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandExampleNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandExampleNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert isinstance(children[0], AtomNode) and children[0].val == "example" + name = None + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandInstanceNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandInstanceNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + if isinstance(children[0], TermAttrkindAntiquotNode): + name = None + 
else: + assert isinstance(children[0], TermAttrkindNode) + assert isinstance(children[1], AtomNode) and children[1].val == "instance" + if children[3].children != []: + declid_node = children[3].children[0] + if isinstance(declid_node, CommandDeclidNode): + ident_node = declid_node.children[0] + assert isinstance(ident_node, IdentNode) + name = ident_node.val + else: + name = None + else: + name = None + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandDefNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDefNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + if isinstance(children[0], TokenAntiquotNode) or isinstance( + children[1], CommandDeclidAntiquotNode + ): + name = None + else: + assert isinstance(children[0], AtomNode) and children[0].val == "def" + assert isinstance(children[1], CommandDeclidNode) + decl_id_node = children[1] + ident_node = decl_id_node.children[0] + + if isinstance(ident_node, IdentNode): + name = ident_node.val + else: + assert isinstance(ident_node, IdentAntiquotNode) + name = ident_node.get_ident() + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandDefinitionNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDefinitionNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + if isinstance(children[0], TokenAntiquotNode) or isinstance( + children[1], CommandDeclidAntiquotNode + ): + name = None + else: + assert isinstance(children[0], AtomNode) and children[0].val == "def" + assert isinstance(children[1], CommandDeclidNode) + decl_id_node = children[1] + ident_node = decl_id_node.children[0] + + if isinstance(ident_node, IdentNode): + name = ident_node.val + else: + assert isinstance(ident_node, IdentAntiquotNode) + name = ident_node.get_ident() + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandDeclidAntiquotNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDeclidAntiquotNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandDeclidNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDeclidNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandDeclvalsimpleNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDeclvalsimpleNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class TokenAntiquotNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TokenAntiquotNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class 
CommandDeclvaleqnsNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDeclvaleqnsNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandWherestructinstNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandWherestructinstNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandDeclsigNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDeclsigNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class TermExplicitbinderNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TermExplicitbinderNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class TermTypespecNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TermTypespecNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class CommandTheoremNode(Node): + name: str + full_name: Optional[str] = None + _is_private_decl: Optional[bool] = ( + False # `_is_private` doesn't play well with lxml. 
+ ) + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandTheoremNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], AtomNode) and children[0].val == "theorem" + + declid_node = children[1] + if isinstance(declid_node, CommandDeclidAntiquotNode): + name = None + else: + ident_node = declid_node.children[0] + if isinstance(ident_node, IdentNode): + name = ident_node.val + else: + assert isinstance(ident_node, IdentAntiquotNode) + name = ident_node.get_ident() + + if not isinstance(children[1], CommandDeclidAntiquotNode): + assert isinstance(children[2], CommandDeclsigNode) + decl_val_node = children[3] + assert isinstance( + decl_val_node, + ( + CommandDeclvalsimpleNode, + CommandDeclvaleqnsNode, + CommandWherestructinstNode, + ), + ) + + if isinstance(decl_val_node, CommandDeclvalsimpleNode): + assert ( + isinstance(decl_val_node.children[0], AtomNode) + and decl_val_node.children[0].val == ":=" + ) + elif isinstance(decl_val_node, CommandWherestructinstNode): + assert ( + isinstance(decl_val_node.children[0], AtomNode) + and decl_val_node.children[0].val == "where" + ) + + return cls(lean_file, start, end, children, name) + + def is_private(self) -> bool: + return self._is_private_decl + + def get_proof_node(self) -> Node: + decl_val_node = self.children[3] + if isinstance( + decl_val_node, + ( + CommandDeclvalsimpleNode, + CommandWherestructinstNode, + ), + ): + return decl_val_node.children[1] + else: + return decl_val_node + + def has_tactic_proof(self) -> bool: + node = self.get_proof_node() + return isinstance(node, TermBytacticNode) + + @property + def is_mutual(self) -> bool: + return not isinstance(self.name, str) + + +@dataclass(frozen=True) +class TermBytacticNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TermBytacticNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class TacticTacticseq1IndentedAntiquotNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TacticTacticseq1IndentedAntiquotNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + def get_tactic_nodes( + self, atomic_only: bool = False + ) -> Generator[Node, None, None]: + return + + +@dataclass(frozen=True) +class TacticTacticseqNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TacticTacticseqNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert len(children) == 1 and isinstance( + children[0], + ( + TacticTacticseq1IndentedNode, + TacticTacticseqbracketedNode, + TacticTacticseq1IndentedAntiquotNode, + ), + ) + return cls(lean_file, start, end, children) + + def get_tactic_nodes( + self, atomic_only: bool = False + ) -> Generator[Node, None, None]: + yield from self.children[0].get_tactic_nodes(atomic_only) + + +@dataclass(frozen=True) +class TacticTacticseq1IndentedNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TacticTacticseq1IndentedNode": + assert node_data["info"] == "none" + start, end = None, None + 
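# A sketch tying the declaration/theorem/tactic nodes together (it assumes, as in
# the Lean grammar, that a TermBytacticNode's second child is its tactic sequence):
#
#   if decl.is_theorem:                       # decl: a CommandDeclarationNode
#       thm = decl.get_theorem_node()         # CommandTheoremNode
#       if thm.has_tactic_proof():
#           seq = thm.get_proof_node().children[1]   # TacticTacticseqNode (assumed index)
#           for tac in seq.get_tactic_nodes(atomic_only=True):
#               print(type(tac).__name__, tac.start, tac.end)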
children = _parse_children(node_data, lean_file) + assert len(children) == 1 and isinstance(children[0], NullNode) + return cls(lean_file, start, end, children) + + def get_tactic_nodes( + self, atomic_only: bool = False + ) -> Generator[Node, None, None]: + for i, tac_node in enumerate(self.children[0].children): + if i % 2 == 0: + if not atomic_only or not contains_tactic(tac_node): + yield tac_node + else: + assert isinstance(tac_node, NullNode) or isinstance(tac_node, AtomNode) + + +@dataclass(frozen=True) +class TacticTacticseqbracketedNode(Node): + state_before: Optional[str] = None + state_after: Optional[str] = None + tactic: Optional[str] = None + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "TacticTacticseqbracketedNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert len(children) == 3 + return cls(lean_file, start, end, children) + + @property + def tactic_nodes(self) -> List[Node]: + children = self.children + if not isinstance(children[0], AtomNode) or children[0].val != "{": + return [] + + assert isinstance(children[1], NullNode) + assert isinstance(children[2], AtomNode) and children[2].val == "}" + nodes = [] + for i, tac_node in enumerate(children[1].children): + if i % 2 == 0: + nodes.append(tac_node) + else: + assert isinstance(tac_node, NullNode) or isinstance(tac_node, AtomNode) + return nodes + + def get_tactic_nodes( + self, atomic_only: bool = False + ) -> Generator[Node, None, None]: + children = self.children + if isinstance(children[0], AtomNode) and children[0].val == "{": + assert isinstance(children[1], NullNode) + assert isinstance(children[2], AtomNode) and children[2].val == "}" + for i, tac_node in enumerate(children[1].children): + if i % 2 == 0: + if not atomic_only or not contains_tactic(tac_node): + yield tac_node + else: + assert isinstance(tac_node, NullNode) or isinstance( + tac_node, AtomNode + ) + + +def contains_tactic(node: Node) -> bool: + result = False + + def _callback(x, _) -> bool: + if x is not node and isinstance( + x, + ( + TacticTacticseq1IndentedNode, + TacticTacticseqbracketedNode, + ), + ): + nonlocal result + result = True + return True + + node.traverse_preorder(_callback, node_cls=None) + return result + + +@dataclass(frozen=True) +class ModuleHeaderNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "ModuleHeaderNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class ModulePreludeNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "ModulePreludeNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class ModulePreludeNode(Node): + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "ModulePreludeNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children) + + +@dataclass(frozen=True) +class ModuleImportNode(Node): + module: Optional[str] + path: Optional[Path] = None + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> 
"ModuleImportNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert isinstance(children[0], AtomNode) and children[0].val == "import" + if isinstance(children[2], IdentNode): + module = children[2].val + else: + module = None + + return cls(lean_file, start, end, children, module) + + +@dataclass(frozen=True) +class CommandModuledocNode(Node): + comment: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandModuledocNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert len(children) == 2 and all(isinstance(_, AtomNode) for _ in children) + assert children[0].val == "/-!" + comment = children[1].val + return cls(lean_file, start, end, children, comment) + + +@dataclass(frozen=True) +class CommandDoccommentNode(Node): + comment: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandDoccommentNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + assert len(children) == 2 and all(isinstance(_, AtomNode) for _ in children) + assert children[0].val == "/--" + comment = children[1].val + return cls(lean_file, start, end, children, comment) + + +@dataclass(frozen=True) +class CommandNamespaceNode(Node): + name: str + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandNamespaceNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert len(children) == 2 + assert isinstance(children[0], AtomNode) and children[0].val == "namespace" + if isinstance(children[1], IdentNode): + name = children[1].val + else: + name = None + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandSectionNode(Node): + name: Optional[str] + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandNamespaceNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert len(children) == 2 + assert isinstance(children[0], AtomNode) and children[0].val == "section" + assert isinstance(children[1], NullNode) + + if len(children[1].children) == 1 and isinstance( + children[1].children[0], IdentNode + ): + name = children[1].children[0].val + else: + name = None + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandNoncomputablesectionNode(Node): + name: Optional[str] + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: LeanFile + ) -> "CommandNoncomputablesectionNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert len(children) == 3 + assert isinstance(children[0], AtomNode) and children[0].val == "noncomputable" + assert isinstance(children[1], AtomNode) and children[1].val == "section" + assert isinstance(children[2], NullNode) + + if len(children[2].children) == 1 and isinstance( + children[2].children[0], IdentNode + ): + name = children[2].children[0].val + else: + name = None + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class CommandEndNode(Node): + name: Optional[str] + + @classmethod + def from_data( + cls, node_data: Dict[str, Any], lean_file: 
LeanFile + ) -> "CommandEndNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + + assert len(children) == 2 + assert isinstance(children[0], AtomNode) and children[0].val == "end" + assert isinstance(children[1], NullNode) + + if len(children[1].children) == 1 and isinstance( + children[1].children[0], IdentNode + ): + name = children[1].children[0].val + else: + name = None + + return cls(lean_file, start, end, children, name) + + +@dataclass(frozen=True) +class OtherNode(Node): + kind: str # type: ignore + state_before: Optional[str] = None + state_after: Optional[str] = None + tactic: Optional[str] = None + + @classmethod + def from_data(cls, node_data: Dict[str, Any], lean_file: LeanFile) -> "OtherNode": + assert node_data["info"] == "none" + start, end = None, None + children = _parse_children(node_data, lean_file) + return cls(lean_file, start, end, children, node_data["kind"]) + + +def is_potential_premise_lean4(node: Node) -> bool: + """Check if ``node`` is a theorem/definition that can be used as a premise.""" + if (isinstance(node, CommandDeclarationNode) and not node.is_example) or isinstance( + node, + ( + LemmaNode, + MathlibTacticLemmaNode, + LeanElabCommandCommandIrreducibleDefNode, + StdTacticAliasAliasNode, + StdTacticAliasAliaslrNode, + ), + ): + return node.name is not None + else: + return False + + +def is_mutual_lean4(node: Node) -> bool: + return ( + isinstance(node, (IdentNode, CommandTheoremNode, StdTacticAliasAliaslrNode)) + and node.is_mutual + ) diff --git a/data_extraction/build_lean4_repo.py b/data_extraction/build_lean4_repo.py new file mode 100644 index 0000000..a15dd61 --- /dev/null +++ b/data_extraction/build_lean4_repo.py @@ -0,0 +1,214 @@ +"""Build Lean 4 projects in Docker. + +Only this file runs in Docker. So it must be self-contained. +""" + +import os +import re +import shutil +import argparse +import itertools +import subprocess +from tqdm import tqdm +from loguru import logger +from time import sleep, monotonic +from pathlib import Path, PurePath +from multiprocessing import Process +from contextlib import contextmanager +from typing import Union, List, Optional, Generator + + +def run_cmd(cmd: Union[str, List[str]], capture_output: bool = False) -> Optional[str]: + """Run a shell command. + + Args: + cmd (Union[str, List[str]]): A command or a list of commands. + """ + if isinstance(cmd, list): + cmd = " && ".join(cmd) + res = subprocess.run(cmd, shell=True, capture_output=capture_output, check=True) + if capture_output: + return res.stdout.decode() + else: + return None + + +def record_paths(dir: Path, root: Path, lean_bin: Path) -> None: + """Run ``lean --deps`` for all Lean files in ``dir`` to record its dependencies. + + Args: + dir (Path): The directory containing Lean files. 
+ """ + dir = Path(dir) + + for p in dir.glob("**/*.lean"): + with p.with_suffix(".dep_paths").open("wt") as oup: + for line in run_cmd( + f"{lean_bin} --deps {p}", capture_output=True + ).splitlines(): + olean_path = PurePath(line.strip()) + assert olean_path.suffix == ".olean" + lean_path = olean_path.relative_to(root).with_suffix(".lean") + oup.write(str(lean_path) + "\n") + + +def remove_files(dir: Path, suffix: str) -> None: + """Remove all files in ``dir`` that end with ``suffix``.""" + for p in Path(dir).glob(f"**/*{suffix}"): + p.unlink() + + +_PROGRESSBAR_UPDATE_INTERNAL = 5 + + +def _monitor(paths: List[Path], num_total: int) -> None: + with tqdm(total=num_total) as pbar: + while True: + time_start = monotonic() + try: + num_done = len( + list( + itertools.chain.from_iterable( + p.glob(f"**/*.ast.json") for p in paths + ) + ) + ) + except Exception: + continue + time_elapsed = monotonic() - time_start + if time_elapsed < _PROGRESSBAR_UPDATE_INTERNAL: + sleep(_PROGRESSBAR_UPDATE_INTERNAL - time_elapsed) + pbar.update(num_done - pbar.n) + if num_done >= num_total: + break + print("") + + +@contextmanager +def launch_progressbar(paths: List[Union[str, Path]]) -> Generator[None, None, None]: + """Launch an async progressbar to monitor the progress of tracing the repo.""" + paths = [Path(p) for p in paths] + olean_files = list( + itertools.chain.from_iterable(p.glob("**/*.olean") for p in paths) + ) + num_total = len(olean_files) + p = Process(target=_monitor, args=(paths, num_total), daemon=True) + p.start() + yield + p.kill() + + +def get_lean_version() -> str: + """Get the version of Lean.""" + output = run_cmd("lean --version", capture_output=True).strip() + m = re.match(r"Lean \(version (?P\S+?),", output) + return m["version"] + + +def check_files(packages_path: str, no_deps: bool) -> bool: + """Check if all *.lean files have been processed to produce *.ast.json and *.dep_paths files.""" + cwd = Path.cwd() + packages_path = cwd / packages_path + jsons = { + p.with_suffix("").with_suffix("") + for p in cwd.glob("**/build/ir/**/*.ast.json") + if not no_deps or not p.is_relative_to(packages_path) + } + deps = { + p.with_suffix("") + for p in cwd.glob("**/build/ir/**/*.dep_paths") + if not no_deps or not p.is_relative_to(packages_path) + } + oleans = { + Path(str(p.with_suffix("")).replace("/build/lib/", "/build/ir/")) + for p in cwd.glob("**/build/lib/**/*.olean") + if not no_deps or not p.is_relative_to(packages_path) + } + assert len(jsons) <= len(oleans) and len(deps) <= len(oleans) + missing_jsons = {p.with_suffix(".ast.json") for p in oleans - jsons} + missing_deps = {p.with_suffix(".dep_paths") for p in oleans - deps} + if len(missing_jsons) > 0 or len(missing_deps) > 0: + for p in missing_jsons.union(missing_deps): + logger.warning(f"Missing {p}") + return False + return True + + +def is_new_version(v: str) -> bool: + """Check if ``v`` is at least `4.3.0-rc2`.""" + major, minor, patch = [int(_) for _ in v.split("-")[0].split(".")] + if major < 4 or (major == 4 and minor < 3): + return False + if ( + major > 4 + or (major == 4 and minor > 3) + or (major == 4 and minor == 3 and patch > 0) + ): + return True + assert major == 4 and minor == 3 and patch == 0 + if "4.3.0-rc" in v: + rc = int(v.split("-")[1][2:]) + return rc >= 2 + else: + return True + + +def main() -> None: + import ipdb; ipdb.set_trace() + parser = argparse.ArgumentParser() + parser.add_argument("repo_name") + parser.add_argument("--no-deps", action="store_true") + args = parser.parse_args() + + num_procs = 
int(os.environ["NUM_PROCS"]) + repo_name = args.repo_name + os.chdir(repo_name) + + if is_new_version(get_lean_version()): + packages_path = ".lake/packages" + build_path = ".lake/build" + else: + packages_path = "lake-packages" + build_path = "build" + + # if check_files(packages_path, args.no_deps): + # logger.info(f"The repo {repo_name} has already been traced.") + # return + + # If the lean4 package exists, we assume the build has completed and we just need to trace + if (Path(".lake/packages/lean4") if is_new_version(get_lean_version()) else Path("lake-packages/lean4")).exists(): + logger.info(f"The repo {repo_name} has already been built, but has not been traced.") + else: + # Build the repo using lake. + logger.info(f"Building {repo_name}") + if args.no_deps: + # The additional *.olean files wouldn't matter. + try: + run_cmd("lake exe cache get") + except subprocess.CalledProcessError: + pass + run_cmd("lake build") + + # Copy the Lean 4 stdlib into the path of packages. + lean_prefix = run_cmd(f"lean --print-prefix", capture_output=True).strip() + shutil.copytree(lean_prefix, f"{packages_path}/lean4") + + + # Run ExtractData.lean to extract ASTs, tactic states, and premise information. + dirs_to_monitor = [build_path] + if not args.no_deps: + dirs_to_monitor.append(packages_path) + + logger.info(f"Tracing {repo_name}") + with launch_progressbar(dirs_to_monitor): + cmd = f"lake env lean --threads {num_procs} --run ExtractData.lean" + if args.no_deps: + cmd += " noDeps" + logger.debug(cmd) + run_cmd(cmd, capture_output=True) + + assert check_files(packages_path, args.no_deps), "Some files failed to be processed." + + +if __name__ == "__main__": + main() diff --git a/data_extraction/cache.py b/data_extraction/cache.py new file mode 100644 index 0000000..20bae84 --- /dev/null +++ b/data_extraction/cache.py @@ -0,0 +1,107 @@ +"""Cache manager of traced repos. +""" + +import os +import shutil +import tarfile +from pathlib import Path +from loguru import logger +from filelock import FileLock +from dataclasses import dataclass, field +from typing import Optional, Tuple, Generator + +from ..utils import ( + execute, + url_exists, + get_repo_info, + report_critical_failure, +) +from ..constants import ( + CACHE_DIR, + DISABLE_REMOTE_CACHE, + REMOTE_CACHE_URL, +) + + +def _split_git_url(url: str) -> Tuple[str, str]: + """Split a Git URL into user name and repo name.""" + if url.endswith("/"): + url = url[:-1] + assert not url.endswith("/"), f"Unexpected URL: {url}" + fields = url.split("/") + user_name = fields[-2] + repo_name = fields[-1] + return user_name, repo_name + + +def _format_dirname(url: str, commit: str) -> str: + user_name, repo_name = _split_git_url(url) + return f"{user_name}-{repo_name}-{commit}" + + +_CACHE_CORRPUTION_MSG = "The cache may have been corrupted!" + + +@dataclass(frozen=True, eq=False) +class Cache: + """Cache manager.""" + + cache_dir: Path + lock: FileLock = field(init=False, repr=False) + + def __iter__(self) -> Generator[Path, None, None]: + """Iterate over all traced repos in the cache.""" + yield from self.cache_dir.glob("*") + + def __post_init__(self): + if not os.path.exists(self.cache_dir): + self.cache_dir.mkdir(parents=True) + lock_path = self.cache_dir.with_suffix(".lock") + object.__setattr__(self, "lock", FileLock(lock_path)) + + def get(self, url: str, commit: str) -> Optional[Path]: + """Get the path of a traced repo with URL ``url`` and commit hash ``commit``. 
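The local cache directory is checked first; unless ``DISABLE_REMOTE_CACHE`` is set, the remote cache is tried as a fallback.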
Return None if no such repo can be found."""
+        _, repo_name = _split_git_url(url)
+        dirname = _format_dirname(url, commit)
+        dirpath = self.cache_dir / dirname
+
+        with self.lock:
+            if dirpath.exists():
+                assert (dirpath / repo_name).exists()
+                return dirpath / repo_name
+
+            elif not DISABLE_REMOTE_CACHE:
+                url = os.path.join(REMOTE_CACHE_URL, f"{dirname}.tar.gz")
+                if not url_exists(url):
+                    return None
+                logger.info(
+                    f"Downloading the traced repo from the remote cache. Set the environment variable `DISABLE_REMOTE_CACHE` if you want to trace the repo locally."
+                )
+                execute(f"wget {url} -O {dirpath}.tar.gz")
+
+                with report_critical_failure(_CACHE_CORRPUTION_MSG):
+                    with tarfile.open(f"{dirpath}.tar.gz") as tar:
+                        tar.extractall(self.cache_dir)
+                    os.remove(f"{dirpath}.tar.gz")
+                    assert (dirpath / repo_name).exists()
+
+                return dirpath / repo_name
+
+            else:
+                return None
+
+    def store(self, src: Path) -> Path:
+        """Store a traced repo at path ``src``. Return its path in the cache."""
+        url, commit = get_repo_info(src)
+        dirpath = self.cache_dir / _format_dirname(url, commit)
+        _, repo_name = _split_git_url(url)
+        if not dirpath.exists():
+            with self.lock:
+                with report_critical_failure(_CACHE_CORRPUTION_MSG):
+                    shutil.copytree(src, dirpath / repo_name)
+        return dirpath / repo_name
+
+
+cache = Cache(CACHE_DIR)
+"""A global :class:`Cache` object managing LeanDojo's caching of traced repos (see :ref:`caching`).
+"""
diff --git a/data_extraction/lean.py b/data_extraction/lean.py
new file mode 100644
index 0000000..187288c
--- /dev/null
+++ b/data_extraction/lean.py
@@ -0,0 +1,702 @@
+"""This module defines classes for repos, files, and theorems in Lean.
+Objects of these classes contain only surface information, without extracting any trace.
+"""
+
+import re
+import os
+import json
+import toml
+import time
+import urllib
+import webbrowser
+from pathlib import Path
+from loguru import logger
+from functools import cache
+from github import Github, Auth
+from dataclasses import dataclass, field
+from github.Repository import Repository
+from typing import List, Dict, Any, Generator, Union, Optional, Tuple, Iterator
+
+
+from ..utils import (
+    execute,
+    read_url,
+    url_exists,
+    get_repo_info,
+    working_directory,
+)
+from ..constants import LEAN4_URL
+from .cache import _split_git_url
+
+
+GITHUB_ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN", None)
+"""GitHub personal access token is optional.
+If provided, it can increase the rate limit for GitHub API calls.
+"""
+
+if GITHUB_ACCESS_TOKEN:
+    logger.debug("Using GitHub personal access token for authentication")
+    GITHUB = Github(auth=Auth.Token(GITHUB_ACCESS_TOKEN))
+    GITHUB.get_user().login
+else:
+    logger.debug(
+        "Using GitHub without authentication. Don't be surprised if you hit the API rate limit."
+    )
+    GITHUB = Github()
+
+LEAN4_REPO = GITHUB.get_repo("leanprover/lean4")
+"""The GitHub Repo for Lean 4 itself."""
+
+LEAN4_NIGHTLY_REPO = GITHUB.get_repo("leanprover/lean4-nightly")
+"""The GitHub Repo for Lean 4 nightly releases."""
+
+_URL_REGEX = re.compile(r"(?P<url>.*?)/*")
+
+
+def normalize_url(url: str) -> str:
+    return _URL_REGEX.fullmatch(url)["url"]  # Remove trailing `/`.
+
+
+@cache
+def url_to_repo(url: str, num_retries: int = 2) -> Repository:
+    url = normalize_url(url)
+    backoff = 1
+
+    while True:
+        try:
+            return GITHUB.get_repo("/".join(url.split("/")[-2:]))
+        except Exception as ex:
+            if num_retries <= 0:
+                raise ex
+            num_retries -= 1
+            logger.debug(f'url_to_repo("{url}") failed. 
Retrying...') + time.sleep(backoff) + backoff *= 2 + + +@cache +def get_latest_commit(url: str) -> str: + """Get the hash of the latest commit of the Git repo at ``url``.""" + repo = url_to_repo(url) + return repo.get_branch(repo.default_branch).commit.sha + + +def cleanse_string(s: Union[str, Path]) -> str: + """Replace : and / with _ in a string.""" + return str(s).replace("/", "_").replace(":", "_") + + +@cache +def _to_commit_hash(repo: Repository, label: str) -> str: + """Convert a tag or branch to a commit hash.""" + logger.debug(f"Querying the commit hash for {repo.name} {label}") + + # Poor man's cache + if repo.name == "lean4": + if label == "v4.23.0-rc2": + return "ad1a017949674a947f0d6794cbf7130d642c6530" + elif label == "v4.17.0": + return "306f36116535cd226329f562b4675b8b6dbf948c" + elif label == "v4.8.0-rc2": + return "873ef2d894af80d8fc672e35f7e28bae314a1f6f" + + # if the label is a commit hash, return it directly + if len(label) == 40 and _COMMIT_REGEX.fullmatch(label.strip()): + return label + + for branch in repo.get_branches(): + if branch.name == label: + print(f"Found branch {branch.name} with commit {branch.commit.sha}") + return branch.commit.sha + + for tag in repo.get_tags(): + if tag.name == label: + print(f"Found tag {tag.name} with commit {tag.commit.sha}") + return tag.commit.sha + + raise ValueError(f"Invalid tag or branch: `{label}` for {repo}") + + +@dataclass(eq=True, unsafe_hash=True) +class Pos: + """Position in source files. + + We use 1-index to keep it consistent with code editors such as Visual Studio Code. + """ + + line_nb: int + """Line number + """ + + column_nb: int + """Column number + """ + + @classmethod + def from_str(cls, s: str) -> "Pos": + """Construct a :class:`Pos` object from its string representation, e.g., :code:`"(323, 1109)"`.""" + assert s.startswith("(") and s.endswith( + ")" + ), f"Invalid string representation of a position: {s}" + line, column = s[1:-1].split(",") + line_nb = int(line) + column_nb = int(column) + return cls(line_nb, column_nb) + + def __iter__(self) -> Generator[int, None, None]: + yield self.line_nb + yield self.column_nb + + def __repr__(self) -> str: + return repr(tuple(self)) + + def __lt__(self, other): + return self.line_nb < other.line_nb or ( + self.line_nb == other.line_nb and self.column_nb < other.column_nb + ) + + def __le__(self, other): + return self < other or self == other + + +@dataclass(frozen=True) +class LeanFile: + """A Lean source file (:file:`*.lean`).""" + + root_dir: Path = field(repr=False) + """Root directory of the traced repo this :class:`LeanFile` object belongs to. + + ``root_dir`` must be an absolute path, e.g., :file:`/home/kaiyu/traced_lean-example/lean-example` + """ + + path: Path + """Relative path w.r.t. ``root_dir`` + + E.g., :file:`lean-example/src/example.lean` + """ + + code: List[str] = field(init=False, repr=False) + """Raw source code as a list of lines.""" + + endwith_newline: bool = field(init=False, repr=False) + """Whether the last line ends with a newline.""" + + num_bytes: List[int] = field(init=False, repr=False) + """The number of UTF-8 bytes of each line, including newlines. 
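+
+    Used by :meth:`convert_pos` to map Lean's byte-based :code:`String.Pos` offsets to line/column positions.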
+ """ + + def __post_init__(self) -> None: + assert ( + self.root_dir.is_absolute() + ), f"Root directory must be an absolute path: {self.root_dir}" + assert self.path.suffix == ".lean", f"File extension must be .lean: {self.path}" + assert not self.path.is_absolute(), f"Path must be a relative path: {self.path}" + + code = [] + endwith_newline = None + num_bytes = [] + + for line in self.abs_path.open("rb"): + if b"\r\n" in line: + raise RuntimeError( + f"{self.abs_path} contains Windows-style line endings. This is discouraged (see https://github.com/leanprover-community/mathlib4/pull/6506)." + ) + if line.endswith(b"\n"): + endwith_newline = True + line = line[:-1] + else: + endwith_newline = False + code.append(line.decode("utf-8")) + num_bytes.append(len(line) + 1) + + object.__setattr__(self, "code", code) + object.__setattr__(self, "endwith_newline", endwith_newline) + object.__setattr__(self, "num_bytes", num_bytes) + + @property + def abs_path(self) -> Path: + """Absolute path of a :class:`LeanFile` object. + + E.g., :file:`/home/kaiyu/traced_lean-example/lean-example/src/example.lean` + """ + return self.root_dir / self.path + + @property + def num_lines(self) -> int: + """Number of lines in a source file.""" + return len(self.code) + + def num_columns(self, line_nb: int) -> int: + """Number of columns in a source file.""" + return len(self.get_line(line_nb)) + + @property + def start_pos(self) -> Pos: + """Return the start position of a source file. + + Returns: + Pos: A :class:`Pos` object representing the start of this file. + """ + return Pos(1, 1) + + @property + def end_pos(self) -> Pos: + """Return the end position of a source file. + + Args: + zero_indexed (bool, optional): Whether to use 0-index instead of 1-index. Defaults to False. + + Returns: + Pos: A :class:`Pos` object representing the end of this file. + """ + # Line and column numbers are 1-indexed by default. + line_nb = self.num_lines + column_nb = 1 + len(self.code[-1]) + return Pos(line_nb, column_nb) + + def convert_pos(self, byte_idx: int) -> Pos: + """Convert a byte index (:code:`String.Pos` in Lean 4) to a :class:`Pos` object.""" + n = 0 + for i, num_bytes in enumerate(self.num_bytes, start=1): + n += num_bytes + if n > byte_idx: + line_byte_idx = byte_idx - (n - num_bytes) + if line_byte_idx == 0: + return Pos(i, 1) + + line = self.get_line(i) + m = 0 + + for j, c in enumerate(line, start=1): + m += len(c.encode("utf-8")) + if m >= line_byte_idx: + return Pos(i, j + 1) + + raise ValueError(f"Invalid byte index {byte_idx} in {self.path}.") + + def offset(self, pos: Pos, delta: int) -> Pos: + """Off set a position by a given number.""" + line_nb, column_nb = pos + num_columns = len(self.get_line(line_nb)) - column_nb + 1 + if delta <= num_columns: + return Pos(line_nb, column_nb + delta) + delta_left = delta - num_columns - 1 + + for i in range(line_nb, self.num_lines): + line = self.code[i] + l = len(line) + if delta_left <= l: + return Pos(i + 1, delta_left + 1) + delta_left -= l + 1 + + if delta_left == 0 and self.endwith_newline: + return Pos(self.num_lines + 1, 1) + + raise ValueError(f"Invalid offset {delta} in {self.path}: {pos}.") + + def get_line(self, line_nb: int) -> str: + """Return a given line of the source file. + + Args: + line_nb (int): Line number (1-indexed). + """ + return self.code[line_nb - 1] + + def __getitem__(self, key) -> str: + """Return a code segment given its start/end positions. + + This enables ``lean_file[start:end]``. 
+
+        Args:
+            key (slice): A slice of two :class:`Pos` objects for the start/end of the code segment.
+        """
+        assert isinstance(key, slice) and key.step is None
+        if key.start is None:
+            start_line = start_column = 1
+        else:
+            start_line, start_column = key.start
+        if key.stop is None:
+            end_line = self.num_lines
+            end_column = 1 + len(self.get_line(end_line))
+        else:
+            end_line, end_column = key.stop
+        if start_line == end_line:
+            assert start_column <= end_column
+            return self.get_line(start_line)[start_column - 1 : end_column - 1]
+        else:
+            assert start_line < end_line
+            code_slice = [self.code[start_line - 1][start_column - 1 :]]
+            for line_nb in range(start_line + 1, end_line):
+                code_slice.append(self.get_line(line_nb))
+            code_slice.append(self.get_line(end_line)[: end_column - 1])
+            return "\n".join(code_slice)
+
+
+_COMMIT_REGEX = re.compile(r"[0-9a-z]+")
+_LEAN4_VERSION_REGEX = re.compile(r"leanprover/lean4:(?P<version>.+?)")
+
+
+def get_lean4_version_from_config(toolchain: str) -> str:
+    """Return the required Lean version given a ``lean-toolchain`` config."""
+    m = _LEAN4_VERSION_REGEX.fullmatch(toolchain.strip())
+    assert m is not None, "Invalid config."
+    return m["version"]
+
+
+def get_lean4_commit_from_config(config_dict: Dict[str, Any]) -> str:
+    """Return the required Lean commit given a ``lean-toolchain`` config."""
+    assert "content" in config_dict, "config_dict must have a 'content' field"
+    config = config_dict["content"].strip()
+    prefix = "leanprover/lean4:"
+
+    if config == f"{prefix}nightly":
+        latest_tag = LEAN4_NIGHTLY_REPO.get_tags()[0]
+        return latest_tag.commit.sha
+
+    assert config.startswith(prefix), f"Invalid Lean 4 version: {config}"
+    version = config[len(prefix) :]
+
+    if version.startswith("nightly"):
+        return _to_commit_hash(LEAN4_NIGHTLY_REPO, version)
+    else:
+        return _to_commit_hash(LEAN4_REPO, version)
+
+
+URL = TAG = COMMIT = str
+
+
+@dataclass(frozen=True)
+class RepoInfoCache:
+    """To minimize the number of network requests, we cache and re-use the info
+    of all repos, assuming it does not change during the execution of LeanDojo."""
+
+    tag2commit: Dict[Tuple[URL, TAG], COMMIT] = field(default_factory=dict)
+    lean_version: Dict[Tuple[URL, COMMIT], str] = field(default_factory=dict)
+
+
+info_cache = RepoInfoCache()
+
+
+_LAKEFILE_LEAN_GIT_REQUIREMENT_REGEX = re.compile(
+    r"require\s+(?P<name>\S+)\s+from\s+git\s+\"(?P<url>.+?)\"(\s+@\s+\"(?P<rev>\S+)\")?"
+)
+
+_LAKEFILE_LEAN_LOCAL_REQUIREMENT_REGEX = re.compile(r"require \S+ from \"")
+
+_LAKEFILE_TOML_REQUIREMENT_REGEX = re.compile(r"(?<=\[\[require\]\]).+(?=\n\n)")
+
+
+def is_supported_version(v) -> bool:
+    """Check if ``v`` is at least `v4.3.0-rc2`."""
+    if not v.startswith("v"):
+        return False
+    v = v[1:]
+    major, minor, patch = [int(_) for _ in v.split("-")[0].split(".")]
+    if major < 4 or (major == 4 and minor < 3):
+        return False
+    if (
+        major > 4
+        or (major == 4 and minor > 3)
+        or (major == 4 and minor == 3 and patch > 0)
+    ):
+        return True
+    assert major == 4 and minor == 3 and patch == 0
+    if "4.3.0-rc" in v:
+        rc = int(v.split("-")[1][2:])
+        return rc >= 2
+    else:
+        return True
+
+
+@dataclass(frozen=True)
+class LeanGitRepo:
+    """Git repo of a Lean project."""
+
+    url: str
+    """The repo's Github URL.
+
+    Note that we only support Github as of now.
+    """
+
+    commit: str
+    """The repo's commit hash.
+
+    You can also use tags such as ``v3.5.0``. They will be converted to commit hashes.
+    """
+
+    repo: Repository = field(init=False, repr=False)
+    """A :class:`github.Repository` object.
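+
+    Resolved from ``url`` via the GitHub API in ``__post_init__``.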
+ """ + + lean_version: str = field(init=False, repr=False) + """Required Lean version. + """ + + def __post_init__(self) -> None: + if "github.com" not in self.url: + raise ValueError(f"{self.url} is not a Github URL") + if not self.url.startswith("https://"): + raise ValueError(f"{self.url} is not a valid URL") + object.__setattr__(self, "url", normalize_url(self.url)) + object.__setattr__(self, "repo", url_to_repo(self.url)) + + # Convert tags or branches to commit hashes + if not (len(self.commit) == 40 and _COMMIT_REGEX.fullmatch(self.commit)): + if (self.url, self.commit) in info_cache.tag2commit: + commit = info_cache.tag2commit[(self.url, self.commit)] + else: + commit = _to_commit_hash(self.repo, self.commit) + assert _COMMIT_REGEX.fullmatch(commit), f"Invalid commit hash: {commit}" + info_cache.tag2commit[(self.url, self.commit)] = commit + object.__setattr__(self, "commit", commit) + + # Determine the required Lean version. + if (self.url, self.commit) in info_cache.lean_version: + lean_version = info_cache.lean_version[(self.url, self.commit)] + elif self.is_lean4: + lean_version = self.commit + else: + config = self.get_config("lean-toolchain") + lean_version = get_lean4_commit_from_config(config) + v = get_lean4_version_from_config(config["content"]) + if not is_supported_version(v): + logger.warning( + f"{self} relies on an unsupported Lean version: {lean_version}" + ) + info_cache.lean_version[(self.url, self.commit)] = lean_version + object.__setattr__(self, "lean_version", lean_version) + + @classmethod + def from_path(cls, path: Path) -> "LeanGitRepo": + """Construct a :class:`LeanGitRepo` object from the path to a local Git repo.""" + url, commit = get_repo_info(path) + return cls(url, commit) + + @property + def name(self) -> str: + return self.repo.name + + @property + def is_lean4(self) -> bool: + return self.url == LEAN4_URL + + @property + def commit_url(self) -> str: + return os.path.join(self.url, f"tree/{self.commit}") + + def show(self) -> None: + """Show the repo in the default browser.""" + webbrowser.open(self.commit_url) + + def exists(self) -> bool: + return url_exists(self.commit_url) + + def clone_and_checkout(self) -> None: + """Clone the repo to the current working directory and checkout a specific commit.""" + # Check if the repo already exists. + # If it exists, we assume it has been checked out to the correct commit. + + user_name, repo_name = _split_git_url(self.url) + local_repo_path = Path(os.environ["REPO_DIR"]) / user_name / repo_name + if os.path.exists(local_repo_path): + logger.info(f"{self} already exists locally.") + else: + logger.debug(f"Cloning {self}") + execute(f"git clone -n --recursive {self.url}", capture_output=True) + + + with working_directory(local_repo_path): + execute( + f"git checkout {self.commit} && git submodule update --recursive", + capture_output=True, + ) + + def get_dependencies( + self, path: Union[str, Path, None] = None + ) -> Dict[str, "LeanGitRepo"]: + """Return the dependencies required by the target repo. + + Args: + path (Union[str, Path, None], optional): Root directory of the repo if it is on the disk. + + Returns: + Dict[str, :class:`LeanGitRepo`]: A dictionary mapping the name of each + dependency to its :class:`LeanGitRepo` object. 
+ """ + logger.debug(f"Querying the dependencies of {self}") + + toolchain = ( + self.get_config("lean-toolchain") + if path is None + else {"content": (Path(path) / "lean-toolchain").open().read()} + ) + commit = get_lean4_commit_from_config(toolchain) + deps = {"lean4": LeanGitRepo(LEAN4_URL, commit)} + + try: + lake_manifest = ( + self.get_config("lake-manifest.json", num_retries=0) + if path is None + else json.load((Path(path) / "lake-manifest.json").open()) + ) + for pkg in lake_manifest["packages"]: + deps[pkg["name"]] = LeanGitRepo(pkg["url"], pkg["rev"]) + except Exception: + for name, repo in self._parse_lakefile_dependencies(path): + if name not in deps: + deps[name] = repo + for dd_name, dd_repo in repo.get_dependencies().items(): + deps[dd_name] = dd_repo + + return deps + + def _parse_lakefile_dependencies( + self, path: Union[str, Path, None] + ) -> List[Tuple[str, "LeanGitRepo"]]: + if self.uses_lakefile_lean(): + return self._parse_lakefile_lean_dependencies(path) + else: + return self._parse_lakefile_toml_dependencies(path) + + def _parse_lakefile_lean_dependencies( + self, path: Union[str, Path, None] + ) -> List[Tuple[str, "LeanGitRepo"]]: + lakefile = ( + self.get_config("lakefile.lean")["content"] + if path is None + else (Path(path) / "lakefile.lean").open().read() + ) + + if _LAKEFILE_LEAN_LOCAL_REQUIREMENT_REGEX.search(lakefile): + raise ValueError("Local dependencies are not supported.") + + return self._parse_deps(_LAKEFILE_LEAN_GIT_REQUIREMENT_REGEX.finditer(lakefile)) + + def _parse_deps( + self, matches: Union[Iterator[re.Match[str]], Dict[str, str]] + ) -> List[Tuple[str, "LeanGitRepo"]]: + deps = [] + + for m in matches: + url = m["url"] + if url.endswith(".git"): + url = url[:-4] + if url.startswith("git@"): + url = "https://" + url[4:].replace(":", "/") + + rev = m["rev"] + if rev is None: + commit = get_latest_commit(url) + elif len(rev) == 40 and _COMMIT_REGEX.fullmatch(rev): + commit = rev + else: + try: + commit = _to_commit_hash(url_to_repo(url), rev) + except ValueError: + commit = get_latest_commit(url) + assert _COMMIT_REGEX.fullmatch(commit) + + deps.append((m["name"], LeanGitRepo(url, commit))) + + return deps + + def _parse_lakefile_toml_dependencies( + self, path: Union[str, Path, None] + ) -> List[Tuple[str, "LeanGitRepo"]]: + lakefile = ( + self.get_config("lakefile.toml")["content"] + if path is None + else (Path(path) / "lakefile.toml").open().read() + ) + matches = dict() + + for requirement in _LAKEFILE_TOML_REQUIREMENT_REGEX.finditer(lakefile): + for line in requirement.strip().splitlines(): + key, value = line.split("=") + key = key.strip() + value = value.strip() + if key == "path": + raise ValueError("Local dependencies are not supported.") + if key == "git": + matches["url"] = value + if key == "rev": + matches["rev"] = value + if key == "name": + matches["name"] = value + + return self._parse_deps(lakefile, matches) + + def get_license(self) -> Optional[str]: + """Return the content of the ``LICENSE`` file.""" + assert "github.com" in self.url, f"Unsupported URL: {self.url}" + url = self.url.replace("github.com", "raw.githubusercontent.com") + license_url = f"{url}/{self.commit}/LICENSE" + try: + return read_url(license_url) + except urllib.error.HTTPError: + return None + + def _get_config_url(self, filename: str) -> str: + assert "github.com" in self.url, f"Unsupported URL: {self.url}" + url = self.url.replace("github.com", "raw.githubusercontent.com") + return f"{url}/{self.commit}/{filename}" + + def get_config(self, filename: 
str, num_retries: int = 2) -> Dict[str, Any]: + """Return the repo's files.""" + config_url = self._get_config_url(filename) + content = read_url(config_url, num_retries) + if filename.endswith(".toml"): + return toml.loads(content) + elif filename.endswith(".json"): + return json.loads(content) + else: + return {"content": content} + + def uses_lakefile_lean(self) -> bool: + """Check if the repo uses a ``lakefile.lean``.""" + url = self._get_config_url("lakefile.lean") + return url_exists(url) + + def uses_lakefile_toml(self) -> bool: + """Check if the repo uses a ``lakefile.toml``.""" + url = self._get_config_url("lakefile.toml") + return url_exists(url) + + +@dataclass(frozen=True) +class Theorem: + """Theorem in Lean. + + Theorems are named constants of type :code:`Prop`. They are typically defined + using the keywords :code:`theorem` or :code:`lemma`, but it's possible to use other + keywords such as :code:`def` or :code:`instance` + """ + + repo: LeanGitRepo + """Lean repo the theorem comes from. + """ + + file_path: Path + """Lean source file the theorem comes from. + """ + + full_name: str + """Fully qualified name of the theorem. + """ + + def __post_init__(self) -> None: + if isinstance(self.file_path, str): + object.__setattr__(self, "file_path", Path(self.file_path)) + assert ( + self.file_path.suffix == ".lean" + ), f"File extension must be .lean: {self.file_path}" + + @property + def uid(self) -> str: + """Unique identifier of the theorem.""" + return f"{cleanse_string(self.repo.url)}@{cleanse_string(self.repo.commit)}:{cleanse_string(self.file_path.__str__())}:{cleanse_string(self.full_name)}" + + @property + def uhash(self) -> str: + """Unique hash of the theorem.""" + return str(hash(self.uid) ** 2) diff --git a/data_extraction/trace.py b/data_extraction/trace.py new file mode 100644 index 0000000..cd1ddc2 --- /dev/null +++ b/data_extraction/trace.py @@ -0,0 +1,135 @@ +"""This module provides the main interfaces for tracing Lean repos, i.e., extracting data from them. +To estimate the time for tracing a repo, a good rule of thumb is 1.5x the time for compiling the repo using :code:`leanpkg build`. +A repo has to be traced only once, and the traced repo will be stored in a cache for fast access in the future. +""" + +import os +import shutil +from pathlib import Path +from loguru import logger +from typing import Union, Optional +from subprocess import CalledProcessError + +from .cache import cache, _split_git_url +from .lean import LeanGitRepo +from ..constants import NUM_PROCS +from .traced_data import TracedRepo +from ..utils import working_directory +from ..container import create_mounts, get_container, NativeContainer + + +LEAN4_BUILD_SCRIPT_PATH = Path(__file__).with_name("build_lean4_repo.py") +LEAN4_DATA_EXTRACTOR_PATH = Path(__file__).with_name("ExtractData.lean") + +def _trace(repo: LeanGitRepo, build_deps: bool) -> None: + assert ( + repo.exists() + ), f"The {repo} does not exist. Please check the URL `{repo.commit_url}`." + + # Trace `repo` in the current working directory. + assert not repo.is_lean4, "Cannot trace Lean 4 itself." 
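+
+    # Clone the repo under $RAID_DIR/repos/<user>/<repo> if needed, then mount it,
+    # build_lean4_repo.py, and ExtractData.lean into the container's /workspace.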
+ + user_name, repo_name = _split_git_url(repo.url) + local_repo_path = Path(os.environ.get("RAID_DIR")) / "repos" / user_name / repo_name + if not local_repo_path.exists(): + repo.clone_and_checkout() + + logger.debug(f"Tracing {repo}") + container = get_container() + mts = { + Path(os.environ.get("RAID_DIR")) / "repos" / user_name / repo_name: f"/workspace/{user_name}/{repo_name}", + LEAN4_BUILD_SCRIPT_PATH: f"/workspace/{LEAN4_BUILD_SCRIPT_PATH.name}", + LEAN4_DATA_EXTRACTOR_PATH: f"/workspace/{LEAN4_DATA_EXTRACTOR_PATH.name}", + } + + + cmd = f"python build_lean4_repo.py {user_name}/{repo_name}" + if not build_deps: + cmd += " --no-deps" + + try: + import ipdb; ipdb.set_trace() + container.run( + cmd, + create_mounts(mts), + {"NUM_PROCS": NUM_PROCS}, + as_current_user=True, + work_dir="/workspace", + ) + except CalledProcessError as ex: + if repo.is_lean4 and isinstance(container, NativeContainer): + logger.error( + "Failed to build Lean 4 without Docker. See https://leandojo.readthedocs.io/en/latest/user-guide.html#advanced-running-within-docker." + ) + raise ex + + +def is_available_in_cache(repo: LeanGitRepo) -> bool: + """Check if ``repo`` has a traced repo available in the cache (including the remote cache).""" + return cache.get(repo.url, repo.commit) is not None + + +def get_traced_repo_path(repo: LeanGitRepo, build_deps: bool = True) -> Path: + """Return the path of a traced repo in the cache. + + The function will trace a repo if it is not available in the cache. See :ref:`caching` for details. + + Args: + repo (LeanGitRepo): The Lean repo to trace. + build_deps (bool): Whether to build the dependencies of ``repo``. Defaults to True. + + Returns: + Path: The path of the traced repo in the cache, e.g. :file:`/home/kaiyu/.cache/lean_dojo/leanprover-community-mathlib-2196ab363eb097c008d4497125e0dde23fb36db2` + """ + path = cache.get(repo.url, repo.commit) + + if path is None: + logger.info(f"Tracing {repo}") + user_name, repo_name = _split_git_url(repo.url) + with Path(os.environ.get("RAID_DIR")) / "repos" / user_name / repo_name as tmp_dir: + print(tmp_dir) + logger.debug(f"Working in the temporary directory {tmp_dir}") + print("About to trace") + _trace(repo, build_deps) + traced_repo = TracedRepo.from_traced_files(tmp_dir, build_deps) + traced_repo.save_to_disk() + path = cache.store(tmp_dir) + else: + logger.debug("The traced repo is available in the cache.") + return path + + +def trace( + repo: LeanGitRepo, + dst_dir: Optional[Union[str, Path]] = None, + build_deps: bool = True, +) -> TracedRepo: + """Trace a repo (and its dependencies), saving the results to ``dst_dir``. + + The function only traces the repo when it's not available in the cache. Otherwise, + it directly copies the traced repo from the cache to ``dst_dir``. See :ref:`caching` for details. + + Args: + repo (LeanGitRepo): The Lean repo to trace. + dst_dir (Union[str, Path]): The directory for saving the traced repo. If None, the traced repo is only saved in the cahe. + build_deps (bool): Whether to build the dependencies of ``repo``. Defaults to True. + + Returns: + TracedRepo: A :class:`TracedRepo` object corresponding to the files at ``dst_dir``. + """ + if dst_dir is not None: + dst_dir = Path(dst_dir) + assert ( + not dst_dir.exists() + ), f"The destination directory {dst_dir} already exists." 
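+
+    # Trace the repo (or fetch it from the cache) first; it is copied into dst_dir
+    # below only after the traced repo passes check_sanity().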
+ + cached_path = get_traced_repo_path(repo, build_deps) + logger.info(f"Loading the traced repo from {cached_path}") + traced_repo = TracedRepo.load_from_disk(cached_path, build_deps) + traced_repo.check_sanity() + + if dst_dir is not None: + dst_dir.mkdir(parents=True) + shutil.copytree(cached_path, dst_dir / cached_path.name) + + return traced_repo diff --git a/data_extraction/traced_data.py b/data_extraction/traced_data.py new file mode 100644 index 0000000..82317b2 --- /dev/null +++ b/data_extraction/traced_data.py @@ -0,0 +1,1224 @@ +"""This module defines traced repos/files/theorems. +""" + +import re +import os +import ray +import json +import random +import itertools +import webbrowser +import networkx as nx +from tqdm import tqdm +from lxml import etree +from pathlib import Path +from loguru import logger +from dataclasses import dataclass, field +from typing import List, Optional, Dict, Any, Tuple, Union + +from ..utils import ( + is_git_repo, + compute_md5, + ray_actor_pool, + to_lean_path, + to_dep_path, + to_json_path, + to_xml_path, +) +from .ast import * +from .lean import LeanFile, LeanGitRepo, Theorem, Pos +from ..constants import NUM_WORKERS, LOAD_USED_PACKAGES_ONLY, LEAN4_PACKAGES_DIR + + +@dataclass(frozen=True) +class Comment: + """A comment in a Lean file.""" + + start: Pos + end: Pos + text: str + + def __post_init__(self) -> None: + assert isinstance(self.start, Pos) + assert isinstance(self.end, Pos) + assert self.start <= self.end + assert isinstance(self.text, str) + + def to_xml(self, parent: etree.Element) -> None: + tree = etree.SubElement(parent, self.__class__.__name__) + tree.set("start", str(self.start)) + tree.set("end", str(self.end)) + tree.set("text", self.text) + + @classmethod + def from_xml(cls, tree: etree.Element) -> "Comment": + start = Pos.from_str(tree.attrib["start"]) + end = Pos.from_str(tree.attrib["end"]) + text = tree.attrib["text"] + return cls(start, end, text) + + +def _collect_lean4_comments(ast: FileNode) -> List[Comment]: + comments = [] + + def _callback(node, _): + nonlocal comments + if isinstance(node, CommandModuledocNode) or isinstance( + node, CommandDoccommentNode + ): + comments.append(Comment(node.start, node.end, node.comment)) + elif is_leaf(node) and node.trailing.strip().startswith("--"): + num_spaces = node.trailing.index("--") + text = node.trailing[num_spaces:] + start = node.lean_file.offset(node.end, num_spaces) + end = node.lean_file.offset(start, len(text)) + comments.append(Comment(start, end, text)) + + ast.traverse_preorder(_callback, node_cls=None) + return comments + + +_SINGLE_LINE_COMMENT_REGEX = r"--.*?(\n|$)" +_MULTI_LINE_COMMENT_REGEX = r"/-.*?(-/|$)" +_COMMENT_REGEX = re.compile( + f"{_SINGLE_LINE_COMMENT_REGEX}|{_MULTI_LINE_COMMENT_REGEX}", re.DOTALL +) + + +def get_code_without_comments( + lean_file: LeanFile, start: Pos, end: Pos, comments: List[Comment] +) -> str: + """Return the code in ``lean_file`` from ``start`` to ``end`` with comments removed. + + Args: + lean_file (LeanFile): The lean source file. + start (Pos): The start position. + end (Pos): The end position. + comments (List[Comment]): A list of :class:`Comment` objects. + + Returns: + str: Human-written code with comments removed. 
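+
+    Comments listed in ``comments`` are spliced out first; any remaining ``--`` or
+    ``/- ... -/`` comments are removed by a regex as a fallback.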
+ """ + base = start + code_segs = [] + + for c in comments: + if base <= c.start and c.end <= end: + code_segs.append(lean_file[base : c.start]) + base = c.end + + code_segs.append(lean_file[base:end]) + code = "".join(code_segs) + + code = _COMMENT_REGEX.sub("", code) + assert "--" not in code and "/-" not in code + + return code.strip() + + +@dataclass(frozen=True) +class TracedTactic: + """A traced tactic is a tactic annotated with additional information including + its AST and the states before/after the tactic. + """ + + ast: Node = field(repr=False) + """AST of the tactic. + """ + + traced_theorem: Optional["TracedTheorem"] = field( + default=None, repr=False, compare=False + ) + """The traced theorem this tactic belongs to. + """ + + def __getstate__(self) -> Dict[str, Any]: + d = {k: v for k, v in self.__dict__.items() if k != "traced_theorem"} + d["traced_theorem"] = None # Avoid serializing the traced theorem. + return d + + @property + def tactic(self) -> str: + """The raw tactic string.""" + return self.ast.tactic + + @property + def state_before(self) -> str: + """Pretty-printed state before applying the tactic.""" + assert self.ast.state_before is not None + return self.ast.state_before + + @property + def state_after(self) -> str: + """Pretty-printed state after applying the tactic.""" + assert self.ast.state_after is not None + return self.ast.state_after + + @property + def start(self) -> Pos: + """Start position in :file:`*.lean` file.""" + return self.ast.start + + @property + def end(self) -> Pos: + """End position in :file:`*.lean` file.""" + return self.ast.end + + def to_string(self) -> str: + return f"{self.__class__.__name__}(tactic={self.tactic}, state_before={self.state_before}, state_after={self.state_after})" + + def __str__(self) -> str: + return self.to_string() + + def __repr__(self) -> str: + return self.to_string() + + def get_annotated_tactic(self) -> Tuple[str, List[Dict[str, Any]]]: + """Return the tactic annotated with premise information. + + Premises in the tactic are marked by `` ... ``. For example, + :code:`rw [add_comm b]` contains a premise :code:`add_comm` and therefore + becomes :code:`rw [add_comm b]`. In addition, the function returns + the provenance (full name, file path, line/column numbers) of all premises. + + Returns: + Tuple[str, List[Dict[str, Any]]]: The first return value is the tactic string marked by `` ... ``. The second return value is a list of provenances. + """ + assert self.traced_theorem != None + lean_file = self.traced_theorem.traced_file.lean_file + annot_tac = [] + provenances = [] + cur = self.start + + def _callback4(node: IdentNode, _): + nonlocal cur + + if ( + node.full_name is not None + and node.mod_name is not None + and node.def_start is not None + and node.def_end is not None + ): + if cur <= node.start: + annot_tac.append(lean_file[cur : node.start]) + annot_tac.append("" + lean_file[node.start : node.end] + "") + prov = {"full_name": node.full_name} + prov["def_path"] = node.def_path + prov["def_pos"] = list(node.def_start) + prov["def_end_pos"] = list(node.def_end) + provenances.append(prov) + cur = node.end + + self.ast.traverse_preorder(_callback4, IdentNode) + annot_tac.append(lean_file[cur : self.end]) + + return "".join(annot_tac), provenances + + +@dataclass(frozen=True) +class TracedTheorem: + """A traced theorem is a theorem with additional information such as the AST.""" + + root_dir: Path = field(repr=False) + """Root directory of the corresponding traced repo. 
+ """ + + theorem: Theorem + """The corresponding :class:`Theorem` object. + """ + + ast: Union[CommandTheoremNode, LemmaNode, MathlibTacticLemmaNode] = field( + repr=False, compare=False + ) + """AST of the theorem. + """ + + comments: List[Comment] = field(repr=False, compare=False) + """All comments in the theorem/proof. + """ + + traced_file: Optional["TracedFile"] = field(default=None, repr=False, compare=False) + """The traced file this theorem belongs to. + """ + + def __post_init__(self) -> None: + assert ( + self.root_dir.is_absolute() and self.root_dir == self.traced_file.root_dir + ) + + def __getstate__(self) -> Dict[str, Any]: + d = {k: v for k, v in self.__dict__.items() if k != "traced_file"} + d["traced_file"] = None + return d + + @property + def start(self) -> Pos: + """Start position in :file:`*.lean` file.""" + return self.ast.start + + @property + def end(self) -> Pos: + """End position in :file:`*.lean` file.""" + return self.ast.end + + @property + def repo(self) -> LeanGitRepo: + """The Lean repo this theorem belongs to.""" + return self.theorem.repo + + @property + def file_path(self) -> Path: + """The theorem's file path (relative to the root directory).""" + return self.theorem.file_path + + @property + def traced_repo(self) -> "TracedRepo": + """The traced repo this theorem belongs to.""" + if self.traced_file is None: + return None + else: + return self.traced_file.traced_repo + + @property + def is_private(self) -> bool: + """Check if the theorem is private.""" + return self.ast.is_private() + + def show(self) -> None: + """Show the theorem in the default browser.""" + url = os.path.join( + self.repo.url, + "blob", + self.repo.commit, + self.file_path, + f"#L{self.start.line_nb}-L{self.end.line_nb}", + ) + webbrowser.open(url) + + def has_tactic_proof(self) -> bool: + """Check if the theorem has a tactic-style proof.""" + return self.ast.has_tactic_proof() + + def get_proof_node(self) -> Node: + """Return the AST of the theorem's proof.""" + return self.ast.get_proof_node() + + def locate_proof(self) -> Tuple[Pos, Pos]: + """Return the start/end positions of the proof.""" + start, end = self.get_proof_node().get_closure() + if end < self.end: + end = self.end + return start, end + + def get_tactic_proof(self) -> Optional[str]: + """Return the tactic-style proof (if any).""" + if not self.has_tactic_proof(): + return None + node = self.get_proof_node() + start, end = node.get_closure() + proof = get_code_without_comments(node.lean_file, start, end, self.comments) + if not re.match(r"^(by|begin)\s", proof): + return None + else: + return proof + + def get_theorem_statement(self) -> str: + """Return the theorem statement.""" + proof_start, _ = self.locate_proof() + return get_code_without_comments( + self.traced_file.lean_file, self.ast.start, proof_start, self.comments + ) + + def get_single_tactic_proof(self) -> Optional[str]: + """Wrap the proof into a single (potentially very long) tactic.""" + if not self.has_tactic_proof(): + return None + node = self.get_proof_node() + start, end = node.get_closure() + proof = get_code_without_comments(node.lean_file, start, end, self.comments) + + raise NotImplementedError + assert isinstance(node.children[0], AtomNode) and node.children[0].val == "by" + assert proof.startswith("by") + proof = proof[len("by") :].strip() + + return proof + + def get_premise_full_names(self) -> List[str]: + """Return the fully qualified names of all premises used in the proof.""" + names = [] + + def _callback(node: IdentNode, _: 
List[Node]): + if node.full_name is not None: + names.append(node.full_name) + + self.ast.traverse_preorder(_callback, node_cls=IdentNode) + + return names + + def get_traced_tactics(self, atomic_only: bool = False) -> List[TracedTactic]: + """Return a list of traced tactics in the proof.""" + tacs = self._get_traced_tactics_lean4(atomic_only) + + # Deduplicate. + signatures = set() + tacs_dedup = [] + for t in tacs: + sig = (t.state_before, t.tactic, t.state_after) + if sig not in signatures: + signatures.add(sig) + tacs_dedup.append(t) + + return tacs_dedup + + def _get_traced_tactics_lean4( + self, atomic_only: bool = False + ) -> List[TracedTactic]: + tacs = [] + + def _callback(node, _): + if not isinstance( + node, + ( + TacticTacticseq1IndentedNode, + TacticTacticseqbracketedNode, + ), + ): + return + for tac_node in node.get_tactic_nodes(atomic_only): + if ( + hasattr(tac_node, "state_before") + and tac_node.state_before is not None + ): + # Tactics outside theorem/lemma definitions are not recorded. + tacs.append(TracedTactic(tac_node, self)) + + self.ast.traverse_preorder(_callback, node_cls=None) + return tacs + + def get_num_tactics(self) -> int: + """Return the number of tactics in the proof.""" + return len(self.get_traced_tactics()) + + +_TAG_INDEX_REGEX = re.compile(r"(?P\S+)\[(?P\d+)\]$") + + +def _qualify_name(name: str, prefix: str) -> str: + """Qualify a name with a prefix.""" + if name.startswith("_root_."): + return name[len("_root_.") :] + elif prefix == "": + return name + else: + return f"{prefix}.{name}" + + +def _fix_indentation(tac: str, indent: int) -> str: + """Fix the indentation of a tactic.""" + lines = tac.splitlines() + if len(lines) == 1: + return tac + else: + lines_new = [lines[0]] + for l in lines[1:]: + for i in range(len(l)): + if l[i] != " " or i >= indent: + lines_new.append(l[i:]) + break + + return "\n".join(lines_new) + + +@dataclass(eq=False) +class TracedFile: + """A traced file is a Lean source file annotated with syntactic/semantic information + such as tactic states, Lean expressions, and abstract syntax trees (ASTs). + """ + + root_dir: Path + """Root directory (in absolute path) of the corresponding traced repo. + """ + + repo: LeanGitRepo + """The Lean repo this traced file belongs to. + """ + + lean_file: LeanFile + """Lean source file of this traced file. + """ + + ast: FileNode = field(repr=False) + """Abstract syntax tree (AST) of the entire :code:`*.lean` file. + + AST nodes are defined in :ref:`lean_dojo.data_extraction.ast`. + """ + + comments: List[Comment] = field(repr=False) + """All comments in the :code:`*.lean` file. + """ + + traced_repo: Optional["TracedRepo"] = field(default=None, repr=False) + """The traced repo this traced file belongs to. + + Note that ``traced_repo`` will become None after the traced file is serialized/deserialized on its own. + """ + + def __post_init__(self) -> None: + assert self.root_dir.is_absolute(), f"{self.root_dir} is not an absolute path" + + def __getstate__(self) -> Dict[str, Any]: + d = {k: v for k, v in self.__dict__.items() if k != "traced_repo"} + d["traced_repo"] = None + return d + + @property + def path(self) -> Path: + """Path of the :file:`*.lean` file relative to the root directory.""" + return self.lean_file.path + + @property + def abs_path(self) -> Path: + """Absolute path of the :code:`*.lean` file.""" + return self.root_dir / self.path + + @property + def has_prelude(self) -> bool: + """Check whether the file starts with :code:``prelude``. 
+ + :code:``prelude`` instructs Lean NOT to include its built-in library automatically. + """ + result = False + + def _callback(node: ModulePreludeNode, _: List[Node]): + nonlocal result + result = True + return True # Stop traversing. + + self.ast.traverse_preorder(_callback, node_cls=ModulePreludeNode) + return result + + @classmethod + def from_traced_file( + cls, root_dir: Union[str, Path], json_path: Path, repo: LeanGitRepo + ) -> "TracedFile": + """Construct a :class:`TracedFile` object by parsing a :file:`*.ast.json` file + produced by :code:`lean --ast --tsast --tspp` (Lean 3) or :file:`ExtractData.lean` (Lean 4). + + Args: + root_dir (Union[str, Path]): Root directory of the traced repo. + json_path (Path): Path of the :file:`*.ast.json` file relative to ``root_dir``. + """ + root_dir = Path(root_dir) + root_dir = root_dir.resolve() + if not json_path.is_absolute(): + json_path = root_dir / json_path + if not json_path.exists(): + raise FileNotFoundError(f"{json_path} does not exist") + assert json_path.suffixes == [ + ".ast", + ".json", + ], f"{json_path} is not a *.ast.json file" + + return cls._from_lean4_traced_file(root_dir, json_path, repo) + + @classmethod + def _from_lean4_traced_file( + cls, root_dir: Path, json_path: Path, repo: LeanGitRepo + ) -> "TracedFile": + lean_path = to_lean_path(root_dir, json_path, repo) + lean_file = LeanFile(root_dir, lean_path) + + data = json.load(json_path.open()) + + data["module_paths"] = [] + for line in ( + json_path.with_suffix("").with_suffix("").with_suffix(".dep_paths").open() + ): + line = line.strip() + if line == "": + break + data["module_paths"].append(line) + + ast = FileNode.from_data(data, lean_file) + comments = _collect_lean4_comments(ast) + TracedFile._post_process_lean4( + ast, + lean_file, + data["tactics"], + data["premises"], + data["module_paths"], + comments, + ) + + return cls(root_dir, repo, lean_file, ast, comments) + + @classmethod + def _post_process_lean4( + cls, + ast: FileNode, + lean_file: LeanFile, + tactics_data: List[Dict[str, Any]], + premises_data: List[Dict[str, Any]], + imports_data: List[str], + comments: List[Comment], + ) -> None: + pos2tactics = {} + for t in tactics_data: + start = lean_file.convert_pos(t["pos"]) + end = lean_file.convert_pos(t["endPos"]) + pos2tactics[(start, end)] = t + + pos2premises = {} + for p in premises_data: + if ( + p is None + or p["pos"] is None + or p["endPos"] is None + or p["fullName"] is None + or p["fullName"] == "[anonymous]" + ): + continue + start_line_nb, start_column_nb = p["pos"]["line"], p["pos"]["column"] + end_line_nb, end_column_nb = p["endPos"]["line"], p["endPos"]["column"] + start = Pos(line_nb=start_line_nb, column_nb=start_column_nb + 1) + end = Pos(line_nb=end_line_nb, column_nb=end_column_nb + 1) + pos2premises[(start, end)] = p + + inside_sections_namespaces = [] + + def _callback(node: Node, _): + if ( + isinstance( + node, + ( + CommandNamespaceNode, + CommandSectionNode, + CommandNoncomputablesectionNode, + ), + ) + and node.name is not None + ): + inside_sections_namespaces.append(node) + elif ( + isinstance(node, CommandEndNode) + and node.name is not None + and len(inside_sections_namespaces) > 0 + ): + inside_sections_namespaces.pop() + elif is_potential_premise_lean4(node): + prefix = ".".join( + ns.name + for ns in inside_sections_namespaces + if isinstance(ns, CommandNamespaceNode) + ) + full_name = ( + [_qualify_name(name, prefix) for name in node.name] + if is_mutual_lean4(node) + else _qualify_name(node.name, prefix) + ) + 
object.__setattr__(node, "full_name", full_name) + if isinstance(node, CommandDeclarationNode) and node.is_theorem: + object.__setattr__(node.get_theorem_node(), "full_name", full_name) + elif isinstance( + node, + ( + TacticTacticseq1IndentedNode, + TacticTacticseqbracketedNode, + ), + ): + for tac_node in node.get_tactic_nodes(): + assert isinstance( + tac_node, (OtherNode, TacticTacticseqbracketedNode) + ) + if (tac_node.start, tac_node.end) not in pos2tactics: + continue + t = pos2tactics[(tac_node.start, tac_node.end)] + tac = get_code_without_comments( + lean_file, tac_node.start, tac_node.end, comments + ) + tac = _fix_indentation(tac, tac_node.start.column_nb - 1) + object.__setattr__(tac_node, "state_before", t["stateBefore"]) + object.__setattr__(tac_node, "state_after", t["stateAfter"]) + object.__setattr__(tac_node, "tactic", tac) + elif isinstance(node, IdentNode): + start, end = node.get_closure() + if (start, end) in pos2premises: + assert start is not None + assert end is not None + p = pos2premises[(start, end)] + prem = get_code_without_comments(lean_file, start, end, comments) + prem = _fix_indentation(prem, start.column_nb - 1) + if p["fullName"] is not None: + object.__setattr__(node, "full_name", p["fullName"]) + if p["modName"] is not None: + object.__setattr__(node, "mod_name", p["modName"]) + if p["defPath"] is not None: + object.__setattr__(node, "def_path", p["defPath"]) + if p["defPos"] is not None and p["defEndPos"] is not None: + def_start_line_nb, def_start_column_nb = ( + p["defPos"]["line"], + p["defPos"]["column"], + ) + def_end_line_nb, def_end_column_nb = ( + p["defEndPos"]["line"], + p["defEndPos"]["column"], + ) + def_start = Pos( + line_nb=def_start_line_nb, column_nb=def_start_column_nb + 1 + ) + def_end = Pos( + line_nb=def_end_line_nb, column_nb=def_end_column_nb + 1 + ) + object.__setattr__(node, "def_start", def_start) + object.__setattr__(node, "def_end", def_end) + elif isinstance(node, ModuleImportNode): + node_module_name = object.__getattribute__(node, "module") + if node_module_name is not None: + suffix = node_module_name.replace(".", "/") + for import_line in imports_data: + if import_line.endswith( + suffix + ".lean" + ) or import_line.endswith(suffix + "/default.lean"): + object.__setattr__(node, "path", Path(import_line)) + + ast.traverse_preorder(_callback, node_cls=None) + + def check_sanity(self) -> None: + """Perform some basic sanity checks. + + The function raises exceptions in case of unsuccessful checks. + """ + assert isinstance(self.root_dir, Path) + assert isinstance(self.lean_file, LeanFile) + isinstance(self.ast, FileNode) + + assert self.lean_file.root_dir == self.root_dir + + for t in self.get_traced_theorems(): + assert str(self.lean_file.path).endswith(str(t.theorem.file_path)) + assert t.traced_file is None or t.traced_file is self + + def traverse_preorder(self, callback, node_cls: Optional[type] = None): + """Traverse the AST in preorder. + + Args: + callback (function): Callback function for visiting AST nodes. + node_cls (Optional[type], optional): Restrict the application of + ``callback`` to only nodes of type ``node_cls``. + Defaults to None, which means applying ``callback`` to all. + """ + self.ast.traverse_preorder(callback, node_cls) + + def _get_repo_and_relative_path(self) -> Tuple[LeanGitRepo, Path]: + """Return the repo this file belongs to, as well as the file's path relative to it.""" + if self.path.is_relative_to(LEAN4_PACKAGES_DIR): + # The theorem belongs to one of the dependencies. 
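+            # E.g., "lake-packages/std/Std.lean" (illustrative path) belongs to the "std"
+            # dependency, and its path relative to that dependency is "Std.lean".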
+ p = self.path.relative_to(LEAN4_PACKAGES_DIR) + name = p.parts[0] + repo = self.traced_repo.dependencies[name] + return repo, p.relative_to(name) + else: + # The theorem belongs to the traced repo itself. + return self.traced_repo.repo, self.path + + def get_traced_theorem( + self, thm_or_name: Union[Theorem, str] + ) -> Optional[TracedTheorem]: + """Return a :class:`TracedTheorem` object given an :class:`Theorem` object + or its fully-qualified name.""" + if isinstance(thm_or_name, Theorem): + thm = thm_or_name + else: + repo, path = self._get_repo_and_relative_path() + thm = Theorem(repo, path, thm_or_name) + result = None + private_result = None + + def _callback( + node: Union[CommandTheoremNode, LemmaNode, MathlibTacticLemmaNode], _ + ) -> None: + nonlocal result, private_result + if not isinstance( + node, + ( + CommandTheoremNode, + LemmaNode, + MathlibTacticLemmaNode, + ), + ): + return False + if node.full_name == thm.full_name: + comments = self._filter_comments(node.start, node.end) + t = TracedTheorem(self.root_dir, thm, node, comments, self) + if t.is_private: + private_result = t + else: + result = t + + self.ast.traverse_preorder(_callback, node_cls=None) + + # Prioritize non-private theorems. + if result is None: + result = private_result + return result + + def get_traced_theorems(self) -> List[TracedTheorem]: + """Return a list of traced theorem in this traced file.""" + traced_theorems = [] + + def _callback( + node: Union[CommandTheoremNode, LemmaNode, MathlibTacticLemmaNode], _ + ) -> None: + if not isinstance( + node, + ( + CommandTheoremNode, + LemmaNode, + MathlibTacticLemmaNode, + ), + ): + return False + repo, path = self._get_repo_and_relative_path() + thm = Theorem(repo, path, node.full_name) + comments = self._filter_comments(node.start, node.end) + traced_theorems.append( + TracedTheorem(self.root_dir, thm, node, comments, self) + ) + # No need to traverse the subtree since theorems cannot be nested. + return True + + self.traverse_preorder(_callback, node_cls=None) + return traced_theorems + + def _filter_comments(self, start: Pos, end: Pos) -> List[Comment]: + """Return a list of comments that are contained in the given range.""" + comments = [] + for c in self.comments: + if c.start < start: + assert c.end <= start + elif c.start < end: + assert c.end <= end + comments.append(c) + return comments + + def get_direct_dependencies(self, repo: LeanGitRepo) -> List[Tuple[str, Path]]: + """Return the names and paths of all modules imported by the current :file:`*.lean` file.""" + deps = set() + + if not self.has_prelude: # Add the prelude as a dependency. + init_lean = Path("src/lean/Init.lean") + if self.root_dir.name == "lean4": + deps.add(("Init", init_lean)) + else: + deps.add(("Init", LEAN4_PACKAGES_DIR / "lean4" / init_lean)) + + def _callback(node: ModuleImportNode, _) -> None: + if node.module is not None and node.path is not None: + deps.add((node.module, node.path)) + + self.traverse_preorder(_callback, node_cls=ModuleImportNode) + return list(deps) + + def get_premise_definitions(self) -> List[Dict[str, Any]]: + """Return all theorems and definitions defined in the current file that + can be potentially used as premises. 
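+
+        Each entry is a dict with the keys ``full_name``, ``code``, ``start``, ``end``, and
+        ``kind``. A made-up example for illustration::
+
+            {"full_name": "Nat.my_lemma", "code": "theorem my_lemma : 1 + 1 = 2",
+             "start": [10, 1], "end": [12, 30], "kind": "theorem"}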
+ + Returns: + List[Dict[str, Any]]: _description_ + """ + results = [] + + def _callback4(node: Node, _) -> None: + if is_potential_premise_lean4(node): + start, end = node.get_closure() + if isinstance(node, CommandDeclarationNode) and node.is_theorem: + # We assume theorems are defined using keywords "theorem" + # or "lemma" but not, e.g., "def". + proof_start, _ = ( + node.get_theorem_node().get_proof_node().get_closure() + ) + code = get_code_without_comments( + self.lean_file, start, proof_start, self.comments + ) + if code.endswith(":="): + code = code[:-2].strip() + else: + code = get_code_without_comments( + self.lean_file, start, end, self.comments + ) + # TODO: For alias, restate_axiom, etc., the code is not very informative. + if is_mutual_lean4(node): + for s in node.full_name: + results.append( + { + "full_name": s, + "code": code, + "start": list(start), + "end": list(end), + "kind": node.kind(), + } + ) + else: + results.append( + { + "full_name": node.full_name, + "code": code, + "start": list(start), + "end": list(end), + "kind": node.kind(), + } + ) + + self.traverse_preorder(_callback4, node_cls=None) + return results + + def to_xml(self) -> str: + """Serialize a :class:`TracedFile` object to XML.""" + tree = etree.Element(self.__class__.__name__) + + tree.set("path", str(self.path)) + tree.set("md5", compute_md5(self.abs_path)) + + self.ast.to_xml(tree) + + if self.comments is not None: + comments_node = etree.SubElement(tree, "Comments") + for c in self.comments: + c.to_xml(comments_node) + + return etree.tostring(tree, encoding="utf-8", pretty_print=True).decode() + + @classmethod + def from_xml( + cls, + root_dir: Union[str, Path], + path: Union[str, Path], + repo: LeanGitRepo, + ) -> "TracedFile": + """Load a :class:`TracedFile` object from its :file:`*.trace.xml` file. + + Args: + root_dir (Union[str, Path]): Root directory of the traced repo. + path (Union[str, Path]): Path of the :file:`*.trace.xml` file relative to ``root_dir``. + repo (LeanGitRepo): The repo to which the traced file belongs. 
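+
+        A minimal usage sketch (the path below is a placeholder)::
+
+            tf = TracedFile.from_xml(root_dir, "build/ir/Foo/Bar.trace.xml", repo)
+            tf.check_sanity()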
+ """ + root_dir = Path(root_dir) + path = Path(path) + assert path.suffixes == [".trace", ".xml"] + lean_path = to_lean_path(root_dir, path, repo) + lean_file = LeanFile(root_dir, lean_path) + + tree = etree.parse(path).getroot() + assert tree.tag == "TracedFile" + assert tree.attrib["path"] == str(lean_path) + assert tree.attrib["md5"] == compute_md5(lean_file.abs_path) + + ast_tree, comments_tree = list(tree) + ast = FileNode.from_xml(ast_tree, lean_file) + comments = [Comment.from_xml(c) for c in comments_tree] + + return cls(root_dir, repo, lean_file, ast, comments) + + +def _save_xml_to_disk(tf: TracedFile) -> None: + xml_path = tf.root_dir / to_xml_path(tf.root_dir, tf.path, tf.repo) + with xml_path.open("wt") as oup: + oup.write(tf.to_xml()) + + +def _build_dependency_graph( + seed_files: List[TracedFile], root_dir: Path, repo: LeanGitRepo +) -> nx.DiGraph: + G = nx.DiGraph() + + for tf in seed_files: + tf_path_str = str(tf.path) + assert not G.has_node(tf_path_str) + G.add_node(tf_path_str, traced_file=tf) + + traced_files = seed_files.copy() + i = 0 + + while i < len(traced_files): + tf = traced_files[i] + tf_path_str = str(tf.path) + + for dep_module, dep_path in tf.get_direct_dependencies(repo): + dep_path_str = str(dep_path) + if not G.has_node(dep_path_str): + json_path = to_json_path(root_dir, dep_path, repo) + tf_dep = TracedFile.from_traced_file(root_dir, json_path, repo) + G.add_node(dep_path_str, traced_file=tf_dep) + traced_files.append(tf_dep) + + G.add_edge(tf_path_str, dep_path_str, module=dep_module) + + i += 1 + + assert nx.is_directed_acyclic_graph(G) + return G + + +@ray.remote +class _TracedRepoHelper: + """ + Helper class serving as Ray actor. + """ + + def __init__(self, root_dir: Path, repo: LeanGitRepo) -> None: + self.root_dir = root_dir + self.repo = repo + + def parse_traced_file(self, path: Path) -> TracedFile: + return TracedFile.from_traced_file(self.root_dir, path, self.repo) + + def save_xml_to_disk(self, tf: TracedFile) -> None: + return _save_xml_to_disk(tf) + + def load_xml_from_disk(self, path: Path) -> TracedFile: + return TracedFile.from_xml(self.root_dir, path, self.repo) + + +@dataclass(frozen=True, eq=False) +class TracedRepo: + """A traced repo is a Lean repo of traced files and additional information, such as + other repos it depends on, as well as the dependency graph between files. + """ + + repo: LeanGitRepo + """The corresponding Lean repo. + """ + + dependencies: Dict[str, LeanGitRepo] + """Dictionary mapping the name of each dependency to a :class:`LeanGitRepo` object. + """ + + root_dir: Path + """Root directory of the traced repo. + """ + + traced_files: List[TracedFile] = field(repr=False) + """List of traced files in the repo.""" + + traced_files_graph: Optional[nx.DiGraph] = field(repr=False) + """Dependency graph between files in the repo. + + The graph is a DAG, and there is an edge from file :file:`X` to file :file:`Y` + if and only if :file:`X` imports :file:`Y` + """ + + def __post_init__(self) -> None: + assert self.root_dir.is_absolute() + + def __setstate__(self, state) -> None: + object.__setattr__(self, "__dict__", state) + self._update_traced_files() + + @property + def name(self) -> str: + """Name of the repo.""" + return self.repo.name + + def show(self) -> None: + """Show the repo in the default browser.""" + self.repo.show() + + def check_sanity(self) -> None: + """Perform some basic sanity checks. + + The function raises exceptions in case of unsuccessful checks. 
+ """ + logger.debug(f"Checking the sanity of {self}") + assert isinstance(self.repo, LeanGitRepo) + assert isinstance(self.dependencies, dict) + for k, v in self.dependencies.items(): + assert isinstance(k, str) and isinstance(v, LeanGitRepo) + assert isinstance(self.root_dir, Path) + assert self.traced_files_graph is None or isinstance( + self.traced_files_graph, nx.DiGraph + ) + + assert self.repo not in self.dependencies.values() + + json_files = { + p.relative_to(self.root_dir) for p in self.root_dir.glob("**/*.ast.json") + } + lean_files = { + p.relative_to(self.root_dir) for p in self.root_dir.glob("**/*.lean") + } + xml_files = { + p.relative_to(self.root_dir) for p in self.root_dir.glob("**/*.trace.xml") + } + path_files = { + p.relative_to(self.root_dir) for p in self.root_dir.glob("**/*.dep_paths") + } + + if self.traced_files_graph is not None: + if not LOAD_USED_PACKAGES_ONLY: + assert len(json_files) == self.traced_files_graph.number_of_nodes() + + for path_str, tf_node in self.traced_files_graph.nodes.items(): + tf = tf_node["traced_file"] + path = Path(path_str) + tf.check_sanity() + assert tf.path == path and tf.root_dir == self.root_dir + assert tf.traced_repo is None or tf.traced_repo is self + assert path in lean_files + assert ( + to_dep_path(self.root_dir, path, self.repo) in path_files + ), to_dep_path(self.root_dir, path, self.repo) + assert ( + to_json_path(self.root_dir, path, self.repo) in json_files + ), to_json_path(self.root_dir, path, self.repo) + if len(xml_files) > 0: + assert ( + to_xml_path(self.root_dir, path, self.repo) in xml_files + ), to_xml_path(self.root_dir, path, self.repo) + + @classmethod + def from_traced_files( + cls, root_dir: Union[str, Path], build_deps: bool = True + ) -> "TracedRepo": + """Construct a :class:`TracedRepo` object by parsing :file:`*.ast.json` and :file:`*.path` files + produced by :code:`lean --ast --tsast --tspp` (Lean 3) or :file:`ExtractData.lean` (Lean 4). + + Args: + root_dir (Union[str, Path]): Root directory of the traced repo. + build_deps (bool, optional): Whether to build the dependency graph between files. 
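+
+        A minimal usage sketch (the directory is a placeholder)::
+
+            traced_repo = TracedRepo.from_traced_files("/tmp/traced/my-lean-repo")
+            traced_repo.check_sanity()
+            traced_repo.save_to_disk()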
+ """ + root_dir = Path(root_dir).resolve() + if not is_git_repo(root_dir): + raise RuntimeError(f"{root_dir} is not a Git repo.") + repo = LeanGitRepo.from_path(root_dir) + + json_paths = list(root_dir.glob("**/*.ast.json")) + random.shuffle(json_paths) + logger.debug( + f"Parsing {len(json_paths)} *.ast.json files in {root_dir} with {NUM_WORKERS} workers" + ) + + if NUM_WORKERS <= 1: + traced_files = [ + TracedFile.from_traced_file(root_dir, path, repo) + for path in tqdm(json_paths) + ] + else: + with ray_actor_pool(_TracedRepoHelper, root_dir, repo) as pool: + traced_files = list( + tqdm( + pool.map_unordered( + lambda a, p: a.parse_traced_file.remote(p), json_paths + ), + total=len(json_paths), + ) + ) + + dependencies = repo.get_dependencies(root_dir) + if build_deps: + traced_files_graph = _build_dependency_graph(traced_files, root_dir, repo) + else: + traced_files_graph = None + + traced_repo = cls( + repo, dependencies, root_dir, traced_files, traced_files_graph + ) + traced_repo._update_traced_files() + return traced_repo + + def get_traced_file(self, path: Union[str, Path]) -> TracedFile: + """Return a traced file by its path.""" + return self.traced_files_graph.nodes[str(path)]["traced_file"] + + def _update_traced_files(self) -> None: + for tf in self.traced_files: + tf.traced_repo = self + + def save_to_disk(self) -> None: + """Save all traced files in the repo to the disk as :file:`*.trace.xml` files.""" + num_traced_files = len(self.traced_files) + logger.debug( + f"Saving {num_traced_files} traced XML files to {self.root_dir} with {NUM_WORKERS} workers" + ) + if NUM_WORKERS <= 1: + for tf in tqdm(self.traced_files, total=num_traced_files): + _save_xml_to_disk(tf) + else: + with ray_actor_pool(_TracedRepoHelper, self.root_dir, self.repo) as pool: + list( + tqdm( + pool.map_unordered( + lambda a, tf: a.save_xml_to_disk.remote(tf), + self.traced_files, + ), + total=num_traced_files, + ) + ) + + @classmethod + def load_from_disk( + cls, root_dir: Union[str, Path], build_deps: bool = True + ) -> "TracedRepo": + """Load a traced repo from :file:`*.trace.xml` files.""" + root_dir = Path(root_dir).resolve() + if not is_git_repo(root_dir): + raise RuntimeError(f"{root_dir} is not a Git repo.") + repo = LeanGitRepo.from_path(root_dir) + + xml_paths = list(root_dir.glob("**/*.trace.xml")) + logger.debug( + f"Loading {len(xml_paths)} traced XML files from {root_dir} with {NUM_WORKERS} workers" + ) + + # Start from files in the target repo as seeds. + # Only load dependency files that are actually used. 
+ if LOAD_USED_PACKAGES_ONLY: + xml_paths = [ + p + for p in xml_paths + if not "lake-packages/" in str(p) and not ".lake/packages" in str(p) + ] + + if NUM_WORKERS <= 1: + traced_files = [ + TracedFile.from_xml(root_dir, path, repo) for path in tqdm(xml_paths) + ] + else: + with ray_actor_pool(_TracedRepoHelper, root_dir, repo) as pool: + traced_files = list( + tqdm( + pool.map_unordered( + lambda a, path: a.load_xml_from_disk.remote(path), xml_paths + ), + total=len(xml_paths), + ) + ) + + dependencies = repo.get_dependencies(root_dir) + if build_deps: + traced_files_graph = _build_dependency_graph(traced_files, root_dir, repo) + else: + traced_files_graph = None + + traced_repo = cls( + repo, dependencies, root_dir, traced_files, traced_files_graph + ) + traced_repo._update_traced_files() + return traced_repo + + def get_traced_theorems(self) -> List[TracedTheorem]: + """Return all traced theorems in the repo.""" + return list( + itertools.chain.from_iterable( + tf.get_traced_theorems() for tf in self.traced_files + ) + ) + + def get_traced_theorem(self, thm: Theorem) -> Optional[TracedTheorem]: + """Return a :class:`TracedTheorem` object corresponding to ``thm``""" + if thm.repo == self.repo: + path = Path(thm.repo.name) / thm.file_path + else: + assert thm.repo in self.dependencies.values() + path = Path(self.name) / LEAN4_PACKAGES_DIR / thm.repo.name / thm.file_path + return self.get_traced_file(path).get_traced_theorem(thm.full_name) diff --git a/interaction/Lean4Repl.lean b/interaction/Lean4Repl.lean new file mode 100644 index 0000000..dad9b66 --- /dev/null +++ b/interaction/Lean4Repl.lean @@ -0,0 +1,357 @@ +-- REPL for interacting with Lean 4 via the command line. +import Lean.Message +import Lean.Elab.Tactic +import Lean.Elab.Frontend + +open Lean Lean.Meta Lean.Elab Lean.Elab.Command Lean.Elab.Tactic + +namespace LeanDojo + + +/-- Print the response as JSON. --/ +private def printResponse {α : Type _} [ToJson α] (res : α) : IO Unit := do + let json := (toJson res).pretty 99999999999999999 + println! "REPL> {json}" + (← IO.getStdout).flush + + +/-- Join a list of strings using a separator. --/ +private def join (l : List String) (sep : String := "\n") : String := + match l with + | [] => "" + | first :: others => others.foldl (fun r s => r ++ sep ++ s) first + + +/-- A request to REPL. --/ +structure Request where + /-- Tactic/command state ID on which to execute the request. -/ + sid: Nat + /-- Tactic/command. --/ + cmd: String +deriving FromJson, ToJson + + +/-- A response to REPL. --/ +structure Response where + /-- New tactic/command state ID. --/ + sid : Option Nat := none + /-- Next tactic state. --/ + tacticState : Option String := none + /-- Error message. --/ + error: Option String := none +deriving ToJson + + +/-- The state of the REPL. --/ +structure ReplState (σ : Type _) where + /-- Saved tactic/command states. --/ + savedStates : Array σ + /-- The first solved tactic state. --/ + solvedState : Option σ + + +/-- Get the saved tactic state with the given ID. --/ +private def getSavedState? (m : Type → Type) [Monad m] {σ : Type _} [MonadState (ReplState σ) m] (sid : Nat) : m (Option σ) := do + return (← get).savedStates[sid]? + + +/-- Get the initial tactic state. --/ +private def getInitialState! (m : Type → Type) [Monad m] {σ : Type _} [MonadState (ReplState σ) m] [MonadError m] : m σ := do + let some ts ← getSavedState? m 0 | throwError "[fatal] no initial state" + return ts + + +/-- Get the next state ID. 
--/ +private def getNextSid (m : Type → Type) [Monad m] {σ : Type _} [MonadState (ReplState σ) m] : m Nat := do + return (← get).savedStates.size + + +namespace TacticRepl + + +/-- The tactic REPL monad. --/ +abbrev TacticReplM := StateT (ReplState Tactic.SavedState) TacticM + + +instance : MonadLift IO TacticReplM where + monadLift x := liftM x + + +/-- Insert a tactic state into the REPL state. --/ +private def insertTacticState (ts : Tactic.SavedState) : TacticReplM Unit := do + let succeeded := ts.tactic.goals.isEmpty + modifyGet fun s => ((), ⟨s.savedStates.push ts, + match s.solvedState with + | some _ => s.solvedState + | none => if succeeded then ts else none + ⟩) + + +/-- Pretty print the given tactic state. --/ +def ppTacticState (ts : Tactic.SavedState) : TacticM String := do + match ts.tactic.goals with + | [] => return "no goals" + | [g] => return (← Meta.ppGoal g).pretty + | goals => + return (← goals.foldlM (fun a b => do return a ++ "\n\n" ++ (← Meta.ppGoal b).pretty) "").trim + + +/-- Initialize the REPL. --/ +private def initializeTacticRepl : TacticM Tactic.SavedState := do + if not (← isProp (← getMainTarget)) then + throwError "[fatal] not_a_theorem" + pruneSolvedGoals + let ts ← Tactic.saveState + let ts_str ← ppTacticState ts + let res : Response := {sid := some 0, tacticState := ts_str} + printResponse res + return ts + + +private def levels2Names : List Level → NameSet + | [] => NameSet.empty + | Level.param n :: us => (levels2Names us).insert n + | _ :: us => levels2Names us + + +private def collectFromLevel : Level → NameSet +| Level.zero => NameSet.empty +| Level.succ l => collectFromLevel l +| Level.param n => NameSet.empty.insert n +| Level.max l1 l2 => (collectFromLevel l1).union $ collectFromLevel l2 +| Level.imax l1 l2 => (collectFromLevel l1).union $ collectFromLevel l2 +| Level.mvar _ => NameSet.empty + + +private def collectLevelParams : Expr → NameSet + | .sort u => collectFromLevel u + | .const _ us => levels2Names us + | .app fm arg => (collectLevelParams fm).union $ collectLevelParams arg + | .lam _ binderType body _ => (collectLevelParams binderType).union $ collectLevelParams body + | .forallE _ binderType body _ => (collectLevelParams binderType).union $ collectLevelParams body + | .letE _ type value body _ => ((collectLevelParams type).union $ collectLevelParams value).union $ collectLevelParams body + | .mdata _ expr => collectLevelParams expr + | .proj _ _ struct => collectLevelParams struct + | _ => NameSet.empty + + +private def collectFVarsAux : Expr → NameSet + | .fvar fvarId => NameSet.empty.insert fvarId.name + | .app fm arg => (collectFVarsAux fm).union $ collectFVarsAux arg + | .lam _ binderType body _ => (collectFVarsAux binderType).union $ collectFVarsAux body + | .forallE _ binderType body _ => (collectFVarsAux binderType).union $ collectFVarsAux body + | .letE _ type value body _ => ((collectFVarsAux type).union $ collectFVarsAux value).union $ collectFVarsAux body + | .mdata _ expr => collectFVarsAux expr + | .proj _ _ struct => collectFVarsAux struct + | _ => NameSet.empty + + +private def collectFVars (e : Expr) : MetaM (Array Expr) := do + let names := collectFVarsAux e + let mut fvars := #[] + for ldecl in ← getLCtx do + if ldecl.isImplementationDetail then + continue + if names.contains ldecl.fvarId.name then + fvars := fvars.push $ .fvar ldecl.fvarId + return fvars + + +private def abstractAllLambdaFVars (e : Expr) : MetaM Expr := do + let mut e' := e + while e'.hasFVar do + let fvars ← collectFVars e' + if fvars.isEmpty then 
+ break + e' ← mkLambdaFVars fvars e' + return e' + + +private def validateProof : TacticReplM Response := do + let ts ← Tactic.saveState + + -- Go to the initial state and grab the goal's metavariable ID. + let ts0 ← getInitialState! TacticReplM + ts0.restore + let [goalId] ← getGoals | throwError "[fatal] more than one initial goal" + let tgt ← getMainTarget >>= instantiateMVars + let tgt_fmt ← ppExpr tgt + + -- Check its assigned Expr in the current state. + ts.restore + let some pf ← getExprMVarAssignment? goalId | throwError "[fatal] goal not assigned" + let pf ← instantiateMVars pf + let pft ← inferType pf >>= instantiateMVars + let pft_fmt ← ppExpr pft + + if ! (← withTransparency .all (isExprDefEq tgt pft)) then + return {error := s!"proof type mismatch: {tgt_fmt} != {pft_fmt}"} + + ts0.restore + let pf ← goalId.withContext $ abstractAllLambdaFVars pf + let pft ← inferType pf >>= instantiateMVars + + ts.restore + if pf.hasSorry then + return {error := "proof contains `sorry`"} + + if pf.hasExprMVar then + return {error := "proof contains metavariables"} + + -- Kernel type check. + let lvls := (collectLevelParams pf).toList + let decl := Declaration.thmDecl { + name := Name.anonymous, type := pft, value := pf + levelParams := lvls + } + try + let _ ← addDecl decl + catch ex => + return {error := s!"kernel type check failed: {← ex.toMessageData.toString}"} + + let ts_str ← ppTacticState ts + let next_tsid ← getNextSid TacticReplM + insertTacticState ts + return {sid := next_tsid, tacticState := ts_str} + + +private def handleRunTac (req : Request) : TacticReplM Response := do + match ← getSavedState? TacticReplM req.sid with + | none => throwError s!"[fatal] unknown tsid: {req.sid}" + | some ts => + match Parser.runParserCategory (← getEnv) `tactic req.cmd "" with + | .error err => return {error := err} + | .ok stx => + ts.restore + + try + monadLift $ commitIfNoEx (evalTactic stx) + let s ← getThe Core.State + if s.messages.hasErrors then + let messages := s.messages.toList.filter fun m => m.severity == MessageSeverity.error + return { error := join $ ← (messages.map (·.data)).mapM fun md => md.toString } + catch ex => + return {error := ← ex.toMessageData.toString} + + pruneSolvedGoals + if (← getGoals).isEmpty then + validateProof + else + let ts' ← Tactic.saveState + let ts'_str ← ppTacticState ts' + let next_tsid ← getNextSid TacticReplM + insertTacticState ts' + return {sid := next_tsid, tacticState := ts'_str} + + +end TacticRepl + + +private def loop (m : Type → Type) [Monad m] [MonadLift IO m] [MonadError m] (handler : Request → m Response) : m Unit := do + while true do + let line ← (← IO.getStdin).getLine + if line.trim == "exit" then + break + match (Json.parse line) with + | .error err => throwError s!"[fatal] failed to parse JSON {err}" + | .ok cmd => + match (fromJson? cmd : Except String Request) with + | .error err => throwError s!"[fatal] parse_failed: data={err}" + | .ok req => (← handler req) |> printResponse + + +namespace TacticRepl + +/-- +{"sid": 0, "cmd": "skip"} +{"sid": 1, "cmd": "rw [add_assoc, add_comm b, ←add_assoc]"} +exit +--/ +def repl : TacticM Unit := do + withMainContext do + -- Print the initial goal. + let ts ← initializeTacticRepl + -- Interaction through the command line. + let loop := LeanDojo.loop TacticReplM handleRunTac + let (_, s) ← loop.run {savedStates := #[ts], solvedState := none} + -- Close the proof if we have found a solved tactic state. 
+ match s.solvedState with + | none => return () + | some ts' => ts'.restore + IO.Process.exit 0 + + +end TacticRepl + + +namespace CommandRepl + + +/-- The REPL monad. --/ +abbrev CommandReplM := StateT (ReplState Command.State) CommandElabM + + +instance : MonadLift IO CommandReplM where + monadLift x := liftM x + + +/-- Insert a command state into the REPL state. --/ +private def insertCommandState (cs : Command.State) : CommandReplM Unit := do + modifyGet fun s => ((), ⟨s.savedStates.push cs, none⟩) + + +/-- Initialize the REPL. --/ +private def initializeRepl : CommandElabM Command.State := do + let res : Response := {sid := some 0} + printResponse res + return (← get) + + +private def handleRunCmd (req : Request) : CommandReplM Response := do + match ← getSavedState? CommandReplM req.sid with + | none => throwError s!"[fatal] unknown csid: {req.sid}" + | some cs => + let inputCtx := Parser.mkInputContext req.cmd "" + let parserState := { : Parser.ModuleParserState } + let cs' := (← IO.processCommands inputCtx parserState cs).commandState + + -- Collect error messages and print other messages. + let messages := cs'.messages.toList + let mut errors := #[] + for msg in messages do + let s ← msg.data.toString + if msg.severity == MessageSeverity.error then + errors := errors.push s + else + println! s.trim + let err_msg := if errors.isEmpty then none else some (join errors.toList) + + let next_csid ← getNextSid CommandReplM + insertCommandState cs' + return {sid := next_csid, error := err_msg} + + +/-- +{"sid": 0, "cmd": "#eval 1"} +{"sid": 1, "cmd": "#eval x"} +{"sid": 0, "cmd": "def x := 1"} +{"sid": 3, "cmd": "#eval x"} +exit +--/ +def repl : CommandElabM Unit := do + let cs ← initializeRepl + let loop := LeanDojo.loop CommandReplM handleRunCmd + let _ ← loop.run {savedStates := #[cs], solvedState := none} + IO.Process.exit 0 + +end CommandRepl + +end LeanDojo + + +/-- The `lean_dojo_repl` tactic. --/ +elab "lean_dojo_repl" : tactic => LeanDojo.TacticRepl.repl + + +/-- The `#lean_dojo_repl` command. 
--/ +elab "#lean_dojo_repl" : command => LeanDojo.CommandRepl.repl diff --git a/interaction/dojo.py b/interaction/dojo.py new file mode 100644 index 0000000..e13c576 --- /dev/null +++ b/interaction/dojo.py @@ -0,0 +1,549 @@ +import re +import os +import sys +import json +import time +import signal +import shutil +import psutil +from pathlib import Path +from loguru import logger +from tempfile import mkdtemp +from shutil import ignore_patterns +from subprocess import TimeoutExpired +from dataclasses import dataclass, field +from typing import Union, Tuple, List, Dict, Any, Optional + +from ..constants import ( + TMP_DIR, + TACTIC_CPU_LIMIT, + TACTIC_MEMORY_LIMIT, +) +from ..utils import to_json_path +from .parse_goals import parse_goals, Goal +from ..data_extraction.trace import get_traced_repo_path +from ..data_extraction.lean import Theorem, LeanGitRepo, Pos +from ..container import get_container, Mount, NativeContainer, DockerContainer +from ..data_extraction.traced_data import TracedFile, get_code_without_comments + + +_REPL_PROMPT = "REPL>" + + +@dataclass(frozen=True) +class CommandState: + id: int = field(compare=False) + message: Optional[str] = field(default=None, compare=False) + + +@dataclass(frozen=True) +class TacticState: + pp: str + id: int = field(compare=False) + message: Optional[str] = field(default=None, compare=False) + goals: List[Goal] = field(init=False, compare=False, repr=False) + + def __post_init__(self) -> None: + goals = parse_goals(self.pp) + assert len(goals) == self.pp.count("⊢") + object.__setattr__(self, "goals", goals) + + @property + def num_goals(self) -> int: + return len(self.goals) + + +@dataclass(frozen=True) +class ProofFinished: + tactic_state_id: int + message: Optional[str] = field(default=None, compare=False) + + +@dataclass(frozen=True) +class ProofGivenUp: + pass + + +@dataclass(frozen=True) +class LeanError: + error: str + + +@dataclass(frozen=True) +class TimeoutError: + error: str + + +TacticResult = Union[ + TacticState, + ProofFinished, + LeanError, + TimeoutError, + ProofGivenUp, +] + +CommandResult = Union[CommandState, LeanError, TimeoutError] + +State = Union[CommandState, TacticState] + + +class DojoCrashError(Exception): + @property + def is_out_of_memory(self) -> bool: + return str(self) == "OOM" + + +class DojoHardTimeoutError(Exception): + pass + + +class DojoInitError(Exception): + pass + + +def _kill_descendants(proc: psutil.Process) -> None: + for child in proc.children(): + _kill_descendants(child) + proc.kill() + + +class Dojo: + """Gym-like environment for programmatic interaction with Lean through tactics or commands.""" + + entry: Union[Theorem, Tuple[LeanGitRepo, Path, int]] + hard_timeout: Optional[float] + additional_imports: List[str] + repo: LeanGitRepo + file_path: Path + is_successful: Optional[bool] = None + is_crashed: bool = False + has_timedout: bool = False + + def __init__( + self, + entry: Union[Theorem, Tuple[LeanGitRepo, Path, int]], + hard_timeout: Optional[float] = None, + additional_imports: List[str] = [], + ): + """Initialize Dojo. + + Args: + entry (Union[Theorem, Tuple[LeanGitRepo, Path, int]]): When a Theorem is given, + the :class:`Dojo` object enables interaction with the theorem through tactics. + When a tuple of (repo, file_path, line_nb) is given (only supported in Lean 4), + the :class:`Dojo` object enables interaction with Lean through commands (similar to a REPL). + hard_timeout (Optional[float], optional): Hard timeout in seconds. Defaults to None. 
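+            additional_imports (List[str], optional): Extra modules to import in the
+                modified Lean file before interaction starts. Defaults to [].
+
+        A minimal tactic-mode sketch (the repo URL, file path, and theorem name below are
+        placeholders)::
+
+            repo = LeanGitRepo("https://github.com/yangky11/lean4-example", "main")
+            theorem = Theorem(repo, Path("Lean4Example.lean"), "hello_world")
+            with Dojo(theorem, hard_timeout=600) as (dojo, state):
+                result = dojo.run_tac(state, "rw [add_assoc, add_comm b, ←add_assoc]")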
+ """ + self.entry = entry + self.hard_timeout = hard_timeout + self.additional_imports = additional_imports + + if self.uses_tactics: + assert isinstance(entry, Theorem) + self.repo, self.file_path = entry.repo, entry.file_path + self.is_successful = False + else: + assert self.uses_commands + assert isinstance(entry, tuple) + self.repo, self.file_path, _ = entry + self.file_path = Path(self.file_path) + + if self.hard_timeout is None: + logger.warning("Using Lean 4 without a hard timeout may hang indefinitely.") + + @property + def uses_tactics(self) -> bool: + return isinstance(self.entry, Theorem) + + @property + def uses_commands(self) -> bool: + return isinstance(self.entry, tuple) + + def __enter__(self) -> Tuple["Dojo", State]: + """Initialize Dojo.""" + logger.debug(f"Initializing Dojo for {self.entry}") + + # Work in a temporary directory. + self.origin_dir = Path.cwd() + self.tmp_dir = Path(mkdtemp(dir=TMP_DIR)) + + try: + self._install_handlers() + os.chdir(self.tmp_dir) + + # Copy and `cd` into the repo. + traced_repo_path = get_traced_repo_path(self.repo) + shutil.copytree( + traced_repo_path, + self.repo.name, + ignore=ignore_patterns("*.dep_paths", "*.ast.json", "*.trace.xml"), + ) + os.chdir(self.repo.name) + + # Replace the human-written proof with a `repl` tactic. + try: + traced_file = self._locate_traced_file(traced_repo_path) + except FileNotFoundError: + raise DojoInitError( + f"Cannot find the *.ast.json file for {self.entry} in {traced_repo_path}." + ) + + self._modify_file(traced_file) + + # Run the modified file in a container. + self.container = get_container() + logger.debug(f"Launching the proof using {type(self.container)}") + mts = [Mount(Path.cwd(), Path(f"/workspace/{self.repo.name}"))] + self.container.run( + "lake build Lean4Repl", + mts, + as_current_user=True, + capture_output=True, + work_dir=f"/workspace/{self.repo.name}", + cpu_limit=None, + memory_limit=None, + envs={}, + ) + assert re.fullmatch(r"\d+g", TACTIC_MEMORY_LIMIT) + memory_limit = 1024 * int(TACTIC_MEMORY_LIMIT[:-1]) + cmd = f"lake env lean --threads={TACTIC_CPU_LIMIT} --memory={memory_limit} {self.file_path}" + + self.proc = self.container.run_interactive( + cmd, + mts, + cpu_limit=None, + memory_limit=None, + work_dir=f"/workspace/{self.repo.name}", + as_current_user=True, + envs={}, + ) + + # Get the initial tactic state. + try: + res = json.loads(self._read_next_line()[0]) + except Exception as ex: + if traced_file.has_prelude: + raise DojoInitError( + "Currently LeanDojo does not support interacting with proofs in prelude files." 
+ ) + elif isinstance(ex, EOFError): + raise DojoInitError("EOF") + else: + raise ex + + assert res["error"] is None + + # logger.debug(f"Response: {res}") + if self.uses_tactics: + assert res["tacticState"] != "no goals" + init_state: State = TacticState( + self._post_process(res["tacticState"]), + res["sid"], + ) + else: + assert self.uses_commands + init_state = CommandState(int(res["sid"])) + + self.start_time = time.monotonic() + self._set_timer() + + return self, init_state + + except Exception as ex: + os.chdir(self.origin_dir) + shutil.rmtree(self.tmp_dir) + raise ex + + def _locate_traced_file(self, traced_repo_path: Path) -> TracedFile: + json_path = to_json_path(traced_repo_path, self.file_path, self.repo) + return TracedFile.from_traced_file(traced_repo_path, json_path, self.repo) + + def _set_timer(self) -> None: + if self.hard_timeout is not None: + signal.signal(signal.SIGALRM, self._handle_hard_timeout) + signal.alarm(int(self.hard_timeout)) + + def _cancel_timer(self) -> None: + if self.hard_timeout is not None: + signal.alarm(0) + signal.signal(signal.SIGALRM, signal.SIG_DFL) + + def _handle_hard_timeout(self, signum: Any, frame: Any) -> None: + logger.debug(f"Hard timeout in {self}") + self.has_timedout = True + raise DojoHardTimeoutError() + + def _install_handlers(self) -> None: + self.old_sigint = signal.signal(signal.SIGINT, self._exit_gracefully) + self.old_sigterm = signal.signal(signal.SIGTERM, self._exit_gracefully) + + def _uninstall_handlers(self) -> None: + signal.signal(signal.SIGINT, self.old_sigint) + signal.signal(signal.SIGTERM, self.old_sigterm) + + def _exit_gracefully(self, signum: Any, frame: Any) -> None: + logger.debug("Exiting gracefully.") + sys.exit(-1) + + def _cleanup(self) -> None: + logger.debug("Cleaning up.") + try: + self._cleanup_container() + self._cleanup_proc() + finally: + self._cleanup_tmp_dir() + self._uninstall_handlers() + + def _cleanup_container(self) -> None: + """Clean up the container.""" + logger.debug("Cleaning up the container.") + assert isinstance(self.container, DockerContainer) or isinstance( + self.container, NativeContainer + ) + self.container.cleanup() + + def _cleanup_proc(self) -> None: + """Clean up the subprocess.""" + logger.debug(f"Cleaning up the subprocess {self.proc.pid}.") + _kill_descendants(psutil.Process(self.proc.pid)) + """ + self.proc.terminate() + try: + self.proc.wait(timeout=0.5) + except TimeoutExpired: + self.proc.kill() + """ + + def _cleanup_tmp_dir(self) -> None: + """Clean up the temporary directory.""" + logger.debug("Cleaning up the temporary directory.") + os.chdir(self.origin_dir) + if self.tmp_dir is not None and os.path.exists(self.tmp_dir): + shutil.rmtree(self.tmp_dir) + + def __exit__(self, exc_type: None, exc_val: None, exc_tb: None) -> None: + """Exit Dojo. + + Args: + exc_type (None): _description_ + exc_val (None): _description_ + exc_tb (None): _description_ + """ + # Cancel the hard timeout. + self._cancel_timer() + self._cleanup() + + def _post_process(self, tactic_state: str) -> str: + """Post-process the pretty-printed tactic state. 
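+
+        For example (illustrative input), a state printed as ``"2 goals\ncase inl\n..."``
+        has the leading ``"2 goals\n"`` header removed, leaving ``"case inl\n..."``.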
+ + Args: + tactic_state (str): _description_ + + Returns: + str: _description_ + """ + m = re.match(r"\d+ goals\n", tactic_state) + if m is not None: + return tactic_state[m.end() :] + else: + return tactic_state + + def _get_imports(self) -> str: + imports = ["Lean4Repl"] + self.additional_imports + return "\n".join(f"import {_}" for _ in imports) + "\n\n" + + def _modify_file(self, traced_file: TracedFile) -> None: + logger.debug(f"Modifying {traced_file.lean_file.path}") + + if self.uses_tactics: + # Interaction through tactics. + modified_code = self._modify_proof(traced_file) + else: + # Interaction through commands (supported only in Lean 4 via CommandElabM). + lean_file = traced_file.lean_file + pos = Pos(line_nb=self.entry[2], column_nb=1) + code_before = get_code_without_comments( + lean_file, lean_file.start_pos, pos, traced_file.comments + ) + modified_code = ( + self._get_imports() + + code_before + + "set_option maxHeartbeats 0 in\n#lean_dojo_repl\n\n" + + lean_file[pos:] + ) + + repl_file = "Lean4Repl.lean" + repl_dst = Path(repl_file) + + if os.path.exists("lakefile.lean"): + with open("lakefile.lean", "a") as oup: + oup.write("\nlean_lib Lean4Repl {\n\n}\n") + else: + assert os.path.exists("lakefile.toml") + with open("lakefile.toml", "a") as oup: + oup.write('\n[[lean_lib]]\nname = "Lean4Repl"\n') + + if os.path.exists("lakefile.olean"): + os.remove("lakefile.olean") + if os.path.exists(".lake/lakefile.olean"): + os.remove(".lake/lakefile.olean") + + # Copy the REPL code to the right directory. + repl_src = Path(__file__).with_name(repl_file) + repl_code = repl_src.open().read() + if repl_dst.exists(): + raise DojoInitError(f"{repl_dst} exists") + with repl_dst.open("wt") as oup: + oup.write(repl_code) + + # Write the modified code to the file. + with self.file_path.open("wt") as oup: + oup.write(modified_code) + + def _modify_proof(self, traced_file: TracedFile) -> str: + # Modify the proof and set up the `repl` tactic. + assert isinstance(self.entry, Theorem) + traced_theorem = traced_file.get_traced_theorem(self.entry) + if traced_theorem is None: + raise DojoInitError( + f"Failed to locate the theorem with `{self.entry.full_name}` as its fully qualified name" + ) + proof_start, proof_end = traced_theorem.locate_proof() + lean_file = traced_file.lean_file + + code_import = self._get_imports() + code_proof = "\nby\n lean_dojo_repl\n sorry\n" + code_before_theorem = get_code_without_comments( + lean_file, lean_file.start_pos, traced_theorem.start, traced_file.comments + ) + code_thereom = lean_file[traced_theorem.start : proof_start] + modified_code = ( + code_import + + code_before_theorem + + "\nset_option maxHeartbeats 0 in\n" + + code_thereom + + code_proof + + lean_file[proof_end:] + ) + + return str(modified_code) + + def run_tac(self, state: TacticState, tactic: str) -> TacticResult: + if not isinstance(state, TacticState): + raise RuntimeError( + f"Attempting to run a tactic on an invalid state {state}." 
+ ) + assert isinstance(tactic, str), f"Invalid tactic {tactic}" + + tsid = state.id + req = json.dumps({"sid": tsid, "cmd": tactic}, ensure_ascii=False) + res = self._submit_request(req) + + if res["error"] is not None: + if "proof contains `sorry`" in res["error"]: + return ProofGivenUp() + elif "try_for_time tactic failed, timeout" in res["error"]: + return TimeoutError(res["error"].strip()) + else: + return LeanError(res["error"].strip()) + elif res["tacticState"] == "no goals": + self.is_successful = True + return ProofFinished(res["sid"], res["message"]) + else: + tactic_state = self._post_process(res["tacticState"]) + return TacticState( + tactic_state, + res["sid"], + res["message"], + ) + + def run_cmd(self, state: CommandState, command: str) -> CommandResult: + if not isinstance(state, CommandState): + raise RuntimeError( + f"Attempting to run a command on an invalid state {state}." + ) + assert isinstance(command, str), f"Invalid command {command}" + + csid = state.id + req = json.dumps({"sid": csid, "cmd": command}, ensure_ascii=False) + res = self._submit_request(req) + + if res["error"] is not None: + return LeanError(res["error"].strip()) + else: + return CommandState(res["sid"], res["message"]) + + def _submit_request(self, req: str) -> Dict[str, Any]: + """Submit a request to Lean and get the response. + + Args: + req (str): _description_ + + Raises: + DojoCrashError: _description_ + + Returns: + Dict[str, Any]: _description_ + """ + if self.proc.stdin is None: + raise RuntimeError("self.proc.stdin is not initialized") + self._check_alive() + logger.debug(req) + self.proc.stdin.write(req + "\n") + try: + res, msg = self._read_next_line() + except EOFError: + raise DojoCrashError("Unexpected EOF") + try: + result: Dict[str, Any] = json.loads(res) + except json.decoder.JSONDecodeError: + raise DojoCrashError(f"Invalid JSON: {res}") + + result["message"] = msg + return result + + def _check_alive(self) -> None: + exit_code = self.proc.poll() + if exit_code is None: + return + elif exit_code == 137: + raise DojoCrashError("OOM") + else: + raise DojoCrashError(f"Unknown exit code: {exit_code}") + + def _read_next_line(self) -> Tuple[str, str]: + """Read the next line from `self.proc`. + + Raises: + EOFError: _description_ + DojoCrashError: _description_ + DojoInitError: _description_ + + Returns: + str: _description_ + """ + if self.proc.stdout is None: + raise RuntimeError("self.proc.stout is not initialized") + msg: List[str] = [] + while True: + line = self.proc.stdout.readline().strip() + logger.debug(line) + if line == "": + raise EOFError + if line.startswith(_REPL_PROMPT): + self._check_alive() + return line[len(_REPL_PROMPT) :].strip(), "\n".join(msg) + elif "error: " in line: + if ( + "error: deep recursion was detected" in line + or "error: [fatal] not_a_theorem" in line + ): + self.is_crashed = True + raise DojoCrashError(line) + elif "error: unknown package" in line: + self.is_crashed = True + raise DojoInitError(line) + else: + pass + else: + msg.append(line) diff --git a/interaction/parse_goals.py b/interaction/parse_goals.py new file mode 100644 index 0000000..6472731 --- /dev/null +++ b/interaction/parse_goals.py @@ -0,0 +1,69 @@ +"""Utilities for parsing Lean's pretty-printed proof goals. +""" + +import re +from typing import List +from dataclasses import dataclass + + +_DECL_REGEX = re.compile( + r"(?<=\n)(?P.+?)\s+\:(?P.+?)\n(?=\S)", re.DOTALL +) +"""Regex for a line of declarations in the local context. 
+ +It can be a single declaration such as ``x : Nat`` or multiple declarations such as ``x y : Nat``. +""" + + +_CASE_REGEX = re.compile(r"case\s\S+\n") + + +_SPACE_REGEX = re.compile(r"\s+") + + +@dataclass(frozen=True) +class Declaration: + """A declaration in the local context.""" + + ident: str + lean_type: str + + def __post_init__(self) -> None: + assert _SPACE_REGEX.search(self.ident) is None + + +def _parse_local_context(ctx_pp: str) -> List[Declaration]: + """Parse the local context of a goal.""" + m = _CASE_REGEX.match(ctx_pp) + if m is not None: + ctx_pp = ctx_pp[m.end() :] + + decls = [] + for m in _DECL_REGEX.finditer("\n" + ctx_pp + "⊢"): + lean_type = m["lean_type"].strip() + if lean_type.endswith(","): + lean_type = lean_type[:-1].strip() + for ident in m["idents"].strip().split(): + decls.append(Declaration(ident.strip(), lean_type)) + return decls + + +@dataclass(frozen=True) +class Goal: + """A goal in Lean.""" + + assumptions: List[Declaration] + conclusion: str + + @classmethod + def from_pp(cls, pp: str) -> "Goal": + """Parse a pretty-printed goal.""" + assert pp.count("⊢") == 1 + ctx, concl = pp.split("⊢") + assumptions = _parse_local_context(ctx) + return cls(assumptions, concl.strip()) + + +def parse_goals(pp: str) -> List[Goal]: + """Parse a list of pretty-printed goals.""" + return [Goal.from_pp(g) for g in pp.split("\n\n") if "⊢" in g] diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..9891276 --- /dev/null +++ b/utils.py @@ -0,0 +1,314 @@ +"""Utility functions used internally by LeanDojo. +""" + +import re +import os +import ray +import time +import urllib +import typing +import hashlib +import tempfile +import subprocess +from pathlib import Path +from loguru import logger +from functools import cache +from contextlib import contextmanager +from ray.util.actor_pool import ActorPool +from typing import Tuple, Union, List, Generator, Optional + +from .constants import NUM_WORKERS, TMP_DIR, LEAN4_PACKAGES_DIR, LEAN4_BUILD_DIR + + +@contextmanager +def working_directory( + path: Optional[Union[str, Path]] = None +) -> Generator[Path, None, None]: + """Context manager setting the current working directory (CWD) to ``path`` (or a temporary directory if ``path`` is None). + + The original CWD is restored after the context manager exits. + + Args: + path (Optional[Union[str, Path]], optional): The desired CWD. Defaults to None. + + Yields: + Generator[Path, None, None]: A ``Path`` object representing the CWD. + """ + origin = Path.cwd() + if path is None: + tmp_dir = tempfile.TemporaryDirectory(dir=TMP_DIR) + path = tmp_dir.__enter__() + is_temporary = True + else: + is_temporary = False + + path = Path(path) + if not path.exists(): + path.mkdir(parents=True) + os.chdir(path) + + try: + yield path + finally: + os.chdir(origin) + if is_temporary: + tmp_dir.__exit__(None, None, None) + + +@contextmanager +def ray_actor_pool( + actor_cls: type, *args, **kwargs +) -> Generator[ActorPool, None, None]: + """Create a pool of Ray Actors of class ``actor_cls``. + + Args: + actor_cls (type): A Ray Actor class (annotated by ``@ray.remote``). + *args: Position arguments passed to ``actor_cls``. + **kwargs: Keyword arguments passed to ``actor_cls``. + + Yields: + Generator[ActorPool, None, None]: A :class:`ray.util.actor_pool.ActorPool` object. 
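+
+    A minimal usage sketch (``MyActor`` is a hypothetical ``@ray.remote`` actor class)::
+
+        with ray_actor_pool(MyActor, some_arg) as pool:
+            results = list(pool.map_unordered(lambda a, x: a.process.remote(x), items))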
+ """ + assert not ray.is_initialized() + ray.init() + pool = ActorPool([actor_cls.remote(*args, **kwargs) for _ in range(NUM_WORKERS)]) + try: + yield pool + finally: + ray.shutdown() + + +@contextmanager +def report_critical_failure(msg: str) -> Generator[None, None, None]: + """Context manager logging ``msg`` in case of any exception. + + Args: + msg (str): The message to log in case of exceptions. + + Raises: + ex: Any exception that may be raised within the context manager. + """ + try: + yield + except Exception as ex: + logger.error(msg) + raise ex + + +def execute( + cmd: Union[str, List[str]], capture_output: bool = False +) -> Optional[Tuple[str, str]]: + """Execute the shell command ``cmd`` and optionally return its output. + + Args: + cmd (Union[str, List[str]]): The shell command to execute. + capture_output (bool, optional): Whether to capture and return the output. Defaults to False. + + Returns: + Optional[Tuple[str, str]]: The command's output, including stdout and stderr (None if ``capture_output == False``). + """ + try: + res = subprocess.run(cmd, shell=True, capture_output=capture_output, check=True) + except subprocess.CalledProcessError as ex: + if capture_output: + logger.info(ex.stdout.decode()) + logger.error(ex.stderr.decode()) + raise ex + if not capture_output: + return None + output = res.stdout.decode() + error = res.stderr.decode() + return output, error + + +def compute_md5(path: Path) -> str: + """Return the MD5 hash of the file ``path``.""" + # The file could be large + # See: https://stackoverflow.com/questions/48122798/oserror-errno-22-invalid-argument-when-reading-a-huge-file + hasher = hashlib.md5() + with path.open("rb") as inp: + while True: + block = inp.read(64 * (1 << 20)) + if not block: + break + hasher.update(block) + return hasher.hexdigest() + + +_CAMEL_CASE_REGEX = re.compile(r"(_|-)+") + + +def camel_case(s: str) -> str: + """Convert the string ``s`` to camel case.""" + return _CAMEL_CASE_REGEX.sub(" ", s).title().replace(" ", "") + + +@cache +def get_repo_info(path: Path) -> Tuple[str, str]: + """Get the URL and commit hash of the Git repo at ``path``. + + Args: + path (Path): Path to the Git repo. + + Returns: + Tuple[str, str]: URL and (most recent) hash commit + """ + with working_directory(path): + # Get the URL. + url_msg, _ = execute(f"git remote get-url origin", capture_output=True) + url = url_msg.strip() + # Get the commit. + commit_msg, _ = execute(f"git log -n 1", capture_output=True) + m = re.search(r"(?<=^commit )[a-z0-9]+", commit_msg) + assert m is not None + commit = m.group() + + if url.startswith("git@"): + assert url.endswith(".git") + url = url[: -len(".git")].replace(":", "/").replace("git@", "https://") + + return url, commit + + +def is_optional_type(tp: type) -> bool: + """Test if ``tp`` is Optional[X].""" + if typing.get_origin(tp) != Union: + return False + args = typing.get_args(tp) + return len(args) == 2 and args[1] == type(None) + + +def remove_optional_type(tp: type) -> type: + """Given Optional[X], return X.""" + if typing.get_origin(tp) != Union: + return False + args = typing.get_args(tp) + if len(args) == 2 and args[1] == type(None): + return args[0] + else: + raise ValueError(f"{tp} is not Optional") + + +@cache +def read_url(url: str, num_retries: int = 2) -> str: + """Read the contents of the URL ``url``. 
Retry if failed""" + backoff = 1 + while True: + try: + with urllib.request.urlopen(url) as f: + return f.read().decode() + except Exception as ex: + if num_retries <= 0: + raise ex + num_retries -= 1 + logger.debug(f"Request to {url} failed. Retrying...") + time.sleep(backoff) + backoff *= 2 + + +@cache +def url_exists(url: str) -> bool: + """Return True if the URL ``url`` exists.""" + try: + with urllib.request.urlopen(url) as _: + return True + except urllib.error.HTTPError: + return False + + +def parse_int_list(s: str) -> List[int]: + assert s.startswith("[") and s.endswith("]") + return [int(_) for _ in s[1:-1].split(",") if _ != ""] + + +def parse_str_list(s: str) -> List[str]: + assert s.startswith("[") and s.endswith("]") + return [_.strip()[1:-1] for _ in s[1:-1].split(",") if _ != ""] + + +@cache +def is_git_repo(path: Path) -> bool: + """Check if ``path`` is a Git repo.""" + with working_directory(path): + return ( + os.system("git rev-parse --is-inside-work-tree 1>/dev/null 2>/dev/null") + == 0 + ) + + +def _from_lean_path(root_dir: Path, path: Path, repo, ext: str) -> Path: + assert path.suffix == ".lean" + if path.is_absolute(): + path = path.relative_to(root_dir) + + assert root_dir.name != "lean4" + if path.is_relative_to(LEAN4_PACKAGES_DIR / "lean4/src/lean/lake"): + # E.g., "lake-packages/lean4/src/lean/lake/Lake/CLI/Error.lean" + p = path.relative_to(LEAN4_PACKAGES_DIR / "lean4/src/lean/lake") + return LEAN4_PACKAGES_DIR / "lean4/lib/lean" / p.with_suffix(ext) + elif path.is_relative_to(LEAN4_PACKAGES_DIR / "lean4/src"): + # E.g., "lake-packages/lean4/src/lean/Init.lean" + p = path.relative_to(LEAN4_PACKAGES_DIR / "lean4/src").with_suffix(ext) + return LEAN4_PACKAGES_DIR / "lean4/lib" / p + elif path.is_relative_to(LEAN4_PACKAGES_DIR): + # E.g., "lake-packages/std/Std.lean" + p = path.relative_to(LEAN4_PACKAGES_DIR).with_suffix(ext) + repo_name = p.parts[0] + return ( + LEAN4_PACKAGES_DIR + / repo_name + / LEAN4_BUILD_DIR + / "ir" + / p.relative_to(repo_name) + ) + else: + # E.g., "Mathlib/LinearAlgebra/Basics.lean" + return LEAN4_BUILD_DIR / "ir" / path.with_suffix(ext) + + +def to_xml_path(root_dir: Path, path: Path, repo) -> Path: + return _from_lean_path(root_dir, path, repo, ext=".trace.xml") + + +def to_dep_path(root_dir: Path, path: Path, repo) -> Path: + return _from_lean_path(root_dir, path, repo, ext=".dep_paths") + + +def to_json_path(root_dir: Path, path: Path, repo) -> Path: + return _from_lean_path(root_dir, path, repo, ext=".ast.json") + + +def to_lean_path(root_dir: Path, path: Path, repo) -> bool: + if path.is_absolute(): + path = path.relative_to(root_dir) + + if path.suffix in (".xml", ".json"): + path = path.with_suffix("").with_suffix(".lean") + else: + assert path.suffix == ".dep_paths" + path = path.with_suffix(".lean") + + assert root_dir.name != "lean4" + if path == LEAN4_PACKAGES_DIR / "lean4/lib/lean/Lake.lean": + return LEAN4_PACKAGES_DIR / "lean4/src/lean/lake/Lake.lean" + elif path.is_relative_to(LEAN4_PACKAGES_DIR / "lean4/lib/lean/Lake"): + # E.g., "lake-packages/lean4/lib/lean/Lake/Util/List.lean" + p = path.relative_to(LEAN4_PACKAGES_DIR / "lean4/lib/lean/Lake") + return LEAN4_PACKAGES_DIR / "lean4/src/lean/lake/Lake" / p + elif path.is_relative_to(LEAN4_PACKAGES_DIR / "lean4/lib"): + # E.g., "lake-packages/lean4/lib/lean/Init.lean" + p = path.relative_to(LEAN4_PACKAGES_DIR / "lean4/lib") + return LEAN4_PACKAGES_DIR / "lean4/src" / p + elif path.is_relative_to(LEAN4_PACKAGES_DIR): + # E.g., "lake-packages/std/build/ir/Std.lean" + p = 
path.relative_to(LEAN4_PACKAGES_DIR) + repo_name = p.parts[0] + return ( + LEAN4_PACKAGES_DIR + / repo_name + / p.relative_to(Path(repo_name) / LEAN4_BUILD_DIR / "ir") + ) + else: + # E.g., ".lake/build/ir/Mathlib/LinearAlgebra/Basics.lean" or "build/ir/Mathlib/LinearAlgebra/Basics.lean" + assert path.is_relative_to(LEAN4_BUILD_DIR / "ir"), path + return path.relative_to(LEAN4_BUILD_DIR / "ir") From c20d300c43fc765073e43e53de7877a085f7b79f Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Thu, 23 Oct 2025 23:21:05 -0400 Subject: [PATCH 21/29] Fix _to_commit_hash compatibility fallback --- data_extraction/lean.py | 106 +++++++++++++++++++++++++++------------- 1 file changed, 72 insertions(+), 34 deletions(-) diff --git a/data_extraction/lean.py b/data_extraction/lean.py index 187288c..6238f93 100644 --- a/data_extraction/lean.py +++ b/data_extraction/lean.py @@ -88,35 +88,69 @@ def cleanse_string(s: Union[str, Path]) -> str: @cache def _to_commit_hash(repo: Repository, label: str) -> str: - """Convert a tag or branch to a commit hash.""" + """Convert a tag or branch to a commit hash with fallback strategies.""" logger.debug(f"Querying the commit hash for {repo.name} {label}") - - # Poor man's cache - if repo.name == "lean4": - if label == "v4.23.0-rc2": - return "ad1a017949674a947f0d6794cbf7130d642c6530" - elif label == "v4.17.0": - return "306f36116535cd226329f562b4675b8b6dbf948c" - elif label == "v4.8.0-rc2": - return "873ef2d894af80d8fc672e35f7e28bae314a1f6f" - - # if the label is a commit hash, return it directly - if len(label) == 40 and _COMMIT_REGEX.fullmatch(label.strip()): - return label - for branch in repo.get_branches(): - if branch.name == label: - print(f"Found branch {branch.name} with commit {branch.commit.sha}") - return branch.commit.sha + label_stripped = (label or "").strip() + if len(label_stripped) == 40 and _COMMIT_REGEX.fullmatch(label_stripped): + return label_stripped + + candidates = [] + if label_stripped.startswith("v"): + candidates += [label_stripped, label_stripped.lstrip("v")] + else: + candidates += [label_stripped, f"v{label_stripped}"] + + base = label_stripped[1:] if label_stripped.startswith("v") else label_stripped + if "-rc" in base: + base_no_rc = base.split("-rc", 1)[0] + candidates += [base_no_rc, f"v{base_no_rc}"] + + seen = set() + uniq = [] + for cand in candidates: + if cand and cand not in seen: + uniq.append(cand) + seen.add(cand) + candidates = uniq + + for cand in candidates: + try: + ref = repo.get_git_ref(f"tags/{cand}") + obj = ref.object + if obj.type == "tag": + tag_obj = repo.get_git_tag(obj.sha) + if tag_obj.object.type == "commit": + return tag_obj.object.sha + elif obj.type == "commit": + return obj.sha + except Exception: + pass + + try: + return repo.get_commit(cand).sha + except Exception: + pass - for tag in repo.get_tags(): - if tag.name == label: - print(f"Found tag {tag.name} with commit {tag.commit.sha}") - return tag.commit.sha + try: + data = read_url(f"https://api.github.com/repos/{repo.full_name}/commits/{cand}") + sha = json.loads(data).get("sha") + if sha: + return sha + except Exception: + pass raise ValueError(f"Invalid tag or branch: `{label}` for {repo}") +def _to_commit_hash_compat(repo: Repository, label: str) -> str: + """Compatibility wrapper: supports both (repo, label) and (label) call signatures.""" + try: + return _to_commit_hash(repo, label) + except TypeError: + return _to_commit_hash(label) + + @dataclass(eq=True, unsafe_hash=True) class Pos: """Position in source files. 
@@ -357,9 +391,9 @@ def get_lean4_commit_from_config(config_dict: Dict[str, Any]) -> str: version = config[len(prefix) :] if version.startswith("nightly"): - return _to_commit_hash(LEAN4_NIGHTLY_REPO, version) + return _to_commit_hash_compat(LEAN4_NIGHTLY_REPO, version) else: - return _to_commit_hash(LEAN4_REPO, version) + return _to_commit_hash_compat(LEAN4_REPO, version) URL = TAG = COMMIT = str @@ -445,7 +479,7 @@ def __post_init__(self) -> None: if (self.url, self.commit) in info_cache.tag2commit: commit = info_cache.tag2commit[(self.url, self.commit)] else: - commit = _to_commit_hash(self.repo, self.commit) + commit = _to_commit_hash_compat(self.repo, self.commit) assert _COMMIT_REGEX.fullmatch(commit), f"Invalid commit hash: {commit}" info_cache.tag2commit[(self.url, self.commit)] = commit object.__setattr__(self, "commit", commit) @@ -498,18 +532,22 @@ def clone_and_checkout(self) -> None: user_name, repo_name = _split_git_url(self.url) local_repo_path = Path(os.environ["REPO_DIR"]) / user_name / repo_name + local_repo_path.parent.mkdir(parents=True, exist_ok=True) + if os.path.exists(local_repo_path): logger.info(f"{self} already exists locally.") else: logger.debug(f"Cloning {self}") - execute(f"git clone -n --recursive {self.url}", capture_output=True) - - - with working_directory(local_repo_path): - execute( - f"git checkout {self.commit} && git submodule update --recursive", - capture_output=True, - ) + execute( + f"git clone -n --recursive {self.url} {local_repo_path}", + capture_output=True, + ) + + with working_directory(local_repo_path): + execute( + f"git checkout {self.commit} && git submodule update --recursive", + capture_output=True, + ) def get_dependencies( self, path: Union[str, Path, None] = None @@ -591,7 +629,7 @@ def _parse_deps( commit = rev else: try: - commit = _to_commit_hash(url_to_repo(url), rev) + commit = _to_commit_hash_compat(url_to_repo(url), rev) except ValueError: commit = get_latest_commit(url) assert _COMMIT_REGEX.fullmatch(commit) From eb8eac6ca1f8039a462b47cbfcef57dcf2a18c0c Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 28 Oct 2025 19:14:51 -0400 Subject: [PATCH 22/29] Patch LeanDojo tracing and repo handling for LeanAgent pipeline --- container.py | 3 -- data_extraction/build_lean4_repo.py | 1 - data_extraction/lean.py | 69 +++++++++++++++++++---------- data_extraction/trace.py | 6 ++- 4 files changed, 51 insertions(+), 28 deletions(-) diff --git a/container.py b/container.py index af9f25a..2022eb2 100644 --- a/container.py +++ b/container.py @@ -118,7 +118,6 @@ class NativeContainer(Container): def _mount_files(self, mounts: List[Mount]) -> None: cwd = Path.cwd() - import ipdb; ipdb.set_trace() for src, dst in mounts: if dst.is_absolute(): dst = cwd / dst.relative_to(dst.root) @@ -134,7 +133,6 @@ def _mount_files(self, mounts: List[Mount]) -> None: def _unmount_files(self, mounts: List[Mount]) -> None: cwd = Path.cwd() - for src, dst in mounts: if dst.is_absolute(): dst = cwd / dst.relative_to(dst.root) @@ -184,7 +182,6 @@ def run( assert memory_limit is None, "NativeContainer does not support memory limit." assert cpu_limit is None, "NativeContainer does not support CPU limit." 
- import ipdb; ipdb.set_trace() self._mount_files(mounts) cmd = self._build_native_command(command, envs) diff --git a/data_extraction/build_lean4_repo.py b/data_extraction/build_lean4_repo.py index a15dd61..dc82257 100644 --- a/data_extraction/build_lean4_repo.py +++ b/data_extraction/build_lean4_repo.py @@ -154,7 +154,6 @@ def is_new_version(v: str) -> bool: def main() -> None: - import ipdb; ipdb.set_trace() parser = argparse.ArgumentParser() parser.add_argument("repo_name") parser.add_argument("--no-deps", action="store_true") diff --git a/data_extraction/lean.py b/data_extraction/lean.py index 6238f93..9a63618 100644 --- a/data_extraction/lean.py +++ b/data_extraction/lean.py @@ -50,11 +50,18 @@ LEAN4_NIGHTLY_REPO = GITHUB.get_repo("leanprover/lean4-nightly") """The GitHub Repo for Lean 4 nightly releases.""" +TOOLCHAIN_OVERRIDES = { + "ImperialCollegeLondon/FLT": "leanprover/lean4:v4.25.0-rc1", +} + _URL_REGEX = re.compile(r"(?P.*?)/*") def normalize_url(url: str) -> str: - return _URL_REGEX.fullmatch(url)["url"] # Remove trailing `/`. + cleaned = _URL_REGEX.fullmatch(url)["url"] # Remove trailing `/`. + if cleaned.endswith(".git"): + cleaned = cleaned[:-4] + return cleaned @cache @@ -143,12 +150,17 @@ def _to_commit_hash(repo: Repository, label: str) -> str: raise ValueError(f"Invalid tag or branch: `{label}` for {repo}") -def _to_commit_hash_compat(repo: Repository, label: str) -> str: - """Compatibility wrapper: supports both (repo, label) and (label) call signatures.""" +def _to_commit_hash_compat(*args, **kwargs): + """ + Compatibility wrapper for LeanDojo versions that define _to_commit_hash with either: + (repo: Repository, label: str) OR (label: str) + """ try: - return _to_commit_hash(repo, label) + return _to_commit_hash(*args, **kwargs) except TypeError: - return _to_commit_hash(label) + if len(args) == 2 and not kwargs: + return _to_commit_hash(args[1]) + raise @dataclass(eq=True, unsafe_hash=True) @@ -525,29 +537,40 @@ def show(self) -> None: def exists(self) -> bool: return url_exists(self.commit_url) + def toolchain_spec(self) -> Optional[str]: + owner_repo = "/".join(self.url.split("/")[-2:]) + if owner_repo in TOOLCHAIN_OVERRIDES: + return TOOLCHAIN_OVERRIDES[owner_repo] + try: + config = self.get_config("lean-toolchain") + except Exception: + return None + content = (config.get("content") or "").strip() + return content or None + def clone_and_checkout(self) -> None: - """Clone the repo to the current working directory and checkout a specific commit.""" - # Check if the repo already exists. - # If it exists, we assume it has been checked out to the correct commit. - - user_name, repo_name = _split_git_url(self.url) - local_repo_path = Path(os.environ["REPO_DIR"]) / user_name / repo_name + """ + Clone the repo into $REPO_DIR// (creating parents), then checkout the pinned commit + and update submodules. If it already exists, assume it's correct and skip. 
+ """ + owner, name = _split_git_url(self.url) + base = Path(os.environ.get("REPO_DIR", ".")) + local_repo_path = (base / owner / name).resolve() local_repo_path.parent.mkdir(parents=True, exist_ok=True) - if os.path.exists(local_repo_path): - logger.info(f"{self} already exists locally.") - else: - logger.debug(f"Cloning {self}") - execute( - f"git clone -n --recursive {self.url} {local_repo_path}", - capture_output=True, - ) + if local_repo_path.exists(): + logger.info(f"{self} already exists locally at {local_repo_path}.") + return + + logger.debug(f"Cloning {self} into {local_repo_path}") + execute( + f'git clone -n --recursive "{self.url}" "{local_repo_path}"', + capture_output=True, + ) with working_directory(local_repo_path): - execute( - f"git checkout {self.commit} && git submodule update --recursive", - capture_output=True, - ) + execute(f'git checkout "{self.commit}"', capture_output=True) + execute("git submodule update --init --recursive", capture_output=True) def get_dependencies( self, path: Union[str, Path, None] = None diff --git a/data_extraction/trace.py b/data_extraction/trace.py index cd1ddc2..560cd31 100644 --- a/data_extraction/trace.py +++ b/data_extraction/trace.py @@ -35,6 +35,11 @@ def _trace(repo: LeanGitRepo, build_deps: bool) -> None: repo.clone_and_checkout() logger.debug(f"Tracing {repo}") + toolchain = repo.toolchain_spec() + if toolchain: + logger.info(f"{repo} declares toolchain {toolchain}") + else: + logger.warning(f"No lean-toolchain found for {repo}; proceeding without explicit toolchain.") container = get_container() mts = { Path(os.environ.get("RAID_DIR")) / "repos" / user_name / repo_name: f"/workspace/{user_name}/{repo_name}", @@ -48,7 +53,6 @@ def _trace(repo: LeanGitRepo, build_deps: bool) -> None: cmd += " --no-deps" try: - import ipdb; ipdb.set_trace() container.run( cmd, create_mounts(mts), From a3347e95bfd4de47486e66bf4b70b5ca6451cfe7 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 28 Oct 2025 22:21:38 -0400 Subject: [PATCH 23/29] Skip tracing when artifacts already present --- data_extraction/build_lean4_repo.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/data_extraction/build_lean4_repo.py b/data_extraction/build_lean4_repo.py index dc82257..9947cfc 100644 --- a/data_extraction/build_lean4_repo.py +++ b/data_extraction/build_lean4_repo.py @@ -162,20 +162,22 @@ def main() -> None: num_procs = int(os.environ["NUM_PROCS"]) repo_name = args.repo_name os.chdir(repo_name) - - if is_new_version(get_lean_version()): - packages_path = ".lake/packages" - build_path = ".lake/build" + + lean_version = get_lean_version() + use_new_layout = is_new_version(lean_version) + if use_new_layout: + packages_path = ".lake/packages" + build_path = ".lake/build" else: packages_path = "lake-packages" build_path = "build" - - # if check_files(packages_path, args.no_deps): - # logger.info(f"The repo {repo_name} has already been traced.") - # return + + if check_files(packages_path, args.no_deps): + logger.info(f"The repo {repo_name} has already been traced.") + return # If the lean4 package exists, we assume the build has completed and we just need to trace - if (Path(".lake/packages/lean4") if is_new_version(get_lean_version()) else Path("lake-packages/lean4")).exists(): + if (Path(".lake/packages/lean4") if use_new_layout else Path("lake-packages/lean4")).exists(): logger.info(f"The repo {repo_name} has already been built, but has not been traced.") else: # Build the repo using lake. 
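The early return above depends on is_new_version to choose between the old "lake-packages"/"build" layout and the newer ".lake/packages"/".lake/build" layout, which hinges on whether the toolchain is at least 4.3.0-rc2. An equivalent check can be written as a single tuple comparison; this is an illustrative sketch under that assumption, not the implementation the patch keeps:

import re

def at_least_4_3_0_rc2(version: str) -> bool:
    """True iff `version` (e.g. '4.8.0' or '4.3.0-rc1') is at least 4.3.0-rc2."""
    m = re.fullmatch(r"(\d+)\.(\d+)\.(\d+)(?:-rc(\d+))?", version)
    if m is None:
        raise ValueError(f"Unrecognized Lean version: {version}")
    major, minor, patch, rc = m.groups()
    # A missing rc suffix means a final release, which sorts after every release candidate.
    key = (int(major), int(minor), int(patch), int(rc) if rc else float("inf"))
    return key >= (4, 3, 0, 2)

assert at_least_4_3_0_rc2("4.3.0-rc2") and at_least_4_3_0_rc2("4.8.0")
assert not at_least_4_3_0_rc2("4.3.0-rc1") and not at_least_4_3_0_rc2("4.2.0")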
From 9277838324064bac52f748a3c63d622628709049 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 4 Nov 2025 08:24:52 -0500 Subject: [PATCH 24/29] Ensure ExtractData available per trace run --- data_extraction/build_lean4_repo.py | 32 ++++++++++++++++++----------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/data_extraction/build_lean4_repo.py b/data_extraction/build_lean4_repo.py index 9947cfc..b698752 100644 --- a/data_extraction/build_lean4_repo.py +++ b/data_extraction/build_lean4_repo.py @@ -94,8 +94,12 @@ def launch_progressbar(paths: List[Union[str, Path]]) -> Generator[None, None, N num_total = len(olean_files) p = Process(target=_monitor, args=(paths, num_total), daemon=True) p.start() - yield - p.kill() + try: + yield + finally: + p.join(timeout=1) + if p.is_alive(): + p.terminate() def get_lean_version() -> str: @@ -163,6 +167,10 @@ def main() -> None: repo_name = args.repo_name os.chdir(repo_name) + extractor_src = Path(__file__).with_name("ExtractData.lean").resolve() + extractor_dst = Path("ExtractData.lean") + shutil.copy2(extractor_src, extractor_dst) + lean_version = get_lean_version() use_new_layout = is_new_version(lean_version) if use_new_layout: @@ -172,10 +180,6 @@ def main() -> None: packages_path = "lake-packages" build_path = "build" - if check_files(packages_path, args.no_deps): - logger.info(f"The repo {repo_name} has already been traced.") - return - # If the lean4 package exists, we assume the build has completed and we just need to trace if (Path(".lake/packages/lean4") if use_new_layout else Path("lake-packages/lean4")).exists(): logger.info(f"The repo {repo_name} has already been built, but has not been traced.") @@ -201,12 +205,16 @@ def main() -> None: dirs_to_monitor.append(packages_path) logger.info(f"Tracing {repo_name}") - with launch_progressbar(dirs_to_monitor): - cmd = f"lake env lean --threads {num_procs} --run ExtractData.lean" - if args.no_deps: - cmd += " noDeps" - logger.debug(cmd) - run_cmd(cmd, capture_output=True) + try: + with launch_progressbar(dirs_to_monitor): + cmd = f"lake env lean --threads {num_procs} --run ExtractData.lean" + if args.no_deps: + cmd += " noDeps" + logger.debug(cmd) + run_cmd(cmd, capture_output=True) + finally: + if extractor_dst.exists(): + extractor_dst.unlink() assert check_files(packages_path, args.no_deps), "Some files failed to be processed." 
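Patch 24 stages ExtractData.lean next to the repo before tracing and unlinks it in a finally block so that repeated runs start from a clean checkout. The same stage-run-cleanup pattern can also be expressed as a context manager; a minimal sketch assuming only shutil and pathlib, with a hypothetical helper name and illustrative paths:

import shutil
from contextlib import contextmanager
from pathlib import Path
from typing import Iterator

@contextmanager
def staged_file(src: Path, dst: Path) -> Iterator[Path]:
    """Copy `src` to `dst` for the duration of the block, then remove it."""
    shutil.copy2(src, dst)
    try:
        yield dst
    finally:
        if dst.exists():
            dst.unlink()

# Usage (paths are illustrative):
# with staged_file(Path("ExtractData.lean"), Path("repo/ExtractData.lean")):
#     run_cmd("lake env lean --run ExtractData.lean")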
From db18ef8213d096955c01acc03fee125190479847 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Thu, 6 Nov 2025 10:20:59 -0500 Subject: [PATCH 25/29] Patch macOS dylib handling and lean-toolchain parser --- data_extraction/build_lean4_repo.py | 72 ++++++++++++++++++++++++++++- data_extraction/lean.py | 52 +++++++++++++++++++-- 2 files changed, 118 insertions(+), 6 deletions(-) diff --git a/data_extraction/build_lean4_repo.py b/data_extraction/build_lean4_repo.py index b698752..5655a2d 100644 --- a/data_extraction/build_lean4_repo.py +++ b/data_extraction/build_lean4_repo.py @@ -4,6 +4,7 @@ """ import os +import sys import re import shutil import argparse @@ -33,6 +34,60 @@ def run_cmd(cmd: Union[str, List[str]], capture_output: bool = False) -> Optiona return None +def is_macos() -> bool: + return sys.platform == "darwin" + + +def _patch_dylib(path: Path) -> None: + """Adjust __DATA_CONST flags so macOS 15 accepts the library.""" + try: + subprocess.run( + [ + "xcrun", + "vtool", + "-set", + "segprot", + "__DATA_CONST", + "r--", + "rw-", + str(path), + ], + check=True, + capture_output=True, + ) + subprocess.run( + [ + "xcrun", + "vtool", + "-set", + "segflags", + "__DATA_CONST", + "0x4", + str(path), + ], + check=True, + capture_output=True, + ) + subprocess.run( + ["codesign", "--force", "--sign", "-", str(path)], + check=True, + capture_output=True, + ) + except (subprocess.CalledProcessError, FileNotFoundError) as ex: + logger.warning(f"Failed to patch {path}: {ex}") + + +def patch_dylibs(root: Path) -> None: + if not is_macos(): + return + dylibs = list(root.rglob("*.dylib")) + if not dylibs: + return + logger.debug(f"Patching {len(dylibs)} dylibs under {root}") + for dylib in dylibs: + _patch_dylib(dylib) + + def record_paths(dir: Path, root: Path, lean_bin: Path) -> None: """Run ``lean --deps`` for all Lean files in ``dir`` to record its dependencies. @@ -192,7 +247,22 @@ def main() -> None: run_cmd("lake exe cache get") except subprocess.CalledProcessError: pass - run_cmd("lake build") + + # Try building; on macOS, if the build fails due to SG_READ_ONLY, patch dylibs and retry once. + try: + run_cmd("lake build") + except subprocess.CalledProcessError as e: + if is_macos(): + logger.warning("lake build failed; patching dylibs for macOS and retrying once") + patch_dylibs(Path(packages_path)) + patch_dylibs(Path(build_path)) + run_cmd("lake build") + else: + raise + + # Ensure final artifacts are patched as well. + patch_dylibs(Path(packages_path)) + patch_dylibs(Path(build_path)) # Copy the Lean 4 stdlib into the path of packages. 
lean_prefix = run_cmd(f"lean --print-prefix", capture_output=True).strip() diff --git a/data_extraction/lean.py b/data_extraction/lean.py index 9a63618..d1705f7 100644 --- a/data_extraction/lean.py +++ b/data_extraction/lean.py @@ -382,6 +382,18 @@ def __getitem__(self, key) -> str: _LEAN4_VERSION_REGEX = re.compile(r"leanprover/lean4:(?P.+?)") +def _read_toolchain_content(config_dict: Dict[str, Any]) -> str: + """Extract the textual content of a lean-toolchain definition.""" + content = config_dict.get("content") + if content is None: + download_url = config_dict.get("download_url") + if download_url: + content = read_url(download_url) + else: + raise KeyError("config_dict must have a 'content' field or a 'download_url'") + return content.strip() + + def get_lean4_version_from_config(toolchain: str) -> str: """Return the required Lean version given a ``lean-toolchain`` config.""" m = _LEAN4_VERSION_REGEX.fullmatch(toolchain.strip()) @@ -391,8 +403,7 @@ def get_lean4_version_from_config(toolchain: str) -> str: def get_lean4_commit_from_config(config_dict: Dict[str, Any]) -> str: """Return the required Lean commit given a ``lean-toolchain`` config.""" - assert "content" in config_dict, "config_dict must have a 'content' field" - config = config_dict["content"].strip() + config = _read_toolchain_content(config_dict) prefix = "leanprover/lean4:" if config == f"{prefix}nightly": @@ -404,8 +415,39 @@ def get_lean4_commit_from_config(config_dict: Dict[str, Any]) -> str: if version.startswith("nightly"): return _to_commit_hash_compat(LEAN4_NIGHTLY_REPO, version) - else: - return _to_commit_hash_compat(LEAN4_REPO, version) + + def _try_labels(labels: List[str]): + for label in labels: + try: + return _to_commit_hash_compat(LEAN4_REPO, label) + except ValueError: + continue + raise ValueError + + labels_to_try = [version] + if not version.startswith("v"): + labels_to_try.append(f"v{version}") + + try: + return _try_labels(labels_to_try) + except ValueError: + tags = LEAN4_REPO.get_tags() + for tag in tags: + if tag.name == version or tag.name == f"v{version}": + logger.warning( + f"Falling back to Lean tag {tag.name} for toolchain version {version}." + ) + return tag.commit.sha + if version in tag.name: + logger.warning( + f"Approximating Lean toolchain {version} with tag {tag.name}." + ) + return tag.commit.sha + + logger.warning( + f"Unable to resolve Lean toolchain {version}; falling back to latest commit of leanprover/lean4." 
+ ) + return LEAN4_REPO.get_commits()[0].sha URL = TAG = COMMIT = str @@ -504,7 +546,7 @@ def __post_init__(self) -> None: else: config = self.get_config("lean-toolchain") lean_version = get_lean4_commit_from_config(config) - v = get_lean4_version_from_config(config["content"]) + v = get_lean4_version_from_config(_read_toolchain_content(config)) if not is_supported_version(v): logger.warning( f"{self} relies on an unsupported Lean version: {lean_version}" From 2cf489ab4a9a27b4d08f4f24afbf1206e8060e63 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Tue, 18 Nov 2025 09:24:20 -0500 Subject: [PATCH 26/29] Update tracing workflow (Docker + build_deps) --- .dockerignore | 8 + Dockerfile.arm | 16 ++ leanagent.py | 38 +++-- scripts/trace_paper_repos.py | 157 ++++++++---------- workspace/build_lean4_repo.py | 293 ++++++++++++++++++++++++++++++++++ 5 files changed, 404 insertions(+), 108 deletions(-) create mode 100644 .dockerignore create mode 100644 Dockerfile.arm create mode 100644 workspace/build_lean4_repo.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..8ccc9f2 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +RAID +workspace +**/.lake +**/__pycache__ +**/*.olean +**/*.log +.git +.venv diff --git a/Dockerfile.arm b/Dockerfile.arm new file mode 100644 index 0000000..f9178ed --- /dev/null +++ b/Dockerfile.arm @@ -0,0 +1,16 @@ +FROM ubuntu:22.04 + +RUN apt-get update && apt-get install -y \ + curl git python3 python3-pip python3-venv build-essential \ + && rm -rf /var/lib/apt/lists/* + +RUN curl https://raw.githubusercontent.com/elan/elan/master/elan-init.sh -sSf \ + | sh -s -- -y --default-toolchain leanprover/lean4:v4.9.0 +ENV PATH="/root/.elan/bin:${PATH}" + +WORKDIR /workspace/LeanAgent +COPY . . + +RUN python3 -m venv .venv && \ + .venv/bin/pip install --upgrade pip && \ + .venv/bin/pip install -r requirements.txt diff --git a/leanagent.py b/leanagent.py index 8fb50a0..5b2afb8 100644 --- a/leanagent.py +++ b/leanagent.py @@ -112,6 +112,10 @@ def save_database_locked(db: DynamicDatabase, path: str) -> None: write_json_locked(path, db.to_dict(), ensure_ascii=False) +def append_text_locked(path: str, *chunks: str) -> None: + """Atomically append textual data while coordinating concurrent writers.""" + with _locked(path, "a") as handle: + handle.writelines(chunks) def _eval(data, preds_map) -> Tuple[float, float, float]: """Evaluates the retrieval model.""" R1 = [] @@ -892,11 +896,11 @@ def main(): os.path.join(dataset_path, d) for d in os.listdir(dataset_path) ] if is_main_process: - with open(EVAL_RESULTS_FILE_PATH, "a") as f: - f.write("\n\n\n") - f.write( - f"Results for {dir_name} with lambda = {lambda_value}" - ) + append_text_locked( + EVAL_RESULTS_FILE_PATH, + "\n\n\n", + f"Results for {dir_name} with lambda = {lambda_value}", + ) for data_path in testing_paths: if "merged" not in data_path: continue @@ -919,11 +923,13 @@ def main(): total_R1.append(R1) total_R10.append(R10) total_MRR.append(MRR) - with open(EVAL_RESULTS_FILE_PATH, "a") as f: - f.write("\n\n\n") - f.write(f"Intermediate results for {data_path}") - f.write("\n\n\n") - f.write(f"R@1 = {R1} %, R@10 = {R10} %, MRR = {MRR}") + append_text_locked( + EVAL_RESULTS_FILE_PATH, + "\n\n\n", + f"Intermediate results for {data_path}", + "\n\n\n", + f"R@1 = {R1} %, R@10 = {R10} %, MRR = {MRR}", + ) if is_main_process: avg_R1 = np.mean(total_R1) @@ -935,13 +941,13 @@ def main(): ) if not os.path.exists(EVAL_RESULTS_FILE_PATH): - open(EVAL_RESULTS_FILE_PATH, "w").close() + 
append_text_locked(EVAL_RESULTS_FILE_PATH, "") - with open(EVAL_RESULTS_FILE_PATH, "a") as f: - f.write("\n\n\n") - f.write( - f"Average R@1 = {avg_R1} %, R@10 = {avg_R10} %, MRR = {avg_MRR}" - ) + append_text_locked( + EVAL_RESULTS_FILE_PATH, + "\n\n\n", + f"Average R@1 = {avg_R1} %, R@10 = {avg_R10} %, MRR = {avg_MRR}", + ) else: model_checkpoint_path = f"{RAID_DIR}/checkpoints/mathlib4_29dcec074de168ac2bf835a77ef68bbe069194c5.ckpt" if result is None: diff --git a/scripts/trace_paper_repos.py b/scripts/trace_paper_repos.py index 8d33d0a..54de9b2 100755 --- a/scripts/trace_paper_repos.py +++ b/scripts/trace_paper_repos.py @@ -12,114 +12,87 @@ import os import json import pathlib +import sys -from lean_dojo import LeanGitRepo -from lean_dojo.data_extraction.trace import get_traced_repo_path +HERE = pathlib.Path(__file__).resolve() +REPO_ROOT = HERE.parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from lean_dojo import LeanGitRepo # noqa: E402 +from lean_dojo.data_extraction.trace import get_traced_repo_path # noqa: E402 # hardcoded list reconstructed from the paper / convo +# ==== Already traced ==== +# 1. teorth/pfr FAITHFUL +# 2. avigad/mathematics_in_lean_source +# 3. yangky11/miniF2F-lean4 +# 6. AlexKontorovich/PrimeNumberTheoremAnd +# 7. dwrensha/compfiles +# 8. ImperialCollegeLondon/FLT +# 9. verse-lab/veil +# 10. eric-wieser/lean-matrix-cookbook + +# ==== Heavy / needs fix ==== +# 4. lecopivo/SciLean (macOS SG_READ_ONLY crash) +# 11. loganrjmurphy/LeanEuclid (same) + +# ==== Remaining targets ==== +# PAPER_REPOS = [ +# { +# "owner": "dwrensha", +# "name": "compfiles", +# "sha": "f99bf6f2928d47dd1a445b414b3a723c2665f091", +# }, +# { +# "owner": "avigad", +# "name": "mathematics_in_lean_source", +# "sha": "5297e0fb051367c48c0a084411853a576389ecf5", +# }, +# { +# "owner": "yangky11", +# "name": "miniF2F-lean4", +# "sha": "9e445f5435407f014b88b44a98436d50dd7abd00", +# }, +# { +# "owner": "teorth", +# "name": "pfr", +# "sha": "fa398a5b853c7e94e3294c45e50c6aee013a2687", +# }, +# { +# "owner": "ImperialCollegeLondon", +# "name": "FLT", +# "sha": "b208a302cdcbfadce33d8165f0b054bfa17e2147", +# }, +# { +# "owner": "verse-lab", +# "name": "veil", +# "sha": "a9fe7205c57f7b6ee8b350bfc87b9b4b28c57781", +# }, +# ] + PAPER_REPOS = [ - # 1. teorth/pfr - # { - # "owner": "teorth", - # "name": "pfr", - # "sha": "fa398a5b853c7e94e3294c45e50c6aee013a2687", - # }, - # 2. avigad/mathematics_in_lean_source - { - "owner": "avigad", - "name": "mathematics_in_lean_source", - "sha": "5297e0fb051367c48c0a084411853a576389ecf5", - }, - { - "owner": "verse-lab", - "name": "veil", - "sha": "a9fe7205c57f7b6ee8b350bfc87b9b4b28c57781", - }, - # 3. miniF2F - { - "owner": "yangky11", - "name": "miniF2F-lean4", - "sha": "9e445f5435407f014b88b44a98436d50dd7abd00", - }, - # 4. SciLean (in paper → we must make it work eventually) - # { - # "owner": "lecopivo", - # "name": "SciLean", - # "sha": "22d53b2f4e3db2a172e71da6eb9c916e62655744", - # }, - # 5. teorth/lean4-pdl - { - "owner": "teorth", - "name": "lean4-pdl", - "sha": "c7f649fe3c4891cf1a01c120e82ebc5f6199856e", - }, - # 6. prime number theorem notes - { - "owner": "AlexKontorovich", - "name": "PrimeNumberTheoremAnd", - "sha": "29baddd685660b5fedd7bd67f9916ae24253d566", - }, - # 7. compfiles - { - "owner": "dwrensha", - "name": "compfiles", - "sha": "f99bf6f2928d47dd1a445b414b3a723c2665f091", - }, - # 8. 
FLT - { - "owner": "ImperialCollegeLondon", - "name": "FLT", - "sha": "b208a302cdcbfadce33d8165f0b054bfa17e2147", - }, - { - "owner": "verse-lab", - "name": "veil", - "sha": "a9fe7205c57f7b6ee8b350bfc87b9b4b28c57781", - }, - # 9. lean4-cli (paper mentions tooling repos; we saw this in your crawl) - { - "owner": "leanprover-community", - "name": "lean4-cli", - "sha": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f", - }, - # 10. matrix cookbook { - "owner": "eric-wieser", - "name": "lean-matrix-cookbook", - "sha": "f15a149d321ac99ff9b9c024b58e7882f564669f", + "owner": "lecopivo", + "name": "SciLean", + "sha": "22d53b2f4e3db2a172e71da6eb9c916e62655744", }, - # 11. LeanEuclid { "owner": "loganrjmurphy", "name": "LeanEuclid", "sha": "f1912c3090eb82820575758efc31e40b9db86bb8", }, - # 12. formalized logic foundation { "owner": "FormalizedFormalLogic", "name": "Foundation", "sha": "d5fe5d057a90a0703a745cdc318a1b6621490c21", }, - # 13. con-nf - { - "owner": "pengbaolin", - "name": "con-nf", - "sha": "00bdc85ba7d486a9e544a0806a1018dd06fa3856", - }, - # 14. zeta_3_irrational - { - "owner": "ahhwuhu", - "name": "zeta_3_irrational", - "sha": "914712200e463cfc97fe37e929d518dd58806a38", - }, - # 15. LeanAPAP - { - "owner": "judicael-pvt", - "name": "LeanAPAP", - "sha": "951c660a8d7ba8e39f906fdf657674a984effa8b", - }, - # paper had a few that we couldn't map to GH — keep extensible +# { +# "owner": "TODO", +# "name": "lean4lean", +# "sha": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f", +# }, ] @@ -166,7 +139,7 @@ def main() -> None: print(f"\n=== tracing {url}@{commit} ===") try: repo = LeanGitRepo(url, commit) - traced_path = get_traced_repo_path(repo, build_deps=False) + traced_path = get_traced_repo_path(repo, build_deps=True) traced_path = pathlib.Path(traced_path) print(f" lean_dojo traced into cache: {traced_path}") except Exception as e: diff --git a/workspace/build_lean4_repo.py b/workspace/build_lean4_repo.py new file mode 100644 index 0000000..5655a2d --- /dev/null +++ b/workspace/build_lean4_repo.py @@ -0,0 +1,293 @@ +"""Build Lean 4 projects in Docker. + +Only this file runs in Docker. So it must be self-contained. +""" + +import os +import sys +import re +import shutil +import argparse +import itertools +import subprocess +from tqdm import tqdm +from loguru import logger +from time import sleep, monotonic +from pathlib import Path, PurePath +from multiprocessing import Process +from contextlib import contextmanager +from typing import Union, List, Optional, Generator + + +def run_cmd(cmd: Union[str, List[str]], capture_output: bool = False) -> Optional[str]: + """Run a shell command. + + Args: + cmd (Union[str, List[str]]): A command or a list of commands. 
+ """ + if isinstance(cmd, list): + cmd = " && ".join(cmd) + res = subprocess.run(cmd, shell=True, capture_output=capture_output, check=True) + if capture_output: + return res.stdout.decode() + else: + return None + + +def is_macos() -> bool: + return sys.platform == "darwin" + + +def _patch_dylib(path: Path) -> None: + """Adjust __DATA_CONST flags so macOS 15 accepts the library.""" + try: + subprocess.run( + [ + "xcrun", + "vtool", + "-set", + "segprot", + "__DATA_CONST", + "r--", + "rw-", + str(path), + ], + check=True, + capture_output=True, + ) + subprocess.run( + [ + "xcrun", + "vtool", + "-set", + "segflags", + "__DATA_CONST", + "0x4", + str(path), + ], + check=True, + capture_output=True, + ) + subprocess.run( + ["codesign", "--force", "--sign", "-", str(path)], + check=True, + capture_output=True, + ) + except (subprocess.CalledProcessError, FileNotFoundError) as ex: + logger.warning(f"Failed to patch {path}: {ex}") + + +def patch_dylibs(root: Path) -> None: + if not is_macos(): + return + dylibs = list(root.rglob("*.dylib")) + if not dylibs: + return + logger.debug(f"Patching {len(dylibs)} dylibs under {root}") + for dylib in dylibs: + _patch_dylib(dylib) + + +def record_paths(dir: Path, root: Path, lean_bin: Path) -> None: + """Run ``lean --deps`` for all Lean files in ``dir`` to record its dependencies. + + Args: + dir (Path): The directory containing Lean files. + """ + dir = Path(dir) + + for p in dir.glob("**/*.lean"): + with p.with_suffix(".dep_paths").open("wt") as oup: + for line in run_cmd( + f"{lean_bin} --deps {p}", capture_output=True + ).splitlines(): + olean_path = PurePath(line.strip()) + assert olean_path.suffix == ".olean" + lean_path = olean_path.relative_to(root).with_suffix(".lean") + oup.write(str(lean_path) + "\n") + + +def remove_files(dir: Path, suffix: str) -> None: + """Remove all files in ``dir`` that end with ``suffix``.""" + for p in Path(dir).glob(f"**/*{suffix}"): + p.unlink() + + +_PROGRESSBAR_UPDATE_INTERNAL = 5 + + +def _monitor(paths: List[Path], num_total: int) -> None: + with tqdm(total=num_total) as pbar: + while True: + time_start = monotonic() + try: + num_done = len( + list( + itertools.chain.from_iterable( + p.glob(f"**/*.ast.json") for p in paths + ) + ) + ) + except Exception: + continue + time_elapsed = monotonic() - time_start + if time_elapsed < _PROGRESSBAR_UPDATE_INTERNAL: + sleep(_PROGRESSBAR_UPDATE_INTERNAL - time_elapsed) + pbar.update(num_done - pbar.n) + if num_done >= num_total: + break + print("") + + +@contextmanager +def launch_progressbar(paths: List[Union[str, Path]]) -> Generator[None, None, None]: + """Launch an async progressbar to monitor the progress of tracing the repo.""" + paths = [Path(p) for p in paths] + olean_files = list( + itertools.chain.from_iterable(p.glob("**/*.olean") for p in paths) + ) + num_total = len(olean_files) + p = Process(target=_monitor, args=(paths, num_total), daemon=True) + p.start() + try: + yield + finally: + p.join(timeout=1) + if p.is_alive(): + p.terminate() + + +def get_lean_version() -> str: + """Get the version of Lean.""" + output = run_cmd("lean --version", capture_output=True).strip() + m = re.match(r"Lean \(version (?P\S+?),", output) + return m["version"] + + +def check_files(packages_path: str, no_deps: bool) -> bool: + """Check if all *.lean files have been processed to produce *.ast.json and *.dep_paths files.""" + cwd = Path.cwd() + packages_path = cwd / packages_path + jsons = { + p.with_suffix("").with_suffix("") + for p in cwd.glob("**/build/ir/**/*.ast.json") + if not 
no_deps or not p.is_relative_to(packages_path) + } + deps = { + p.with_suffix("") + for p in cwd.glob("**/build/ir/**/*.dep_paths") + if not no_deps or not p.is_relative_to(packages_path) + } + oleans = { + Path(str(p.with_suffix("")).replace("/build/lib/", "/build/ir/")) + for p in cwd.glob("**/build/lib/**/*.olean") + if not no_deps or not p.is_relative_to(packages_path) + } + assert len(jsons) <= len(oleans) and len(deps) <= len(oleans) + missing_jsons = {p.with_suffix(".ast.json") for p in oleans - jsons} + missing_deps = {p.with_suffix(".dep_paths") for p in oleans - deps} + if len(missing_jsons) > 0 or len(missing_deps) > 0: + for p in missing_jsons.union(missing_deps): + logger.warning(f"Missing {p}") + return False + return True + + +def is_new_version(v: str) -> bool: + """Check if ``v`` is at least `4.3.0-rc2`.""" + major, minor, patch = [int(_) for _ in v.split("-")[0].split(".")] + if major < 4 or (major == 4 and minor < 3): + return False + if ( + major > 4 + or (major == 4 and minor > 3) + or (major == 4 and minor == 3 and patch > 0) + ): + return True + assert major == 4 and minor == 3 and patch == 0 + if "4.3.0-rc" in v: + rc = int(v.split("-")[1][2:]) + return rc >= 2 + else: + return True + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("repo_name") + parser.add_argument("--no-deps", action="store_true") + args = parser.parse_args() + + num_procs = int(os.environ["NUM_PROCS"]) + repo_name = args.repo_name + os.chdir(repo_name) + + extractor_src = Path(__file__).with_name("ExtractData.lean").resolve() + extractor_dst = Path("ExtractData.lean") + shutil.copy2(extractor_src, extractor_dst) + + lean_version = get_lean_version() + use_new_layout = is_new_version(lean_version) + if use_new_layout: + packages_path = ".lake/packages" + build_path = ".lake/build" + else: + packages_path = "lake-packages" + build_path = "build" + + # If the lean4 package exists, we assume the build has completed and we just need to trace + if (Path(".lake/packages/lean4") if use_new_layout else Path("lake-packages/lean4")).exists(): + logger.info(f"The repo {repo_name} has already been built, but has not been traced.") + else: + # Build the repo using lake. + logger.info(f"Building {repo_name}") + if args.no_deps: + # The additional *.olean files wouldn't matter. + try: + run_cmd("lake exe cache get") + except subprocess.CalledProcessError: + pass + + # Try building; on macOS, if the build fails due to SG_READ_ONLY, patch dylibs and retry once. + try: + run_cmd("lake build") + except subprocess.CalledProcessError as e: + if is_macos(): + logger.warning("lake build failed; patching dylibs for macOS and retrying once") + patch_dylibs(Path(packages_path)) + patch_dylibs(Path(build_path)) + run_cmd("lake build") + else: + raise + + # Ensure final artifacts are patched as well. + patch_dylibs(Path(packages_path)) + patch_dylibs(Path(build_path)) + + # Copy the Lean 4 stdlib into the path of packages. + lean_prefix = run_cmd(f"lean --print-prefix", capture_output=True).strip() + shutil.copytree(lean_prefix, f"{packages_path}/lean4") + + + # Run ExtractData.lean to extract ASTs, tactic states, and premise information. 
+ dirs_to_monitor = [build_path] + if not args.no_deps: + dirs_to_monitor.append(packages_path) + + logger.info(f"Tracing {repo_name}") + try: + with launch_progressbar(dirs_to_monitor): + cmd = f"lake env lean --threads {num_procs} --run ExtractData.lean" + if args.no_deps: + cmd += " noDeps" + logger.debug(cmd) + run_cmd(cmd, capture_output=True) + finally: + if extractor_dst.exists(): + extractor_dst.unlink() + + assert check_files(packages_path, args.no_deps), "Some files failed to be processed." + + +if __name__ == "__main__": + main() From ec5e4edd35295e3083c86e0b5fe9b8e24fb18646 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Fri, 21 Nov 2025 01:02:53 -0500 Subject: [PATCH 27/29] Trace more paper repos --- scripts/trace_paper_repos.py | 180 ++++++++++++++++++++--------------- 1 file changed, 104 insertions(+), 76 deletions(-) diff --git a/scripts/trace_paper_repos.py b/scripts/trace_paper_repos.py index 54de9b2..da43365 100755 --- a/scripts/trace_paper_repos.py +++ b/scripts/trace_paper_repos.py @@ -9,9 +9,11 @@ python scripts/trace_paper_repos.py """ -import os import json +import os import pathlib +import shutil +import subprocess import sys HERE = pathlib.Path(__file__).resolve() @@ -24,78 +26,102 @@ # hardcoded list reconstructed from the paper / convo -# ==== Already traced ==== -# 1. teorth/pfr FAITHFUL -# 2. avigad/mathematics_in_lean_source -# 3. yangky11/miniF2F-lean4 -# 6. AlexKontorovich/PrimeNumberTheoremAnd -# 7. dwrensha/compfiles -# 8. ImperialCollegeLondon/FLT -# 9. verse-lab/veil -# 10. eric-wieser/lean-matrix-cookbook - -# ==== Heavy / needs fix ==== -# 4. lecopivo/SciLean (macOS SG_READ_ONLY crash) -# 11. loganrjmurphy/LeanEuclid (same) - -# ==== Remaining targets ==== -# PAPER_REPOS = [ -# { -# "owner": "dwrensha", -# "name": "compfiles", -# "sha": "f99bf6f2928d47dd1a445b414b3a723c2665f091", -# }, -# { -# "owner": "avigad", -# "name": "mathematics_in_lean_source", -# "sha": "5297e0fb051367c48c0a084411853a576389ecf5", -# }, -# { -# "owner": "yangky11", -# "name": "miniF2F-lean4", -# "sha": "9e445f5435407f014b88b44a98436d50dd7abd00", -# }, -# { -# "owner": "teorth", -# "name": "pfr", -# "sha": "fa398a5b853c7e94e3294c45e50c6aee013a2687", -# }, -# { -# "owner": "ImperialCollegeLondon", -# "name": "FLT", -# "sha": "b208a302cdcbfadce33d8165f0b054bfa17e2147", -# }, -# { -# "owner": "verse-lab", -# "name": "veil", -# "sha": "a9fe7205c57f7b6ee8b350bfc87b9b4b28c57781", -# }, -# ] - +# ==== Already traced / paper list ==== PAPER_REPOS = [ - { - "owner": "lecopivo", - "name": "SciLean", - "sha": "22d53b2f4e3db2a172e71da6eb9c916e62655744", - }, - { - "owner": "loganrjmurphy", - "name": "LeanEuclid", - "sha": "f1912c3090eb82820575758efc31e40b9db86bb8", - }, - { - "owner": "FormalizedFormalLogic", - "name": "Foundation", - "sha": "d5fe5d057a90a0703a745cdc318a1b6621490c21", - }, -# { -# "owner": "TODO", -# "name": "lean4lean", -# "sha": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f", -# }, + # {"owner": "teorth", "name": "pfr", "sha": "fa398a5b853c7e94e3294c45e50c6aee013a2687"}, # ALREADY TRACED + # {"owner": "leanprover-community", "name": "hairy-ball-theorem", "sha": "a778826d19c8a7ddf1d26beeea628c45450612e6"}, # not found + # {"owner": "leanprover-community", "name": "coxeter", "sha": "96af8aee7943ca8685ed1b00cc83a559ea389a97"}, # not found + # {"owner": "avigad", "name": "mathematics_in_lean_source", "sha": "5297e0fb051367c48c0a084411853a576389ecf5"}, # ALREADY TRACED + {"owner": "mo271", "name": "FormalBook", "sha": "6fbe8c2985008c0bfb30050750a71b90388ad3a3"}, # searched commit 
hashes; original table SHA/owner invalid + # {"owner": "yangky11", "name": "miniF2F-lean4", "sha": "9e445f5435407f014b88b44a98436d50dd7abd00"}, # ALREADY TRACED + # {"owner": "lecopivo", "name": "SciLean", "sha": "22d53b2f4e3db2a172e71da6eb9c916e62655744"}, # ALREADY TRACED + {"owner": "fpvandoorn", "name": "carleson", "sha": "bec7808b907190882fa1fa54ce749af297c6cf37"}, # searched commit hashes; original table SHA/owner invalid + {"owner": "m4lvin", "name": "lean4-pdl", "sha": "c7f649fe3c4891cf1a01c120e82ebc5f6199856e"}, # searched commit hashes; original table SHA/owner invalid + # {"owner": "AlexKontorovich", "name": "PrimeNumberTheoremAnd", "sha": "29baddd685660b5fedd7bd67f9916ae24253d566"}, # ALREADY TRACED + # {"owner": "dwrensha", "name": "compfiles", "sha": "f99bf6f2928d47dd1a445b414b3a723c2665f091"}, # ALREADY TRACED + # {"owner": "ImperialCollegeLondon", "name": "FLT", "sha": "b208a302cdcbfadce33d8165f0b054bfa17e2147"}, # ALREADY TRACED + # {"owner": "Bachmann", "name": "debate", "sha": "7fb39251b705797ee54e08c96177fabd29a5b5a3"}, # not found + # {"owner": "digama0", "name": "lean4lean", "sha": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f"}, # ALREADY TRACED + # {"owner": "eric-wieser", "name": "lean-matrix-cookbook", "sha": "f15a149d321ac99ff9b9c024b58e7882f564669f"}, # ALREADY TRACED + # {"owner": "yuma-mizuno", "name": "lean-math-workshop", "sha": "5acd4b933d47fd6c1032798a6046c1baf261445d"}, # ALREADY TRACED + # {"owner": "loganrjmurphy", "name": "LeanEuclid", "sha": "f1912c3090eb82820575758efc31e40b9db86bb8"}, SMT ERROR + # {"owner": "FormalizedFormalLogic", "name": "Foundation", "sha": "d5fe5d057a90a0703a745cdc318a1b6621490c21"}, SMT ERROR + # {"owner": "leanprover-community", "name": "con-nf", "sha": "00bdc85ba7d486a9e544a0806a1018dd06fa3856"}, # ALREADY TRACED + # {"owner": "siddhartha-gadgil", "name": "Saturn", "sha": "3811a9dd46cdfd5fa0c0c1896720c28d2ec4a42a"}, # ALREADY TRACED + # {"owner": "ahhwuhu", "name": "zeta_3_irrational", "sha": "914712200e463cfc97fe37e929d518dd58806a38"}, # ALREADY TRACED + # {"owner": "EmilGedda", "name": "Formalization-of-Constructable-Numbers", "sha": "01ef1f22a04f2ba8081c5fb29413f515a0e52878"}, # not found + {"owner": "YaelDillies", "name": "LeanAPAP", "sha": "951c660a8d7ba8e39f906fdf657674a984effa8b"}, # searched commit hashes; original table SHA/owner invalid ] +PINNED_DEPS = { + "loganrjmurphy/LeanEuclid": [ + { + "url": "https://github.com/leanprover-community/mathlib4", + "type": "git", + "subDir": None, + "rev": "b2c9f64fbc8dfe4c1b15b2bc6ab5a6f472fc047e", + "name": "mathlib", + "manifestFile": "lake-manifest.json", + "inputRev": "b2c9f64fbc8dfe4c1b15b2bc6ab5a6f472fc047e", + "inherited": False, + "configFile": "lakefile.lean", + }, + { + "url": "https://github.com/yangky11/lean-smt.git", + "type": "git", + "subDir": None, + "rev": "a3c0e8ab1e07d74b8fd745e7b3c4b83c6d859bbb", + "name": "smt", + "manifestFile": "lake-manifest.json", + "inputRev": "a3c0e8ab1e07d74b8fd745e7b3c4b83c6d859bbb", + "inherited": False, + "configFile": "lakefile.lean", + }, + ] +} + + +def apply_dependency_pins(repo_root: pathlib.Path, owner: str, name: str) -> None: + repo_key = f"{owner}/{name}" + targets = PINNED_DEPS.get(repo_key) + if not targets: + return + + manifest_path = repo_root / "lake-manifest.json" + if manifest_path.exists(): + try: + manifest = json.load(manifest_path.open()) + except json.JSONDecodeError: + manifest = {} + else: + manifest = {} + + manifest.setdefault("packagesDir", ".lake/packages") + packages = manifest.setdefault("packages", []) + 
changed = False + + for target in targets: + for pkg in packages: + if pkg.get("name") == target["name"]: + if pkg != target: + pkg.update(target) + changed = True + break + else: + packages.append(dict(target)) + changed = True + + if changed: + manifest_path.write_text(json.dumps(manifest, indent=2)) + print(f" pinned lake-manifest.json for {repo_key}") + + lake_dir = repo_root / ".lake" + if lake_dir.exists(): + shutil.rmtree(lake_dir) + print(f" removed stale .lake directory for {repo_key}") + + def make_corpus_from_repo(source_root: pathlib.Path, out_dir: pathlib.Path, url: str, commit: str) -> int: """Scan .lake/build/ir for *.ast.json and write corpus.jsonl.""" ir_root = source_root / ".lake" / "build" / "ir" @@ -136,6 +162,15 @@ def main() -> None: url = f"https://github.com/{item['owner']}/{item['name']}" commit = item["sha"] + # repo as checked out by the earlier crawl + repo_root = repo_dir / item["owner"] / item["name"] + out_dir = raid_dir / "data" / f"{item['name']}_{commit}" + + if repo_root.exists(): + apply_dependency_pins(repo_root, item["owner"], item["name"]) + else: + print(f" !! repo root {repo_root} not found — was it cloned under RAID/repos/?") + print(f"\n=== tracing {url}@{commit} ===") try: repo = LeanGitRepo(url, commit) @@ -146,13 +181,6 @@ def main() -> None: print(f" !! lean_dojo failed for {url}@{commit}: {e}") continue - # repo as checked out by the earlier crawl - repo_root = repo_dir / item["owner"] / item["name"] - out_dir = raid_dir / "data" / f"{item['name']}_{commit}" - - if not repo_root.exists(): - print(f" !! repo root {repo_root} not found — was it cloned under RAID/repos/?") - sources = [traced_path] if repo_root.exists(): sources.append(repo_root) From 242778ac288bc2ab0bdeb79140f4cbd25bfe67d9 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Mon, 8 Dec 2025 20:18:52 -0500 Subject: [PATCH 28/29] Add training scripts and fix merged dataset build --- generate_benchmark_lean4.py | 2 +- scripts/build_merged_dataset.py | 98 +++++++++++++++- scripts/process_local_traces.py | 130 +++++++++++++++++++++ scripts/setup_vm.sh | 70 +++++++++++ scripts/trace_paper_repos.py | 46 ++++---- train_leanagent.py | 199 ++++++++++++++++++++++++++++++++ 6 files changed, 515 insertions(+), 30 deletions(-) create mode 100644 scripts/process_local_traces.py create mode 100644 scripts/setup_vm.sh create mode 100644 train_leanagent.py diff --git a/generate_benchmark_lean4.py b/generate_benchmark_lean4.py index 7fed1d2..9d20504 100644 --- a/generate_benchmark_lean4.py +++ b/generate_benchmark_lean4.py @@ -233,7 +233,7 @@ def export_proofs( total_theorems = 0 for strategy, split in splits.items(): split_dir = dst_path / strategy - split_dir.mkdir(parents=True) + split_dir.mkdir(parents=True, exist_ok=True) for name, theorems in split.items(): data = [] diff --git a/scripts/build_merged_dataset.py b/scripts/build_merged_dataset.py index c1879f4..075ce7a 100644 --- a/scripts/build_merged_dataset.py +++ b/scripts/build_merged_dataset.py @@ -49,6 +49,9 @@ def iter_nonempty_corpora(data_root: Path): item["name"]: f"https://github.com/{item['owner']}/{item['name']}" for item in PAPER_REPOS } +# Add alternative names for repos with different directory names +_SLUG_TO_URL["formal_book"] = "https://github.com/mo271/FormalBook" +_SLUG_TO_URL["hairy-ball-theorem-lean"] = "https://github.com/leanprover-community/hairy-ball-theorem" def _infer_repo_from_dir(dir_path: Path) -> tuple[str, str]: @@ -95,21 +98,104 @@ def main() -> None: # Add repos discovered from existing corpora data_root = 
raid_dir / "data" + if not data_root.exists(): + logger.warning(f"{data_root} does not exist. Checking {raid_dir} directly...") + data_root = raid_dir + targets = [] for d, cj in iter_nonempty_corpora(data_root): try: + # We don't strictly need url/commit here if we trust metadata.json, + # but it's good for logging. url, commit = load_repo_from_corpus(cj) - targets.append((url, commit)) + targets.append((d, cj, url, commit)) except Exception as e: logger.warning(f"Skipping {d} due to: {e}") logger.info(f"Found {len(targets)} repos with non-empty corpora to ingest") - for url, commit in targets: - repo = LeanGitRepo(url, commit) - logger.info(f"Ingesting {url}@{commit}") - status = add_repo_to_database(str(db_path), repo, db) - logger.info(f"Status for {url}: {status}") + from dynamic_database import Repository + import requests + import re + + def get_lean_version_from_github(url, commit): + """Fetch lean-toolchain from GitHub and parse version.""" + try: + raw_url = url.replace("github.com", "raw.githubusercontent.com") + config_url = f"{raw_url}/{commit}/lean-toolchain" + response = requests.get(config_url, timeout=10) + if response.status_code == 200: + content = response.text.strip() + # Parse version like "leanprover/lean4:v4.8.0" -> "v4.8.0" + match = re.search(r"leanprover/lean4:(.+)", content) + if match: + return match.group(1) + return content # Fallback to full string if regex fails + except Exception as e: + logger.warning(f"Failed to fetch lean-toolchain for {url}@{commit}: {e}") + return "v4.0.0" # Ultimate fallback + + for d, cj, url, commit in targets: + logger.info(f"Ingesting {url}@{commit} from {d}") + + # 1. Read metadata.json + meta_path = d / "metadata.json" + meta = {} + if meta_path.exists(): + try: + with open(meta_path, "r") as f: + meta = json.load(f) + except Exception as e: + logger.error(f"Failed to read metadata for {d}: {e}") + + # 2. Construct Repository data + from_repo = meta.get("from_repo", {}) + repo_url = from_repo.get("url", url) + repo_commit = from_repo.get("commit", commit) + + date_processed = meta.get("date_processed") + if not date_processed: + date_processed = meta.get("creation_time", "2024-01-01T00:00:00.000000") + + lean_dojo_ver = meta.get("lean_dojo_version") or meta.get("leandojo_version", "0.0.1") + + # Fetch REAL Lean version if missing + lean_ver = meta.get("lean_version") + if not lean_ver: + logger.info(f"Fetching real Lean version for {repo_url}...") + lean_ver = get_lean_version_from_github(repo_url, repo_commit) + logger.info(f"Got version: {lean_ver}") + + # Create dummy theorems folder if missing + theorems_dir = d / "random" + if not theorems_dir.exists(): + theorems_dir.mkdir(parents=True, exist_ok=True) + + repo_data = { + "url": repo_url, + "name": repo_url.split("/")[-1] if repo_url else d.name.split("_")[0], + "commit": repo_commit, + "lean_version": lean_ver, + "lean_dojo_version": lean_dojo_ver, + "metadata": { + "date_processed": date_processed + }, + "theorems_folder": str(theorems_dir), + "premise_files_corpus": str(cj), + "files_traced": str(d / "traced_files.jsonl") + } + + # 3. 
Add to DB + try: + repo = Repository.from_dict(repo_data) + db.add_repository(repo) + logger.info(f"Successfully added {repo_url} to DB") + except Exception as e: + logger.error(f"Failed to add repo {repo_url} to DB: {e}") + + # Save updated database + logger.info(f"Saving database with {len(db.repositories)} repositories to {db_path}") + db.to_json(str(db_path)) # Export merged dataset out_dir = raid_dir / "data" / "merged_paper_subset" diff --git a/scripts/process_local_traces.py b/scripts/process_local_traces.py new file mode 100644 index 0000000..4be7717 --- /dev/null +++ b/scripts/process_local_traces.py @@ -0,0 +1,130 @@ +import json +import pathlib +import os +import shutil +import sys + +# Add the parent directory (LeanAgent root) to sys.path to allow imports +current_dir = pathlib.Path(__file__).parent.resolve() +lean_agent_root = current_dir.parent +sys.path.insert(0, str(lean_agent_root)) + +from lean_dojo.data_extraction.traced_data import TracedRepo +from generate_benchmark_lean4 import export_premises, export_proofs, split_data, export_metadata + +def process_repo(source_root: pathlib.Path, out_dir: pathlib.Path, url: str, commit: str) -> bool: + """ + Load a traced Lean repository from source_root, + export premises to corpus.jsonl, and export theorems to random/. + """ + print(f" Loading TracedRepo from {source_root}...") + try: + traced_repo = TracedRepo.from_traced_files(source_root, build_deps=True) + except Exception as e: + print(f" !! Failed to load TracedRepo: {e}") + return False + + print(f" Exporting to {out_dir}...") + out_dir.mkdir(parents=True, exist_ok=True) + + # Step 1: Export premises to corpus.jsonl + try: + export_premises(traced_repo, out_dir) + print(f" ✓ Exported premises → {out_dir / 'corpus.jsonl'}") + except Exception as e: + print(f" !! Failed to export premises: {e}") + return False + + # Step 2: Export theorems (THE FIX - this was missing!) + try: + print(f" Extracting theorems...") + splits = split_data(traced_repo, num_val_pct=0.02, num_test_pct=0.02) + total_theorems = export_proofs(splits, out_dir, traced_repo) + export_metadata(traced_repo, out_dir) # Fixed: removed splits argument + print(f" ✓ Exported {total_theorems} theorems → {out_dir / 'random/'}") + except Exception as e: + print(f" !! 
Failed to export theorems: {e}") + import traceback + traceback.print_exc() + return False + + return True + +# VM repo paths - comment out repos you want to skip +VM_REPOS = [ + {"name": "FLT", "owner": "ImperialCollegeLondon", "commit": "b208a302cdcbfadce33d8165f0b054bfa17e2147", "local_path": "/home/aum/repos_cache/FLT/ImperialCollegeLondon-FLT-b208a302cdcbfadce33d8165f0b054bfa17e2147/FLT"}, + {"name": "formal_book", "owner": "mo271", "commit": "6fbe8c2985008c0bfb30050750a71b90388ad3a3", "local_path": "/home/aum/repos_cache/FormalBook/mo271-formal_book-6fbe8c2985008c0bfb30050750a71b90388ad3a3/formal_book"}, + {"name": "Formalisation-of-constructable-numbers", "owner": "Louis", "commit": "01ef1f22a04f2ba8081c5fb29413f515a0e52878", "local_path": "/home/aum/repos_cache/Formalization-of-Constructable-Numbers/Louis-Le-Grand-Formalisation-of-constructable-numbers-01ef1f22a04f2ba8081c5fb29413f515a0e52878/Formalisation-of-constructable-numbers"}, + {"name": "Foundation", "owner": "FormalizedFormalLogic", "commit": "d5fe5d057a90a0703a745cdc318a1b6621490c21", "local_path": "/home/aum/repos_cache/Foundation/FormalizedFormalLogic-Foundation-d5fe5d057a90a0703a745cdc318a1b6621490c21/Foundation"}, + {"name": "LeanAPAP", "owner": "YaelDillies", "commit": "951c660a8d7ba8e39f906fdf657674a984effa8b", "local_path": "/home/aum/repos_cache/LeanAPAP/YaelDillies-LeanAPAP-951c660a8d7ba8e39f906fdf657674a984effa8b/LeanAPAP"}, + {"name": "LeanEuclid", "owner": "loganrjmurphy", "commit": "f1912c3090eb82820575758efc31e40b9db86bb8", "local_path": "/home/aum/repos_cache/LeanEuclid/loganrjmurphy-LeanEuclid-f1912c3090eb82820575758efc31e40b9db86bb8/LeanEuclid"}, + {"name": "PrimeNumberTheoremAnd", "owner": "AlexKontorovich", "commit": "29baddd685660b5fedd7bd67f9916ae24253d566", "local_path": "/home/aum/repos_cache/PrimeNumberTheoremAnd/AlexKontorovich-PrimeNumberTheoremAnd-29baddd685660b5fedd7bd67f9916ae24253d566/PrimeNumberTheoremAnd"}, + {"name": "Saturn", "owner": "siddhartha", "commit": "3811a9dd46cdfd5fa0c0c1896720c28d2ec4a42a", "local_path": "/home/aum/repos_cache/Saturn/siddhartha-gadgil-Saturn-3811a9dd46cdfd5fa0c0c1896720c28d2ec4a42a/Saturn"}, + {"name": "SciLean", "owner": "lecopivo", "commit": "22d53b2f4e3db2a172e71da6eb9c916e62655744", "local_path": "/home/aum/repos_cache/SciLean/lecopivo-SciLean-22d53b2f4e3db2a172e71da6eb9c916e62655744/SciLean"}, + {"name": "carleson", "owner": "fpvandoorn", "commit": "bec7808b907190882fa1fa54ce749af297c6cf37", "local_path": "/home/aum/repos_cache/carleson/fpvandoorn-carleson-bec7808b907190882fa1fa54ce749af297c6cf37/carleson"}, + {"name": "compfiles", "owner": "dwrensha", "commit": "f99bf6f2928d47dd1a445b414b3a723c2665f091", "local_path": "/home/aum/repos_cache/compfiles/dwrensha-compfiles-f99bf6f2928d47dd1a445b414b3a723c2665f091/compfiles"}, + {"name": "con-nf", "owner": "leanprover", "commit": "00bdc85ba7d486a9e544a0806a1018dd06fa3856", "local_path": "/home/aum/repos_cache/con-nf/leanprover-community-con-nf-00bdc85ba7d486a9e544a0806a1018dd06fa3856/con-nf"}, + {"name": "coxeter", "owner": "NUS", "commit": "96af8aee7943ca8685ed1b00cc83a559ea389a97", "local_path": "/home/aum/repos_cache/coxeter/NUS-Math-Formalization-coxeter-96af8aee7943ca8685ed1b00cc83a559ea389a97/coxeter"}, + {"name": "hairy-ball-theorem-lean", "owner": "corent1234", "commit": "a778826d19c8a7ddf1d26beeea628c45450612e6", "local_path": "/home/aum/repos_cache/hairy-ball-theorem/corent1234-hairy-ball-theorem-lean-a778826d19c8a7ddf1d26beeea628c45450612e6/hairy-ball-theorem-lean"}, + {"name": "lean-math-workshop", 
"owner": "yuma", "commit": "5acd4b933d47fd6c1032798a6046c1baf261445d", "local_path": "/home/aum/repos_cache/lean-math-workshop/yuma-mizuno-lean-math-workshop-5acd4b933d47fd6c1032798a6046c1baf261445d/lean-math-workshop"}, + {"name": "lean-matrix-cookbook", "owner": "eric", "commit": "f15a149d321ac99ff9b9c024b58e7882f564669f", "local_path": "/home/aum/repos_cache/lean-matrix-cookbook/eric-wieser-lean-matrix-cookbook-f15a149d321ac99ff9b9c024b58e7882f564669f/lean-matrix-cookbook"}, + {"name": "lean4-pdl", "owner": "m4lvin", "commit": "c7f649fe3c4891cf1a01c120e82ebc5f6199856e", "local_path": "/home/aum/repos_cache/lean4-pdl/m4lvin-lean4-pdl-c7f649fe3c4891cf1a01c120e82ebc5f6199856e/lean4-pdl"}, + {"name": "lean4lean", "owner": "digama0", "commit": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f", "local_path": "/home/aum/repos_cache/lean4lean/digama0-lean4lean-05b1f4a68c5facea96a5ee51c6a56fef21276e0f/lean4lean"}, + {"name": "mathematics_in_lean_source", "owner": "avigad", "commit": "5297e0fb051367c48c0a084411853a576389ecf5", "local_path": "/home/aum/repos_cache/mathematics_in_lean_source/avigad-mathematics_in_lean_source-5297e0fb051367c48c0a084411853a576389ecf5/mathematics_in_lean_source"}, + {"name": "miniF2F-lean4", "owner": "yangky11", "commit": "9e445f5435407f014b88b44a98436d50dd7abd00", "local_path": "/home/aum/repos_cache/miniF2F-lean4/yangky11-miniF2F-lean4-9e445f5435407f014b88b44a98436d50dd7abd00/miniF2F-lean4"}, + {"name": "pfr", "owner": "teorth", "commit": "fa398a5b853c7e94e3294c45e50c6aee013a2687", "local_path": "/home/aum/repos_cache/pfr/teorth-pfr-fa398a5b853c7e94e3294c45e50c6aee013a2687/pfr"}, + {"name": "zeta_3_irrational", "owner": "ahhwuhu", "commit": "914712200e463cfc97fe37e929d518dd58806a38", "local_path": "/home/aum/repos_cache/zeta_3_irrational/ahhwuhu-zeta_3_irrational-914712200e463cfc97fe37e929d518dd58806a38/zeta_3_irrational"}, +] + +def main(): + # Configuration for VM + raid_dir = pathlib.Path.home() / "LeanAgent" / "RAID" + + print("="*80) + print("VM BATCH EXTRACTION - Processing Local Traces (Sequential)") + print("="*80) + print(f"Found {len(VM_REPOS)} repos to process") + print("="*80) + + successful = [] + failed = [] + + for i, repo in enumerate(VM_REPOS, 1): + print(f"\n[{i}/{len(VM_REPOS)}] Processing {repo['name']}...") + source_root = pathlib.Path(repo["local_path"]) + + if not source_root.exists(): + print(f" !! 
Source path does not exist: {source_root}") + failed.append(repo['name']) + continue + + out_dir = raid_dir / "data" / f"{repo['name']}_{repo['commit']}" + url = f"https://github.com/{repo['owner']}/{repo['name']}" + + success = process_repo(source_root, out_dir, url, repo["commit"]) + + if success: + print(f" ✓ Successfully processed {repo['name']}.") + successful.append(repo['name']) + else: + print(f" ✗ Failed to process {repo['name']}.") + failed.append(repo['name']) + + # Summary + print("\n" + "="*80) + print("EXTRACTION SUMMARY") + print("="*80) + print(f"\n✓ Successful: {len(successful)}/{len(VM_REPOS)}") + for name in successful: + print(f" - {name}") + + if failed: + print(f"\n✗ Failed: {len(failed)}/{len(VM_REPOS)}") + for name in failed: + print(f" - {name}") + + print(f"\nResults saved to: {raid_dir / 'data'}") + print("="*80) + +if __name__ == "__main__": + main() diff --git a/scripts/setup_vm.sh b/scripts/setup_vm.sh new file mode 100644 index 0000000..ea67cce --- /dev/null +++ b/scripts/setup_vm.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# VM Setup Script - Run this on the GCP VM +# Installs all dependencies for LeanAgent extraction + +set -e + +echo "=========================================" +echo "Setting up VM for Lean Theorem Extraction" +echo "=========================================" + +# Update system +echo "Step 1/8: Updating system packages..." +sudo apt-get update -y +sudo apt-get upgrade -y + +# Install Python 3.10 +echo "Step 2/8: Installing Python 3.10..." +sudo apt-get install -y software-properties-common +sudo add-apt-repository -y ppa:deadsnakes/ppa +sudo apt-get update -y +sudo apt-get install -y python3.10 python3.10-venv python3.10-dev python3-pip + +# Install system dependencies +echo "Step 3/8: Installing system dependencies..." +sudo apt-get install -y git unzip wget curl build-essential + +# Install gdown for Google Drive downloads +echo "Step 4/8: Installing gdown..." +pip3 install gdown + +# Clone LeanAgent repo +echo "Step 5/8: Cloning LeanAgent repository..." +cd ~ +git clone https://github.com/lean-dojo/LeanAgent.git +cd LeanAgent + +# Create virtual environment +echo "Step 6/8: Creating Python virtual environment..." +python3.10 -m venv venv +source venv/bin/activate + +# Install LeanAgent with all dependencies +echo "Step 7/8: Installing LeanAgent and dependencies..." +pip install --upgrade pip +pip install -e . + +# Create directories and set environment variables +echo "Step 8/8: Setting up directories and environment..." +mkdir -p ~/LeanAgent/RAID/data +mkdir -p ~/repos_cache + +# Set RAID_DIR environment variable +echo 'export RAID_DIR=~/LeanAgent/RAID' >> ~/.bashrc +export RAID_DIR=~/LeanAgent/RAID + +echo "" +echo "=========================================" +echo "✅ VM Setup Complete!" +echo "=========================================" +echo "" +echo "Environment configured:" +echo " - Python 3.10 installed" +echo " - LeanAgent installed with all dependencies" +echo " - RAID_DIR set to ~/LeanAgent/RAID" +echo " - Virtual environment at ~/LeanAgent/venv" +echo "" +echo "Next steps:" +echo " 1. Download repos from Google Drive" +echo " 2. 
Run extraction script" +echo "" diff --git a/scripts/trace_paper_repos.py b/scripts/trace_paper_repos.py index da43365..ba25aa9 100755 --- a/scripts/trace_paper_repos.py +++ b/scripts/trace_paper_repos.py @@ -28,29 +28,29 @@ # hardcoded list reconstructed from the paper / convo # ==== Already traced / paper list ==== PAPER_REPOS = [ - # {"owner": "teorth", "name": "pfr", "sha": "fa398a5b853c7e94e3294c45e50c6aee013a2687"}, # ALREADY TRACED - # {"owner": "leanprover-community", "name": "hairy-ball-theorem", "sha": "a778826d19c8a7ddf1d26beeea628c45450612e6"}, # not found - # {"owner": "leanprover-community", "name": "coxeter", "sha": "96af8aee7943ca8685ed1b00cc83a559ea389a97"}, # not found - # {"owner": "avigad", "name": "mathematics_in_lean_source", "sha": "5297e0fb051367c48c0a084411853a576389ecf5"}, # ALREADY TRACED - {"owner": "mo271", "name": "FormalBook", "sha": "6fbe8c2985008c0bfb30050750a71b90388ad3a3"}, # searched commit hashes; original table SHA/owner invalid - # {"owner": "yangky11", "name": "miniF2F-lean4", "sha": "9e445f5435407f014b88b44a98436d50dd7abd00"}, # ALREADY TRACED - # {"owner": "lecopivo", "name": "SciLean", "sha": "22d53b2f4e3db2a172e71da6eb9c916e62655744"}, # ALREADY TRACED - {"owner": "fpvandoorn", "name": "carleson", "sha": "bec7808b907190882fa1fa54ce749af297c6cf37"}, # searched commit hashes; original table SHA/owner invalid - {"owner": "m4lvin", "name": "lean4-pdl", "sha": "c7f649fe3c4891cf1a01c120e82ebc5f6199856e"}, # searched commit hashes; original table SHA/owner invalid - # {"owner": "AlexKontorovich", "name": "PrimeNumberTheoremAnd", "sha": "29baddd685660b5fedd7bd67f9916ae24253d566"}, # ALREADY TRACED - # {"owner": "dwrensha", "name": "compfiles", "sha": "f99bf6f2928d47dd1a445b414b3a723c2665f091"}, # ALREADY TRACED - # {"owner": "ImperialCollegeLondon", "name": "FLT", "sha": "b208a302cdcbfadce33d8165f0b054bfa17e2147"}, # ALREADY TRACED - # {"owner": "Bachmann", "name": "debate", "sha": "7fb39251b705797ee54e08c96177fabd29a5b5a3"}, # not found - # {"owner": "digama0", "name": "lean4lean", "sha": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f"}, # ALREADY TRACED - # {"owner": "eric-wieser", "name": "lean-matrix-cookbook", "sha": "f15a149d321ac99ff9b9c024b58e7882f564669f"}, # ALREADY TRACED - # {"owner": "yuma-mizuno", "name": "lean-math-workshop", "sha": "5acd4b933d47fd6c1032798a6046c1baf261445d"}, # ALREADY TRACED - # {"owner": "loganrjmurphy", "name": "LeanEuclid", "sha": "f1912c3090eb82820575758efc31e40b9db86bb8"}, SMT ERROR - # {"owner": "FormalizedFormalLogic", "name": "Foundation", "sha": "d5fe5d057a90a0703a745cdc318a1b6621490c21"}, SMT ERROR - # {"owner": "leanprover-community", "name": "con-nf", "sha": "00bdc85ba7d486a9e544a0806a1018dd06fa3856"}, # ALREADY TRACED - # {"owner": "siddhartha-gadgil", "name": "Saturn", "sha": "3811a9dd46cdfd5fa0c0c1896720c28d2ec4a42a"}, # ALREADY TRACED - # {"owner": "ahhwuhu", "name": "zeta_3_irrational", "sha": "914712200e463cfc97fe37e929d518dd58806a38"}, # ALREADY TRACED - # {"owner": "EmilGedda", "name": "Formalization-of-Constructable-Numbers", "sha": "01ef1f22a04f2ba8081c5fb29413f515a0e52878"}, # not found - {"owner": "YaelDillies", "name": "LeanAPAP", "sha": "951c660a8d7ba8e39f906fdf657674a984effa8b"}, # searched commit hashes; original table SHA/owner invalid + {"owner": "leanprover-community", "name": "hairy-ball-theorem", "sha": "a778826d19c8a7ddf1d26beeea628c45450612e6"}, + {"owner": "leanprover-community", "name": "coxeter", "sha": "96af8aee7943ca8685ed1b00cc83a559ea389a97"}, + {"owner": "loganrjmurphy", "name": 
"LeanEuclid", "sha": "f1912c3090eb82820575758efc31e40b9db86bb8"}, + {"owner": "Louis-Le-Grand", "name": "Formalisation-of-constructable-numbers", "sha": "01ef1f22a04f2ba8081c5fb29413f515a0e52878"}, + {"owner": "yuma-mizuno", "name": "lean-math-workshop", "sha": "5acd4b933d47fd6c1032798a6046c1baf261445d"}, + {"owner": "google-deepmind", "name": "debate", "sha": "7fb39251b705797ee54e08c96177fabd29a5b5a3"}, + {"owner": "teorth", "name": "pfr", "sha": "fa398a5b853c7e94e3294c45e50c6aee013a2687"}, + {"owner": "avigad", "name": "mathematics_in_lean_source", "sha": "5297e0fb051367c48c0a084411853a576389ecf5"}, + {"owner": "fpvandoorn", "name": "carleson", "sha": "bec7808b907190882fa1fa54ce749af297c6cf37"}, + {"owner": "lecopivo", "name": "SciLean", "sha": "22d53b2f4e3db2a172e71da6eb9c916e62655744"}, + {"owner": "mo271", "name": "FormalBook", "sha": "6fbe8c2985008c0bfb30050750a71b90388ad3a3"}, + {"owner": "yangky11", "name": "miniF2F-lean4", "sha": "9e445f5435407f014b88b44a98436d50dd7abd00"}, + {"owner": "m4lvin", "name": "lean4-pdl", "sha": "c7f649fe3c4891cf1a01c120e82ebc5f6199856e"}, + {"owner": "AlexKontorovich", "name": "PrimeNumberTheoremAnd", "sha": "29baddd685660b5fedd7bd67f9916ae24253d566"}, + {"owner": "dwrensha", "name": "compfiles", "sha": "f99bf6f2928d47dd1a445b414b3a723c2665f091"}, + {"owner": "ImperialCollegeLondon", "name": "FLT", "sha": "b208a302cdcbfadce33d8165f0b054bfa17e2147"}, + {"owner": "digama0", "name": "lean4lean", "sha": "05b1f4a68c5facea96a5ee51c6a56fef21276e0f"}, + {"owner": "eric-wieser", "name": "lean-matrix-cookbook", "sha": "f15a149d321ac99ff9b9c024b58e7882f564669f"}, + {"owner": "FormalizedFormalLogic", "name": "Foundation", "sha": "d5fe5d057a90a0703a745cdc318a1b6621490c21"}, + {"owner": "leanprover-community", "name": "con-nf", "sha": "00bdc85ba7d486a9e544a0806a1018dd06fa3856"}, + {"owner": "siddhartha-gadgil", "name": "Saturn", "sha": "3811a9dd46cdfd5fa0c0c1896720c28d2ec4a42a"}, + {"owner": "ahhwuhu", "name": "zeta_3_irrational", "sha": "914712200e463cfc97fe37e929d518dd58806a38"}, + {"owner": "YaelDillies", "name": "LeanAPAP", "sha": "951c660a8d7ba8e39f906fdf657674a984effa8b"}, ] diff --git a/train_leanagent.py b/train_leanagent.py new file mode 100644 index 0000000..f7cc050 --- /dev/null +++ b/train_leanagent.py @@ -0,0 +1,199 @@ +import os +import sys +import json +import traceback +import torch +import pytorch_lightning as pl +from pytorch_lightning import seed_everything +from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint +from pytorch_lightning.strategies import DDPStrategy +from datetime import timedelta +from loguru import logger + +# Import from existing leanagent modules +from filenames import RAID_DIR, DATA_DIR, CHECKPOINT_DIR, DB_FILE_NAME +from dynamic_database import DynamicDatabase +from retrieval.datamodule import RetrievalDataModule +from retrieval.model import PremiseRetriever +import generate_benchmark_lean4 + +def initialize_database(dynamic_database_json_path: str) -> DynamicDatabase: + """Initializes or loads the dynamic database.""" + if not os.path.exists(dynamic_database_json_path): + raise FileNotFoundError(f"Database file not found at {dynamic_database_json_path}. Please run build_merged_dataset.py first.") + + logger.info(f"Loading database from {dynamic_database_json_path}") + db = DynamicDatabase.from_json(dynamic_database_json_path) + logger.info(f"Loaded database with {len(db.repositories)} repositories") + return db + +def main(): + """ + Simplified training script for LeanAgent. 
+ """ + try: + # Configuration + BATCH_SIZE = 4 + current_epoch = 0 + epochs_per_repo = 1 # We treat the merged dataset as one "repo" for epoch counting + lambda_value = 0.1 # For progressive training + + # Paths + if not RAID_DIR: + raise ValueError("RAID_DIR environment variable is not set.") + + dynamic_database_json_path = os.path.join(RAID_DIR, DB_FILE_NAME) + # We use the merged dataset we created + # Note: build_merged_dataset.py created it at RAID_DIR/data/merged_paper_subset + new_data_path = os.path.join(DATA_DIR, "merged_paper_subset") + + if not os.path.exists(new_data_path): + raise FileNotFoundError(f"Merged dataset not found at {new_data_path}") + + # Setup + logger.info("Configuring LeanDojo...") + generate_benchmark_lean4.configure_leandojo() + logger.info("LeanDojo configured") + + db = initialize_database(dynamic_database_json_path) + + # Training Setup + logger.info("Starting Training Loop") + + # Find latest checkpoint or use default + model_checkpoint_path = None + try: + # Simple logic to find latest checkpoint + all_checkpoints = [os.path.join(CHECKPOINT_DIR, f) for f in os.listdir(CHECKPOINT_DIR) if f.endswith(".ckpt")] + if all_checkpoints: + model_checkpoint_path = max(all_checkpoints, key=os.path.getmtime) + logger.info(f"Found latest checkpoint: {model_checkpoint_path}") + except Exception as e: + logger.warning(f"Could not find existing checkpoints: {e}") + + if not model_checkpoint_path: + # Fallback to a base checkpoint if available, or let the model initialize from scratch/huggingface + # The original script defaults to a specific mathlib checkpoint. + # We will try to use that if it exists, otherwise None (which might fail if PremiseRetriever expects it) + default_ckpt = f"{RAID_DIR}/checkpoints/mathlib4_29dcec074de168ac2bf835a77ef68bbe069194c5.ckpt" + if os.path.exists(default_ckpt): + model_checkpoint_path = default_ckpt + logger.info(f"Using default mathlib checkpoint: {model_checkpoint_path}") + else: + logger.warning("No checkpoint found. Training might start from scratch or fail if a base model is required.") + + seed_everything(3407) + + if not torch.cuda.is_available(): + logger.warning("CUDA is not available. 
Training will be extremely slow on CPU.") + device = torch.device("cpu") + else: + device = torch.device("cuda") + + config = { + "model_name": "kaiyuy/leandojo-lean4-retriever-byt5-small", + "lr": 1e-3, + "warmup_steps": 1000, + "max_seq_len": 512, + "num_retrieved": 100, + } + + # Load Model + if model_checkpoint_path: + model = PremiseRetriever.load(model_checkpoint_path, device, freeze=False, config=config) + logger.info(f"Loaded premise retriever from {model_checkpoint_path}") + else: + # If no checkpoint, initialize fresh model from HuggingFace + logger.info("Initializing new model from HuggingFace config...") + model = PremiseRetriever( + model_name=config["model_name"], + lr=config["lr"], + warmup_steps=config["warmup_steps"], + max_seq_len=config["max_seq_len"], + num_retrieved=config["num_retrieved"] + ) + + model.train() + model.set_lambda(lambda_value) + + # Callbacks + dir_name = "merged_paper_subset" + filename_suffix = f"_lambda_{lambda_value}" + + checkpoint_callback = ModelCheckpoint( + dirpath=CHECKPOINT_DIR, + filename=dir_name + filename_suffix + "_{epoch}-{Recall@10_val:.2f}", + verbose=True, + save_top_k=-1, + every_n_epochs=1, + monitor="Recall@10_val", + mode="max", + ) + + early_stop_callback = EarlyStopping( + monitor="Recall@10_val", patience=5, mode="max", verbose=True + ) + + lr_monitor = LearningRateMonitor(logging_interval="step") + + # Environment for DDP + VERY_LONG_TIMEOUT = 7 * 24 * 60 * 60 # 1 week + os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1" + os.environ["NCCL_TIMEOUT"] = str(VERY_LONG_TIMEOUT * 1000) + + custom_log_dir = os.path.join(RAID_DIR, "lightning_logs", f"{dir_name}_lambda_{lambda_value}") + os.makedirs(custom_log_dir, exist_ok=True) + + # Trainer + # Adjust devices based on availability + num_gpus = torch.cuda.device_count() + devices = num_gpus if num_gpus > 0 else 1 + accelerator = "gpu" if num_gpus > 0 else "cpu" + strategy = DDPStrategy(timeout=timedelta(seconds=VERY_LONG_TIMEOUT)) if num_gpus > 1 else "auto" + + trainer = pl.Trainer( + accelerator=accelerator, + gradient_clip_val=1.0, + precision="bf16-mixed" if num_gpus > 0 else 32, # bf16 might not work on CPU + strategy=strategy, + devices=devices, + accumulate_grad_batches=4, + callbacks=[lr_monitor, checkpoint_callback, early_stop_callback], + max_epochs=current_epoch + 5, # Train for 5 epochs for now + log_every_n_steps=1, + num_sanity_val_steps=0, + default_root_dir=custom_log_dir, + ) + + # Data Module + corpus_path = os.path.join(new_data_path, "corpus.jsonl") + data_path_random = os.path.join(new_data_path, "random") + + logger.info(f"Loading data from {data_path_random}") + data_module = RetrievalDataModule( + data_path=data_path_random, + corpus_path=corpus_path, + num_negatives=3, + num_in_file_negatives=1, + model_name="google/byt5-small", + batch_size=BATCH_SIZE, + eval_batch_size=64, + max_seq_len=1024, + num_workers=4, + ) + data_module.setup(stage="fit") + + logger.info(f"Training dataset size: {len(data_module.ds_train)}") + logger.info(f"Validation dataset size: {len(data_module.ds_val)}") + + # Train + logger.info("Starting trainer.fit...") + trainer.fit(model, datamodule=data_module, ckpt_path=model_checkpoint_path) + logger.info("Training finished!") + + except Exception as e: + logger.error(f"An error occurred: {e}") + traceback.print_exc() + +if __name__ == "__main__": + main() From 3f145c54b74a50b82b0a4a28bc817bb4f09fe079 Mon Sep 17 00:00:00 2001 From: aumrp77 Date: Mon, 5 Jan 2026 19:27:44 +0530 Subject: [PATCH 29/29] Fix lifelong learning bugs for 
23-repo reproduction --- leanagent.py | 115 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 107 insertions(+), 8 deletions(-) diff --git a/leanagent.py b/leanagent.py index 5b2afb8..8f9779d 100644 --- a/leanagent.py +++ b/leanagent.py @@ -624,10 +624,10 @@ def main(): current_epoch = 0 epochs_per_repo = 1 run_progressive_training = True - use_fisher = False - single_repo = True + use_fisher = True # FIXED: Enable EWC for lifelong learning + single_repo = False # FIXED: Enable cumulative learning across repos curriculum_learning = True - num_repos = 3 + num_repos = 23 # FIXED: Full paper reproduction (was 3) dynamic_database_json_path = os.path.join(RAID_DIR, DB_FILE_NAME) lambdas = None @@ -644,6 +644,9 @@ def main(): logger.info("LeanDojo configured") db = initialize_database(dynamic_database_json_path) + # FIXED: Create required directories + os.makedirs(FISHER_DIR, exist_ok=True) + os.makedirs(CHECKPOINT_DIR, exist_ok=True) logger.info(f"Found {num_repos} repositories") lean_git_repos, repos, updated_repos = get_repos(curriculum_learning, num_repos, dynamic_database_json_path, db) @@ -688,7 +691,8 @@ def main(): repos_for_proving = [] # Create a directory for the merged dataset if it doesn't exist - dst_dir = Path(RAID_DIR) / DATA_DIR / f"merged_with_new_{dir_name}" + # FIXED: Use DATA_DIR directly (it's already full path) + dst_dir = Path(DATA_DIR) / f"merged_with_new_{dir_name}" if (repo.url, repo.commit) not in repos_for_merged_dataset: logger.info("Adding repo to repos_for_merged_dataset") repos_for_merged_dataset.append((repo.url, repo.commit)) @@ -962,10 +966,53 @@ def main(): if ray.is_initialized(): logger.info("Shutting down Ray before proving") ray.shutdown() + + # ADDED: Compute Fisher Information Matrix after training (before proving) + if use_fisher and i < num_repos - 1: # Don't compute Fisher after last repo + logger.info("="*80) + logger.info("COMPUTING FISHER INFORMATION MATRIX") + logger.info("="*80) + + from retrieval.fisher_computation_module import FisherComputationModule + + # Create Fisher computation module with current best model + fisher_module = FisherComputationModule(best_model) + + # Setup trainer for Fisher computation + fisher_trainer = pl.Trainer( + accelerator="gpu", + precision="bf16-mixed", + strategy=ddp_strategy, + devices=4, + max_epochs=1, + log_every_n_steps=1, + num_sanity_val_steps=0, + ) + + try: + logger.info("Computing Fisher matrix...") + fisher_trainer.strategy.barrier() + fisher_trainer.fit(fisher_module, datamodule=data_module) + fisher_trainer.strategy.barrier() + + # Save the Fisher Information Matrix + if fisher_trainer.is_global_zero: + fisher_file_path = os.path.join( + FISHER_DIR, + f"fisher_info_{dir_name}_distributed.pkl", + ) + fisher_module.save_fisher_info(fisher_file_path) + logger.info(f"Fisher Information Matrix saved at {fisher_file_path}") + except Exception as e: + logger.error(f"Error during Fisher computation: {str(e)}") + print(traceback.format_exc()) + + logger.info("Finished computing Fisher matrix") # Set up the prover use_vllm = False - corpus_path = dst_dir + "/corpus.jsonl" + # FIXED: Use os.path.join instead of string concatenation + corpus_path = os.path.join(str(dst_dir), "corpus.jsonl") tactic = ( None # `None` since we are not using a fixed tactic generator ) @@ -1034,9 +1081,61 @@ def main(): logger.info("Finished processing the repository") current_epoch += epochs_per_repo logger.info(f"current epoch: {current_epoch}") - if use_fisher: - # Need to return to compute the FIM - return + # 
FIXED: Removed early return to allow all 23 repos to be processed + # Fisher computation will happen between repos via external script + # if use_fisher: + # # Need to return to compute the FIM\n # return + + # ADDED: Second sorry proving pass after all repos ("Add. After") + if is_main_process and i == num_repos - 1: + logger.info("=" * 80) + logger.info("STARTING SECOND PASS: 'Add. After' with final model") + logger.info("=" * 80) + + if ray.is_initialized(): + logger.info("Shutting down Ray before second pass") + ray.shutdown() + + # Use the final checkpoint + try: + final_checkpoint = find_latest_checkpoint() + logger.info(f"Using final checkpoint for second pass: {final_checkpoint}") + except FileNotFoundError: + logger.error("No checkpoint found for second pass") + + # Create new prover with final model + final_prover = DistributedProver( + use_vllm, + ckpt_path, + corpus_path, + tactic, + module, + num_workers, + num_gpus=num_gpus, + timeout=timeout, + max_expansions=max_expansions, + num_sampled_tactics=num_sampled_tactics, + raid_dir=RAID_DIR, + checkpoint_dir=CHECKPOINT_DIR, + debug=debug, + run_progressive_training=run_progressive_training, + ) + + # Reprove ALL repos with final model + logger.info("Reproving ALL repositories with final model for 'Add. After' pass") + prove_sorry_theorems( + db, + final_prover, + dynamic_database_json_path, + repos_to_include=None, # All repos + ) + + save_database_locked(db, dynamic_database_json_path) + logger.info("Completed 'Add. After' pass") + + if ray.is_initialized(): + logger.info("Shutting down Ray after second pass") + ray.shutdown() except Exception as e: logger.info(f"An error occurred: {e}", file=sys.stderr)
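
# Note on the EWC mechanism that the use_fisher / Fisher-matrix changes above rely on.
# This is a minimal illustrative sketch only, not part of the patch and not the repository's
# actual FisherComputationModule or PremiseRetriever API. The names `fisher_info`,
# `old_params`, and `lambda_value` are assumptions chosen for illustration: `fisher_info`
# stands in for the diagonal Fisher Information Matrix saved to FISHER_DIR between repos,
# `old_params` for the parameter snapshot taken after the previous repository, and
# `lambda_value` for the regularization strength (0.1 in the patch).

import torch

def ewc_penalty(model: torch.nn.Module,
                fisher_info: dict[str, torch.Tensor],
                old_params: dict[str, torch.Tensor],
                lambda_value: float = 0.1) -> torch.Tensor:
    """Quadratic penalty that anchors parameters important to earlier repositories."""
    penalty = torch.zeros((), device=next(model.parameters()).device)
    for name, param in model.named_parameters():
        if name in fisher_info:
            # The Fisher diagonal weights how strongly each parameter is pulled back
            # toward the value it had after training on the previous repository.
            penalty = penalty + (fisher_info[name] * (param - old_params[name]) ** 2).sum()
    return 0.5 * lambda_value * penalty

# Typical use inside a training step (usage sketch under the same assumptions):
#   loss = task_loss + ewc_penalty(model, fisher_info, old_params, lambda_value=0.1)
# Computing fisher_info between repositories and adding this penalty during the next
# repository's training is what allows cumulative (single_repo = False) learning without
# catastrophic forgetting, which is why the patch enables use_fisher and removes the early
# return that previously stopped the loop after the first repository.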